YJIT: Break register cycles for C arguments (take 2) (#8018)

* Revert "Revert "YJIT: Break register cycles for C arguments (#7918)""

This reverts commit 78ca085785.

* Use shifted_live_ranges for the last-insn check
This commit is contained in:
Takashi Kokubun 2023-07-04 12:57:32 -07:00 коммит произвёл GitHub
Родитель f6af5a1128
Коммит a1d4dada6b
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
4 изменённых файлов: 229 добавлений и 15 удалений

Просмотреть файл

@ -3973,3 +3973,11 @@ assert_equal 'true', %q{
true
end
} unless defined?(RubyVM::RJIT) && RubyVM::RJIT.enabled? # Not yet working on RJIT
# Regression test: register allocator on expandarray
assert_equal '[]', %q{
func = proc { [] }
proc do
_x, _y = func.call
end.call
}

Просмотреть файл

@ -74,6 +74,7 @@ impl From<Opnd> for A64Opnd {
Opnd::Mem(Mem { base: MemBase::InsnOut(_), .. }) => {
panic!("attempted to lower an Opnd::Mem with a MemBase::InsnOut base")
},
Opnd::CArg(_) => panic!("attempted to lower an Opnd::CArg"),
Opnd::InsnOut { .. } => panic!("attempted to lower an Opnd::InsnOut"),
Opnd::Value(_) => panic!("attempted to lower an Opnd::Value"),
Opnd::Stack { .. } => panic!("attempted to lower an Opnd::Stack"),
@ -185,9 +186,10 @@ fn emit_load_value(cb: &mut CodeBlock, rd: A64Opnd, value: u64) -> usize {
impl Assembler
{
// A special scratch register for intermediate processing.
// Special scratch registers for intermediate processing.
// This register is caller-saved (so we don't have to save it before using it)
const SCRATCH0: A64Opnd = A64Opnd::Reg(X16_REG);
pub const SCRATCH_REG: Reg = X16_REG;
const SCRATCH0: A64Opnd = A64Opnd::Reg(Assembler::SCRATCH_REG);
const SCRATCH1: A64Opnd = A64Opnd::Reg(X17_REG);
/// List of registers that can be used for stack temps.
@ -280,7 +282,7 @@ impl Assembler
/// do follow that encoding, and if they don't then we load them first.
fn split_bitmask_immediate(asm: &mut Assembler, opnd: Opnd, dest_num_bits: u8) -> Opnd {
match opnd {
Opnd::Reg(_) | Opnd::InsnOut { .. } | Opnd::Stack { .. } => opnd,
Opnd::Reg(_) | Opnd::CArg(_) | Opnd::InsnOut { .. } | Opnd::Stack { .. } => opnd,
Opnd::Mem(_) => split_load_operand(asm, opnd),
Opnd::Imm(imm) => {
if imm == 0 {
@ -313,7 +315,7 @@ impl Assembler
/// a certain size. If they don't then we need to load them first.
fn split_shifted_immediate(asm: &mut Assembler, opnd: Opnd) -> Opnd {
match opnd {
Opnd::Reg(_) | Opnd::InsnOut { .. } => opnd,
Opnd::Reg(_) | Opnd::CArg(_) | Opnd::InsnOut { .. } => opnd,
Opnd::Mem(_) => split_load_operand(asm, opnd),
Opnd::Imm(_) => asm.load(opnd),
Opnd::UImm(uimm) => {
@ -452,7 +454,7 @@ impl Assembler
_ => *opnd
};
asm.load_into(C_ARG_OPNDS[idx], value);
asm.load_into(Opnd::c_arg(C_ARG_OPNDS[idx]), value);
}
// Now we push the CCall without any arguments so that it
@ -924,6 +926,9 @@ impl Assembler
let ptr_offset: u32 = (cb.get_write_pos() as u32) - (SIZEOF_VALUE as u32);
insn_gc_offsets.push(ptr_offset);
},
Opnd::CArg { .. } => {
unreachable!("C argument operand was not lowered before arm64_emit");
}
Opnd::Stack { .. } => {
unreachable!("Stack operand was not lowered before arm64_emit");
}

Просмотреть файл

@ -72,6 +72,9 @@ pub enum Opnd
// Immediate Ruby value, may be GC'd, movable
Value(VALUE),
/// C argument register. `alloc_regs` resolves its register dependencies.
CArg(Reg),
// Output of a preceding instruction in this block
InsnOut{ idx: usize, num_bits: u8 },
@ -102,6 +105,7 @@ impl fmt::Debug for Opnd {
match self {
Self::None => write!(fmt, "None"),
Value(val) => write!(fmt, "Value({val:?})"),
CArg(reg) => write!(fmt, "CArg({reg:?})"),
Stack { idx, sp_offset, .. } => write!(fmt, "SP[{}]", *sp_offset as i32 - idx - 1),
InsnOut { idx, num_bits } => write!(fmt, "Out{num_bits}({idx})"),
Imm(signed) => write!(fmt, "{signed:x}_i64"),
@ -145,6 +149,14 @@ impl Opnd
Opnd::UImm(ptr as u64)
}
/// Constructor for a C argument operand.
///
/// Takes a register operand and wraps its register in an [`Opnd::CArg`].
///
/// # Panics
///
/// Panics if `reg_opnd` is anything other than an `Opnd::Reg`.
pub fn c_arg(reg_opnd: Opnd) -> Self {
    match reg_opnd {
        Opnd::Reg(reg) => Opnd::CArg(reg),
        // Name the offending operand so a misuse is diagnosable from the panic alone.
        _ => unreachable!("Opnd::c_arg expects an Opnd::Reg, got {reg_opnd:?}"),
    }
}
pub fn is_some(&self) -> bool {
match *self {
Opnd::None => false,
@ -1224,6 +1236,55 @@ impl Assembler
}
}
// Reorder C argument moves, sometimes adding extra moves using SCRATCH_REG,
// so that they will not rewrite each other before they are used.
//
// Each entry is a (destination register, source operand) pair. The returned
// list holds the same kind of pairs in an order that is safe to emit one by one.
// The parameter stays a `&Vec` because the call site collects straight into it
// and relies on this signature to infer the collection type.
fn reorder_c_args(c_args: &Vec<(Reg, Opnd)>) -> Vec<(Reg, Opnd)> {
    // Index of the first pending move whose destination register is not read
    // as a source by any pending move, if there is one.
    fn find_safe_arg(pending: &[(Reg, Opnd)]) -> Option<usize> {
        pending.iter().position(|&(dest_reg, _)| {
            pending.iter().all(|&(_, src_opnd)| src_opnd != Opnd::Reg(dest_reg))
        })
    }

    // Moves from a register into itself need no instruction at all.
    let mut pending: Vec<(Reg, Opnd)> = c_args
        .iter()
        .copied()
        .filter(|&(reg, opnd)| Opnd::Reg(reg) != opnd)
        .collect();

    let mut moves = Vec::with_capacity(pending.len());
    loop {
        // Emit every move that cannot clobber a source that is still needed.
        while let Some(index) = find_safe_arg(&pending) {
            moves.push(pending.remove(index));
        }
        if pending.is_empty() {
            break;
        }

        // Only cycles are left. SCRATCH_REG must not already be a pending
        // source, otherwise saving into it would destroy that value.
        assert!(pending.iter().all(|&(_, opnd)| opnd != Opnd::Reg(Assembler::SCRATCH_REG)));

        // Break the cycle: save the first move's source in SCRATCH_REG now and
        // queue the final SCRATCH_REG -> destination move for when it is safe.
        let (reg, opnd) = pending.remove(0);
        moves.push((Assembler::SCRATCH_REG, opnd));
        pending.push((reg, Opnd::Reg(Assembler::SCRATCH_REG)));
    }
    moves
}
// Adjust the number of entries in live_ranges so that it can be indexed by mapped indexes.
//
// A positive `shift_offset` inserts that many entries at `start_index`, each
// holding its own index (start_index, start_index + 1, ...). A negative
// `shift_offset` removes that many entries starting at `start_index`. Zero is
// a no-op. Panics if the affected range does not fit inside `live_ranges`.
fn shift_live_ranges(live_ranges: &mut Vec<usize>, start_index: usize, shift_offset: isize) {
    let count = shift_offset.unsigned_abs();
    if shift_offset > 0 {
        // One splice shifts the tail once instead of once per inserted entry.
        live_ranges.splice(start_index..start_index, start_index..start_index + count);
    } else if shift_offset < 0 {
        // Dropping the drain removes the whole range in a single pass.
        live_ranges.drain(start_index..start_index + count);
    }
}
// Dump live registers for register spill debugging.
fn dump_live_regs(insns: Vec<Insn>, live_ranges: Vec<usize>, num_regs: usize, spill_index: usize) {
// Convert live_ranges to live_regs: the number of live registers at each index
@ -1247,11 +1308,18 @@ impl Assembler
}
}
// We may need to reorder LoadInto instructions with a C argument operand.
// This buffers the operands of such instructions to process them in batches.
let mut c_args: Vec<(Reg, Opnd)> = vec![];
// live_ranges is indexed by original `index` given by the iterator.
let live_ranges: Vec<usize> = take(&mut self.live_ranges);
// shifted_live_ranges is indexed by mapped indexes in insn operands.
let mut shifted_live_ranges: Vec<usize> = live_ranges.clone();
let mut asm = Assembler::new_with_label_names(take(&mut self.label_names), take(&mut self.side_exits));
let mut iterator = self.into_draining_iter();
while let Some((index, mut insn)) = iterator.next_unmapped() {
while let Some((index, mut insn)) = iterator.next_mapped() {
// Check if this is the last instruction that uses an operand that
// spans more than one instruction. In that case, return the
// allocated register to the pool.
@ -1262,12 +1330,11 @@ impl Assembler
// Since we have an InsnOut, we know it spans more than one
// instruction.
let start_index = *idx;
assert!(start_index < index);
// We're going to check if this is the last instruction that
// uses this operand. If it is, we can return the allocated
// register to the pool.
if live_ranges[start_index] == index {
if shifted_live_ranges[start_index] == index {
if let Some(Opnd::Reg(reg)) = asm.insns[start_index].out_opnd() {
dealloc_reg(&mut pool, &regs, reg);
} else {
@ -1314,7 +1381,7 @@ impl Assembler
let mut opnd_iter = insn.opnd_iter();
if let Some(Opnd::InsnOut{ idx, .. }) = opnd_iter.next() {
if live_ranges[*idx] == index {
if shifted_live_ranges[*idx] == index {
if let Some(Opnd::Reg(reg)) = asm.insns[*idx].out_opnd() {
out_reg = Some(take_reg(&mut pool, &regs, reg));
}
@ -1371,7 +1438,27 @@ impl Assembler
}
}
asm.push_insn(insn);
// Push instruction(s). Batch and reorder C argument operations if needed.
if let Insn::LoadInto { dest: Opnd::CArg(reg), opnd } = insn {
// Buffer C arguments
c_args.push((reg, opnd));
} else {
// C arguments are buffered until CCall
if c_args.len() > 0 {
// Resolve C argument dependencies
let c_args_len = c_args.len() as isize;
let moves = reorder_c_args(&c_args.drain(..).into_iter().collect());
shift_live_ranges(&mut shifted_live_ranges, asm.insns.len(), moves.len() as isize - c_args_len);
// Push batched C arguments
for (reg, opnd) in moves {
asm.load_into(Opnd::Reg(reg), opnd);
}
}
// Other instructions are pushed as is
asm.push_insn(insn);
}
iterator.map_insn_index(&mut asm);
}
assert_eq!(pool, 0, "Expected all registers to be returned to the pool");
@ -1442,7 +1529,7 @@ impl AssemblerDrainingIterator {
/// end of the current list of instructions in order to maintain that
/// alignment.
pub fn map_insn_index(&mut self, asm: &mut Assembler) {
self.indices.push(asm.insns.len() - 1);
self.indices.push(asm.insns.len().saturating_sub(1));
}
/// Map an operand by using this iterator's list of mapped indices.

Просмотреть файл

@ -87,9 +87,9 @@ impl From<&Opnd> for X86Opnd {
impl Assembler
{
// A special scratch register for intermediate processing.
// Note: right now this is only used by LeaLabel because label_ref accepts
// a closure and we don't want it to have to capture anything.
const SCRATCH0: X86Opnd = X86Opnd::Reg(R11_REG);
// This register is caller-saved (so we don't have to save it before using it)
pub const SCRATCH_REG: Reg = R11_REG;
const SCRATCH0: X86Opnd = X86Opnd::Reg(Assembler::SCRATCH_REG);
/// List of registers that can be used for stack temps.
pub const TEMP_REGS: [Reg; 5] = [RSI_REG, RDI_REG, R8_REG, R9_REG, R10_REG];
@ -347,7 +347,7 @@ impl Assembler
// Load each operand into the corresponding argument
// register.
for (idx, opnd) in opnds.into_iter().enumerate() {
asm.load_into(C_ARG_OPNDS[idx], *opnd);
asm.load_into(Opnd::c_arg(C_ARG_OPNDS[idx]), *opnd);
}
// Now we push the CCall without any arguments so that it
@ -1055,4 +1055,118 @@ mod tests {
assert_eq!(format!("{:x}", cb), "4983f540");
}
// Every argument is already in its own C argument register, so all moves are
// self-moves and the expected output contains only the call sequence.
#[test]
fn test_reorder_c_args_no_cycle() {
let (mut asm, mut cb) = setup_asm();
asm.ccall(0 as _, vec![
C_ARG_OPNDS[0], // mov rdi, rdi (optimized away)
C_ARG_OPNDS[1], // mov rsi, rsi (optimized away)
]);
asm.compile_with_num_regs(&mut cb, 0);
assert_disasm!(cb, "b800000000ffd0", {"
0x0: mov eax, 0
0x5: call rax
"});
}
// Two argument registers swap values. The expected output breaks the cycle by
// saving one source in r11 first, and emits nothing for the rdx self-move.
#[test]
fn test_reorder_c_args_single_cycle() {
let (mut asm, mut cb) = setup_asm();
// rdi and rsi form a cycle
asm.ccall(0 as _, vec![
C_ARG_OPNDS[1], // mov rdi, rsi
C_ARG_OPNDS[0], // mov rsi, rdi
C_ARG_OPNDS[2], // mov rdx, rdx (optimized away)
]);
asm.compile_with_num_regs(&mut cb, 0);
assert_disasm!(cb, "4989f34889fe4c89dfb800000000ffd0", {"
0x0: mov r11, rsi
0x3: mov rsi, rdi
0x6: mov rdi, r11
0x9: mov eax, 0
0xe: call rax
"});
}
// Two independent swaps. The expected output resolves them one after the
// other, reusing r11 as the scratch register for each cycle.
#[test]
fn test_reorder_c_args_two_cycles() {
let (mut asm, mut cb) = setup_asm();
// rdi and rsi form a cycle, and rdx and rcx form another cycle
asm.ccall(0 as _, vec![
C_ARG_OPNDS[1], // mov rdi, rsi
C_ARG_OPNDS[0], // mov rsi, rdi
C_ARG_OPNDS[3], // mov rdx, rcx
C_ARG_OPNDS[2], // mov rcx, rdx
]);
asm.compile_with_num_regs(&mut cb, 0);
assert_disasm!(cb, "4989f34889fe4c89df4989cb4889d14c89dab800000000ffd0", {"
0x0: mov r11, rsi
0x3: mov rsi, rdi
0x6: mov rdi, r11
0x9: mov r11, rcx
0xc: mov rcx, rdx
0xf: mov rdx, r11
0x12: mov eax, 0
0x17: call rax
"});
}
// A rotation across three argument registers. The expected output needs only
// one extra move through r11 to break the whole cycle.
#[test]
fn test_reorder_c_args_large_cycle() {
let (mut asm, mut cb) = setup_asm();
// rdi, rsi, and rdx form a cycle
asm.ccall(0 as _, vec![
C_ARG_OPNDS[1], // mov rdi, rsi
C_ARG_OPNDS[2], // mov rsi, rdx
C_ARG_OPNDS[0], // mov rdx, rdi
]);
asm.compile_with_num_regs(&mut cb, 0);
assert_disasm!(cb, "4989f34889d64889fa4c89dfb800000000ffd0", {"
0x0: mov r11, rsi
0x3: mov rsi, rdx
0x6: mov rdx, rdi
0x9: mov rdi, r11
0xc: mov eax, 0
0x11: call rax
"});
}
// Arguments are outputs of earlier loads rather than fixed registers. Per the
// expected disassembly the loads land in rax, rcx and rdx, so the last two
// argument moves form a cycle. The non-cyclic moves are expected first,
// followed by the cycle resolved through r11.
#[test]
fn test_reorder_c_args_with_insn_out() {
let (mut asm, mut cb) = setup_asm();
let rax = asm.load(Opnd::UImm(1));
let rcx = asm.load(Opnd::UImm(2));
let rdx = asm.load(Opnd::UImm(3));
// rcx and rdx form a cycle
asm.ccall(0 as _, vec![
rax, // mov rdi, rax
rcx, // mov rsi, rcx
rcx, // mov rdx, rcx
rdx, // mov rcx, rdx
]);
asm.compile_with_num_regs(&mut cb, 3);
assert_disasm!(cb, "b801000000b902000000ba030000004889c74889ce4989cb4889d14c89dab800000000ffd0", {"
0x0: mov eax, 1
0x5: mov ecx, 2
0xa: mov edx, 3
0xf: mov rdi, rax
0x12: mov rsi, rcx
0x15: mov r11, rcx
0x18: mov rcx, rdx
0x1b: mov rdx, r11
0x1e: mov eax, 0
0x23: call rax
"});
}
}