From 28433e9aa0c765c9d20bc6397439a1b12e66bcbd Mon Sep 17 00:00:00 2001
From: Kevin Newton
Date: Tue, 27 Sep 2022 16:58:01 -0400
Subject: [PATCH] Change IncrCounter lowering on AArch64 (#6455)

* Change IncrCounter lowering on AArch64

Previously we were using LDADDAL, which is not available on Graviton 1
chips. Instead, we're going to use an exclusive load/store group through
the LDAXR/STLXR instructions.

* Update yjit/src/backend/arm64/mod.rs

Co-authored-by: Maxime Chevalier-Boisvert
---
 .../asm/arm64/inst/load_store_exclusive.rs    | 109 ++++++++++++++++++
 yjit/src/asm/arm64/inst/mod.rs                |   2 +
 yjit/src/asm/arm64/mod.rs                     |  39 +++++++
 yjit/src/asm/arm64/opnd.rs                    |  10 ++
 yjit/src/backend/arm64/mod.rs                 |  32 +++--
 5 files changed, 181 insertions(+), 11 deletions(-)
 create mode 100644 yjit/src/asm/arm64/inst/load_store_exclusive.rs

diff --git a/yjit/src/asm/arm64/inst/load_store_exclusive.rs b/yjit/src/asm/arm64/inst/load_store_exclusive.rs
new file mode 100644
index 0000000000..8216c2200a
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/load_store_exclusive.rs
@@ -0,0 +1,109 @@
+/// The operation being performed for this instruction.
+enum Op {
+    Store = 0,
+    Load = 1
+}
+
+/// The size of the registers being operated on.
+enum Size {
+    Size32 = 0b10,
+    Size64 = 0b11
+}
+
+/// A convenience function so that we can convert the number of bits of a
+/// register operand directly into a Size enum variant.
+impl From<u8> for Size {
+    fn from(num_bits: u8) -> Self {
+        match num_bits {
+            64 => Size::Size64,
+            32 => Size::Size32,
+            _ => panic!("Invalid number of bits: {}", num_bits)
+        }
+    }
+}
+
+/// The struct that represents an A64 load or store exclusive instruction that
+/// can be encoded.
+///
+/// LDAXR/STLXR
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |  1     0  0    1  0  0  0    0     0                     1  1  1  1    1  1                                    |
+/// | size.                          op    rs..............                       rn.............. rt.............. |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct LoadStoreExclusive {
+    /// The number of the register to be loaded.
+    rt: u8,
+
+    /// The base register with which to form the address.
+    rn: u8,
+
+    /// The register to be used for the status result if it applies to this
+    /// operation. Otherwise it's the zero register.
+    rs: u8,
+
+    /// The operation being performed for this instruction.
+    op: Op,
+
+    /// The size of the registers being operated on.
+    size: Size
+}
+
+impl LoadStoreExclusive {
+    /// LDAXR
+    /// https://developer.arm.com/documentation/ddi0602/2021-12/Base-Instructions/LDAXR--Load-Acquire-Exclusive-Register-
+    pub fn ldaxr(rt: u8, rn: u8, num_bits: u8) -> Self {
+        Self { rt, rn, rs: 31, op: Op::Load, size: num_bits.into() }
+    }
+
+    /// STLXR
+    /// https://developer.arm.com/documentation/ddi0602/2021-12/Base-Instructions/STLXR--Store-Release-Exclusive-Register-
+    pub fn stlxr(rs: u8, rt: u8, rn: u8, num_bits: u8) -> Self {
+        Self { rt, rn, rs, op: Op::Store, size: num_bits.into() }
+    }
+}
+
+/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Loads-and-Stores?lang=en
+const FAMILY: u32 = 0b0100;
+
+impl From<LoadStoreExclusive> for u32 {
+    /// Convert an instruction into a 32-bit value.
+    fn from(inst: LoadStoreExclusive) -> Self {
+        0
+        | ((inst.size as u32) << 30)
+        | (FAMILY << 25)
+        | ((inst.op as u32) << 22)
+        | ((inst.rs as u32) << 16)
+        | (0b111111 << 10)
+        | ((inst.rn as u32) << 5)
+        | (inst.rt as u32)
+    }
+}
+
+impl From<LoadStoreExclusive> for [u8; 4] {
+    /// Convert an instruction into a 4 byte array.
+    fn from(inst: LoadStoreExclusive) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_ldaxr() {
+        let inst = LoadStoreExclusive::ldaxr(16, 0, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xc85ffc10, result);
+    }
+
+    #[test]
+    fn test_stlxr() {
+        let inst = LoadStoreExclusive::stlxr(17, 16, 0, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xc811fc10, result);
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/mod.rs b/yjit/src/asm/arm64/inst/mod.rs
index b3a77e73c9..9821e6a334 100644
--- a/yjit/src/asm/arm64/inst/mod.rs
+++ b/yjit/src/asm/arm64/inst/mod.rs
@@ -13,6 +13,7 @@ mod halfword_imm;
 mod load_literal;
 mod load_register;
 mod load_store;
+mod load_store_exclusive;
 mod logical_imm;
 mod logical_reg;
 mod mov;
@@ -36,6 +37,7 @@ pub use halfword_imm::HalfwordImm;
 pub use load_literal::LoadLiteral;
 pub use load_register::LoadRegister;
 pub use load_store::LoadStore;
+pub use load_store_exclusive::LoadStoreExclusive;
 pub use logical_imm::LogicalImm;
 pub use logical_reg::LogicalReg;
 pub use mov::Mov;
diff --git a/yjit/src/asm/arm64/mod.rs b/yjit/src/asm/arm64/mod.rs
index d97452a045..88431ce30a 100644
--- a/yjit/src/asm/arm64/mod.rs
+++ b/yjit/src/asm/arm64/mod.rs
@@ -331,6 +331,20 @@ pub fn ldaddal(cb: &mut CodeBlock, rs: A64Opnd, rt: A64Opnd, rn: A64Opnd) {
     cb.write_bytes(&bytes);
 }
 
+/// LDAXR - atomic load with acquire semantics
+pub fn ldaxr(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd) {
+    let bytes: [u8; 4] = match (rt, rn) {
+        (A64Opnd::Reg(rt), A64Opnd::Reg(rn)) => {
+            assert_eq!(rn.num_bits, 64, "rn must be a 64-bit register.");
+
+            LoadStoreExclusive::ldaxr(rt.reg_no, rn.reg_no, rt.num_bits).into()
+        },
+        _ => panic!("Invalid operand combination to ldaxr instruction."),
+    };
+
+    cb.write_bytes(&bytes);
+}
+
 /// LDP (signed offset) - load a pair of registers from memory
 pub fn ldp(cb: &mut CodeBlock, rt1: A64Opnd, rt2: A64Opnd, rn: A64Opnd) {
     let bytes: [u8; 4] = match (rt1, rt2, rn) {
@@ -707,6 +721,21 @@ pub fn orr(cb: &mut CodeBlock, rd: A64Opnd, rn: A64Opnd, rm: A64Opnd) {
     cb.write_bytes(&bytes);
 }
 
+/// STLXR - store a value to memory, release exclusive access
+pub fn stlxr(cb: &mut CodeBlock, rs: A64Opnd, rt: A64Opnd, rn: A64Opnd) {
+    let bytes: [u8; 4] = match (rs, rt, rn) {
+        (A64Opnd::Reg(rs), A64Opnd::Reg(rt), A64Opnd::Reg(rn)) => {
+            assert_eq!(rs.num_bits, 32, "rs must be a 32-bit register.");
+            assert_eq!(rn.num_bits, 64, "rn must be a 64-bit register.");
+
+            LoadStoreExclusive::stlxr(rs.reg_no, rt.reg_no, rn.reg_no, rn.num_bits).into()
+        },
+        _ => panic!("Invalid operand combination to stlxr instruction.")
+    };
+
+    cb.write_bytes(&bytes);
+}
+
 /// STP (signed offset) - store a pair of registers to memory
 pub fn stp(cb: &mut CodeBlock, rt1: A64Opnd, rt2: A64Opnd, rn: A64Opnd) {
     let bytes: [u8; 4] = match (rt1, rt2, rn) {
@@ -1183,6 +1212,11 @@ mod tests {
         check_bytes("8b01eaf8", |cb| ldaddal(cb, X10, X11, X12));
     }
 
+    #[test]
+    fn test_ldaxr() {
+        check_bytes("6afd5fc8", |cb| ldaxr(cb, X10, X11));
+    }
+
     #[test]
     fn test_ldp() {
         check_bytes("8a2d4da9", |cb| ldp(cb, X10, X11, A64Opnd::new_mem(64, X12, 208)));
@@ -1333,6 +1367,11 @@ mod tests {
         check_bytes("80025fd6", |cb| ret(cb, X20));
     }
 
+    #[test]
+    fn test_stlxr() {
+        check_bytes("8bfd0ac8", |cb| stlxr(cb, W10, X11, X12));
+    }
+
     #[test]
     fn test_stp() {
         check_bytes("8a2d0da9", |cb| stp(cb, X10, X11, A64Opnd::new_mem(64, X12, 208)));
diff --git a/yjit/src/asm/arm64/opnd.rs b/yjit/src/asm/arm64/opnd.rs
index 52b2a84637..0dc614ab4e 100644
--- a/yjit/src/asm/arm64/opnd.rs
+++ b/yjit/src/asm/arm64/opnd.rs
@@ -84,6 +84,14 @@ impl A64Opnd {
             _ => false
         }
     }
+
+    /// Unwrap a register from an operand.
+    pub fn unwrap_reg(&self) -> A64Reg {
+        match self {
+            A64Opnd::Reg(reg) => *reg,
+            _ => panic!("Expected register operand")
+        }
+    }
 }
 
 // argument registers
@@ -102,6 +110,8 @@ pub const X12_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 12 };
 pub const X13_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 13 };
 pub const X14_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 14 };
 pub const X15_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 15 };
+pub const X16_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 16 };
+pub const X17_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 17 };
 
 // callee-save registers
 pub const X19_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 19 };
diff --git a/yjit/src/backend/arm64/mod.rs b/yjit/src/backend/arm64/mod.rs
index 0a5068be58..79dff530d1 100644
--- a/yjit/src/backend/arm64/mod.rs
+++ b/yjit/src/backend/arm64/mod.rs
@@ -70,7 +70,8 @@ impl Assembler
 
     // A special scratch register for intermediate processing.
     // This register is caller-saved (so we don't have to save it before using it)
-    const SCRATCH0: A64Opnd = A64Opnd::Reg(X15_REG);
+    const SCRATCH0: A64Opnd = A64Opnd::Reg(X16_REG);
+    const SCRATCH1: A64Opnd = A64Opnd::Reg(X17_REG);
 
     /// Get the list of registers from which we will allocate on this platform
     /// These are caller-saved registers
@@ -373,17 +374,12 @@ impl Assembler
                     asm.csel_ge(opnd0, opnd1);
                 },
                 Insn::IncrCounter { mem, value } => {
-                    // We'll use LDADD later which only works with registers
-                    // ... Load pointer into register
-                    let counter_addr = split_lea_operand(asm, mem);
-
-                    // Load immediates into a register
-                    let addend = match value {
-                        opnd @ Opnd::Imm(_) | opnd @ Opnd::UImm(_) => asm.load(opnd),
-                        opnd => opnd,
+                    let counter_addr = match mem {
+                        Opnd::Mem(_) => split_lea_operand(asm, mem),
+                        _ => mem
                     };
 
-                    asm.incr_counter(counter_addr, addend);
+                    asm.incr_counter(counter_addr, value);
                 },
                 Insn::JmpOpnd(opnd) => {
                     if let Opnd::Mem(_) = opnd {
@@ -936,7 +932,21 @@ impl Assembler
                     emit_conditional_jump::<{Condition::VS}>(cb, *target);
                 },
                 Insn::IncrCounter { mem, value } => {
-                    ldaddal(cb, value.into(), value.into(), mem.into());
+                    let label = cb.new_label("incr_counter_loop".to_string());
+                    cb.write_label(label);
+
+                    ldaxr(cb, Self::SCRATCH0, mem.into());
+                    add(cb, Self::SCRATCH0, Self::SCRATCH0, value.into());
+
+                    // The status register that gets used to track whether or
+                    // not the store was successful must be 32 bits. Since we
+                    // store the SCRATCH registers as their 64-bit versions, we
+                    // need to rewrap it here.
+                    let status = A64Opnd::Reg(Self::SCRATCH1.unwrap_reg().with_num_bits(32));
+                    stlxr(cb, status, Self::SCRATCH0, mem.into());
+
+                    cmp(cb, Self::SCRATCH1, A64Opnd::new_uimm(0));
+                    emit_conditional_jump::<{Condition::NE}>(cb, Target::Label(label));
                 },
                 Insn::Breakpoint => {
                     brk(cb, A64Opnd::None);
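
Reviewer note, not part of the patch: the two opcode constants asserted in the new
unit tests (0xc85ffc10 and 0xc811fc10) can be re-derived from the bit layout that
the From<LoadStoreExclusive> for u32 impl encodes. The standalone Rust sketch
below does exactly that; the encode helper is purely illustrative and is not
something the patch adds, but the field positions, the FAMILY value, and the
register numbers (rt = 16, rs = 17, rn = 0) all come from the patch itself.

    // Illustrative sketch: mirrors the packing done by the From<LoadStoreExclusive> for u32 impl.
    fn encode(size: u32, op: u32, rs: u32, rn: u32, rt: u32) -> u32 {
        let family: u32 = 0b0100; // loads-and-stores group, same as the FAMILY const in the patch
        (size << 30) | (family << 25) | (op << 22) | (rs << 16) | (0b111111 << 10) | (rn << 5) | rt
    }

    fn main() {
        // LDAXR X16, [X0]: size = 0b11 (64-bit), op = 1 (load), rs = 31 (the zero register)
        assert_eq!(0xc85ffc10, encode(0b11, 1, 31, 0, 16));
        // STLXR W17, X16, [X0]: size = 0b11 (64-bit), op = 0 (store), rs = 17 (status register)
        assert_eq!(0xc811fc10, encode(0b11, 0, 17, 0, 16));
    }

The ldaxr constructor hard-codes rs: 31 because a load-exclusive has no status
result to report (the rs field doc comment calls this the zero register), while
stlxr threads the caller's status register through, which is why the backend
rewraps SCRATCH1 as a 32-bit register before passing it in.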