From b8846dd2f8042fc13a0f5ae17e2e2a6f400074dd Mon Sep 17 00:00:00 2001
From: Kevin Newton <kddnewton@gmail.com>
Date: Tue, 9 Aug 2022 10:27:21 -0400
Subject: [PATCH] Load mem displacement when necessary on AArch64
 (https://github.com/Shopify/ruby/pull/382)

* LDR instruction for AArch64

* Split loads in arm64_split when memory address displacements do not fit
---
 yjit/src/asm/arm64/inst/load_literal.rs  |   6 +-
 yjit/src/asm/arm64/inst/load_register.rs | 108 ++++++++++++++++
 yjit/src/asm/arm64/inst/mod.rs           |   2 +
 yjit/src/asm/arm64/mod.rs                |  46 +++++--
 yjit/src/backend/arm64/mod.rs            | 152 +++++++++++++++++------
 5 files changed, 261 insertions(+), 53 deletions(-)
 create mode 100644 yjit/src/asm/arm64/inst/load_register.rs
diff --git a/yjit/src/asm/arm64/inst/load_literal.rs b/yjit/src/asm/arm64/inst/load_literal.rs
index a49130c3eb..d2a5d57eea 100644
--- a/yjit/src/asm/arm64/inst/load_literal.rs
+++ b/yjit/src/asm/arm64/inst/load_literal.rs
@@ -39,7 +39,7 @@ pub struct LoadLiteral {
 impl LoadLiteral {
     /// LDR (load literal)
     /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--literal---Load-Register--literal--?lang=en
-    pub fn ldr(rt: u8, imm19: i32, num_bits: u8) -> Self {
+    pub fn ldr_literal(rt: u8, imm19: i32, num_bits: u8) -> Self {
         Self { rt, imm19, opc: num_bits.into() }
     }
 }
@@ -75,14 +75,14 @@ mod tests {
 
     #[test]
     fn test_ldr_positive() {
-        let inst = LoadLiteral::ldr(0, 5, 64);
+        let inst = LoadLiteral::ldr_literal(0, 5, 64);
         let result: u32 = inst.into();
         assert_eq!(0x580000a0, result);
     }
 
     #[test]
     fn test_ldr_negative() {
-        let inst = LoadLiteral::ldr(0, -5, 64);
+        let inst = LoadLiteral::ldr_literal(0, -5, 64);
         let result: u32 = inst.into();
         assert_eq!(0x58ffff60, result);
     }
diff --git a/yjit/src/asm/arm64/inst/load_register.rs b/yjit/src/asm/arm64/inst/load_register.rs
new file mode 100644
index 0000000000..3426b9ba5f
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/load_register.rs
@@ -0,0 +1,108 @@
+/// Whether or not to shift the register.
+enum S {
+    Shift = 1,
+    NoShift = 0
+}
+
+/// The option for this instruction.
+enum Option {
+    UXTW = 0b010,
+    LSL = 0b011,
+    SXTW = 0b110,
+    SXTX = 0b111
+}
+
+/// The size of the operands of this instruction.
+enum Size {
+    Size32 = 0b10,
+    Size64 = 0b11
+}
+
+/// A convenience function so that we can convert the number of bits of an
+/// register operand directly into a Size enum variant.
+impl From<u8> for Size {
+    fn from(num_bits: u8) -> Self {
+        match num_bits {
+            64 => Size::Size64,
+            32 => Size::Size32,
+            _ => panic!("Invalid number of bits: {}", num_bits)
+        }
+    }
+}
+
+/// The struct that represents an A64 load instruction that can be encoded.
+///
+/// LDR
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |        1  1    1  0  0  0    0  1  1                                   1  0                                   |
+/// | size.                                rm..............   option.. S          rn.............. rt.............. |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct LoadRegister {
+    /// The number of the register to load the value into.
+    rt: u8,
+
+    /// The base register with which to form the address.
+    rn: u8,
+
+    /// Whether or not to shift the value of the register.
+    s: S,
+
+    /// The option associated with this instruction that controls the shift.
+    option: Option,
+
+    /// The number of the offset register.
+    rm: u8,
+
+    /// The size of the operands.
+    size: Size
+}
+
+impl LoadRegister {
+    /// LDR
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--register---Load-Register--register--?lang=en
+    pub fn ldr(rt: u8, rn: u8, rm: u8, num_bits: u8) -> Self {
+        Self { rt, rn, s: S::NoShift, option: Option::LSL, rm, size: num_bits.into() }
+    }
+}
+
+/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Loads-and-Stores?lang=en
+const FAMILY: u32 = 0b0100;
+
+impl From<LoadRegister> for u32 {
+    /// Convert an instruction into a 32-bit value.
+    fn from(inst: LoadRegister) -> Self {
+        0
+        | ((inst.size as u32) << 30)
+        | (0b11 << 28)
+        | (FAMILY << 25)
+        | (0b11 << 21)
+        | ((inst.rm as u32) << 16)
+        | ((inst.option as u32) << 13)
+        | ((inst.s as u32) << 12)
+        | (0b10 << 10)
+        | ((inst.rn as u32) << 5)
+        | (inst.rt as u32)
+    }
+}
+
+impl From<LoadRegister> for [u8; 4] {
+    /// Convert an instruction into a 4 byte array.
+    fn from(inst: LoadRegister) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_ldr() {
+        let inst = LoadRegister::ldr(0, 1, 2, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xf8626820, result);
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/mod.rs b/yjit/src/asm/arm64/inst/mod.rs
index 42df2d137a..ab41464013 100644
--- a/yjit/src/asm/arm64/inst/mod.rs
+++ b/yjit/src/asm/arm64/inst/mod.rs
@@ -10,6 +10,7 @@ mod conditional;
 mod data_imm;
 mod data_reg;
 mod load_literal;
+mod load_register;
 mod load_store;
 mod logical_imm;
 mod logical_reg;
@@ -30,6 +31,7 @@ pub use conditional::Conditional;
 pub use data_imm::DataImm;
 pub use data_reg::DataReg;
 pub use load_literal::LoadLiteral;
+pub use load_register::LoadRegister;
 pub use load_store::LoadStore;
 pub use logical_imm::LogicalImm;
 pub use logical_reg::LogicalReg;
diff --git a/yjit/src/asm/arm64/mod.rs b/yjit/src/asm/arm64/mod.rs
index 68be36c256..93b44dba4b 100644
--- a/yjit/src/asm/arm64/mod.rs
+++ b/yjit/src/asm/arm64/mod.rs
@@ -374,11 +374,26 @@ pub fn ldp_post(cb: &mut CodeBlock, rt1: A64Opnd, rt2: A64Opnd, rn: A64Opnd) {
     cb.write_bytes(&bytes);
 }
 
+/// LDR - load a memory address into a register with a register offset
+pub fn ldr(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd, rm: A64Opnd) {
+    let bytes: [u8; 4] = match (rt, rn, rm) {
+        (A64Opnd::Reg(rt), A64Opnd::Reg(rn), A64Opnd::Reg(rm)) => {
+            assert!(rt.num_bits == rn.num_bits, "Expected registers to be the same size");
+            assert!(rn.num_bits == rm.num_bits, "Expected registers to be the same size");
+
+            LoadRegister::ldr(rt.reg_no, rn.reg_no, rm.reg_no, rt.num_bits).into()
+        },
+        _ => panic!("Invalid operand combination to ldr instruction.")
+    };
+
+    cb.write_bytes(&bytes);
+}
+
 /// LDR - load a PC-relative memory address into a register
-pub fn ldr(cb: &mut CodeBlock, rt: A64Opnd, rn: i32) {
+pub fn ldr_literal(cb: &mut CodeBlock, rt: A64Opnd, rn: i32) {
     let bytes: [u8; 4] = match rt {
         A64Opnd::Reg(rt) => {
-            LoadLiteral::ldr(rt.reg_no, rn, rt.num_bits).into()
+            LoadLiteral::ldr_literal(rt.reg_no, rn, rt.num_bits).into()
         },
         _ => panic!("Invalid operand combination to ldr instruction."),
     };
@@ -386,12 +401,18 @@ pub fn ldr(cb: &mut CodeBlock, rt: A64Opnd, rn: i32) {
     cb.write_bytes(&bytes);
 }
 
+/// Whether or not a memory address displacement fits into the maximum number of
+/// bits such that it can be used without loading it into a register first.
+pub fn mem_disp_fits_bits(disp: i32) -> bool {
+    imm_fits_bits(disp.into(), 9)
+}
+
 /// LDR (post-index) - load a register from memory, update the base pointer after loading it
 pub fn ldr_post(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd) {
     let bytes: [u8; 4] = match (rt, rn) {
         (A64Opnd::Reg(rt), A64Opnd::Mem(rn)) => {
             assert!(rt.num_bits == rn.num_bits, "All operands must be of the same size.");
-            assert!(imm_fits_bits(rn.disp.into(), 9), "The displacement must be 9 bits or less.");
+            assert!(mem_disp_fits_bits(rn.disp), "The displacement must be 9 bits or less.");
 
             LoadStore::ldr_post(rt.reg_no, rn.base_reg_no, rn.disp as i16, rt.num_bits).into()
         },
@@ -406,7 +427,7 @@ pub fn ldr_pre(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd) {
     let bytes: [u8; 4] = match (rt, rn) {
         (A64Opnd::Reg(rt), A64Opnd::Mem(rn)) => {
             assert!(rt.num_bits == rn.num_bits, "All operands must be of the same size.");
-            assert!(imm_fits_bits(rn.disp.into(), 9), "The displacement must be 9 bits or less.");
+            assert!(mem_disp_fits_bits(rn.disp), "The displacement must be 9 bits or less.");
 
             LoadStore::ldr_pre(rt.reg_no, rn.base_reg_no, rn.disp as i16, rt.num_bits).into()
         },
@@ -426,7 +447,7 @@ pub fn ldur(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd) {
         },
         (A64Opnd::Reg(rt), A64Opnd::Mem(rn)) => {
             assert!(rt.num_bits == rn.num_bits, "Expected registers to be the same size");
-            assert!(imm_fits_bits(rn.disp.into(), 9), "Expected displacement to be 9 bits or less");
+            assert!(mem_disp_fits_bits(rn.disp), "Expected displacement to be 9 bits or less");
 
             LoadStore::ldur(rt.reg_no, rn.base_reg_no, rn.disp as i16, rt.num_bits).into()
         },
@@ -441,7 +462,7 @@ pub fn ldursw(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd) {
     let bytes: [u8; 4] = match (rt, rn) {
         (A64Opnd::Reg(rt), A64Opnd::Mem(rn)) => {
             assert!(rt.num_bits == rn.num_bits, "Expected registers to be the same size");
-            assert!(imm_fits_bits(rn.disp.into(), 9), "Expected displacement to be 9 bits or less");
+            assert!(mem_disp_fits_bits(rn.disp), "Expected displacement to be 9 bits or less");
 
             LoadStore::ldursw(rt.reg_no, rn.base_reg_no, rn.disp as i16).into()
         },
@@ -670,7 +691,7 @@ pub fn str_post(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd) {
     let bytes: [u8; 4] = match (rt, rn) {
         (A64Opnd::Reg(rt), A64Opnd::Mem(rn)) => {
             assert!(rt.num_bits == rn.num_bits, "All operands must be of the same size.");
-            assert!(imm_fits_bits(rn.disp.into(), 9), "The displacement must be 9 bits or less.");
+            assert!(mem_disp_fits_bits(rn.disp), "The displacement must be 9 bits or less.");
 
             LoadStore::str_post(rt.reg_no, rn.base_reg_no, rn.disp as i16, rt.num_bits).into()
         },
@@ -685,7 +706,7 @@ pub fn str_pre(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd) {
     let bytes: [u8; 4] = match (rt, rn) {
         (A64Opnd::Reg(rt), A64Opnd::Mem(rn)) => {
             assert!(rt.num_bits == rn.num_bits, "All operands must be of the same size.");
-            assert!(imm_fits_bits(rn.disp.into(), 9), "The displacement must be 9 bits or less.");
+            assert!(mem_disp_fits_bits(rn.disp), "The displacement must be 9 bits or less.");
 
             LoadStore::str_pre(rt.reg_no, rn.base_reg_no, rn.disp as i16, rt.num_bits).into()
         },
@@ -700,7 +721,7 @@ pub fn stur(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd) {
     let bytes: [u8; 4] = match (rt, rn) {
         (A64Opnd::Reg(rt), A64Opnd::Mem(rn)) => {
             assert!(rt.num_bits == rn.num_bits, "Expected registers to be the same size");
-            assert!(imm_fits_bits(rn.disp.into(), 9), "Expected displacement to be 9 bits or less");
+            assert!(mem_disp_fits_bits(rn.disp), "Expected displacement to be 9 bits or less");
 
             LoadStore::stur(rt.reg_no, rn.base_reg_no, rn.disp as i16, rt.num_bits).into()
         },
@@ -1024,7 +1045,12 @@ mod tests {
 
     #[test]
     fn test_ldr() {
-        check_bytes("40010058", |cb| ldr(cb, X0, 10));
+        check_bytes("6a696cf8", |cb| ldr(cb, X10, X11, X12));
+    }
+
+    #[test]
+    fn test_ldr_literal() {
+        check_bytes("40010058", |cb| ldr_literal(cb, X0, 10));
     }
 
     #[test]
diff --git a/yjit/src/backend/arm64/mod.rs b/yjit/src/backend/arm64/mod.rs
index db2a30aec0..196523bf74 100644
--- a/yjit/src/backend/arm64/mod.rs
+++ b/yjit/src/backend/arm64/mod.rs
@@ -84,13 +84,66 @@ impl Assembler
     /// have no memory operands.
     fn arm64_split(mut self) -> Assembler
     {
+        /// When we're attempting to load a memory address into a register, the
+        /// displacement must fit into the maximum number of bits for an Op::Add
+        /// immediate. If it doesn't, we have to load the displacement into a
+        /// register first.
+        fn split_lea_operand(asm: &mut Assembler, opnd: Opnd) -> Opnd {
+            match opnd {
+                Opnd::Mem(Mem { base, disp, num_bits }) => {
+                    if disp >= 0 && ShiftedImmediate::try_from(disp as u64).is_ok() {
+                        asm.lea(opnd)
+                    } else {
+                        let disp = asm.load(Opnd::Imm(disp.into()));
+                        let reg = match base {
+                            MemBase::Reg(reg_no) => Opnd::Reg(Reg { reg_no, num_bits }),
+                            MemBase::InsnOut(idx) => Opnd::InsnOut { idx, num_bits }
+                        };
+
+                        asm.add(reg, disp)
+                    }
+                },
+                _ => unreachable!("Op::Lea only accepts Opnd::Mem operands.")
+            }
+        }
+
+        /// When you're storing a register into a memory location or loading a
+        /// memory location into a register, the displacement from the base
+        /// register of the memory location must fit into 9 bits. If it doesn't,
+        /// then we need to load that memory address into a register first.
+        fn split_memory_address(asm: &mut Assembler, opnd: Opnd) -> Opnd {
+            match opnd {
+                Opnd::Mem(mem) => {
+                    if mem_disp_fits_bits(mem.disp) {
+                        opnd
+                    } else {
+                        let base = split_lea_operand(asm, opnd);
+                        Opnd::mem(64, base, 0)
+                    }
+                },
+                _ => unreachable!("Can only split memory addresses.")
+            }
+        }
+
+        /// Any memory operands you're sending into an Op::Load instruction need
+        /// to be split in case their displacement doesn't fit into 9 bits.
+        fn split_load_operand(asm: &mut Assembler, opnd: Opnd) -> Opnd {
+            match opnd {
+                Opnd::Mem(_) => {
+                    let split_opnd = split_memory_address(asm, opnd);
+                    asm.load(split_opnd)
+                },
+                _ => asm.load(opnd)
+            }
+        }
+
         /// Operands that take the place of bitmask immediates must follow a
         /// certain encoding. In this function we ensure that those operands
         /// do follow that encoding, and if they don't then we load them first.
         fn split_bitmask_immediate(asm: &mut Assembler, opnd: Opnd) -> Opnd {
             match opnd {
                 Opnd::Reg(_) | Opnd::InsnOut { .. } => opnd,
-                Opnd::Mem(_) => asm.load(opnd),
+                Opnd::Mem(_) => split_load_operand(asm, opnd),
                 Opnd::Imm(imm) => {
                     if imm <= 0 {
                         asm.load(opnd)
@@ -116,7 +169,8 @@ impl Assembler
         fn split_shifted_immediate(asm: &mut Assembler, opnd: Opnd) -> Opnd {
             match opnd {
                 Opnd::Reg(_) | Opnd::InsnOut { .. } => opnd,
-                Opnd::Mem(_) | Opnd::Imm(_) => asm.load(opnd),
+                Opnd::Mem(_) => split_load_operand(asm, opnd),
+                Opnd::Imm(_) => asm.load(opnd),
                 Opnd::UImm(uimm) => {
                     if ShiftedImmediate::try_from(uimm).is_ok() {
                         opnd
@@ -128,24 +182,6 @@ impl Assembler
             }
         }
 
-        /// When you're storing a register into a memory location, the
-        /// displacement from the base register of the memory location must fit
-        /// into 9 bits. If it doesn't, then we need to load that memory address
-        /// into a register first.
-        fn split_store(asm: &mut Assembler, opnd: Opnd) -> Opnd {
-            match opnd {
-                Opnd::Mem(mem) => {
-                    if imm_fits_bits(mem.disp.into(), 9) {
-                        opnd
-                    } else {
-                        let base = asm.lea(opnd);
-                        Opnd::mem(64, base, 0)
-                    }
-                },
-                _ => unreachable!("Can only store memory addresses.")
-            }
-        }
-
         self.forward_pass(|asm, index, op, opnds, target, text, pos_marker, original_opnds| {
             // Load all Value operands into registers that aren't already a part
             // of Load instructions.
@@ -172,7 +208,7 @@ impl Assembler
                             asm.add(reg_opnd, opnd1);
                         },
                         _ => {
-                            let opnd0 = asm.load(opnds[0]);
+                            let opnd0 = split_load_operand(asm, opnds[0]);
                             let opnd1 = split_shifted_immediate(asm, opnds[1]);
                             asm.add(opnd0, opnd1);
                         }
@@ -189,7 +225,7 @@ impl Assembler
                             asm.push_insn(op, vec![reg_opnd, opnd1], target, text, pos_marker);
                         },
                         _ => {
-                            let opnd0 = asm.load(opnds[0]);
+                            let opnd0 = split_load_operand(asm, opnds[0]);
                             let opnd1 = split_bitmask_immediate(asm, opnds[1]);
                             asm.push_insn(op, vec![opnd0, opnd1], target, text, pos_marker);
                         }
@@ -204,7 +240,7 @@ impl Assembler
                     // Note: the iteration order is reversed to avoid corrupting x0,
                     // which is both the return value and first argument register
                     for (idx, opnd) in opnds.into_iter().enumerate().rev() {
-                        let value = asm.load(opnd);
+                        let value = split_load_operand(asm, opnd);
                         asm.mov(C_ARG_OPNDS[idx], value);
                     }
 
@@ -215,16 +251,15 @@ impl Assembler
                 Op::Cmp => {
                     let opnd0 = match opnds[0] {
                         Opnd::Reg(_) | Opnd::InsnOut { .. } => opnds[0],
-                        _ => asm.load(opnds[0])
+                        _ => split_load_operand(asm, opnds[0])
                     };
 
                     let opnd1 = split_shifted_immediate(asm, opnds[1]);
-
-                    asm.push_insn(op, vec![opnd0, opnd1], target, text, pos_marker);
+                    asm.cmp(opnd0, opnd1);
                 },
                 Op::CRet => {
                     if opnds[0] != Opnd::Reg(C_RET_REG) {
-                        let value = asm.load(opnds[0]);
+                        let value = split_load_operand(asm, opnds[0]);
                         asm.mov(C_RET_OPND, value);
                     }
                     asm.cret(C_RET_OPND);
@@ -234,7 +269,7 @@ impl Assembler
                     let new_opnds = opnds.into_iter().map(|opnd| {
                         match opnd {
                             Opnd::Reg(_) | Opnd::InsnOut { .. } => opnd,
-                            _ => asm.load(opnd)
+                            _ => split_load_operand(asm, opnd)
                         }
                     }).collect();
 
@@ -243,7 +278,7 @@ impl Assembler
                 Op::IncrCounter => {
                     // We'll use LDADD later which only works with registers
                     // ... Load pointer into register
-                    let counter_addr = asm.lea(opnds[0]);
+                    let counter_addr = split_lea_operand(asm, opnds[0]);
 
                     // Load immediates into a register
                     let addend = match opnds[1] {
@@ -255,12 +290,15 @@ impl Assembler
                 },
                 Op::JmpOpnd => {
                     if let Opnd::Mem(_) = opnds[0] {
-                        let opnd0 = asm.load(opnds[0]);
+                        let opnd0 = split_load_operand(asm, opnds[0]);
                         asm.jmp_opnd(opnd0);
                     } else {
                         asm.jmp_opnd(opnds[0]);
                     }
                 },
+                Op::Load => {
+                    split_load_operand(asm, opnds[0]);
+                },
                 Op::LoadSExt => {
                     match opnds[0] {
                         // We only want to sign extend if the operand is a
@@ -295,7 +333,7 @@ impl Assembler
                     // we'll use the normal mov instruction.
                     match opnds[0] {
                         Opnd::Mem(_) => {
-                            let opnd0 = split_store(asm, opnds[0]);
+                            let opnd0 = split_memory_address(asm, opnds[0]);
                             asm.store(opnd0, value);
                         },
                         Opnd::Reg(_) => {
@@ -308,7 +346,7 @@ impl Assembler
                     // The value that is being negated must be in a register, so
                     // if we get anything else we need to load it first.
                     let opnd0 = match opnds[0] {
-                        Opnd::Mem(_) => asm.load(opnds[0]),
+                        Opnd::Mem(_) => split_load_operand(asm, opnds[0]),
                         _ => opnds[0]
                     };
 
@@ -318,13 +356,13 @@ impl Assembler
                     // The displacement for the STUR instruction can't be more
                     // than 9 bits long. If it's longer, we need to load the
                     // memory address into a register first.
-                    let opnd0 = split_store(asm, opnds[0]);
+                    let opnd0 = split_memory_address(asm, opnds[0]);
 
                     // The value being stored must be in a register, so if it's
                     // not already one we'll load it first.
                     let opnd1 = match opnds[1] {
                         Opnd::Reg(_) | Opnd::InsnOut { .. } => opnds[1],
-                        _ => asm.load(opnds[1])
+                        _ => split_load_operand(asm, opnds[1])
                     };
 
                     asm.store(opnd0, opnd1);
@@ -332,19 +370,18 @@ impl Assembler
                 Op::Sub => {
                     let opnd0 = match opnds[0] {
                         Opnd::Reg(_) | Opnd::InsnOut { .. } => opnds[0],
-                        _ => asm.load(opnds[0])
+                        _ => split_load_operand(asm, opnds[0])
                     };
 
                     let opnd1 = split_shifted_immediate(asm, opnds[1]);
-
-                    asm.push_insn(op, vec![opnd0, opnd1], target, text, pos_marker);
+                    asm.sub(opnd0, opnd1);
                 },
                 Op::Test => {
                     // The value being tested must be in a register, so if it's
                     // not already one we'll load it first.
                     let opnd0 = match opnds[0] {
                         Opnd::Reg(_) | Opnd::InsnOut { .. } => opnds[0],
-                        _ => asm.load(opnds[0])
+                        _ => split_load_operand(asm, opnds[0])
                     };
 
                     // The second value must be either a register or an
@@ -352,7 +389,6 @@ impl Assembler
                     // immediate. If it's not one of those, we'll need to load
                     // it first.
                     let opnd1 = split_bitmask_immediate(asm, opnds[1]);
-
                     asm.test(opnd0, opnd1);
                 },
                 _ => {
@@ -611,7 +647,7 @@ impl Assembler
                             // references to GC'd Value operands. If the value
                             // being loaded is a heap object, we'll report that
                             // back out to the gc_offsets list.
-                            ldr(cb, insn.out.into(), 2);
+                            ldr_literal(cb, insn.out.into(), 2);
                             b(cb, A64Opnd::new_imm(1 + (SIZEOF_VALUE as i64) / 4));
                             cb.write_bytes(&value.as_u64().to_le_bytes());
 
@@ -901,6 +937,42 @@ mod tests {
         asm.compile_with_num_regs(&mut cb, 1);
     }
 
+    #[test]
+    fn test_emit_load_mem_disp_fits_into_load() {
+        let (mut asm, mut cb) = setup_asm();
+
+        let opnd = asm.load(Opnd::mem(64, SP, 0));
+        asm.store(Opnd::mem(64, SP, 0), opnd);
+        asm.compile_with_num_regs(&mut cb, 1);
+
+        // Assert that two instructions were written: LDUR and STUR.
+        assert_eq!(8, cb.get_write_pos());
+    }
+
+    #[test]
+    fn test_emit_load_mem_disp_fits_into_add() {
+        let (mut asm, mut cb) = setup_asm();
+
+        let opnd = asm.load(Opnd::mem(64, SP, 1 << 10));
+        asm.store(Opnd::mem(64, SP, 0), opnd);
+        asm.compile_with_num_regs(&mut cb, 1);
+
+        // Assert that three instructions were written: ADD, LDUR, and STUR.
+        assert_eq!(12, cb.get_write_pos());
+    }
+
+    #[test]
+    fn test_emit_load_mem_disp_does_not_fit_into_add() {
+        let (mut asm, mut cb) = setup_asm();
+
+        let opnd = asm.load(Opnd::mem(64, SP, 1 << 12 | 1));
+        asm.store(Opnd::mem(64, SP, 0), opnd);
+        asm.compile_with_num_regs(&mut cb, 1);
+
+        // Assert that three instructions were written: MOVZ, ADD, LDUR, and STUR.
+        assert_eq!(16, cb.get_write_pos());
+    }
+
     #[test]
     fn test_emit_or() {
         let (mut asm, mut cb) = setup_asm();