diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs index 380d2dfe0bc1..76aaf4ac9509 100644 --- a/cranelift/codegen/src/isa/aarch64/abi.rs +++ b/cranelift/codegen/src/isa/aarch64/abi.rs @@ -135,12 +135,28 @@ impl Into for StackAMode { // Returns the size of stack space needed to store the // `int_reg` and `vec_reg`. fn saved_reg_stack_size( + call_conv: isa::CallConv, int_reg: &[Writable], vec_reg: &[Writable], ) -> (usize, usize) { // Round up to multiple of 2, to keep 16-byte stack alignment. let int_save_bytes = (int_reg.len() + (int_reg.len() & 1)) * 8; - let vec_save_bytes = vec_reg.len() * 16; + // The Baldrdash ABIs require saving and restoring the whole 16-byte + // SIMD & FP registers, so the necessary stack space is always a + // multiple of the mandatory 16-byte stack alignment. However, the + // Procedure Call Standard for the Arm 64-bit Architecture (AAPCS64, + // including several related ABIs such as the one used by Windows) + // mandates saving only the bottom 8 bytes of the vector registers, + // so in that case we round up the number of registers to ensure proper + // stack alignment (similarly to the situation with `int_reg`). + let vec_reg_size = if call_conv.extends_baldrdash() { 16 } else { 8 }; + let vec_save_padding = if call_conv.extends_baldrdash() { + 0 + } else { + vec_reg.len() & 1 + }; + let vec_save_bytes = (vec_reg.len() + vec_save_padding) * vec_reg_size; + (int_save_bytes, vec_save_bytes) } @@ -591,7 +607,8 @@ impl ABIMachineSpec for AArch64MachineDeps { let mut insts = SmallVec::new(); let (clobbered_int, clobbered_vec) = get_regs_saved_in_prologue(call_conv, clobbers); - let (int_save_bytes, vec_save_bytes) = saved_reg_stack_size(&clobbered_int, &clobbered_vec); + let (int_save_bytes, vec_save_bytes) = + saved_reg_stack_size(call_conv, &clobbered_int, &clobbered_vec); let total_save_bytes = int_save_bytes + vec_save_bytes; let clobber_size = total_save_bytes as i32; @@ -620,59 +637,170 @@ impl ABIMachineSpec for AArch64MachineDeps { // `frame_offset` tracks offset above start-of-clobbers for unwind-info // purposes. let mut clobber_offset = clobber_size as u32; - for reg_pair in clobbered_int.chunks(2) { - let (r1, r2) = if reg_pair.len() == 2 { - // .to_reg().to_reg(): Writable --> RealReg --> Reg - (reg_pair[0].to_reg().to_reg(), reg_pair[1].to_reg().to_reg()) - } else { - (reg_pair[0].to_reg().to_reg(), zero_reg()) - }; + let clobber_offset_change = 16; + let iter = clobbered_int.chunks_exact(2); + + if let [rd] = iter.remainder() { + let rd = rd.to_reg().to_reg(); + + debug_assert_eq!(rd.get_class(), RegClass::I64); + // str rd, [sp, #-16]! + insts.push(Inst::Store64 { + rd, + mem: AMode::PreIndexed( + writable_stack_reg(), + SImm9::maybe_from_i64(-clobber_offset_change).unwrap(), + ), + flags: MemFlags::trusted(), + }); + + if flags.unwind_info() { + clobber_offset -= clobber_offset_change as u32; + insts.push(Inst::Unwind { + inst: UnwindInst::SaveReg { + clobber_offset, + reg: rd.to_real_reg(), + }, + }); + } + } + + let mut iter = iter.rev(); - debug_assert!(r1.get_class() == RegClass::I64); - debug_assert!(r2.get_class() == RegClass::I64); + while let Some([rt, rt2]) = iter.next() { + // .to_reg().to_reg(): Writable --> RealReg --> Reg + let rt = rt.to_reg().to_reg(); + let rt2 = rt2.to_reg().to_reg(); - // stp r1, r2, [sp, #-16]! + debug_assert!(rt.get_class() == RegClass::I64); + debug_assert!(rt2.get_class() == RegClass::I64); + + // stp rt, rt2, [sp, #-16]! 
insts.push(Inst::StoreP64 { - rt: r1, - rt2: r2, + rt, + rt2, mem: PairAMode::PreIndexed( writable_stack_reg(), - SImm7Scaled::maybe_from_i64(-16, types::I64).unwrap(), + SImm7Scaled::maybe_from_i64(-clobber_offset_change, types::I64).unwrap(), ), flags: MemFlags::trusted(), }); + if flags.unwind_info() { - clobber_offset -= 8; - if r2 != zero_reg() { - insts.push(Inst::Unwind { - inst: UnwindInst::SaveReg { - clobber_offset, - reg: r2.to_real_reg(), - }, - }); + clobber_offset -= clobber_offset_change as u32; + insts.push(Inst::Unwind { + inst: UnwindInst::SaveReg { + clobber_offset, + reg: rt.to_real_reg(), + }, + }); + insts.push(Inst::Unwind { + inst: UnwindInst::SaveReg { + clobber_offset: clobber_offset + (clobber_offset_change / 2) as u32, + reg: rt2.to_real_reg(), + }, + }); + } + } + + let store_vec_reg = |rd| { + if call_conv.extends_baldrdash() { + Inst::FpuStore128 { + rd, + mem: AMode::PreIndexed( + writable_stack_reg(), + SImm9::maybe_from_i64(-clobber_offset_change).unwrap(), + ), + flags: MemFlags::trusted(), + } + } else { + Inst::FpuStore64 { + rd, + mem: AMode::PreIndexed( + writable_stack_reg(), + SImm9::maybe_from_i64(-clobber_offset_change).unwrap(), + ), + flags: MemFlags::trusted(), } - clobber_offset -= 8; + } + }; + let iter = clobbered_vec.chunks_exact(2); + + if let [rd] = iter.remainder() { + let rd = rd.to_reg().to_reg(); + + debug_assert_eq!(rd.get_class(), RegClass::V128); + insts.push(store_vec_reg(rd)); + + if flags.unwind_info() { + clobber_offset -= clobber_offset_change as u32; insts.push(Inst::Unwind { inst: UnwindInst::SaveReg { clobber_offset, - reg: r1.to_real_reg(), + reg: rd.to_real_reg(), }, }); } } - for reg in clobbered_vec.iter() { - insts.push(Inst::FpuStore128 { - rd: reg.to_reg().to_reg(), - mem: AMode::PreIndexed(writable_stack_reg(), SImm9::maybe_from_i64(-16).unwrap()), - flags: MemFlags::trusted(), - }); + let store_vec_reg_pair = |rt, rt2| { + if call_conv.extends_baldrdash() { + let clobber_offset_change = 32; + + ( + Inst::FpuStoreP128 { + rt, + rt2, + mem: PairAMode::PreIndexed( + writable_stack_reg(), + SImm7Scaled::maybe_from_i64(-clobber_offset_change, I8X16).unwrap(), + ), + flags: MemFlags::trusted(), + }, + clobber_offset_change as u32, + ) + } else { + let clobber_offset_change = 16; + + ( + Inst::FpuStoreP64 { + rt, + rt2, + mem: PairAMode::PreIndexed( + writable_stack_reg(), + SImm7Scaled::maybe_from_i64(-clobber_offset_change, F64).unwrap(), + ), + flags: MemFlags::trusted(), + }, + clobber_offset_change as u32, + ) + } + }; + let mut iter = iter.rev(); + + while let Some([rt, rt2]) = iter.next() { + let rt = rt.to_reg().to_reg(); + let rt2 = rt2.to_reg().to_reg(); + + debug_assert_eq!(rt.get_class(), RegClass::V128); + debug_assert_eq!(rt2.get_class(), RegClass::V128); + + let (inst, clobber_offset_change) = store_vec_reg_pair(rt, rt2); + + insts.push(inst); + if flags.unwind_info() { - clobber_offset -= 16; + clobber_offset -= clobber_offset_change; insts.push(Inst::Unwind { inst: UnwindInst::SaveReg { clobber_offset, - reg: reg.to_reg(), + reg: rt.to_real_reg(), + }, + }); + insts.push(Inst::Unwind { + inst: UnwindInst::SaveReg { + clobber_offset: clobber_offset + clobber_offset_change / 2, + reg: rt2.to_real_reg(), }, }); } @@ -700,31 +828,83 @@ impl ABIMachineSpec for AArch64MachineDeps { insts.extend(Self::gen_sp_reg_adjust(fixed_frame_storage_size as i32)); } - for reg in clobbered_vec.iter().rev() { - insts.push(Inst::FpuLoad128 { - rd: Writable::from_reg(reg.to_reg().to_reg()), - mem: 
AMode::PostIndexed(writable_stack_reg(), SImm9::maybe_from_i64(16).unwrap()), - flags: MemFlags::trusted(), - }); + let load_vec_reg = |rd| { + if call_conv.extends_baldrdash() { + Inst::FpuLoad128 { + rd, + mem: AMode::PostIndexed( + writable_stack_reg(), + SImm9::maybe_from_i64(16).unwrap(), + ), + flags: MemFlags::trusted(), + } + } else { + Inst::FpuLoad64 { + rd, + mem: AMode::PostIndexed( + writable_stack_reg(), + SImm9::maybe_from_i64(16).unwrap(), + ), + flags: MemFlags::trusted(), + } + } + }; + let load_vec_reg_pair = |rt, rt2| { + if call_conv.extends_baldrdash() { + Inst::FpuLoadP128 { + rt, + rt2, + mem: PairAMode::PostIndexed( + writable_stack_reg(), + SImm7Scaled::maybe_from_i64(32, I8X16).unwrap(), + ), + flags: MemFlags::trusted(), + } + } else { + Inst::FpuLoadP64 { + rt, + rt2, + mem: PairAMode::PostIndexed( + writable_stack_reg(), + SImm7Scaled::maybe_from_i64(16, F64).unwrap(), + ), + flags: MemFlags::trusted(), + } + } + }; + + let mut iter = clobbered_vec.chunks_exact(2); + + while let Some([rt, rt2]) = iter.next() { + let rt = rt.map(|r| r.to_reg()); + let rt2 = rt2.map(|r| r.to_reg()); + + debug_assert_eq!(rt.to_reg().get_class(), RegClass::V128); + debug_assert_eq!(rt2.to_reg().get_class(), RegClass::V128); + insts.push(load_vec_reg_pair(rt, rt2)); } - for reg_pair in clobbered_int.chunks(2).rev() { - let (r1, r2) = if reg_pair.len() == 2 { - ( - reg_pair[0].map(|r| r.to_reg()), - reg_pair[1].map(|r| r.to_reg()), - ) - } else { - (reg_pair[0].map(|r| r.to_reg()), writable_zero_reg()) - }; + debug_assert!(iter.remainder().len() <= 1); + + if let [rd] = iter.remainder() { + let rd = rd.map(|r| r.to_reg()); + + debug_assert_eq!(rd.to_reg().get_class(), RegClass::V128); + insts.push(load_vec_reg(rd)); + } + + let mut iter = clobbered_int.chunks_exact(2); - debug_assert!(r1.to_reg().get_class() == RegClass::I64); - debug_assert!(r2.to_reg().get_class() == RegClass::I64); + while let Some([rt, rt2]) = iter.next() { + let rt = rt.map(|r| r.to_reg()); + let rt2 = rt2.map(|r| r.to_reg()); - // ldp r1, r2, [sp], #16 + debug_assert_eq!(rt.to_reg().get_class(), RegClass::I64); + debug_assert_eq!(rt2.to_reg().get_class(), RegClass::I64); + // ldp rt, rt2, [sp], #16 insts.push(Inst::LoadP64 { - rt: r1, - rt2: r2, + rt, + rt2, mem: PairAMode::PostIndexed( writable_stack_reg(), SImm7Scaled::maybe_from_i64(16, I64).unwrap(), @@ -733,6 +913,20 @@ impl ABIMachineSpec for AArch64MachineDeps { }); } + debug_assert!(iter.remainder().len() <= 1); + + if let [rd] = iter.remainder() { + let rd = rd.map(|r| r.to_reg()); + + debug_assert_eq!(rd.to_reg().get_class(), RegClass::I64); + // ldr rd, [sp], #16 + insts.push(Inst::ULoad64 { + rd, + mem: AMode::PostIndexed(writable_stack_reg(), SImm9::maybe_from_i64(16).unwrap()), + flags: MemFlags::trusted(), + }); + } + // If this is Baldrdash-2020, restore the callee (i.e., our) TLS // register. We may have allocated it for something else and clobbered // it, but the ABI expects us to leave the TLS register unchanged. 
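A quick worked example of the clobber-save sizing introduced above, written as a standalone sketch: saved_reg_stack_size_sketch and the plain register counts are stand-ins for the real saved_reg_stack_size, which takes the isa::CallConv and slices of writable real registers.

// Sketch only: mirrors the arithmetic in saved_reg_stack_size above.
fn saved_reg_stack_size_sketch(is_baldrdash: bool, int_regs: usize, vec_regs: usize) -> (usize, usize) {
    // Integer clobbers are always saved as 8-byte slots, padded up to a pair
    // so that the total stays 16-byte aligned.
    let int_save_bytes = (int_regs + (int_regs & 1)) * 8;
    // Baldrdash saves the full 16-byte vector registers (already a multiple
    // of the 16-byte alignment); AAPCS64-style ABIs save only the low 8 bytes,
    // so the register count is padded up to a pair, just like the integer case.
    let (size, pad) = if is_baldrdash { (16, 0) } else { (8, vec_regs & 1) };
    (int_save_bytes, (vec_regs + pad) * size)
}

fn main() {
    // AAPCS64: 3 integer + 3 vector clobbers -> 32 + 32 bytes of save area.
    assert_eq!(saved_reg_stack_size_sketch(false, 3, 3), (32, 32));
    // Baldrdash: the same clobbers -> 32 + 48 bytes.
    assert_eq!(saved_reg_stack_size_sketch(true, 3, 3), (32, 48));
}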
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index aa708a85247e..1e8ca78317b4 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -258,6 +258,28 @@ fn enc_ldst_vec(q: u32, size: u32, rn: Reg, rt: Writable) -> u32 { | machreg_to_vec(rt.to_reg()) } +fn enc_ldst_vec_pair( + opc: u32, + amode: u32, + is_load: bool, + simm7: SImm7Scaled, + rn: Reg, + rt: Reg, + rt2: Reg, +) -> u32 { + debug_assert_eq!(opc & 0b11, opc); + debug_assert_eq!(amode & 0b11, amode); + + 0b00_10110_00_0_0000000_00000_00000_00000 + | opc << 30 + | amode << 23 + | (is_load as u32) << 22 + | simm7.bits() << 15 + | machreg_to_vec(rt2) << 10 + | machreg_to_gpr(rn) << 5 + | machreg_to_vec(rt) +} + fn enc_vec_rrr(top11: u32, rm: Reg, bit15_10: u32, rn: Reg, rd: Writable) -> u32 { (top11 << 21) | (machreg_to_vec(rm) << 16) @@ -923,7 +945,7 @@ impl MachInstEmit for Inst { let srcloc = state.cur_srcloc(); if srcloc != SourceLoc::default() && !flags.notrap() { - // Register the offset at which the actual load instruction starts. + // Register the offset at which the actual store instruction starts. sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); } @@ -987,7 +1009,7 @@ impl MachInstEmit for Inst { } => { let srcloc = state.cur_srcloc(); if srcloc != SourceLoc::default() && !flags.notrap() { - // Register the offset at which the actual load instruction starts. + // Register the offset at which the actual store instruction starts. sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); } match mem { @@ -1034,6 +1056,120 @@ impl MachInstEmit for Inst { } } } + &Inst::FpuLoadP64 { + rt, + rt2, + ref mem, + flags, + } + | &Inst::FpuLoadP128 { + rt, + rt2, + ref mem, + flags, + } => { + let srcloc = state.cur_srcloc(); + + if srcloc != SourceLoc::default() && !flags.notrap() { + // Register the offset at which the actual load instruction starts. + sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); + } + + let opc = match self { + &Inst::FpuLoadP64 { .. } => 0b01, + &Inst::FpuLoadP128 { .. } => 0b10, + _ => unreachable!(), + }; + let rt = rt.to_reg(); + let rt2 = rt2.to_reg(); + + match mem { + &PairAMode::SignedOffset(reg, simm7) => { + assert!(simm7.scale_ty == F64 || simm7.scale_ty == I8X16); + sink.put4(enc_ldst_vec_pair(opc, 0b10, true, simm7, reg, rt, rt2)); + } + &PairAMode::PreIndexed(reg, simm7) => { + assert!(simm7.scale_ty == F64 || simm7.scale_ty == I8X16); + sink.put4(enc_ldst_vec_pair( + opc, + 0b11, + true, + simm7, + reg.to_reg(), + rt, + rt2, + )); + } + &PairAMode::PostIndexed(reg, simm7) => { + assert!(simm7.scale_ty == F64 || simm7.scale_ty == I8X16); + sink.put4(enc_ldst_vec_pair( + opc, + 0b01, + true, + simm7, + reg.to_reg(), + rt, + rt2, + )); + } + } + } + &Inst::FpuStoreP64 { + rt, + rt2, + ref mem, + flags, + } + | &Inst::FpuStoreP128 { + rt, + rt2, + ref mem, + flags, + } => { + let srcloc = state.cur_srcloc(); + + if srcloc != SourceLoc::default() && !flags.notrap() { + // Register the offset at which the actual store instruction starts. + sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); + } + + let opc = match self { + &Inst::FpuStoreP64 { .. } => 0b01, + &Inst::FpuStoreP128 { .. 
} => 0b10, + _ => unreachable!(), + }; + + match mem { + &PairAMode::SignedOffset(reg, simm7) => { + assert!(simm7.scale_ty == F64 || simm7.scale_ty == I8X16); + sink.put4(enc_ldst_vec_pair(opc, 0b10, false, simm7, reg, rt, rt2)); + } + &PairAMode::PreIndexed(reg, simm7) => { + assert!(simm7.scale_ty == F64 || simm7.scale_ty == I8X16); + sink.put4(enc_ldst_vec_pair( + opc, + 0b11, + false, + simm7, + reg.to_reg(), + rt, + rt2, + )); + } + &PairAMode::PostIndexed(reg, simm7) => { + assert!(simm7.scale_ty == F64 || simm7.scale_ty == I8X16); + sink.put4(enc_ldst_vec_pair( + opc, + 0b01, + false, + simm7, + reg.to_reg(), + rt, + rt2, + )); + } + } + } &Inst::Mov64 { rd, rm } => { assert!(rd.to_reg().get_class() == rm.get_class()); assert!(rm.get_class() == RegClass::I64); diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs index 55e25de5d8a6..505fd2c86b3f 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs @@ -5105,6 +5105,168 @@ fn test_aarch64_binemit() { "str q16, [x8, x9, LSL #4]", )); + insns.push(( + Inst::FpuLoadP64 { + rt: writable_vreg(0), + rt2: writable_vreg(31), + mem: PairAMode::SignedOffset(xreg(0), SImm7Scaled::zero(F64)), + flags: MemFlags::trusted(), + }, + "007C406D", + "ldp d0, d31, [x0]", + )); + + insns.push(( + Inst::FpuLoadP64 { + rt: writable_vreg(19), + rt2: writable_vreg(11), + mem: PairAMode::PreIndexed( + writable_xreg(25), + SImm7Scaled::maybe_from_i64(-512, F64).unwrap(), + ), + flags: MemFlags::trusted(), + }, + "332FE06D", + "ldp d19, d11, [x25, #-512]!", + )); + + insns.push(( + Inst::FpuLoadP64 { + rt: writable_vreg(7), + rt2: writable_vreg(20), + mem: PairAMode::PostIndexed( + writable_stack_reg(), + SImm7Scaled::maybe_from_i64(64, F64).unwrap(), + ), + flags: MemFlags::trusted(), + }, + "E753C46C", + "ldp d7, d20, [sp], #64", + )); + + insns.push(( + Inst::FpuStoreP64 { + rt: vreg(4), + rt2: vreg(26), + mem: PairAMode::SignedOffset( + stack_reg(), + SImm7Scaled::maybe_from_i64(504, F64).unwrap(), + ), + flags: MemFlags::trusted(), + }, + "E4EB1F6D", + "stp d4, d26, [sp, #504]", + )); + + insns.push(( + Inst::FpuStoreP64 { + rt: vreg(16), + rt2: vreg(8), + mem: PairAMode::PreIndexed( + writable_xreg(15), + SImm7Scaled::maybe_from_i64(48, F64).unwrap(), + ), + flags: MemFlags::trusted(), + }, + "F021836D", + "stp d16, d8, [x15, #48]!", + )); + + insns.push(( + Inst::FpuStoreP64 { + rt: vreg(5), + rt2: vreg(6), + mem: PairAMode::PostIndexed( + writable_xreg(28), + SImm7Scaled::maybe_from_i64(-32, F64).unwrap(), + ), + flags: MemFlags::trusted(), + }, + "851BBE6C", + "stp d5, d6, [x28], #-32", + )); + + insns.push(( + Inst::FpuLoadP128 { + rt: writable_vreg(0), + rt2: writable_vreg(17), + mem: PairAMode::SignedOffset(xreg(3), SImm7Scaled::zero(I8X16)), + flags: MemFlags::trusted(), + }, + "604440AD", + "ldp q0, q17, [x3]", + )); + + insns.push(( + Inst::FpuLoadP128 { + rt: writable_vreg(29), + rt2: writable_vreg(9), + mem: PairAMode::PreIndexed( + writable_xreg(16), + SImm7Scaled::maybe_from_i64(-1024, I8X16).unwrap(), + ), + flags: MemFlags::trusted(), + }, + "1D26E0AD", + "ldp q29, q9, [x16, #-1024]!", + )); + + insns.push(( + Inst::FpuLoadP128 { + rt: writable_vreg(10), + rt2: writable_vreg(20), + mem: PairAMode::PostIndexed( + writable_xreg(26), + SImm7Scaled::maybe_from_i64(256, I8X16).unwrap(), + ), + flags: MemFlags::trusted(), + }, + "4A53C8AC", + "ldp q10, q20, [x26], #256", + )); + + insns.push(( + 
Inst::FpuStoreP128 { + rt: vreg(9), + rt2: vreg(31), + mem: PairAMode::SignedOffset( + stack_reg(), + SImm7Scaled::maybe_from_i64(1008, I8X16).unwrap(), + ), + flags: MemFlags::trusted(), + }, + "E9FF1FAD", + "stp q9, q31, [sp, #1008]", + )); + + insns.push(( + Inst::FpuStoreP128 { + rt: vreg(27), + rt2: vreg(13), + mem: PairAMode::PreIndexed( + writable_stack_reg(), + SImm7Scaled::maybe_from_i64(-192, I8X16).unwrap(), + ), + flags: MemFlags::trusted(), + }, + "FB37BAAD", + "stp q27, q13, [sp, #-192]!", + )); + + insns.push(( + Inst::FpuStoreP128 { + rt: vreg(18), + rt2: vreg(22), + mem: PairAMode::PostIndexed( + writable_xreg(13), + SImm7Scaled::maybe_from_i64(304, I8X16).unwrap(), + ), + flags: MemFlags::trusted(), + }, + "B2D989AC", + "stp q18, q22, [x13], #304", + )); + insns.push(( Inst::LoadFpuConst64 { rd: writable_vreg(16), diff --git a/cranelift/codegen/src/isa/aarch64/inst/imms.rs b/cranelift/codegen/src/isa/aarch64/inst/imms.rs index 34c2946db0bb..08fc75431daa 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/imms.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/imms.rs @@ -73,7 +73,7 @@ impl SImm7Scaled { /// Create a SImm7Scaled from a raw offset and the known scale type, if /// possible. pub fn maybe_from_i64(value: i64, scale_ty: Type) -> Option { - assert!(scale_ty == I64 || scale_ty == I32); + assert!(scale_ty == I64 || scale_ty == I32 || scale_ty == F64 || scale_ty == I8X16); let scale = scale_ty.bytes(); assert!(scale.is_power_of_two()); let scale = i64::from(scale); diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index 03e5c6f47b1f..f6a6aa59d0f2 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -848,7 +848,34 @@ pub enum Inst { mem: AMode, flags: MemFlags, }, - + /// A load of a pair of floating-point registers, double precision (64-bit). + FpuLoadP64 { + rt: Writable, + rt2: Writable, + mem: PairAMode, + flags: MemFlags, + }, + /// A store of a pair of floating-point registers, double precision (64-bit). + FpuStoreP64 { + rt: Reg, + rt2: Reg, + mem: PairAMode, + flags: MemFlags, + }, + /// A load of a pair of floating-point registers, 128-bit. + FpuLoadP128 { + rt: Writable, + rt2: Writable, + mem: PairAMode, + flags: MemFlags, + }, + /// A store of a pair of floating-point registers, 128-bit. + FpuStoreP128 { + rt: Reg, + rt2: Reg, + mem: PairAMode, + flags: MemFlags, + }, LoadFpuConst64 { rd: Writable, const_data: u64, @@ -1908,6 +1935,34 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { collector.add_use(rd); memarg_regs(mem, collector); } + &Inst::FpuLoadP64 { + rt, rt2, ref mem, .. + } => { + collector.add_def(rt); + collector.add_def(rt2); + pairmemarg_regs(mem, collector); + } + &Inst::FpuStoreP64 { + rt, rt2, ref mem, .. + } => { + collector.add_use(rt); + collector.add_use(rt2); + pairmemarg_regs(mem, collector); + } + &Inst::FpuLoadP128 { + rt, rt2, ref mem, .. + } => { + collector.add_def(rt); + collector.add_def(rt2); + pairmemarg_regs(mem, collector); + } + &Inst::FpuStoreP128 { + rt, rt2, ref mem, .. + } => { + collector.add_use(rt); + collector.add_use(rt2); + pairmemarg_regs(mem, collector); + } &Inst::LoadFpuConst64 { rd, .. } | &Inst::LoadFpuConst128 { rd, .. } => { collector.add_def(rd); } @@ -2590,6 +2645,46 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RUM) { map_use(mapper, rd); map_mem(mapper, mem); } + &mut Inst::FpuLoadP64 { + ref mut rt, + ref mut rt2, + ref mut mem, + .. 
+ } => { + map_def(mapper, rt); + map_def(mapper, rt2); + map_pairmem(mapper, mem); + } + &mut Inst::FpuStoreP64 { + ref mut rt, + ref mut rt2, + ref mut mem, + .. + } => { + map_use(mapper, rt); + map_use(mapper, rt2); + map_pairmem(mapper, mem); + } + &mut Inst::FpuLoadP128 { + ref mut rt, + ref mut rt2, + ref mut mem, + .. + } => { + map_def(mapper, rt); + map_def(mapper, rt2); + map_pairmem(mapper, mem); + } + &mut Inst::FpuStoreP128 { + ref mut rt, + ref mut rt2, + ref mut mem, + .. + } => { + map_use(mapper, rt); + map_use(mapper, rt2); + map_pairmem(mapper, mem); + } &mut Inst::LoadFpuConst64 { ref mut rd, .. } => { map_def(mapper, rd); } @@ -3508,6 +3603,42 @@ impl Inst { let mem = mem.show_rru(mb_rru); format!("{}str {}, {}", mem_str, rd, mem) } + &Inst::FpuLoadP64 { + rt, rt2, ref mem, .. + } => { + let rt = show_vreg_scalar(rt.to_reg(), mb_rru, ScalarSize::Size64); + let rt2 = show_vreg_scalar(rt2.to_reg(), mb_rru, ScalarSize::Size64); + let mem = mem.show_rru(mb_rru); + + format!("ldp {}, {}, {}", rt, rt2, mem) + } + &Inst::FpuStoreP64 { + rt, rt2, ref mem, .. + } => { + let rt = show_vreg_scalar(rt, mb_rru, ScalarSize::Size64); + let rt2 = show_vreg_scalar(rt2, mb_rru, ScalarSize::Size64); + let mem = mem.show_rru(mb_rru); + + format!("stp {}, {}, {}", rt, rt2, mem) + } + &Inst::FpuLoadP128 { + rt, rt2, ref mem, .. + } => { + let rt = show_vreg_scalar(rt.to_reg(), mb_rru, ScalarSize::Size128); + let rt2 = show_vreg_scalar(rt2.to_reg(), mb_rru, ScalarSize::Size128); + let mem = mem.show_rru(mb_rru); + + format!("ldp {}, {}, {}", rt, rt2, mem) + } + &Inst::FpuStoreP128 { + rt, rt2, ref mem, .. + } => { + let rt = show_vreg_scalar(rt, mb_rru, ScalarSize::Size128); + let rt2 = show_vreg_scalar(rt2, mb_rru, ScalarSize::Size128); + let mem = mem.show_rru(mb_rru); + + format!("stp {}, {}, {}", rt, rt2, mem) + } &Inst::LoadFpuConst64 { rd, const_data } => { let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size64); format!( diff --git a/cranelift/filetests/filetests/isa/aarch64/prologue.clif b/cranelift/filetests/filetests/isa/aarch64/prologue.clif index 20d90c4a6889..40934abd651a 100644 --- a/cranelift/filetests/filetests/isa/aarch64/prologue.clif +++ b/cranelift/filetests/filetests/isa/aarch64/prologue.clif @@ -77,22 +77,72 @@ block0(v0: f64): ; check: stp fp, lr, [sp, #-16]! ; nextln: mov fp, sp -; nextln: str q8, [sp, #-16]! -; nextln: str q9, [sp, #-16]! -; nextln: str q10, [sp, #-16]! -; nextln: str q11, [sp, #-16]! -; nextln: str q12, [sp, #-16]! -; nextln: str q13, [sp, #-16]! -; nextln: str q14, [sp, #-16]! -; nextln: str q15, [sp, #-16]! - -; check: ldr q15, [sp], #16 -; nextln: ldr q14, [sp], #16 -; nextln: ldr q13, [sp], #16 -; nextln: ldr q12, [sp], #16 -; nextln: ldr q11, [sp], #16 -; nextln: ldr q10, [sp], #16 -; nextln: ldr q9, [sp], #16 -; nextln: ldr q8, [sp], #16 +; nextln: stp d14, d15, [sp, #-16]! +; nextln: stp d12, d13, [sp, #-16]! +; nextln: stp d10, d11, [sp, #-16]! +; nextln: stp d8, d9, [sp, #-16]! 
+ +; check: ldp d8, d9, [sp], #16 +; nextln: ldp d10, d11, [sp], #16 +; nextln: ldp d12, d13, [sp], #16 +; nextln: ldp d14, d15, [sp], #16 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f2(i64) -> i64 { +block0(v0: i64): + v1 = iadd.i64 v0, v0 + v2 = iadd.i64 v0, v1 + v3 = iadd.i64 v0, v2 + v4 = iadd.i64 v0, v3 + v5 = iadd.i64 v0, v4 + v6 = iadd.i64 v0, v5 + v7 = iadd.i64 v0, v6 + v8 = iadd.i64 v0, v7 + v9 = iadd.i64 v0, v8 + v10 = iadd.i64 v0, v9 + v11 = iadd.i64 v0, v10 + v12 = iadd.i64 v0, v11 + v13 = iadd.i64 v0, v12 + v14 = iadd.i64 v0, v13 + v15 = iadd.i64 v0, v14 + v16 = iadd.i64 v0, v15 + v17 = iadd.i64 v0, v16 + v18 = iadd.i64 v0, v17 + + v19 = iadd.i64 v0, v1 + v20 = iadd.i64 v2, v3 + v21 = iadd.i64 v4, v5 + v22 = iadd.i64 v6, v7 + v23 = iadd.i64 v8, v9 + v24 = iadd.i64 v10, v11 + v25 = iadd.i64 v12, v13 + v26 = iadd.i64 v14, v15 + v27 = iadd.i64 v16, v17 + + v28 = iadd.i64 v18, v19 + v29 = iadd.i64 v20, v21 + v30 = iadd.i64 v22, v23 + v31 = iadd.i64 v24, v25 + v32 = iadd.i64 v26, v27 + + v33 = iadd.i64 v28, v29 + v34 = iadd.i64 v30, v31 + + v35 = iadd.i64 v32, v33 + v36 = iadd.i64 v34, v35 + + return v36 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: str x22, [sp, #-16]! +; nextln: stp x19, x20, [sp, #-16]! +; nextln: add x1, x0, x0 + +; check: add x0, x1, x0 +; nextln: ldp x19, x20, [sp], #16 +; nextln: ldr x22, [sp], #16 ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret
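For reference, a standalone sketch of the LDP/STP (SIMD&FP register pair) encoding added in emit.rs, reproducing one of the new test vectors; the function name and the plain register numbers are stand-ins for enc_ldst_vec_pair and the machreg_to_vec/machreg_to_gpr helpers.

// Sketch only: same bit layout as enc_ldst_vec_pair above.
fn enc_ldst_vec_pair_sketch(
    opc: u32,   // 0b01 = 64-bit (Dn) pair, 0b10 = 128-bit (Qn) pair
    amode: u32, // 0b01 = post-indexed, 0b10 = signed offset, 0b11 = pre-indexed
    is_load: bool,
    simm7: u32, // offset already divided by the access size, masked to 7 bits
    rn: u32,
    rt: u32,
    rt2: u32,
) -> u32 {
    0b00_10110_00_0_0000000_00000_00000_00000
        | opc << 30
        | amode << 23
        | (is_load as u32) << 22
        | simm7 << 15
        | rt2 << 10
        | rn << 5
        | rt
}

fn main() {
    // "ldp d0, d31, [x0]" from the emit tests: bytes 00 7C 40 6D (little endian).
    assert_eq!(enc_ldst_vec_pair_sketch(0b01, 0b10, true, 0, 0, 0, 31), 0x6D40_7C00);
}

The scaled 7-bit immediate covers scale * [-64, 63], so F64 pairs reach offsets -512 to 504 in steps of 8 and I8X16 pairs reach -1024 to 1008 in steps of 16, which is why those values appear as the boundary cases in the emit tests above.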