Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cranelift: Make inline stackprobe unroll sequence valgrind compliant #7470

Merged
merged 5 commits into from
Nov 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 48 additions & 36 deletions cranelift/codegen/src/isa/aarch64/abi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -677,43 +677,9 @@ impl ABIMachineSpec for AArch64MachineDeps {

let probe_count = align_to(frame_size, guard_size) / guard_size;
if probe_count <= PROBE_MAX_UNROLL {
// When manually unrolling stick an instruction that stores 0 at a
// constant offset relative to the stack pointer. This will
// turn into something like `movn tmp, #n ; stur xzr [sp, tmp]`.
//
// Note that this may actually store beyond the stack size for the
// last item but that's ok since it's unused stack space and if
// that faults accidentally we're so close to faulting it shouldn't
// make too much difference to fault there.
insts.reserve(probe_count as usize);
for i in 0..probe_count {
let offset = (guard_size * (i + 1)) as i64;
insts.push(Self::gen_store_stack(
StackAMode::SPOffset(-offset, I8),
zero_reg(),
I32,
));
}
Self::gen_probestack_unroll(insts, guard_size, probe_count)
} else {
// The non-unrolled version uses two temporary registers. The
// `start` contains the current offset from sp and counts downwards
// during the loop by increments of `guard_size`. The `end` is
// the size of the frame and where we stop.
//
// Note that this emission is all post-regalloc so it should be ok
// to use the temporary registers here as input/output as the loop
// itself is not allowed to use the registers.
let start = writable_spilltmp_reg();
let end = writable_tmp2_reg();
// `gen_inline_probestack` is called after regalloc2, so it's acceptable to reuse
// `start` and `end` as temporaries in load_constant.
insts.extend(Inst::load_constant(start, 0, &mut |_| start));
insts.extend(Inst::load_constant(end, frame_size.into(), &mut |_| end));
insts.push(Inst::StackProbeLoop {
start,
end: end.to_reg(),
step: Imm12::maybe_from_u64(guard_size.into()).unwrap(),
});
Self::gen_probestack_loop(insts, frame_size, guard_size)
}
}

Expand Down Expand Up @@ -1210,6 +1176,52 @@ impl ABIMachineSpec for AArch64MachineDeps {
}
}

impl AArch64MachineDeps {
    /// Emit an unrolled inline stack-probe sequence.
    ///
    /// For each probe, first adjust the stack pointer down by `guard_size`
    /// and then store a zero at the new `sp`. This generates something like
    /// `sub sp, sp, #1, lsl #12` followed by `stur wzr, [sp]`.
    ///
    /// We adjust `sp` before each store because valgrind expects us to never
    /// write beyond the stack pointer and its associated redzone.
    /// See: https://github.com/bytecodealliance/wasmtime/issues/7454
    fn gen_probestack_unroll(insts: &mut SmallInstVec<Inst>, guard_size: u32, probe_count: u32) {
        for _ in 0..probe_count {
            insts.extend(Self::gen_sp_reg_adjust(-(guard_size as i32)));

            insts.push(Self::gen_store_stack(
                StackAMode::SPOffset(0, I8),
                zero_reg(),
                I32,
            ));
        }

        // Restore the stack pointer to its original value; the prologue's own
        // stack adjustment happens separately.
        insts.extend(Self::gen_sp_reg_adjust((guard_size * probe_count) as i32));
    }

    /// Emit a loop-based stack probe, used when the frame is too large for
    /// the unrolled sequence to be worthwhile in code size.
    fn gen_probestack_loop(insts: &mut SmallInstVec<Inst>, frame_size: u32, guard_size: u32) {
        // The non-unrolled version uses two temporary registers. The
        // `start` contains the current offset from sp and counts downwards
        // during the loop by increments of `guard_size`. The `end` is
        // the size of the frame and where we stop.
        //
        // Note that this emission is all post-regalloc so it should be ok
        // to use the temporary registers here as input/output as the loop
        // itself is not allowed to use the registers.
        let start = writable_spilltmp_reg();
        let end = writable_tmp2_reg();
        // `gen_inline_probestack` is called after regalloc2, so it's acceptable to reuse
        // `start` and `end` as temporaries in load_constant.
        insts.extend(Inst::load_constant(start, 0, &mut |_| start));
        insts.extend(Inst::load_constant(end, frame_size.into(), &mut |_| end));
        insts.push(Inst::StackProbeLoop {
            start,
            end: end.to_reg(),
            step: Imm12::maybe_from_u64(guard_size.into()).unwrap(),
        });
    }
}

fn select_api_key(
isa_flags: &aarch64_settings::Flags,
call_conv: isa::CallConv,
Expand Down
66 changes: 42 additions & 24 deletions cranelift/codegen/src/isa/riscv64/abi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -718,10 +718,20 @@ impl ABIMachineSpec for Riscv64MachineDeps {
// Number of probes that we need to perform
let probe_count = align_to(frame_size, guard_size) / guard_size;

// Must be a caller-saved register that is not an argument.
let tmp = match call_conv {
isa::CallConv::Tail => Writable::from_reg(x_reg(1)),
_ => Writable::from_reg(x_reg(28)), // t3
};

if probe_count <= PROBE_MAX_UNROLL {
Self::gen_probestack_unroll(insts, guard_size, probe_count)
Self::gen_probestack_unroll(insts, tmp, guard_size, probe_count)
} else {
Self::gen_probestack_loop(insts, call_conv, guard_size, probe_count)
insts.push(Inst::StackProbeLoop {
guard_size,
probe_count,
tmp,
});
}
}
}
Expand Down Expand Up @@ -1052,33 +1062,41 @@ fn create_reg_enviroment() -> MachineEnv {
}

impl Riscv64MachineDeps {
fn gen_probestack_unroll(insts: &mut SmallInstVec<Inst>, guard_size: u32, probe_count: u32) {
insts.reserve(probe_count as usize);
for i in 0..probe_count {
let offset = (guard_size * (i + 1)) as i64;
fn gen_probestack_unroll(
insts: &mut SmallInstVec<Inst>,
tmp: Writable<Reg>,
guard_size: u32,
probe_count: u32,
) {
// When manually unrolling adjust the stack pointer and then write a zero
// to the stack at that offset.
//
// We do this because valgrind expects us to never write beyond the stack
// pointer and associated redzone.
// See: https://github.com/bytecodealliance/wasmtime/issues/7454

// Store the adjust amount in a register upfront, so we don't have to
// reload it for each probe. It's worth loading this as a negative and
// using an `add` instruction since we have compressed versions of `add`
// but not the `sub` instruction.
insts.extend(Inst::load_constant_u64(tmp, (-(guard_size as i64)) as u64));

for _ in 0..probe_count {
insts.push(Inst::AluRRR {
alu_op: AluOPRRR::Add,
rd: writable_stack_reg(),
rs1: stack_reg(),
rs2: tmp.to_reg(),
});

insts.push(Self::gen_store_stack(
StackAMode::SPOffset(-offset, I8),
StackAMode::SPOffset(0, I8),
zero_reg(),
I32,
));
}
}

fn gen_probestack_loop(
insts: &mut SmallInstVec<Inst>,
call_conv: isa::CallConv,
guard_size: u32,
probe_count: u32,
) {
// Must be a caller-saved register that is not an argument.
let tmp = match call_conv {
isa::CallConv::Tail => Writable::from_reg(x_reg(1)),
_ => Writable::from_reg(x_reg(28)), // t3
};
insts.push(Inst::StackProbeLoop {
guard_size,
probe_count,
tmp,
});
// Restore the stack pointer to its original value
insts.extend(Self::gen_sp_reg_adjust((guard_size * probe_count) as i32));
}
}
16 changes: 11 additions & 5 deletions cranelift/codegen/src/isa/x64/abi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,17 +33,23 @@ pub struct X64ABIMachineSpec;
impl X64ABIMachineSpec {
fn gen_probestack_unroll(insts: &mut SmallInstVec<Inst>, guard_size: u32, probe_count: u32) {
insts.reserve(probe_count as usize);
for i in 0..probe_count {
let offset = (guard_size * (i + 1)) as i64;
for _ in 0..probe_count {
// "Allocate" stack space for the probe by decrementing the stack pointer before
// the write. This is required to make valgrind happy.
// See: https://github.com/bytecodealliance/wasmtime/issues/7454
insts.extend(Self::gen_sp_reg_adjust(-(guard_size as i32)));

// TODO: It would be nice if we could store the imm 0, but we don't have insts for those
// so store the stack pointer. Any register will do, since the stack is undefined at this point
insts.push(Self::gen_store_stack(
StackAMode::SPOffset(-offset, I8),
StackAMode::SPOffset(0, I8),
regs::rsp(),
I32,
));
}

// Restore the stack pointer to its original value
insts.extend(Self::gen_sp_reg_adjust((guard_size * probe_count) as i32));
}

fn gen_probestack_loop(
Expand Down Expand Up @@ -523,8 +529,8 @@ impl ABIMachineSpec for X64ABIMachineSpec {
// Unroll at most n consecutive probes, before falling back to using a loop
//
// This number was picked because the loop version is 38 bytes long. We can fit
// 5 inline probes in that space, so unroll if its beneficial in terms of code size.
const PROBE_MAX_UNROLL: u32 = 5;
// 4 inline probes in that space, so unroll if it's beneficial in terms of code size.
const PROBE_MAX_UNROLL: u32 = 4;

// Number of probes that we need to perform
let probe_count = align_to(frame_size, guard_size) / guard_size;
Expand Down
25 changes: 15 additions & 10 deletions cranelift/filetests/filetests/isa/aarch64/inline-probestack.clif
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,13 @@ block0:
; VCode:
; stp fp, lr, [sp, #-16]!
; mov fp, sp
; movn x16, #4095 ; str wzr, [sp, x16, SXTX]
; movn x16, #8191 ; str wzr, [sp, x16, SXTX]
; movn x16, #12287 ; str wzr, [sp, x16, SXTX]
; sub sp, sp, #4096
; str wzr, [sp]
; sub sp, sp, #4096
; str wzr, [sp]
; sub sp, sp, #4096
; str wzr, [sp]
; add sp, sp, #12288
; sub sp, sp, #12288
; block0:
; mov x0, sp
Expand All @@ -61,14 +65,15 @@ block0:
; block0: ; offset 0x0
; stp x29, x30, [sp, #-0x10]!
; mov x29, sp
; mov x16, #-0x1000
; str wzr, [sp, x16, sxtx]
; mov x16, #-0x2000
; str wzr, [sp, x16, sxtx]
; mov x16, #-0x3000
; str wzr, [sp, x16, sxtx]
; sub sp, sp, #1, lsl #12
; stur wzr, [sp]
; sub sp, sp, #1, lsl #12
; stur wzr, [sp]
; sub sp, sp, #1, lsl #12
; stur wzr, [sp]
; add sp, sp, #3, lsl #12
; sub sp, sp, #3, lsl #12
; block1: ; offset 0x24
; block1: ; offset 0x28
; mov x0, sp
; add sp, sp, #3, lsl #12
; ldp x29, x30, [sp], #0x10
Expand Down
Loading