-
Notifications
You must be signed in to change notification settings - Fork 355
/
Copy pathfork.rs
391 lines (358 loc) · 16.1 KB
/
fork.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
use std::{ffi::c_int, num::NonZeroUsize};
use libc::SIGCHLD;
use nix::{
sys::{mman, resource},
unistd::Pid,
};
#[derive(Debug, thiserror::Error)]
pub enum CloneError {
#[error("failed to clone process")]
Clone(#[source] nix::Error),
#[error("failed to get system memory page size")]
PageSize(#[source] nix::Error),
#[error("failed to get resource limit")]
ResourceLimit(#[source] nix::Error),
#[error("the stack size is zero")]
ZeroStackSize,
#[error("failed to allocate stack")]
StackAllocation(#[source] nix::Error),
#[error("failed to create stack guard page")]
GuardPage(#[source] nix::Error),
#[error("unknown error code {0}")]
UnknownErrno(i32),
}
/// The callback function used in clone system call. The return value is i32
/// which is consistent with C functions return code. The trait has to be
/// `FnMut` because we need to be able to call the closure multiple times, once
/// in clone3 and once in clone if fallback is required. The closure is boxed
/// because we need to store the closure on heap, not stack in the case of
/// `clone`. Unlike `fork` or `clone3`, the `clone` glibc wrapper requires us to
/// pass in a child stack, which is empty. By storing the closure in heap, we
/// can then in the new process to re-box the heap memory back to a closure
/// correctly.
pub type CloneCb = Box<dyn FnMut() -> i32>;
// Clone a sibling process that shares the same parent as the calling
// process. This is used to launch the container init process so the parent
// process of the calling process can receive ownership of the process. If we
// clone a child process as the init process, the calling process (likely the
// youki main process) will exit and the init process will be re-parented to the
// process 1 (system init process), which is not the right behavior of what we
// look for.
pub fn container_clone_sibling(cb: CloneCb) -> Result<Pid, CloneError> {
// Note: normally, an exit signal is required, but when using
// `CLONE_PARENT`, the `clone3` will return EINVAL if an exit signal is set.
// The older `clone` will not return EINVAL in this case. Instead it ignores
// the exit signal bits in the glibc wrapper. Therefore, we explicitly set
// the exit_signal to None here, so this works for both version of clone.
clone_internal(cb, libc::CLONE_PARENT as u64, None)
}
// Clone a child process and execute the callback.
pub fn container_clone(cb: CloneCb) -> Result<Pid, CloneError> {
clone_internal(cb, 0, Some(SIGCHLD as u64))
}
// An internal wrapper to manage the clone3 vs clone fallback logic.
fn clone_internal(
mut cb: CloneCb,
flags: u64,
exit_signal: Option<u64>,
) -> Result<Pid, CloneError> {
match clone3(&mut cb, flags, exit_signal) {
Ok(pid) => Ok(pid),
// For now, we decide to only fallback on ENOSYS
Err(CloneError::Clone(nix::Error::ENOSYS)) => {
tracing::debug!("clone3 is not supported, fallback to clone");
let pid = clone(cb, flags, exit_signal)?;
Ok(pid)
}
Err(err) => Err(err),
}
}
// Unlike the clone call, clone3 is currently using the kernel syscall, mimicking
// the interface of fork. There is not need to explicitly manage the memory, so
// we can safely passing the callback closure as reference.
fn clone3(cb: &mut CloneCb, flags: u64, exit_signal: Option<u64>) -> Result<Pid, CloneError> {
#[repr(C)]
struct clone3_args {
flags: u64,
pidfd: u64,
child_tid: u64,
parent_tid: u64,
exit_signal: u64,
stack: u64,
stack_size: u64,
tls: u64,
set_tid: u64,
set_tid_size: u64,
cgroup: u64,
}
let mut args = clone3_args {
flags,
pidfd: 0,
child_tid: 0,
parent_tid: 0,
exit_signal: exit_signal.unwrap_or(0),
stack: 0,
stack_size: 0,
tls: 0,
set_tid: 0,
set_tid_size: 0,
cgroup: 0,
};
let args_ptr = &mut args as *mut clone3_args;
let args_size = std::mem::size_of::<clone3_args>();
// For now, we can only use clone3 as a kernel syscall. Libc wrapper is not
// available yet. This can have undefined behavior because libc authors do
// not like people calling kernel syscall to directly create processes. Libc
// does perform additional bookkeeping when calling clone or fork. So far,
// we have not observed any issues with calling clone3 directly, but we
// should keep an eye on it.
match unsafe { libc::syscall(libc::SYS_clone3, args_ptr, args_size) } {
-1 => Err(CloneError::Clone(nix::Error::last())),
0 => {
// Inside the cloned process, we execute the callback and exit with
// the return code.
std::process::exit(cb());
}
ret if ret >= 0 => Ok(Pid::from_raw(ret as i32)),
ret => Err(CloneError::UnknownErrno(ret as i32)),
}
}
fn clone(cb: CloneCb, flags: u64, exit_signal: Option<u64>) -> Result<Pid, CloneError> {
const DEFAULT_STACK_SIZE: usize = 8 * 1024 * 1024; // 8M
const DEFAULT_PAGE_SIZE: usize = 4 * 1024; // 4K
// Use sysconf to find the page size. If there is an error, we assume
// the default 4K page size.
let page_size = nix::unistd::sysconf(nix::unistd::SysconfVar::PAGE_SIZE)
.map_err(CloneError::PageSize)?
.map(|size| size as usize)
.unwrap_or(DEFAULT_PAGE_SIZE);
// Find out the default stack max size through getrlimit.
let (rlim_cur, _) =
resource::getrlimit(resource::Resource::RLIMIT_STACK).map_err(CloneError::ResourceLimit)?;
// mmap will return ENOMEM if stack size is unlimited when we create the
// child stack, so we need to set a reasonable default stack size.
let default_stack_size = if rlim_cur != u64::MAX {
rlim_cur as usize
} else {
tracing::debug!(
"stack size returned by getrlimit() is unlimited, use DEFAULT_STACK_SIZE(8MB)"
);
DEFAULT_STACK_SIZE
};
// Using the clone syscall requires us to create the stack space for the
// child process instead of taken cared for us like fork call. We use mmap
// here to create the stack. Instead of guessing how much space the child
// process needs, we allocate through mmap to the system default limit,
// which is 8MB on most of the linux system today. This is OK since mmap
// will only reserve the address space upfront, instead of allocating
// physical memory upfront. The stack will grow as needed, up to the size
// reserved, so no wasted memory here. Lastly, the child stack only needs
// to support the container init process set up code in Youki. When Youki
// calls exec into the container payload, exec will reset the stack. Note,
// do not use MAP_GROWSDOWN since it is not well supported.
// Ref: https://man7.org/linux/man-pages/man2/mmap.2.html
let child_stack = unsafe {
mman::mmap(
None,
NonZeroUsize::new(default_stack_size).ok_or(CloneError::ZeroStackSize)?,
mman::ProtFlags::PROT_READ | mman::ProtFlags::PROT_WRITE,
mman::MapFlags::MAP_PRIVATE | mman::MapFlags::MAP_ANONYMOUS | mman::MapFlags::MAP_STACK,
-1,
0,
)
.map_err(CloneError::StackAllocation)?
};
unsafe {
// Consistent with how pthread_create sets up the stack, we create a
// guard page of 1 page, to protect the child stack collision. Note, for
// clone call, the child stack will grow downward, so the bottom of the
// child stack is in the beginning.
mman::mprotect(child_stack, page_size, mman::ProtFlags::PROT_NONE)
.map_err(CloneError::GuardPage)?;
};
// Since the child stack for clone grows downward, we need to pass in
// the top of the stack address.
let child_stack_top = unsafe { child_stack.add(default_stack_size) };
// Combine the clone flags with exit signals.
let combined_flags = (flags | exit_signal.unwrap_or(0)) as c_int;
// We are passing the boxed closure "cb" into the clone function as the a
// function pointer in C. The box closure in Rust is both a function pointer
// and a struct. However, when casting the box closure into libc::c_void,
// the function pointer will be lost. Therefore, to work around the issue,
// we double box the closure. This is consistent with how std::unix::thread
// handles the closure.
// Ref: https://github.com/rust-lang/rust/blob/master/library/std/src/sys/unix/thread.rs
let data = Box::into_raw(Box::new(cb));
// The main is a wrapper function passed into clone call below. The "data"
// arg is actually a raw pointer to the Box closure. so here, we re-box the
// pointer back into a box closure so the main takes ownership of the
// memory. Then we can call the closure.
extern "C" fn main(data: *mut libc::c_void) -> libc::c_int {
unsafe { Box::from_raw(data as *mut CloneCb)() }
}
// The nix::sched::clone wrapper doesn't provide the right interface. Using
// the clone syscall is one of the rare cases where we don't want rust to
// manage the child stack memory. Instead, we want to use c_void directly
// here. Therefore, here we are using libc::clone syscall directly for
// better control. The child stack will be cleaned when exec is called or
// the child process terminates. The nix wrapper also does not treat the
// closure memory correctly. The wrapper implementation fails to pass the
// right ownership to the new child process.
// Ref: https://github.com/nix-rust/nix/issues/919
// Ref: https://github.com/nix-rust/nix/pull/920
let ret = unsafe {
libc::clone(
main,
child_stack_top,
combined_flags,
data as *mut libc::c_void,
)
};
// After the clone returns, the heap memory associated with the Box closure
// is duplicated in the cloned process. Therefore, we can safely re-box the
// closure from the raw pointer and let rust to continue managing the
// memory. We call drop here explicitly to avoid the warning that the
// closure is not used. This is correct since the closure is called in the
// cloned process, not the parent process.
unsafe { drop(Box::from_raw(data)) };
match ret {
-1 => Err(CloneError::Clone(nix::Error::last())),
pid if ret > 0 => Ok(Pid::from_raw(pid)),
_ => unreachable!("clone returned a negative pid {ret}"),
}
}
#[cfg(test)]
mod test {
use crate::channel::channel;
use super::*;
use anyhow::{bail, Context, Result};
use nix::sys::wait::{waitpid, WaitStatus};
use nix::unistd;
#[test]
fn test_container_fork() -> Result<()> {
let pid = container_clone(Box::new(|| 0))?;
match waitpid(pid, None).expect("wait pid failed.") {
WaitStatus::Exited(p, status) => {
assert_eq!(pid, p);
assert_eq!(status, 0);
Ok(())
}
_ => bail!("test failed"),
}
}
#[test]
fn test_container_err_fork() -> Result<()> {
let pid = container_clone(Box::new(|| -1))?;
match waitpid(pid, None).expect("wait pid failed.") {
WaitStatus::Exited(p, status) => {
assert_eq!(pid, p);
assert_eq!(status, 255);
Ok(())
}
_ => bail!("test failed"),
}
}
#[test]
fn test_container_clone_sibling() -> Result<()> {
// The `container_clone_sibling` will create a sibling process (share
// the same parent) of the calling process. In Unix, a process can only
// wait on the immediate children process and can't wait on the sibling
// process. Therefore, to test the logic, we will have to fork a process
// first and then let the forked process call `container_clone_sibling`.
// Then the testing process (the process where test is called), who are
// the parent to this forked process and the sibling process cloned by
// the `container_clone_sibling`, can wait on both processes.
// We need to use a channel so that the forked process can pass the pid
// of the sibling process to the testing process.
let (sender, receiver) = &mut channel::<i32>()?;
match unsafe { unistd::fork()? } {
unistd::ForkResult::Parent { child } => {
let sibling_process_pid =
Pid::from_raw(receiver.recv().with_context(|| {
"failed to receive the sibling pid from forked process"
})?);
receiver.close()?;
match waitpid(sibling_process_pid, None).expect("wait pid failed.") {
WaitStatus::Exited(p, status) => {
assert_eq!(sibling_process_pid, p);
assert_eq!(status, 0);
}
_ => bail!("failed to wait on the sibling process"),
}
// After sibling process exits, we can wait on the forked process.
match waitpid(child, None).expect("wait pid failed.") {
WaitStatus::Exited(p, status) => {
assert_eq!(child, p);
assert_eq!(status, 0);
}
_ => bail!("failed to wait on the forked process"),
}
}
unistd::ForkResult::Child => {
// Inside the forked process. We call `container_clone` and pass
// the pid to the parent process.
let pid = container_clone_sibling(Box::new(|| 0))?;
sender.send(pid.as_raw())?;
sender.close()?;
std::process::exit(0);
}
};
Ok(())
}
// This test depends on libseccomp to work.
#[cfg(feature = "libseccomp")]
#[test]
fn test_clone_fallback() -> Result<()> {
use crate::test_utils::TestCallbackError;
use oci_spec::runtime::{
Arch, LinuxSeccompAction, LinuxSeccompBuilder, LinuxSyscallBuilder,
};
fn has_clone3() -> bool {
// We use the probe syscall to check if the kernel supports clone3 or
// seccomp has successfully blocked clone3.
let res = unsafe { libc::syscall(libc::SYS_clone3, 0, 0) };
let err = (res == -1)
.then(std::io::Error::last_os_error)
.expect("probe syscall should not succeed");
err.raw_os_error() != Some(libc::ENOSYS)
}
// To test the fallback behavior, we will create a seccomp rule that
// blocks `clone3` as ENOSYS.
let syscall = LinuxSyscallBuilder::default()
.names(vec![String::from("clone3")])
.action(LinuxSeccompAction::ScmpActErrno)
.errno_ret(libc::ENOSYS as u32)
.build()?;
let seccomp_profile = LinuxSeccompBuilder::default()
.default_action(LinuxSeccompAction::ScmpActAllow)
.architectures(vec![Arch::ScmpArchNative])
.syscalls(vec![syscall])
.build()?;
crate::test_utils::test_in_child_process(|| {
// We use seccomp to block `clone3`
let _ = prctl::set_no_new_privileges(true);
crate::seccomp::initialize_seccomp(&seccomp_profile)
.expect("failed to initialize seccomp");
if has_clone3() {
return Err(TestCallbackError::Custom(
"clone3 is not blocked by seccomp".into(),
));
}
let pid = container_clone(Box::new(|| 0)).map_err(|err| err.to_string())?;
match waitpid(pid, None).expect("wait pid failed.") {
WaitStatus::Exited(p, status) => {
assert_eq!(pid, p);
assert_eq!(status, 0);
}
status => {
return Err(TestCallbackError::Custom(format!(
"failed to wait on child process: {:?}",
status
)));
}
};
Ok(())
})?;
Ok(())
}
}