Skip to content

Commit

Permalink
Put clone3 behind a feature
Browse files Browse the repository at this point in the history
Signed-off-by: Jorge Prendes <[email protected]>
  • Loading branch information
YJDoc2 authored and jprendes committed Oct 9, 2023
1 parent a7b6d27 commit 56202d8
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 63 deletions.
1 change: 1 addition & 0 deletions crates/libcontainer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ systemd = ["libcgroups/systemd", "v2"]
v2 = ["libcgroups/v2"]
v1 = ["libcgroups/v1"]
cgroupsv2_devices = ["libcgroups/cgroupsv2_devices"]
no-clone3 = []

[dependencies]
bitflags = "2.4.0"
Expand Down
106 changes: 43 additions & 63 deletions crates/libcontainer/src/process/fork.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,5 @@
use std::{ffi::c_int, num::NonZeroUsize};

use libc::SIGCHLD;
use nix::{
sys::{mman, resource},
unistd::Pid,
};
use nix::unistd::Pid;

#[derive(Debug, thiserror::Error)]
pub enum CloneError {
Expand All @@ -25,15 +20,13 @@ pub enum CloneError {
}

/// The callback function used in clone system call. The return value is i32
/// which is consistent with C functions return code. The trait has to be
/// `FnMut` because we need to be able to call the closure multiple times, once
/// in clone3 and once in clone if fallback is required. The closure is boxed
/// which is consistent with C functions return code. The closure is boxed
/// because we need to store the closure on heap, not stack in the case of
/// `clone`. Unlike `fork` or `clone3`, the `clone` glibc wrapper requires us to
/// pass in a child stack, which is empty. By storing the closure in heap, we
/// can then in the new process to re-box the heap memory back to a closure
/// correctly.
pub type CloneCb = Box<dyn FnMut() -> i32>;
pub type CloneCb = Box<dyn FnOnce() -> i32>;

// Clone a sibling process that shares the same parent as the calling
// process. This is used to launch the container init process so the parent
Expand All @@ -48,39 +41,26 @@ pub fn container_clone_sibling(cb: CloneCb) -> Result<Pid, CloneError> {
// The older `clone` will not return EINVAL in this case. Instead it ignores
// the exit signal bits in the glibc wrapper. Therefore, we explicitly set
// the exit_signal to None here, so this works for both version of clone.
clone_internal(cb, libc::CLONE_PARENT as u64, None)
clone_internal(cb, libc::CLONE_PARENT as u64, 0)
}

// Clone a child process and execute the callback.
pub fn container_clone(cb: CloneCb) -> Result<Pid, CloneError> {
clone_internal(cb, 0, Some(SIGCHLD as u64))
clone_internal(cb, 0, SIGCHLD as u64)
}

// An internal wrapper to manage the clone3 vs clone fallback logic.
fn clone_internal(
mut cb: CloneCb,
flags: u64,
exit_signal: Option<u64>,
) -> Result<Pid, CloneError> {
match clone3(&mut cb, flags, exit_signal) {
Ok(pid) => Ok(pid),
// For now, we decide to only fallback on ENOSYS
Err(CloneError::Clone(nix::Error::ENOSYS)) => {
tracing::debug!("clone3 is not supported, fallback to clone");
let pid = clone(cb, flags, exit_signal)?;

Ok(pid)
}
Err(err) => Err(err),
}
#[cfg(feature = "no-clone3")]
// We should not use clone3 as required by the no-clone3 feature. Call clone directly.
fn clone_internal(cb: CloneCb, flags: u64, exit_signal: u64) -> Result<Pid, CloneError> {
clone(cb, flags, exit_signal)
}

// Unlike the clone call, clone3 is currently using the kernel syscall, mimicking
// the interface of fork. There is not need to explicitly manage the memory, so
// we can safely passing the callback closure as reference.
fn clone3(cb: &mut CloneCb, flags: u64, exit_signal: Option<u64>) -> Result<Pid, CloneError> {
#[cfg(not(feature = "no-clone3"))]
// Try to use clone3, and fallback to clone in case of ENOSYS.
// Unlike the clone call, clone3 is currently using the kernel syscall.
fn clone_internal(cb: CloneCb, flags: u64, exit_signal: u64) -> Result<Pid, CloneError> {
#[repr(C)]
struct clone3_args {
struct Clone3Args {
flags: u64,
pidfd: u64,
child_tid: u64,
Expand All @@ -93,28 +73,30 @@ fn clone3(cb: &mut CloneCb, flags: u64, exit_signal: Option<u64>) -> Result<Pid,
set_tid_size: u64,
cgroup: u64,
}
let mut args = clone3_args {
let mut args = Clone3Args {
flags,
pidfd: 0,
child_tid: 0,
parent_tid: 0,
exit_signal: exit_signal.unwrap_or(0),
exit_signal,
stack: 0,
stack_size: 0,
tls: 0,
set_tid: 0,
set_tid_size: 0,
cgroup: 0,
};
let args_ptr = &mut args as *mut clone3_args;
let args_size = std::mem::size_of::<clone3_args>();
let args_ptr = &mut args as *mut Clone3Args;
let args_size = std::mem::size_of::<Clone3Args>();
// For now, we can only use clone3 as a kernel syscall. Libc wrapper is not
// available yet. This can have undefined behavior because libc authors do
// not like people calling kernel syscall to directly create processes. Libc
// does perform additional bookkeeping when calling clone or fork. So far,
// we have not observed any issues with calling clone3 directly, but we
// should keep an eye on it.
// does perform additional bookkeeping when calling clone or fork.
// So far we've observed issues when the main process uses threads. In such
// cases enable the `no-clone3` feature.
// See https://github.com/containerd/runwasi/issues/347
match unsafe { libc::syscall(libc::SYS_clone3, args_ptr, args_size) } {
-1 if nix::Error::last() == nix::Error::ENOSYS => clone(cb, flags, exit_signal),
-1 => Err(CloneError::Clone(nix::Error::last())),
0 => {
// Inside the cloned process, we execute the callback and exit with
Expand All @@ -126,7 +108,10 @@ fn clone3(cb: &mut CloneCb, flags: u64, exit_signal: Option<u64>) -> Result<Pid,
}
}

fn clone(cb: CloneCb, flags: u64, exit_signal: Option<u64>) -> Result<Pid, CloneError> {
fn clone(cb: CloneCb, flags: u64, exit_signal: u64) -> Result<Pid, CloneError> {
use nix::sys::{mman, resource};
use std::{ffi::c_int, num::NonZeroUsize};

const DEFAULT_STACK_SIZE: usize = 8 * 1024 * 1024; // 8M
const DEFAULT_PAGE_SIZE: usize = 4 * 1024; // 4K

Expand All @@ -140,33 +125,28 @@ fn clone(cb: CloneCb, flags: u64, exit_signal: Option<u64>) -> Result<Pid, Clone
// Find out the default stack max size through getrlimit.
let (rlim_cur, _) =
resource::getrlimit(resource::Resource::RLIMIT_STACK).map_err(CloneError::ResourceLimit)?;
// mmap will return ENOMEM if stack size is unlimited when we create the
// child stack, so we need to set a reasonable default stack size.
let default_stack_size = if rlim_cur != u64::MAX {
rlim_cur as usize
} else {
tracing::debug!(
"stack size returned by getrlimit() is unlimited, use DEFAULT_STACK_SIZE(8MB)"
);
DEFAULT_STACK_SIZE
};

// RLIMIT_STACK is an upper bound but it might be set unreasonably high or
// even unlimited, which may cause mmap to error with ENOMEM.
// Most linux system today use a stack size of 8MB.
// Use the most constraining between RLIMIT_STACK or 8MB for the stack size.
let stack_size = rlim_cur.min(DEFAULT_STACK_SIZE as u64) as usize;

// Using the clone syscall requires us to create the stack space for the
// child process instead of taken cared for us like fork call. We use mmap
// here to create the stack. Instead of guessing how much space the child
// process needs, we allocate through mmap to the system default limit,
// which is 8MB on most of the linux system today. This is OK since mmap
// will only reserve the address space upfront, instead of allocating
// physical memory upfront. The stack will grow as needed, up to the size
// reserved, so no wasted memory here. Lastly, the child stack only needs
// to support the container init process set up code in Youki. When Youki
// calls exec into the container payload, exec will reset the stack. Note,
// do not use MAP_GROWSDOWN since it is not well supported.
// process needs, we allocate through mmap. This is OK since mmap will only
// reserve the address space upfront, instead of allocating physical memory
// upfront. The stack will grow as needed, up to the size reserved, so no
// wasted memory here. Lastly, the child stack only needs to support the
// container init process set up code in Youki. When Youki calls exec into
// the container payload, exec will reset the stack.
// Note: do not use MAP_GROWSDOWN since it is not well supported.
// Ref: https://man7.org/linux/man-pages/man2/mmap.2.html
let child_stack = unsafe {
mman::mmap(
None,
NonZeroUsize::new(default_stack_size).ok_or(CloneError::ZeroStackSize)?,
NonZeroUsize::new(stack_size).ok_or(CloneError::ZeroStackSize)?,
mman::ProtFlags::PROT_READ | mman::ProtFlags::PROT_WRITE,
mman::MapFlags::MAP_PRIVATE | mman::MapFlags::MAP_ANONYMOUS | mman::MapFlags::MAP_STACK,
-1,
Expand All @@ -185,10 +165,10 @@ fn clone(cb: CloneCb, flags: u64, exit_signal: Option<u64>) -> Result<Pid, Clone

// Since the child stack for clone grows downward, we need to pass in
// the top of the stack address.
let child_stack_top = unsafe { child_stack.add(default_stack_size) };
let child_stack_top = unsafe { child_stack.add(stack_size) };

// Combine the clone flags with exit signals.
let combined_flags = (flags | exit_signal.unwrap_or(0)) as c_int;
let combined_flags = (flags | exit_signal) as c_int;

// We are passing the boxed closure "cb" into the clone function as the a
// function pointer in C. The box closure in Rust is both a function pointer
Expand Down

0 comments on commit 56202d8

Please sign in to comment.