From fdb2ff6a50f93f4fdab2743354c81852a9850048 Mon Sep 17 00:00:00 2001 From: wackxu Date: Mon, 5 Aug 2024 22:06:18 +0800 Subject: [PATCH] sandbox: support shared process namespace Signed-off-by: wackxu --- vmm/common/src/lib.rs | 1 + vmm/sandbox/src/cloud_hypervisor/factory.rs | 5 +- vmm/sandbox/src/container/handler/ns.rs | 19 +++- vmm/sandbox/src/sandbox.rs | 16 +++ vmm/task/src/config.rs | 4 + vmm/task/src/debug.rs | 1 + vmm/task/src/main.rs | 120 +++++++++++++------- 7 files changed, 118 insertions(+), 48 deletions(-) diff --git a/vmm/common/src/lib.rs b/vmm/common/src/lib.rs index a3a909fb..498f90c3 100644 --- a/vmm/common/src/lib.rs +++ b/vmm/common/src/lib.rs @@ -37,5 +37,6 @@ pub const RESOLV_FILENAME: &str = "resolv.conf"; pub const SANDBOX_NS_PATH: &str = "/run/sandbox-ns"; pub const NET_NAMESPACE: &str = "network"; pub const IPC_NAMESPACE: &str = "ipc"; +pub const PID_NAMESPACE: &str = "pid"; pub const UTS_NAMESPACE: &str = "uts"; pub const CGROUP_NAMESPACE: &str = "cgroup"; diff --git a/vmm/sandbox/src/cloud_hypervisor/factory.rs b/vmm/sandbox/src/cloud_hypervisor/factory.rs index 890ba030..45133d1b 100644 --- a/vmm/sandbox/src/cloud_hypervisor/factory.rs +++ b/vmm/sandbox/src/cloud_hypervisor/factory.rs @@ -22,6 +22,7 @@ use crate::{ devices::{console::Console, fs::Fs, pmem::Pmem, rng::Rng, vsock::Vsock}, CloudHypervisorVM, }, + sandbox::has_shared_pid_namespace, utils::get_netns, vm::VMFactory, }; @@ -46,7 +47,9 @@ impl VMFactory for CloudHypervisorVMFactory { ) -> containerd_sandbox::error::Result { let netns = get_netns(&s.sandbox); let mut vm = CloudHypervisorVM::new(id, &netns, &s.base_dir, &self.vm_config); - + if has_shared_pid_namespace(&s.sandbox) { + vm.config.cmdline.push_str(" task.share_pidns") + } // add image as a disk if !self.vm_config.common.image_path.is_empty() { let rootfs_device = Pmem::new("rootfs", &self.vm_config.common.image_path, true); diff --git a/vmm/sandbox/src/container/handler/ns.rs 
b/vmm/sandbox/src/container/handler/ns.rs index a2d3cb10..7ef83f3e 100644 --- a/vmm/sandbox/src/container/handler/ns.rs +++ b/vmm/sandbox/src/container/handler/ns.rs @@ -16,9 +16,15 @@ limitations under the License. use async_trait::async_trait; use containerd_sandbox::error::Result; -use vmm_common::{CGROUP_NAMESPACE, IPC_NAMESPACE, NET_NAMESPACE, SANDBOX_NS_PATH, UTS_NAMESPACE}; +use vmm_common::{ + CGROUP_NAMESPACE, IPC_NAMESPACE, NET_NAMESPACE, PID_NAMESPACE, SANDBOX_NS_PATH, UTS_NAMESPACE, +}; -use crate::{container::handler::Handler, sandbox::KuasarSandbox, vm::VM}; +use crate::{ + container::handler::Handler, + sandbox::{has_shared_pid_namespace, KuasarSandbox}, + vm::VM, +}; pub struct NamespaceHandler { container_id: String, @@ -38,6 +44,7 @@ where T: VM + Sync + Send, { async fn handle(&self, sandbox: &mut KuasarSandbox) -> Result<()> { + let share_pidns = has_shared_pid_namespace(&sandbox.data); let container = sandbox.container_mut(&self.container_id)?; let spec = if let Some(s) = &mut container.data.spec { s @@ -47,8 +54,14 @@ where if let Some(l) = spec.linux.as_mut() { l.namespaces .retain(|n| n.r#type != NET_NAMESPACE && n.r#type != CGROUP_NAMESPACE); + l.namespaces.iter_mut().for_each(|n| { - n.path = if n.r#type == IPC_NAMESPACE || n.r#type == UTS_NAMESPACE { + // IPC and UTS namespaces are shared by default + // The PID namespace is shared only if it is set in the pod config + n.path = if n.r#type == IPC_NAMESPACE + || n.r#type == UTS_NAMESPACE + || (n.r#type == PID_NAMESPACE && share_pidns) + { format!("{}/{}", SANDBOX_NS_PATH, n.r#type) } else { "".to_string() diff --git a/vmm/sandbox/src/sandbox.rs b/vmm/sandbox/src/sandbox.rs index 41c5af30..6e0be2be 100644 --- a/vmm/sandbox/src/sandbox.rs +++ b/vmm/sandbox/src/sandbox.rs @@ -19,6 +19,7 @@ use std::{collections::HashMap, io::ErrorKind, path::Path, sync::Arc}; use anyhow::anyhow; use async_trait::async_trait; use containerd_sandbox::{ + cri::api::v1::NamespaceMode, data::SandboxData, error::{Error, Result}, 
signal::ExitSignal, @@ -667,6 +668,21 @@ fn parse_dnsoptions(servers: &[String], searches: &[String], options: &[String]) resolv_content } +pub fn has_shared_pid_namespace(data: &SandboxData) -> bool { + if let Some(conf) = &data.config { + if let Some(pid_ns_mode) = conf + .linux + .as_ref() + .and_then(|l| l.security_context.as_ref()) + .and_then(|s| s.namespace_options.as_ref()) + .map(|n| n.pid()) + { + return pid_ns_mode == NamespaceMode::Pod; + } + } + false +} + #[derive(Default, Debug, Deserialize)] pub struct SandboxConfig { #[serde(default)] diff --git a/vmm/task/src/config.rs b/vmm/task/src/config.rs index c614c7f3..f8937e3b 100644 --- a/vmm/task/src/config.rs +++ b/vmm/task/src/config.rs @@ -20,6 +20,7 @@ use tokio::fs::read_to_string; const SHAREFS_TYPE: &str = "task.sharefs_type"; const LOG_LEVEL: &str = "task.log_level"; const TASK_DEBUG: &str = "task.debug"; +const SHARE_PIDNS: &str = "task.share_pidns"; macro_rules! parse_cmdline { ($param:ident, $key:ident, $field:expr) => { @@ -41,6 +42,7 @@ macro_rules! 
parse_cmdline { pub struct TaskConfig { pub(crate) sharefs_type: String, pub(crate) log_level: String, + pub(crate) share_pidns: bool, pub(crate) debug: bool, } @@ -49,6 +51,7 @@ impl Default for TaskConfig { TaskConfig { sharefs_type: "9p".to_string(), log_level: "info".to_string(), + share_pidns: false, debug: false, } } @@ -66,6 +69,7 @@ impl TaskConfig { parse_cmdline!(param, SHAREFS_TYPE, config.sharefs_type, String::from); parse_cmdline!(param, LOG_LEVEL, config.log_level, String::from); parse_cmdline!(param, TASK_DEBUG, config.debug); + parse_cmdline!(param, SHARE_PIDNS, config.share_pidns); } Ok(config) } diff --git a/vmm/task/src/debug.rs b/vmm/task/src/debug.rs index 965b5233..094db50c 100644 --- a/vmm/task/src/debug.rs +++ b/vmm/task/src/debug.rs @@ -37,6 +37,7 @@ pub async fn listen_debug_console(addr: &str) -> Result<()> { tokio::spawn(async move { let mut incoming = l.incoming(); while let Some(Ok(s)) = incoming.next().await { + debug!("get a debug console request"); if let Err(e) = debug_console(s).await { error!("failed to open debug console {:?}", e); } diff --git a/vmm/task/src/main.rs b/vmm/task/src/main.rs index 0df94326..4c4c80d1 100644 --- a/vmm/task/src/main.rs +++ b/vmm/task/src/main.rs @@ -17,14 +17,14 @@ limitations under the License. 
#![warn(clippy::expect_fun_call, clippy::expect_used)] use std::{ - collections::HashMap, convert::TryFrom, path::Path, process::exit, str::FromStr, sync::Arc, - thread, + collections::HashMap, convert::TryFrom, os::fd::AsRawFd, path::Path, process::exit, + str::FromStr, sync::Arc, }; use containerd_shim::{ asynchronous::{monitor::monitor_notify_by_pid, util::asyncify}, error::Error, - io_error, other, + io_error, other, other_error, protos::{shim::shim_ttrpc_async::create_task, ttrpc::asynchronous::Server}, util::{mkdir, IntoOption}, Result, @@ -35,17 +35,15 @@ use log::{debug, error, info, warn, LevelFilter}; use nix::{ errno::Errno, sched::{unshare, CloneFlags}, - sys::{ - wait, - wait::{WaitPidFlag, WaitStatus}, - }, - unistd::{getpid, gettid, Pid}, + sys::wait::{self, WaitPidFlag, WaitStatus}, + unistd::{fork, getpid, pause, pipe, ForkResult, Pid}, }; use signal_hook_tokio::Signals; use tokio::fs::File; use vmm_common::{ api::sandbox_ttrpc::create_sandbox_service, mount::mount, ETC_RESOLV, HOSTNAME_FILENAME, - IPC_NAMESPACE, KUASAR_STATE_DIR, RESOLV_FILENAME, SANDBOX_NS_PATH, UTS_NAMESPACE, + IPC_NAMESPACE, KUASAR_STATE_DIR, PID_NAMESPACE, RESOLV_FILENAME, SANDBOX_NS_PATH, + UTS_NAMESPACE, }; use crate::{ @@ -137,6 +135,7 @@ lazy_static! { static ref CLONE_FLAG_TABLE: HashMap<String, CloneFlags> = HashMap::from([ (String::from(IPC_NAMESPACE), CloneFlags::CLONE_NEWIPC), (String::from(UTS_NAMESPACE), CloneFlags::CLONE_NEWUTS), + (String::from(PID_NAMESPACE), CloneFlags::CLONE_NEWPID), ]); } @@ -170,7 +169,7 @@ async fn start_task_server() -> anyhow::Result<()> { } } - late_init_call().await?; + late_init_call(&config).await?; start_ttrpc_server().await?.start().await?; @@ -305,7 +304,7 @@ async fn init_vm_rootfs() -> Result<()> { // Continue to do initialization that depend on shared path. // such as adding guest hook, preparing sandbox files and namespaces. 
-async fn late_init_call() -> Result<()> { +async fn late_init_call(config: &TaskConfig) -> Result<()> { // Setup DNS, bind mount to /etc/resolv.conf let dns_file = Path::new(KUASAR_STATE_DIR).join(RESOLV_FILENAME); if dns_file.exists() { @@ -321,7 +320,7 @@ async fn late_init_call() -> Result<()> { } // Setup sandbox namespace - setup_sandbox_ns().await?; + setup_sandbox_ns(config.share_pidns).await?; Ok(()) } @@ -368,12 +367,12 @@ async fn start_ttrpc_server() -> anyhow::Result { .register_service(sandbox_service)) } -async fn setup_sandbox_ns() -> Result<()> { - setup_persistent_ns(vec![ - String::from(IPC_NAMESPACE), - String::from(UTS_NAMESPACE), - ]) - .await?; +async fn setup_sandbox_ns(share_pidns: bool) -> Result<()> { + let mut nss = vec![String::from(IPC_NAMESPACE), String::from(UTS_NAMESPACE)]; + if share_pidns { + nss.push(String::from(PID_NAMESPACE)); + } + setup_persistent_ns(nss).await?; Ok(()) } @@ -398,36 +397,69 @@ async fn setup_persistent_ns(ns_types: Vec<String>) -> Result<()> { .ok_or(other!("bad ns type {}", ns_type))?; } - let operator = move || -> anyhow::Result<()> { - unshare(clone_type)?; + fork_sandbox(ns_types, clone_type)?; + + Ok(()) +} + +fn fork_sandbox(ns_types: Vec<String>, clone_type: CloneFlags) -> Result<()> { + debug!("fork sandbox process {:?}, {:b}", ns_types, clone_type); + let (r, w) = pipe().map_err(other_error!(e, "create pipe when fork sandbox error"))?; + match unsafe { fork().map_err(other_error!(e, "failed to fork"))? 
} { + ForkResult::Parent { child } => { + debug!("forked process {} for the sandbox", child); + drop(w); + let mut resp = [0u8; 4]; + // just wait for the pipe to close; the read result does not matter + nix::unistd::read(r.as_raw_fd(), &mut resp).unwrap_or_default(); + Ok(()) + } + ForkResult::Child => { + drop(r); + unshare(clone_type).unwrap(); + if !ns_types.iter().any(|n| n == PID_NAMESPACE) { + debug!("mount namespaces in child"); + mount_ns(getpid(), &ns_types); + exit(0); + } + // if we need to share the pid ns, we fork a pause process to act as pid 1 of the shared pid ns + match unsafe { fork().unwrap() } { + ForkResult::Parent { child } => { + mount_ns(child, &ns_types); + exit(0); + } + ForkResult::Child => { + debug!("mount namespaces in grand child"); + drop(w); + loop { + pause(); + } + } + } + } + } +} - // set hostname +fn mount_ns(pid: Pid, ns_types: &Vec<String>) { + if ns_types.iter().any(|n| n == UTS_NAMESPACE) { let hostname = std::fs::read_to_string(Path::new(KUASAR_STATE_DIR).join(HOSTNAME_FILENAME)) .map(|s| s.trim().to_string()) .unwrap_or_default(); if !hostname.is_empty() { - nix::unistd::sethostname(hostname)?; - } - - for ns_type in &ns_types { - let sandbox_ns_path = format!("{}/{}", SANDBOX_NS_PATH, ns_type); - let ns_path = format!("/proc/{}/task/{}/ns/{}", getpid(), gettid(), ns_type); - mount( - Some("none"), - Some(ns_path.as_str()), - &["bind".to_string()], - &sandbox_ns_path, - )?; - } - Ok(()) - }; - - thread::spawn(move || { - if let Err(e) = operator() { - error!("setup persistent ns failed: {:?}", e); - exit(-1) + debug!("set hostname for sandbox: {}", hostname); + nix::unistd::sethostname(hostname).unwrap(); } - }); - - Ok(()) + } + for ns_type in ns_types { + let sandbox_ns_path = format!("{}/{}", SANDBOX_NS_PATH, ns_type); + let ns_path = format!("/proc/{}/ns/{}", pid, ns_type); + debug!("mount {} to {}", ns_path, sandbox_ns_path); + mount( + Some("none"), + Some(ns_path.as_str()), + &["bind".to_string()], + &sandbox_ns_path, + ) + .unwrap(); 
+ } }