Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pass process user integration test #243

Merged
merged 10 commits into from
Sep 1, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ youki is not at the practical stage yet. However, it is getting closer to practi
| Seccomp | Filtering system calls | WIP on [#25](https://github.com/containers/youki/issues/25) |
| Hooks | Add custom processing during container creation | ✅ |
| Rootless | Running a container without root privileges | It works, but cgroups isn't supported. WIP on [#77](https://github.com/containers/youki/issues/77) |
| OCI Compliance | Compliance with OCI Runtime Spec | 44 out of 55 test cases passing |
| OCI Compliance | Compliance with OCI Runtime Spec | 45 out of 55 test cases passing |

# Design and implementation of youki
![sequence diagram of youki](docs/.drawio.svg)
Expand Down
2 changes: 1 addition & 1 deletion integration_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ test_cases=(
"process_oom_score_adj/process_oom_score_adj.t"
"process_rlimits/process_rlimits.t"
"process_rlimits_fail/process_rlimits_fail.t"
# "process_user/process_user.t"
"process_user/process_user.t"
"root_readonly_true/root_readonly_true.t"
# Record the tests that runc also fails to pass below, maybe we will fix this by origin integration test, issue: https://github.com/containers/youki/issues/56
# "start/start.t"
Expand Down
17 changes: 12 additions & 5 deletions src/container/builder_impl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ impl<'a> ContainerBuilderImpl<'a> {
notify_socket,
preserve_fds: self.preserve_fds,
container: self.container.clone(),
rootless: self.rootless.clone(),
};
let intermediate_pid = fork::container_fork(|| {
// The fds in the pipe is duplicated during fork, so we first close
Expand All @@ -120,7 +121,7 @@ impl<'a> ContainerBuilderImpl<'a> {
.close()
.context("Failed to close unused receiver")?;

init::container_intermidiate(init_args, receiver_from_main, sender_to_main)
init::container_intermediate(init_args, receiver_from_main, sender_to_main)
})?;
// Close down unused fds. The corresponding fds are duplicated to the
// child process during fork.
Expand All @@ -137,7 +138,12 @@ impl<'a> ContainerBuilderImpl<'a> {
if self.rootless.is_some() {
receiver_from_intermediate.wait_for_mapping_request()?;
log::debug!("write mapping for pid {:?}", intermediate_pid);
utils::write_file(format!("/proc/{}/setgroups", intermediate_pid), "deny")?;
let rootless = self.rootless.as_ref().unwrap();
if !rootless.privileged {
// The main process is running as an unprivileged user and cannot write the mapping
// until "deny" has been written to setgroups. See CVE-2014-8989.
utils::write_file(format!("/proc/{}/setgroups", intermediate_pid), "deny")?;
}
rootless::write_uid_mapping(intermediate_pid, self.rootless.as_ref())?;
rootless::write_gid_mapping(intermediate_pid, self.rootless.as_ref())?;
sender_to_intermediate.mapping_written()?;
Expand All @@ -146,10 +152,11 @@ impl<'a> ContainerBuilderImpl<'a> {
let init_pid = receiver_from_intermediate.wait_for_intermediate_ready()?;
log::debug!("init pid is {:?}", init_pid);

cmanager
.add_task(init_pid)
.context("Failed to add tasks to cgroup manager")?;
if self.rootless.is_none() && linux.resources.is_some() && self.init {
cmanager
.add_task(init_pid)
.context("Failed to add tasks to cgroup manager")?;

cmanager
.apply(linux.resources.as_ref().unwrap())
.context("Failed to apply resource limits through cgroup")?;
Expand Down
4 changes: 2 additions & 2 deletions src/container/init_builder.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use anyhow::{bail, Context, Result};
use nix::unistd;
use oci_spec::Spec;
use rootless::detect_rootless;
use rootless::Rootless;
use std::{
fs,
path::{Path, PathBuf},
Expand Down Expand Up @@ -65,7 +65,7 @@ impl InitContainerBuilder {
None
};

let rootless = detect_rootless(&spec)?;
let rootless = Rootless::new(&spec)?;
let mut builder_impl = ContainerBuilderImpl {
init: true,
syscall: self.base.syscall,
Expand Down
4 changes: 2 additions & 2 deletions src/container/tenant_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use std::{
str::FromStr,
};

use crate::{notify_socket::NotifySocket, rootless::detect_rootless, tty, utils};
use crate::{notify_socket::NotifySocket, rootless::Rootless, tty, utils};

use super::{builder::ContainerBuilder, builder_impl::ContainerBuilderImpl, Container};

Expand Down Expand Up @@ -100,7 +100,7 @@ impl TenantContainerBuilder {
let csocketfd = self.setup_tty_socket(&container_dir)?;

let use_systemd = self.should_use_systemd(&container);
let rootless = detect_rootless(&spec)?;
let rootless = Rootless::new(&spec)?;

let mut builder_impl = ContainerBuilderImpl {
init: false,
Expand Down
4 changes: 2 additions & 2 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ use youki::commands::run;
use youki::commands::spec_json;
use youki::commands::start;
use youki::commands::state;
use youki::rootless::should_use_rootless;
use youki::rootless::rootless_required;
use youki::utils::{self, create_dir_all_with_mode};

// High-level commandline option definition
Expand Down Expand Up @@ -118,7 +118,7 @@ fn determine_root_path(root_path: Option<PathBuf>) -> Result<PathBuf> {
return Ok(path);
}

if !should_use_rootless() {
if !rootless_required() {
let default = PathBuf::from("/run/youki");
utils::create_dir_all(&default)?;
return Ok(default);
Expand Down
69 changes: 67 additions & 2 deletions src/process/init.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use nix::{
sys::statfs,
unistd::{self, Gid, Uid},
};
use oci_spec::User;
use oci_spec::{LinuxNamespaceType, Spec};
use std::collections::HashMap;
use std::{
Expand All @@ -15,6 +16,7 @@ use std::{
};
use std::{fs, path::Path, path::PathBuf};

use crate::rootless::Rootless;
use crate::{
capabilities,
container::Container,
Expand Down Expand Up @@ -146,7 +148,7 @@ fn readonly_path(path: &str) -> Result<()> {
Ok(())
}

pub struct ContainerInitArgs {
pub struct ContainerInitArgs<'a> {
/// Flag indicating if an init or a tenant container should be created
pub init: bool,
/// Interface to operating system primitives
Expand All @@ -163,9 +165,11 @@ pub struct ContainerInitArgs {
pub preserve_fds: i32,
/// Container state
pub container: Option<Container>,
/// Options for rootless containers
pub rootless: Option<Rootless<'a>>,
}

pub fn container_intermidiate(
pub fn container_intermediate(
args: ContainerInitArgs,
receiver_from_main: &mut channel::ReceiverFromMain,
sender_to_main: &mut channel::SenderIntermediateToMain,
Expand Down Expand Up @@ -364,6 +368,9 @@ pub fn container_init(
}
};

set_supplementary_gids(&proc.user, &args.rootless)
.context("failed to set supplementary gids")?;

command
.set_id(Uid::from_raw(proc.user.uid), Gid::from_raw(proc.user.gid))
.context("Failed to configure uid and gid")?;
Expand Down Expand Up @@ -456,6 +463,64 @@ pub fn container_init(
unreachable!();
}

// Before 3.19 it was possible for an unprivileged user to enter an user namespace,
// become root and then call setgroups in order to drop membership in supplementary
// groups. This allowed access to files which blocked access based on being a member
// of these groups (see CVE-2014-8989)
//
// This leaves us with three scenarios:
//
// Unprivileged user starting a rootless container: The main process is running as an
// unprivileged user and therefore cannot write the mapping until "deny" has been written
// to /proc/{pid}/setgroups. Once written /proc/{pid}/setgroups cannot be reset and the
// setgroups system call will be disabled for all processes in this user namespace. This
// also means that we should detect if the user is unprivileged and additional gids have
// been specified and bail out early as this can never work. This is not handled here,
// but during the validation for rootless containers.
//
// Privileged user starting a rootless container: It is not necessary to write "deny" to
// /proc/setgroups in order to create the gid mapping and therefore we don't. This means
// that setgroups could be used to drop groups, but this is fine as the user is privileged
// and could do so anyway.
// We already have checked during validation if the specified supplemental groups fall into
// the range that are specified in the gid mapping and bail out early if they do not.
//
// Privileged user starting a normal container: Just add the supplementary groups.
//
fn set_supplementary_gids(user: &User, rootless: &Option<Rootless>) -> Result<()> {
if let Some(additional_gids) = &user.additional_gids {
if additional_gids.is_empty() {
return Ok(());
}

let setgroups =
yihuaf marked this conversation as resolved.
Show resolved Hide resolved
fs::read_to_string("/proc/self/setgroups").context("failed to read setgroups")?;
if setgroups.trim() == "deny" {
bail!("cannot set supplementary gids, setgroup is disabled");
}

let gids: Vec<Gid> = additional_gids
.iter()
.map(|gid| Gid::from_raw(*gid))
.collect();

match rootless {
Some(r) if r.privileged => {
nix::unistd::setgroups(&gids).context("failed to set supplementary gids")?;
}
None => {
nix::unistd::setgroups(&gids).context("failed to set supplementary gids")?;
}
// this should have been detected during validation
_ => unreachable!(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Under what circumstance where is branch get triggered? Isn't match option can only be either Some or None?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Some has a match guard, so this is only triggered when the user is not privileged. Ideally I could write
None | Some(r) if r. privileged
but Rust does not allow that (yet).

"unprivileged users cannot set supplementary gids in rootless container"
),
}
}

Ok(())
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
40 changes: 22 additions & 18 deletions src/rootfs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use nix::mount::mount as nix_mount;
use nix::mount::MsFlags;
use nix::sys::stat::Mode;
use nix::sys::stat::{mknod, umask};
use nix::unistd::{chdir, chown, close, getcwd};
use nix::unistd::{chown, close};
use nix::unistd::{Gid, Uid};
use oci_spec::{LinuxDevice, LinuxDeviceType, Mount, Spec};
use std::fs::OpenOptions;
Expand Down Expand Up @@ -68,17 +68,18 @@ pub fn prepare_rootfs(spec: &Spec, rootfs: &Path, bind_devices: bool) -> Result<
}
}

let olddir = getcwd()?;
chdir(rootfs)?;
setup_default_symlinks(rootfs).context("Failed to setup default symlinks")?;
if let Some(added_devices) = linux.devices.as_ref() {
create_devices(default_devices().iter().chain(added_devices), bind_devices)
create_devices(
rootfs,
default_devices().iter().chain(added_devices),
bind_devices,
)
} else {
create_devices(default_devices().iter(), bind_devices)
create_devices(rootfs, default_devices().iter(), bind_devices)
}?;
setup_ptmx(rootfs)?;
chdir(&olddir)?;

setup_ptmx(rootfs)?;
Ok(())
}

Expand All @@ -89,8 +90,7 @@ fn setup_ptmx(rootfs: &Path) -> Result<()> {
}
}

symlink("pts/ptmx", "dev/ptmx").context("Failed to symlink ptmx")?;

symlink("pts/ptmx", rootfs.join("dev/ptmx")).context("failed to symlink ptmx")?;
Ok(())
}

Expand Down Expand Up @@ -171,7 +171,7 @@ pub fn default_devices() -> Vec<LinuxDevice> {
]
}

fn create_devices<'a, I>(devices: I, bind: bool) -> Result<()>
fn create_devices<'a, I>(rootfs: &Path, devices: I, bind: bool) -> Result<()>
where
I: Iterator<Item = &'a LinuxDevice>,
{
Expand All @@ -183,7 +183,7 @@ where
panic!("{} is not a valid device path", dev.path.display());
}

bind_dev(dev)
bind_dev(rootfs, dev)
})
.collect::<Result<Vec<_>>>()?;
} else {
Expand All @@ -193,7 +193,7 @@ where
panic!("{} is not a valid device path", dev.path.display());
}

mknod_dev(dev)
mknod_dev(rootfs, dev)
})
.collect::<Result<Vec<_>>>()?;
}
Expand All @@ -202,15 +202,17 @@ where
Ok(())
}

fn bind_dev(dev: &LinuxDevice) -> Result<()> {
fn bind_dev(rootfs: &Path, dev: &LinuxDevice) -> Result<()> {
let full_container_path = rootfs.join(dev.path.as_in_container()?);

let fd = open(
&dev.path.as_in_container()?,
&full_container_path,
OFlag::O_RDWR | OFlag::O_CREAT,
Mode::from_bits_truncate(0o644),
)?;
close(fd)?;
nix_mount(
Some(&*dev.path.as_in_container()?),
Some(&full_container_path),
&dev.path,
None::<&str>,
MsFlags::MS_BIND,
Expand All @@ -220,21 +222,23 @@ fn bind_dev(dev: &LinuxDevice) -> Result<()> {
Ok(())
}

fn mknod_dev(dev: &LinuxDevice) -> Result<()> {
fn mknod_dev(rootfs: &Path, dev: &LinuxDevice) -> Result<()> {
fn makedev(major: i64, minor: i64) -> u64 {
((minor & 0xff)
| ((major & 0xfff) << 8)
| ((minor & !0xff) << 12)
| ((major & !0xfff) << 32)) as u64
}

let full_container_path = rootfs.join(dev.path.as_in_container()?);
mknod(
&dev.path.as_in_container()?,
&full_container_path,
dev.typ.to_sflag()?,
Mode::from_bits_truncate(dev.file_mode.unwrap_or(0)),
makedev(dev.major, dev.minor),
)?;
chown(
&dev.path.as_in_container()?,
&full_container_path,
dev.uid.map(Uid::from_raw),
dev.gid.map(Gid::from_raw),
)?;
Expand Down
Loading