From 7a0a8f3ac576cef38c30fad9b0a4694730602c07 Mon Sep 17 00:00:00 2001 From: Aidan Hobson Sayers Date: Sat, 24 Aug 2024 16:34:17 +0100 Subject: [PATCH 01/11] Move cgroup config into youki config Signed-off-by: Aidan Hobson Sayers --- crates/libcgroups/src/common.rs | 3 +- crates/libcontainer/src/config.rs | 26 +++++------ .../src/container/builder_impl.rs | 45 ++++++------------- .../libcontainer/src/container/container.rs | 9 ---- .../src/container/container_delete.rs | 15 ++----- .../src/container/container_events.rs | 6 +-- .../src/container/container_kill.rs | 12 +---- .../src/container/container_pause.rs | 6 +-- .../src/container/container_resume.rs | 6 +-- .../src/container/container_start.rs | 7 ++- .../src/container/init_builder.rs | 19 +++++--- crates/libcontainer/src/container/state.rs | 3 -- .../src/container/tenant_builder.rs | 10 +---- crates/libcontainer/src/hooks.rs | 6 +-- crates/libcontainer/src/process/args.rs | 7 ++- .../src/process/container_init_process.rs | 5 +-- .../process/container_intermediate_process.rs | 2 +- .../src/process/container_main_process.rs | 21 +++++---- crates/youki/src/commands/mod.rs | 8 +--- 19 files changed, 71 insertions(+), 145 deletions(-) diff --git a/crates/libcgroups/src/common.rs b/crates/libcgroups/src/common.rs index 76bcc14f0..695af77b9 100644 --- a/crates/libcgroups/src/common.rs +++ b/crates/libcgroups/src/common.rs @@ -11,6 +11,7 @@ use oci_spec::runtime::LinuxResources; use oci_spec::runtime::{ LinuxDevice, LinuxDeviceBuilder, LinuxDeviceCgroup, LinuxDeviceCgroupBuilder, LinuxDeviceType, }; +use serde::{Deserialize, Serialize}; use super::stats::Stats; use super::{systemd, v1, v2}; @@ -326,7 +327,7 @@ pub enum CreateCgroupSetupError { Systemd(#[from] systemd::manager::SystemdManagerError), } -#[derive(Clone)] +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] pub struct CgroupConfig { pub cgroup_path: PathBuf, pub systemd_cgroup: bool, diff --git a/crates/libcontainer/src/config.rs b/crates/libcontainer/src/config.rs index 37f1508f0..231f703b1 100644 --- a/crates/libcontainer/src/config.rs +++ b/crates/libcontainer/src/config.rs @@ -5,8 +5,6 @@ use std::path::{Path, PathBuf}; use oci_spec::runtime::{Hooks, Spec}; use serde::{Deserialize, Serialize}; -use crate::utils; - #[derive(Debug, thiserror::Error)] pub enum ConfigError { #[error("failed to save config")] @@ -43,20 +41,14 @@ const YOUKI_CONFIG_NAME: &str = "youki_config.json"; #[non_exhaustive] pub struct YoukiConfig { pub hooks: Option, - pub cgroup_path: PathBuf, + pub cgroup_config: libcgroups::common::CgroupConfig, } impl<'a> YoukiConfig { - pub fn from_spec(spec: &'a Spec, container_id: &str) -> Result { + pub fn from_spec(spec: &'a Spec, cgroup_config: libcgroups::common::CgroupConfig) -> Result { Ok(YoukiConfig { hooks: spec.hooks().clone(), - cgroup_path: utils::get_cgroup_path( - spec.linux() - .as_ref() - .ok_or(ConfigError::MissingLinux)? - .cgroups_path(), - container_id, - ), + cgroup_config, }) } @@ -106,13 +98,15 @@ mod tests { fn test_config_from_spec() -> Result<()> { let container_id = "sample"; let spec = Spec::default(); + let cgroup_config = libcgroups::common::CgroupConfig { + cgroup_path: PathBuf::from(format!(":youki:{container_id}")), + systemd_cgroup: true, + container_name: container_id.to_owned(), + }; let config = YoukiConfig::from_spec(&spec, container_id)?; assert_eq!(&config.hooks, spec.hooks()); - dbg!(&config.cgroup_path); - assert_eq!( - config.cgroup_path, - PathBuf::from(format!(":youki:{container_id}")) - ); + dbg!(&config.cgroup_config); + assert_eq!(config.cgroup_config, cgroup_config); Ok(()) } diff --git a/crates/libcontainer/src/container/builder_impl.rs b/crates/libcontainer/src/container/builder_impl.rs index ed2cd07dd..7ca881e53 100644 --- a/crates/libcontainer/src/container/builder_impl.rs +++ b/crates/libcontainer/src/container/builder_impl.rs @@ -8,7 +8,7 @@ use libcgroups::common::CgroupManager; use nix::unistd::Pid; use oci_spec::runtime::Spec; -use super::{Container, ContainerStatus}; +use super::ContainerStatus; use crate::error::{LibcontainerError, MissingSpecError}; use crate::notify_socket::NotifyListener; use crate::process::args::{ContainerArgs, ContainerType}; @@ -17,17 +17,15 @@ use crate::process::{self}; use crate::syscall::syscall::SyscallType; use crate::user_ns::UserNamespaceConfig; use crate::workload::Executor; -use crate::{hooks, utils}; +use crate::hooks; pub(super) struct ContainerBuilderImpl { /// Flag indicating if an init or a tenant container should be created pub container_type: ContainerType, /// Interface to operating system primitives pub syscall: SyscallType, - /// Flag indicating if systemd should be used for cgroup management - pub use_systemd: bool, - /// Id of the container - pub container_id: String, + /// Interface to operating system primitives + pub cgroup_config: libcgroups::common::CgroupConfig, /// OCI compliant runtime spec pub spec: Rc, /// Root filesystem of the container @@ -41,8 +39,6 @@ pub(super) struct ContainerBuilderImpl { pub user_ns_config: Option, /// Path to the Unix Domain Socket to communicate container start pub notify_path: PathBuf, - /// Container state - pub container: Option, /// File descriptos preserved/passed to the container init process. pub preserve_fds: i32, /// If the container is to be run in detached mode @@ -58,7 +54,7 @@ impl ContainerBuilderImpl { Err(outer) => { // Only the init container should be cleaned up in the case of // an error. - if matches!(self.container_type, ContainerType::InitContainer) { + if matches!(self.container_type, ContainerType::InitContainer { .. }) { self.cleanup_container()?; } @@ -68,24 +64,17 @@ impl ContainerBuilderImpl { } fn run_container(&mut self) -> Result { - let linux = self.spec.linux().as_ref().ok_or(MissingSpecError::Linux)?; - let cgroups_path = utils::get_cgroup_path(linux.cgroups_path(), &self.container_id); - let cgroup_config = libcgroups::common::CgroupConfig { - cgroup_path: cgroups_path, - systemd_cgroup: self.use_systemd || self.user_ns_config.is_some(), - container_name: self.container_id.to_owned(), - }; let process = self .spec .process() .as_ref() .ok_or(MissingSpecError::Process)?; - if matches!(self.container_type, ContainerType::InitContainer) { + if let ContainerType::InitContainer { container } = &self.container_type { if let Some(hooks) = self.spec.hooks() { hooks::run_hooks( hooks.create_runtime().as_ref(), - self.container.as_ref(), + container, None, )? } @@ -129,6 +118,7 @@ impl ContainerBuilderImpl { // going to be switching to a different security context. Thus setting // ourselves to be non-dumpable only breaks things (like rootless // containers), which is the recommendation from the kernel folks. + let linux = self.spec.linux().as_ref().ok_or(MissingSpecError::Linux)?; if linux.namespaces().is_some() { prctl::set_dumpable(false).map_err(|e| { LibcontainerError::Other(format!( @@ -142,16 +132,15 @@ impl ContainerBuilderImpl { // therefore we will have to move all the variable by value. Since self // is a shared reference, we have to clone these variables here. let container_args = ContainerArgs { - container_type: self.container_type, + container_type: self.container_type.clone(), syscall: self.syscall, spec: Rc::clone(&self.spec), rootfs: self.rootfs.to_owned(), console_socket: self.console_socket, notify_listener, preserve_fds: self.preserve_fds, - container: self.container.to_owned(), user_ns_config: self.user_ns_config.to_owned(), - cgroup_config, + cgroup_config: self.cgroup_config.clone(), detached: self.detached, executor: self.executor.clone(), }; @@ -172,7 +161,7 @@ impl ContainerBuilderImpl { })?; } - if let Some(container) = &mut self.container { + if let ContainerType::InitContainer { container } = &mut self.container_type { // update status and pid of the container process container .set_status(ContainerStatus::Created) @@ -186,23 +175,15 @@ impl ContainerBuilderImpl { } fn cleanup_container(&self) -> Result<(), LibcontainerError> { - let linux = self.spec.linux().as_ref().ok_or(MissingSpecError::Linux)?; - let cgroups_path = utils::get_cgroup_path(linux.cgroups_path(), &self.container_id); - let cmanager = - libcgroups::common::create_cgroup_manager(libcgroups::common::CgroupConfig { - cgroup_path: cgroups_path, - systemd_cgroup: self.use_systemd || self.user_ns_config.is_some(), - container_name: self.container_id.to_string(), - })?; - let mut errors = Vec::new(); + let cmanager = libcgroups::common::create_cgroup_manager(self.cgroup_config.clone())?; if let Err(e) = cmanager.remove() { tracing::error!(error = ?e, "failed to remove cgroup manager"); errors.push(e.to_string()); } - if let Some(container) = &self.container { + if let ContainerType::InitContainer { container } = &self.container_type { if let Some(true) = container.clean_up_intel_rdt_subdirectory() { if let Err(e) = delete_resctrl_subdirectory(container.id()) { tracing::error!(id = ?container.id(), error = ?e, "failed to delete resctrl subdirectory"); diff --git a/crates/libcontainer/src/container/container.rs b/crates/libcontainer/src/container/container.rs index 7a05c3495..2d55f69ed 100644 --- a/crates/libcontainer/src/container/container.rs +++ b/crates/libcontainer/src/container/container.rs @@ -121,15 +121,6 @@ impl Container { self } - pub fn systemd(&self) -> bool { - self.state.use_systemd - } - - pub fn set_systemd(&mut self, should_use: bool) -> &mut Self { - self.state.use_systemd = should_use; - self - } - pub fn set_clean_up_intel_rdt_directory(&mut self, clean_up: bool) -> &mut Self { self.state.clean_up_intel_rdt_subdirectory = Some(clean_up); self diff --git a/crates/libcontainer/src/container/container_delete.rs b/crates/libcontainer/src/container/container_delete.rs index d246658c3..b4dd842e3 100644 --- a/crates/libcontainer/src/container/container_delete.rs +++ b/crates/libcontainer/src/container/container_delete.rs @@ -5,7 +5,6 @@ use libcgroups::{self}; use nix::sys::signal; use super::{Container, ContainerStatus}; -use crate::config::YoukiConfig; use crate::error::LibcontainerError; use crate::hooks; use crate::process::intel_rdt::delete_resctrl_subdirectory; @@ -78,27 +77,21 @@ impl Container { } if self.root.exists() { - match YoukiConfig::load(&self.root) { + match self.spec() { Ok(config) => { tracing::debug!("config: {:?}", config); // remove the cgroup created for the container // check https://man7.org/linux/man-pages/man7/cgroups.7.html // creating and removing cgroups section for more information on cgroups - let cmanager = libcgroups::common::create_cgroup_manager( - libcgroups::common::CgroupConfig { - cgroup_path: config.cgroup_path.to_owned(), - systemd_cgroup: self.systemd(), - container_name: self.id().to_string(), - }, - )?; + let cmanager = libcgroups::common::create_cgroup_manager(config.cgroup_config.clone())?; cmanager.remove().map_err(|err| { - tracing::error!(cgroup_path = ?config.cgroup_path, "failed to remove cgroup due to: {err:?}"); + tracing::error!(cgroup_config = ?config.cgroup_config, "failed to remove cgroup due to: {err:?}"); err })?; if let Some(hooks) = config.hooks.as_ref() { - hooks::run_hooks(hooks.poststop().as_ref(), Some(self), None).map_err( + hooks::run_hooks(hooks.poststop().as_ref(), self, None).map_err( |err| { tracing::error!(err = ?err, "failed to run post stop hooks"); err diff --git a/crates/libcontainer/src/container/container_events.rs b/crates/libcontainer/src/container/container_events.rs index 83df282f4..f2336a0dd 100644 --- a/crates/libcontainer/src/container/container_events.rs +++ b/crates/libcontainer/src/container/container_events.rs @@ -35,11 +35,7 @@ impl Container { } let cgroup_manager = - libcgroups::common::create_cgroup_manager(libcgroups::common::CgroupConfig { - cgroup_path: self.spec()?.cgroup_path, - systemd_cgroup: self.systemd(), - container_name: self.id().to_string(), - })?; + libcgroups::common::create_cgroup_manager(self.spec()?.cgroup_config)?; match stats { true => { let stats = cgroup_manager.stats()?; diff --git a/crates/libcontainer/src/container/container_kill.rs b/crates/libcontainer/src/container/container_kill.rs index fc5da3db9..86112a33a 100644 --- a/crates/libcontainer/src/container/container_kill.rs +++ b/crates/libcontainer/src/container/container_kill.rs @@ -83,11 +83,7 @@ impl Container { libcgroups::common::CgroupSetup::Legacy | libcgroups::common::CgroupSetup::Hybrid => { let cmanager = libcgroups::common::create_cgroup_manager( - libcgroups::common::CgroupConfig { - cgroup_path: self.spec()?.cgroup_path, - systemd_cgroup: self.systemd(), - container_name: self.id().to_string(), - }, + self.spec()?.cgroup_config )?; cmanager.freeze(libcgroups::common::FreezerState::Thawed)?; } @@ -100,11 +96,7 @@ impl Container { fn kill_all_processes>(&self, signal: S) -> Result<(), LibcontainerError> { let signal = signal.into().into_raw(); let cmanager = - libcgroups::common::create_cgroup_manager(libcgroups::common::CgroupConfig { - cgroup_path: self.spec()?.cgroup_path, - systemd_cgroup: self.systemd(), - container_name: self.id().to_string(), - })?; + libcgroups::common::create_cgroup_manager(self.spec()?.cgroup_config)?; if let Err(e) = cmanager.freeze(libcgroups::common::FreezerState::Frozen) { tracing::warn!( diff --git a/crates/libcontainer/src/container/container_pause.rs b/crates/libcontainer/src/container/container_pause.rs index 7dcfa7793..b0ca2ceec 100644 --- a/crates/libcontainer/src/container/container_pause.rs +++ b/crates/libcontainer/src/container/container_pause.rs @@ -33,11 +33,7 @@ impl Container { } let cmanager = - libcgroups::common::create_cgroup_manager(libcgroups::common::CgroupConfig { - cgroup_path: self.spec()?.cgroup_path, - systemd_cgroup: self.systemd(), - container_name: self.id().to_string(), - })?; + libcgroups::common::create_cgroup_manager(self.spec()?.cgroup_config)?; cmanager.freeze(FreezerState::Frozen)?; tracing::debug!("saving paused status"); diff --git a/crates/libcontainer/src/container/container_resume.rs b/crates/libcontainer/src/container/container_resume.rs index 48fd65d84..927461201 100644 --- a/crates/libcontainer/src/container/container_resume.rs +++ b/crates/libcontainer/src/container/container_resume.rs @@ -34,11 +34,7 @@ impl Container { } let cmanager = - libcgroups::common::create_cgroup_manager(libcgroups::common::CgroupConfig { - cgroup_path: self.spec()?.cgroup_path, - systemd_cgroup: self.systemd(), - container_name: self.id().to_string(), - })?; + libcgroups::common::create_cgroup_manager(self.spec()?.cgroup_config)?; // resume the frozen container cmanager.freeze(FreezerState::Thawed)?; diff --git a/crates/libcontainer/src/container/container_start.rs b/crates/libcontainer/src/container/container_start.rs index 8ba182eed..fb800ac29 100644 --- a/crates/libcontainer/src/container/container_start.rs +++ b/crates/libcontainer/src/container/container_start.rs @@ -1,7 +1,6 @@ use nix::sys::signal; use super::{Container, ContainerStatus}; -use crate::config::YoukiConfig; use crate::error::LibcontainerError; use crate::hooks; use crate::notify_socket::{NotifySocket, NOTIFY_FILE}; @@ -35,7 +34,7 @@ impl Container { return Err(LibcontainerError::IncorrectStatus); } - let config = YoukiConfig::load(&self.root).map_err(|err| { + let config = self.spec().map_err(|err| { tracing::error!( "failed to load runtime spec for container {}: {}", self.id(), @@ -47,7 +46,7 @@ impl Container { // While prestart is marked as deprecated in the OCI spec, the docker and integration test still // uses it. #[allow(deprecated)] - hooks::run_hooks(hooks.prestart().as_ref(), Some(self), None).map_err(|err| { + hooks::run_hooks(hooks.prestart().as_ref(), self, None).map_err(|err| { tracing::error!("failed to run pre start hooks: {}", err); // In the case where prestart hook fails, the runtime must // stop the container before generating an error and exiting. @@ -69,7 +68,7 @@ impl Container { // Run post start hooks. It runs after the container process is started. // It is called in the runtime namespace. if let Some(hooks) = config.hooks.as_ref() { - hooks::run_hooks(hooks.poststart().as_ref(), Some(self), Some(&self.root)).map_err( + hooks::run_hooks(hooks.poststart().as_ref(), self, Some(&self.root)).map_err( |err| { tracing::error!("failed to run post start hooks: {}", err); err diff --git a/crates/libcontainer/src/container/init_builder.rs b/crates/libcontainer/src/container/init_builder.rs index 2230acc60..098575712 100644 --- a/crates/libcontainer/src/container/init_builder.rs +++ b/crates/libcontainer/src/container/init_builder.rs @@ -52,7 +52,6 @@ impl InitContainerBuilder { let mut container = self.create_container_state(&container_dir)?; container - .set_systemd(self.use_systemd) .set_annotations(spec.annotations().clone()); let notify_path = container_dir.join(NOTIFY_FILE); @@ -74,24 +73,32 @@ impl InitContainerBuilder { let user_ns_config = UserNamespaceConfig::new(&spec)?; - let config = YoukiConfig::from_spec(&spec, container.id())?; + let linux = spec.linux().as_ref().ok_or(MissingSpecError::Linux)?; + let cgroups_path = utils::get_cgroup_path(linux.cgroups_path(), &self.base.container_id); + let cgroup_config = libcgroups::common::CgroupConfig { + cgroup_path: cgroups_path, + systemd_cgroup: self.use_systemd || user_ns_config.is_some(), + container_name: self.base.container_id.to_owned(), + }; + + let config = YoukiConfig::from_spec(&spec, cgroup_config.clone())?; config.save(&container_dir).map_err(|err| { tracing::error!(?container_dir, "failed to save config: {}", err); err })?; let mut builder_impl = ContainerBuilderImpl { - container_type: ContainerType::InitContainer, + container_type: ContainerType::InitContainer { + container: container.clone(), + }, syscall: self.base.syscall, - container_id: self.base.container_id, pid_file: self.base.pid_file, console_socket: csocketfd, - use_systemd: self.use_systemd, + cgroup_config, spec: Rc::new(spec), rootfs, user_ns_config, notify_path, - container: Some(container.clone()), preserve_fds: self.base.preserve_fds, detached: self.detached, executor: self.base.executor, diff --git a/crates/libcontainer/src/container/state.rs b/crates/libcontainer/src/container/state.rs index 4f0bc07aa..05701404e 100644 --- a/crates/libcontainer/src/container/state.rs +++ b/crates/libcontainer/src/container/state.rs @@ -112,8 +112,6 @@ pub struct State { // User that created the container #[serde(skip_serializing_if = "Option::is_none")] pub creator: Option, - // Specifies if systemd should be used to manage cgroups - pub use_systemd: bool, // Specifies if the Intel RDT subdirectory needs be cleaned up. pub clean_up_intel_rdt_subdirectory: Option, } @@ -136,7 +134,6 @@ impl State { annotations: Some(HashMap::default()), created: None, creator: None, - use_systemd: false, clean_up_intel_rdt_subdirectory: None, } } diff --git a/crates/libcontainer/src/container/tenant_builder.rs b/crates/libcontainer/src/container/tenant_builder.rs index 1ebddd76b..292a1ba39 100644 --- a/crates/libcontainer/src/container/tenant_builder.rs +++ b/crates/libcontainer/src/container/tenant_builder.rs @@ -119,7 +119,7 @@ impl TenantContainerBuilder { // get file descriptors of console socket let csocketfd = self.setup_tty_socket(&container_dir)?; - let use_systemd = self.should_use_systemd(&container); + let cgroup_config = container.spec()?.cgroup_config; let user_ns_config = UserNamespaceConfig::new(&spec)?; let (read_end, write_end) = @@ -130,15 +130,13 @@ impl TenantContainerBuilder { exec_notify_fd: write_end.as_raw_fd(), }, syscall: self.base.syscall, - container_id: self.base.container_id, pid_file: self.base.pid_file, console_socket: csocketfd, - use_systemd, + cgroup_config, spec: Rc::new(spec), rootfs, user_ns_config, notify_path: notify_path.clone(), - container: None, preserve_fds: self.base.preserve_fds, detached: self.detached, executor: self.base.executor, @@ -488,10 +486,6 @@ impl TenantContainerBuilder { Ok(tenant_namespaces) } - fn should_use_systemd(&self, container: &Container) -> bool { - container.systemd() - } - fn setup_notify_listener(container_dir: &Path) -> Result { let notify_name = Self::generate_name(container_dir, TENANT_NOTIFY); let socket_path = container_dir.join(notify_name); diff --git a/crates/libcontainer/src/hooks.rs b/crates/libcontainer/src/hooks.rs index 8233e2ab8..0f243e0d7 100644 --- a/crates/libcontainer/src/hooks.rs +++ b/crates/libcontainer/src/hooks.rs @@ -23,8 +23,6 @@ pub enum HookError { Killed, #[error("failed to execute hook command due to a timeout")] Timeout, - #[error("container state is required to run hook")] - MissingContainerState, #[error("failed to write container state to stdin")] WriteContainerState(#[source] std::io::Error), } @@ -33,10 +31,10 @@ type Result = std::result::Result; pub fn run_hooks( hooks: Option<&Vec>, - container: Option<&Container>, + container: &Container, cwd: Option<&Path>, ) -> Result<()> { - let state = &(container.ok_or(HookError::MissingContainerState)?.state); + let state = &container.state; if let Some(hooks) = hooks { for hook in hooks { diff --git a/crates/libcontainer/src/process/args.rs b/crates/libcontainer/src/process/args.rs index a4451dd85..9884e673e 100644 --- a/crates/libcontainer/src/process/args.rs +++ b/crates/libcontainer/src/process/args.rs @@ -10,9 +10,10 @@ use crate::notify_socket::NotifyListener; use crate::syscall::syscall::SyscallType; use crate::user_ns::UserNamespaceConfig; use crate::workload::Executor; -#[derive(Debug, Copy, Clone)] + +#[derive(Debug, Clone)] pub enum ContainerType { - InitContainer, + InitContainer { container: Container }, TenantContainer { exec_notify_fd: RawFd }, } @@ -32,8 +33,6 @@ pub struct ContainerArgs { pub notify_listener: NotifyListener, /// File descriptors preserved/passed to the container init process. pub preserve_fds: i32, - /// Container state - pub container: Option, /// Options for new namespace creation pub user_ns_config: Option, /// Cgroup Manager Config diff --git a/crates/libcontainer/src/process/container_init_process.rs b/crates/libcontainer/src/process/container_init_process.rs index 64182c1c7..b12843f09 100644 --- a/crates/libcontainer/src/process/container_init_process.rs +++ b/crates/libcontainer/src/process/container_init_process.rs @@ -285,7 +285,6 @@ pub fn container_init_process( utils::parse_env(proc.env().as_ref().unwrap_or(&vec![])); let rootfs_path = &args.rootfs; let hooks = spec.hooks().as_ref(); - let container = args.container.as_ref(); let namespaces = Namespaces::try_from(linux.namespaces().as_ref())?; let notify_listener = &args.notify_listener; @@ -312,7 +311,7 @@ pub fn container_init_process( let _ = prctl::set_no_new_privileges(true); } - if matches!(args.container_type, ContainerType::InitContainer) { + if let ContainerType::InitContainer { container } = &args.container_type { // create_container hook needs to be called after the namespace setup, but // before pivot_root is called. This runs in the container namespaces. if let Some(hooks) = hooks { @@ -619,7 +618,7 @@ pub fn container_init_process( // create_container hook needs to be called after the namespace setup, but // before pivot_root is called. This runs in the container namespaces. - if matches!(args.container_type, ContainerType::InitContainer) { + if let ContainerType::InitContainer { container } = &args.container_type { if let Some(hooks) = hooks { hooks::run_hooks(hooks.start_container().as_ref(), container, None).map_err(|err| { tracing::error!(?err, "failed to run start container hooks"); diff --git a/crates/libcontainer/src/process/container_intermediate_process.rs b/crates/libcontainer/src/process/container_intermediate_process.rs index 1bee6f3ee..219f62347 100644 --- a/crates/libcontainer/src/process/container_intermediate_process.rs +++ b/crates/libcontainer/src/process/container_intermediate_process.rs @@ -65,7 +65,7 @@ pub fn container_intermediate_process( apply_cgroups( &cgroup_manager, linux.resources().as_ref(), - matches!(args.container_type, ContainerType::InitContainer), + matches!(args.container_type, ContainerType::InitContainer { .. }), )?; // if new user is specified in specification, this will be true and new diff --git a/crates/libcontainer/src/process/container_main_process.rs b/crates/libcontainer/src/process/container_main_process.rs index fd5dfcb1c..eda7a11fc 100644 --- a/crates/libcontainer/src/process/container_main_process.rs +++ b/crates/libcontainer/src/process/container_main_process.rs @@ -1,7 +1,7 @@ use nix::sys::wait::{waitpid, WaitStatus}; use nix::unistd::Pid; -use crate::process::args::ContainerArgs; +use crate::process::args::{ContainerArgs, ContainerType}; use crate::process::fork::{self, CloneCb}; use crate::process::intel_rdt::setup_intel_rdt; use crate::process::{channel, container_intermediate_process}; @@ -126,18 +126,17 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bo if let Some(linux) = container_args.spec.linux() { #[cfg(feature = "libseccomp")] if let Some(seccomp) = linux.seccomp() { + let container_state = match &container_args.container_type { + ContainerType::InitContainer { container } => &container.state, + ContainerType::TenantContainer { .. } => return Err(ProcessError::ContainerStateRequired), + }; let state = crate::container::ContainerProcessState { oci_version: container_args.spec.version().to_string(), // runc hardcode the `seccompFd` name for fds. fds: vec![String::from("seccompFd")], pid: init_pid.as_raw(), metadata: seccomp.listener_metadata().to_owned().unwrap_or_default(), - state: container_args - .container - .as_ref() - .ok_or(ProcessError::ContainerStateRequired)? - .state - .clone(), + state: container_state.clone(), }; crate::process::seccomp_listener::sync_seccomp( seccomp, @@ -148,10 +147,10 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bo } if let Some(intel_rdt) = linux.intel_rdt() { - let container_id = container_args - .container - .as_ref() - .map(|container| container.id()); + let container_id = match &container_args.container_type { + ContainerType::InitContainer { container } => Some(container.id()), + ContainerType::TenantContainer { .. } => None, + }; need_to_clean_up_intel_rdt_subdirectory = setup_intel_rdt(container_id, &init_pid, intel_rdt)?; } diff --git a/crates/youki/src/commands/mod.rs b/crates/youki/src/commands/mod.rs index 08044b7fb..85d9916f0 100644 --- a/crates/youki/src/commands/mod.rs +++ b/crates/youki/src/commands/mod.rs @@ -57,11 +57,5 @@ fn create_cgroup_manager>( container_id: &str, ) -> Result { let container = load_container(root_path, container_id)?; - Ok(libcgroups::common::create_cgroup_manager( - libcgroups::common::CgroupConfig { - cgroup_path: container.spec()?.cgroup_path, - systemd_cgroup: container.systemd(), - container_name: container.id().to_string(), - }, - )?) + Ok(libcgroups::common::create_cgroup_manager(container.spec()?.cgroup_config)?) } From 3637c46a1ad4fbe9f58401c05049718d0fc8db98 Mon Sep 17 00:00:00 2001 From: Aidan Hobson Sayers Date: Sat, 24 Aug 2024 17:11:04 +0100 Subject: [PATCH 02/11] Allow entirely disabling cgroup manipulation Signed-off-by: Aidan Hobson Sayers --- crates/libcontainer/src/config.rs | 4 +-- .../src/container/builder_impl.rs | 12 +++++---- .../src/container/container_delete.rs | 12 +++++---- .../src/container/container_events.rs | 7 ++++- .../src/container/container_kill.rs | 27 ++++++++++++------- .../src/container/container_pause.rs | 7 ++++- .../src/container/container_resume.rs | 7 ++++- .../src/container/init_builder.rs | 25 ++++++++++++----- crates/libcontainer/src/error.rs | 2 ++ crates/libcontainer/src/process/args.rs | 2 +- .../process/container_intermediate_process.rs | 16 ++++++----- crates/liboci-cli/src/lib.rs | 3 +++ crates/youki/src/commands/create.rs | 3 ++- crates/youki/src/commands/mod.rs | 5 +++- crates/youki/src/commands/run.rs | 3 ++- crates/youki/src/main.rs | 5 ++-- 16 files changed, 96 insertions(+), 44 deletions(-) diff --git a/crates/libcontainer/src/config.rs b/crates/libcontainer/src/config.rs index 231f703b1..8d4bb694c 100644 --- a/crates/libcontainer/src/config.rs +++ b/crates/libcontainer/src/config.rs @@ -41,11 +41,11 @@ const YOUKI_CONFIG_NAME: &str = "youki_config.json"; #[non_exhaustive] pub struct YoukiConfig { pub hooks: Option, - pub cgroup_config: libcgroups::common::CgroupConfig, + pub cgroup_config: Option, } impl<'a> YoukiConfig { - pub fn from_spec(spec: &'a Spec, cgroup_config: libcgroups::common::CgroupConfig) -> Result { + pub fn from_spec(spec: &'a Spec, cgroup_config: Option) -> Result { Ok(YoukiConfig { hooks: spec.hooks().clone(), cgroup_config, diff --git a/crates/libcontainer/src/container/builder_impl.rs b/crates/libcontainer/src/container/builder_impl.rs index 7ca881e53..b4fcc3fe7 100644 --- a/crates/libcontainer/src/container/builder_impl.rs +++ b/crates/libcontainer/src/container/builder_impl.rs @@ -25,7 +25,7 @@ pub(super) struct ContainerBuilderImpl { /// Interface to operating system primitives pub syscall: SyscallType, /// Interface to operating system primitives - pub cgroup_config: libcgroups::common::CgroupConfig, + pub cgroup_config: Option, /// OCI compliant runtime spec pub spec: Rc, /// Root filesystem of the container @@ -177,10 +177,12 @@ impl ContainerBuilderImpl { fn cleanup_container(&self) -> Result<(), LibcontainerError> { let mut errors = Vec::new(); - let cmanager = libcgroups::common::create_cgroup_manager(self.cgroup_config.clone())?; - if let Err(e) = cmanager.remove() { - tracing::error!(error = ?e, "failed to remove cgroup manager"); - errors.push(e.to_string()); + if let Some(cc) = &self.cgroup_config { + let cmanager = libcgroups::common::create_cgroup_manager(cc.to_owned())?; + if let Err(e) = cmanager.remove() { + tracing::error!(error = ?e, "failed to remove cgroup manager"); + errors.push(e.to_string()); + } } if let ContainerType::InitContainer { container } = &self.container_type { diff --git a/crates/libcontainer/src/container/container_delete.rs b/crates/libcontainer/src/container/container_delete.rs index b4dd842e3..c73c69eb6 100644 --- a/crates/libcontainer/src/container/container_delete.rs +++ b/crates/libcontainer/src/container/container_delete.rs @@ -84,11 +84,13 @@ impl Container { // remove the cgroup created for the container // check https://man7.org/linux/man-pages/man7/cgroups.7.html // creating and removing cgroups section for more information on cgroups - let cmanager = libcgroups::common::create_cgroup_manager(config.cgroup_config.clone())?; - cmanager.remove().map_err(|err| { - tracing::error!(cgroup_config = ?config.cgroup_config, "failed to remove cgroup due to: {err:?}"); - err - })?; + if let Some(cc) = config.cgroup_config { + let cmanager = libcgroups::common::create_cgroup_manager(cc.clone())?; + cmanager.remove().map_err(|err| { + tracing::error!(cgroup_config = ?cc, "failed to remove cgroup due to: {err:?}"); + err + })?; + } if let Some(hooks) = config.hooks.as_ref() { hooks::run_hooks(hooks.poststop().as_ref(), self, None).map_err( diff --git a/crates/libcontainer/src/container/container_events.rs b/crates/libcontainer/src/container/container_events.rs index f2336a0dd..6be6418d0 100644 --- a/crates/libcontainer/src/container/container_events.rs +++ b/crates/libcontainer/src/container/container_events.rs @@ -34,8 +34,13 @@ impl Container { return Err(LibcontainerError::IncorrectStatus); } + let cgroup_config = match self.spec()?.cgroup_config { + Some(cc) => cc, + None => return Err(LibcontainerError::CgroupsMissing), + }; + let cgroup_manager = - libcgroups::common::create_cgroup_manager(self.spec()?.cgroup_config)?; + libcgroups::common::create_cgroup_manager(cgroup_config)?; match stats { true => { let stats = cgroup_manager.stats()?; diff --git a/crates/libcontainer/src/container/container_kill.rs b/crates/libcontainer/src/container/container_kill.rs index 86112a33a..14d82b10b 100644 --- a/crates/libcontainer/src/container/container_kill.rs +++ b/crates/libcontainer/src/container/container_kill.rs @@ -79,24 +79,33 @@ impl Container { // For cgroup V1, a frozon process cannot respond to signals, // so we need to thaw it. Only thaw the cgroup for SIGKILL. if self.status() == ContainerStatus::Paused && signal == signal::Signal::SIGKILL { - match get_cgroup_setup()? { - libcgroups::common::CgroupSetup::Legacy - | libcgroups::common::CgroupSetup::Hybrid => { - let cmanager = libcgroups::common::create_cgroup_manager( - self.spec()?.cgroup_config - )?; - cmanager.freeze(libcgroups::common::FreezerState::Thawed)?; + if let Some(cgroup_config) = self.spec()?.cgroup_config { + match get_cgroup_setup()? { + libcgroups::common::CgroupSetup::Legacy + | libcgroups::common::CgroupSetup::Hybrid => { + let cmanager = libcgroups::common::create_cgroup_manager( + cgroup_config + )?; + cmanager.freeze(libcgroups::common::FreezerState::Thawed)?; + } + libcgroups::common::CgroupSetup::Unified => {} } - libcgroups::common::CgroupSetup::Unified => {} + } else { + return Err(LibcontainerError::CgroupsMissing) } } Ok(()) } fn kill_all_processes>(&self, signal: S) -> Result<(), LibcontainerError> { + let cgroup_config = match self.spec()?.cgroup_config { + Some(cc) => cc, + None => return Err(LibcontainerError::CgroupsMissing), + }; + let signal = signal.into().into_raw(); let cmanager = - libcgroups::common::create_cgroup_manager(self.spec()?.cgroup_config)?; + libcgroups::common::create_cgroup_manager(cgroup_config)?; if let Err(e) = cmanager.freeze(libcgroups::common::FreezerState::Frozen) { tracing::warn!( diff --git a/crates/libcontainer/src/container/container_pause.rs b/crates/libcontainer/src/container/container_pause.rs index b0ca2ceec..e0af5746f 100644 --- a/crates/libcontainer/src/container/container_pause.rs +++ b/crates/libcontainer/src/container/container_pause.rs @@ -32,8 +32,13 @@ impl Container { return Err(LibcontainerError::IncorrectStatus); } + let cgroup_config = match self.spec()?.cgroup_config { + Some(cc) => cc, + None => return Err(LibcontainerError::CgroupsMissing), + }; + let cmanager = - libcgroups::common::create_cgroup_manager(self.spec()?.cgroup_config)?; + libcgroups::common::create_cgroup_manager(cgroup_config)?; cmanager.freeze(FreezerState::Frozen)?; tracing::debug!("saving paused status"); diff --git a/crates/libcontainer/src/container/container_resume.rs b/crates/libcontainer/src/container/container_resume.rs index 927461201..67b845e50 100644 --- a/crates/libcontainer/src/container/container_resume.rs +++ b/crates/libcontainer/src/container/container_resume.rs @@ -33,8 +33,13 @@ impl Container { return Err(LibcontainerError::IncorrectStatus); } + let cgroup_config = match self.spec()?.cgroup_config { + Some(cc) => cc, + None => return Err(LibcontainerError::CgroupsMissing), + }; + let cmanager = - libcgroups::common::create_cgroup_manager(self.spec()?.cgroup_config)?; + libcgroups::common::create_cgroup_manager(cgroup_config)?; // resume the frozen container cmanager.freeze(FreezerState::Thawed)?; diff --git a/crates/libcontainer/src/container/init_builder.rs b/crates/libcontainer/src/container/init_builder.rs index 098575712..457bad451 100644 --- a/crates/libcontainer/src/container/init_builder.rs +++ b/crates/libcontainer/src/container/init_builder.rs @@ -18,6 +18,7 @@ use crate::{apparmor, tty, user_ns, utils}; pub struct InitContainerBuilder { base: ContainerBuilder, bundle: PathBuf, + use_cgroups: bool, use_systemd: bool, detached: bool, } @@ -29,11 +30,18 @@ impl InitContainerBuilder { Self { base: builder, bundle, + use_cgroups: true, use_systemd: true, detached: true, } } + /// Sets if cgroups should be used at all (overrides systemd if false) + pub fn with_cgroups(mut self, should_use: bool) -> Self { + self.use_cgroups = should_use; + self + } + /// Sets if systemd should be used for managing cgroups pub fn with_systemd(mut self, should_use: bool) -> Self { self.use_systemd = should_use; @@ -73,13 +81,16 @@ impl InitContainerBuilder { let user_ns_config = UserNamespaceConfig::new(&spec)?; - let linux = spec.linux().as_ref().ok_or(MissingSpecError::Linux)?; - let cgroups_path = utils::get_cgroup_path(linux.cgroups_path(), &self.base.container_id); - let cgroup_config = libcgroups::common::CgroupConfig { - cgroup_path: cgroups_path, - systemd_cgroup: self.use_systemd || user_ns_config.is_some(), - container_name: self.base.container_id.to_owned(), - }; + let mut cgroup_config = None; + if self.use_cgroups { + let linux = spec.linux().as_ref().ok_or(MissingSpecError::Linux)?; + let cgroups_path = utils::get_cgroup_path(linux.cgroups_path(), &self.base.container_id); + cgroup_config = Some(libcgroups::common::CgroupConfig { + cgroup_path: cgroups_path, + systemd_cgroup: self.use_systemd || user_ns_config.is_some(), + container_name: self.base.container_id.to_owned(), + }); + } let config = YoukiConfig::from_spec(&spec, cgroup_config.clone())?; config.save(&container_dir).map_err(|err| { diff --git a/crates/libcontainer/src/error.rs b/crates/libcontainer/src/error.rs index f6efed7db..e5ca35677 100644 --- a/crates/libcontainer/src/error.rs +++ b/crates/libcontainer/src/error.rs @@ -14,6 +14,8 @@ pub enum MissingSpecError { pub enum LibcontainerError { #[error("failed to perform operation due to incorrect container status")] IncorrectStatus, + #[error("requested operation requires cgroups to be enabled on the container")] + CgroupsMissing, #[error("container already exists")] Exist, #[error("container state directory does not exist")] diff --git a/crates/libcontainer/src/process/args.rs b/crates/libcontainer/src/process/args.rs index 9884e673e..41ee201b3 100644 --- a/crates/libcontainer/src/process/args.rs +++ b/crates/libcontainer/src/process/args.rs @@ -36,7 +36,7 @@ pub struct ContainerArgs { /// Options for new namespace creation pub user_ns_config: Option, /// Cgroup Manager Config - pub cgroup_config: CgroupConfig, + pub cgroup_config: Option, /// If the container is to be run in detached mode pub detached: bool, /// Manage the functions that actually run on the container diff --git a/crates/libcontainer/src/process/container_intermediate_process.rs b/crates/libcontainer/src/process/container_intermediate_process.rs index 219f62347..ae245e1a6 100644 --- a/crates/libcontainer/src/process/container_intermediate_process.rs +++ b/crates/libcontainer/src/process/container_intermediate_process.rs @@ -49,8 +49,6 @@ pub fn container_intermediate_process( let spec = &args.spec; let linux = spec.linux().as_ref().ok_or(MissingSpecError::Linux)?; let namespaces = Namespaces::try_from(linux.namespaces().as_ref())?; - let cgroup_manager = libcgroups::common::create_cgroup_manager(args.cgroup_config.to_owned()) - .map_err(|e| IntermediateProcessError::Cgroup(e.to_string()))?; // this needs to be done before we create the init process, so that the init // process will already be captured by the cgroup. It also needs to be done @@ -62,11 +60,15 @@ pub fn container_intermediate_process( // In addition this needs to be done before we enter the cgroup namespace as // the cgroup of the process will form the root of the cgroup hierarchy in // the cgroup namespace. - apply_cgroups( - &cgroup_manager, - linux.resources().as_ref(), - matches!(args.container_type, ContainerType::InitContainer { .. }), - )?; + if let Some(cgroup_config) = &args.cgroup_config { + let cgroup_manager = libcgroups::common::create_cgroup_manager(cgroup_config.to_owned()) + .map_err(|e| IntermediateProcessError::Cgroup(e.to_string()))?; + apply_cgroups( + &cgroup_manager, + linux.resources().as_ref(), + matches!(args.container_type, ContainerType::InitContainer { .. }), + )?; + } // if new user is specified in specification, this will be true and new // namespace will be created, check diff --git a/crates/liboci-cli/src/lib.rs b/crates/liboci-cli/src/lib.rs index 5e5b2eb9a..b4c30e19c 100644 --- a/crates/liboci-cli/src/lib.rs +++ b/crates/liboci-cli/src/lib.rs @@ -94,4 +94,7 @@ pub struct GlobalOpts { /// Enable systemd cgroup manager, rather then use the cgroupfs directly. #[clap(short, long)] pub systemd_cgroup: bool, + /// Entirely disable cgroup manipulation (breaks some operations, e.g. pause) + #[clap(short, long)] + pub disable_cgroups: bool, } diff --git a/crates/youki/src/commands/create.rs b/crates/youki/src/commands/create.rs index dca591fb2..907233c72 100644 --- a/crates/youki/src/commands/create.rs +++ b/crates/youki/src/commands/create.rs @@ -13,7 +13,7 @@ use crate::workload::executor::default_executor; // can be given impression that is is running on a complete system, but on the system which // it is running, it is just another process, and has attributes such as pid, file descriptors, etc. // associated with it like any other process. -pub fn create(args: Create, root_path: PathBuf, systemd_cgroup: bool) -> Result<()> { +pub fn create(args: Create, root_path: PathBuf, systemd_cgroup: bool, use_cgroups: bool) -> Result<()> { ContainerBuilder::new(args.container_id.clone(), SyscallType::default()) .with_executor(default_executor()) .with_pid_file(args.pid_file.as_ref())? @@ -23,6 +23,7 @@ pub fn create(args: Create, root_path: PathBuf, systemd_cgroup: bool) -> Result< .validate_id()? .as_init(&args.bundle) .with_systemd(systemd_cgroup) + .with_cgroups(use_cgroups) .with_detach(true) .build()?; diff --git a/crates/youki/src/commands/mod.rs b/crates/youki/src/commands/mod.rs index 85d9916f0..10f476aa4 100644 --- a/crates/youki/src/commands/mod.rs +++ b/crates/youki/src/commands/mod.rs @@ -57,5 +57,8 @@ fn create_cgroup_manager>( container_id: &str, ) -> Result { let container = load_container(root_path, container_id)?; - Ok(libcgroups::common::create_cgroup_manager(container.spec()?.cgroup_config)?) + match container.spec()?.cgroup_config { + Some(cc) => Ok(libcgroups::common::create_cgroup_manager(cc)?), + None => bail!("cannot use cgroups on container started without them"), + } } diff --git a/crates/youki/src/commands/run.rs b/crates/youki/src/commands/run.rs index f297903f9..b4b8ac35b 100644 --- a/crates/youki/src/commands/run.rs +++ b/crates/youki/src/commands/run.rs @@ -11,7 +11,7 @@ use nix::unistd::Pid; use crate::workload::executor::default_executor; -pub fn run(args: Run, root_path: PathBuf, systemd_cgroup: bool) -> Result { +pub fn run(args: Run, root_path: PathBuf, systemd_cgroup: bool, use_cgroups: bool) -> Result { let mut container = ContainerBuilder::new(args.container_id.clone(), SyscallType::default()) .with_executor(default_executor()) .with_pid_file(args.pid_file.as_ref())? @@ -21,6 +21,7 @@ pub fn run(args: Run, root_path: PathBuf, systemd_cgroup: bool) -> Result { .validate_id()? .as_init(&args.bundle) .with_systemd(systemd_cgroup) + .with_cgroups(use_cgroups) .with_detach(args.detach) .build()?; diff --git a/crates/youki/src/main.rs b/crates/youki/src/main.rs index 27a515b2c..e3c64dc79 100644 --- a/crates/youki/src/main.rs +++ b/crates/youki/src/main.rs @@ -101,11 +101,12 @@ fn main() -> Result<()> { ); let root_path = rootpath::determine(opts.global.root)?; let systemd_cgroup = opts.global.systemd_cgroup; + let use_cgroups = !opts.global.disable_cgroups; let cmd_result = match opts.subcmd { SubCommand::Standard(cmd) => match *cmd { StandardCmd::Create(create) => { - commands::create::create(create, root_path, systemd_cgroup) + commands::create::create(create, root_path, systemd_cgroup, use_cgroups) } StandardCmd::Start(start) => commands::start::start(start, root_path), StandardCmd::Kill(kill) => commands::kill::kill(kill, root_path), @@ -130,7 +131,7 @@ fn main() -> Result<()> { CommonCmd::Pause(pause) => commands::pause::pause(pause, root_path), CommonCmd::Ps(ps) => commands::ps::ps(ps, root_path), CommonCmd::Resume(resume) => commands::resume::resume(resume, root_path), - CommonCmd::Run(run) => match commands::run::run(run, root_path, systemd_cgroup) { + CommonCmd::Run(run) => match commands::run::run(run, root_path, systemd_cgroup, use_cgroups) { Ok(exit_code) => std::process::exit(exit_code), Err(e) => { tracing::error!("error in executing command: {:?}", e); From b75f81d973a12f94f3c8c4f160678f339f3bac0b Mon Sep 17 00:00:00 2001 From: Aidan Hobson Sayers Date: Sat, 24 Aug 2024 17:27:27 +0100 Subject: [PATCH 03/11] Tidy logic of container cleanup Signed-off-by: Aidan Hobson Sayers --- .../src/container/builder_impl.rs | 65 ++++++++++--------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/crates/libcontainer/src/container/builder_impl.rs b/crates/libcontainer/src/container/builder_impl.rs index b4fcc3fe7..86f2da9b9 100644 --- a/crates/libcontainer/src/container/builder_impl.rs +++ b/crates/libcontainer/src/container/builder_impl.rs @@ -8,7 +8,7 @@ use libcgroups::common::CgroupManager; use nix::unistd::Pid; use oci_spec::runtime::Spec; -use super::ContainerStatus; +use super::{Container, ContainerStatus}; use crate::error::{LibcontainerError, MissingSpecError}; use crate::notify_socket::NotifyListener; use crate::process::args::{ContainerArgs, ContainerType}; @@ -54,8 +54,8 @@ impl ContainerBuilderImpl { Err(outer) => { // Only the init container should be cleaned up in the case of // an error. - if matches!(self.container_type, ContainerType::InitContainer { .. }) { - self.cleanup_container()?; + if let ContainerType::InitContainer { container } = &self.container_type { + cleanup_container(self.cgroup_config.clone(), container)?; } Err(outer) @@ -173,41 +173,42 @@ impl ContainerBuilderImpl { Ok(init_pid) } +} - fn cleanup_container(&self) -> Result<(), LibcontainerError> { - let mut errors = Vec::new(); - - if let Some(cc) = &self.cgroup_config { - let cmanager = libcgroups::common::create_cgroup_manager(cc.to_owned())?; - if let Err(e) = cmanager.remove() { - tracing::error!(error = ?e, "failed to remove cgroup manager"); - errors.push(e.to_string()); - } +fn cleanup_container( + cgroup_config: Option, + container: &Container, +) -> Result<(), LibcontainerError> { + let mut errors = Vec::new(); + + if let Some(cc) = cgroup_config { + let cmanager = libcgroups::common::create_cgroup_manager(cc)?; + if let Err(e) = cmanager.remove() { + tracing::error!(error = ?e, "failed to remove cgroup manager"); + errors.push(e.to_string()); } + } - if let ContainerType::InitContainer { container } = &self.container_type { - if let Some(true) = container.clean_up_intel_rdt_subdirectory() { - if let Err(e) = delete_resctrl_subdirectory(container.id()) { - tracing::error!(id = ?container.id(), error = ?e, "failed to delete resctrl subdirectory"); - errors.push(e.to_string()); - } - } - - if container.root.exists() { - if let Err(e) = fs::remove_dir_all(&container.root) { - tracing::error!(container_root = ?container.root, error = ?e, "failed to delete container root"); - errors.push(e.to_string()); - } - } + if let Some(true) = container.clean_up_intel_rdt_subdirectory() { + if let Err(e) = delete_resctrl_subdirectory(container.id()) { + tracing::error!(id = ?container.id(), error = ?e, "failed to delete resctrl subdirectory"); + errors.push(e.to_string()); } + } - if !errors.is_empty() { - return Err(LibcontainerError::Other(format!( - "failed to cleanup container: {}", - errors.join(";") - ))); + if container.root.exists() { + if let Err(e) = fs::remove_dir_all(&container.root) { + tracing::error!(container_root = ?container.root, error = ?e, "failed to delete container root"); + errors.push(e.to_string()); } + } - Ok(()) + if !errors.is_empty() { + return Err(LibcontainerError::Other(format!( + "failed to cleanup container: {}", + errors.join(";") + ))); } + + Ok(()) } From 34b77ecd8447027b95808d1780317a4211a27161 Mon Sep 17 00:00:00 2001 From: Aidan Hobson Sayers Date: Fri, 30 Aug 2024 19:35:49 +0100 Subject: [PATCH 04/11] Fix linting issues Signed-off-by: Aidan Hobson Sayers --- crates/libcontainer/src/config.rs | 5 ++++- .../libcontainer/src/container/builder_impl.rs | 8 ++------ .../src/container/container_delete.rs | 10 ++++------ .../src/container/container_events.rs | 3 +-- .../libcontainer/src/container/container_kill.rs | 9 +++------ .../src/container/container_pause.rs | 3 +-- .../src/container/container_resume.rs | 3 +-- .../libcontainer/src/container/init_builder.rs | 6 +++--- .../src/process/container_main_process.rs | 10 ++++++---- crates/youki/src/commands/create.rs | 7 ++++++- crates/youki/src/main.rs | 16 +++++++++------- 11 files changed, 40 insertions(+), 40 deletions(-) diff --git a/crates/libcontainer/src/config.rs b/crates/libcontainer/src/config.rs index 8d4bb694c..e025b514d 100644 --- a/crates/libcontainer/src/config.rs +++ b/crates/libcontainer/src/config.rs @@ -45,7 +45,10 @@ pub struct YoukiConfig { } impl<'a> YoukiConfig { - pub fn from_spec(spec: &'a Spec, cgroup_config: Option) -> Result { + pub fn from_spec( + spec: &'a Spec, + cgroup_config: Option, + ) -> Result { Ok(YoukiConfig { hooks: spec.hooks().clone(), cgroup_config, diff --git a/crates/libcontainer/src/container/builder_impl.rs b/crates/libcontainer/src/container/builder_impl.rs index 86f2da9b9..8df01524c 100644 --- a/crates/libcontainer/src/container/builder_impl.rs +++ b/crates/libcontainer/src/container/builder_impl.rs @@ -10,6 +10,7 @@ use oci_spec::runtime::Spec; use super::{Container, ContainerStatus}; use crate::error::{LibcontainerError, MissingSpecError}; +use crate::hooks; use crate::notify_socket::NotifyListener; use crate::process::args::{ContainerArgs, ContainerType}; use crate::process::intel_rdt::delete_resctrl_subdirectory; @@ -17,7 +18,6 @@ use crate::process::{self}; use crate::syscall::syscall::SyscallType; use crate::user_ns::UserNamespaceConfig; use crate::workload::Executor; -use crate::hooks; pub(super) struct ContainerBuilderImpl { /// Flag indicating if an init or a tenant container should be created @@ -72,11 +72,7 @@ impl ContainerBuilderImpl { if let ContainerType::InitContainer { container } = &self.container_type { if let Some(hooks) = self.spec.hooks() { - hooks::run_hooks( - hooks.create_runtime().as_ref(), - container, - None, - )? + hooks::run_hooks(hooks.create_runtime().as_ref(), container, None)? } } diff --git a/crates/libcontainer/src/container/container_delete.rs b/crates/libcontainer/src/container/container_delete.rs index c73c69eb6..ece0ed189 100644 --- a/crates/libcontainer/src/container/container_delete.rs +++ b/crates/libcontainer/src/container/container_delete.rs @@ -93,12 +93,10 @@ impl Container { } if let Some(hooks) = config.hooks.as_ref() { - hooks::run_hooks(hooks.poststop().as_ref(), self, None).map_err( - |err| { - tracing::error!(err = ?err, "failed to run post stop hooks"); - err - }, - )?; + hooks::run_hooks(hooks.poststop().as_ref(), self, None).map_err(|err| { + tracing::error!(err = ?err, "failed to run post stop hooks"); + err + })?; } } Err(err) => { diff --git a/crates/libcontainer/src/container/container_events.rs b/crates/libcontainer/src/container/container_events.rs index 6be6418d0..9a4f39f62 100644 --- a/crates/libcontainer/src/container/container_events.rs +++ b/crates/libcontainer/src/container/container_events.rs @@ -39,8 +39,7 @@ impl Container { None => return Err(LibcontainerError::CgroupsMissing), }; - let cgroup_manager = - libcgroups::common::create_cgroup_manager(cgroup_config)?; + let cgroup_manager = libcgroups::common::create_cgroup_manager(cgroup_config)?; match stats { true => { let stats = cgroup_manager.stats()?; diff --git a/crates/libcontainer/src/container/container_kill.rs b/crates/libcontainer/src/container/container_kill.rs index 14d82b10b..d4ebe95fe 100644 --- a/crates/libcontainer/src/container/container_kill.rs +++ b/crates/libcontainer/src/container/container_kill.rs @@ -83,15 +83,13 @@ impl Container { match get_cgroup_setup()? { libcgroups::common::CgroupSetup::Legacy | libcgroups::common::CgroupSetup::Hybrid => { - let cmanager = libcgroups::common::create_cgroup_manager( - cgroup_config - )?; + let cmanager = libcgroups::common::create_cgroup_manager(cgroup_config)?; cmanager.freeze(libcgroups::common::FreezerState::Thawed)?; } libcgroups::common::CgroupSetup::Unified => {} } } else { - return Err(LibcontainerError::CgroupsMissing) + return Err(LibcontainerError::CgroupsMissing); } } Ok(()) @@ -104,8 +102,7 @@ impl Container { }; let signal = signal.into().into_raw(); - let cmanager = - libcgroups::common::create_cgroup_manager(cgroup_config)?; + let cmanager = libcgroups::common::create_cgroup_manager(cgroup_config)?; if let Err(e) = cmanager.freeze(libcgroups::common::FreezerState::Frozen) { tracing::warn!( diff --git a/crates/libcontainer/src/container/container_pause.rs b/crates/libcontainer/src/container/container_pause.rs index e0af5746f..aed466012 100644 --- a/crates/libcontainer/src/container/container_pause.rs +++ b/crates/libcontainer/src/container/container_pause.rs @@ -37,8 +37,7 @@ impl Container { None => return Err(LibcontainerError::CgroupsMissing), }; - let cmanager = - libcgroups::common::create_cgroup_manager(cgroup_config)?; + let cmanager = libcgroups::common::create_cgroup_manager(cgroup_config)?; cmanager.freeze(FreezerState::Frozen)?; tracing::debug!("saving paused status"); diff --git a/crates/libcontainer/src/container/container_resume.rs b/crates/libcontainer/src/container/container_resume.rs index 67b845e50..dc6cc4ad3 100644 --- a/crates/libcontainer/src/container/container_resume.rs +++ b/crates/libcontainer/src/container/container_resume.rs @@ -38,8 +38,7 @@ impl Container { None => return Err(LibcontainerError::CgroupsMissing), }; - let cmanager = - libcgroups::common::create_cgroup_manager(cgroup_config)?; + let cmanager = libcgroups::common::create_cgroup_manager(cgroup_config)?; // resume the frozen container cmanager.freeze(FreezerState::Thawed)?; diff --git a/crates/libcontainer/src/container/init_builder.rs b/crates/libcontainer/src/container/init_builder.rs index 457bad451..2ec5bccaf 100644 --- a/crates/libcontainer/src/container/init_builder.rs +++ b/crates/libcontainer/src/container/init_builder.rs @@ -59,8 +59,7 @@ impl InitContainerBuilder { let container_dir = self.create_container_dir()?; let mut container = self.create_container_state(&container_dir)?; - container - .set_annotations(spec.annotations().clone()); + container.set_annotations(spec.annotations().clone()); let notify_path = container_dir.join(NOTIFY_FILE); // convert path of root file system of the container to absolute path @@ -84,7 +83,8 @@ impl InitContainerBuilder { let mut cgroup_config = None; if self.use_cgroups { let linux = spec.linux().as_ref().ok_or(MissingSpecError::Linux)?; - let cgroups_path = utils::get_cgroup_path(linux.cgroups_path(), &self.base.container_id); + let cgroups_path = + utils::get_cgroup_path(linux.cgroups_path(), &self.base.container_id); cgroup_config = Some(libcgroups::common::CgroupConfig { cgroup_path: cgroups_path, systemd_cgroup: self.use_systemd || user_ns_config.is_some(), diff --git a/crates/libcontainer/src/process/container_main_process.rs b/crates/libcontainer/src/process/container_main_process.rs index eda7a11fc..6302aa1c5 100644 --- a/crates/libcontainer/src/process/container_main_process.rs +++ b/crates/libcontainer/src/process/container_main_process.rs @@ -127,8 +127,10 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bo #[cfg(feature = "libseccomp")] if let Some(seccomp) = linux.seccomp() { let container_state = match &container_args.container_type { - ContainerType::InitContainer { container } => &container.state, - ContainerType::TenantContainer { .. } => return Err(ProcessError::ContainerStateRequired), + ContainerType::InitContainer { container } => &container.state, + ContainerType::TenantContainer { .. } => { + return Err(ProcessError::ContainerStateRequired) + } }; let state = crate::container::ContainerProcessState { oci_version: container_args.spec.version().to_string(), @@ -148,8 +150,8 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bo if let Some(intel_rdt) = linux.intel_rdt() { let container_id = match &container_args.container_type { - ContainerType::InitContainer { container } => Some(container.id()), - ContainerType::TenantContainer { .. } => None, + ContainerType::InitContainer { container } => Some(container.id()), + ContainerType::TenantContainer { .. } => None, }; need_to_clean_up_intel_rdt_subdirectory = setup_intel_rdt(container_id, &init_pid, intel_rdt)?; diff --git a/crates/youki/src/commands/create.rs b/crates/youki/src/commands/create.rs index 907233c72..02928dcbe 100644 --- a/crates/youki/src/commands/create.rs +++ b/crates/youki/src/commands/create.rs @@ -13,7 +13,12 @@ use crate::workload::executor::default_executor; // can be given impression that is is running on a complete system, but on the system which // it is running, it is just another process, and has attributes such as pid, file descriptors, etc. // associated with it like any other process. -pub fn create(args: Create, root_path: PathBuf, systemd_cgroup: bool, use_cgroups: bool) -> Result<()> { +pub fn create( + args: Create, + root_path: PathBuf, + systemd_cgroup: bool, + use_cgroups: bool, +) -> Result<()> { ContainerBuilder::new(args.container_id.clone(), SyscallType::default()) .with_executor(default_executor()) .with_pid_file(args.pid_file.as_ref())? diff --git a/crates/youki/src/main.rs b/crates/youki/src/main.rs index e3c64dc79..40eac1d2b 100644 --- a/crates/youki/src/main.rs +++ b/crates/youki/src/main.rs @@ -131,14 +131,16 @@ fn main() -> Result<()> { CommonCmd::Pause(pause) => commands::pause::pause(pause, root_path), CommonCmd::Ps(ps) => commands::ps::ps(ps, root_path), CommonCmd::Resume(resume) => commands::resume::resume(resume, root_path), - CommonCmd::Run(run) => match commands::run::run(run, root_path, systemd_cgroup, use_cgroups) { - Ok(exit_code) => std::process::exit(exit_code), - Err(e) => { - tracing::error!("error in executing command: {:?}", e); - eprintln!("run failed : {e}"); - std::process::exit(-1); + CommonCmd::Run(run) => { + match commands::run::run(run, root_path, systemd_cgroup, use_cgroups) { + Ok(exit_code) => std::process::exit(exit_code), + Err(e) => { + tracing::error!("error in executing command: {:?}", e); + eprintln!("run failed : {e}"); + std::process::exit(-1); + } } - }, + } CommonCmd::Spec(spec) => commands::spec_json::spec(spec), CommonCmd::Update(update) => commands::update::update(update, root_path), }, From f6514a00ea28684af62697efd3bff27d11f41eaa Mon Sep 17 00:00:00 2001 From: Aidan Hobson Sayers Date: Fri, 30 Aug 2024 19:46:32 +0100 Subject: [PATCH 05/11] Fix test compilation failures Signed-off-by: Aidan Hobson Sayers --- crates/libcontainer/src/config.rs | 11 ++++++++--- crates/libcontainer/src/container/container.rs | 12 +----------- crates/libcontainer/src/hooks.rs | 11 +++++------ 3 files changed, 14 insertions(+), 20 deletions(-) diff --git a/crates/libcontainer/src/config.rs b/crates/libcontainer/src/config.rs index e025b514d..5b3d3d992 100644 --- a/crates/libcontainer/src/config.rs +++ b/crates/libcontainer/src/config.rs @@ -106,10 +106,10 @@ mod tests { systemd_cgroup: true, container_name: container_id.to_owned(), }; - let config = YoukiConfig::from_spec(&spec, container_id)?; + let config = YoukiConfig::from_spec(&spec, Some(cgroup_config.clone()))?; assert_eq!(&config.hooks, spec.hooks()); dbg!(&config.cgroup_config); - assert_eq!(config.cgroup_config, cgroup_config); + assert_eq!(config.cgroup_config, Some(cgroup_config)); Ok(()) } @@ -118,7 +118,12 @@ mod tests { let container_id = "sample"; let tmp = tempfile::tempdir().expect("create temp dir"); let spec = Spec::default(); - let config = YoukiConfig::from_spec(&spec, container_id)?; + let cgroup_config = libcgroups::common::CgroupConfig { + cgroup_path: PathBuf::from(format!(":youki:{container_id}")), + systemd_cgroup: true, + container_name: container_id.to_owned(), + }; + let config = YoukiConfig::from_spec(&spec, Some(cgroup_config))?; config.save(&tmp)?; let act = YoukiConfig::load(&tmp)?; assert_eq!(act, config); diff --git a/crates/libcontainer/src/container/container.rs b/crates/libcontainer/src/container/container.rs index 2d55f69ed..fbe3acf5a 100644 --- a/crates/libcontainer/src/container/container.rs +++ b/crates/libcontainer/src/container/container.rs @@ -273,16 +273,6 @@ mod tests { assert_eq!(container.state.annotations, Some(annotations)); } - #[test] - fn test_get_set_systemd() { - let mut container = Container::default(); - assert!(!container.systemd()); - container.set_systemd(true); - assert!(container.systemd()); - container.set_systemd(false); - assert!(!container.systemd()); - } - #[test] fn test_get_set_creator() { let mut container = Container::default(); @@ -322,7 +312,7 @@ mod tests { let tmp_dir = tempfile::tempdir().unwrap(); use oci_spec::runtime::Spec; let spec = Spec::default(); - let config = YoukiConfig::from_spec(&spec, "123").context("convert spec to config")?; + let config = YoukiConfig::from_spec(&spec, None).context("convert spec to config")?; config.save(tmp_dir.path()).context("save config")?; let container = Container { diff --git a/crates/libcontainer/src/hooks.rs b/crates/libcontainer/src/hooks.rs index 0f243e0d7..5423d65b3 100644 --- a/crates/libcontainer/src/hooks.rs +++ b/crates/libcontainer/src/hooks.rs @@ -174,7 +174,7 @@ mod test { fn test_run_hook() -> Result<()> { { let default_container: Container = Default::default(); - run_hooks(None, Some(&default_container), None).context("Failed simple test")?; + run_hooks(None, &default_container, None).context("Failed simple test")?; } { @@ -183,7 +183,7 @@ mod test { let hook = HookBuilder::default().path("true").build()?; let hooks = Some(vec![hook]); - run_hooks(hooks.as_ref(), Some(&default_container), None).context("Failed true")?; + run_hooks(hooks.as_ref(), &default_container, None).context("Failed true")?; } { @@ -203,8 +203,7 @@ mod test { .env(vec![String::from("key=value")]) .build()?; let hooks = Some(vec![hook]); - run_hooks(hooks.as_ref(), Some(&default_container), None) - .context("Failed printenv test")?; + run_hooks(hooks.as_ref(), &default_container, None).context("Failed printenv test")?; } { @@ -222,7 +221,7 @@ mod test { ]) .build()?; let hooks = Some(vec![hook]); - run_hooks(hooks.as_ref(), Some(&default_container), Some(tmp.path())) + run_hooks(hooks.as_ref(), &default_container, Some(tmp.path())) .context("Failed pwd test")?; } @@ -246,7 +245,7 @@ mod test { .timeout(1) .build()?; let hooks = Some(vec![hook]); - match run_hooks(hooks.as_ref(), Some(&default_container), None) { + match run_hooks(hooks.as_ref(), &default_container, None) { Ok(_) => { bail!("The test expects the hook to error out with timeout. Should not execute cleanly"); } From 08bd82378c1aa14058a6651d8174209e7b6b3489 Mon Sep 17 00:00:00 2001 From: Aidan Hobson Sayers Date: Sat, 24 Aug 2024 21:47:48 +0100 Subject: [PATCH 06/11] Fix preserve-fds flag Signed-off-by: Aidan Hobson Sayers --- .../libcontainer/src/process/container_init_process.rs | 3 +++ .../libcontainer/src/process/container_main_process.rs | 9 --------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/crates/libcontainer/src/process/container_init_process.rs b/crates/libcontainer/src/process/container_init_process.rs index 0d16e6094..f47e3821d 100644 --- a/crates/libcontainer/src/process/container_init_process.rs +++ b/crates/libcontainer/src/process/container_init_process.rs @@ -580,6 +580,9 @@ pub fn container_init_process( // will be closed after execve into the container payload. We can't close the // fds immediately since we at least still need it for the pipe used to wait on // starting the container. + // + // Note: this should happen very late, in order to avoid accidentally leaking FDs + // Please refer to https://github.com/opencontainers/runc/security/advisories/GHSA-xr7r-f8xq-vfvv for more details. syscall.close_range(preserve_fds).map_err(|err| { tracing::error!(?err, "failed to cleanup extra fds"); InitProcessError::SyscallOther(err) diff --git a/crates/libcontainer/src/process/container_main_process.rs b/crates/libcontainer/src/process/container_main_process.rs index fd5dfcb1c..7739a89da 100644 --- a/crates/libcontainer/src/process/container_main_process.rs +++ b/crates/libcontainer/src/process/container_main_process.rs @@ -75,15 +75,6 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bo }) }; - // Before starting the intermediate process, mark all non-stdio open files as O_CLOEXEC - // to ensure we don't leak any file descriptors to the intermediate process. - // Please refer to https://github.com/opencontainers/runc/security/advisories/GHSA-xr7r-f8xq-vfvv for more details. - let syscall = container_args.syscall.create_syscall(); - syscall.close_range(0).map_err(|err| { - tracing::error!(?err, "failed to cleanup extra fds"); - ProcessError::SyscallOther(err) - })?; - let intermediate_pid = fork::container_clone(cb).map_err(|err| { tracing::error!("failed to fork intermediate process: {}", err); ProcessError::IntermediateProcessFailed(err) From 4f388a36dba8f006fa6257cc5c5f1d16494852fa Mon Sep 17 00:00:00 2001 From: Aidan Hobson Sayers Date: Sat, 24 Aug 2024 21:48:15 +0100 Subject: [PATCH 07/11] Fix a stray FD leaking in containers when using preserve-fd Signed-off-by: Aidan Hobson Sayers --- crates/libcontainer/src/syscall/linux.rs | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/crates/libcontainer/src/syscall/linux.rs b/crates/libcontainer/src/syscall/linux.rs index ed68e104a..1b6d702e1 100644 --- a/crates/libcontainer/src/syscall/linux.rs +++ b/crates/libcontainer/src/syscall/linux.rs @@ -17,7 +17,7 @@ use nix::fcntl::{open, OFlag}; use nix::mount::{mount, umount2, MntFlags, MsFlags}; use nix::sched::{unshare, CloneFlags}; use nix::sys::stat::{mknod, Mode, SFlag}; -use nix::unistd::{chown, chroot, fchdir, pivot_root, sethostname, Gid, Uid}; +use nix::unistd::{chown, chroot, close, fchdir, pivot_root, sethostname, Gid, Uid}; use oci_spec::runtime::PosixRlimit; use super::{Result, Syscall, SyscallError}; @@ -232,11 +232,15 @@ impl Syscall for LinuxSyscall { /// Function to set given path as root path inside process fn pivot_rootfs(&self, path: &Path) -> Result<()> { // open the path as directory and read only - let newroot = - open(path, OFlag::O_DIRECTORY | OFlag::O_RDONLY, Mode::empty()).map_err(|errno| { - tracing::error!(?errno, ?path, "failed to open the new root for pivot root"); - errno - })?; + let newroot = open( + path, + OFlag::O_DIRECTORY | OFlag::O_RDONLY | OFlag::O_CLOEXEC, + Mode::empty(), + ) + .map_err(|errno| { + tracing::error!(?errno, ?path, "failed to open the new root for pivot root"); + errno + })?; // make the given path as the root directory for the container // see https://man7.org/linux/man-pages/man2/pivot_root.2.html, specially the notes @@ -279,6 +283,11 @@ impl Syscall for LinuxSyscall { errno })?; + close(newroot).map_err(|errno| { + tracing::error!(?errno, ?newroot, "failed to close new root directory"); + errno + })?; + Ok(()) } From 6dfbb77adbd0db755f627518ce4500eac3b87d8e Mon Sep 17 00:00:00 2001 From: Aidan Hobson Sayers Date: Mon, 2 Sep 2024 02:25:50 +0100 Subject: [PATCH 08/11] Add tests for preserve-fds, extend test harness Test harness additionally needed to support 1. tests that cannot run in parallel 2. tests that need to customise create arguments Signed-off-by: Aidan Hobson Sayers --- tests/contest/contest/src/main.rs | 3 + .../contest/src/tests/fd_control/mod.rs | 107 ++++++++++++++++++ .../src/tests/lifecycle/container_create.rs | 4 + .../tests/lifecycle/container_lifecycle.rs | 4 + tests/contest/contest/src/tests/mod.rs | 1 + .../contest/src/tests/pidfile/pidfile_test.rs | 32 ++---- tests/contest/contest/src/utils/mod.rs | 2 +- tests/contest/contest/src/utils/test_utils.rs | 14 ++- tests/contest/runtimetest/src/main.rs | 1 + tests/contest/runtimetest/src/tests.rs | 45 ++++++++ .../contest/test_framework/src/test_group.rs | 90 ++++++++++----- .../test_framework/src/test_manager.rs | 29 +++++ tests/contest/test_framework/src/testable.rs | 1 + 13 files changed, 281 insertions(+), 52 deletions(-) create mode 100644 tests/contest/contest/src/tests/fd_control/mod.rs diff --git a/tests/contest/contest/src/main.rs b/tests/contest/contest/src/main.rs index bb1019ca8..7c2708624 100644 --- a/tests/contest/contest/src/main.rs +++ b/tests/contest/contest/src/main.rs @@ -12,6 +12,7 @@ use tests::cgroups; use crate::tests::devices::get_devices_test; use crate::tests::domainname::get_domainname_tests; use crate::tests::example::get_example_test; +use crate::tests::fd_control::get_fd_control_test; use crate::tests::hooks::get_hooks_tests; use crate::tests::hostname::get_hostname_test; use crate::tests::intel_rdt::get_intel_rdt_test; @@ -125,6 +126,7 @@ fn main() -> Result<()> { let process_rlimtis = get_process_rlimits_test(); let no_pivot = get_no_pivot_test(); let process_oom_score_adj = get_process_oom_score_adj_test(); + let fd_control = get_fd_control_test(); tm.add_test_group(Box::new(cl)); tm.add_test_group(Box::new(cc)); @@ -154,6 +156,7 @@ fn main() -> Result<()> { tm.add_test_group(Box::new(process_rlimtis)); tm.add_test_group(Box::new(no_pivot)); tm.add_test_group(Box::new(process_oom_score_adj)); + tm.add_test_group(Box::new(fd_control)); tm.add_test_group(Box::new(io_priority_test)); tm.add_cleanup(Box::new(cgroups::cleanup_v1)); diff --git a/tests/contest/contest/src/tests/fd_control/mod.rs b/tests/contest/contest/src/tests/fd_control/mod.rs new file mode 100644 index 000000000..1470670a9 --- /dev/null +++ b/tests/contest/contest/src/tests/fd_control/mod.rs @@ -0,0 +1,107 @@ +use std::fs; +use std::os::fd::{AsRawFd, RawFd}; + +use anyhow::{anyhow, Context, Result}; +use oci_spec::runtime::{ProcessBuilder, Spec, SpecBuilder}; +use test_framework::{test_result, Test, TestGroup, TestResult}; + +use crate::utils::{test_inside_container, CreateOptions}; + +fn create_spec() -> Result { + SpecBuilder::default() + .process( + ProcessBuilder::default() + .args( + ["runtimetest", "fd_control"] + .iter() + .map(|s| s.to_string()) + .collect::>(), + ) + .build()?, + ) + .build() + .context("failed to create spec") +} + +fn open_devnull_no_cloexec() -> Result<(fs::File, RawFd)> { + // Rust std by default sets cloexec, so we undo it + let devnull = fs::File::open("/dev/null")?; + let devnull_fd = devnull.as_raw_fd(); + let flags = nix::fcntl::fcntl(devnull_fd, nix::fcntl::FcntlArg::F_GETFD)?; + let mut flags = nix::fcntl::FdFlag::from_bits_retain(flags); + flags.remove(nix::fcntl::FdFlag::FD_CLOEXEC); + nix::fcntl::fcntl(devnull_fd, nix::fcntl::FcntlArg::F_SETFD(flags))?; + Ok((devnull, devnull_fd)) +} + +// If not opening any other FDs, verify youki itself doesnt open anything that gets +// leaked in if passing --preserve-fds with a large number +// NOTE: this will also fail if the test harness itself starts leaking FDs +fn only_stdio_test() -> TestResult { + let spec = test_result!(create_spec()); + test_inside_container( + spec, + &CreateOptions::default().with_extra_args(&["--preserve-fds".as_ref(), "100".as_ref()]), + &|bundle_path| { + fs::write(bundle_path.join("num-fds"), "0".as_bytes())?; + Ok(()) + }, + ) +} + +// If we know we have an open FD without cloexec, it should be closed if preserve-fds +// is 0 (the default) +fn closes_fd_test() -> TestResult { + // Open this before the setup function so it's kept alive for the container lifetime + let (_devnull, _devnull_fd) = match open_devnull_no_cloexec() { + Ok(v) => v, + Err(e) => return TestResult::Failed(anyhow!("failed to open dev null: {}", e)), + }; + + let spec = test_result!(create_spec()); + test_inside_container( + spec, + &CreateOptions::default().with_extra_args(&["--preserve-fds".as_ref(), "0".as_ref()]), + &|bundle_path| { + fs::write(bundle_path.join("num-fds"), "0".as_bytes())?; + Ok(()) + }, + ) +} + +// Given an open FD, verify it can be passed down with preserve-fds +fn pass_single_fd_test() -> TestResult { + // Open this before the setup function so it's kept alive for the container lifetime + let (_devnull, devnull_fd) = match open_devnull_no_cloexec() { + Ok(v) => v, + Err(e) => return TestResult::Failed(anyhow!("failed to open dev null: {}", e)), + }; + + let spec = test_result!(create_spec()); + test_inside_container( + spec, + &CreateOptions::default().with_extra_args(&[ + "--preserve-fds".as_ref(), + (devnull_fd - 2).to_string().as_ref(), // relative to stdio + ]), + &|bundle_path| { + fs::write(bundle_path.join("num-fds"), "1".as_bytes())?; + Ok(()) + }, + ) +} + +pub fn get_fd_control_test() -> TestGroup { + let mut test_group = TestGroup::new("fd_control"); + test_group.set_nonparallel(); // fds are process-wide state + let test_only_stdio = Test::new("only_stdio", Box::new(only_stdio_test)); + let test_closes_fd = Test::new("closes_fd", Box::new(closes_fd_test)); + let test_pass_single_fd = Test::new("pass_single_fd", Box::new(pass_single_fd_test)); + test_group.add(vec![ + Box::new(test_only_stdio), + Box::new(test_closes_fd), + Box::new(test_pass_single_fd), + ]); + + test_group +} diff --git a/tests/contest/contest/src/tests/lifecycle/container_create.rs b/tests/contest/contest/src/tests/lifecycle/container_create.rs index a2021bff7..2f2050b04 100644 --- a/tests/contest/contest/src/tests/lifecycle/container_create.rs +++ b/tests/contest/contest/src/tests/lifecycle/container_create.rs @@ -82,6 +82,10 @@ impl TestableGroup for ContainerCreate { "create" } + fn parallel(&self) -> bool { + true + } + fn run_all(&self) -> Vec<(&'static str, TestResult)> { vec![ ("empty_id", self.create_empty_id()), diff --git a/tests/contest/contest/src/tests/lifecycle/container_lifecycle.rs b/tests/contest/contest/src/tests/lifecycle/container_lifecycle.rs index 6b925feea..b33f3364c 100644 --- a/tests/contest/contest/src/tests/lifecycle/container_lifecycle.rs +++ b/tests/contest/contest/src/tests/lifecycle/container_lifecycle.rs @@ -93,6 +93,10 @@ impl TestableGroup for ContainerLifecycle { "lifecycle" } + fn parallel(&self) -> bool { + true + } + fn run_all(&self) -> Vec<(&'static str, TestResult)> { vec![ ("create", self.create()), diff --git a/tests/contest/contest/src/tests/mod.rs b/tests/contest/contest/src/tests/mod.rs index 6e8e39be8..ac7f6e0ee 100644 --- a/tests/contest/contest/src/tests/mod.rs +++ b/tests/contest/contest/src/tests/mod.rs @@ -2,6 +2,7 @@ pub mod cgroups; pub mod devices; pub mod domainname; pub mod example; +pub mod fd_control; pub mod hooks; pub mod hostname; pub mod intel_rdt; diff --git a/tests/contest/contest/src/tests/pidfile/pidfile_test.rs b/tests/contest/contest/src/tests/pidfile/pidfile_test.rs index 3d545bfd4..2c3d9525d 100644 --- a/tests/contest/contest/src/tests/pidfile/pidfile_test.rs +++ b/tests/contest/contest/src/tests/pidfile/pidfile_test.rs @@ -1,13 +1,12 @@ use std::fs::File; -use std::process::{Command, Stdio}; use anyhow::anyhow; use test_framework::{Test, TestGroup, TestResult}; use uuid::Uuid; use crate::utils::{ - delete_container, generate_uuid, get_runtime_path, get_state, kill_container, prepare_bundle, - State, + create_container, delete_container, generate_uuid, get_state, kill_container, prepare_bundle, + CreateOptions, State, }; #[inline] @@ -18,7 +17,8 @@ fn cleanup(id: &Uuid, bundle: &tempfile::TempDir) { } // here we have to manually create and manage the container -// as the test_inside container does not provide a way to set the pid file argument +// as the test_inside_container does not provide a way to set the pid file argument +// TODO: this comment is now out of date, the test just needs updating fn test_pidfile() -> TestResult { // create id for the container and pidfile let container_id = generate_uuid(); @@ -30,22 +30,14 @@ fn test_pidfile() -> TestResult { let _ = File::create(&pidfile_path).unwrap(); // start the container - Command::new(get_runtime_path()) - .stdin(Stdio::null()) - .stdout(Stdio::null()) - .stderr(Stdio::null()) - .arg("--root") - .arg(bundle.as_ref().join("runtime")) - .arg("create") - .arg(container_id.to_string()) - .arg("--bundle") - .arg(bundle.as_ref().join("bundle")) - .arg("--pid-file") - .arg(pidfile_path) - .spawn() - .unwrap() - .wait() - .unwrap(); + create_container( + &container_id.to_string(), + &bundle, + &CreateOptions::default().with_extra_args(&["--pid-file".as_ref(), pidfile_path.as_ref()]), + ) + .unwrap() + .wait() + .unwrap(); let (out, err) = get_state(&container_id.to_string(), &bundle).unwrap(); diff --git a/tests/contest/contest/src/utils/mod.rs b/tests/contest/contest/src/utils/mod.rs index 9f7c3eccb..9088ac27d 100644 --- a/tests/contest/contest/src/utils/mod.rs +++ b/tests/contest/contest/src/utils/mod.rs @@ -7,5 +7,5 @@ pub use support::{ }; pub use test_utils::{ create_container, delete_container, get_state, kill_container, test_inside_container, - test_outside_container, State, + test_outside_container, CreateOptions, State, }; diff --git a/tests/contest/contest/src/utils/test_utils.rs b/tests/contest/contest/src/utils/test_utils.rs index 0964a9a04..9da480923 100644 --- a/tests/contest/contest/src/utils/test_utils.rs +++ b/tests/contest/contest/src/utils/test_utils.rs @@ -1,6 +1,7 @@ //! Contains utility functions for testing //! Similar to https://github.com/opencontainers/runtime-tools/blob/master/validation/util/test.go use std::collections::HashMap; +use std::ffi::OsStr; use std::path::{Path, PathBuf}; use std::process::{Child, Command, ExitStatus, Stdio}; use std::thread::sleep; @@ -43,11 +44,17 @@ pub struct ContainerData { } #[derive(Debug, Default)] -pub struct CreateOptions { +pub struct CreateOptions<'a> { + extra_args: &'a [&'a OsStr], no_pivot: bool, } -impl CreateOptions { +impl<'a> CreateOptions<'a> { + pub fn with_extra_args(mut self, extra_args: &'a [&'a OsStr]) -> Self { + self.extra_args = extra_args; + self + } + pub fn with_no_pivot_root(mut self) -> Self { self.no_pivot = true; self @@ -64,7 +71,8 @@ fn create_container_command>(id: &str, dir: P, options: &CreateOp .arg("create") .arg(id) .arg("--bundle") - .arg(dir.as_ref().join("bundle")); + .arg(dir.as_ref().join("bundle")) + .args(options.extra_args); if options.no_pivot { command.arg("--no-pivot"); } diff --git a/tests/contest/runtimetest/src/main.rs b/tests/contest/runtimetest/src/main.rs index ef85a35cb..62f8372b4 100644 --- a/tests/contest/runtimetest/src/main.rs +++ b/tests/contest/runtimetest/src/main.rs @@ -50,6 +50,7 @@ fn main() { "process_rlimits" => tests::validate_process_rlimits(&spec), "no_pivot" => tests::validate_rootfs(), "process_oom_score_adj" => tests::validate_process_oom_score_adj(&spec), + "fd_control" => tests::validate_fd_control(&spec), _ => eprintln!("error due to unexpected execute test name: {execute_test}"), } } diff --git a/tests/contest/runtimetest/src/tests.rs b/tests/contest/runtimetest/src/tests.rs index 721141aaf..46f6f270e 100644 --- a/tests/contest/runtimetest/src/tests.rs +++ b/tests/contest/runtimetest/src/tests.rs @@ -1,4 +1,5 @@ use std::env; +use std::ffi::OsStr; use std::fs::{self, read_dir}; use std::os::linux::fs::MetadataExt; use std::os::unix::fs::{FileTypeExt, PermissionsExt}; @@ -775,3 +776,47 @@ pub fn validate_process_oom_score_adj(spec: &Spec) { eprintln!("Unexpected oom_score_adj, expected: {expected_value} found: {actual_value}"); } } + +pub fn validate_fd_control(_spec: &Spec) { + // --preserve-fds does not get passed via the spec so we have to communicate information + // via the root filesystem + let expected_num_fds: usize = fs::read_to_string("/num-fds").unwrap().parse().unwrap(); + + let mut entries = vec![]; + let stdio: &[&OsStr] = &["0".as_ref(), "1".as_ref(), "2".as_ref()]; + for entry in fs::read_dir("/proc/self/fd").unwrap() { + let entry = entry.unwrap(); + let name = entry.file_name(); + if stdio.contains(&name.as_os_str()) { + // Ignore stdio + continue; + } + entries.push((entry.path(), fs::read_link(entry.path()))) + } + + // NOTE: we do this in a separate loop so we can filter out the dirfd used behind + // the scenes in 'fs::read_dir'. It is important to *not* store the full DirEntry + // type, as that keeps the dirfd open. + let mut fd_details = vec![]; + let mut found_dirfd = false; + for (path, linkpath) in &entries { + println!("found fd in container {} {:?}", path.display(), linkpath); + // The difference between metadata.unwrap() and fs::metadata is that the latter + // will now try to follow the symlink + match fs::metadata(path) { + Ok(m) => fd_details.push((path, linkpath, m)), + Err(e) if e.kind() == std::io::ErrorKind::NotFound && !found_dirfd => { + // Expected for the dirfd + println!("(ignoring dirfd)"); + found_dirfd = true + } + Err(e) => { + eprintln!("unexpected error reading metadata: {}", e) + } + } + } + + if fd_details.len() != expected_num_fds { + eprintln!("mismatched fds inside container! {:?}", fd_details); + } +} diff --git a/tests/contest/test_framework/src/test_group.rs b/tests/contest/test_framework/src/test_group.rs index b5a01ec6b..771041165 100644 --- a/tests/contest/test_framework/src/test_group.rs +++ b/tests/contest/test_framework/src/test_group.rs @@ -9,6 +9,9 @@ use crate::testable::{TestResult, Testable, TestableGroup}; pub struct TestGroup { /// name of the test group name: &'static str, + /// can the test group be executed in parallel (both the tests + /// within it, and alongside other test groups) + parallel: bool, /// tests belonging to this group tests: BTreeMap<&'static str, Box>, } @@ -18,10 +21,16 @@ impl TestGroup { pub fn new(name: &'static str) -> Self { TestGroup { name, + parallel: true, tests: BTreeMap::new(), } } + /// mark the test group as unsuitable for parallel execution + pub fn set_nonparallel(&mut self) { + self.parallel = false + } + /// add a test to the group pub fn add(&mut self, tests: Vec>) { tests.into_iter().for_each(|t| { @@ -36,26 +45,41 @@ impl TestableGroup for TestGroup { self.name } + /// can this test group be executed (within itself, and alongside other groups) + fn parallel(&self) -> bool { + self.parallel + } + /// run all the test from the test group fn run_all(&self) -> Vec<(&'static str, TestResult)> { let mut ret = Vec::with_capacity(self.tests.len()); - thread::scope(|s| { - let mut collector = Vec::with_capacity(self.tests.len()); + if self.parallel { + thread::scope(|s| { + let mut collector = Vec::with_capacity(self.tests.len()); + for (_, t) in self.tests.iter() { + let _t = s.spawn(move |_| { + if t.can_run() { + (t.get_name(), t.run()) + } else { + (t.get_name(), TestResult::Skipped) + } + }); + collector.push(_t); + } + for handle in collector { + ret.push(handle.join().unwrap()); + } + }) + .unwrap(); + } else { for (_, t) in self.tests.iter() { - let _t = s.spawn(move |_| { - if t.can_run() { - (t.get_name(), t.run()) - } else { - (t.get_name(), TestResult::Skipped) - } + ret.push(if t.can_run() { + (t.get_name(), t.run()) + } else { + (t.get_name(), TestResult::Skipped) }); - collector.push(_t); } - for handle in collector { - ret.push(handle.join().unwrap()); - } - }) - .unwrap(); + } ret } @@ -66,23 +90,33 @@ impl TestableGroup for TestGroup { .iter() .filter(|(name, _)| selected.contains(name)); let mut ret = Vec::with_capacity(selected.len()); - thread::scope(|s| { - let mut collector = Vec::with_capacity(selected.len()); + if self.parallel { + thread::scope(|s| { + let mut collector = Vec::with_capacity(selected.len()); + for (_, t) in selected_tests { + let _t = s.spawn(move |_| { + if t.can_run() { + (t.get_name(), t.run()) + } else { + (t.get_name(), TestResult::Skipped) + } + }); + collector.push(_t); + } + for handle in collector { + ret.push(handle.join().unwrap()); + } + }) + .unwrap(); + } else { for (_, t) in selected_tests { - let _t = s.spawn(move |_| { - if t.can_run() { - (t.get_name(), t.run()) - } else { - (t.get_name(), TestResult::Skipped) - } + ret.push(if t.can_run() { + (t.get_name(), t.run()) + } else { + (t.get_name(), TestResult::Skipped) }); - collector.push(_t); } - for handle in collector { - ret.push(handle.join().unwrap()); - } - }) - .unwrap(); + } ret } } diff --git a/tests/contest/test_framework/src/test_manager.rs b/tests/contest/test_framework/src/test_manager.rs index 0bccdefd9..2971e24b6 100644 --- a/tests/contest/test_framework/src/test_manager.rs +++ b/tests/contest/test_framework/src/test_manager.rs @@ -64,6 +64,9 @@ impl TestManager { thread::scope(|s| { let mut collector = Vec::with_capacity(self.test_groups.len()); for (name, tg) in &self.test_groups { + if !tg.parallel() { + continue; + } let r = s.spawn(move |_| tg.run_all()); collector.push((name, r)); } @@ -72,6 +75,13 @@ impl TestManager { } }) .unwrap(); + for (name, tg) in &self.test_groups { + if tg.parallel() { + continue; + } + self.print_test_result(name, &tg.run_all()) + } + for cleaner in &self.cleanup { if let Err(e) = cleaner() { print!("Failed to cleanup: {e}"); @@ -85,6 +95,9 @@ impl TestManager { let mut collector = Vec::with_capacity(tests.len()); for (test_group_name, tests) in &tests { if let Some(tg) = self.test_groups.get(test_group_name) { + if !tg.parallel() { + continue; + } let r = match tests { None => s.spawn(move |_| tg.run_all()), Some(tests) => s.spawn(move |_| tg.run_selected(tests)), @@ -99,6 +112,22 @@ impl TestManager { } }) .unwrap(); + for (test_group_name, tests) in &tests { + if let Some(tg) = self.test_groups.get(test_group_name) { + if tg.parallel() { + continue; + } + self.print_test_result( + test_group_name, + &match tests { + None => tg.run_all(), + Some(tests) => tg.run_selected(tests), + }, + ); + } else { + // We've already printed errors for not finding tests + } + } for cleaner in &self.cleanup { if let Err(e) = cleaner() { diff --git a/tests/contest/test_framework/src/testable.rs b/tests/contest/test_framework/src/testable.rs index 2c90986b8..b1e15e311 100644 --- a/tests/contest/test_framework/src/testable.rs +++ b/tests/contest/test_framework/src/testable.rs @@ -39,6 +39,7 @@ pub trait Testable { /// Test groups are used to group tests in sensible manner as well as provide namespacing to tests pub trait TestableGroup { fn get_name(&self) -> &'static str; + fn parallel(&self) -> bool; fn run_all(&self) -> Vec<(&'static str, TestResult)>; fn run_selected(&self, selected: &[&str]) -> Vec<(&'static str, TestResult)>; } From 32d5dded202c760095b7119554f26c4a5bbab209 Mon Sep 17 00:00:00 2001 From: Aidan Hobson Sayers Date: Mon, 2 Sep 2024 15:52:59 +0100 Subject: [PATCH 09/11] Opt-out a test from runc that it errors on Signed-off-by: Aidan Hobson Sayers --- tests/contest/contest/src/tests/fd_control/mod.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tests/contest/contest/src/tests/fd_control/mod.rs b/tests/contest/contest/src/tests/fd_control/mod.rs index 1470670a9..a64d14b37 100644 --- a/tests/contest/contest/src/tests/fd_control/mod.rs +++ b/tests/contest/contest/src/tests/fd_control/mod.rs @@ -3,9 +3,9 @@ use std::os::fd::{AsRawFd, RawFd}; use anyhow::{anyhow, Context, Result}; use oci_spec::runtime::{ProcessBuilder, Spec, SpecBuilder}; -use test_framework::{test_result, Test, TestGroup, TestResult}; +use test_framework::{test_result, ConditionalTest, Test, TestGroup, TestResult}; -use crate::utils::{test_inside_container, CreateOptions}; +use crate::utils::{is_runtime_runc, test_inside_container, CreateOptions}; fn create_spec() -> Result { SpecBuilder::default() @@ -94,11 +94,16 @@ fn pass_single_fd_test() -> TestResult { pub fn get_fd_control_test() -> TestGroup { let mut test_group = TestGroup::new("fd_control"); test_group.set_nonparallel(); // fds are process-wide state - let test_only_stdio = Test::new("only_stdio", Box::new(only_stdio_test)); + let test_only_stdio = ConditionalTest::new( + "only_stdio", + // runc errors if any of the N passed FDs via preserve-fd are not currently open + Box::new(|| !is_runtime_runc()), + Box::new(only_stdio_test), + ); let test_closes_fd = Test::new("closes_fd", Box::new(closes_fd_test)); let test_pass_single_fd = Test::new("pass_single_fd", Box::new(pass_single_fd_test)); + test_group.add(vec![Box::new(test_only_stdio)]); test_group.add(vec![ - Box::new(test_only_stdio), Box::new(test_closes_fd), Box::new(test_pass_single_fd), ]); From e4ec29c3bcc4574f4458d8732fa3b502cdb9ee9c Mon Sep 17 00:00:00 2001 From: Aidan Hobson Sayers Date: Sun, 25 Aug 2024 13:21:26 +0100 Subject: [PATCH 10/11] Pass fewer fds down to intermediate process --- crates/libcontainer/src/channel.rs | 9 ++-- .../process/container_intermediate_process.rs | 44 ++++++------------- .../src/process/container_main_process.rs | 27 +++++++----- 3 files changed, 33 insertions(+), 47 deletions(-) diff --git a/crates/libcontainer/src/channel.rs b/crates/libcontainer/src/channel.rs index fd2ea6c04..8493b7c59 100644 --- a/crates/libcontainer/src/channel.rs +++ b/crates/libcontainer/src/channel.rs @@ -1,6 +1,6 @@ use std::io::{IoSlice, IoSliceMut}; use std::marker::PhantomData; -use std::os::fd::AsRawFd; +use std::os::fd::IntoRawFd; use std::os::unix::prelude::RawFd; use nix::sys::socket::{self, UnixAddr}; @@ -213,9 +213,6 @@ fn unix_channel() -> Result<(RawFd, RawFd), ChannelError> { socket::SockFlag::SOCK_CLOEXEC, )?; // It is not straightforward to share the OwnedFd across forks, so we - // treat them as i32. We use ManuallyDrop to keep the connection open. - let f1 = std::mem::ManuallyDrop::new(f1); - let f2 = std::mem::ManuallyDrop::new(f2); - - Ok((f1.as_raw_fd(), f2.as_raw_fd())) + // treat them as i32 - into_raw_fd ensures they're not dropped + Ok((f1.into_raw_fd(), f2.into_raw_fd())) } diff --git a/crates/libcontainer/src/process/container_intermediate_process.rs b/crates/libcontainer/src/process/container_intermediate_process.rs index ae245e1a6..f2dd97776 100644 --- a/crates/libcontainer/src/process/container_intermediate_process.rs +++ b/crates/libcontainer/src/process/container_intermediate_process.rs @@ -39,12 +39,10 @@ type Result = std::result::Result; pub fn container_intermediate_process( args: &ContainerArgs, - intermediate_chan: &mut (channel::IntermediateSender, channel::IntermediateReceiver), - init_chan: &mut (channel::InitSender, channel::InitReceiver), + inter_receiver: &mut channel::IntermediateReceiver, + init_receiver: &mut channel::InitReceiver, main_sender: &mut channel::MainSender, ) -> Result<()> { - let (inter_sender, inter_receiver) = intermediate_chan; - let (init_sender, init_receiver) = init_chan; let command = args.syscall.create_syscall(); let spec = &args.spec; let linux = spec.linux().as_ref().ok_or(MissingSpecError::Linux)?; @@ -86,6 +84,12 @@ pub fn container_intermediate_process( command.set_id(Uid::from_raw(0), Gid::from_raw(0))?; } + // We're done with inter_receiver here + inter_receiver.close().map_err(|err| { + tracing::error!("failed to close unused main sender: {}", err); + err + })?; + // set limits and namespaces to the process let proc = spec.process().as_ref().ok_or(MissingSpecError::Process)?; if let Some(rlimits) = proc.rlimits() { @@ -109,17 +113,8 @@ pub fn container_intermediate_process( return ret; } - // We are inside the forked process here. The first thing we have to do - // is to close any unused senders, since fork will make a dup for all - // the socket. - if let Err(err) = init_sender.close() { - tracing::error!(?err, "failed to close receiver in init process"); - return -1; - } - if let Err(err) = inter_sender.close() { - tracing::error!(?err, "failed to close sender in the intermediate process"); - return -1; - } + // We are inside the forked process here - all FDs have been cleaned up + // already match container_init_process(args, main_sender, init_receiver) { Ok(_) => 0, Err(e) => { @@ -171,22 +166,9 @@ pub fn container_intermediate_process( err })?; - // Close unused senders here so we don't have lingering socket around. - main_sender.close().map_err(|err| { - tracing::error!("failed to close unused main sender: {}", err); - err - })?; - inter_sender.close().map_err(|err| { - tracing::error!( - "failed to close sender in the intermediate process: {}", - err - ); - err - })?; - init_sender.close().map_err(|err| { - tracing::error!("failed to close unused init sender: {}", err); - err - })?; + // Don't close main sender here - the intermediate process is about + // to exit successfully, closing gives an opportunity for a syscall + // to fail which then cannot be reported Ok(()) } diff --git a/crates/libcontainer/src/process/container_main_process.rs b/crates/libcontainer/src/process/container_main_process.rs index 00a05546b..190badb86 100644 --- a/crates/libcontainer/src/process/container_main_process.rs +++ b/crates/libcontainer/src/process/container_main_process.rs @@ -40,8 +40,11 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bo // At minimum, we have to close down any unused senders. The corresponding // receivers will be cleaned up once the senders are closed down. let (mut main_sender, mut main_receiver) = channel::main_channel()?; - let mut inter_chan = channel::intermediate_channel()?; - let mut init_chan = channel::init_channel()?; + let (mut inter_sender, mut inter_receiver) = channel::intermediate_channel()?; + #[cfg(feature = "libseccomp")] + let (mut init_sender, mut init_receiver) = channel::init_channel()?; + #[cfg(not(feature = "libseccomp"))] + let (init_sender, mut init_receiver) = channel::init_channel()?; let cb: CloneCb = { Box::new(|| { @@ -50,10 +53,20 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bo return ret; } + // We are inside the forked process here - clean up FDs that don't need + // passing down + if let Err(err) = init_sender.close() { + tracing::error!(?err, "failed to close receiver in init process"); + return -1; + } + if let Err(err) = inter_sender.close() { + tracing::error!(?err, "failed to close receiver in init process"); + return -1; + } match container_intermediate_process::container_intermediate_process( container_args, - &mut inter_chan, - &mut init_chan, + &mut inter_receiver, + &mut init_receiver, &mut main_sender, ) { Ok(_) => 0, @@ -87,12 +100,6 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bo err })?; - let (mut inter_sender, inter_receiver) = inter_chan; - #[cfg(feature = "libseccomp")] - let (mut init_sender, init_receiver) = init_chan; - #[cfg(not(feature = "libseccomp"))] - let (init_sender, init_receiver) = init_chan; - // If creating a container with new user namespace, the intermediate process will ask // the main process to set up uid and gid mapping, once the intermediate // process enters into a new user namespace. From 43c86642ac5d4024d22e144969fb96a5ab7ab23f Mon Sep 17 00:00:00 2001 From: Aidan Hobson Sayers Date: Sun, 25 Aug 2024 15:45:40 +0100 Subject: [PATCH 11/11] Implement FD remapping --- crates/libcontainer/src/container/builder.rs | 30 +++++++++++-- .../src/container/builder_impl.rs | 5 ++- .../src/container/init_builder.rs | 1 + .../src/container/tenant_builder.rs | 1 + crates/libcontainer/src/process/args.rs | 2 + .../src/process/container_init_process.rs | 18 ++++++-- crates/libcontainer/src/syscall/linux.rs | 42 +++++++++++++++++-- crates/libcontainer/src/syscall/syscall.rs | 4 +- crates/libcontainer/src/syscall/test.rs | 5 +++ 9 files changed, 95 insertions(+), 13 deletions(-) diff --git a/crates/libcontainer/src/container/builder.rs b/crates/libcontainer/src/container/builder.rs index a22642cd5..00f0572c0 100644 --- a/crates/libcontainer/src/container/builder.rs +++ b/crates/libcontainer/src/container/builder.rs @@ -1,4 +1,4 @@ -use std::os::fd::OwnedFd; +use std::os::fd::{OwnedFd, RawFd}; use std::path::PathBuf; use super::init_builder::InitContainerBuilder; @@ -20,8 +20,10 @@ pub struct ContainerBuilder { pub(super) pid_file: Option, /// Socket to communicate the file descriptor of the ptty pub(super) console_socket: Option, - /// File descriptors to be passed into the container process + /// Number of file descriptors to be passed into the container process pub(super) preserve_fds: i32, + /// File descriptors to be remapped into the container process + pub(super) remap_fds: Vec<(RawFd, RawFd)>, /// The function that actually runs on the container init process. Default /// is to execute the specified command in the oci spec. pub(super) executor: Box, @@ -76,6 +78,7 @@ impl ContainerBuilder { pid_file: None, console_socket: None, preserve_fds: 0, + remap_fds: vec![], executor: workload::default::get_executor(), stdin: None, stdout: None, @@ -231,7 +234,7 @@ impl ContainerBuilder { } /// Sets the number of additional file descriptors which will be passed into - /// the container process. + /// the container process, over and above stdio (0, 1, 2). /// # Example /// /// ```no_run @@ -242,6 +245,7 @@ impl ContainerBuilder { /// "74f1a4cb3801".to_owned(), /// SyscallType::default(), /// ) + /// // Will pass FDs <= 7 /// .with_preserved_fds(5); /// ``` pub fn with_preserved_fds(mut self, preserved_fds: i32) -> Self { @@ -249,6 +253,26 @@ impl ContainerBuilder { self } + /// Sets a list of file descriptors that will be mapped and passed into + /// the container process (ignoring any limits on preserved fds) + /// # Example + /// + /// ```no_run + /// # use libcontainer::container::builder::ContainerBuilder; + /// # use libcontainer::syscall::syscall::SyscallType; + /// + /// ContainerBuilder::new( + /// "74f1a4cb3801".to_owned(), + /// SyscallType::default(), + /// ) + /// // Overwrites stderr with FD 15, and explicitly passes FD 10 + /// .with_remapped_fds(vec![(15, 2), (10, 10)]); + /// ``` + pub fn with_remapped_fds(mut self, remapped_fds: Vec<(RawFd, RawFd)>) -> Self { + self.remap_fds = remapped_fds; + self + } + /// Sets the function that actually runs on the container init process. /// # Example /// diff --git a/crates/libcontainer/src/container/builder_impl.rs b/crates/libcontainer/src/container/builder_impl.rs index 8e0a150cf..35e910b76 100644 --- a/crates/libcontainer/src/container/builder_impl.rs +++ b/crates/libcontainer/src/container/builder_impl.rs @@ -39,8 +39,10 @@ pub(super) struct ContainerBuilderImpl { pub user_ns_config: Option, /// Path to the Unix Domain Socket to communicate container start pub notify_path: PathBuf, - /// File descriptos preserved/passed to the container init process. + /// Number of file descriptors preserved/passed to the container init process. pub preserve_fds: i32, + /// File descriptors remapped into the container process + pub remap_fds: Vec<(RawFd, RawFd)>, /// If the container is to be run in detached mode pub detached: bool, /// Default executes the specified execution of a generic command @@ -146,6 +148,7 @@ impl ContainerBuilderImpl { console_socket: self.console_socket.as_ref().map(|c| c.as_raw_fd()), notify_listener, preserve_fds: self.preserve_fds, + remap_fds: self.remap_fds.clone(), user_ns_config: self.user_ns_config.to_owned(), cgroup_config: self.cgroup_config.clone(), detached: self.detached, diff --git a/crates/libcontainer/src/container/init_builder.rs b/crates/libcontainer/src/container/init_builder.rs index 310ee92ba..dc62b277a 100644 --- a/crates/libcontainer/src/container/init_builder.rs +++ b/crates/libcontainer/src/container/init_builder.rs @@ -118,6 +118,7 @@ impl InitContainerBuilder { user_ns_config, notify_path, preserve_fds: self.base.preserve_fds, + remap_fds: self.base.remap_fds, detached: self.detached, executor: self.base.executor, no_pivot: self.no_pivot, diff --git a/crates/libcontainer/src/container/tenant_builder.rs b/crates/libcontainer/src/container/tenant_builder.rs index 410f2a740..70ce9d1e4 100644 --- a/crates/libcontainer/src/container/tenant_builder.rs +++ b/crates/libcontainer/src/container/tenant_builder.rs @@ -137,6 +137,7 @@ impl TenantContainerBuilder { user_ns_config, notify_path: notify_path.clone(), preserve_fds: self.base.preserve_fds, + remap_fds: self.base.remap_fds, detached: self.detached, executor: self.base.executor, no_pivot: false, diff --git a/crates/libcontainer/src/process/args.rs b/crates/libcontainer/src/process/args.rs index 0f11a8f87..ec8f59b9a 100644 --- a/crates/libcontainer/src/process/args.rs +++ b/crates/libcontainer/src/process/args.rs @@ -33,6 +33,8 @@ pub struct ContainerArgs { pub notify_listener: NotifyListener, /// File descriptors preserved/passed to the container init process. pub preserve_fds: i32, + /// File descriptors preserved/passed to the container init process. + pub remap_fds: Vec<(RawFd, RawFd)>, /// Options for new namespace creation pub user_ns_config: Option, /// Cgroup Manager Config diff --git a/crates/libcontainer/src/process/container_init_process.rs b/crates/libcontainer/src/process/container_init_process.rs index 818102043..533c8e0f3 100644 --- a/crates/libcontainer/src/process/container_init_process.rs +++ b/crates/libcontainer/src/process/container_init_process.rs @@ -662,6 +662,11 @@ pub fn container_init_process( } } + if proc.args().is_none() { + tracing::error!("on non-Windows, at least one process arg entry is required"); + Err(MissingSpecError::Args)?; + } + args.executor.validate(spec)?; args.executor.setup_envs(envs)?; @@ -702,10 +707,15 @@ pub fn container_init_process( } } - if proc.args().is_none() { - tracing::error!("on non-Windows, at least one process arg entry is required"); - Err(MissingSpecError::Args)?; - } + // Remap any FDs the user wants to pass to the container. This has to happen when + // no other FDs at all are open, otherwise the user could accidentally clobber + // an FD that's in use by youki. Unfortunately this can conflict with seccomp - + // the way to fix this would be to relocate in-use FDs out of the way of user + // requested ones, but that is quite hard. + syscall.remap_passed_fds(&args.remap_fds).map_err(|err| { + tracing::error!(?err, "failed to remap fds to be passed"); + InitProcessError::SyscallOther(err) + })?; args.executor.exec(spec).map_err(|err| { tracing::error!(?err, "failed to execute payload"); diff --git a/crates/libcontainer/src/syscall/linux.rs b/crates/libcontainer/src/syscall/linux.rs index 1b6d702e1..2501a2dab 100644 --- a/crates/libcontainer/src/syscall/linux.rs +++ b/crates/libcontainer/src/syscall/linux.rs @@ -1,10 +1,9 @@ //! Implements Command trait for Linux systems use std::any::Any; use std::ffi::{CStr, CString, OsStr}; -use std::os::fd::BorrowedFd; +use std::os::fd::{BorrowedFd, RawFd}; use std::os::unix::ffi::OsStrExt; use std::os::unix::fs::symlink; -use std::os::unix::io::RawFd; use std::path::Path; use std::str::FromStr; use std::sync::Arc; @@ -17,7 +16,7 @@ use nix::fcntl::{open, OFlag}; use nix::mount::{mount, umount2, MntFlags, MsFlags}; use nix::sched::{unshare, CloneFlags}; use nix::sys::stat::{mknod, Mode, SFlag}; -use nix::unistd::{chown, chroot, close, fchdir, pivot_root, sethostname, Gid, Uid}; +use nix::unistd::{chown, chroot, close, dup2, fchdir, pivot_root, sethostname, Gid, Uid}; use oci_spec::runtime::PosixRlimit; use super::{Result, Syscall, SyscallError}; @@ -179,7 +178,13 @@ impl LinuxSyscall { to_be_cleaned_up_fds.iter().for_each(|&fd| { // Intentionally ignore errors here -- the cases where this might fail // are basically file descriptors that have already been closed. - let _ = fcntl::fcntl(fd, fcntl::F_SETFD(fcntl::FdFlag::FD_CLOEXEC)); + match fcntl::fcntl(fd, fcntl::F_GETFD) { + Ok(flags) => { + let flags = fcntl::FdFlag::from_bits_retain(flags) & fcntl::FdFlag::FD_CLOEXEC; + let _ = fcntl::fcntl(fd, fcntl::F_SETFD(flags)); + }, + Err(_) => (), + } }); Ok(()) @@ -527,6 +532,35 @@ impl Syscall for LinuxSyscall { Ok(()) } + // Remap FDs and clear the cloexec flag if set + fn remap_passed_fds(&self, remap_fds: &[(RawFd, RawFd)]) -> Result<()> { + for &(oldfd, newfd) in remap_fds { + if oldfd != newfd { + // dup2 clears cloexec + dup2(oldfd, newfd).map_err(|errno| { + tracing::error!(?errno, ?oldfd, ?newfd, "failed to remap fd"); + errno + })?; + close(oldfd).map_err(|errno| { + tracing::error!(?errno, ?oldfd, "failed to close fd"); + errno + })?; + } else { + // dup2 is a no-op if the FDs match, so just clear cloexec + let flags = fcntl::fcntl(oldfd, fcntl::F_GETFD).map_err(|errno| { + tracing::error!(?errno, ?oldfd, "failed to get fd flags"); + errno + })?; + let flags = fcntl::FdFlag::from_bits_retain(flags) & !fcntl::FdFlag::FD_CLOEXEC; + fcntl::fcntl(oldfd, fcntl::F_SETFD(flags)).map_err(|errno| { + tracing::error!(?errno, ?oldfd, "failed to clear cloexec"); + errno + })?; + } + } + Ok(()) + } + fn mount_setattr( &self, dirfd: RawFd, diff --git a/crates/libcontainer/src/syscall/syscall.rs b/crates/libcontainer/src/syscall/syscall.rs index e886aef7a..5b243f037 100644 --- a/crates/libcontainer/src/syscall/syscall.rs +++ b/crates/libcontainer/src/syscall/syscall.rs @@ -3,6 +3,7 @@ //! implementation details use std::any::Any; use std::ffi::OsStr; +use std::os::fd::RawFd; use std::path::Path; use std::sync::Arc; @@ -24,7 +25,7 @@ pub trait Syscall { fn as_any(&self) -> &dyn Any; fn pivot_rootfs(&self, path: &Path) -> Result<()>; fn chroot(&self, path: &Path) -> Result<()>; - fn set_ns(&self, rawfd: i32, nstype: CloneFlags) -> Result<()>; + fn set_ns(&self, rawfd: RawFd, nstype: CloneFlags) -> Result<()>; fn set_id(&self, uid: Uid, gid: Gid) -> Result<()>; fn unshare(&self, flags: CloneFlags) -> Result<()>; fn set_capability(&self, cset: CapSet, value: &CapsHashSet) -> Result<()>; @@ -45,6 +46,7 @@ pub trait Syscall { fn chown(&self, path: &Path, owner: Option, group: Option) -> Result<()>; fn set_groups(&self, groups: &[Gid]) -> Result<()>; fn close_range(&self, preserve_fds: i32) -> Result<()>; + fn remap_passed_fds(&self, remap_fds: &[(RawFd, RawFd)]) -> Result<()>; fn mount_setattr( &self, dirfd: i32, diff --git a/crates/libcontainer/src/syscall/test.rs b/crates/libcontainer/src/syscall/test.rs index 6e2e01977..aec7114ac 100644 --- a/crates/libcontainer/src/syscall/test.rs +++ b/crates/libcontainer/src/syscall/test.rs @@ -2,6 +2,7 @@ use std::any::Any; use std::cell::{Ref, RefCell, RefMut}; use std::collections::HashMap; use std::ffi::{OsStr, OsString}; +use std::os::fd::RawFd; use std::path::{Path, PathBuf}; use std::sync::Arc; @@ -249,6 +250,10 @@ impl Syscall for TestHelperSyscall { todo!() } + fn remap_passed_fds(&self, _: &[(RawFd, RawFd)]) -> Result<()> { + todo!() + } + fn mount_setattr( &self, _: i32,