Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow users to remap FDs before container start #3013

Draft
wants to merge 13 commits into
base: main
Choose a base branch
from
3 changes: 2 additions & 1 deletion crates/libcgroups/src/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use oci_spec::runtime::LinuxResources;
use oci_spec::runtime::{
LinuxDevice, LinuxDeviceBuilder, LinuxDeviceCgroup, LinuxDeviceCgroupBuilder, LinuxDeviceType,
};
use serde::{Deserialize, Serialize};

use super::stats::Stats;
use super::{systemd, v1, v2};
Expand Down Expand Up @@ -326,7 +327,7 @@ pub enum CreateCgroupSetupError {
Systemd(#[from] systemd::manager::SystemdManagerError),
}

#[derive(Clone)]
#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
pub struct CgroupConfig {
pub cgroup_path: PathBuf,
pub systemd_cgroup: bool,
Expand Down
9 changes: 3 additions & 6 deletions crates/libcontainer/src/channel.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::io::{IoSlice, IoSliceMut};
use std::marker::PhantomData;
use std::os::fd::AsRawFd;
use std::os::fd::IntoRawFd;
use std::os::unix::prelude::RawFd;

use nix::sys::socket::{self, UnixAddr};
Expand Down Expand Up @@ -213,9 +213,6 @@ fn unix_channel() -> Result<(RawFd, RawFd), ChannelError> {
socket::SockFlag::SOCK_CLOEXEC,
)?;
// It is not straightforward to share the OwnedFd across forks, so we
// treat them as i32. We use ManuallyDrop to keep the connection open.
let f1 = std::mem::ManuallyDrop::new(f1);
let f2 = std::mem::ManuallyDrop::new(f2);

Ok((f1.as_raw_fd(), f2.as_raw_fd()))
// treat them as i32 - into_raw_fd ensures they're not dropped
Ok((f1.into_raw_fd(), f2.into_raw_fd()))
}
38 changes: 20 additions & 18 deletions crates/libcontainer/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@ use std::path::{Path, PathBuf};
use oci_spec::runtime::{Hooks, Spec};
use serde::{Deserialize, Serialize};

use crate::utils;

#[derive(Debug, thiserror::Error)]
pub enum ConfigError {
#[error("failed to save config")]
Expand Down Expand Up @@ -43,20 +41,17 @@ const YOUKI_CONFIG_NAME: &str = "youki_config.json";
#[non_exhaustive]
pub struct YoukiConfig {
pub hooks: Option<Hooks>,
pub cgroup_path: PathBuf,
pub cgroup_config: Option<libcgroups::common::CgroupConfig>,
}

impl<'a> YoukiConfig {
pub fn from_spec(spec: &'a Spec, container_id: &str) -> Result<Self> {
pub fn from_spec(
spec: &'a Spec,
cgroup_config: Option<libcgroups::common::CgroupConfig>,
) -> Result<Self> {
Ok(YoukiConfig {
hooks: spec.hooks().clone(),
cgroup_path: utils::get_cgroup_path(
spec.linux()
.as_ref()
.ok_or(ConfigError::MissingLinux)?
.cgroups_path(),
container_id,
),
cgroup_config,
})
}

Expand Down Expand Up @@ -106,13 +101,15 @@ mod tests {
fn test_config_from_spec() -> Result<()> {
let container_id = "sample";
let spec = Spec::default();
let config = YoukiConfig::from_spec(&spec, container_id)?;
let cgroup_config = libcgroups::common::CgroupConfig {
cgroup_path: PathBuf::from(format!(":youki:{container_id}")),
systemd_cgroup: true,
container_name: container_id.to_owned(),
};
let config = YoukiConfig::from_spec(&spec, Some(cgroup_config.clone()))?;
assert_eq!(&config.hooks, spec.hooks());
dbg!(&config.cgroup_path);
assert_eq!(
config.cgroup_path,
PathBuf::from(format!(":youki:{container_id}"))
);
dbg!(&config.cgroup_config);
assert_eq!(config.cgroup_config, Some(cgroup_config));
Ok(())
}

Expand All @@ -121,7 +118,12 @@ mod tests {
let container_id = "sample";
let tmp = tempfile::tempdir().expect("create temp dir");
let spec = Spec::default();
let config = YoukiConfig::from_spec(&spec, container_id)?;
let cgroup_config = libcgroups::common::CgroupConfig {
cgroup_path: PathBuf::from(format!(":youki:{container_id}")),
systemd_cgroup: true,
container_name: container_id.to_owned(),
};
let config = YoukiConfig::from_spec(&spec, Some(cgroup_config))?;
config.save(&tmp)?;
let act = YoukiConfig::load(&tmp)?;
assert_eq!(act, config);
Expand Down
30 changes: 27 additions & 3 deletions crates/libcontainer/src/container/builder.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::os::fd::OwnedFd;
use std::os::fd::{OwnedFd, RawFd};
use std::path::PathBuf;

use super::init_builder::InitContainerBuilder;
Expand All @@ -20,8 +20,10 @@ pub struct ContainerBuilder {
pub(super) pid_file: Option<PathBuf>,
/// Socket to communicate the file descriptor of the ptty
pub(super) console_socket: Option<PathBuf>,
/// File descriptors to be passed into the container process
/// Number of file descriptors to be passed into the container process
pub(super) preserve_fds: i32,
/// File descriptors to be remapped into the container process
pub(super) remap_fds: Vec<(RawFd, RawFd)>,
/// The function that actually runs on the container init process. Default
/// is to execute the specified command in the oci spec.
pub(super) executor: Box<dyn Executor>,
Expand Down Expand Up @@ -76,6 +78,7 @@ impl ContainerBuilder {
pid_file: None,
console_socket: None,
preserve_fds: 0,
remap_fds: vec![],
executor: workload::default::get_executor(),
stdin: None,
stdout: None,
Expand Down Expand Up @@ -231,7 +234,7 @@ impl ContainerBuilder {
}

/// Sets the number of additional file descriptors which will be passed into
/// the container process.
/// the container process, over and above stdio (0, 1, 2).
/// # Example
///
/// ```no_run
Expand All @@ -242,13 +245,34 @@ impl ContainerBuilder {
/// "74f1a4cb3801".to_owned(),
/// SyscallType::default(),
/// )
/// // Will pass FDs <= 7
/// .with_preserved_fds(5);
/// ```
pub fn with_preserved_fds(mut self, preserved_fds: i32) -> Self {
self.preserve_fds = preserved_fds;
self
}

/// Sets a list of file descriptors that will be mapped and passed into
/// the container process (ignoring any limits on preserved fds)
/// # Example
///
/// ```no_run
/// # use libcontainer::container::builder::ContainerBuilder;
/// # use libcontainer::syscall::syscall::SyscallType;
///
/// ContainerBuilder::new(
/// "74f1a4cb3801".to_owned(),
/// SyscallType::default(),
/// )
/// // Overwrites stderr with FD 15, and explicitly passes FD 10
/// .with_remapped_fds(vec![(15, 2), (10, 10)]);
/// ```
pub fn with_remapped_fds(mut self, remapped_fds: Vec<(RawFd, RawFd)>) -> Self {
self.remap_fds = remapped_fds;
self
}

/// Sets the function that actually runs on the container init process.
/// # Example
///
Expand Down
110 changes: 45 additions & 65 deletions crates/libcontainer/src/container/builder_impl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,24 +10,22 @@

use super::{Container, ContainerStatus};
use crate::error::{CreateContainerError, LibcontainerError, MissingSpecError};
use crate::hooks;
use crate::notify_socket::NotifyListener;
use crate::process::args::{ContainerArgs, ContainerType};
use crate::process::intel_rdt::delete_resctrl_subdirectory;
use crate::process::{self};
use crate::syscall::syscall::SyscallType;
use crate::user_ns::UserNamespaceConfig;
use crate::workload::Executor;
use crate::{hooks, utils};

pub(super) struct ContainerBuilderImpl {
/// Flag indicating if an init or a tenant container should be created
pub container_type: ContainerType,
/// Interface to operating system primitives
pub syscall: SyscallType,
/// Flag indicating if systemd should be used for cgroup management
pub use_systemd: bool,
/// Id of the container
pub container_id: String,
/// Interface to operating system primitives
pub cgroup_config: Option<libcgroups::common::CgroupConfig>,
/// OCI compliant runtime spec
pub spec: Rc<Spec>,
/// Root filesystem of the container
Expand All @@ -41,10 +39,10 @@
pub user_ns_config: Option<UserNamespaceConfig>,
/// Path to the Unix Domain Socket to communicate container start
pub notify_path: PathBuf,
/// Container state
pub container: Option<Container>,
/// File descriptos preserved/passed to the container init process.
/// Number of file descriptors preserved/passed to the container init process.
pub preserve_fds: i32,
/// File descriptors remapped into the container process
pub remap_fds: Vec<(RawFd, RawFd)>,

Check failure on line 45 in crates/libcontainer/src/container/builder_impl.rs

View workflow job for this annotation

GitHub Actions / tests (x86_64, gnu)

cannot find type `RawFd` in this scope

Check failure on line 45 in crates/libcontainer/src/container/builder_impl.rs

View workflow job for this annotation

GitHub Actions / tests (x86_64, gnu)

cannot find type `RawFd` in this scope

Check failure on line 45 in crates/libcontainer/src/container/builder_impl.rs

View workflow job for this annotation

GitHub Actions / tests (x86_64, musl)

cannot find type `RawFd` in this scope

Check failure on line 45 in crates/libcontainer/src/container/builder_impl.rs

View workflow job for this annotation

GitHub Actions / tests (x86_64, musl)

cannot find type `RawFd` in this scope
/// If the container is to be run in detached mode
pub detached: bool,
/// Default executes the specified execution of a generic command
Expand All @@ -66,42 +64,28 @@
Err(outer) => {
// Only the init container should be cleaned up in the case of
// an error.
let cleanup_err = if self.is_init_container() {
self.cleanup_container().err()
} else {
None
};
let cleanup_err =
if let ContainerType::InitContainer { container } = &self.container_type {
cleanup_container(self.cgroup_config.clone(), container).err()
} else {
None
};

Err(CreateContainerError::new(outer, cleanup_err).into())
}
}
}

fn is_init_container(&self) -> bool {
matches!(self.container_type, ContainerType::InitContainer)
}

fn run_container(&mut self) -> Result<Pid, LibcontainerError> {
let linux = self.spec.linux().as_ref().ok_or(MissingSpecError::Linux)?;
let cgroups_path = utils::get_cgroup_path(linux.cgroups_path(), &self.container_id);
let cgroup_config = libcgroups::common::CgroupConfig {
cgroup_path: cgroups_path,
systemd_cgroup: self.use_systemd || self.user_ns_config.is_some(),
container_name: self.container_id.to_owned(),
};
let process = self
.spec
.process()
.as_ref()
.ok_or(MissingSpecError::Process)?;

if matches!(self.container_type, ContainerType::InitContainer) {
if let ContainerType::InitContainer { container } = &self.container_type {
if let Some(hooks) = self.spec.hooks() {
hooks::run_hooks(
hooks.create_runtime().as_ref(),
self.container.as_ref(),
None,
)?
hooks::run_hooks(hooks.create_runtime().as_ref(), container, None)?
}
}

Expand Down Expand Up @@ -143,6 +127,7 @@
// going to be switching to a different security context. Thus setting
// ourselves to be non-dumpable only breaks things (like rootless
// containers), which is the recommendation from the kernel folks.
let linux = self.spec.linux().as_ref().ok_or(MissingSpecError::Linux)?;
if linux.namespaces().is_some() {
prctl::set_dumpable(false).map_err(|e| {
LibcontainerError::Other(format!(
Expand All @@ -156,16 +141,16 @@
// therefore we will have to move all the variable by value. Since self
// is a shared reference, we have to clone these variables here.
let container_args = ContainerArgs {
container_type: self.container_type,
container_type: self.container_type.clone(),
syscall: self.syscall,
spec: Rc::clone(&self.spec),
rootfs: self.rootfs.to_owned(),
console_socket: self.console_socket.as_ref().map(|c| c.as_raw_fd()),
notify_listener,
preserve_fds: self.preserve_fds,
container: self.container.to_owned(),
remap_fds: self.remap_fds.clone(),
user_ns_config: self.user_ns_config.to_owned(),
cgroup_config,
cgroup_config: self.cgroup_config.clone(),
detached: self.detached,
executor: self.executor.clone(),
no_pivot: self.no_pivot,
Expand All @@ -190,7 +175,7 @@
})?;
}

if let Some(container) = &mut self.container {
if let ContainerType::InitContainer { container } = &mut self.container_type {
// update status and pid of the container process
container
.set_status(ContainerStatus::Created)
Expand All @@ -202,47 +187,42 @@

Ok(init_pid)
}
}

fn cleanup_container(&self) -> Result<(), LibcontainerError> {
let linux = self.spec.linux().as_ref().ok_or(MissingSpecError::Linux)?;
let cgroups_path = utils::get_cgroup_path(linux.cgroups_path(), &self.container_id);
let cmanager =
libcgroups::common::create_cgroup_manager(libcgroups::common::CgroupConfig {
cgroup_path: cgroups_path,
systemd_cgroup: self.use_systemd || self.user_ns_config.is_some(),
container_name: self.container_id.to_string(),
})?;

let mut errors = Vec::new();
fn cleanup_container(
cgroup_config: Option<libcgroups::common::CgroupConfig>,
container: &Container,
) -> Result<(), LibcontainerError> {
let mut errors = Vec::new();

if let Some(cc) = cgroup_config {
let cmanager = libcgroups::common::create_cgroup_manager(cc)?;
if let Err(e) = cmanager.remove() {
tracing::error!(error = ?e, "failed to remove cgroup manager");
errors.push(e.to_string());
}
}

if let Some(container) = &self.container {
if let Some(true) = container.clean_up_intel_rdt_subdirectory() {
if let Err(e) = delete_resctrl_subdirectory(container.id()) {
tracing::error!(id = ?container.id(), error = ?e, "failed to delete resctrl subdirectory");
errors.push(e.to_string());
}
}

if container.root.exists() {
if let Err(e) = fs::remove_dir_all(&container.root) {
tracing::error!(container_root = ?container.root, error = ?e, "failed to delete container root");
errors.push(e.to_string());
}
}
if let Some(true) = container.clean_up_intel_rdt_subdirectory() {
if let Err(e) = delete_resctrl_subdirectory(container.id()) {
tracing::error!(id = ?container.id(), error = ?e, "failed to delete resctrl subdirectory");
errors.push(e.to_string());
}
}

if !errors.is_empty() {
return Err(LibcontainerError::Other(format!(
"failed to cleanup container: {}",
errors.join(";")
)));
if container.root.exists() {
if let Err(e) = fs::remove_dir_all(&container.root) {
tracing::error!(container_root = ?container.root, error = ?e, "failed to delete container root");
errors.push(e.to_string());
}
}

Ok(())
if !errors.is_empty() {
return Err(LibcontainerError::Other(format!(
"failed to cleanup container: {}",
errors.join(";")
)));
}

Ok(())
}
Loading
Loading