diff --git a/README.md b/README.md index 8037f21..e7e6c16 100644 --- a/README.md +++ b/README.md @@ -36,8 +36,12 @@ powershell -c "irm https://github.com/fossas/circe/releases/latest/download/circ > [!TIP] > Check the help output for more details. +## extract + +Extracts the contents of the image to disk. + ```shell -# Export the contents of the image to disk. +# Extracts the contents of the image to disk. # # Usage: # circe extract [--layers ] [--platform ] [--overwrite] @@ -58,6 +62,18 @@ powershell -c "irm https://github.com/fossas/circe/releases/latest/download/circ # Accepts the same values as `docker` (e.g. `linux/amd64`, `darwin/arm64`, etc). # --overwrite # If the target directory already exists, overwrite it. +# --layer-glob, --lg +# A glob pattern to filter layers to extract. +# Layers matching this pattern are extracted. +# --layer-regex, --lr +# A regex pattern to filter layers to extract. +# Layers matching this pattern are extracted. +# --file-glob, --fg +# A glob pattern to filter files to extract. +# Files matching this pattern are extracted. +# --file-regex, --fr +# A regex pattern to filter files to extract. +# Files matching this pattern are extracted. # --username # The username to use for authentication; "password" is also required if provided. # --password @@ -65,6 +81,31 @@ powershell -c "irm https://github.com/fossas/circe/releases/latest/download/circ circe extract docker.io/contribsys/faktory:latest ./faktory --layers squash --platform linux/amd64 ``` +## list + +Lists the contents of an image. + +```shell +# Lists the contents of the image. +# +# Usage: +# circe list [--platform ] [--username ] [--password ] +# +# Arguments: +# +# The image to list. +# +# Options for `circe list`: +# --platform +# Defaults to your current platform. +# Accepts the same values as `docker` (e.g. `linux/amd64`, `darwin/arm64`, etc). +# --username +# The username to use for authentication; "password" is also required if provided. 
+# --password +# The password to use for authentication; "username" is also required if provided. +circe list docker.io/contribsys/faktory:latest +``` + ## platform selection You can customize the platform used by `circe` by passing `--platform`. diff --git a/bin/Cargo.toml b/bin/Cargo.toml index 406d637..e09a9e8 100644 --- a/bin/Cargo.toml +++ b/bin/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "circe" -version = "0.2.0" +version = "0.3.0" edition = "2021" authors = ["Jessica Black ", "FOSSA Inc. "] description = "Extracts and examines the contents of containers" @@ -24,3 +24,4 @@ tracing-tree = { version = "0.4.0" } circe_lib = { path = "../lib" } serde_json = "1.0.133" derive_more = { version = "1.0.0", features = ["debug"] } +pluralizer = "0.4.0" diff --git a/bin/src/extract.rs b/bin/src/extract.rs index 4d0b788..f8489f9 100644 --- a/bin/src/extract.rs +++ b/bin/src/extract.rs @@ -1,7 +1,7 @@ use circe_lib::{ registry::Registry, Authentication, Filters, LayerDescriptor, Platform, Reference, }; -use clap::{Parser, ValueEnum}; +use clap::{Args, Parser, ValueEnum}; use color_eyre::eyre::{bail, Context, Result}; use derive_more::Debug; use std::{path::PathBuf, str::FromStr}; @@ -9,9 +9,9 @@ use tracing::{debug, info}; #[derive(Debug, Parser)] pub struct Options { - /// Image reference being extracted (e.g. docker.io/library/ubuntu:latest) - #[arg(value_parser = Reference::from_str)] - image: Reference, + /// Target to extract + #[clap(flatten)] + target: Target, /// Directory to which the extracted contents will be written #[arg(default_value = ".")] @@ -21,21 +21,6 @@ pub struct Options { #[arg(long, short)] overwrite: bool, - /// Platform to extract (e.g. linux/amd64) - /// - /// If the image is not multi-platform, this is ignored. - /// If the image is multi-platform, this is used to select the platform to extract. 
- /// - /// If the image is multi-platform and this argument is not provided, - /// the platform is chosen according to the following priority list: - /// 1. The first platform-independent image - /// 2. The current platform (if available) - /// 3. The `linux` platform for the current architecture - /// 4. The `linux` platform for the `amd64` architecture - /// 5. The first platform in the image manifest - #[arg(long, value_parser = Platform::from_str, verbatim_doc_comment)] - platform: Option, - /// How to handle layers during extraction #[arg(long, default_value = "squash")] layers: Mode, @@ -88,15 +73,38 @@ pub struct Options { /// If filters are provided, only files whose path matches any filter are extracted. #[arg(long, alias = "fr")] file_regex: Option>, +} + +/// Shared options for any command that needs to work with the OCI registry for a given image. +#[derive(Debug, Args)] +pub struct Target { + /// Image reference being extracted (e.g. docker.io/library/ubuntu:latest) + #[arg(value_parser = Reference::from_str)] + pub image: Reference, + + /// Platform to extract (e.g. linux/amd64) + /// + /// If the image is not multi-platform, this is ignored. + /// If the image is multi-platform, this is used to select the platform to extract. + /// + /// If the image is multi-platform and this argument is not provided, + /// the platform is chosen according to the following priority list: + /// 1. The first platform-independent image + /// 2. The current platform (if available) + /// 3. The `linux` platform for the current architecture + /// 4. The `linux` platform for the `amd64` architecture + /// 5. 
The first platform in the image manifest + #[arg(long, value_parser = Platform::from_str, verbatim_doc_comment)] + pub platform: Option, /// The username to use for authenticating to the registry #[arg(long, requires = "password")] - username: Option, + pub username: Option, /// The password to use for authenticating to the registry #[arg(long, requires = "username")] #[debug(skip)] - password: Option, + pub password: Option, } #[derive(Copy, Clone, Debug, Default, ValueEnum)] @@ -120,7 +128,7 @@ pub enum Mode { pub async fn main(opts: Options) -> Result<()> { info!("extracting image"); - let auth = match (opts.username, opts.password) { + let auth = match (opts.target.username, opts.target.password) { (Some(username), Some(password)) => Authentication::basic(username, password), _ => Authentication::default(), }; @@ -132,8 +140,8 @@ pub async fn main(opts: Options) -> Result<()> { let output = canonicalize_output_dir(&opts.output_dir, opts.overwrite)?; let registry = Registry::builder() - .maybe_platform(opts.platform) - .reference(opts.image) + .maybe_platform(opts.target.platform) + .reference(opts.target.image) .auth(auth) .layer_filters(layer_globs + layer_regexes) .file_filters(file_globs + file_regexes) diff --git a/bin/src/list.rs b/bin/src/list.rs new file mode 100644 index 0000000..d488c9e --- /dev/null +++ b/bin/src/list.rs @@ -0,0 +1,54 @@ +use circe_lib::{registry::Registry, Authentication}; +use clap::Parser; +use color_eyre::eyre::{Context, Result}; +use derive_more::Debug; +use pluralizer::pluralize; +use std::collections::HashMap; +use tracing::{debug, info}; + +use crate::extract::Target; + +#[derive(Debug, Parser)] +pub struct Options { + /// Target to list + #[clap(flatten)] + target: Target, +} + +#[tracing::instrument] +pub async fn main(opts: Options) -> Result<()> { + info!("listing image"); + + let auth = match (opts.target.username, opts.target.password) { + (Some(username), Some(password)) => Authentication::basic(username, password), 
+ _ => Authentication::default(), + }; + let registry = Registry::builder() + .maybe_platform(opts.target.platform) + .reference(opts.target.image) + .auth(auth) + .build() + .await + .context("configure remote registry")?; + + let layers = registry.layers().await.context("list layers")?; + let count = layers.len(); + info!("enumerated {}", pluralize("layer", count as isize, true)); + + let mut listing = HashMap::new(); + for (descriptor, layer) in layers.into_iter().zip(1usize..) { + info!(layer = %descriptor, %layer, "reading layer"); + let files = registry + .list_files(&descriptor) + .await + .context("list files")?; + + debug!(layer = %descriptor, files = %files.len(), "listed files"); + listing.insert(descriptor.digest.to_string(), files); + } + + let rendered = serde_json::to_string_pretty(&listing).context("render listing")?; + println!("{rendered}"); + + Ok(()) +} diff --git a/bin/src/main.rs b/bin/src/main.rs index bf8ead8..96bd633 100644 --- a/bin/src/main.rs +++ b/bin/src/main.rs @@ -4,7 +4,7 @@ use tracing::level_filters::LevelFilter; use tracing_subscriber::{self, prelude::*}; mod extract; - +mod list; #[derive(Debug, Parser)] #[command(author, version, about)] struct Cli { @@ -16,6 +16,9 @@ struct Cli { enum Commands { /// Extract OCI image to a directory Extract(extract::Options), + + /// Enumerate the layers and files in an OCI image + List(list::Options), } #[tokio::main] @@ -46,6 +49,7 @@ async fn main() -> Result<()> { let cli = Cli::parse(); match cli.command { Commands::Extract(opts) => extract::main(opts).await?, + Commands::List(opts) => list::main(opts).await?, } Ok(()) diff --git a/lib/Cargo.toml b/lib/Cargo.toml index e353432..6ff24bc 100644 --- a/lib/Cargo.toml +++ b/lib/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "circe_lib" -version = "0.2.0" +version = "0.3.0" edition = "2021" authors = ["Jessica Black ", "FOSSA Inc. 
"] description = "Extracts and examines the contents of containers" diff --git a/lib/src/registry.rs b/lib/src/registry.rs index af2f3dd..2a6fe1f 100644 --- a/lib/src/registry.rs +++ b/lib/src/registry.rs @@ -27,6 +27,22 @@ use crate::{ LayerMediaTypeFlag, Platform, Reference, Version, }; +/// Unwrap a value, logging a warning and performing the provided action if it fails. +macro_rules! unwrap_warn { + ($expr:expr, $action:expr) => { + unwrap_warn!($expr, $action,) + }; + ($expr:expr, $action:expr, $($msg:tt)*) => { + match $expr { + Ok(value) => value, + Err(e) => { + tracing::warn!(error = ?e, $($msg)*); + $action; + } + } + }; +} + /// Each instance is a unique view of remote registry for a specific [`Platform`] and [`Reference`]. /// The intention here is to better support chained methods like "pull list of layers" and then "apply each layer to disk". // Note: internal fields aren't public because we don't want the caller to be able to mutate the internal state between method calls. @@ -149,6 +165,55 @@ impl Registry { .map(|layer| layer.stream) } + /// Enumerate files in a layer. + #[tracing::instrument] + pub async fn list_files(&self, layer: &LayerDescriptor) -> Result> { + let stream = self.pull_layer_internal(layer).await?; + + // Enumerating the layer's files requires interpreting the layer's media type. + match &layer.media_type { + // Standard OCI layers. + LayerMediaType::Oci(flags) => { + // Foreign layers are skipped, as they would be if you ran `docker pull`. + // This causes an extra iteration over the flags for layers that aren't foreign, + // but the flag count is small and this saves us the complexity of setting up layer transforms + // and then discarding them if this flag is encountered. + if flags.contains(&LayerMediaTypeFlag::Foreign) { + warn!("skip: foreign layer"); + return Ok(Vec::new()); + } + + // The vast majority of the time (maybe even all the time), the layer only has zero or one flags. 
+ // Meanwhile, `transform::sequence` forces the streams into dynamic dispatch, imposing extra overhead. + // This match allows us to specialize the stream based on the most common cases, + // while still supporting arbitrary flags. + match flags.as_slice() { + // No flags; this means the layer is uncompressed. + [] => enumerate_tarball(stream).await, + + // The layer is compressed with zstd. + [LayerMediaTypeFlag::Zstd] => { + let stream = transform::zstd(stream); + enumerate_tarball(stream).await + } + + // The layer is compressed with gzip. + [LayerMediaTypeFlag::Gzip] => { + let stream = transform::gzip(stream); + enumerate_tarball(stream).await + } + + // The layer has a more complicated set of flags. + // For this, we fall back to the generic sequence operator. + _ => { + let stream = transform::sequence(stream, flags); + enumerate_tarball(stream).await + } + } + } + } + } + /// Apply a layer to a location on disk. /// /// The intention of this method is that when it is run for each layer in an image in order it is equivalent @@ -189,7 +254,7 @@ impl Registry { // A future improvement would be to support downloading layers concurrently, // then still applying them serially. Since network transfer is the slowest part of this process, // this would speed up the overall process. - // #[tracing::instrument] + #[tracing::instrument] pub async fn apply_layer(&self, layer: &LayerDescriptor, output: &Path) -> Result<()> { let stream = self.pull_layer_internal(layer).await?; @@ -238,6 +303,7 @@ impl Registry { } } +/// Apply files in the tarball to a location on disk. async fn apply_tarball( path_filters: &Filters, stream: impl Stream + Unpin, @@ -247,22 +313,6 @@ async fn apply_tarball( let mut archive = Archive::new(reader); let mut entries = archive.entries().context("read entries from tar")?; - /// Unwrap a value, logging an error and continuing the loop if it fails. - macro_rules! 
unwrap_warn { - ($expr:expr) => { - unwrap_warn!($expr,); - }; - ($expr:expr, $($msg:tt)*) => { - match $expr { - Ok(value) => value, - Err(e) => { - tracing::warn!(error = ?e, $($msg)*); - continue; - } - } - }; - } - // Future improvement: the OCI spec guarantees that paths will not repeat within the same layer, // so we could concurrently read files and apply them to disk. // The overall archive is streaming so we'd need to buffer the entries, @@ -271,8 +321,8 @@ async fn apply_tarball( // without buffering- maybe we could read the tar entries while streaming to disk, // and then divide them among workers that apply them to disk concurrently? while let Some(entry) = entries.next().await { - let mut entry = unwrap_warn!(entry, "read entry"); - let path = unwrap_warn!(entry.path(), "read entry path"); + let mut entry = unwrap_warn!(entry, continue, "read entry"); + let path = unwrap_warn!(entry.path(), continue, "read entry path"); // Paths inside the container are relative to the root of the container; // we need to convert them to be relative to the output directory. @@ -285,7 +335,11 @@ async fn apply_tarball( // Whiteout files delete the file from the filesystem. if let Some(path) = is_whiteout(&path) { - unwrap_warn!(tokio::fs::remove_file(&path).await, "whiteout: {path:?}"); + unwrap_warn!( + tokio::fs::remove_file(&path).await, + continue, + "whiteout: {path:?}" + ); debug!(?path, "whiteout"); continue; } @@ -299,7 +353,7 @@ async fn apply_tarball( // Otherwise, apply the file as normal. // Both _new_ and _changed_ files are handled the same way: // the layer contains the entire file content, so we just overwrite the file. - if !unwrap_warn!(entry.unpack_in(output).await, "unpack {path:?}") { + if !unwrap_warn!(entry.unpack_in(output).await, continue, "unpack {path:?}") { warn!(?path, "skip: tried to write outside of output directory"); continue; } @@ -310,6 +364,23 @@ async fn apply_tarball( Ok(()) } +/// Enumerate files in a tarball. 
+async fn enumerate_tarball(stream: impl Stream + Unpin) -> Result> { + let reader = StreamReader::new(stream); + let mut archive = Archive::new(reader); + let mut entries = archive.entries().context("read entries from tar")?; + + let mut files = Vec::new(); + while let Some(entry) = entries.next().await { + let entry = unwrap_warn!(entry, continue, "read entry"); + let path = unwrap_warn!(entry.path(), continue, "read entry path"); + debug!(?path, "enumerate"); + files.push(path.to_string_lossy().to_string()); + } + + Ok(files) +} + impl From<&Reference> for OciReference { fn from(reference: &Reference) -> Self { match &reference.version { diff --git a/lib/tests/it/registry.rs b/lib/tests/it/registry.rs index 395a7d7..ffed4a5 100644 --- a/lib/tests/it/registry.rs +++ b/lib/tests/it/registry.rs @@ -96,3 +96,28 @@ async fn pull_layer_filtered( ); Ok(()) } + +#[test_case("cgr.dev/chainguard/wolfi-base:latest", Some(Platform::linux_amd64()); "cgr.dev/chainguard/wolfi-base:latest.linux_amd64")] +#[test_case("cgr.dev/chainguard/wolfi-base:latest", Some(Platform::linux_arm64()); "cgr.dev/chainguard/wolfi-base:latest.linux_arm64")] +#[test_case("cgr.dev/chainguard/wolfi-base:latest", None; "cgr.dev/chainguard/wolfi-base:latest_default")] +#[test_case("docker.io/library/ubuntu:latest", None; "docker.io/library/ubuntu:latest_default")] +#[test_case("docker.io/library/alpine:latest", None; "docker.io/library/alpine:latest_default")] +#[test_log::test(tokio::test)] +async fn list_layer(image: &str, platform: Option) -> Result<()> { + let reference = image.parse::()?; + let registry = Registry::builder() + .maybe_platform(platform) + .reference(reference) + .build() + .await?; + + let layers = registry.layers().await?; + assert!(!layers.is_empty(), "image should have at least one layer"); + + for layer in layers { + let files = registry.list_files(&layer).await?; + assert!(!files.is_empty(), "layer should have at least one file"); + } + + Ok(()) +}