From f4d31e9a3d3efa3ae4b387e10ee6045e7b8b8fa6 Mon Sep 17 00:00:00 2001 From: Jeffrey Huynh Date: Mon, 13 Nov 2023 12:00:56 -0800 Subject: [PATCH] Remove 30 day staleness limit restriction for integration scanning (#141) --- CHANGELOG.md | 5 + Cargo.lock | 2 +- Cargo.toml | 2 +- docs/reference/faq.md | 4 +- src/api/remote.rs | 2 +- src/api/remote/git/repository.rs | 120 +-------------------- src/ext/command.rs | 23 ++-- testdata/config/fossa-one-http-no-auth.yml | 2 - tests/it/remote_git.rs | 3 +- 9 files changed, 25 insertions(+), 138 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 80aed3d2..e96d70bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## v0.3.1 + +Features: +- Remove 30 day limit restriction for integration scanning + ## v0.3.0 Features: diff --git a/Cargo.lock b/Cargo.lock index c3a03001..6ad26774 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -263,7 +263,7 @@ dependencies = [ [[package]] name = "broker" -version = "0.3.0" +version = "0.3.1" dependencies = [ "aho-corasick 0.7.20", "async-trait", diff --git a/Cargo.toml b/Cargo.toml index 21c876f2..bdfea988 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "broker" -version = "0.3.0" +version = "0.3.1" edition = "2021" description = "The bridge between FOSSA and internal DevOps services" readme = "README.md" diff --git a/docs/reference/faq.md b/docs/reference/faq.md index 67440714..c63f8a35 100644 --- a/docs/reference/faq.md +++ b/docs/reference/faq.md @@ -112,11 +112,11 @@ For `git` integrations, Broker considers the following to be a "reference": - Any tag - Any branch `HEAD` commit -Broker enumerates the list of references that were created in the last 30 days. +Broker first enumerates all references in the git repository. _Note that tags cannot be modified; once a tag has been created to "modify" it in `git` requires that the tag is_ _deleted and then created with the same name. Such modifications are actually creation of the tag,_ -_and as such any tag that was "modified" within the last 30 days is also scanned by Broker._ +_and as such any tag that was "modified" since the last scan is re-canned by Broker._ After enumerating the list of references, Broker then uses its local database to filter any reference that it has already scanned. Note that this means that a modified tag would then be filtered at this step, diff --git a/src/api/remote.rs b/src/api/remote.rs index 93b00820..c073b0d0 100644 --- a/src/api/remote.rs +++ b/src/api/remote.rs @@ -406,7 +406,7 @@ pub trait RemoteProvider { reference: &Self::Reference, ) -> Result>; - /// List references that have been updated in the last 30 days. + /// List all references async fn references(&self) -> Result, Report>; } diff --git a/src/api/remote/git/repository.rs b/src/api/remote/git/repository.rs index ee3e7bc4..47f1c1fc 100644 --- a/src/api/remote/git/repository.rs +++ b/src/api/remote/git/repository.rs @@ -1,7 +1,6 @@ //! Wrapper for Git use base64::{engine::general_purpose, Engine as _}; use error_stack::{bail, report, Report}; -use futures::StreamExt; use itertools::Itertools; use std::env; use std::fs::File; @@ -9,8 +8,6 @@ use std::io::Write; use std::path::{Path, PathBuf}; use tempfile::{tempdir, NamedTempFile, TempDir}; use thiserror::Error; -use time::{ext::NumericalDuration, format_description::well_known::Iso8601, OffsetDateTime}; -use tracing::debug; use super::Reference; use crate::ext::command::{Command, CommandDescriber, Output, OutputProvider, Value}; @@ -72,11 +69,10 @@ impl Error { } } -/// List references that have been updated in the last 30 days. +/// List all references #[tracing::instrument] pub async fn list_references(transport: &Transport) -> Result, Report> { - let references = get_all_references(transport).await?; - references_that_need_scanning(transport, references).await + get_all_references(transport).await } /// Clone a [`Reference`] into a temporary directory. @@ -181,118 +177,6 @@ pub fn pastable_ls_remote_command(transport: &Transport) -> Result -/// git ls-remote --quiet -/// -/// and then parsing the results of git ls-remote - -/// Filter references by looking at the date of their head commit and only including repos -/// that have been updated in the last 30 days -/// To do this we need a cloned repository so that we can run -/// `git log ` -/// in the cloned repo for each branch or tag -#[tracing::instrument(skip_all)] -async fn references_that_need_scanning( - transport: &Transport, - references: Vec, -) -> Result, Report> { - let tmpdir = blobless_clone(transport, None).await?; - - // Filtering over the references async is a little harder than a sync filter, - // since the item being filtered may cross threads. - // Using `filter_map` simplifies this for the most part. - // - // Despite the fact that this is an async stream, it is currently still operating - // serially; to make it parallel use `buffer` or `buffer_unordered`. - let filtered_references = futures::stream::iter(references) - .filter_map(|reference| async { - reference_needs_scanning(transport, &reference, PathBuf::from(tmpdir.path())) - .await - .ok() - .and_then(|needs_scanning| match needs_scanning { - true => Some(reference), - false => None, - }) - }) - .collect() - .await; - - debug!( - "references that have been updated in the last {} days: {:?}", - DAYS_UNTIL_STALE, filtered_references - ); - - Ok(filtered_references) -} - -/// A reference needs scanning if its head commit is less than 30 days old -#[tracing::instrument(skip(transport))] -async fn reference_needs_scanning( - transport: &Transport, - reference: &Reference, - cloned_repo_dir: PathBuf, -) -> Result> { - let reference_string = match reference { - Reference::Branch { name, .. } => format!("origin/{name}"), - Reference::Tag { name, .. } => name.clone(), - }; - - let args = vec![ - "log", - "-n", - "1", - "--format=%aI:::%cI", - reference_string.as_str(), - ] - .into_iter() - .map(Value::new_plain) - .collect_vec(); - - // git log -n 1 --format="%aI:::%cI" - // This will return one line containing the author date and committer date separated by ":::". E.g.: - // The "I" in "aI" and "cI" forces the date to be in strict ISO-8601 format - // - // git log -n 1 --format="%ai:::%ci" parse-config-file - // 2023-02-17 17:14:52 -0800:::2023-02-17 17:14:52 -0800 - // - // The author and committer dates are almost always the same, but we'll parse both and take the most - // recent, just to be super safe - - let output = run_git(transport, &args, Some(&cloned_repo_dir)).await?; - let date_strings = output.stdout_string_lossy(); - let mut dates = date_strings.split(":::"); - let earliest_commit_date_that_needs_to_be_scanned = - OffsetDateTime::checked_sub(OffsetDateTime::now_utc(), DAYS_UNTIL_STALE.days()) - .ok_or_else(|| report!(Error::ParseGitOutput))?; - - let author_date = dates - .next() - .map(|d| OffsetDateTime::parse(d, &Iso8601::DEFAULT)); - if let Some(Ok(author_date)) = author_date { - if author_date > earliest_commit_date_that_needs_to_be_scanned { - return Ok(true); - } - } - - let committer_date = dates - .next() - .map(|d| OffsetDateTime::parse(d, &Iso8601::DEFAULT)); - if let Some(Ok(committer_date)) = committer_date { - if committer_date > earliest_commit_date_that_needs_to_be_scanned { - return Ok(true); - } - } - - Ok(false) -} - /// Do a blobless clone of the repository, checking out the Reference if it exists #[tracing::instrument(skip(transport))] async fn blobless_clone( diff --git a/src/ext/command.rs b/src/ext/command.rs index d9c15894..957d2485 100644 --- a/src/ext/command.rs +++ b/src/ext/command.rs @@ -3,7 +3,6 @@ use std::{ ffi::{OsStr, OsString}, fmt::Display, - ops::Deref, path::PathBuf, process::{ExitStatus, Stdio}, }; @@ -711,15 +710,16 @@ impl CommandDescriber for tokio::process::Command { } } -// Double derefs here because this is implemented on `&T`. // Desugared, `&self` on `&T` means `&&T`. -// We want to deref all the way down to `T`, which means dereferencing twice. +// Dereference using `*self` as opposed to `self.deref()`. +// This is because `self.deref()` causes warnings about double references (`&&T`), +// whereas `*self` does not since it is more explicit. // // Normally derefs are handled automatically, so we'd just write e.g. `self.stdout()`, // but in this case we want to be careful to access the `stdout()` method from the _underlying_ `&T`, // not the `stdout()` method that is currently executing, so we need to deref to make that explicit. // -// You can tell this is the case because if you remove the derefs clippy warns: +// You can tell this is the case because if you remove the deref clippy warns: // // > function cannot return without recursing // @@ -729,7 +729,7 @@ where T: CommandDescriber, { fn describe(&self) -> Description { - self.deref().deref().describe() + (*self).describe() } } @@ -796,15 +796,16 @@ impl OutputProvider for std::process::Output { } } -// Double derefs here because this is implemented on `&T`. // Desugared, `&self` on `&T` means `&&T`. -// We want to deref all the way down to `T`, which means dereferencing twice. +// Dereference using `*self` as opposed to `self.deref()`. +// This is because `self.deref()` causes warnings about double references (`&&T`), +// whereas `*self` does not since it is more explicit. // // Normally derefs are handled automatically, so we'd just write e.g. `self.stdout()`, // but in this case we want to be careful to access the `stdout()` method from the _underlying_ `&T`, // not the `stdout()` method that is currently executing, so we need to deref to make that explicit. // -// You can tell this is the case because if you remove the derefs clippy warns: +// You can tell this is the case because if you remove the deref clippy warns: // // > function cannot return without recursing // @@ -814,15 +815,15 @@ where T: OutputProvider, { fn stdout(&self) -> Vec { - self.deref().deref().stdout() + (*self).stdout() } fn stderr(&self) -> Vec { - self.deref().deref().stderr() + (*self).stderr() } fn status(&self) -> ExitStatus { - self.deref().deref().status() + (*self).status() } } diff --git a/testdata/config/fossa-one-http-no-auth.yml b/testdata/config/fossa-one-http-no-auth.yml index eb99b09b..97cf5986 100644 --- a/testdata/config/fossa-one-http-no-auth.yml +++ b/testdata/config/fossa-one-http-no-auth.yml @@ -12,8 +12,6 @@ integrations: poll_interval: 1h remote: https://github.com/fossas/one.git import_branches: true - watched_branches: - - main auth: type: none transport: http diff --git a/tests/it/remote_git.rs b/tests/it/remote_git.rs index 919394bc..b87158b5 100644 --- a/tests/it/remote_git.rs +++ b/tests/it/remote_git.rs @@ -23,8 +23,7 @@ async fn references_on_public_repo_with_no_auth() { .references() .await .expect("no results returned from get_references_that_need_scanning on a public repo!"); - let expected_empty_vec: Vec = Vec::new(); - assert_eq!(expected_empty_vec, references); + assert!(!references.is_empty()); } #[tokio::test]