Skip to content

Commit

Permalink
Remove 30 day staleness limit restriction for integration scanning (#141
Browse files Browse the repository at this point in the history
)
  • Loading branch information
JeffreyHuynh1 authored Nov 13, 2023
1 parent d7b61a5 commit f4d31e9
Show file tree
Hide file tree
Showing 9 changed files with 25 additions and 138 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## v0.3.1

Features:
- Remove 30 day limit restriction for integration scanning

## v0.3.0

Features:
Expand Down
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "broker"
version = "0.3.0"
version = "0.3.1"
edition = "2021"
description = "The bridge between FOSSA and internal DevOps services"
readme = "README.md"
Expand Down
4 changes: 2 additions & 2 deletions docs/reference/faq.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,11 @@ For `git` integrations, Broker considers the following to be a "reference":
- Any tag
- Any branch `HEAD` commit

Broker enumerates the list of references that were created in the last 30 days.
Broker first enumerates all references in the git repository.

_Note that tags cannot be modified; once a tag has been created to "modify" it in `git` requires that the tag is_
_deleted and then created with the same name. Such modifications are actually creation of the tag,_
_and as such any tag that was "modified" within the last 30 days is also scanned by Broker._
_and as such any tag that was "modified" since the last scan is re-canned by Broker._

After enumerating the list of references, Broker then uses its local database to filter any reference that it has already scanned.
Note that this means that a modified tag would then be filtered at this step,
Expand Down
2 changes: 1 addition & 1 deletion src/api/remote.rs
Original file line number Diff line number Diff line change
Expand Up @@ -406,7 +406,7 @@ pub trait RemoteProvider {
reference: &Self::Reference,
) -> Result<TempDir, Report<RemoteProviderError>>;

/// List references that have been updated in the last 30 days.
/// List all references
async fn references(&self) -> Result<Vec<Self::Reference>, Report<RemoteProviderError>>;
}

Expand Down
120 changes: 2 additions & 118 deletions src/api/remote/git/repository.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,13 @@
//! Wrapper for Git
use base64::{engine::general_purpose, Engine as _};
use error_stack::{bail, report, Report};
use futures::StreamExt;
use itertools::Itertools;
use std::env;
use std::fs::File;
use std::io::Write;
use std::path::{Path, PathBuf};
use tempfile::{tempdir, NamedTempFile, TempDir};
use thiserror::Error;
use time::{ext::NumericalDuration, format_description::well_known::Iso8601, OffsetDateTime};
use tracing::debug;

use super::Reference;
use crate::ext::command::{Command, CommandDescriber, Output, OutputProvider, Value};
Expand Down Expand Up @@ -72,11 +69,10 @@ impl Error {
}
}

/// List references that have been updated in the last 30 days.
/// List all references
#[tracing::instrument]
pub async fn list_references(transport: &Transport) -> Result<Vec<Reference>, Report<Error>> {
let references = get_all_references(transport).await?;
references_that_need_scanning(transport, references).await
get_all_references(transport).await
}

/// Clone a [`Reference`] into a temporary directory.
Expand Down Expand Up @@ -181,118 +177,6 @@ pub fn pastable_ls_remote_command(transport: &Transport) -> Result<String, Repor
pastable_git_command(transport, &ls_remote_args(transport), None)
}

// Days until a commit is considered stale and will not be scanned
const DAYS_UNTIL_STALE: i64 = 30;

/// Get a list of all branches and tags for the given integration
/// This is done by doing this:
///
/// git init
/// git remote add origin <URL to git repo>
/// git ls-remote --quiet
///
/// and then parsing the results of git ls-remote
/// Filter references by looking at the date of their head commit and only including repos
/// that have been updated in the last 30 days
/// To do this we need a cloned repository so that we can run
/// `git log <some format string that includes that date of the commit> <branch_or_tag_name>`
/// in the cloned repo for each branch or tag
#[tracing::instrument(skip_all)]
async fn references_that_need_scanning(
transport: &Transport,
references: Vec<Reference>,
) -> Result<Vec<Reference>, Report<Error>> {
let tmpdir = blobless_clone(transport, None).await?;

// Filtering over the references async is a little harder than a sync filter,
// since the item being filtered may cross threads.
// Using `filter_map` simplifies this for the most part.
//
// Despite the fact that this is an async stream, it is currently still operating
// serially; to make it parallel use `buffer` or `buffer_unordered`.
let filtered_references = futures::stream::iter(references)
.filter_map(|reference| async {
reference_needs_scanning(transport, &reference, PathBuf::from(tmpdir.path()))
.await
.ok()
.and_then(|needs_scanning| match needs_scanning {
true => Some(reference),
false => None,
})
})
.collect()
.await;

debug!(
"references that have been updated in the last {} days: {:?}",
DAYS_UNTIL_STALE, filtered_references
);

Ok(filtered_references)
}

/// A reference needs scanning if its head commit is less than 30 days old
#[tracing::instrument(skip(transport))]
async fn reference_needs_scanning(
transport: &Transport,
reference: &Reference,
cloned_repo_dir: PathBuf,
) -> Result<bool, Report<Error>> {
let reference_string = match reference {
Reference::Branch { name, .. } => format!("origin/{name}"),
Reference::Tag { name, .. } => name.clone(),
};

let args = vec![
"log",
"-n",
"1",
"--format=%aI:::%cI",
reference_string.as_str(),
]
.into_iter()
.map(Value::new_plain)
.collect_vec();

// git log -n 1 --format="%aI:::%cI" <name of tag or branch>
// This will return one line containing the author date and committer date separated by ":::". E.g.:
// The "I" in "aI" and "cI" forces the date to be in strict ISO-8601 format
//
// git log -n 1 --format="%ai:::%ci" parse-config-file
// 2023-02-17 17:14:52 -0800:::2023-02-17 17:14:52 -0800
//
// The author and committer dates are almost always the same, but we'll parse both and take the most
// recent, just to be super safe

let output = run_git(transport, &args, Some(&cloned_repo_dir)).await?;
let date_strings = output.stdout_string_lossy();
let mut dates = date_strings.split(":::");
let earliest_commit_date_that_needs_to_be_scanned =
OffsetDateTime::checked_sub(OffsetDateTime::now_utc(), DAYS_UNTIL_STALE.days())
.ok_or_else(|| report!(Error::ParseGitOutput))?;

let author_date = dates
.next()
.map(|d| OffsetDateTime::parse(d, &Iso8601::DEFAULT));
if let Some(Ok(author_date)) = author_date {
if author_date > earliest_commit_date_that_needs_to_be_scanned {
return Ok(true);
}
}

let committer_date = dates
.next()
.map(|d| OffsetDateTime::parse(d, &Iso8601::DEFAULT));
if let Some(Ok(committer_date)) = committer_date {
if committer_date > earliest_commit_date_that_needs_to_be_scanned {
return Ok(true);
}
}

Ok(false)
}

/// Do a blobless clone of the repository, checking out the Reference if it exists
#[tracing::instrument(skip(transport))]
async fn blobless_clone(
Expand Down
23 changes: 12 additions & 11 deletions src/ext/command.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
use std::{
ffi::{OsStr, OsString},
fmt::Display,
ops::Deref,
path::PathBuf,
process::{ExitStatus, Stdio},
};
Expand Down Expand Up @@ -711,15 +710,16 @@ impl CommandDescriber for tokio::process::Command {
}
}

// Double derefs here because this is implemented on `&T`.
// Desugared, `&self` on `&T` means `&&T`.
// We want to deref all the way down to `T`, which means dereferencing twice.
// Dereference using `*self` as opposed to `self.deref()`.
// This is because `self.deref()` causes warnings about double references (`&&T`),
// whereas `*self` does not since it is more explicit.
//
// Normally derefs are handled automatically, so we'd just write e.g. `self.stdout()`,
// but in this case we want to be careful to access the `stdout()` method from the _underlying_ `&T`,
// not the `stdout()` method that is currently executing, so we need to deref to make that explicit.
//
// You can tell this is the case because if you remove the derefs clippy warns:
// You can tell this is the case because if you remove the deref clippy warns:
//
// > function cannot return without recursing
//
Expand All @@ -729,7 +729,7 @@ where
T: CommandDescriber,
{
fn describe(&self) -> Description {
self.deref().deref().describe()
(*self).describe()
}
}

Expand Down Expand Up @@ -796,15 +796,16 @@ impl OutputProvider for std::process::Output {
}
}

// Double derefs here because this is implemented on `&T`.
// Desugared, `&self` on `&T` means `&&T`.
// We want to deref all the way down to `T`, which means dereferencing twice.
// Dereference using `*self` as opposed to `self.deref()`.
// This is because `self.deref()` causes warnings about double references (`&&T`),
// whereas `*self` does not since it is more explicit.
//
// Normally derefs are handled automatically, so we'd just write e.g. `self.stdout()`,
// but in this case we want to be careful to access the `stdout()` method from the _underlying_ `&T`,
// not the `stdout()` method that is currently executing, so we need to deref to make that explicit.
//
// You can tell this is the case because if you remove the derefs clippy warns:
// You can tell this is the case because if you remove the deref clippy warns:
//
// > function cannot return without recursing
//
Expand All @@ -814,15 +815,15 @@ where
T: OutputProvider,
{
fn stdout(&self) -> Vec<u8> {
self.deref().deref().stdout()
(*self).stdout()
}

fn stderr(&self) -> Vec<u8> {
self.deref().deref().stderr()
(*self).stderr()
}

fn status(&self) -> ExitStatus {
self.deref().deref().status()
(*self).status()
}
}

Expand Down
2 changes: 0 additions & 2 deletions testdata/config/fossa-one-http-no-auth.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@ integrations:
poll_interval: 1h
remote: https://github.com/fossas/one.git
import_branches: true
watched_branches:
- main
auth:
type: none
transport: http
3 changes: 1 addition & 2 deletions tests/it/remote_git.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ async fn references_on_public_repo_with_no_auth() {
.references()
.await
.expect("no results returned from get_references_that_need_scanning on a public repo!");
let expected_empty_vec: Vec<Reference> = Vec::new();
assert_eq!(expected_empty_vec, references);
assert!(!references.is_empty());
}

#[tokio::test]
Expand Down

0 comments on commit f4d31e9

Please sign in to comment.