Skip to content

Commit

Permalink
Extract from foundation-libs
Browse files Browse the repository at this point in the history
  • Loading branch information
jssblck committed Jun 3, 2024
1 parent 874edaf commit 9ccea79
Show file tree
Hide file tree
Showing 13 changed files with 960 additions and 46 deletions.
1 change: 1 addition & 0 deletions .github/CODEOWNERS
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
* @fossas/analysis
2 changes: 1 addition & 1 deletion .github/workflows/check-dynamic.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:
setup: echo "no setup"
build: cargo build
- host: macos-latest
setup: rustup target add aarch64-apple-darwin
setup: rustup target add aarch64-apple-darwin && rustup target add x86_64-apple-darwin
build: cargo build --target aarch64-apple-darwin && cargo build --target x86_64-apple-darwin

runs-on: ${{ matrix.settings.host }}
Expand Down
15 changes: 11 additions & 4 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
[package]
name = "template-rust"
version = "0.1.0"
name = "fingerprint"
version = "1.0.1"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
getset = "0.1.2"
hex = "0.4.3"
iter-read = "0.3.1"
serde = { version = "1.0.140", features = ["derive"] }
thiserror = "1.0.31"
sha2 = "0.10.6"

[dev-dependencies]
typed-builder = "0.10.0"
30 changes: 1 addition & 29 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,29 +1 @@
# template-rust

Template repository for a Rust project.

TODOs for a new project:
- [ ] Change the license if MPL2 is not appropriate for the project. Make sure to do this before adding any code.
- [ ] Ensure the dev docs (in particular the release and compatibility semantics) are valid for this project.
- [ ] Set [CODEOWNERS] to the team that owns the repository.
- [ ] Create an API user in [FOSSA] and store it as a secret named `FOSSA_API_KEY`.
- Consider naming it with the pattern `ci-{REPO_NAME}`. For example, `ci-template-rust`.
- [ ] Update repository permissions as appropriate. Generally, the CODEOWNER team is set as admin.
- [ ] Update branch protection rules as appropriate.
- [ ] Update repository features and settings. Recommended defaults:
- [ ] Turn off all features (Wikis, Issues, Sponsorships, Discussions, Projects); FOSSA uses other systems for these.
- [ ] Only allow squash merging.
- [ ] Always suggest updating PR branches.
- [ ] Allow auto-merge.
- [ ] Automatically delete head branches.

Then just edit the included Rust project, or remove it and `cargo init` your project, and get going!

[codeowners]: https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners
[fossa]: https://app.fossa.com

# recommendations

- If publishing a Linux binary, consider providing two: one that [statically links libc](./docs/dev/reference/static-binary.md), and one that doesn't.
- If publishing a macOS binary, consider providing two: one for [Intel and one for M-series CPUs](./docs/dev/reference/macos-arch.md).
- If this application may be used on AWS Graviton or similar, consider providing an ARM build for Linux as well.
# lib-fingerprint
230 changes: 230 additions & 0 deletions src/fingerprint.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
use std::io::{self, BufRead, BufReader, Cursor, Read, Write};

use iter_read::IterRead;
use sha2::{Digest, Sha256};

use crate::{stream::ConvertCRLFToLF, CommentStrippedSHA256, Error, Fingerprint, RawSHA256};

/// Computes the [`RawSHA256`] fingerprint of `stream`.
///
/// The start of the stream is sniffed to classify the content as binary or text:
/// - binary content is hashed exactly as read;
/// - text content is hashed through the text path, which normalizes `\r\n` to `\n`.
///
/// # Errors
///
/// Propagates any error raised while reading the stream, hashing it,
/// or turning the finished digest into a fingerprint.
pub fn raw<R: BufRead>(stream: &mut R) -> Result<Fingerprint<RawSHA256>, Error> {
    // Sniffing consumes the head of the stream; keep those bytes so they are hashed as well.
    let BinaryCheck {
        read: head,
        is_binary,
    } = content_is_binary(stream)?;

    // Replay the sniffed head in front of whatever has not been read yet.
    let mut rejoined = Cursor::new(head).chain(stream);
    let mut hasher = Sha256::new();
    let hashed = if is_binary {
        content_binary(&mut rejoined, &mut hasher)
    } else {
        content_text(&mut rejoined, &mut hasher)
    };
    hashed?;

    Fingerprint::from_digest(hasher)
}

/// Computes the [`CommentStrippedSHA256`] fingerprint of `stream`.
///
/// Returns `Ok(None)` when the content cannot be comment stripped because it looks binary:
/// either the sniffed start of the stream contains a zero byte,
/// or stripping fails with an error whose message mentions UTF-8.
///
/// # Errors
///
/// Propagates every other error raised while reading the stream, stripping it,
/// or turning the finished digest into a fingerprint.
pub fn comment_stripped<R: BufRead>(
    stream: &mut R,
) -> Result<Option<Fingerprint<CommentStrippedSHA256>>, Error> {
    // Sniffing consumes the head of the stream; keep those bytes so they are processed as well.
    let BinaryCheck {
        read: head,
        is_binary,
    } = content_is_binary(stream)?;
    if is_binary {
        return Ok(None);
    }

    // Replay the sniffed head in front of whatever has not been read yet.
    let mut rejoined = Cursor::new(head).chain(stream);
    let mut hasher = Sha256::new();
    if let Err(err) = content_stripped(&mut rejoined, &mut hasher) {
        // The underlying `io::Error` is opaque, so the rendered message is inspected instead.
        // A UTF-8 failure means the content was binary after all, which is not an error here.
        let invalid_text = err.to_string().to_lowercase().contains("utf-8");
        return if invalid_text { Ok(None) } else { Err(err) };
    }

    Fingerprint::from_digest(hasher).map(Some)
}

/// The result of checking a stream for whether it is binary.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct BinaryCheck {
    /// The bytes that were consumed from the stream while performing the check.
    /// Callers that still need the full content must replay these ahead of the rest of the stream.
    pub(crate) read: Vec<u8>,
    /// Whether a zero byte was found among the bytes in `read`.
    pub(crate) is_binary: bool,
}

/// How many leading bytes of a stream are inspected when deciding whether it is binary.
const BINARY_SNIFF_LEN: u64 = 8000;

/// Inspect the start of the stream to determine if it is binary.
///
/// Uses the same method as git: "is there a zero byte in the first 8000 bytes of the file".
///
/// At most [`BINARY_SNIFF_LEN`] bytes are consumed from `stream`; they are handed back in
/// [`BinaryCheck::read`], and anything beyond them is left unread.
///
/// # Errors
///
/// Returns the underlying error if reading from `stream` fails.
pub(crate) fn content_is_binary<R: Read>(stream: &mut R) -> Result<BinaryCheck, io::Error> {
    let mut read = Vec::new();
    stream.take(BINARY_SNIFF_LEN).read_to_end(&mut read)?;
    let is_binary = read.contains(&0);
    Ok(BinaryCheck { read, is_binary })
}

/// Copies everything remaining in `stream` into `w` without modifying a single byte.
///
/// # Errors
///
/// Returns an error if reading from `stream` or writing to `w` fails.
pub(crate) fn content_binary(stream: &mut impl BufRead, w: &mut impl Write) -> Result<(), Error> {
    // Only success or failure matters to callers; the copied byte count is discarded.
    let _copied = io::copy(stream, w)?;
    Ok(())
}

/// Copies text content from `stream` into `w` in a platform independent manner.
///
/// Specifically:
/// - No text encoding is assumed; the content is processed as raw bytes.
/// - `git` on Windows typically checks files out with `\r\n` line endings,
///   while *nix checks them out with `\n`.
///   So that both produce the same output, every `\r\n` byte sequence is replaced by a single `\n`.
///
/// # Errors
///
/// Returns an error if reading from `stream` or writing to `w` fails.
pub(crate) fn content_text(stream: &mut impl BufRead, w: &mut impl Write) -> Result<(), Error> {
    // Walk the input one byte at a time so the CRLF adapter can rewrite line endings on the fly.
    let raw_bytes = BufReader::new(stream).bytes();
    let normalized = raw_bytes.crlf_to_lf().fuse();

    // Present the byte iterator as a reader again so it can be copied into the writer.
    let mut reader = IterRead::new(normalized);
    io::copy(&mut reader, w)?;
    Ok(())
}

/// Writes the content of `stream` to `w` with C-style comments and blank lines removed,
/// in a platform independent manner.
///
/// Specifically:
/// - The content is read line by line as UTF-8; invalid UTF-8 surfaces as a read error.
/// - `git` on Windows typically checks files out with `\r\n` line endings,
///   while *nix checks them out with `\n`.
///   Lines are split on either form and always written back out with `\n`.
/// - C-style comments are removed:
///   - `//` starts a single line comment; it and everything after it on the line is removed.
///   - `/*` starts a multi line comment; it and everything up to and including the next `*/` is removed,
///     across lines if necessary.
///   - No attempt is made to detect escaped comments or comment markers inside string literals.
/// - Each remaining line has its leading and trailing whitespace trimmed.
/// - Lines that are empty after cleaning are dropped, so no two `\n` bytes are ever written in a row.
/// - A line is only written out once a further input line has been read,
///   which is what keeps a trailing `\n` off the last line.
///   Note that the `\n` is emitted as soon as *any* further input line exists, even one that cleans
///   to nothing: input that ends in blank or comment-only lines produces output ending in one `\n`.
///
/// # Errors
///
/// Returns an error if reading from `stream` or writing to `w` fails.
pub(crate) fn content_stripped(stream: &mut impl BufRead, w: &mut impl Write) -> Result<(), Error> {
    // The most recent non-empty cleaned line, held back until we know whether more input follows.
    let mut pending = String::new();
    // Whether a `/*` opened on an earlier line is still waiting for its `*/`.
    let mut in_block_comment = false;

    for next in stream.lines() {
        let raw = next?;

        // More input arrived, so the held-back line is not the last one: flush it with a newline.
        if !pending.is_empty() {
            writeln!(w, "{pending}")?;
        }

        let (cleaned, still_open) = clean_line(raw, in_block_comment);
        in_block_comment = still_open;

        // Reuse the buffer's allocation for the newly cleaned line.
        pending.clear();
        pending.push_str(cleaned.trim());
    }

    // Input is exhausted: whatever is still held back goes out *without* a trailing newline.
    write!(w, "{pending}")?;
    Ok(())
}

/// Removes C-style comments from a single line, carrying multi line comment state between lines.
///
/// `is_multiline_active` states whether a `/*` from an earlier line is still open when this line
/// starts. The returned tuple is the text that survives outside of comments, and whether a
/// multi line comment is still open once the line ends.
///
/// The line is scanned left to right in segments:
/// - inside a multi line comment, everything up to and including the next `*/` is discarded;
///   if there is no `*/` the rest of the line is discarded and the comment stays open.
/// - outside of one, the segment is first searched for `/*`; text before it is kept and a
///   multi line comment opens.
/// - only when the segment has no `/*` is it searched for `//`; text before it is kept and the
///   rest of the line is discarded.
///
/// Because `/*` is searched for before `//`, a `//` that sits in front of a `/*` in the same
/// segment is kept as ordinary text rather than treated as a comment.
fn clean_line(line: String, is_multiline_active: bool) -> (String, bool) {
    let mut kept = String::new();
    let mut rest = line.as_str();
    let mut in_comment = is_multiline_active;

    loop {
        if in_comment {
            match rest.find("*/") {
                // The comment closes on this line; resume scanning right after the terminator.
                Some(end) => {
                    rest = &rest[end + 2..];
                    in_comment = false;
                }
                // The comment swallows the remainder of the line and stays open.
                None => return (kept, true),
            }
        } else if let Some(start) = rest.find("/*") {
            kept.push_str(&rest[..start]);
            rest = &rest[start + 2..];
            in_comment = true;
        } else {
            // No multi line comment left in this segment; cut at `//` if there is one.
            let visible = rest.find("//").map_or(rest, |start| &rest[..start]);
            kept.push_str(visible);
            return (kept, false);
        }
    }
}

#[cfg(test)]
mod tests {
    //! Tests for internal logic.
    use super::*;

    /// Runs [`content_stripped`] over `content` and returns what it wrote, as text.
    /// Panics with `reason` if stripping fails.
    fn strip(content: impl AsRef<[u8]>, reason: &str) -> String {
        let mut written = Vec::new();
        content_stripped(&mut Cursor::new(content), &mut written).expect(reason);
        String::from_utf8_lossy(&written).into_owned()
    }

    /// Windows CI checks out CRLF. Normalize it to be LF only.
    /// This function should only be applied to testing values, not responses from the functions being tested.
    fn normalize_lf(input: impl Into<String>) -> String {
        let owned: String = input.into();
        owned.replace("\r\n", "\n")
    }

    /// Inspired by the Haskell implementation: https://github.com/fossas/fossa-cli/blob/8de74b71b80d77321d64f94d7573773e49306772/test/App/Fossa/VSI/testdata/multi_line_comment.c#L1-L10
    #[test]
    fn comment_strip_mixed() {
        let content = r#"/*
* This is a placeholder file used to test comment stripping code.
*/
int main() {
int code = 0;
// code = 1;
return code; // perfect
}
"#;
        let expected = r#"int main() {
int code = 0;
return code;
}"#;

        assert_eq!(expected, strip(content, "must fingerprint"));
    }

    /// Copied from the Go implementation: https://github.com/fossas/basis/blob/6b0a1ce7ca5d88d033732f6dcfebd90b8f143038/sherlock/pkg/lib/indexer/cleaned/strip_comments_internal_test.go#L71-L79
    #[test]
    fn comment_strip_single_line_comments() {
        let content = " content1 \n content2 //comment \n content3 ";
        let expected = "content1\ncontent2\ncontent3";

        assert_eq!(expected, strip(content, "must fingerprint"));
    }

    /// Copied from the Go implementation: https://github.com/fossas/basis/blob/6b0a1ce7ca5d88d033732f6dcfebd90b8f143038/sherlock/pkg/lib/indexer/cleaned/strip_comments_internal_test.go#L89-L97
    #[test]
    fn comment_strip_multi_line_comments() {
        let content =
            " content1 \n content2 /* begin comment \n end comment */ content3 \n content4 ";
        let expected = "content1\ncontent2\ncontent3\ncontent4";

        assert_eq!(expected, strip(content, "must fingerprint"));
    }

    /// CRLF and LF line endings in the input must both come out as LF.
    #[test]
    fn comment_strip_cr() {
        let content = "hello world\r\nanother line\r\na final line\n";
        let expected = "hello world\nanother line\na final line";

        assert_eq!(expected, strip(content, "must fingerprint"));
    }

    /// Strips a real-world source file and compares against a stored expectation.
    #[test]
    fn comment_strip_real_source() {
        let content = include_bytes!("../testdata/facebook-folly-Version.cpp");
        let expected = include_str!("../testdata/facebook-folly-Version.cpp.stripped");

        assert_eq!(normalize_lf(expected), strip(content, "must process"));
    }
}
Loading

0 comments on commit 9ccea79

Please sign in to comment.