Skip to content

Commit

Permalink
Limit line lengths in grep_file_contents.
Browse files Browse the repository at this point in the history
  • Loading branch information
panhania authored Nov 19, 2024
1 parent 04e65ca commit 38f7510
Show file tree
Hide file tree
Showing 3 changed files with 330 additions and 6 deletions.
3 changes: 3 additions & 0 deletions crates/rrg/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,9 @@ version = "0.8.5"
[dev-dependencies.tempfile]
version = "3.13.0"

[dev-dependencies.quickcheck]
version = "1.0.3"

[target.'cfg(target_family = "windows")'.dev-dependencies.windows-sys]
version = "0.59.0"
features = [
Expand Down
11 changes: 5 additions & 6 deletions crates/rrg/src/action/grep_file_contents.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,18 +29,17 @@ where
let file = std::fs::File::open(&args.path)
.map_err(crate::session::Error::action)?;

let mut file = std::io::BufReader::new(file);
let mut file = crate::io::LineReader::new(file)
// We want to support lines only up to 1 MiB. Fleetspeak does not allow
// for messages bigger than 2 MiB anyway.
.with_max_line_len(1 * 1024 * 1024);

// TODO(@panhania): Read to a buffer of predefined size so that we do not
// allow reading lines of arbitrary length.
let mut line = String::new();
let mut offset = 0;

loop {
use std::io::BufRead as _;

line.clear();
let len = match file.read_line(&mut line) {
let len = match file.read_line_lossy(&mut line) {
Ok(0) => return Ok(()),
Ok(len) => len,
Err(error) => return Err(crate::session::Error::action(error)),
Expand Down
322 changes: 322 additions & 0 deletions crates/rrg/src/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,139 @@ where
Ok(written)
}

/// Buffered reader for efficient line reading.
///
/// This object works similarly to [`std::io::BufReader`] but is tailored for
/// line reading capabilities: it can cap the length of a single line and it
/// decodes lines using lossy UTF-8 conversion.
pub struct LineReader<R: Read> {
    /// Content source to read from.
    inner: R,
    /// Buffer that we use for reading.
    buf: Box<[u8]>,
    /// Index of the first byte of `buf` that was not handed out yet.
    buf_pos: usize,
    /// Number of leading bytes of `buf` that hold data from the last read.
    ///
    /// The unconsumed input is always `buf[buf_pos..buf_fill_len]`.
    buf_fill_len: usize,
    /// Limit on the length of a single line that the reader can read.
    max_line_len: usize,
}

impl<R: Read> LineReader<R> {

    /// Creates a new `LineReader` with default buffer capacity.
    ///
    /// See also [`std::io::BufReader::new`].
    pub fn new(inner: R) -> LineReader<R> {
        LineReader::with_capacity(DEFAULT_BUF_SIZE, inner)
    }

    /// Creates a new `LineReader` with the specified buffer capacity.
    ///
    /// A capacity of 0 is bumped to 1: with an empty buffer every read would
    /// return 0 bytes, which is indistinguishable from the end of the input.
    ///
    /// See also [`std::io::BufReader::with_capacity`].
    pub fn with_capacity(capacity: usize, inner: R) -> LineReader<R> {
        LineReader {
            inner,
            buf: vec![0; capacity.max(1)].into_boxed_slice(),
            buf_pos: 0,
            buf_fill_len: 0,
            max_line_len: usize::MAX,
        }
    }

    /// Sets the limit on the length of a single line that the reader can read.
    ///
    /// This is useful to avoid situation in which a large file without any line
    /// feed characters can cause the memory to be completely exhausted when
    /// trying to read the line.
    pub fn with_max_line_len(mut self, len: usize) -> LineReader<R> {
        self.max_line_len = len;
        self
    }

    /// Reads all bytes until a newline (the `0xA` byte) is reached, and appends
    /// them to the provided `String` buffer.
    ///
    /// Returns the number of bytes consumed from the input (including the line
    /// feed, if any). A return value of 0 means that the end of the input was
    /// reached.
    ///
    /// Unlike [`std::io::BufRead::read_line`], this method does not fail when
    /// an invalid UTF-8 sequence is encountered but instead uses [lossy UTF-8
    /// conversion][1], which replaces such sequences with [`U+FFFD REPLACEMENT
    /// CHARACTER`][2]. The conversion is applied to the line as a whole, so
    /// valid multi-byte sequences are preserved even if they happen to be
    /// split between two reads of the underlying source.
    ///
    /// # Errors
    ///
    /// This function will fail if an I/O error is raised when reading data. In
    /// such cases `buf` may contain some new bytes that were read so far.
    ///
    /// This will also fail if the line length limit was specified and the line
    /// being read exceeds it.
    ///
    /// [1]: std::string::String::from_utf8_lossy
    /// [2]: std::char::REPLACEMENT_CHARACTER
    pub fn read_line_lossy(&mut self, buf: &mut String) -> std::io::Result<usize> {
        // Fast path: the entire line is already sitting in our buffer, so we
        // can decode it in place without any intermediate allocation.
        let avail = &self.buf[self.buf_pos..self.buf_fill_len];
        if let Some(pos) = avail.iter().position(|byte| *byte == b'\n') {
            let len = pos + 1;
            if len > self.max_line_len {
                return Err(self.max_line_len_error());
            }

            buf.push_str(&String::from_utf8_lossy(&avail[..len]));
            self.buf_pos += len;
            return Ok(len);
        }

        // Slow path: the line spans multiple buffer fills. We gather its raw
        // bytes first and decode them in one go. Decoding every fill on its
        // own would corrupt multi-byte sequences that straddle two fills.
        let mut line = Vec::new();
        let result = self.read_line_bytes(&mut line);

        // We decode even on failure so that the caller can still observe the
        // data that was read before the error occurred.
        buf.push_str(&String::from_utf8_lossy(&line));
        result
    }

    /// Appends raw bytes of the next line (up to and including the line feed)
    /// to `line` and returns the total number of bytes in `line`.
    ///
    /// `line` is expected to be empty on entry. Bytes that would push the line
    /// over the length limit are left in the internal buffer.
    fn read_line_bytes(&mut self, line: &mut Vec<u8>) -> std::io::Result<usize> {
        loop {
            let avail = &self.buf[self.buf_pos..self.buf_fill_len];

            // We take everything up to the line feed or, if there is none,
            // everything that we have buffered.
            let line_feed = avail.iter().position(|byte| *byte == b'\n');
            let chunk_len = line_feed.map_or(avail.len(), |pos| pos + 1);

            if line.len().saturating_add(chunk_len) > self.max_line_len {
                return Err(self.max_line_len_error());
            }

            line.extend_from_slice(&avail[..chunk_len]);
            self.buf_pos += chunk_len;

            if line_feed.is_some() {
                return Ok(line.len());
            }

            // The buffer is fully consumed at this point. If there is nothing
            // more to read, we reached the end of the input without finding
            // any line feed character.
            if self.fill_buf()? == 0 {
                return Ok(line.len());
            }
        }
    }

    /// Overwrites the internal buffer with fresh data from the source and
    /// returns the number of bytes that are now available (0 at end of input).
    ///
    /// Must only be called once all previously buffered bytes were consumed.
    fn fill_buf(&mut self) -> std::io::Result<usize> {
        loop {
            match self.inner.read(&mut self.buf) {
                Ok(len) => {
                    self.buf_pos = 0;
                    self.buf_fill_len = len;
                    return Ok(len);
                }
                Err(error) if error.kind() == std::io::ErrorKind::Interrupted => {
                    // We do what the standard library does in case reads to a
                    // buffer fail with interruption errors: we try again.
                    continue;
                }
                Err(error) => return Err(error),
            }
        }
    }

    /// Builds the I/O error reported when a line exceeds the length limit.
    fn max_line_len_error(&self) -> std::io::Error {
        std::io::Error::other(MaxLineLenError(self.max_line_len))
    }
}

/// Error raised when a line is longer than the configured limit.
///
/// The wrapped value is the limit (in bytes) that was exceeded.
#[derive(Debug)]
struct MaxLineLenError(usize);

impl std::fmt::Display for MaxLineLenError {

    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "maximum line length ({} bytes) exceeded", self.0)
    }
}

impl std::error::Error for MaxLineLenError {
}

/// A reader implementation for a stream of readers.
///
/// It turns a stream of `Read` instances into one `Read` instance where bytes
Expand Down Expand Up @@ -150,6 +283,8 @@ mod tests {

use super::*;

use quickcheck::quickcheck;

#[test]
fn test_copy_until_with_empty_buffer() {
let mut reader: &[u8] = b"";
Expand Down Expand Up @@ -199,6 +334,193 @@ mod tests {
assert!(writer.len() > limit);
}

/// Reading from an empty source reports zero bytes and appends nothing.
#[test]
fn line_reader_empty() {
    let mut output = String::new();
    let count = LineReader::new("".as_bytes())
        .read_line_lossy(&mut output)
        .unwrap();

    assert_eq!(count, 0);
    assert_eq!(output, "");
}

/// A single unterminated line is returned in full.
#[test]
fn line_reader_one_line_without_line_feed() {
    let mut output = String::new();
    let count = LineReader::new("foo".as_bytes())
        .read_line_lossy(&mut output)
        .unwrap();

    assert_eq!(count, 3);
    assert_eq!(output, "foo");
}

/// A single terminated line is returned together with its line feed.
#[test]
fn line_reader_one_line_with_line_feed() {
    let mut output = String::new();
    let count = LineReader::new("foo\n".as_bytes())
        .read_line_lossy(&mut output)
        .unwrap();

    assert_eq!(count, 4);
    assert_eq!(output, "foo\n");
}

/// Consecutive calls yield consecutive lines; only the last one lacks a line
/// feed because the input does not end with one.
#[test]
fn line_reader_many_lines() {
    let mut reader = LineReader::new("foo\nbar\nbaz".as_bytes());
    let mut scratch = String::new();

    for expected in ["foo\n", "bar\n", "baz"] {
        scratch.clear();
        let count = reader.read_line_lossy(&mut scratch).unwrap();

        assert_eq!(count, expected.len());
        assert_eq!(scratch, expected);
    }
}

/// An unterminated line longer than the internal buffer is still returned as
/// a whole.
#[test]
fn line_reader_small_capacity_one_line_without_line_feed() {
    let mut output = String::new();
    let count = LineReader::with_capacity(2, "quux".as_bytes())
        .read_line_lossy(&mut output)
        .unwrap();

    assert_eq!(count, 4);
    assert_eq!(output, "quux");
}

/// A terminated line longer than the internal buffer is still returned as a
/// whole, line feed included.
#[test]
fn line_reader_small_capacity_one_line_with_line_feed() {
    let mut output = String::new();
    let count = LineReader::with_capacity(2, "quux\n".as_bytes())
        .read_line_lossy(&mut output)
        .unwrap();

    assert_eq!(count, 5);
    assert_eq!(output, "quux\n");
}

/// Line boundaries are found correctly even when every line spans several
/// fills of a tiny internal buffer.
#[test]
fn line_reader_small_capacity_many_lines() {
    let mut reader = LineReader::with_capacity(2, "foo\nbar\nbaz".as_bytes());
    let mut scratch = String::new();

    for expected in ["foo\n", "bar\n", "baz"] {
        scratch.clear();
        let count = reader.read_line_lossy(&mut scratch).unwrap();

        assert_eq!(count, expected.len());
        assert_eq!(scratch, expected);
    }
}

/// An unterminated line longer than the limit is rejected with an error that
/// carries the configured limit.
#[test]
fn line_reader_small_max_line_len_one_line_without_line_feed() {
    let mut sink = String::new();
    let mut reader = LineReader::new("foo".as_bytes()).with_max_line_len(2);

    let io_error = reader.read_line_lossy(&mut sink).unwrap_err();
    let cause = io_error.into_inner().unwrap();
    let cause = cause.downcast::<MaxLineLenError>().unwrap();

    assert!(matches!(*cause, MaxLineLenError(2)));
}

/// The line feed counts towards the limit: a 4-byte line does not fit into a
/// 3-byte limit.
#[test]
fn line_reader_small_max_line_len_one_line_with_line_feed() {
    let mut sink = String::new();
    let mut reader = LineReader::new("foo\n".as_bytes()).with_max_line_len(3);

    let io_error = reader.read_line_lossy(&mut sink).unwrap_err();
    let cause = io_error.into_inner().unwrap();
    let cause = cause.downcast::<MaxLineLenError>().unwrap();

    assert!(matches!(*cause, MaxLineLenError(3)));
}

/// A truncated UTF-8 sequence at the end of the input is replaced rather than
/// reported as an error; the returned length still counts the raw bytes.
#[test]
fn line_reader_invalid_utf8_without_line_feed() {
    let input: &[u8] = &b"ba\xF0\x90\x80"[..];

    let mut output = String::new();
    let count = LineReader::new(input)
        .read_line_lossy(&mut output)
        .unwrap();

    assert_eq!(count, 5);
    assert_eq!(output, "ba�");
}

/// A truncated UTF-8 sequence right before the line feed is replaced rather
/// than reported as an error; the returned length still counts the raw bytes.
#[test]
fn line_reader_invalid_utf8_with_line_feed() {
    let input: &[u8] = &b"ba\xF0\x90\x80\n"[..];

    let mut output = String::new();
    let count = LineReader::new(input)
        .read_line_lossy(&mut output)
        .unwrap();

    assert_eq!(count, 6);
    assert_eq!(output, "ba�\n");
}

/// The line is appended to whatever the output string already holds.
#[test]
fn line_reader_append() {
    let mut output = String::from("prefix");
    let count = LineReader::new("content".as_bytes())
        .read_line_lossy(&mut output)
        .unwrap();

    assert_eq!(count, 7);
    assert_eq!(output, "prefixcontent");
}

/// Empty lines are reported as lone line feeds and the end of the input is
/// reported as a zero-length read.
#[test]
fn line_reader_empty_lines() {
    let mut reader = LineReader::new("\n\nfoo\n\n".as_bytes());
    let mut scratch = String::new();

    for expected in ["\n", "\n", "foo\n", "\n", ""] {
        scratch.clear();
        let count = reader.read_line_lossy(&mut scratch).unwrap();

        assert_eq!(count, expected.len());
        assert_eq!(scratch, expected);
    }
}

quickcheck! {

    // Property: terminating each of the given strings with a line feed and
    // reading the concatenation back line by line yields the same strings.
    fn line_reader_joined_lines(strings: Vec<String>) -> quickcheck::TestResult {
        // Inputs that contain line feeds themselves are out of scope: the
        // reader would split them into more lines than we put in.
        let has_line_feed = strings.iter().any(|string| string.contains('\n'));
        if has_line_feed {
            return quickcheck::TestResult::discard();
        }

        let content = strings.join("\n") + "\n";

        let mut reader = LineReader::new(content.as_bytes());
        let mut line = String::new();

        // `all` stops at the first mismatch, so nothing is read past it.
        let round_trips = strings.iter().all(|string| {
            line.clear();
            let len = reader.read_line_lossy(&mut line).unwrap();

            len == string.len() + 1 && line == format!("{string}\n")
        });

        if round_trips {
            quickcheck::TestResult::passed()
        } else {
            quickcheck::TestResult::failed()
        }
    }
}

#[test]
fn test_iter_reader_with_empty_iter() {
let mut reader = IterReader::new(std::iter::empty::<&[u8]>());
Expand Down

0 comments on commit 38f7510

Please sign in to comment.