Review notes (everything in front of the first `diff --git` header is
commentary and is ignored by `git apply` and `patch`):

  * The line structure of the patch and the generic arguments that were lost
    in transit (`LineReader<R>`, `std::io::Result<usize>`,
    `downcast::<MaxLineLenError>()`, `Vec<String>`) are restored.
  * `LineReader::read_line_lossy` now decodes a line once, after all of its
    bytes were collected. Decoding every chunk of the internal buffer on its
    own replaced valid multi-byte characters that were split between two reads
    with U+FFFD. A regression test is added.
  * Consumed bytes are dropped with `copy_within` on the unread tail instead
    of rotating the whole buffer for every line.
  * Typos in the added comments are fixed.
  * Hunk headers of `io.rs` are recomputed for the new line counts. The
    post-image blob id in its `index` line is not.

diff --git a/crates/rrg/Cargo.toml b/crates/rrg/Cargo.toml
index e6d0e235..a2a7a31b 100644
--- a/crates/rrg/Cargo.toml
+++ b/crates/rrg/Cargo.toml
@@ -124,6 +124,9 @@ version = "0.8.5"
 [dev-dependencies.tempfile]
 version = "3.13.0"
 
+[dev-dependencies.quickcheck]
+version = "1.0.3"
+
 [target.'cfg(target_family = "windows")'.dev-dependencies.windows-sys]
 version = "0.59.0"
 features = [
diff --git a/crates/rrg/src/action/grep_file_contents.rs b/crates/rrg/src/action/grep_file_contents.rs
index 66039171..b97ce198 100644
--- a/crates/rrg/src/action/grep_file_contents.rs
+++ b/crates/rrg/src/action/grep_file_contents.rs
@@ -29,18 +29,17 @@ where
     let file = std::fs::File::open(&args.path)
         .map_err(crate::session::Error::action)?;
 
-    let mut file = std::io::BufReader::new(file);
+    let mut file = crate::io::LineReader::new(file)
+        // We want to support lines only up to 1 MiB. Fleetspeak does not allow
+        // for messages bigger than 2 MiB anyway.
+        .with_max_line_len(1 * 1024 * 1024);
 
-    // TODO(@panhania): Read to a buffer of predefined size so that we do not
-    // allow reading lines of arbitrary length.
     let mut line = String::new();
     let mut offset = 0;
 
     loop {
-        use std::io::BufRead as _;
-
         line.clear();
-        let len = match file.read_line(&mut line) {
+        let len = match file.read_line_lossy(&mut line) {
             Ok(0) => return Ok(()),
             Ok(len) => len,
             Err(error) => return Err(crate::session::Error::action(error)),
diff --git a/crates/rrg/src/io.rs b/crates/rrg/src/io.rs
index 89f08940..a169a037 100644
--- a/crates/rrg/src/io.rs
+++ b/crates/rrg/src/io.rs
@@ -75,6 +75,164 @@ where
     Ok(written)
 }
 
+/// Buffered reader for efficient line reading.
+///
+/// This object works similarly to [`std::io::BufReader`] but is tailored for
+/// line reading capabilities.
+pub struct LineReader<R> {
+    /// Content source to read from.
+    inner: R,
+    /// Buffer that we use for reading.
+    buf: Box<[u8]>,
+    /// Number of elements of `buf` that are actually available.
+    buf_fill_len: usize,
+    /// Limit on the length of a single line that the reader can read.
+    max_line_len: usize,
+}
+
+impl<R: std::io::Read> LineReader<R> {
+
+    /// Creates a new `LineReader` with default buffer capacity.
+    ///
+    /// See also [`std::io::BufReader::new`].
+    pub fn new(inner: R) -> LineReader<R> {
+        LineReader::with_capacity(DEFAULT_BUF_SIZE, inner)
+    }
+
+    /// Creates a new `LineReader` with the specified buffer capacity.
+    ///
+    /// See also [`std::io::BufReader::with_capacity`].
+    pub fn with_capacity(capacity: usize, inner: R) -> LineReader<R> {
+        LineReader {
+            inner,
+            buf: vec![0; capacity].into_boxed_slice(),
+            buf_fill_len: 0,
+            max_line_len: usize::MAX,
+        }
+    }
+
+    /// Sets the limit on the length of a single line that the reader can read.
+    ///
+    /// This is useful to avoid situation in which a large file without any line
+    /// feed characters can cause the memory to be completely exhausted when
+    /// trying to read the line.
+    pub fn with_max_line_len(mut self, len: usize) -> LineReader<R> {
+        self.max_line_len = len;
+        self
+    }
+
+    /// Reads all bytes until a newline (the `0xA` byte) is reached, and appends
+    /// them to the provided `String` buffer.
+    ///
+    /// Unlike [`std::io::BufRead::read_line`], this method does not fail when
+    /// an invalid UTF-8 sequence is encountered but instead uses [lossy UTF-8
+    /// conversion][1], which replaces such sequences with [`U+FFFD REPLACEMENT
+    /// CHARACTER`][2].
+    ///
+    /// Returns the number of bytes consumed from the underlying reader (which,
+    /// because of the lossy conversion, may differ from the number of bytes
+    /// appended to `buf`). `Ok(0)` means that the end of the input was reached.
+    ///
+    /// # Errors
+    ///
+    /// This function will fail if an I/O error is raised when reading data. In
+    /// such cases `buf` may contain some new bytes that were read so far.
+    ///
+    /// This will also fail if the line length limit was specified and the line
+    /// being read exceeds it.
+    ///
+    /// [1]: std::string::String::from_utf8_lossy
+    /// [2]: std::char::REPLACEMENT_CHARACTER
+    pub fn read_line_lossy(&mut self, buf: &mut String) -> std::io::Result<usize> {
+        // We collect raw bytes first and decode them in one go. Decoding each
+        // chunk of the internal buffer separately would corrupt every valid
+        // multi-byte UTF-8 sequence that happens to be split between two reads.
+        let mut line = Vec::new();
+        let result = self.read_line_bytes(&mut line);
+
+        // We decode even if reading failed, so that the caller still gets
+        // whatever was read before the error occurred.
+        buf.push_str(&String::from_utf8_lossy(&line));
+
+        result
+    }
+
+    /// Reads all bytes until a newline (the `0xA` byte) is reached, and appends
+    /// them to the provided byte buffer.
+    ///
+    /// Returns the number of bytes appended to `line`. The error conditions are
+    /// the same as for `read_line_lossy`.
+    fn read_line_bytes(&mut self, line: &mut Vec<u8>) -> std::io::Result<usize> {
+        let mut len = 0;
+
+        loop {
+            // We may have a line feed somewhere in our buffer already. In such
+            // a case, we extend the result buffer with content up until that
+            // point (provided that the length limit is not exceeded) and
+            // advance the internal buffer accordingly.
+            if let Some(pos) = self.buf[..self.buf_fill_len].iter().position(|byte| *byte == b'\n') {
+                if len + pos + 1 > self.max_line_len {
+                    return Err(std::io::Error::other(MaxLineLenError(self.max_line_len)));
+                }
+
+                line.extend_from_slice(&self.buf[..pos + 1]);
+                len += pos + 1;
+
+                // Only the bytes that are still unread have to be moved to the
+                // front, there is no need to shuffle the whole buffer.
+                self.buf.copy_within(pos + 1..self.buf_fill_len, 0);
+                self.buf_fill_len -= pos + 1;
+                return Ok(len);
+            }
+
+            // There is no line feed in our buffer. Thus, we put everything we
+            // have to the result buffer (provided that the length limit is not
+            // exceeded) and fill it again with new content.
+
+            if len + self.buf_fill_len > self.max_line_len {
+                return Err(std::io::Error::other(MaxLineLenError(self.max_line_len)));
+            }
+
+            line.extend_from_slice(&self.buf[..self.buf_fill_len]);
+            len += self.buf_fill_len;
+
+            self.buf_fill_len = 0;
+            loop {
+                match self.inner.read(&mut self.buf) {
+                    Ok(0) => {
+                        // We reached the end of the input without finding any
+                        // line feed character.
+                        return Ok(len);
+                    }
+                    Ok(fill_len) => {
+                        self.buf_fill_len = fill_len;
+                        break;
+                    }
+                    Err(error) if error.kind() == std::io::ErrorKind::Interrupted => {
+                        // We do what the standard library does in case reads to a
+                        // buffer fail with interruption errors: we try again.
+                        continue;
+                    }
+                    Err(error) => return Err(error),
+                }
+            }
+        }
+    }
+}
+
+#[derive(Debug)]
+struct MaxLineLenError(usize);
+
+impl std::fmt::Display for MaxLineLenError {
+
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "maximum line length ({} bytes) exceeded", self.0)
+    }
+}
+
+impl std::error::Error for MaxLineLenError {
+}
+
 /// An reader implementation for a stream of readers.
 ///
 /// It turns a stream of `Read` instances into one `Read` instance where bytes
@@ -150,6 +308,8 @@ mod tests {
 
     use super::*;
 
+    use quickcheck::quickcheck;
+
     #[test]
     fn test_copy_until_with_empty_buffer() {
         let mut reader: &[u8] = b"";
@@ -199,6 +359,204 @@ mod tests {
         assert!(writer.len() > limit);
     }
 
+    #[test]
+    fn line_reader_empty() {
+        let mut reader = LineReader::new("".as_bytes());
+        let mut line = String::new();
+
+        assert_eq!(reader.read_line_lossy(&mut line).unwrap(), 0);
+        assert_eq!(line, "");
+    }
+
+    #[test]
+    fn line_reader_one_line_without_line_feed() {
+        let mut reader = LineReader::new("foo".as_bytes());
+        let mut line = String::new();
+
+        assert_eq!(reader.read_line_lossy(&mut line).unwrap(), 3);
+        assert_eq!(line, "foo");
+    }
+
+    #[test]
+    fn line_reader_one_line_with_line_feed() {
+        let mut reader = LineReader::new("foo\n".as_bytes());
+        let mut line = String::new();
+
+        assert_eq!(reader.read_line_lossy(&mut line).unwrap(), 4);
+        assert_eq!(line, "foo\n");
+    }
+
+    #[test]
+    fn line_reader_many_lines() {
+        let mut reader = LineReader::new("foo\nbar\nbaz".as_bytes());
+        let mut line = String::new();
+
+        line.clear();
+        assert_eq!(reader.read_line_lossy(&mut line).unwrap(), 4);
+        assert_eq!(line, "foo\n");
+
+        line.clear();
+        assert_eq!(reader.read_line_lossy(&mut line).unwrap(), 4);
+        assert_eq!(line, "bar\n");
+
+        line.clear();
+        assert_eq!(reader.read_line_lossy(&mut line).unwrap(), 3);
+        assert_eq!(line, "baz");
+    }
+
+    #[test]
+    fn line_reader_small_capacity_one_line_without_line_feed() {
+        let mut reader = LineReader::with_capacity(2, "quux".as_bytes());
+        let mut line = String::new();
+
+        assert_eq!(reader.read_line_lossy(&mut line).unwrap(), 4);
+        assert_eq!(line, "quux");
+    }
+
+    #[test]
+    fn line_reader_small_capacity_one_line_with_line_feed() {
+        let mut reader = LineReader::with_capacity(2, "quux\n".as_bytes());
+        let mut line = String::new();
+
+        assert_eq!(reader.read_line_lossy(&mut line).unwrap(), 5);
+        assert_eq!(line, "quux\n");
+    }
+
+    #[test]
+    fn line_reader_small_capacity_many_lines() {
+        let mut reader = LineReader::with_capacity(2, "foo\nbar\nbaz".as_bytes());
+        let mut line = String::new();
+
+        line.clear();
+        assert_eq!(reader.read_line_lossy(&mut line).unwrap(), 4);
+        assert_eq!(line, "foo\n");
+
+        line.clear();
+        assert_eq!(reader.read_line_lossy(&mut line).unwrap(), 4);
+        assert_eq!(line, "bar\n");
+
+        line.clear();
+        assert_eq!(reader.read_line_lossy(&mut line).unwrap(), 3);
+        assert_eq!(line, "baz");
+    }
+
+    #[test]
+    fn line_reader_small_capacity_multibyte_chars() {
+        // Every non-ASCII character below takes two bytes, so with such a small
+        // buffer some of them are split between two consecutive reads.
+        let mut reader = LineReader::with_capacity(3, "zażółć\n".as_bytes());
+        let mut line = String::new();
+
+        assert_eq!(reader.read_line_lossy(&mut line).unwrap(), 11);
+        assert_eq!(line, "zażółć\n");
+    }
+
+    #[test]
+    fn line_reader_small_max_line_len_one_line_without_line_feed() {
+        let mut reader = LineReader::new("foo".as_bytes())
+            .with_max_line_len(2);
+
+        let error = reader.read_line_lossy(&mut String::new())
+            .unwrap_err().into_inner().unwrap()
+            .downcast::<MaxLineLenError>().unwrap();
+        assert!(matches!(error.as_ref(), MaxLineLenError(2)));
+    }
+
+    #[test]
+    fn line_reader_small_max_line_len_one_line_with_line_feed() {
+        let mut reader = LineReader::new("foo\n".as_bytes())
+            .with_max_line_len(3);
+
+        let error = reader.read_line_lossy(&mut String::new())
+            .unwrap_err().into_inner().unwrap()
+            .downcast::<MaxLineLenError>().unwrap();
+        assert!(matches!(error.as_ref(), MaxLineLenError(3)));
+    }
+
+    #[test]
+    fn line_reader_invalid_utf8_without_line_feed() {
+        let mut reader = LineReader::new(&b"ba\xF0\x90\x80"[..]);
+        let mut line = String::new();
+
+        line.clear();
+        assert_eq!(reader.read_line_lossy(&mut line).unwrap(), 5);
+        assert_eq!(line, "ba�");
+    }
+
+    #[test]
+    fn line_reader_invalid_utf8_with_line_feed() {
+        let mut reader = LineReader::new(&b"ba\xF0\x90\x80\n"[..]);
+        let mut line = String::new();
+
+        line.clear();
+        assert_eq!(reader.read_line_lossy(&mut line).unwrap(), 6);
+        assert_eq!(line, "ba�\n");
+    }
+
+    #[test]
+    fn line_reader_append() {
+        let mut reader = LineReader::new("content".as_bytes());
+        let mut line = String::from("prefix");
+
+        assert_eq!(reader.read_line_lossy(&mut line).unwrap(), 7);
+        assert_eq!(line, "prefixcontent");
+    }
+
+    #[test]
+    fn line_reader_empty_lines() {
+        let mut reader = LineReader::new("\n\nfoo\n\n".as_bytes());
+        let mut line = String::new();
+
+        line.clear();
+        assert_eq!(reader.read_line_lossy(&mut line).unwrap(), 1);
+        assert_eq!(line, "\n");
+
+        line.clear();
+        assert_eq!(reader.read_line_lossy(&mut line).unwrap(), 1);
+        assert_eq!(line, "\n");
+
+        line.clear();
+        assert_eq!(reader.read_line_lossy(&mut line).unwrap(), 4);
+        assert_eq!(line, "foo\n");
+
+        line.clear();
+        assert_eq!(reader.read_line_lossy(&mut line).unwrap(), 1);
+        assert_eq!(line, "\n");
+
+        line.clear();
+        assert_eq!(reader.read_line_lossy(&mut line).unwrap(), 0);
+        assert_eq!(line, "");
+    }
+
+    quickcheck! {
+
+        fn line_reader_joined_lines(strings: Vec<String>) -> quickcheck::TestResult {
+            // This property holds only for strings without line feed chars as
+            // otherwise an input string can get an extra split when reading.
+            if strings.iter().any(|string| string.contains('\n')) {
+                return quickcheck::TestResult::discard();
+            }
+
+            let mut content = strings.join("\n");
+            content.push('\n');
+
+            let mut reader = LineReader::new(content.as_bytes());
+            let mut line = String::new();
+
+            for string in &strings {
+                line.clear();
+                if reader.read_line_lossy(&mut line).unwrap() != string.len() + 1 {
+                    return quickcheck::TestResult::failed();
+                }
+                if line != format!("{string}\n") {
+                    return quickcheck::TestResult::failed();
+                }
+            }
+
+            quickcheck::TestResult::passed()
+        }
+    }
+
     #[test]
     fn test_iter_reader_with_empty_iter() {
         let mut reader = IterReader::new(std::iter::empty::<&[u8]>());