From e8f88765341bb80e7fa8dbce9d4ca313dd610f3a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20Hanuszczak?= <1961136+panhania@users.noreply.github.com>
Date: Fri, 27 Sep 2024 12:33:20 +0200
Subject: [PATCH] Implement the `grep_file_contents` action.

---
 crates/rrg-proto/build.rs                   |   1 +
 crates/rrg/Cargo.toml                       |   2 +
 crates/rrg/src/action.rs                    |   7 +
 crates/rrg/src/action/grep_file_contents.rs | 225 ++++++++++++++++++++
 crates/rrg/src/request.rs                   |   4 +
 proto/rrg.proto                             |   2 +
 proto/rrg/action/grep_file_contents.proto   |  38 ++++
 7 files changed, 279 insertions(+)
 create mode 100644 crates/rrg/src/action/grep_file_contents.rs
 create mode 100644 proto/rrg/action/grep_file_contents.proto

diff --git a/crates/rrg-proto/build.rs b/crates/rrg-proto/build.rs
index a6bf929f..c6e7d7da 100644
--- a/crates/rrg-proto/build.rs
+++ b/crates/rrg-proto/build.rs
@@ -19,6 +19,7 @@ const PROTOS: &'static [&'static str] = &[
     "../../proto/rrg/action/get_filesystem_timeline.proto",
     "../../proto/rrg/action/get_system_metadata.proto",
     "../../proto/rrg/action/get_winreg_value.proto",
+    "../../proto/rrg/action/grep_file_contents.proto",
     "../../proto/rrg/action/list_connections.proto",
     "../../proto/rrg/action/list_interfaces.proto",
     "../../proto/rrg/action/list_mounts.proto",
diff --git a/crates/rrg/Cargo.toml b/crates/rrg/Cargo.toml
index c4f656e9..3bdacf1b 100644
--- a/crates/rrg/Cargo.toml
+++ b/crates/rrg/Cargo.toml
@@ -10,6 +10,7 @@ default = [
     "action-get_system_metadata",
     "action-get_file_metadata",
     "action-get_file_contents",
+    "action-grep_file_contents",
     "action-get_filesystem_timeline",
     "action-list_connections",
     "action-list_interfaces",
@@ -23,6 +24,7 @@ default = [
 action-get_system_metadata = []
 action-get_file_metadata = []
 action-get_file_contents = ["dep:sha2"]
+action-grep_file_contents = []
 action-get_filesystem_timeline = ["dep:flate2", "dep:sha2"]
 action-list_connections = []
 action-list_interfaces = []
diff --git a/crates/rrg/src/action.rs b/crates/rrg/src/action.rs
index 678ef690..23db0798 100644
--- a/crates/rrg/src/action.rs
+++ b/crates/rrg/src/action.rs
@@ -24,6 +24,9 @@ pub mod get_file_metadata;
 #[cfg(feature = "action-get_file_contents")]
 pub mod get_file_contents;
 
+#[cfg(feature = "action-grep_file_contents")]
+pub mod grep_file_contents;
+
 #[cfg(feature = "action-get_filesystem_timeline")]
 pub mod get_filesystem_timeline;
 
@@ -86,6 +89,10 @@ where
         GetFileContents => {
             handle(session, request, self::get_file_contents::handle)
         }
+        #[cfg(feature = "action-grep_file_contents")]
+        GrepFileContents => {
+            handle(session, request, self::grep_file_contents::handle)
+        }
         #[cfg(feature = "action-get_filesystem_timeline")]
         GetFilesystemTimeline => {
             handle(session, request, self::get_filesystem_timeline::handle)
diff --git a/crates/rrg/src/action/grep_file_contents.rs b/crates/rrg/src/action/grep_file_contents.rs
new file mode 100644
index 00000000..66039171
--- /dev/null
+++ b/crates/rrg/src/action/grep_file_contents.rs
@@ -0,0 +1,225 @@
+// Copyright 2024 Google LLC
+//
+// Use of this source code is governed by an MIT-style license that can be found
+// in the LICENSE file or at https://opensource.org/licenses/MIT.
+
+use std::path::PathBuf;
+
+/// Arguments of the `grep_file_contents` action.
+pub struct Args {
+    /// Path to the file to grep the contents of.
+    path: PathBuf,
+    /// Regular expression to search for in the file contents.
+    regex: regex::Regex,
+}
+
+/// Result of the `grep_file_contents` action.
+pub struct Item {
+    /// Byte offset within the file from which the content matched.
+    offset: u64,
+    /// Content that matched the specified regular expression.
+    content: String,
+}
+
+/// Handles invocations of the `grep_file_contents` action.
+pub fn handle<S>(session: &mut S, args: Args) -> crate::session::Result<()>
+where
+    S: crate::session::Session,
+{
+    let file = std::fs::File::open(&args.path)
+        .map_err(crate::session::Error::action)?;
+
+    let mut file = std::io::BufReader::new(file);
+
+    // TODO(@panhania): Read to a buffer of predefined size so that we do not
+    // allow reading lines of arbitrary length.
+    let mut line = String::new();
+    let mut offset = 0;
+
+    loop {
+        use std::io::BufRead as _;
+
+        line.clear();
+        let len = match file.read_line(&mut line) {
+            Ok(0) => return Ok(()),
+            Ok(len) => len,
+            Err(error) => return Err(crate::session::Error::action(error)),
+        };
+
+        for matcz in args.regex.find_iter(&line) {
+            session.reply(Item {
+                offset: offset + matcz.start() as u64,
+                content: matcz.as_str().to_string(),
+            })?;
+        }
+
+        offset += len as u64;
+    }
+}
+
+impl crate::request::Args for Args {
+
+    type Proto = rrg_proto::grep_file_contents::Args;
+
+    fn from_proto(mut proto: Self::Proto) -> Result<Args, crate::request::ParseArgsError> {
+        use crate::request::ParseArgsError;
+
+        let path = PathBuf::try_from(proto.take_path())
+            .map_err(|error| ParseArgsError::invalid_field("path", error))?;
+
+        let regex = regex::Regex::new(proto.regex())
+            .map_err(|error| ParseArgsError::invalid_field("regex", error))?;
+
+        Ok(Args {
+            path,
+            regex,
+        })
+    }
+}
+
+impl crate::response::Item for Item {
+
+    type Proto = rrg_proto::grep_file_contents::Result;
+
+    fn into_proto(self) -> Self::Proto {
+        let mut proto = Self::Proto::default();
+        proto.set_offset(self.offset);
+        proto.set_content(self.content);
+
+        proto
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+
+    #[test]
+    fn handle_empty_file_non_empty_regex() {
+        let tempdir = tempfile::tempdir()
+            .unwrap();
+
+        std::fs::write(tempdir.path().join("file"), b"")
+            .unwrap();
+
+        let args = Args {
+            path: tempdir.path().join("file"),
+            regex: regex::Regex::new("").unwrap(),
+        };
+
+        let mut session = crate::session::FakeSession::new();
+        handle(&mut session, args)
+            .unwrap();
+
+        assert_eq!(session.reply_count(), 0);
+    }
+
+    #[test]
+    fn handle_regex_no_matches() {
+        let tempdir = tempfile::tempdir()
+            .unwrap();
+
+        std::fs::write(tempdir.path().join("file"), b"foo")
+            .unwrap();
+
+        let args = Args {
+            path: tempdir.path().join("file"),
+            regex: regex::Regex::new("bar").unwrap(),
+        };
+
+        let mut session = crate::session::FakeSession::new();
+        handle(&mut session, args)
+            .unwrap();
+
+        assert_eq!(session.reply_count(), 0);
+    }
+
+    #[test]
+    fn handle_regex_single_match() {
+        let tempdir = tempfile::tempdir()
+            .unwrap();
+
+        std::fs::write(tempdir.path().join("file"), b"bar")
+            .unwrap();
+
+        let args = Args {
+            path: tempdir.path().join("file"),
+            regex: regex::Regex::new("bar").unwrap(),
+        };
+
+        let mut session = crate::session::FakeSession::new();
+        handle(&mut session, args)
+            .unwrap();
+
+        assert_eq!(session.reply_count(), 1);
+
+        let item = session.reply::<Item>(0);
+        assert_eq!(item.offset, 0);
+        assert_eq!(item.content, "bar");
+    }
+
+    #[test]
+    fn handle_regex_multiple_matches_multiple_lines() {
+        let tempdir = tempfile::tempdir()
+            .unwrap();
+
+        std::fs::write(tempdir.path().join("file"), b"bar\nbas\nbaz\nbar")
+            .unwrap();
+
+        let args = Args {
+            path: tempdir.path().join("file"),
+            regex: regex::Regex::new("ba[rz]").unwrap(),
+        };
+
+        let mut session = crate::session::FakeSession::new();
+        handle(&mut session, args)
+            .unwrap();
+
+        assert_eq!(session.reply_count(), 3);
+
+        let item = session.reply::<Item>(0);
+        assert_eq!(item.offset, 0);
+        assert_eq!(item.content, "bar");
+
+        let item = session.reply::<Item>(1);
+        assert_eq!(item.offset, 8);
+        assert_eq!(item.content, "baz");
+
+        let item = session.reply::<Item>(2);
+        assert_eq!(item.offset, 12);
+        assert_eq!(item.content, "bar");
+    }
+
+    #[test]
+    fn handle_regex_multiple_matches_single_line() {
+        let tempdir = tempfile::tempdir()
+            .unwrap();
+
+        std::fs::write(tempdir.path().join("file"), b"bar bas baz bar")
+            .unwrap();
+
+        let args = Args {
+            path: tempdir.path().join("file"),
+            regex: regex::Regex::new("ba[rz]").unwrap(),
+        };
+
+        let mut session = crate::session::FakeSession::new();
+        handle(&mut session, args)
+            .unwrap();
+
+        assert_eq!(session.reply_count(), 3);
+
+        let item = session.reply::<Item>(0);
+        assert_eq!(item.offset, 0);
+        assert_eq!(item.content, "bar");
+
+        let item = session.reply::<Item>(1);
+        assert_eq!(item.offset, 8);
+        assert_eq!(item.content, "baz");
+
+        let item = session.reply::<Item>(2);
+        assert_eq!(item.offset, 12);
+        assert_eq!(item.content, "bar");
+    }
+}
diff --git a/crates/rrg/src/request.rs b/crates/rrg/src/request.rs
index 2f66eba6..83e86881 100644
--- a/crates/rrg/src/request.rs
+++ b/crates/rrg/src/request.rs
@@ -23,6 +23,8 @@ pub enum Action {
     GetFileContents,
     /// Get hash of the specified file.
     GetFileHash,
+    /// Grep the specified file for a pattern.
+    GrepFileContents,
     /// List contents of a directory.
     ListDirectory,
     /// List processes available on the system.
@@ -57,6 +59,7 @@ impl std::fmt::Display for Action {
             Action::GetFileMetadata => write!(fmt, "get_file_metadata"),
             Action::GetFileContents => write!(fmt, "get_file_contents"),
             Action::GetFileHash => write!(fmt, "get_file_hash"),
+            Action::GrepFileContents => write!(fmt, "grep_file_contents"),
             Action::ListDirectory => write!(fmt, "list_directory"),
             Action::ListProcesses => write!(fmt, "list_processes"),
             Action::ListConnections => write!(fmt, "list_connections"),
@@ -105,6 +108,7 @@ impl TryFrom for Action {
             GET_FILE_METADATA => Ok(Action::GetFileMetadata),
             GET_FILE_CONTENTS => Ok(Action::GetFileContents),
             GET_FILE_HASH => Ok(Action::GetFileHash),
+            GREP_FILE_CONTENTS => Ok(Action::GrepFileContents),
             LIST_DIRECTORY => Ok(Action::ListDirectory),
             LIST_PROCESSES => Ok(Action::ListProcesses),
             LIST_CONNECTIONS => Ok(Action::ListConnections),
diff --git a/proto/rrg.proto b/proto/rrg.proto
index c1079f6c..65fc1425 100644
--- a/proto/rrg.proto
+++ b/proto/rrg.proto
@@ -46,6 +46,8 @@ enum Action {
   LIST_WINREG_KEYS = 15;
   // Query WMI using WQL (Windows-only).
   QUERY_WMI = 16;
+  // Grep the specified file for a pattern.
+  GREP_FILE_CONTENTS = 17;
 
   // TODO: Define more actions that should be supported.
 
diff --git a/proto/rrg/action/grep_file_contents.proto b/proto/rrg/action/grep_file_contents.proto
new file mode 100644
index 00000000..425408a6
--- /dev/null
+++ b/proto/rrg/action/grep_file_contents.proto
@@ -0,0 +1,38 @@
+// Copyright 2024 Google LLC
+//
+// Use of this source code is governed by an MIT-style license that can be found
+// in the LICENSE file or at https://opensource.org/licenses/MIT.
+syntax = "proto3";
+
+package rrg.action.grep_file_contents;
+
+import "rrg/fs.proto";
+
+message Args {
+  // Absolute path to the file to grep the contents of.
+  //
+  // The file content must be valid UTF-8.
+  rrg.fs.Path path = 1;
+
+  // Regular expression to search for in the file contents.
+  //
+  // The specific syntax of the regex language is left unspecified as an
+  // implementation detail but most common regex features can be expected to
+  // be supported.
+  string regex = 2;
+
+  // TODO(@panhania): Add support for files that do not necessarily conform to
+  // Unicode.
+
+  // TODO(@panhania): Add support for different file encodings.
+}
+
+message Result {
+  // Byte offset within the file from which the content matched.
+  uint64 offset = 1;
+
+  // Content that matched the specified regular expression.
+  string content = 2;
+
+  // TODO(@panhania): Add support for capture groups.
+}