diff --git a/boreal-cli/src/main.rs b/boreal-cli/src/main.rs index 2140203f..d4e38db5 100644 --- a/boreal-cli/src/main.rs +++ b/boreal-cli/src/main.rs @@ -531,11 +531,11 @@ fn scan_file(scanner: &Scanner, path: &Path, options: &ScanOptions) -> Result<() let what = path.display().to_string(); match res { Ok(res) => { - display_scan_results(res, &what, options); + display_scan_results(scanner, res, &what, options); Ok(()) } Err((err, res)) => { - display_scan_results(res, &what, options); + display_scan_results(scanner, res, &what, options); Err(err) } } @@ -545,17 +545,17 @@ fn scan_process(scanner: &Scanner, pid: u32, options: &ScanOptions) -> Result<() let what = pid.to_string(); match scanner.scan_process(pid) { Ok(res) => { - display_scan_results(res, &what, options); + display_scan_results(scanner, res, &what, options); Ok(()) } Err((err, res)) => { - display_scan_results(res, &what, options); + display_scan_results(scanner, res, &what, options); Err(err) } } } -fn display_scan_results(res: ScanResult, what: &str, options: &ScanOptions) { +fn display_scan_results(scanner: &Scanner, res: ScanResult, what: &str, options: &ScanOptions) { // Print module data first if options.print_module_data { for (module_name, module_value) in res.module_values { @@ -595,7 +595,7 @@ fn display_scan_results(res: ScanResult, what: &str, options: &ScanOptions) { write!(stdout, " [{}]", rule.tags.join(",")).unwrap(); } if options.print_metadata { - print_metadata(&mut stdout, rule.metadatas); + print_metadata(&mut stdout, scanner, rule.metadatas); } writeln!(stdout, " {}", what).unwrap(); @@ -624,17 +624,17 @@ fn display_scan_results(res: ScanResult, what: &str, options: &ScanOptions) { } } -fn print_metadata(stdout: &mut StdoutLock, metadatas: &[Metadata]) { +fn print_metadata(stdout: &mut StdoutLock, scanner: &Scanner, metadatas: &[Metadata]) { write!(stdout, " [").unwrap(); for (i, meta) in metadatas.iter().enumerate() { if i != 0 { write!(stdout, ",").unwrap(); } - write!(stdout, "{}=", meta.name).unwrap(); - match &meta.value { + write!(stdout, "{}=", scanner.get_string_symbol(meta.name)).unwrap(); + match meta.value { MetadataValue::Bytes(b) => { write!(stdout, "\"").unwrap(); - print_bytes(stdout, b); + print_bytes(stdout, scanner.get_bytes_symbol(b)); write!(stdout, "\"").unwrap(); } MetadataValue::Integer(i) => { diff --git a/boreal/src/bytes_pool.rs b/boreal/src/bytes_pool.rs new file mode 100644 index 00000000..81f97454 --- /dev/null +++ b/boreal/src/bytes_pool.rs @@ -0,0 +1,76 @@ +/// Bytes intern pool. +#[derive(Default, Debug)] +pub(crate) struct BytesPool { + buffer: Vec, +} + +/// Symbol for a bytes string stored in a bytes intern pool. +#[derive(Copy, Clone, Debug)] +pub struct BytesSymbol { + from: usize, + to: usize, +} + +/// Symbol for a string stored in a bytes intern pool. +#[derive(Copy, Clone, Debug)] +pub struct StringSymbol { + from: usize, + to: usize, +} + +impl BytesPool { + /// Insert bytes into the bytes pool. + /// + /// The returned symbol can then be used to retrieve those bytes from the pool. + pub(crate) fn insert(&mut self, v: &[u8]) -> BytesSymbol { + let from = self.buffer.len(); + self.buffer.extend(v); + + BytesSymbol { + from, + to: self.buffer.len(), + } + } + + /// Insert a string into the bytes pool. + /// + /// The returned symbol can then be used to retrieve the string from the pool. + pub(crate) fn insert_str(&mut self, v: &str) -> StringSymbol { + let from = self.buffer.len(); + self.buffer.extend(v.as_bytes()); + + StringSymbol { + from, + to: self.buffer.len(), + } + } + + /// Get a byte string from the pool + pub(crate) fn get(&self, symbol: BytesSymbol) -> &[u8] { + &self.buffer[symbol.from..symbol.to] + } + + /// Get a string from the pool + pub(crate) fn get_str(&self, symbol: StringSymbol) -> &str { + // Safety: + // - A StringSymbol can only be constructed from `insert_str` + // - Once a symbol is created, it is guaranteed that the indexes in the symbol + // will always refer to the same bytes (the buffer can only grow). + // It is thus safe to rebuild the string from the stored bytes. + unsafe { std::str::from_utf8_unchecked(&self.buffer[symbol.from..symbol.to]) } + } +} + +#[cfg(test)] +mod tests { + use crate::test_helpers::{test_type_traits, test_type_traits_non_clonable}; + + use super::*; + + #[test] + fn test_types_traits() { + test_type_traits_non_clonable(BytesPool::default()); + test_type_traits(BytesSymbol { from: 0, to: 0 }); + test_type_traits(StringSymbol { from: 0, to: 0 }); + } +} diff --git a/boreal/src/compiler/mod.rs b/boreal/src/compiler/mod.rs index c6bab02a..f51646fa 100644 --- a/boreal/src/compiler/mod.rs +++ b/boreal/src/compiler/mod.rs @@ -21,6 +21,7 @@ pub use params::CompilerParams; pub(crate) mod rule; pub(crate) mod variable; +use crate::bytes_pool::BytesPool; use crate::{statistics, Scanner}; /// Object used to compile rules. @@ -59,6 +60,11 @@ pub struct Compiler { /// Externally defined symbols. external_symbols: Vec, + /// Bytes intern pool. + /// + /// This is used to reduce memory footprint and share byte strings. + bytes_pool: BytesPool, + /// Compilation parameters params: CompilerParams, } @@ -103,6 +109,7 @@ impl Default for Compiler { available_modules: HashMap::new(), imported_modules: Vec::new(), external_symbols: Vec::new(), + bytes_pool: BytesPool::default(), params: CompilerParams::default(), } } @@ -395,6 +402,7 @@ impl Compiler { &self.external_symbols, &self.params, parsed_contents, + &mut self.bytes_pool, ) .map_err(|error| AddRuleError { path: current_filepath.map(Path::to_path_buf), @@ -517,6 +525,7 @@ impl Compiler { self.imported_modules, self.external_symbols, namespaces, + self.bytes_pool, ) } } diff --git a/boreal/src/compiler/rule.rs b/boreal/src/compiler/rule.rs index d86822a3..7b45551f 100644 --- a/boreal/src/compiler/rule.rs +++ b/boreal/src/compiler/rule.rs @@ -8,6 +8,9 @@ use boreal_parser::rule; use super::expression::{compile_bool_expression, Expression, VariableIndex}; use super::external_symbol::ExternalSymbol; use super::{variable, CompilationError, CompilerParams, Namespace}; +use crate::bytes_pool::BytesPool; +use crate::bytes_pool::BytesSymbol; +use crate::bytes_pool::StringSymbol; use crate::module::Type as ModuleType; use crate::statistics; @@ -42,18 +45,22 @@ pub(crate) struct Rule { /// A metadata associated with a rule. #[derive(Debug)] pub struct Metadata { - /// Index of the name of the metadata in the metadata string table. - pub name: String, + /// Name of the metadata. + /// + /// Use [`boreal::Scanner::get_string_symbol`] to retrieve the string. + pub name: StringSymbol, /// The value of the metadata. pub value: MetadataValue, } /// Value of a rule metadata. -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Copy, Clone, Debug)] pub enum MetadataValue { /// Bytestring variant. - Bytes(Vec), + /// + /// Use [`boreal::Scanner::get_bytes_symbol`] to retrieve the string. + Bytes(BytesSymbol), /// Integer variant. Integer(i64), /// Boolean variant. @@ -228,6 +235,7 @@ pub(super) fn compile_rule( external_symbols: &Vec, params: &CompilerParams, parsed_contents: &str, + bytes_pool: &mut BytesPool, ) -> Result { // Check duplication of tags let mut tags_spans = HashMap::with_capacity(rule.tags.len()); @@ -271,9 +279,11 @@ pub(super) fn compile_rule( .metadatas .into_iter() .map(|rule::Metadata { name, value }| Metadata { - name, + name: bytes_pool.insert_str(&name), value: match value { - rule::MetadataValue::Bytes(v) => MetadataValue::Bytes(v), + rule::MetadataValue::Bytes(v) => { + MetadataValue::Bytes(bytes_pool.insert(&v)) + } rule::MetadataValue::Integer(v) => MetadataValue::Integer(v), rule::MetadataValue::Boolean(v) => MetadataValue::Boolean(v), }, @@ -338,8 +348,9 @@ mod tests { name: "a".to_owned(), used: false, }); + let mut pool = BytesPool::default(); test_type_traits_non_clonable(Metadata { - name: String::new(), + name: pool.insert_str(""), value: MetadataValue::Boolean(true), }); test_type_traits(MetadataValue::Boolean(true)); diff --git a/boreal/src/lib.rs b/boreal/src/lib.rs index 39454d8c..ba16485a 100644 --- a/boreal/src/lib.rs +++ b/boreal/src/lib.rs @@ -92,6 +92,8 @@ use tlsh2 as _; pub(crate) mod atoms; mod bitmaps; +mod bytes_pool; +pub use bytes_pool::{BytesSymbol, StringSymbol}; pub mod compiler; pub use compiler::rule::{Metadata, MetadataValue}; pub use compiler::Compiler; diff --git a/boreal/src/scanner/mod.rs b/boreal/src/scanner/mod.rs index 1bdee5d9..2a2818e2 100644 --- a/boreal/src/scanner/mod.rs +++ b/boreal/src/scanner/mod.rs @@ -3,6 +3,7 @@ use std::any::TypeId; use std::collections::HashMap; use std::sync::Arc; +use crate::bytes_pool::{BytesPool, BytesSymbol, StringSymbol}; use crate::compiler::external_symbol::{ExternalSymbol, ExternalValue}; use crate::compiler::rule::Rule; use crate::compiler::variable::Variable; @@ -107,6 +108,7 @@ impl Scanner { modules: Vec>, external_symbols: Vec, namespaces: Vec>, + bytes_pool: BytesPool, ) -> Self { let ac_scan = ac_scan::AcScan::new(&variables); @@ -130,6 +132,7 @@ impl Scanner { modules, external_symbols_map, namespaces, + bytes_pool, }), scan_params: ScanParams::default(), external_symbols_values, @@ -340,6 +343,18 @@ impl Scanner { pub fn scan_params(&self) -> &ScanParams { &self.scan_params } + + /// Get the value of a bytes symbol. + #[must_use] + pub fn get_bytes_symbol(&self, symbol: BytesSymbol) -> &[u8] { + self.inner.bytes_pool.get(symbol) + } + + /// Get the value of a string symbol. + #[must_use] + pub fn get_string_symbol(&self, symbol: StringSymbol) -> &str { + self.inner.bytes_pool.get_str(symbol) + } } #[derive(Debug)] @@ -378,6 +393,9 @@ struct Inner { /// /// None is used for the default namespace. namespaces: Vec>, + + /// Bytes intern pool. + bytes_pool: BytesPool, } impl Inner { @@ -1483,6 +1501,7 @@ mod tests { Vec::new(), Vec::new(), Vec::new(), + BytesPool::default(), )); test_type_traits_non_clonable(ScanResult { matched_rules: Vec::new(),