Skip to content

Commit

Permalink
feat: factorize values in bytes pool
Browse files Browse the repository at this point in the history
Add a map on top of the bytes pool to avoid storing the same string
multiple times. This is extremely useful for metadata keys, which tend
to always be the same in a set of rules.
  • Loading branch information
vthib committed Jul 28, 2024
1 parent bb1e4c6 commit 1faa205
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 11 deletions.
86 changes: 82 additions & 4 deletions boreal/src/bytes_pool.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,32 @@
/// Bytes intern pool
///
/// This module defines the [`BytesPool`] and its builder type [`BytesPoolBuilder`].
///
/// This object is used to reduce the memory consumption of compiled rules, by
/// storing all bytes & strings literals used in rules (excluding those from variables,
/// or "strings" in YARA terms, but that is just confusing). This is mainly used
/// for metadata keys and values, but also for literals used in conditions.
///
/// Memory consumption is reduced thanks to two simple points.
///
/// - A single allocation is used, reducing memory fragmentation and allocation overhead.
/// - Added bytes are deduplicated. This is especially useful for metadata key names,
/// for example, which tend to always be the same ones in a set of rules.
use std::collections::HashMap;

/// Bytes intern pool.
#[derive(Default, Debug)]
///
/// This object is used to store bytes in a single place to reduce memory consumption.
///
/// The implementation is extremely naive:
///
/// - A single Vec is used to store the bytes, all added bytes are appended to the vec.
/// - Handles (or symbols as named here) are simply the offsets into the vec.
///
/// Some other implementations could be attempted to improve memory consumption further.
/// For example, by adding a second vec to map an index to the (from, to) pair, so that the
/// symbol can be a single usize.
#[derive(Debug, Default)]
pub(crate) struct BytesPool {
/// Backing storage: every inserted byte string is appended at the end of this vec.
buffer: Vec<u8>,
}
Expand All @@ -22,7 +49,7 @@ impl BytesPool {
/// Insert bytes into the bytes pool.
///
/// The returned symbol can then be used to retrieve those bytes from the pool.
pub(crate) fn insert(&mut self, v: &[u8]) -> BytesSymbol {
fn insert(&mut self, v: &[u8]) -> BytesSymbol {
let from = self.buffer.len();
self.buffer.extend(v);

Expand All @@ -35,7 +62,7 @@ impl BytesPool {
/// Insert a string into the bytes pool.
///
/// The returned symbol can then be used to retrieve the string from the pool.
pub(crate) fn insert_str(&mut self, v: &str) -> StringSymbol {
fn insert_str(&mut self, v: &str) -> StringSymbol {
let from = self.buffer.len();
self.buffer.extend(v.as_bytes());

Expand All @@ -61,6 +88,56 @@ impl BytesPool {
}
}

/// A builder for the [`BytesPool`] object.
///
/// This builder will deduplicate bytes added in the pool to reduce
/// the memory usage of the final pool.
///
/// The deduplication maps own a copy of every distinct value inserted. They only
/// live as long as the builder: `into_pool` consumes the builder and returns the
/// pool alone.
#[derive(Default, Debug)]
pub(crate) struct BytesPoolBuilder {
/// The pool being constructed.
pool: BytesPool,
/// Map from each byte string already added in the pool to the symbol that
/// was returned when it was first inserted.
bytes_map: HashMap<Vec<u8>, BytesSymbol>,
/// Map from each string already added in the pool to the symbol that
/// was returned when it was first inserted.
str_map: HashMap<String, StringSymbol>,
}

impl BytesPoolBuilder {
    /// Intern a byte string and return the symbol designating it in the pool.
    ///
    /// A byte string that was already interned through this builder is not
    /// stored a second time: the symbol produced by its first insertion is
    /// returned instead.
    pub(crate) fn insert(&mut self, v: &[u8]) -> BytesSymbol {
        // Fast path: these exact bytes were already added.
        if let Some(known) = self.bytes_map.get(v) {
            return *known;
        }

        // First occurrence: store the bytes, then remember their symbol.
        let symbol = self.pool.insert(v);
        let _r = self.bytes_map.insert(v.to_vec(), symbol);
        symbol
    }

    /// Intern a string and return the symbol designating it in the pool.
    ///
    /// A string that was already interned through this builder is not stored
    /// a second time: the symbol produced by its first insertion is returned
    /// instead.
    pub(crate) fn insert_str(&mut self, v: &str) -> StringSymbol {
        // Fast path: this exact string was already added.
        if let Some(known) = self.str_map.get(v) {
            return *known;
        }

        // First occurrence: store the string, then remember its symbol.
        let symbol = self.pool.insert_str(v);
        let _r = self.str_map.insert(v.to_owned(), symbol);
        symbol
    }

    /// Consume the builder and return the finished bytes pool.
    ///
    /// The deduplication maps are dropped, and the pool buffer is asked to
    /// release capacity it does not use.
    pub(crate) fn into_pool(self) -> BytesPool {
        let Self { mut pool, .. } = self;
        pool.buffer.shrink_to_fit();
        pool
    }
}

#[cfg(test)]
mod tests {
use crate::test_helpers::{test_type_traits, test_type_traits_non_clonable};
Expand All @@ -69,7 +146,8 @@ mod tests {

#[test]
fn test_types_traits() {
test_type_traits_non_clonable(BytesPool::default());
test_type_traits_non_clonable(BytesPoolBuilder::default());
test_type_traits_non_clonable(BytesPoolBuilder::default().into_pool());
test_type_traits(BytesSymbol { from: 0, to: 0 });
test_type_traits(StringSymbol { from: 0, to: 0 });
}
Expand Down
8 changes: 4 additions & 4 deletions boreal/src/compiler/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ pub use params::CompilerParams;
pub(crate) mod rule;
pub(crate) mod variable;

use crate::bytes_pool::BytesPool;
use crate::bytes_pool::BytesPoolBuilder;
use crate::{statistics, Scanner};

/// Object used to compile rules.
Expand Down Expand Up @@ -63,7 +63,7 @@ pub struct Compiler {
/// Bytes intern pool.
///
/// This is used to reduce memory footprint and share byte strings.
bytes_pool: BytesPool,
bytes_pool: BytesPoolBuilder,

/// Compilation parameters
params: CompilerParams,
Expand Down Expand Up @@ -109,7 +109,7 @@ impl Default for Compiler {
available_modules: HashMap::new(),
imported_modules: Vec::new(),
external_symbols: Vec::new(),
bytes_pool: BytesPool::default(),
bytes_pool: BytesPoolBuilder::default(),
params: CompilerParams::default(),
}
}
Expand Down Expand Up @@ -525,7 +525,7 @@ impl Compiler {
self.imported_modules,
self.external_symbols,
namespaces,
self.bytes_pool,
self.bytes_pool.into_pool(),
)
}
}
Expand Down
6 changes: 3 additions & 3 deletions boreal/src/compiler/rule.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use boreal_parser::rule;
use super::expression::{compile_bool_expression, Expression, VariableIndex};
use super::external_symbol::ExternalSymbol;
use super::{variable, CompilationError, CompilerParams, Namespace};
use crate::bytes_pool::BytesPool;
use crate::bytes_pool::BytesPoolBuilder;
use crate::bytes_pool::BytesSymbol;
use crate::bytes_pool::StringSymbol;
use crate::module::Type as ModuleType;
Expand Down Expand Up @@ -235,7 +235,7 @@ pub(super) fn compile_rule(
external_symbols: &Vec<ExternalSymbol>,
params: &CompilerParams,
parsed_contents: &str,
bytes_pool: &mut BytesPool,
bytes_pool: &mut BytesPoolBuilder,
) -> Result<CompiledRule, CompilationError> {
// Check duplication of tags
let mut tags_spans = HashMap::with_capacity(rule.tags.len());
Expand Down Expand Up @@ -348,7 +348,7 @@ mod tests {
name: "a".to_owned(),
used: false,
});
let mut pool = BytesPool::default();
let mut pool = BytesPoolBuilder::default();
test_type_traits_non_clonable(Metadata {
name: pool.insert_str(""),
value: MetadataValue::Boolean(true),
Expand Down

0 comments on commit 1faa205

Please sign in to comment.