From e1b713c4a4b811dccf4b5385c7f0babb2b678ceb Mon Sep 17 00:00:00 2001 From: c3rb3ru5d3d53c Date: Thu, 26 Dec 2024 14:00:51 -0400 Subject: [PATCH] Control Flow Graph Vector Encoding - VecGraph (control flow graph) - VecNode (control flow node) - Adds support to encode control flow graphs to vectors - Can later be reduced for vector databases --- src/bindings/python/src/controlflow/block.rs | 15 ++ .../python/src/controlflow/function.rs | 76 +++------ src/bindings/python/src/types/mod.rs | 2 + src/bindings/python/src/types/vecnode.rs | 117 +++++++++----- src/controlflow/block.rs | 8 + src/controlflow/function.rs | 36 ++--- src/types/mod.rs | 1 + src/types/vecnode.rs | 153 ++++++++---------- 8 files changed, 204 insertions(+), 204 deletions(-) diff --git a/src/bindings/python/src/controlflow/block.rs b/src/bindings/python/src/controlflow/block.rs index d62bbf0..23f78e8 100644 --- a/src/bindings/python/src/controlflow/block.rs +++ b/src/bindings/python/src/controlflow/block.rs @@ -304,11 +304,26 @@ impl BlockJsonDeserializer { self.inner.lock().unwrap().edges() } + #[pyo3(text_signature = "($self)")] + pub fn blocks(&self) -> BTreeSet { + self.inner.lock().unwrap().blocks() + } + #[pyo3(text_signature = "($self)")] pub fn to(&self) -> BTreeSet { self.inner.lock().unwrap().to() } + #[pyo3(text_signature = "($self)")] + pub fn conditional(&self) -> bool { + self.inner.lock().unwrap().conditional() + } + + #[pyo3(text_signature = "($self)")] + pub fn entropy(&self) -> Option { + self.inner.lock().unwrap().entropy() + } + #[pyo3(text_signature = "($self)")] pub fn next(&self) -> Option { self.inner.lock().unwrap().next() diff --git a/src/bindings/python/src/controlflow/function.rs b/src/bindings/python/src/controlflow/function.rs index bc87101..8ea9d78 100644 --- a/src/bindings/python/src/controlflow/function.rs +++ b/src/bindings/python/src/controlflow/function.rs @@ -164,6 +164,8 @@ // permanent authorization for you to choose that version for the // Library. +use crate::controlflow::BlockJsonDeserializer; +use binlex::controlflow::BlockJsonDeserializer as InnerBlockJsonDeserializer; use pyo3::prelude::*; use pyo3::Py; use std::collections::BTreeMap; @@ -199,6 +201,28 @@ impl FunctionJsonDeserializer { }) } + #[pyo3(text_signature = "($self)")] + pub fn blocks(&self) -> Vec { + let mut result = Vec::::new(); + let blocks = self.inner.lock().unwrap().json.blocks.clone(); + let inner_config = self.inner.lock().unwrap().config.clone(); + for block_json in blocks { + let block_json_deserializer = BlockJsonDeserializer { + inner: Arc::new(Mutex::new(InnerBlockJsonDeserializer { + json: block_json, + config: inner_config.clone(), + })), + }; + result.push(block_json_deserializer) + } + result + } + + #[pyo3(text_signature = "($self)")] + pub fn functions(&self) -> BTreeMap { + self.inner.lock().unwrap().functions() + } + #[pyo3(text_signature = "($self)")] pub fn size(&self) -> usize { self.inner.lock().unwrap().size() @@ -563,58 +587,6 @@ impl Function { }) } - // #[pyo3(text_signature = "($self, rhs_functions)")] - // pub fn compare_many(&self, py: Python, rhs_functions: Py) -> PyResult> { - // self.with_inner_function(py, |function| { - // let mut tasks = Vec::<(u64, Arc>)>::new(); - - // let list = rhs_functions.bind(py); - - // let items: Vec> = list.iter().map(|item| item.into()).collect(); - - // for item in items { - // let py_item = item.bind(py); - // if !py_item.is_instance_of::() { - // return Err(pyo3::exceptions::PyTypeError::new_err( - // "all items in rhs_functions must be instances of Function", - // )); - // } - // let rhs: Option> = py_item.extract().ok(); - // if rhs.is_none() { continue; } - // let rhs_binding_0 = rhs.unwrap(); - // let rhs_binding_1 = rhs_binding_0.borrow(py); - // let address = rhs_binding_1.address(); - // let rhs_cfg = Arc::clone(&rhs_binding_1.cfg.borrow(py).inner); - // tasks.push((address, rhs_cfg)); - // }; - - // let pool = ThreadPoolBuilder::new() - // .num_threads(function.cfg.config.general.threads) - // .build() - // .map_err(|error| pyo3::exceptions::PyRuntimeError::new_err(format!("{}", error)))?; - - // let results: BTreeMap = pool.install(|| { - // tasks - // .par_iter() - // .filter_map(|(address, inner_cfg)| { - // let c = inner_cfg.lock().unwrap(); - // let rhs_function = InnerFunction::new(*address, &c).ok()?; - // let similarity = function.compare(&rhs_function).ok()?; - // similarity.map(|similarity| { - // ( - // *address, - // ChromosomeSimilarity { - // inner: Arc::new(Mutex::new(similarity)), - // }, - // ) - // }) - // }) - // .collect() - // }); - // Ok(results) - // }) - // } - #[pyo3(text_signature = "($self)")] pub fn chromosome_minhash_ratio(&self, py: Python) -> PyResult { self.with_inner_function(py, |function| { diff --git a/src/bindings/python/src/types/mod.rs b/src/bindings/python/src/types/mod.rs index 4d3e22b..5c73c56 100644 --- a/src/bindings/python/src/types/mod.rs +++ b/src/bindings/python/src/types/mod.rs @@ -175,6 +175,7 @@ use crate::types::lz4string::lz4string_init; pub use crate::types::memorymappedfile::MemoryMappedFile; pub use crate::types::lz4string::LZ4String; pub use crate::types::vecnode::VecNode; +pub use crate::types::vecnode::VecGraph; use pyo3::{prelude::*, wrap_pymodule}; @@ -187,6 +188,7 @@ pub fn types_init(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; py.import_bound("sys")? .getattr("modules")? .set_item("binlex.types", m)?; diff --git a/src/bindings/python/src/types/vecnode.rs b/src/bindings/python/src/types/vecnode.rs index e3a8972..cef3651 100644 --- a/src/bindings/python/src/types/vecnode.rs +++ b/src/bindings/python/src/types/vecnode.rs @@ -1,89 +1,117 @@ pub use binlex::types::VecNode as InnerVecNode; +pub use binlex::types::VecGraph as InnerVecGraph; use std::sync::{Arc, Mutex}; use pyo3::prelude::*; #[pyclass] -pub struct VecNode { - pub inner: Arc>, +pub struct VecGraph { + pub inner: Arc>, } #[pymethods] -impl VecNode { +impl VecGraph { #[new] - #[pyo3(text_signature = "(id)")] - pub fn new(id: u64) -> Self { - let inner = InnerVecNode::new(id); + #[pyo3(text_signature = "()")] + pub fn new() -> Self { Self { - inner: Arc::new(Mutex::new(inner)) + inner: Arc::new(Mutex::new(InnerVecGraph::new())) } } - #[pyo3(text_signature = "($self, child)")] - pub fn add_child(&self, py: Python, child: Py) { - let inner_child = child.borrow(py).inner.lock().unwrap().clone(); - self.inner.lock().unwrap().add_child(inner_child); + #[pyo3(text_signature = "($self, node)")] + pub fn insert_node(&mut self, py: Python, node: Py) { + let inner_node = node + .borrow(py) + .inner.lock() + .unwrap() + .clone(); + self.inner.lock().unwrap().insert_node(inner_node); + } + + #[pyo3(text_signature = "($self, id)")] + pub fn get_node(&self, id: u64) -> Option { + let binding = self.inner.lock().unwrap(); + let inner_node = binding.get_node(id)?; + let node = VecNode { + inner: Arc::new(Mutex::new(inner_node.clone())) + }; + Some(node) } - #[pyo3(text_signature = "($self, parent)")] - pub fn add_parent(&self, py: Python, parent: Py) { - let inner_parent = parent.borrow(py).inner.lock().unwrap().clone(); - self.inner.lock().unwrap().add_parent(inner_parent); + #[pyo3(text_signature = "($self, node1_id, node2_id)")] + pub fn add_relationship(&mut self, node1_id: u64, node2_id: u64) { + let mut binding = self.inner + .lock() + .unwrap(); + binding.add_relationship(node1_id, node2_id) } #[pyo3(text_signature = "($self)")] - pub fn children(&self) -> Vec { - let mut result = Vec::::new(); - for child in self.inner.lock().unwrap().children() { - let a = VecNode { - inner: Arc::new(Mutex::new(child.clone())) - }; - result.push(a); - } - result + pub fn to_vec(&self) -> Vec { + self.inner.lock().unwrap().to_vec() } #[pyo3(text_signature = "($self)")] - pub fn parents(&self) -> Vec { - let mut result = Vec::::new(); - for parent in self.inner.lock().unwrap().parents() { - let a = VecNode { - inner: Arc::new(Mutex::new(parent.clone())) - }; - result.push(a); + pub fn print(&self) { + self.inner + .lock() + .unwrap() + .print() + } + +} + +#[pyclass] +pub struct VecNode { + pub inner: Arc>, +} + +#[pymethods] +impl VecNode { + #[new] + #[pyo3(text_signature = "(id)")] + pub fn new(id: u64) -> Self { + let inner = InnerVecNode::new(id); + Self { + inner: Arc::new(Mutex::new(inner)) } - result } - #[pyo3(text_signature = "($self, key, value)")] - pub fn add_property(&mut self, key: String, value: f64) { + #[pyo3(text_signature = "($self)")] + pub fn id(&self) -> u64 { + self.inner.lock().unwrap().id() + } + + #[pyo3(text_signature = "($self)")] + pub fn relationships(&self) -> Vec { self.inner .lock() .unwrap() - .add_property(&key, value) + .relationships() + .clone() } - #[pyo3(text_signature = "($self, key, values)")] - pub fn add_properties(&mut self, key: String, values: Vec) { + pub fn add_relationship(&self, id: u64) { self.inner .lock() .unwrap() - .add_properties(&key, values) + .add_relationship(id) } - #[pyo3(text_signature = "($self)")] - pub fn print(&self) { + #[pyo3(text_signature = "($self, key, value)")] + pub fn add_property(&mut self, key: String, value: f64) { self.inner .lock() .unwrap() - .print() + .add_property(&key, value) } - #[pyo3(text_signature = "($self)")] - pub fn to_vec(&self) -> Vec { + #[pyo3(text_signature = "($self, key, values)")] + pub fn add_properties(&mut self, key: String, values: Vec) { self.inner .lock() .unwrap() - .to_vec() + .add_properties(&key, values) } } @@ -91,6 +119,7 @@ impl VecNode { #[pyo3(name = "vecnode")] pub fn vecnode_init(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; + m.add_class::()?; py.import_bound("sys")? .getattr("modules")? .set_item("binlex.types.vecnode", m)?; diff --git a/src/controlflow/block.rs b/src/controlflow/block.rs index db8846d..a9b9c43 100644 --- a/src/controlflow/block.rs +++ b/src/controlflow/block.rs @@ -215,6 +215,8 @@ pub struct BlockJson { pub bytes: String, /// A map of function addresses related to this block. pub functions: BTreeMap, + // Blocks this blocks has as children. + pub blocks: BTreeSet, /// The number of instructions in this block. pub number_of_instructions: usize, /// Instructions assocated with this block. @@ -280,6 +282,11 @@ impl BlockJsonDeserializer { }) } + #[allow(dead_code)] + pub fn blocks(&self) -> BTreeSet { + self.json.blocks.clone() + } + #[allow(dead_code)] pub fn edges(&self) -> usize { self.json.edges @@ -522,6 +529,7 @@ impl<'block> Block<'block> { number_of_instructions: self.number_of_instructions(), instructions: self.instructions_json(), functions: self.functions(), + blocks: self.blocks(), entropy: self.entropy(), sha256: self.sha256(), minhash: self.minhash(), diff --git a/src/controlflow/function.rs b/src/controlflow/function.rs index 38ded57..a4d1e42 100644 --- a/src/controlflow/function.rs +++ b/src/controlflow/function.rs @@ -269,6 +269,18 @@ impl FunctionJsonDeserializer { self.json.address } + pub fn blocks(&self) -> Vec { + let mut result = Vec::::new(); + for block in &self.json.blocks { + let block_json_seserializer = BlockJsonDeserializer { + json: block.clone(), + config: self.config.clone(), + }; + result.push(block_json_seserializer); + } + result + } + #[allow(dead_code)] pub fn bytes(&self) -> Option> { if self.json.bytes.is_none() { return None; } @@ -301,24 +313,8 @@ impl FunctionJsonDeserializer { } #[allow(dead_code)] - pub fn blocks(&self) -> Result, Error> { - let mut result = Vec::::new(); - for block in &self.json.blocks { - let string = match serde_json::to_string(block) { - Ok(string) => string, - Err(error) => { - return Err(Error::new(ErrorKind::InvalidData, format!("{}", error))); - } - }; - let blockjsondeserializer= match BlockJsonDeserializer::new(string, self.config.clone()) { - Ok(blockjsondeserializer) => blockjsondeserializer, - Err(error) => { - return Err(Error::new(ErrorKind::InvalidData, format!("{}", error))); - } - }; - result.push(blockjsondeserializer); - } - Ok(result) + pub fn functions(&self) -> BTreeMap { + self.json.functions.clone() } #[allow(dead_code)] @@ -380,11 +376,11 @@ impl FunctionJsonDeserializer { let mut minhashes = Vec::::new(); let mut tls_values = Vec::::new(); - for lhs_block in self.blocks()? { + for lhs_block in self.blocks() { let mut best_minhash: Option = None; let mut best_tls: Option = None; - let results = match lhs_block.compare_many(rhs.blocks()?) { + let results = match lhs_block.compare_many(rhs.blocks()) { Ok(results) => results, Err(error) => { return Err(Error::new(ErrorKind::InvalidData, format!("{}", error))); diff --git a/src/types/mod.rs b/src/types/mod.rs index 7564f97..f58fac2 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -171,3 +171,4 @@ pub mod vecnode; pub use lz4string::LZ4String; pub use memorymappedfile::MemoryMappedFile; pub use vecnode::VecNode; +pub use vecnode::VecGraph; diff --git a/src/types/vecnode.rs b/src/types/vecnode.rs index eed04ca..a370586 100644 --- a/src/types/vecnode.rs +++ b/src/types/vecnode.rs @@ -1,11 +1,10 @@ -use std::collections::{BTreeMap, HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap}; #[derive(Debug, Clone)] pub struct VecNode { id: u64, properties: BTreeMap>, - children: Vec, - parents: Vec, + relationships: Vec, } impl VecNode { @@ -13,8 +12,7 @@ impl VecNode { VecNode { id, properties: BTreeMap::new(), - children: Vec::new(), - parents: Vec::new(), + relationships: Vec::new(), } } @@ -22,12 +20,8 @@ impl VecNode { self.id } - pub fn children(&self) -> &Vec { - &self.children - } - - pub fn parents(&self) -> &Vec { - &self.parents + pub fn relationships(&self) -> &Vec { + &self.relationships } pub fn add_property(&mut self, key: &str, value: f64) { @@ -44,106 +38,89 @@ impl VecNode { .extend(values); } - pub fn add_child(&mut self, mut child: VecNode) { - if !self.children.iter().any(|c| c.id == child.id) { - self.children.push(child.clone()); - } - - if !child.parents.iter().any(|p| p.id == self.id) { - child.parents.push(self.clone()); + pub fn add_relationship(&mut self, id: u64) { + if !self.relationships.contains(&id) { + self.relationships.push(id); } } +} - pub fn add_parent(&mut self, mut parent: VecNode) { - if !self.parents.iter().any(|p| p.id == parent.id) { - self.parents.push(parent.clone()); - } +#[derive(Debug)] +pub struct VecGraph { + nodes: HashMap, +} - if !parent.children.iter().any(|c| c.id == self.id) { - parent.children.push(self.clone()); +impl VecGraph { + pub fn new() -> Self { + VecGraph { + nodes: HashMap::new(), } } - pub fn print(&self) { - self.print_internal(0); + pub fn insert_node(&mut self, node: VecNode) { + self.nodes.insert(node.id(), node); } - fn print_internal(&self, depth: usize) { - let indent = " ".repeat(depth); - println!("{}Node ID: {}", indent, self.id); + pub fn get_node(&self, id: u64) -> Option<&VecNode> { + self.nodes.get(&id) + } - if self.properties.is_empty() { - println!("{} Properties: None", indent); - } else { - println!("{} Properties:", indent); - for (key, values) in &self.properties { - println!("{} {}: {:?}", indent, key, values); - } + pub fn add_relationship(&mut self, node1_id: u64, node2_id: u64) { + if let Some(node1) = self.nodes.get_mut(&node1_id) { + node1.add_relationship(node2_id); } - - if self.parents.is_empty() { - println!("{} Parents: None", indent); - } else { - println!( - "{} Parents: {:?}", - indent, - self.parents.iter().map(|p| p.id).collect::>() - ); + if let Some(node2) = self.nodes.get_mut(&node2_id) { + node2.add_relationship(node1_id); } + } - if self.children.is_empty() { - println!("{} Children: None", indent); - } else { - println!("{} Children:", indent); - for child in &self.children { - child.print_internal(depth + 1); + pub fn print(&self) { + for node in self.nodes.values() { + println!("Node ID: {}", node.id); + println!(" Properties:"); + for (key, values) in &node.properties { + println!(" {}: {:?}", key, values); } + println!(" Relationships: {:?}", node.relationships); } } pub fn to_vec(&self) -> Vec { - fn encode_tree( - node: &VecNode, - depth: usize, - visited: &mut HashSet, - id_to_index: &mut HashMap, - next_index: &mut usize, - ) -> Vec { - let mut node_vector = Vec::new(); - - if visited.contains(&node.id) { - let reference_index = *id_to_index.get(&node.id).unwrap(); - node_vector.push(reference_index as f64); - return node_vector; - } - - visited.insert(node.id); - - let index = *id_to_index.entry(node.id).or_insert_with(|| { - let current = *next_index; - *next_index += 1; - current - }); + let mut graph_vector = Vec::new(); + let mut visited = HashMap::new(); + let mut next_index = 0; - node_vector.push(index as f64); - node_vector.push(depth as f64); - for values in node.properties.values() { - node_vector.extend(values); + for node in self.nodes.values() { + if !visited.contains_key(&node.id) { + self.encode_node(node, &mut graph_vector, &mut visited, &mut next_index); } + } - for child in &node.children { - let child_vector = - encode_tree(child, depth + 1, visited, id_to_index, next_index); - node_vector.extend(child_vector); - } + graph_vector + } - node_vector + fn encode_node( + &self, + node: &VecNode, + graph_vector: &mut Vec, + visited: &mut HashMap, + next_index: &mut usize, + ) { + let index = *visited.entry(node.id).or_insert_with(|| { + let current = *next_index; + *next_index += 1; + current + }); + + graph_vector.push(index as f64); + for values in node.properties.values() { + graph_vector.extend(values); } - let mut visited = HashSet::new(); - let mut id_to_index = HashMap::new(); - let mut next_index = 0; - - encode_tree(self, 0, &mut visited, &mut id_to_index, &mut next_index) + for related_id in &node.relationships { + if let Some(related_index) = visited.get(related_id) { + graph_vector.push(*related_index as f64); + } + } } }