diff --git a/benches/benches_main.rs b/benches/benches_main.rs index 626b1d2..d32eabe 100755 --- a/benches/benches_main.rs +++ b/benches/benches_main.rs @@ -26,6 +26,25 @@ fn shannon_entropy(c: &mut Criterion) { group.finish(); } -criterion_group!(benches, shannon_entropy); +fn chi_square_probability(c: &mut Criterion) { + let mut sample = vec![0u8; 1 * MB]; + StdRng::seed_from_u64(5).fill(&mut sample[..]); + + let mut group = c.benchmark_group("Chi square probability"); + + for sample_size in [256, 1 * kB, 64 * kB, 256 * kB, 1 * MB] { + group.throughput(Throughput::Bytes(sample_size as u64)); + group.bench_with_input( + BenchmarkId::from_parameter(sample_size), + &sample_size, + |b, &size| { + b.iter(|| unblob_native::math_tools::chi_square_probability(&sample[0..size])); + }, + ); + } + group.finish(); +} + +criterion_group!(benches, shannon_entropy, chi_square_probability); criterion_main!(benches); diff --git a/python/unblob_native/math_tools.pyi b/python/unblob_native/math_tools.pyi index 06acd28..b7a3c7c 100644 --- a/python/unblob_native/math_tools.pyi +++ b/python/unblob_native/math_tools.pyi @@ -1 +1,2 @@ def shannon_entropy(data: bytes) -> float: ... +def chi_square_probability(data: bytes) -> float: ... 
diff --git a/src/math_tools.rs b/src/math_tools.rs index e5eb118..b26b821 100644 --- a/src/math_tools.rs +++ b/src/math_tools.rs @@ -1,4 +1,5 @@ use pyo3::prelude::*; +use statrs::distribution::{ChiSquared, ContinuousCDF}; pub fn shannon_entropy(data: &[u8]) -> f64 { let mut entropy = 0.0; @@ -25,9 +26,47 @@ pub fn py_shannon_entropy(py: Python, data: &[u8]) -> PyResult<f64> { py.allow_threads(|| Ok(shannon_entropy(data))) } +pub fn chi_square_probability(data: &[u8]) -> f64 { + if data.is_empty() { + return 0.0; + } + + // Total number of possible byte values (0–255) + let num_bins = 256; + let expected_count = data.len() as f64 / num_bins as f64; + + // Frequency count for each byte value + let mut frequencies = [0u32; 256]; + for &byte in data { + frequencies[byte as usize] += 1; + } + + // Calculate chi-square statistic + let chi_square: f64 = frequencies + .iter() + .map(|&obs| { + let observed = obs as f64; + (observed - expected_count).powi(2) / expected_count + }) + .sum(); + + // Degrees of freedom: 255 (256 bins - 1) + let degrees_of_freedom = (num_bins - 1) as f64; + let chi_squared = ChiSquared::new(degrees_of_freedom).unwrap(); + + // Compute p-value (chi-square probability) + 1.0 - chi_squared.cdf(chi_square) +} +/// Calculates Chi Square of data +#[pyfunction(name = "chi_square_probability")] +pub fn py_chi_square_probability(py: Python, data: &[u8]) -> PyResult<f64> { + py.allow_threads(|| Ok(chi_square_probability(data))) +} + pub fn init_module(root_module: &Bound<'_, PyModule>) -> PyResult<()> { let module = PyModule::new_bound(root_module.py(), "math_tools")?; module.add_function(wrap_pyfunction!(py_shannon_entropy, &module)?)?; + module.add_function(wrap_pyfunction!(py_chi_square_probability, &module)?)?; root_module.add_submodule(&module)?; @@ -46,10 +85,59 @@ mod tests { use super::*; - #[test] - fn test_shannon_entropy() { - let input = b"000111"; // 50% entropy distribution ~ 1 bit information + mod shannon { + use super::*; + + #[test] + fn 
test_shannon_entropy() { + let input = b"000111"; // 50% entropy distribution ~ 1 bit information - assert_relative_eq!(shannon_entropy(input), 1.0); + assert_relative_eq!(shannon_entropy(input), 1.0); + } + } + + mod chi_square { + use super::*; + use rand_core::{OsRng, RngCore}; + + #[test] + fn test_uniform_random_data() { + let mut random_data = [0u8; 4096]; + OsRng.fill_bytes(&mut random_data); + let chi_square_value = chi_square_probability(&random_data); + + // Expect chi-square to be relatively low for uniform random data + assert!( + chi_square_value < 1.0 && chi_square_value > 0.0, + "Chi-square probability for random data was not within bounds: {}", + chi_square_value + ); + } + + #[test] + fn test_non_random_data() { + // Create non-random data (all values the same) + let non_random_data = vec![42u8; 4096]; + let chi_square_value = chi_square_probability(&non_random_data); + + assert!( + chi_square_value == 0.0, + "Chi-square probability for non-random data was too high: {}", + chi_square_value + ); + } + + #[test] + fn test_empty_data() { + // Edge case for empty data + let empty_data: Vec<u8> = Vec::new(); + let chi_square_value = chi_square_probability(&empty_data); + + // For empty data, chi-square should handle gracefully and return 0.0 + assert_eq!( + chi_square_value, 0.0, + "Chi-square for empty data should be 0.0" + ); + } } } diff --git a/tests/test_math.py b/tests/test_math.py index e14d354..6e2da51 100644 --- a/tests/test_math.py +++ b/tests/test_math.py @@ -2,6 +2,9 @@ from unblob_native import math_tools +UNIFORM_DISTRIBUTION = bytes(x for x in range(256)) +NON_UNIFORM_DISTRIBUTION = bytes([0] * 256) + @pytest.mark.parametrize( "data,entropy", @@ -15,3 +18,22 @@ ) def test_shannon_entropy(data: bytes, entropy: float): assert math_tools.shannon_entropy(data) == pytest.approx(entropy) + + +@pytest.mark.parametrize( + "data,chi_square_value", + [ + pytest.param(b"", 0, id="empty"), + pytest.param(UNIFORM_DISTRIBUTION, 1.0, id="uniform 
distribution"), + pytest.param(NON_UNIFORM_DISTRIBUTION, 0.0, id="non uniform distribution"), + pytest.param( + UNIFORM_DISTRIBUTION + NON_UNIFORM_DISTRIBUTION, + 0.0, + id="partially uniform distribution", + ), + ], +) +def test_chi_square_entropy(data: bytes, chi_square_value: float): + assert math_tools.chi_square_probability(data) == pytest.approx( + chi_square_value, abs=1e-4 + )