Skip to content

Commit

Permalink
feat(math): add chi square calculation function.
Browse files Browse the repository at this point in the history
Add chi_square_probability function to math_tools module. This function
returns the Chi Square distribution probability.

Chi-square tests are effective for distinguishing compressed from
encrypted data because they evaluate the uniformity of byte
distributions more rigorously than Shannon entropy.

In compressed files, bytes often cluster around certain values due to
patterns that still exist (albeit less detectable), resulting in a
non-uniform distribution. Encrypted data, by contrast, exhibits nearly
perfect uniformity, as each byte value from 0–255 is expected to appear
with almost equal frequency, making it harder to detect any discernible
patterns.

The chi-square distribution is calculated for the stream of bytes in the
chunk and expressed as an absolute number and a percentage which
indicates how frequently a truly random sequence would exceed the value
calculated. The percentage is the only value that is of interest from
unblob's perspective, so that's why we only return it.

According to the ent documentation [0]:

> We [can] interpret the percentage as the degree to which the
> sequence tested is suspected of being non-random. If the percentage is
> greater than 99% or less than 1%, the sequence is almost certainly not
> random. If the percentage is between 99% and 95% or between 1% and 5%,
> the sequence is suspect. Percentages between 90% and 95% and 5% and 10%
> indicate the sequence is “almost suspect”.

[0] - https://www.fourmilab.ch/random/
  • Loading branch information
qkaiser committed Oct 27, 2024
1 parent 417567f commit bec8800
Show file tree
Hide file tree
Showing 3 changed files with 116 additions and 4 deletions.
1 change: 1 addition & 0 deletions python/unblob_native/math_tools.pyi
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
# Shannon entropy of ``data`` (the Rust unit test expects 1.0 for b"000111").
def shannon_entropy(data: bytes) -> float: ...
# Chi-square probability (p-value) of the byte distribution in ``data``; 0.0 for empty input.
def chi_square_probability(data: bytes) -> float: ...
97 changes: 93 additions & 4 deletions src/math_tools.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use pyo3::prelude::*;
use statrs::distribution::{ChiSquared, ContinuousCDF};

pub fn shannon_entropy(data: &[u8]) -> f64 {
let mut entropy = 0.0;
Expand All @@ -25,9 +26,47 @@ pub fn py_shannon_entropy(py: Python, data: &[u8]) -> PyResult<f64> {
py.allow_threads(|| Ok(shannon_entropy(data)))
}

/// Returns the chi-square probability (p-value) of the byte distribution in `data`.
///
/// The observed count of each of the 256 possible byte values is compared with the
/// count a perfectly uniform distribution would produce (`data.len() / 256`). The
/// resulting chi-square statistic is evaluated against a chi-squared distribution
/// with 255 degrees of freedom, and the upper-tail probability `1 - CDF(statistic)`
/// is returned.
///
/// The result lies in `[0.0, 1.0]`: values close to `0.0` mean the bytes deviate
/// strongly from uniformity, values close to `1.0` mean they match it almost exactly.
///
/// Returns `0.0` when `data` is empty, since there is no distribution to evaluate.
pub fn chi_square_probability(data: &[u8]) -> f64 {
    // Total number of possible byte values (0–255)
    const NUM_BINS: usize = 256;

    if data.is_empty() {
        return 0.0;
    }

    // Frequency count for each byte value. The counters are 64-bit because on 64-bit
    // targets a slice can contain more than `u32::MAX` occurrences of a single byte
    // value; a `u32` counter would panic in debug builds and silently wrap in release
    // builds, corrupting the statistic.
    let mut frequencies = [0u64; NUM_BINS];
    for &byte in data {
        frequencies[usize::from(byte)] += 1;
    }

    // Count every bin would hold if the bytes were perfectly uniformly distributed.
    let expected_count = data.len() as f64 / NUM_BINS as f64;

    // Chi-square statistic: sum over all bins of (observed - expected)^2 / expected.
    let chi_square: f64 = frequencies
        .iter()
        .map(|&occurrences| {
            let observed = occurrences as f64;
            (observed - expected_count).powi(2) / expected_count
        })
        .sum();

    // Degrees of freedom: 255 (256 bins - 1)
    let degrees_of_freedom = (NUM_BINS - 1) as f64;
    let chi_squared = ChiSquared::new(degrees_of_freedom)
        .expect("255 degrees of freedom is a valid chi-squared parameter");

    // Compute p-value (chi-square probability): the upper tail of the distribution.
    1.0 - chi_squared.cdf(chi_square)
}
/// Calculates the Chi Square probability of data
#[pyfunction(name = "chi_square_probability")]
pub fn py_chi_square_probability(py: Python, data: &[u8]) -> PyResult<f64> {
    // Run the pure-Rust computation through `allow_threads` (which, per the PyO3
    // documentation, releases the GIL while the closure runs), then wrap the result.
    let probability = py.allow_threads(|| chi_square_probability(data));
    Ok(probability)
}

pub fn init_module(root_module: &Bound<'_, PyModule>) -> PyResult<()> {
let module = PyModule::new_bound(root_module.py(), "math_tools")?;
module.add_function(wrap_pyfunction!(py_shannon_entropy, &module)?)?;
module.add_function(wrap_pyfunction!(py_chi_square_probability, &module)?)?;

root_module.add_submodule(&module)?;

Expand All @@ -43,13 +82,63 @@ pub fn init_module(root_module: &Bound<'_, PyModule>) -> PyResult<()> {
#[cfg(test)]
mod tests {
use approx::assert_relative_eq;
use rand::Rng;

use super::*;

#[test]
fn test_shannon_entropy() {
let input = b"000111"; // 50% entropy distribution ~ 1 bit information
mod shannon {
use super::*;

#[test]
fn test_shannon_entropy() {
    // Two distinct symbols, each making up 50% of the input ~ 1 bit of information.
    let input = b"000111";

    // A single assertion is enough; the second identical one was a leftover duplicate.
    assert_relative_eq!(shannon_entropy(input), 1.0);
}
}

mod chi_square {
use super::*;

#[test]
fn test_uniform_random_data() {
    // For genuinely random input the p-value is itself (approximately) uniformly
    // distributed on [0, 1], so a single sample falls outside (0.1, 0.95) roughly 15%
    // of the time. Asserting on one sample therefore makes the test flaky. Drawing
    // several independent samples and requiring at least one inside the band keeps
    // the same bounds while pushing the false-failure rate down to about 0.15^16.
    const ATTEMPTS: usize = 16;

    let mut rng = rand::thread_rng();
    let probabilities: Vec<f64> = (0..ATTEMPTS)
        .map(|_| {
            // Generate random data (uniform distribution)
            let random_data: Vec<u8> = (0..4096).map(|_| rng.gen_range(0..=255)).collect();
            chi_square_probability(&random_data)
        })
        .collect();

    // Random data should look neither suspiciously skewed nor suspiciously perfect.
    assert!(
        probabilities.iter().any(|&p| p < 0.95 && p > 0.1),
        "Chi-square probability for random data was not within bounds: {:?}",
        probabilities
    );
}

#[test]
fn test_non_random_data() {
    // A buffer consisting of one repeated byte value is as far from a uniform
    // distribution as 4096 bytes can get.
    let constant_bytes = [42u8; 4096];
    let probability = chi_square_probability(&constant_bytes);

    assert!(
        probability == 0.0,
        "Chi-square probability for non-random data was too high: {}",
        probability
    );
}

#[test]
fn test_empty_data() {
    // Edge case: with no bytes there is no distribution to evaluate, and the
    // function is expected to report 0.0 rather than fail.
    let probability = chi_square_probability(&[]);

    assert_eq!(
        probability, 0.0,
        "Chi-square for empty data should be 0.0"
    );
}
}
}
22 changes: 22 additions & 0 deletions tests/test_math.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

from unblob_native import math_tools

# Every byte value 0..255 exactly once: a perfectly uniform distribution.
UNIFORM_DISTRIBUTION = bytes(range(256))
# 256 copies of the zero byte: all occurrences fall into a single bin.
NON_UNIFORM_DISTRIBUTION = b"\x00" * 256


@pytest.mark.parametrize(
"data,entropy",
Expand All @@ -15,3 +18,22 @@
)
def test_shannon_entropy(data: bytes, entropy: float):
assert math_tools.shannon_entropy(data) == pytest.approx(entropy)


@pytest.mark.parametrize(
    ("data", "chi_square_value"),
    [
        pytest.param(b"", 0, id="empty"),
        pytest.param(UNIFORM_DISTRIBUTION, 1.0, id="uniform distribution"),
        pytest.param(NON_UNIFORM_DISTRIBUTION, 0.0, id="non uniform distribution"),
        pytest.param(
            UNIFORM_DISTRIBUTION + NON_UNIFORM_DISTRIBUTION,
            0.0,
            id="partially uniform distribution",
        ),
    ],
)
def test_chi_square_entropy(data: bytes, chi_square_value: float):
    """Check the chi-square probability of each sample against its expected value.

    The comparison uses an absolute tolerance of 1e-4.
    """
    probability = math_tools.chi_square_probability(data)
    expected = pytest.approx(chi_square_value, abs=1e-4)
    assert probability == expected

0 comments on commit bec8800

Please sign in to comment.