Skip to content

Commit

Permalink
Compatibility testing (#133)
Browse files Browse the repository at this point in the history
  • Loading branch information
mwlon authored Nov 29, 2023
1 parent 52a6892 commit 2b823d2
Show file tree
Hide file tree
Showing 7 changed files with 152 additions and 15 deletions.
Binary file added pco/assets/v0_0_0_classic.pco
Binary file not shown.
Binary file added pco/assets/v0_0_0_delta_float_mult.pco
Binary file not shown.
Binary file added pco/assets/v0_1_0_delta_int_mult.pco
Binary file not shown.
43 changes: 28 additions & 15 deletions pco/src/int_mult_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,31 +90,37 @@ fn score_triple_gcd<U: UnsignedLike>(
return None;
}

let triples_w_gcd = triples_w_gcd as f64;
let total_triples = total_triples as f64;
// defining rarity as 1 / probability
let prob_per_triple = triples_w_gcd as f64 / total_triples as f64;
let implied_prob_per_num = prob_per_triple.sqrt();
let prob_per_triple = triples_w_gcd / total_triples;
let gcd_f64 = min(gcd, U::from_u64(u64::MAX)).to_u64() as f64;

// check if the GCD has statistical evidence (3 sigma)
let natural_prob_per_num = 1.0 / gcd_f64;
let stdev = (natural_prob_per_num * (1.0 - natural_prob_per_num) / total_triples as f64).sqrt();
let z_score = (implied_prob_per_num - natural_prob_per_num) / stdev;
// check if the GCD has statistical evidence
let natural_prob_per_triple = 1.0 / (gcd_f64 * gcd_f64);
let stdev = (natural_prob_per_triple * (1.0 - natural_prob_per_triple) / total_triples).sqrt();
let z_score = (prob_per_triple - natural_prob_per_triple) / stdev;
let implied_prob_per_num = prob_per_triple.sqrt();
if z_score < 3.0 {
return None;
}

// heuristic for when the GCD is useless, even if true
if implied_prob_per_num < 0.1 || implied_prob_per_num < 1.0 / (0.9 + 0.2 * gcd_f64) {
if implied_prob_per_num < 0.1 || implied_prob_per_num < 1.0 / (1.0 + 0.1 * gcd_f64) {
return None;
}

// heuristic for how good a GCD is. It mostly scales with overperformance of
// the GCD relative to expectations, but that breaks down when considering
// multiples of the GCD. e.g. if 100 is the true GCD, 200 will appear half
// as often and look equally enticing. To decide between them we add a small
// penalty for larger GCDs.
let score = (implied_prob_per_num - 0.05) * gcd_f64;
Some(score)
// The most likely valid GCD maximizes triples * gcd, and the most
// valuable one (if true) maximizes triples.sqrt() * gcd. We take a
// conservative lower confidence bound for how many triples we'd get if we
// repeated the measurement, and strike a compromise between most likely and
// most valuable.
let triples_lcb = triples_w_gcd - 1.0 * triples_w_gcd.sqrt();
if triples_lcb >= 0.0 {
Some(triples_lcb.powf(0.6) * gcd_f64)
} else {
None
}
}

fn most_prominent_gcd<U: UnsignedLike>(triple_gcds: &[U], total_triples: usize) -> Option<U> {
Expand Down Expand Up @@ -178,6 +184,7 @@ pub fn choose_base<T: NumberLike>(nums: &[T]) -> Option<T::Unsigned> {
#[cfg(test)]
mod tests {
use super::*;
use rand::Rng;

#[test]
fn test_split_join_latents() {
Expand Down Expand Up @@ -229,7 +236,7 @@ mod tests {
fn test_calc_candidate_gcd() {
// not significant enough
assert_eq!(
calc_candidate_base(&mut vec![0_u32, 4, 8, 10, 14, 18]),
calc_candidate_base(&mut vec![0_u32, 4, 8]),
None,
);
assert_eq!(
Expand All @@ -252,5 +259,11 @@ mod tests {
]),
None,
);
// even just evens can be useful if the signal is strong enough
let mut rng = rand_xoshiro::Xoroshiro128PlusPlus::seed_from_u64(0);
let mut twos = (0_u32..100)
.map(|_| rng.gen_range(0_u32..1000) * 2)
.collect::<Vec<_>>();
assert_eq!(calc_candidate_base(&mut twos), Some(2));
}
}
119 changes: 119 additions & 0 deletions pco/src/tests/compatibility.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
use crate::data_types::NumberLike;
use crate::errors::PcoResult;
use crate::{standalone, ChunkConfig};
use std::fs;
use std::path::PathBuf;
use std::str::FromStr;

/// Returns the `assets` directory located directly under this crate's
/// manifest directory (`CARGO_MANIFEST_DIR`, resolved at compile time).
fn get_asset_dir() -> PathBuf {
    // `PathBuf::from_str` is infallible, so this unwrap can never panic.
    let manifest_dir = PathBuf::from_str(env!("CARGO_MANIFEST_DIR")).unwrap();
    manifest_dir.join("assets")
}

/// Builds the path of the compatibility asset for a given crate version and
/// test name.
///
/// Dots in `version` are replaced with underscores, so `("0.1.0",
/// "delta_int_mult")` maps to `<asset dir>/v0_1_0_delta_int_mult.pco`.
fn get_pco_path(version: &str, name: &str) -> PathBuf {
    let version_tag = version.replace('.', "_");
    let file_name = format!("v{}_{}.pco", version_tag, name);
    get_asset_dir().join(file_name)
}

/// Asserts that two slices have the same length and that every pair of
/// elements is equal when compared through `to_unsigned()`.
///
/// On a mismatch the panic message reports both values and the index of the
/// first differing position.
fn assert_nums_eq<T: NumberLike>(x: &[T], y: &[T]) {
    assert_eq!(x.len(), y.len());
    for (idx, (left, right)) in x.iter().zip(y.iter()).enumerate() {
        // Compare the unsigned representations rather than the values
        // themselves; the values are only used for the failure message.
        let left_unsigned = left.to_unsigned();
        let right_unsigned = right.to_unsigned();
        assert_eq!(
            left_unsigned, right_unsigned,
            "{} != {} at {}",
            left, right, idx
        );
    }
}

/// Reads the stored asset for `version`/`name`, decompresses it with
/// `standalone::auto_decompress`, and asserts that the result matches
/// `expected`.
///
/// Returns an error if the asset cannot be read or decompressed; panics if
/// the decompressed numbers differ from `expected`.
fn assert_compatible<T: NumberLike>(version: &str, name: &str, expected: &[T]) -> PcoResult<()> {
    let asset_bytes = fs::read(get_pco_path(version, name))?;
    let actual = standalone::auto_decompress::<T>(&asset_bytes)?;

    assert_nums_eq(&actual, expected);
    Ok(())
}

/// Writes a compressed asset for `nums`, but only when `version` equals the
/// crate version being compiled (`CARGO_PKG_VERSION`) and the asset file does
/// not exist yet.
///
/// In every other case this is a no-op returning `Ok(())`, so an existing
/// asset file is never overwritten. Errors from compression or from writing
/// the file are propagated.
fn simple_write_if_version_matches<T: NumberLike>(
    version: &str,
    name: &str,
    nums: &[T],
    config: &ChunkConfig,
) -> PcoResult<()> {
    let is_current_version = version == env!("CARGO_PKG_VERSION");
    if !is_current_version {
        return Ok(());
    }

    let pco_path = get_pco_path(version, name);
    if pco_path.exists() {
        return Ok(());
    }

    let compressed = standalone::simple_compress(nums, config)?;
    fs::write(pco_path, compressed)?;
    Ok(())
}

#[cfg(test)]
mod tests {
    use crate::errors::PcoResult;
    use crate::tests::compatibility::{assert_compatible, simple_write_if_version_matches};
    use crate::ChunkConfig;

    // Each test regenerates its input numbers, writes the asset if it belongs
    // to the crate version being compiled and is missing, and then checks that
    // the stored asset decompresses back to those numbers. The inputs must not
    // change, or they will no longer match the checked-in assets.

    #[test]
    fn v0_0_0_classic() -> PcoResult<()> {
        let version = "0.0.0";
        let name = "classic";
        let config = ChunkConfig {
            delta_encoding_order: Some(0),
            ..Default::default()
        };
        let nums = (0_i32..1000).chain(2000..3000).collect::<Vec<_>>();
        simple_write_if_version_matches(version, name, &nums, &config)?;
        assert_compatible(version, name, &nums)
    }

    #[test]
    fn v0_0_0_delta_float_mult() -> PcoResult<()> {
        let version = "0.0.0";
        let name = "delta_float_mult";
        let config = ChunkConfig {
            delta_encoding_order: Some(1),
            ..Default::default()
        };
        let mut nums = (0..2000).map(|i| i as f32).collect::<Vec<_>>();
        // perturb a single element of the otherwise whole-number sequence
        nums[1337] += 1.001;
        simple_write_if_version_matches(version, name, &nums, &config)?;
        assert_compatible(version, name, &nums)
    }

    #[test]
    fn v0_1_0_delta_int_mult() -> PcoResult<()> {
        // Starts at 0.1.0 because 0.0.0 had GCD mode (no longer supported)
        // instead of int mult.
        let version = "0.1.0";
        let name = "delta_int_mult";
        let config = ChunkConfig {
            delta_encoding_order: Some(1),
            ..Default::default()
        };
        let mut nums = (0..2000).map(|i| i * 1000).collect::<Vec<_>>();
        // perturb a single element of the otherwise exact multiples of 1000
        nums[1337] -= 1;
        simple_write_if_version_matches(version, name, &nums, &config)?;
        assert_compatible(version, name, &nums)
    }
}
1 change: 1 addition & 0 deletions pco/src/tests/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
mod compatibility;
mod low_level;
mod recovery;
mod stability;
Expand Down
4 changes: 4 additions & 0 deletions pco/src/wrapped/chunk_compressor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,10 @@ impl<U: UnsignedLike> ChunkCompressor<U> {

for (var_policy, var_latents) in latent_var_policies.iter().zip(page_latents.per_var.iter()) {
if var_policy.is_trivial {
per_var.push(uninit_dissected_page_var(
0,
var_policy.encoder.default_state(),
));
continue;
}

Expand Down

0 comments on commit 2b823d2

Please sign in to comment.