From 608e7a3c44d30dc5fd4ffff3af974b3b6e923b50 Mon Sep 17 00:00:00 2001 From: Robbie McKinstry Date: Mon, 28 Oct 2024 18:00:17 -0400 Subject: [PATCH] Implement Chi Square engine and add different types of contingency tables. --- src/adapters/engines/chi.rs | 45 +++++++++++++++++ src/adapters/engines/mod.rs | 2 + src/stats/chi.rs | 87 ++++----------------------------- src/stats/mod.rs | 12 +++-- src/stats/tables/contingency.rs | 18 +++++++ src/stats/tables/empirical.rs | 75 ++++++++++++++++++++++++++++ src/stats/tables/expectation.rs | 84 +++++++++++++++++++++++++++++++ src/stats/tables/fixed.rs | 60 +++++++++++++++++++++++ src/stats/tables/mod.rs | 14 ++++++ 9 files changed, 316 insertions(+), 81 deletions(-) create mode 100644 src/adapters/engines/chi.rs create mode 100644 src/stats/tables/contingency.rs create mode 100644 src/stats/tables/empirical.rs create mode 100644 src/stats/tables/expectation.rs create mode 100644 src/stats/tables/fixed.rs create mode 100644 src/stats/tables/mod.rs diff --git a/src/adapters/engines/chi.rs b/src/adapters/engines/chi.rs new file mode 100644 index 0000000..ca41a38 --- /dev/null +++ b/src/adapters/engines/chi.rs @@ -0,0 +1,45 @@ +use crate::{ + metrics::ResponseStatusCode, + stats::{EmpiricalTable, ExpectationTable, Group, Observation}, +}; + +use super::DecisionEngine; + +/// The [ChiSquareEngine] uses the Chi Square statistical +/// significance test to determine whether the canary should be promoted or not. +#[derive(Default)] +pub struct ChiSquareEngine { + control_data: ExpectationTable, + experimental_data: EmpiricalTable, +} + +impl DecisionEngine for ChiSquareEngine { + // TODO: From writing this method, it's apparent there should be a Vec implementation + // that adds Vec::len() to the total and concats the vectors together, because + // otherwise we're wasting a ton of cycles just incrementing counters. 
+ fn add_observation(&mut self, observation: Observation) { + match observation.group { + Group::Control => { + // • Increment the number of observations for this category. + self.control_data.increment(observation.outcome); + } + Group::Experimental => { + // • Increment the number of observations in the canary contingency table. + self.experimental_data.increment(observation.outcome); + // • Then, let the control contingency table know that there was + // another experimental observation. + self.control_data.increment_experimental_total(); + } + } + } + + fn compute(&mut self) -> Option { + todo!() + } +} + +impl ChiSquareEngine { + pub fn new() -> Self { + Self::default() + } +} diff --git a/src/adapters/engines/mod.rs b/src/adapters/engines/mod.rs index 5b5fbb9..854e81b 100644 --- a/src/adapters/engines/mod.rs +++ b/src/adapters/engines/mod.rs @@ -2,6 +2,7 @@ use crate::stats::{EnumerableCategory, Observation}; use std::hash::Hash; pub use action::Action; +pub use chi::ChiSquareEngine; /// Helper trait, since these requirements are often used by /// our implementation of `ContingencyTables`. @@ -23,6 +24,7 @@ pub trait DecisionEngine { } mod action; +mod chi; mod controller; /// The AlwaysPromote decision engine will always return the Promote diff --git a/src/stats/chi.rs b/src/stats/chi.rs index fccf035..9bfa540 100644 --- a/src/stats/chi.rs +++ b/src/stats/chi.rs @@ -1,28 +1,10 @@ -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use std::hash::Hash; use std::num::NonZeroU64; +use crate::stats::ContingencyTable; use statrs::distribution::{ChiSquared, ContinuousCDF}; -/// A ContingencyTable expresses the frequency with which a group was observed. -/// Usually, it tracks the number of observations in ecah group, but when the -/// number is already known (i.e. its fixed, like a fair dice or coin), it can -/// expose just the frequencies for each group. 
-pub trait ContingencyTable { - /// return the number of observations of the in the provided group. - fn group_count(&self, cat: &Group) -> u64; - - /// Return the set of groups that serve as columns of the contingency table. - fn groups(&self) -> Box>; - - // returns the total number of observations made. This should be the sum - // of the group count for every group. - fn total_count(&self) -> u64 { - self.groups() - .fold(0, |sum, group| sum + self.group_count(&group)) - } -} - /// returns the number of degrees of freedom for this table. /// This is typically the number of groups minus one. /// # Panics @@ -66,60 +48,6 @@ pub trait EnumerableCategory { fn groups() -> Box>; } -/// A [FixedContingencyTable] is used to model scenarios where the -/// frequencies are fixed (i.e. known ahead of time), like fair dice. -/// It is mostly used for testing. The category must be hashable -/// because a hashmap is used internally to store the frequencies. -/// If you'd like us to add a B-Tree based alternative, please open an issue. -pub struct FixedContingencyTable -where - C: EnumerableCategory + Hash + Eq, -{ - counts: HashMap, -} - -impl FixedContingencyTable -where - C: EnumerableCategory + Hash + Eq, -{ - /// Construct a new, empty contingency table. All frequencies are - /// initialized to zero. - pub fn new() -> Self { - let mut counts = HashMap::new(); - for group in C::groups() { - counts.entry(group).or_insert(0); - } - - Self { counts } - } - - /// Sets the expected count of the category to the value provided. - pub fn set_group_count(&mut self, cat: C, count: u64) { - self.counts.insert(cat, count); - } - - /// Returns the number of observations that were classified as - /// having this group/category. - pub fn group_count(&self, cat: &C) -> u64 { - self.counts[cat] - } -} - -impl ContingencyTable for FixedContingencyTable -where - C: EnumerableCategory + Hash + Eq, -{ - fn group_count(&self, cat: &C) -> u64 { - // delegate to the method on the base class. 
- Self::group_count(self, cat) - } - - fn groups(&self) -> Box> { - // Delegate to the fixed list provided by the EnumerableCategory. - C::groups() - } -} - /// Alpha represents the alpha cutoff, expressed as a floating point from [0, 1] inclusive. /// For example, 0.95 is the standard 5% confidency interval. pub fn chi_square_test( @@ -175,7 +103,10 @@ mod tests { use std::{collections::HashSet, num::NonZeroU64}; - use crate::stats::chi::{degrees_of_freedom, p_value, FixedContingencyTable}; + use crate::stats::{ + chi::{degrees_of_freedom, p_value}, + FixedTable, + }; use super::{test_statistic, ContingencyTable, EnumerableCategory}; use pretty_assertions::assert_eq; @@ -203,7 +134,7 @@ mod tests { /// can have its frequencies set and accessed. #[test] fn enumerable_table() { - let mut table = FixedContingencyTable::new(); + let mut table = FixedTable::new(); let groups = [(true, 30u64), (false, 70u64)]; // Put the values into the table. for (group, freq) in groups { @@ -224,10 +155,10 @@ mod tests { /// Let True represent Heads and False represent Tails. 
#[test] fn calc_test_statistic() { - let mut control_group = FixedContingencyTable::new(); + let mut control_group = FixedTable::new(); control_group.set_group_count(true, 25); control_group.set_group_count(false, 25); - let mut experimental_group = FixedContingencyTable::new(); + let mut experimental_group = FixedTable::new(); experimental_group.set_group_count(true, 21); experimental_group.set_group_count(false, 29); assert_eq!( diff --git a/src/stats/mod.rs b/src/stats/mod.rs index d8388a5..ac0ee0e 100644 --- a/src/stats/mod.rs +++ b/src/stats/mod.rs @@ -3,9 +3,13 @@ use std::collections::HashMap; pub use chi::EnumerableCategory; pub use group::Group; pub use observation::Observation; +pub use tables::{ContingencyTable, EmpiricalTable, ExpectationTable, FixedTable}; use crate::metrics::ResponseStatusCode; +// TODO: Before long, we can delete this file since this is an +// old and mostly incorrect implement of X2. + /// The alpha cutoff is the amount of confidence must have in the result /// to feel comfortable that the result is not due to chance, but instead /// do to the independent variable. The valu is expressed as a confidence @@ -17,8 +21,8 @@ const DEFAULT_ALPHA_CUTOFF: f64 = 0.05; /// The [ChiSquareEngine] calculates the Chi Square test statistic /// based on the data stored in its contingency tables. pub struct ChiSquareEngine { - control: ContingencyTable, - experimental: ContingencyTable, + control: Table, + experimental: Table, total_control_count: usize, total_experimental_count: usize, alpha_cutoff: f64, @@ -93,7 +97,7 @@ impl ChiSquareEngine { } /// This type maps the dependent variable to its count. -pub type ContingencyTable = HashMap; +type Table = HashMap; /// contains the engine to calculate the chi square test statistic. mod chi; @@ -101,3 +105,5 @@ mod chi; mod group; /// An observation represents a group and the observed category. mod observation; +/// Different kinds of contingency tables. 
+mod tables; diff --git a/src/stats/tables/contingency.rs b/src/stats/tables/contingency.rs new file mode 100644 index 0000000..e4dd1e2 --- /dev/null +++ b/src/stats/tables/contingency.rs @@ -0,0 +1,18 @@ +/// A ContingencyTable expresses the frequency with which a group was observed. +/// Usually, it tracks the number of observations in each group, but when the +/// number is already known (i.e. it's fixed, like a fair die or coin), it can +/// expose just the frequencies for each group. +pub trait ContingencyTable { + /// return the number of observations in the provided group. + fn group_count(&self, cat: &Group) -> u64; + + /// Return the set of groups that serve as columns of the contingency table. + fn groups(&self) -> Box>; + + // returns the total number of observations made. This should be the sum + // of the group count for every group. + fn total_count(&self) -> u64 { + self.groups() + .fold(0, |sum, group| sum + self.group_count(&group)) + } +} diff --git a/src/stats/tables/empirical.rs b/src/stats/tables/empirical.rs new file mode 100644 index 0000000..6c267d2 --- /dev/null +++ b/src/stats/tables/empirical.rs @@ -0,0 +1,75 @@ +use super::ContingencyTable; +use crate::stats::EnumerableCategory; +use std::{collections::HashMap, hash::Hash}; + +/// An [EmpiricalTable] is used to track observed data. It keeps +/// a tally of each observed category. When queried, it uses +/// the empirical values to emit an observation count. +/// This is in contrast to an ExpectationTable, which also keeps a +/// tally of observations made, but uses the count of observations +/// from an EmpiricalTable to determine the expected ratios. +/// +/// The category must be hashable +/// because a hashmap is used internally to store the frequencies. +/// If you'd like us to add a B-Tree based alternative, please open an issue.
+pub struct EmpiricalTable +where + C: EnumerableCategory + Hash + Eq, +{ + counts: HashMap, +} + +impl Default for EmpiricalTable +where + C: EnumerableCategory + Hash + Eq, +{ + fn default() -> Self { + Self::new() + } +} + +impl EmpiricalTable +where + C: EnumerableCategory + Hash + Eq, +{ + /// Construct a new, empty contingency table. All frequencies are + /// initialized to zero. + pub fn new() -> Self { + let mut counts = HashMap::new(); + for group in C::groups() { + counts.entry(group).or_insert(0); + } + + Self { counts } + } + + pub fn increment(&mut self, cat: C) { + self.counts.entry(cat).and_modify(|c| *c += 1); + } + + /// Sets the expected count of the category to the value provided. + pub fn set_group_count(&mut self, cat: C, count: u64) { + self.counts.insert(cat, count); + } + + /// Returns the number of observations that were classified as + /// having this group/category. + pub fn group_count(&self, cat: &C) -> u64 { + self.counts[cat] + } +} + +impl ContingencyTable for EmpiricalTable +where + C: EnumerableCategory + Hash + Eq, +{ + fn group_count(&self, cat: &C) -> u64 { + // delegate to the method on the base class. + Self::group_count(self, cat) + } + + fn groups(&self) -> Box> { + // Delegate to the fixed list provided by the EnumerableCategory. + C::groups() + } +} diff --git a/src/stats/tables/expectation.rs b/src/stats/tables/expectation.rs new file mode 100644 index 0000000..937e808 --- /dev/null +++ b/src/stats/tables/expectation.rs @@ -0,0 +1,84 @@ +use crate::stats::EnumerableCategory; + +use super::ContingencyTable; +use std::{collections::HashMap, hash::Hash}; + +/// An [ExpectationTable] is used to derive expected counts. It keeps +/// a tally of each category observed in the control group. When queried, +/// it scales that category's observed ratio by the experimental total.
+/// This is in contrast to an EmpiricalTable, which also keeps a +/// tally of observations made, but emits the tallied count for each +/// category directly when queried. +/// +/// The category must be hashable +/// because a hashmap is used internally to store the frequencies. +/// If you'd like us to add a B-Tree based alternative, please open an issue. +pub struct ExpectationTable +where + C: EnumerableCategory + Hash + Eq, +{ + counts: HashMap, + experimental_total: u64, +} + +impl Default for ExpectationTable +where + C: EnumerableCategory + Hash + Eq, +{ + fn default() -> Self { + Self::new() + } +} + +impl ExpectationTable +where + C: EnumerableCategory + Hash + Eq, +{ + /// Construct a new, empty contingency table. All frequencies are + /// initialized to zero. + pub fn new() -> Self { + let mut counts = HashMap::new(); + for group in C::groups() { + counts.entry(group).or_insert(0); + } + + Self { + counts, + experimental_total: 0, + } + } + + pub fn increment(&mut self, cat: C) { + self.counts.entry(cat).and_modify(|c| *c += 1); + } + + pub fn increment_experimental_total(&mut self) { + self.experimental_total += 1; + } + + /// Using the expected frequency for this group, calculate + /// the expected number of items in the experimental group + /// using the number of observations from that group. + pub fn group_count(&self, cat: &C) -> u64 { + (self.local_ratio(cat) * (self.experimental_total as f64)).round() as u64 + } + + fn local_ratio(&self, cat: &C) -> f64 { + (self.counts[cat] as f64) / (self.total_count() as f64) + } +} + +impl ContingencyTable for ExpectationTable +where + C: EnumerableCategory + Hash + Eq, +{ + fn group_count(&self, cat: &C) -> u64 { + // delegate to the method on the base class. + Self::group_count(self, cat) + } + + fn groups(&self) -> Box> { + // Delegate to the fixed list provided by the EnumerableCategory.
+ C::groups() + } +} diff --git a/src/stats/tables/fixed.rs b/src/stats/tables/fixed.rs new file mode 100644 index 0000000..9553b7f --- /dev/null +++ b/src/stats/tables/fixed.rs @@ -0,0 +1,60 @@ +use super::ContingencyTable; +use crate::stats::EnumerableCategory; +use std::{collections::HashMap, hash::Hash}; + +/// A [FixedTable] is used when the number of elements in each category +/// should be set directly. Mainly used for testing. +/// +/// +/// The category must be hashable +/// because a hashmap is used internally to store the frequencies. +/// If you'd like us to add a B-Tree based alternative, please open an issue. +#[derive(Default)] +pub struct FixedTable +where + C: EnumerableCategory + Hash + Eq, +{ + counts: HashMap, +} + +impl FixedTable +where + C: EnumerableCategory + Hash + Eq, +{ + /// Construct a new, empty contingency table. All frequencies are + /// initialized to zero. + pub fn new() -> Self { + let mut counts = HashMap::new(); + for group in C::groups() { + counts.entry(group).or_insert(0); + } + + Self { counts } + } + + /// Sets the expected count of the category to the value provided. + pub fn set_group_count(&mut self, cat: C, count: u64) { + self.counts.insert(cat, count); + } + + /// Returns the number of observations that were classified as + /// having this group/category. + pub fn group_count(&self, cat: &C) -> u64 { + self.counts[cat] + } +} + +impl ContingencyTable for FixedTable +where + C: EnumerableCategory + Hash + Eq, +{ + fn group_count(&self, cat: &C) -> u64 { + // delegate to the method on the base class. + Self::group_count(self, cat) + } + + fn groups(&self) -> Box> { + // Delegate to the fixed list provided by the EnumerableCategory. 
+ C::groups() + } +} diff --git a/src/stats/tables/mod.rs b/src/stats/tables/mod.rs new file mode 100644 index 0000000..477d19a --- /dev/null +++ b/src/stats/tables/mod.rs @@ -0,0 +1,14 @@ +pub use contingency::ContingencyTable; +pub use empirical::EmpiricalTable; +pub use expectation::ExpectationTable; +pub use fixed::FixedTable; + +/// Defines the contingency table abstraction. +mod contingency; +/// Defines the Empirical (contingency) table, which is the experimental group. +mod empirical; +/// Defines the Expectation (contingency) table, which is used as the control group. +mod expectation; +/// For times where you want to manually set the number of events in each category, like +/// when testing. +mod fixed;