[+] better example for the one-sample bootstrap ht
ssoudan committed Oct 13, 2022
1 parent 7dd84a7 commit cdac193
Showing 2 changed files with 77 additions and 86 deletions.
README.md (1 addition, 0 deletions)
@@ -21,6 +21,7 @@
statistic is more 'extreme' than the test statistic for our initial samples.

# References
- An Introduction to the Bootstrap, Efron and Tibshirani
- [Bootstrap Hypothesis Testing](https://en.wikipedia.org/wiki/Bootstrapping_(statistics)#Bootstrap_hypothesis_testing)
- [P-value](https://en.wikipedia.org/wiki/P-value)
- [Stats 102A Lesson 9-2 Bootstrap Hypothesis Tests, Miles Chen](https://www.youtube.com/watch?v=s7do_F9LV-w)
src/bootstrap.rs (76 additions, 86 deletions)
@@ -123,18 +123,18 @@ pub fn two_samples_non_parametric_ht<
}

// the test statistic for the observed data
let t_stat = test_statistic_fn(a, b);
let test_stat = test_statistic_fn(a, b);

// the test statistic distribution of the population under the null hypothesis (a and b
// are drawn from the same population).
let mut t_stat_dist = vec![F::from(0.).unwrap(); rep];
let mut test_stat_dist = vec![F::from(0.).unwrap(); rep];

let reference = [a, b].concat();

let mut a_ = vec![S::default(); n_a];
let mut b_ = vec![S::default(); n_b];

for t_stat_dist_ in t_stat_dist.iter_mut() {
for test_stat_dist_ in test_stat_dist.iter_mut() {
for a__ in a_.iter_mut() {
*a__ = reference.choose(&mut rng).unwrap().clone();
}
@@ -143,12 +143,12 @@ pub fn two_samples_non_parametric_ht<
*b__ = reference.choose(&mut rng).unwrap().clone();
}

let t_stat_ = test_statistic_fn(&a_, &b_);
*t_stat_dist_ = t_stat_;
let test_stat_ = test_statistic_fn(&a_, &b_);
*test_stat_dist_ = test_stat_;
}

// the p-value
let p_value = compute_p_value(pvalue_type, t_stat, t_stat_dist);
let p_value = compute_p_value(pvalue_type, test_stat, test_stat_dist);

Ok(p_value)
}
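For context, a call to `two_samples_non_parametric_ht` might look like the sketch below. The argument order, the `bootstrap::` path, the sample values, and the difference-of-means statistic are assumptions based on the one-sample example further down, not something shown in this hunk:

```rust
use bootstrap_ht::prelude::*;
use rand::SeedableRng;
use rand_chacha::ChaCha8Rng;

let mut rng = ChaCha8Rng::seed_from_u64(42);

// two illustrative samples (assumed values, chosen only for this sketch)
let a = [94.0f32, 197., 16., 38., 99., 141., 23.];
let b = [52.0f32, 104., 146., 10., 51., 30., 40., 27., 46.];

// difference of means as the two-sample test statistic
let test_statistic_fn = |xs: &[f32], ys: &[f32]| {
    let mean = |s: &[f32]| s.iter().sum::<f32>() / s.len() as f32;
    mean(xs) - mean(ys)
};

// H0: `a` and `b` are drawn from the same population
let p_value = bootstrap::two_samples_non_parametric_ht(
    &mut rng,
    &a,
    &b,
    test_statistic_fn,
    bootstrap::PValueType::OneSidedRightTail,
    1_000,
)
.unwrap();

// only a sanity check; the exact value depends on the resampling
assert!(p_value >= 0.0 && p_value <= 1.0);
```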
@@ -162,11 +162,11 @@ pub fn two_samples_non_parametric_ht<
/// # Description
///
/// The null hypothesis is that the test statistic of the population generated by the
/// sample `a` is `t_stat`. The p-value is the probability of obtaining a test statistic
/// at least as extreme as the one provided by `mu` (extreme being defined by
/// sample `a` is `test_stat`. The p-value is the probability of obtaining a test
/// statistic at least as extreme as `test_stat` (extreme being defined by
/// `pvalue_type` - see [`PValueType`]).
///
/// The p-value is computed by comparing `t_stat` to the distribution of the
/// The p-value is computed by comparing `test_stat` to the distribution of the
/// test statistic.
///
/// The test statistic distribution is obtained by randomly sampling with replacement from
@@ -178,38 +178,42 @@ pub fn two_samples_non_parametric_ht<
/// Let's test whether the population mean is 129, using the studentized distance to 129 as the test statistic.
/// ```rust
/// use bootstrap_ht::prelude::*;
/// use rand::prelude::Distribution;
/// use rand::SeedableRng;
/// use rand_chacha::ChaCha8Rng;
/// use rand_distr::StandardNormal;
///
/// let mut rng = ChaCha8Rng::seed_from_u64(42);
/// // From p.224 of 'An Introduction to the Bootstrap', Efron and Tibshirani
///
/// let a = StandardNormal
/// .sample_iter(&mut rng)
/// .take(10)
/// .collect::<Vec<f64>>();
/// let a = [94., 197., 16., 38., 99., 141., 23.];
/// let a_mean = a.iter().sum::<f32>() / a.len() as f32;
///
/// let mut rng = ChaCha8Rng::seed_from_u64(42);
///
/// let t_stat = 1.2;
/// let test_statistic_fn = |a: &[f32]| {
/// let a_mean = a.iter().sum::<f32>() / a.len() as f32;
/// let a_std =
/// (a.iter().map(|x| (x - a_mean).powi(2)).sum::<f32>() / (a.len() - 1) as f32).sqrt();
///
/// // std
/// let test_statistic_fn = |a: &[f64]| {
/// let std = a.iter().copied().map(|x| f64::powi(x, 2)).sum::<f64>() / a.len() as f64;
/// std.sqrt()
/// (a_mean - 129.) / (a_std / (a.len() as f32).sqrt())
/// };
///
/// let test_stat: f32 = test_statistic_fn(&a);
///
/// // Shift the samples so their mean is the one assumed under H0
/// let a = a
/// .into_iter()
/// .map(|x| x - a_mean + 129.)
/// .collect::<Vec<f32>>();
///
/// let p_value = bootstrap::one_sample_non_parametric_ht(
/// &mut rng,
/// &a,
/// t_stat,
/// test_stat,
/// test_statistic_fn,
/// bootstrap::PValueType::OneSidedRightTail,
/// 10_000,
/// bootstrap::PValueType::OneSidedLeftTail,
/// 1_000,
/// )
/// .unwrap();
///
/// assert_eq!(p_value, 0.1493);
/// // might require more investigation; maybe the standard deviation is not 1.2...
/// assert_eq!(p_value, 0.092);
/// ```
pub fn one_sample_non_parametric_ht<
R: Rng + ?Sized,
@@ -218,7 +222,7 @@ pub fn one_sample_non_parametric_ht<
>(
mut rng: &mut R,
a: &[S],
t_stat: F,
test_stat: F,
test_statistic_fn: impl Fn(&[S]) -> F,
pvalue_type: PValueType,
rep: usize,
@@ -229,38 +233,38 @@
}

// the test statistic distribution of the population under the null hypothesis.
let mut t_stat_dist = vec![F::from(0.).unwrap(); rep];
let mut test_stat_dist = vec![F::from(0.).unwrap(); rep];

let mut a_ = vec![S::default(); n_a];
for t_stat_dist_ in t_stat_dist.iter_mut() {
for test_stat_dist_ in test_stat_dist.iter_mut() {
for a__ in a_.iter_mut() {
*a__ = a.choose(&mut rng).unwrap().clone();
}

let t_stat_ = test_statistic_fn(&a_);
*t_stat_dist_ = t_stat_;
let test_stat_ = test_statistic_fn(&a_);
*test_stat_dist_ = test_stat_;
}

// the p-value
let p_value = compute_p_value(pvalue_type, t_stat, t_stat_dist);
let p_value = compute_p_value(pvalue_type, test_stat, test_stat_dist);

Ok(p_value)
}

fn compute_p_value<F: Float + std::iter::Sum>(
pvalue_type: PValueType,
t_stat: F,
t_stat_dist: Vec<F>,
test_stat: F,
test_stat_dist: Vec<F>,
) -> F {
let n = F::from(t_stat_dist.len()).unwrap();
let n = F::from(test_stat_dist.len()).unwrap();

match pvalue_type {
PValueType::TwoSided => {
let right_p_value =
F::from(t_stat_dist.iter().filter(|t| **t >= t_stat).count()).unwrap() / n;
F::from(test_stat_dist.iter().filter(|t| **t >= test_stat).count()).unwrap() / n;

let left_p_value =
F::from(t_stat_dist.iter().filter(|t| **t <= t_stat).count()).unwrap() / n;
F::from(test_stat_dist.iter().filter(|t| **t <= test_stat).count()).unwrap() / n;

let min = if right_p_value <= left_p_value {
right_p_value
@@ -271,10 +275,10 @@ fn compute_p_value<F: Float + std::iter::Sum>(
F::from(2.).unwrap() * min
}
PValueType::OneSidedRightTail => {
F::from(t_stat_dist.iter().filter(|&t| t >= &t_stat).count()).unwrap() / n
F::from(test_stat_dist.iter().filter(|&t| t >= &test_stat).count()).unwrap() / n
}
PValueType::OneSidedLeftTail => {
F::from(t_stat_dist.iter().filter(|&t| t <= &t_stat).count()).unwrap() / n
F::from(test_stat_dist.iter().filter(|&t| t <= &test_stat).count()).unwrap() / n
}
}
}
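As a quick illustration of the counting above, here is a standalone sketch with made-up numbers (not part of the commit); it mirrors the right-tail, left-tail, and two-sided branches:

```rust
// five made-up bootstrap statistics and an observed statistic of 2.0
let test_stat = 2.0_f64;
let test_stat_dist = vec![-1.0, 0.5, 1.0, 2.5, 3.0];
let n = test_stat_dist.len() as f64;

// right tail: fraction of bootstrap statistics at least as large as the observed one
let right = test_stat_dist.iter().filter(|&&t| t >= test_stat).count() as f64 / n; // 2/5
// left tail: fraction at most as large as the observed one
let left = test_stat_dist.iter().filter(|&&t| t <= test_stat).count() as f64 / n; // 3/5
// two-sided: twice the smaller of the two tails
let two_sided = 2.0 * right.min(left);

assert_eq!(right, 0.4);
assert_eq!(left, 0.6);
assert_eq!(two_sided, 0.8);
```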
@@ -481,12 +485,12 @@ mod tests {
a_mean
};

let t_stat = test_statistic_fn(&a);
let test_stat = test_statistic_fn(&a);

let p_value = one_sample_non_parametric_ht(
&mut rng,
&a,
t_stat,
test_stat,
test_statistic_fn,
PValueType::TwoSided,
10_000,
@@ -498,64 +502,50 @@

#[test]
fn test_one_sample_mean_ht() {
let a = [
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
];
// From p.224 of 'An Introduction to the Bootstrap', Efron and Tibshirani

let mut rng = ChaCha8Rng::seed_from_u64(42);
// H0: the mean of the population is 129.

// mean
let test_statistic_fn = |a: &[i32]| {
let a_mean = a.iter().map(|x| *x as f32).sum::<f32>() / a.len() as f32;
a_mean
};
let a = [94., 197., 16., 38., 99., 141., 23.];
let a_mean = a.iter().sum::<f32>() / a.len() as f32;

let t_stat: f32 = 0.5;
let mut rng = ChaCha8Rng::seed_from_u64(42);

let p_value = one_sample_non_parametric_ht(
&mut rng,
&a,
t_stat,
test_statistic_fn,
PValueType::OneSidedRightTail,
10_000,
)
.unwrap();
assert_eq!(p_value, 0.0592);
// might start to want to reject the null hypothesis that the coin is fair
}
// 'studentized' distance to the H0 mean statistic
let test_statistic_fn = |a: &[f32]| {
let a_mean = a.iter().sum::<f32>() / a.len() as f32;
let a_std =
(a.iter().map(|x| (x - a_mean).powi(2)).sum::<f32>() / (a.len() - 1) as f32).sqrt();

#[test]
fn test_one_sample_substring_ht() {
let a = ["h", "e", "l", "l", "o"];
(a_mean - 129.) / (a_std / (a.len() as f32).sqrt())
};

let mut rng = ChaCha8Rng::seed_from_u64(42);
let test_stat: f32 = test_statistic_fn(&a);

// number of correct letters at the correct position
let test_statistic_fn = |a: &[&str]| {
let mut correct_letters = 0;
for (i, letter) in a.iter().enumerate() {
if letter == &"hello".chars().nth(i).unwrap().to_string() {
correct_letters += 1;
}
}
correct_letters as f32
};
// Shift the samples so their mean is the one assumed under H0, namely 129.
let a = a
.into_iter()
.map(|x| x - a_mean + 129.)
.collect::<Vec<f32>>();

let t_stat = test_statistic_fn(&a);
// Alternatively, we could have left the samples unshifted and used a test statistic
// measuring the distance to the empirical mean of the original sample (86.9), while
// keeping 129 when computing the reference test statistic (a sketch of this variant
// follows the test module below).
//
// Either way, what we want to estimate is how 'surprising' it is for the original
// sample to lie this far from the hypothesised population mean, by comparing its
// distance against the distribution of that distance over resamples drawn (with
// replacement) from the same data.

let p_value = one_sample_non_parametric_ht(
&mut rng,
&a,
t_stat,
test_stat,
test_statistic_fn,
PValueType::OneSidedRightTail,
10_000,
PValueType::OneSidedLeftTail,
1_000,
)
.unwrap();

assert_eq!(p_value, 0.0022);
// p_value is small enough to reject the null hypothesis that the sample is
// representative of the population
assert_eq!(p_value, 0.092);
}
}
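The alternative mentioned in the comment inside `test_one_sample_mean_ht` — resampling the unshifted data and measuring the studentized distance to its empirical mean, while the reference statistic keeps the hypothesised mean of 129 — could look roughly like the sketch below. It is an illustration, not part of this commit: the statistic, the call layout, and the loose assertion are assumptions, and no reference p-value is claimed.

```rust
use bootstrap_ht::prelude::*;
use rand::SeedableRng;
use rand_chacha::ChaCha8Rng;

let a = [94.0f32, 197., 16., 38., 99., 141., 23.];
let a_mean = a.iter().sum::<f32>() / a.len() as f32; // ~86.9

let mut rng = ChaCha8Rng::seed_from_u64(42);

// studentized distance of a resample's mean to the empirical mean of `a`
let test_statistic_fn = |s: &[f32]| {
    let m = s.iter().sum::<f32>() / s.len() as f32;
    let sd = (s.iter().map(|x| (x - m).powi(2)).sum::<f32>() / (s.len() - 1) as f32).sqrt();
    (m - a_mean) / (sd / (s.len() as f32).sqrt())
};

// reference statistic: studentized distance of the empirical mean from the H0 mean, 129
let a_std = (a.iter().map(|x| (x - a_mean).powi(2)).sum::<f32>() / (a.len() - 1) as f32).sqrt();
let test_stat = (a_mean - 129.) / (a_std / (a.len() as f32).sqrt());

// note: `a` is passed unshifted here
let p_value = bootstrap::one_sample_non_parametric_ht(
    &mut rng,
    &a,
    test_stat,
    test_statistic_fn,
    bootstrap::PValueType::OneSidedLeftTail,
    1_000,
)
.unwrap();

// only a sanity check; the exact value depends on the resampling
assert!(p_value >= 0.0 && p_value <= 1.0);
```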
