Skip to content

Commit

Permalink
FT: #60 More efficient replications in confint simulation for vignett…
Browse files Browse the repository at this point in the history
…e 3.1
  • Loading branch information
shamindras committed Apr 18, 2021
1 parent 8fccca0 commit bf96189
Showing 1 changed file with 133 additions and 0 deletions.
133 changes: 133 additions & 0 deletions R/scripts_and_filters/experiments/experiment-vignette3.1-SS-2.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# Setup Libraries ---------------------------------------------------------
library(tidyverse)
library(progress)
library(glue)

devtools::document()
devtools::load_all()

# Global variables --------------------------------------------------------
set.seed(35542)
NUM_COVG_REPS <- 200 # Number of coverage replications
NUM_OBS <- 300

# Create grid sequences ---------------------------------------------------
# For empirical bootstrap parameters i.e. B*m
# Create individual sequences of grid values to cross join
# TODO: Need to add replace = {TRUE/FALSE} values once we add subsampling
out_crossing <- purrr::cross2(
.x = seq(from = 5, to = 35, by = 10),
.y = seq(from = 40, to = 120, by = 40),
.filter = `==`
) %>%
purrr::map(purrr::set_names, c("B", "m"))

# Check output for one of the lists we have constructed
out_crossing[[10]]
out_crossing[[10]]$B
out_crossing[[10]]$m

#' Given fitted linear model i.e. `mod_fit`
#' For an individual coverage replication index i.e. `covg_rep_idx`
#' For an individual [B, m] empirical bootstrap combination i.e. `boot_emp`
#' Get the empirical bootstrap confidence interval for the fitted parameters
fit_ind_repl_ind_boot_emp <- function(covg_rep_idx, mod_fit, boot_emp) {
confint_fit <- comp_var(mod_fit = mod_fit, boot_emp = boot_emp) %>%
get_confint(mod_fit = ., level = 0.95, boot_emp = TRUE) %>%
dplyr::mutate(
covg_rep_idx = covg_rep_idx,
B = boot_emp[["B"]],
m = boot_emp[["m"]]
) %>%
dplyr::select(covg_rep_idx, B, m, dplyr::everything())
return(confint_fit)
}

fit_ind_repl_all_boot_emp <- function(covg_rep_idx, out_crossing) {
# TODO: Remove the print message later
print(glue::glue("Running coverage replication index: {covg_rep_idx}...\n"))

# For each coverage replication generate the data once
# For all B*m combinations
n <- NUM_OBS
X <- stats::rnorm(n, 0, 1)
y <- 2 + X * 1 + stats::rnorm(n, 0, 1)
lm_fit <- stats::lm(y ~ X)

# Using the single data generated for each coverage replication
# Get the empirical bootstrap confidence interval for each individual B*m
# combination, and combine all such combination output into a single
# tibble
out <- out_crossing %>%
purrr::map_dfr(
.x = .,
.f = ~ fit_ind_repl_ind_boot_emp(
covg_rep_idx = covg_rep_idx,
mod_fit = lm_fit,
boot_emp = .x
)
)
return(out)
}

fit_all_repl_all_boot_emp <- function(num_reps, out_crossing) {
out <- 1:num_reps %>%
purrr::map_dfr(
.x = .,
.f = ~ fit_ind_repl_all_boot_emp(
covg_rep_idx = .x,
out_crossing = out_crossing
)
)
return(out)
}

# Compute confidence intervals for N replications -------------------------
system.time(confint_replications <- fit_all_repl_all_boot_emp(
num_reps = NUM_COVG_REPS,
out_crossing = out_crossing
) %>%
dplyr::arrange(B, m, covg_rep_idx) %>%
dplyr::mutate(
term = forcats::as_factor(term),
B = forcats::as_factor(B),
m = forcats::as_factor(m)
))

# Check the output
head(confint_replications)

# Get combined confidence intervals ---------------------------------------
all_confint <- confint_replications %>%
# Add plotting code here
filter(stat_type == "conf.low" |
stat_type == "conf.high") %>%
pivot_wider(names_from = stat_type, values_from = stat_val)

all_confint_coverage <- all_confint %>%
inner_join(tibble(
term = c("(Intercept)", "X"),
value_par = c(2, 1)
),
by = "term"
) %>%
mutate(covers_c = ifelse(conf.low <= value_par & conf.high >= value_par, 1, 0)) %>%
group_by(term, B, m, var_type_abb) %>%
summarize(
coverage = mean(covers_c),
avg_width = mean(conf.high - conf.low)
)

# Coverage seems to be working here i.e. most of the values are in the 86-100% range
summary(all_confint_coverage$coverage)
hist(all_confint_coverage$coverage, breaks = 10, xlim = c(0, 1))

# TODO: Check if we should be plotting avg_width?
# Should this be coverage instead?
all_confint_coverage %>%
mutate(title = as.factor(glue::glue("m = {m}"))) %>%
ggplot(aes(B, avg_width)) +
geom_col() +
facet_grid(~title) +
labs(y = "Coverage") +
geom_hline(yintercept = 0.95, linetype = "dashed")

0 comments on commit bf96189

Please sign in to comment.