From bf9618994fae2b021bbd5f906a8b351ad1bc2c4f Mon Sep 17 00:00:00 2001
From: Shamindra Shrotriya <shamindra.shrotriya@gmail.com>
Date: Sun, 18 Apr 2021 16:56:11 -0400
Subject: [PATCH] FT: #60 More efficient replications in confint simulation for
 vignette 3.1

---
 .../experiments/experiment-vignette3.1-SS-2.R | 133 ++++++++++++++++++
 1 file changed, 133 insertions(+)
 create mode 100644 R/scripts_and_filters/experiments/experiment-vignette3.1-SS-2.R

diff --git a/R/scripts_and_filters/experiments/experiment-vignette3.1-SS-2.R b/R/scripts_and_filters/experiments/experiment-vignette3.1-SS-2.R
new file mode 100644
index 00000000..d19aefcc
--- /dev/null
+++ b/R/scripts_and_filters/experiments/experiment-vignette3.1-SS-2.R
@@ -0,0 +1,133 @@
+# Setup Libraries ---------------------------------------------------------
+library(tidyverse)
+library(progress)
+library(glue)
+
+devtools::document()
+devtools::load_all()
+
+# Global variables --------------------------------------------------------
+set.seed(35542)
+NUM_COVG_REPS <- 200 # Number of coverage replications
+NUM_OBS <- 300
+
+# Create grid sequences ---------------------------------------------------
+# For empirical bootstrap parameters i.e. B*m
+# Create individual sequences of grid values to cross join
+# TODO: Need to add replace = {TRUE/FALSE} values once we add subsampling
+out_crossing <- purrr::cross2(
+  .x = seq(from = 5, to = 35, by = 10),
+  .y = seq(from = 40, to = 120, by = 40),
+  .filter = `==`
+) %>%
+  purrr::map(purrr::set_names, c("B", "m"))
+
+# Check output for one of the lists we have constructed
+out_crossing[[10]]
+out_crossing[[10]]$B
+out_crossing[[10]]$m
+
+#' Given fitted linear model i.e. `mod_fit`
+#' For an individual coverage replication index i.e. `covg_rep_idx`
+#' For an individual [B, m] empirical bootstrap combination i.e. `boot_emp`
+#' Get the empirical bootstrap confidence interval for the fitted parameters
+fit_ind_repl_ind_boot_emp <- function(covg_rep_idx, mod_fit, boot_emp) {
+  confint_fit <- comp_var(mod_fit = mod_fit, boot_emp = boot_emp) %>%
+    get_confint(mod_fit = ., level = 0.95, boot_emp = TRUE) %>%
+    dplyr::mutate(
+      covg_rep_idx = covg_rep_idx,
+      B = boot_emp[["B"]],
+      m = boot_emp[["m"]]
+    ) %>%
+    dplyr::select(covg_rep_idx, B, m, dplyr::everything())
+  return(confint_fit)
+}
+
+fit_ind_repl_all_boot_emp <- function(covg_rep_idx, out_crossing) {
+  # TODO: Remove the print message later
+  print(glue::glue("Running coverage replication index: {covg_rep_idx}...\n"))
+
+  # For each coverage replication generate the data once
+  # For all B*m combinations
+  n <- NUM_OBS
+  X <- stats::rnorm(n, 0, 1)
+  y <- 2 + X * 1 + stats::rnorm(n, 0, 1)
+  lm_fit <- stats::lm(y ~ X)
+
+  # Using the single data generated for each coverage replication
+  # Get the empirical bootstrap confidence interval for each individual B*m
+  # combination, and combine all such combination output into a single
+  # tibble
+  out <- out_crossing %>%
+    purrr::map_dfr(
+      .x = .,
+      .f = ~ fit_ind_repl_ind_boot_emp(
+        covg_rep_idx = covg_rep_idx,
+        mod_fit = lm_fit,
+        boot_emp = .x
+      )
+    )
+  return(out)
+}
+
+fit_all_repl_all_boot_emp <- function(num_reps, out_crossing) {
+  out <- 1:num_reps %>%
+    purrr::map_dfr(
+      .x = .,
+      .f = ~ fit_ind_repl_all_boot_emp(
+        covg_rep_idx = .x,
+        out_crossing = out_crossing
+      )
+    )
+  return(out)
+}
+
+# Compute confidence intervals for N replications -------------------------
+system.time(confint_replications <- fit_all_repl_all_boot_emp(
+  num_reps = NUM_COVG_REPS,
+  out_crossing = out_crossing
+) %>%
+  dplyr::arrange(B, m, covg_rep_idx) %>%
+  dplyr::mutate(
+    term = forcats::as_factor(term),
+    B = forcats::as_factor(B),
+    m = forcats::as_factor(m)
+  ))
+
+# Check the output
+head(confint_replications)
+
+# Get combined confidence intervals ---------------------------------------
+all_confint <- confint_replications %>%
+  # Add plotting code here
+  filter(stat_type == "conf.low" |
+    stat_type == "conf.high") %>%
+  pivot_wider(names_from = stat_type, values_from = stat_val)
+
+all_confint_coverage <- all_confint %>%
+  inner_join(tibble(
+    term = c("(Intercept)", "X"),
+    value_par = c(2, 1)
+  ),
+  by = "term"
+  ) %>%
+  mutate(covers_c = ifelse(conf.low <= value_par & conf.high >= value_par, 1, 0)) %>%
+  group_by(term, B, m, var_type_abb) %>%
+  summarize(
+    coverage = mean(covers_c),
+    avg_width = mean(conf.high - conf.low)
+  )
+
+# Coverage seems to be working here i.e. most of the values are in the 86-100% range
+summary(all_confint_coverage$coverage)
+hist(all_confint_coverage$coverage, breaks = 10, xlim = c(0, 1))
+
+# TODO: Check if we should be plotting avg_width?
+# Should this be coverage instead?
+all_confint_coverage %>%
+  mutate(title = as.factor(glue::glue("m = {m}"))) %>%
+  ggplot(aes(B, avg_width)) +
+  geom_col() +
+  facet_grid(~title) +
+  labs(y = "Coverage") +
+  geom_hline(yintercept = 0.95, linetype = "dashed")