FT: #62 more prototyping for vignette 3.1

shamindras · Apr 18, 2021 · 8fccca0 · 8fccca0
1 parent caa985f
commit 8fccca0
Show file tree

Hide file tree

Showing 3 changed files with 133 additions and 94 deletions.
diff --git a/R/scripts_and_filters/experiments/experiment-lincom.R b/R/scripts_and_filters/experiments/experiment-lincom.R
@@ -64,7 +64,8 @@ lincom <- function(coef_est, R, r, cov_mat){
 # https://www.stata.com/manuals/rtest.pdf page 21
 
 
-get_lincomtest <- function(mod_fit, R,
+get_lincomtest <- function(mod_fit,
+                           R,
                    r,
                    sand = NULL,
                    boot_emp = NULL,
@@ -106,6 +107,10 @@ get_lincomtest <- function(mod_fit, R,
 get_lincomtest(mms_var,
                R = matrix(c(1,0,0,1),ncol = 2),
                r = c(0,0))
+get_lincomtest(mms_var,
+               R = matrix(c(1,0),ncol = 2),
+               r = 0)
+
 # test
 
 
@@ -125,9 +130,6 @@ diag(length(coef(lm_fit))) %>%
           function(x) lincom(coef_est = coef(lm_fit), R = x, r = c(0), cov_mat = vcov(lm_fit)))
 
 
-
-
-
 car::linearHypothesis(lm_fit, c(0,1,0), test=c("Chisq"))
 lmtest::coeftest(lm_fit, sandwich::sandwich(lm_fit))
 lmtest::waldtest(lm_fit)

diff --git a/R/scripts_and_filters/experiments/experiment-vignette-2.2.R b/R/scripts_and_filters/experiments/experiment-vignette-2.2.R
@@ -16,19 +16,19 @@ beta2_grid <- c(0, 0.1, 0.3, 0.5)
 
 
 # note that the parameters of the linear pojection are
-get_proj_params <- function(beta0, beta1, beta2){
-    first_term <- solve(matrix(c(1,5,5,100/3), ncol = 2))
-    second_term <- beta0 * c(1,5) + beta1 * c(5,100/3) + beta2 * c(100/3, 250)
-    proj_beta <- first_term %*% second_term
-    return(tibble(c0 = proj_beta[1], c1 = proj_beta[2]))
+get_proj_params <- function(beta0, beta1, beta2) {
+  first_term <- solve(matrix(c(1, 5, 5, 100 / 3), ncol = 2))
+  second_term <- beta0 * c(1, 5) + beta1 * c(5, 100 / 3) + beta2 * c(100 / 3, 250)
+  proj_beta <- first_term %*% second_term
+  return(tibble(c0 = proj_beta[1], c1 = proj_beta[2]))
 }
 C_beta2 <- beta2_grid %>%
-    imap_dfr(~ bind_cols(tibble(beta2 = .x),
-        get_proj_params(beta0, beta1, .x)))
+  imap_dfr(~ bind_cols(
+    tibble(beta2 = .x),
+    get_proj_params(beta0, beta1, .x)
+  ))
 C_beta2 <- C_beta2 %>%
-    pivot_longer(cols = c(c0, c1), names_to = 'par', values_to = 'value_par')
-
-
+  pivot_longer(cols = c(c0, c1), names_to = "par", values_to = "value_par")
 
 # fit linear model on only one dataset ----
 
@@ -37,112 +37,122 @@ n_obs <- 1e2
 # generate data
 X <- runif(n_obs, 0, 10)
 Y_beta2 <- beta2_grid %>%
-    map(~ beta0 + beta1 * X + .x * X^2 + rnorm(n_obs, 0, 1)) %>%
-    setNames(beta2_grid)
+  map(~ beta0 + beta1 * X + .x * X^2 + rnorm(n_obs, 0, 1)) %>%
+  setNames(beta2_grid)
 
 # plot the data
 Y_beta2 %>%
-    imap( ~ tibble(X = X, Y = .x, beta2 = .y) %>%
-              ggplot(., aes(X,Y)) +
-              geom_point() +
-              labs(title = expression(beta2 ~ '=')) # TODO: Fix this
-          + geom_smooth(method='lm', formula = y ~ x)
-    ) %>%
-    patchwork::wrap_plots(., ncol = 3, nrow = 2)
+  imap(~ tibble(X = X, Y = .x, beta2 = .y) %>%
+    ggplot(., aes(X, Y)) +
+    geom_point() +
+    labs(title = expression(beta2 ~ "=")) # TODO: Fix this
+    +
+    geom_smooth(method = "lm", formula = y ~ x)) %>%
+  patchwork::wrap_plots(., ncol = 3, nrow = 2)
 
 
 
 
 # obtain coverage ----
 
-n_obs <- 5 * 1e1
+n_obs <- 5e2
 
 # function to obtain regression data
-get_data <- function(beta2_grid, n_obs){
-    X <- runif(n_obs, 0, 10)
-    Y_beta2 <- map(beta2_grid,
-            ~ beta0 + beta1 * X + .x * X^2 + rnorm(n_obs, 0, 1))
-
-    out <- tibble(
-        beta2 = beta2_grid,
-        regdata = map(Y_beta2, ~ tibble(Y = ., X = X))
-    )
-
-    return(out)
+get_data <- function(beta2_grid, n_obs) {
+  X <- runif(n_obs, 0, 10)
+  Y_beta2 <- map(
+    beta2_grid,
+    ~ beta0 + beta1 * X + .x * X^2 + rnorm(n_obs, 0, 1)
+  )
+
+  out <- tibble(
+    beta2 = beta2_grid,
+    regdata = map(Y_beta2, ~ tibble(Y = ., X = X))
+  )
+
+  return(out)
 }
 
 
 # obtain maars objects with variance estimates
 # for (to add) replications of the data
 var_maars_all <- 1:500 %>%
-    map_dfr( ~ get_data(beta2_grid) %>%
-  mutate(var_maars = map(
-    regdata,
-    ~ comp_var(
-      mod_fit = lm(Y ~ X, data = .x),
-      boot_emp = list(B = 100),
-      boot_mul = list(B = 100, weights_type = "rademacher"),
-      boot_res = list(B = 100)
-    )
-  )),
-  .id = "iter")
+  map_dfr(~ get_data(beta2_grid) %>%
+    mutate(var_maars = map(
+      regdata,
+      ~ comp_var(
+        mod_fit = lm(Y ~ X, data = .x),
+        boot_emp = list(B = 100),
+        boot_mul = list(B = 100, weights_type = "rademacher"),
+        boot_res = list(B = 100)
+      )
+    )),
+  .id = "iter"
+  )
 
 
 # obtain confidence intervals and plot coverage ----
 
 # obtain confidence intervals for each estimate
 ci_all <- var_maars_all %>%
-    mutate(ci = map(var_maars, ~ get_confint(.) %>%
-                        filter(stat_type == "conf.low" |
-                                   stat_type == "conf.high") %>%
-                        pivot_wider(names_from = stat_type, values_from = stat_val) %>%
-                        mutate(term = case_when(
-                            term == '(Intercept)' ~ 'c0',
-                            term == 'X' ~ 'c1'
-                        ))))
+  mutate(ci = map(var_maars, ~ get_confint(.) %>%
+    filter(stat_type == "conf.low" |
+      stat_type == "conf.high") %>%
+    pivot_wider(names_from = stat_type, values_from = stat_val) %>%
+    mutate(term = case_when(
+      term == "(Intercept)" ~ "c0",
+      term == "X" ~ "c1"
+    ))))
 
 glimpse(ci_all)
 
 # obtain average width of the interval
 avg_width_beta2 <- ci_all %>%
-    unnest(ci) %>%
-    group_by(term, beta2, var_type_abb) %>%
-    summarise(avg_width = mean(conf.high-conf.low))
+  unnest(ci) %>%
+  group_by(term, beta2, var_type_abb) %>%
+  summarise(avg_width = mean(conf.high - conf.low))
 avg_width_beta2
 
 # TODO: add "beta2=" to the panels titles
 avg_width_beta2 %>%
-    ggplot(aes(factor(var_type_abb, levels = c('lm', 'res', 'sand', 'emp', 'mul')),
-               avg_width)) +
-    geom_col() +
-    facet_grid(term ~ beta2, scales = 'free_y') +
-    labs(x = 'Type of estimator', y = 'Average width of confidence intervals')
+  ggplot(aes(
+    factor(var_type_abb, levels = c("lm", "res", "sand", "emp", "mul")),
+    avg_width
+  )) +
+  geom_col() +
+  facet_grid(term ~ beta2, scales = "free_y") +
+  labs(x = "Type of estimator", y = "Average width of confidence intervals")
 
 
 # obtain coverage
 coverage_proj <- ci_all %>%
-    unnest(ci) %>%
-    inner_join(C_beta2, by = c('beta2', 'term' = 'par')) %>%
-    mutate(covers_c = ifelse(conf.low <= value_par & conf.high >= value_par, 1, 0)) %>%
-    group_by(term, beta2, var_type_abb) %>%
-    summarise(coverage = mean(covers_c))
+  unnest(ci) %>%
+  inner_join(C_beta2, by = c("beta2", "term" = "par")) %>%
+  mutate(covers_c = ifelse(conf.low <= value_par & conf.high >= value_par, 1, 0)) %>%
+  group_by(term, beta2, var_type_abb) %>%
+  summarise(coverage = mean(covers_c))
 
 coverage_proj
 
 # plot coverage
 coverage_proj %>%
-    ggplot(aes(x = factor(var_type_abb,
-                      levels = c('lm', 'res', 'sand', 'emp', 'mul')),
-               y = coverage,
-               fill = factor(var_type_abb,
-                             levels = c('lm', 'res', 'sand', 'emp', 'mul')))) +
-    geom_col() +
-    facet_grid(term ~ beta2, scales = 'free_y') +
-    theme(axis.ticks.x = element_blank(),
-          axis.text.x = element_blank()) +
-    labs(x = '', y = 'Coverage', fill = 'Estimator') +
-    geom_hline(yintercept = 0.95, linetype = 'dashed')
-
+  ggplot(aes(
+    x = factor(var_type_abb,
+      levels = c("lm", "res", "sand", "emp", "mul")
+    ),
+    y = coverage,
+    fill = factor(var_type_abb,
+      levels = c("lm", "res", "sand", "emp", "mul")
+    )
+  )) +
+  geom_col() +
+  facet_grid(term ~ beta2, scales = "free_y") +
+  theme(
+    axis.ticks.x = element_blank(),
+    axis.text.x = element_blank()
+  ) +
+  labs(x = "", y = "Coverage", fill = "Estimator") +
+  geom_hline(yintercept = 0.95, linetype = "dashed")
 
 
 

diff --git a/R/scripts_and_filters/experiments/experiment-vignette3.1-SS.R b/R/scripts_and_filters/experiments/experiment-vignette3.1-SS.R
@@ -10,15 +10,15 @@ set.seed(35542)
 NUM_COVG_REPS <- 10 # Number of coverage replications
 
 # Setup progress bar ------------------------------------------------------
-pb <- progress_bar$new(
-  format = "running replications [:bar] :elapsedfull",
-  total = NUM_COVG_REPS, clear = FALSE, width = 60
-)
+# pb <- progress_bar$new(
+#   format = "running replications [:bar] :elapsedfull",
+#   total = NUM_COVG_REPS, clear = FALSE, width = 60
+# )
 
 # Run lm fitted model -----------------------------------------------------
 # Fit the linear model using OLS (ordinary least squares)
 fit_confint_boot_emp <- function(covg_rep_idx, boot_emp) {
-  n <- 500
+  n <- 1e2 # TOO LARGE - DATA CHANGES ALL THE TIMES
   X <- stats::rnorm(n, 0, 1)
   y <- 2 + X * 1 + stats::rnorm(n, 0, 1)
   lm_fit <- stats::lm(y ~ X)
@@ -35,7 +35,7 @@ fit_confint_boot_emp <- function(covg_rep_idx, boot_emp) {
 
 # Wrapper function to
 wrap_fit_confint_boot_emp <- function(num_reps, boot_emp) {
-  pb$tick()
+  #pb$tick()
   out <- 1:num_reps %>%
     purrr::imap_dfr(.x = ., .f = ~ fit_confint_boot_emp(
       covg_rep_idx = .y,
@@ -55,17 +55,11 @@ wrap_fit_confint_boot_emp(num_reps = 5, boot_emp = list(B = 20, m = 200))
 # Create individual sequences of grid values to cross join
 # TODO: Need to add replace = {TRUE/FALSE} values once we add subsampling
 
-# TODO: This version is using rowwise, we can simplify this with map2 below
-# out_crossing <- tidyr::crossing(B = seq(from = 5, to = 35, by = 10),
-#                                 m = seq(from = 40, to = 120, by = 40)) %>%
-#   dplyr::rowwise(data = .) %>%
-#   dplyr::mutate(boot_emp = list(tibble::lst("B" = B,
-#                                             "m" = switch(!is.na(m), m, NULL))))
-
 out_crossing <- tidyr::crossing(
   B = seq(from = 5, to = 35, by = 10),
   m = seq(from = 40, to = 120, by = 40)
 ) %>%
+  # THIS IS PROBABLY NOT NEEDED
   dplyr::mutate(boot_emp = purrr::map2(
     .x = B, .y = m,
     .f = ~ list(B = .x, m = .y)
@@ -103,10 +97,32 @@ confint_replications <-
 head(confint_replications)
 
 # Get combined confidence intervals ---------------------------------------
-confint_replications %>%
+confint <- confint_replications %>%
   dplyr::pull(cmp_var) %>%
-  purrr::map_dfr(.x = ., .f = ~.x)
+  purrr::map_dfr(.x = ., .f = ~.x) %>%
   # Add plotting code here
+  filter(stat_type == "conf.low" |
+           stat_type == "conf.high") %>%
+  pivot_wider(names_from = stat_type, values_from = stat_val)
+
+
+
+coverage <- confint %>%
+  inner_join(tibble(term = c('(Intercept)', 'X'), value_par = c(2, 1)), by = 'term') %>%
+  mutate(covers_c = ifelse(conf.low <= value_par & conf.high >= value_par, 1, 0)) %>%
+  group_by(term, B, m, var_type_abb) %>%
+  summarise(coverage = mean(covers_c),
+          avg_width = mean(conf.high-conf.low))
+
+coverage %>%
+  mutate(title = as.factor(glue::glue('m = {m}'))) %>%
+  ggplot(aes(B, avg_width)) +
+  geom_col() +
+  facet_grid(~ title) +
+  labs(y = "Coverage") +
+  geom_hline(yintercept = 0.95, linetype = "dashed")
+
+
 
 # TODO: Discuss
 # 1. Can we filter the tibble from `fit_confint_boot_emp` to only keep the
@@ -119,3 +135,14 @@ confint_replications %>%
 #    or the NUM_COVG_REPS? We should find a reasonable combination that
 #    allows the key ideas to be illustrated and for the code to be more
 #    efficiently run
+
+
+
+
+
+
+
+
+
+
+