Skip to content

Commit

Permalink
FT: #62 more prototyping for vignette 3.1
Browse files Browse the repository at this point in the history
  • Loading branch information
ricfog committed Apr 18, 2021
1 parent caa985f commit 8fccca0
Show file tree
Hide file tree
Showing 3 changed files with 133 additions and 94 deletions.
10 changes: 6 additions & 4 deletions R/scripts_and_filters/experiments/experiment-lincom.R
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ lincom <- function(coef_est, R, r, cov_mat){
# https://www.stata.com/manuals/rtest.pdf page 21


get_lincomtest <- function(mod_fit, R,
get_lincomtest <- function(mod_fit,
R,
r,
sand = NULL,
boot_emp = NULL,
Expand Down Expand Up @@ -106,6 +107,10 @@ get_lincomtest <- function(mod_fit, R,
get_lincomtest(mms_var,
R = matrix(c(1,0,0,1),ncol = 2),
r = c(0,0))
get_lincomtest(mms_var,
R = matrix(c(1,0),ncol = 2),
r = 0)

# test


Expand All @@ -125,9 +130,6 @@ diag(length(coef(lm_fit))) %>%
function(x) lincom(coef_est = coef(lm_fit), R = x, r = c(0), cov_mat = vcov(lm_fit)))





car::linearHypothesis(lm_fit, c(0,1,0), test=c("Chisq"))
lmtest::coeftest(lm_fit, sandwich::sandwich(lm_fit))
lmtest::waldtest(lm_fit)
Expand Down
160 changes: 85 additions & 75 deletions R/scripts_and_filters/experiments/experiment-vignette-2.2.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,19 @@ beta2_grid <- c(0, 0.1, 0.3, 0.5)


# note that the parameters of the linear pojection are
get_proj_params <- function(beta0, beta1, beta2){
first_term <- solve(matrix(c(1,5,5,100/3), ncol = 2))
second_term <- beta0 * c(1,5) + beta1 * c(5,100/3) + beta2 * c(100/3, 250)
proj_beta <- first_term %*% second_term
return(tibble(c0 = proj_beta[1], c1 = proj_beta[2]))
get_proj_params <- function(beta0, beta1, beta2) {
first_term <- solve(matrix(c(1, 5, 5, 100 / 3), ncol = 2))
second_term <- beta0 * c(1, 5) + beta1 * c(5, 100 / 3) + beta2 * c(100 / 3, 250)
proj_beta <- first_term %*% second_term
return(tibble(c0 = proj_beta[1], c1 = proj_beta[2]))
}
C_beta2 <- beta2_grid %>%
imap_dfr(~ bind_cols(tibble(beta2 = .x),
get_proj_params(beta0, beta1, .x)))
imap_dfr(~ bind_cols(
tibble(beta2 = .x),
get_proj_params(beta0, beta1, .x)
))
C_beta2 <- C_beta2 %>%
pivot_longer(cols = c(c0, c1), names_to = 'par', values_to = 'value_par')


pivot_longer(cols = c(c0, c1), names_to = "par", values_to = "value_par")

# fit linear model on only one dataset ----

Expand All @@ -37,112 +37,122 @@ n_obs <- 1e2
# generate data
X <- runif(n_obs, 0, 10)
Y_beta2 <- beta2_grid %>%
map(~ beta0 + beta1 * X + .x * X^2 + rnorm(n_obs, 0, 1)) %>%
setNames(beta2_grid)
map(~ beta0 + beta1 * X + .x * X^2 + rnorm(n_obs, 0, 1)) %>%
setNames(beta2_grid)

# plot the data
Y_beta2 %>%
imap( ~ tibble(X = X, Y = .x, beta2 = .y) %>%
ggplot(., aes(X,Y)) +
geom_point() +
labs(title = expression(beta2 ~ '=')) # TODO: Fix this
+ geom_smooth(method='lm', formula = y ~ x)
) %>%
patchwork::wrap_plots(., ncol = 3, nrow = 2)
imap(~ tibble(X = X, Y = .x, beta2 = .y) %>%
ggplot(., aes(X, Y)) +
geom_point() +
labs(title = expression(beta2 ~ "=")) # TODO: Fix this
+
geom_smooth(method = "lm", formula = y ~ x)) %>%
patchwork::wrap_plots(., ncol = 3, nrow = 2)




# obtain coverage ----

n_obs <- 5 * 1e1
n_obs <- 5e2

# function to obtain regression data
get_data <- function(beta2_grid, n_obs){
X <- runif(n_obs, 0, 10)
Y_beta2 <- map(beta2_grid,
~ beta0 + beta1 * X + .x * X^2 + rnorm(n_obs, 0, 1))

out <- tibble(
beta2 = beta2_grid,
regdata = map(Y_beta2, ~ tibble(Y = ., X = X))
)

return(out)
get_data <- function(beta2_grid, n_obs) {
X <- runif(n_obs, 0, 10)
Y_beta2 <- map(
beta2_grid,
~ beta0 + beta1 * X + .x * X^2 + rnorm(n_obs, 0, 1)
)

out <- tibble(
beta2 = beta2_grid,
regdata = map(Y_beta2, ~ tibble(Y = ., X = X))
)

return(out)
}


# obtain maars objects with variance estimates
# for (to add) replications of the data
var_maars_all <- 1:500 %>%
map_dfr( ~ get_data(beta2_grid) %>%
mutate(var_maars = map(
regdata,
~ comp_var(
mod_fit = lm(Y ~ X, data = .x),
boot_emp = list(B = 100),
boot_mul = list(B = 100, weights_type = "rademacher"),
boot_res = list(B = 100)
)
)),
.id = "iter")
map_dfr(~ get_data(beta2_grid) %>%
mutate(var_maars = map(
regdata,
~ comp_var(
mod_fit = lm(Y ~ X, data = .x),
boot_emp = list(B = 100),
boot_mul = list(B = 100, weights_type = "rademacher"),
boot_res = list(B = 100)
)
)),
.id = "iter"
)


# obtain confidence intervals and plot coverage ----

# obtain confidence intervals for each estimate
ci_all <- var_maars_all %>%
mutate(ci = map(var_maars, ~ get_confint(.) %>%
filter(stat_type == "conf.low" |
stat_type == "conf.high") %>%
pivot_wider(names_from = stat_type, values_from = stat_val) %>%
mutate(term = case_when(
term == '(Intercept)' ~ 'c0',
term == 'X' ~ 'c1'
))))
mutate(ci = map(var_maars, ~ get_confint(.) %>%
filter(stat_type == "conf.low" |
stat_type == "conf.high") %>%
pivot_wider(names_from = stat_type, values_from = stat_val) %>%
mutate(term = case_when(
term == "(Intercept)" ~ "c0",
term == "X" ~ "c1"
))))

glimpse(ci_all)

# obtain average width of the interval
avg_width_beta2 <- ci_all %>%
unnest(ci) %>%
group_by(term, beta2, var_type_abb) %>%
summarise(avg_width = mean(conf.high-conf.low))
unnest(ci) %>%
group_by(term, beta2, var_type_abb) %>%
summarise(avg_width = mean(conf.high - conf.low))
avg_width_beta2

# TODO: add "beta2=" to the panels titles
avg_width_beta2 %>%
ggplot(aes(factor(var_type_abb, levels = c('lm', 'res', 'sand', 'emp', 'mul')),
avg_width)) +
geom_col() +
facet_grid(term ~ beta2, scales = 'free_y') +
labs(x = 'Type of estimator', y = 'Average width of confidence intervals')
ggplot(aes(
factor(var_type_abb, levels = c("lm", "res", "sand", "emp", "mul")),
avg_width
)) +
geom_col() +
facet_grid(term ~ beta2, scales = "free_y") +
labs(x = "Type of estimator", y = "Average width of confidence intervals")


# obtain coverage
coverage_proj <- ci_all %>%
unnest(ci) %>%
inner_join(C_beta2, by = c('beta2', 'term' = 'par')) %>%
mutate(covers_c = ifelse(conf.low <= value_par & conf.high >= value_par, 1, 0)) %>%
group_by(term, beta2, var_type_abb) %>%
summarise(coverage = mean(covers_c))
unnest(ci) %>%
inner_join(C_beta2, by = c("beta2", "term" = "par")) %>%
mutate(covers_c = ifelse(conf.low <= value_par & conf.high >= value_par, 1, 0)) %>%
group_by(term, beta2, var_type_abb) %>%
summarise(coverage = mean(covers_c))

coverage_proj

# plot coverage
coverage_proj %>%
ggplot(aes(x = factor(var_type_abb,
levels = c('lm', 'res', 'sand', 'emp', 'mul')),
y = coverage,
fill = factor(var_type_abb,
levels = c('lm', 'res', 'sand', 'emp', 'mul')))) +
geom_col() +
facet_grid(term ~ beta2, scales = 'free_y') +
theme(axis.ticks.x = element_blank(),
axis.text.x = element_blank()) +
labs(x = '', y = 'Coverage', fill = 'Estimator') +
geom_hline(yintercept = 0.95, linetype = 'dashed')

ggplot(aes(
x = factor(var_type_abb,
levels = c("lm", "res", "sand", "emp", "mul")
),
y = coverage,
fill = factor(var_type_abb,
levels = c("lm", "res", "sand", "emp", "mul")
)
)) +
geom_col() +
facet_grid(term ~ beta2, scales = "free_y") +
theme(
axis.ticks.x = element_blank(),
axis.text.x = element_blank()
) +
labs(x = "", y = "Coverage", fill = "Estimator") +
geom_hline(yintercept = 0.95, linetype = "dashed")



Expand Down
57 changes: 42 additions & 15 deletions R/scripts_and_filters/experiments/experiment-vignette3.1-SS.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,15 @@ set.seed(35542)
NUM_COVG_REPS <- 10 # Number of coverage replications

# Setup progress bar ------------------------------------------------------
pb <- progress_bar$new(
format = "running replications [:bar] :elapsedfull",
total = NUM_COVG_REPS, clear = FALSE, width = 60
)
# pb <- progress_bar$new(
# format = "running replications [:bar] :elapsedfull",
# total = NUM_COVG_REPS, clear = FALSE, width = 60
# )

# Run lm fitted model -----------------------------------------------------
# Fit the linear model using OLS (ordinary least squares)
fit_confint_boot_emp <- function(covg_rep_idx, boot_emp) {
n <- 500
n <- 1e2 # TOO LARGE - DATA CHANGES ALL THE TIMES
X <- stats::rnorm(n, 0, 1)
y <- 2 + X * 1 + stats::rnorm(n, 0, 1)
lm_fit <- stats::lm(y ~ X)
Expand All @@ -35,7 +35,7 @@ fit_confint_boot_emp <- function(covg_rep_idx, boot_emp) {

# Wrapper function to
wrap_fit_confint_boot_emp <- function(num_reps, boot_emp) {
pb$tick()
#pb$tick()
out <- 1:num_reps %>%
purrr::imap_dfr(.x = ., .f = ~ fit_confint_boot_emp(
covg_rep_idx = .y,
Expand All @@ -55,17 +55,11 @@ wrap_fit_confint_boot_emp(num_reps = 5, boot_emp = list(B = 20, m = 200))
# Create individual sequences of grid values to cross join
# TODO: Need to add replace = {TRUE/FALSE} values once we add subsampling

# TODO: This version is using rowwise, we can simplify this with map2 below
# out_crossing <- tidyr::crossing(B = seq(from = 5, to = 35, by = 10),
# m = seq(from = 40, to = 120, by = 40)) %>%
# dplyr::rowwise(data = .) %>%
# dplyr::mutate(boot_emp = list(tibble::lst("B" = B,
# "m" = switch(!is.na(m), m, NULL))))

out_crossing <- tidyr::crossing(
B = seq(from = 5, to = 35, by = 10),
m = seq(from = 40, to = 120, by = 40)
) %>%
# THIS IS PROBABLY NOT NEEDED
dplyr::mutate(boot_emp = purrr::map2(
.x = B, .y = m,
.f = ~ list(B = .x, m = .y)
Expand Down Expand Up @@ -103,10 +97,32 @@ confint_replications <-
head(confint_replications)

# Get combined confidence intervals ---------------------------------------
confint_replications %>%
confint <- confint_replications %>%
dplyr::pull(cmp_var) %>%
purrr::map_dfr(.x = ., .f = ~.x)
purrr::map_dfr(.x = ., .f = ~.x) %>%
# Add plotting code here
filter(stat_type == "conf.low" |
stat_type == "conf.high") %>%
pivot_wider(names_from = stat_type, values_from = stat_val)



coverage <- confint %>%
inner_join(tibble(term = c('(Intercept)', 'X'), value_par = c(2, 1)), by = 'term') %>%
mutate(covers_c = ifelse(conf.low <= value_par & conf.high >= value_par, 1, 0)) %>%
group_by(term, B, m, var_type_abb) %>%
summarise(coverage = mean(covers_c),
avg_width = mean(conf.high-conf.low))

coverage %>%
mutate(title = as.factor(glue::glue('m = {m}'))) %>%
ggplot(aes(B, avg_width)) +
geom_col() +
facet_grid(~ title) +
labs(y = "Coverage") +
geom_hline(yintercept = 0.95, linetype = "dashed")



# TODO: Discuss
# 1. Can we filter the tibble from `fit_confint_boot_emp` to only keep the
Expand All @@ -119,3 +135,14 @@ confint_replications %>%
# or the NUM_COVG_REPS? We should find a reasonable combination that
# allows the key ideas to be illustrated and for the code to be more
# efficiently run











0 comments on commit 8fccca0

Please sign in to comment.