Merge pull request #61 from shamindras/iss-60-assertion-getassmps

shamindras · May 2, 2021 · bddb14d · bddb14d
2 parents 23d7747 + 5e792ba
commit bddb14d
Show file tree

Hide file tree

Showing 104 changed files with 2,817 additions and 294 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,15 +1,19 @@
 Package: maars
 Title: Tidy Inference Under Misspecified Statistical Models in R
-Version: 1.0.0
+Version: 1.1.0
 Authors@R: 
     c(person(given = "Shamindra",
              family = "Shrotriya",
-             role = c("aut", "cre"),
+             role = c("aut", "cre", "cph"),
              email = "[email protected]"),
       person(given = "Riccardo",
              family = "Fogliato",
-             role = c("aut"),
-             email = "[email protected]"))
+             role = c("aut", "cph"),
+             email = "[email protected]"),
+      person(given = "Arun Kumar",
+             family = "Kuchibhotla",
+             role = c("aut", "cph"),
+             email = "[email protected]"))
 Description: Tidy Inference Under Misspecified Statistical Models in R. An 
     implementation of the Models As Approximations series of statistics papers.
 License: MIT + file LICENSE

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,39 @@
+# maars 1.1.0
+
+- Change `Significance` heading in our summary output to be abbreviated to `Signif:` to be consistent with `lm()` output
+- Change column ordering for bootstrap standard errors and t-statistics rigorously
+- Change p-value format to be `2`/`3` digits
+- Clean `R/scripts_and_filters/experiments/` dir, remove old experiments
+- Clean up script that gets metadata of `maars` function **metadata** from `pkgdown`
+- Remove all `base::` prefix use e.g. `base::return()`
+- Change `dplyr::summarise` to `dplyr::summarize` for spelling consistency
+- Ensure all `stats` functions use the `stats::` prefix
+- Fix `.data` related `rlang` issues
+- We shouldn't name a variable `df` since this conflicts with `stats::df`. We should change this after the demo
+- Clean vignette. Leave the plot in the vignette (to be moved elsewhere)
+- Updated the corrected `Boston Housing` Dataset with citations. Add unit tests 
+for the corrections
+- Clean up spelling notes
+- Change multiplier weights code to use `switch` based approach
+- Switch to `model.matrix` in sandwich variance and use residuals in the computation
+- Update `maars` to have a package level doc. Add `@importFrom` statements
+- Fix `NOTE` by adding `.gitkeep` in vignettes to `.RBuildignore`
+- Fix `NOTE` by DESCRIPTION meta-information by making it a couple of sentences. This is a placeholder and we should refine it before official `CRAN` release.
+- Consolidate `boston-housing.R` and `la-county.R` files into a single `data-maars.R` files. Consolidate `test` files accordingly
+- Make some minor changes to the vignette
+- Add `styling` to our code using the `Makefile` and `styler::style_dir(here::here('R'))` 
+and for `tests`
+- Have a `make style` which does both `R` and `tests`
+- Make sure styling does not include vignettes
+- Ensure that `url_check()` are all resolved for `CRAN`
+- Remove DOI entries from `inst/REFERENCES.bib` since they can cause `CRAN` url issues
+- Add search functionality to `pkgdown` our site.
+- Use MIT License
+- Fix the no visible binding for global variable errors in our code
+- Remove mixture of `%>%` and base code, and just break pipes into variables
+- Change all `attr(obj, "class") <- c("obj_class_name")` to be of the form `class(obj) <- "obj_class_name"` for consistency
+- Change `dplyr::_all` scoped words using the superseded `across` function
+
 # maars 1.0.0
 
 - Set the default `digits = 3` formatting in `summary.maars_lm` printed output

diff --git a/R/boot-empirical.R b/R/boot-empirical.R
@@ -12,6 +12,7 @@
 #'   be drawn.
 #' @param m Number of observations to be sampled with replacement from
 #'   the original dataset for each bootstrap repetition.
+#' @param replace TODO: ADD
 #'
 #' @return A tibble containing the number of bootstrap iteration, the
 #'   bootstrap samples (\code{data}),
@@ -41,14 +42,15 @@
 #' }
 comp_boot_emp_samples <- function(data,
                                   B = 1,
-                                  m = NULL) {
+                                  m = NULL,
+                                  replace = TRUE) {
   n <- nrow(data)
   if (is.null(m)) {
     m <- n
   }
   # this is a simplified version of rsample because rsample currently
   # does not allow for sampling m!=n observations from the data
-  indices <- purrr::map(rep(n, B), sample, replace = TRUE, size = m)
+  indices <- purrr::map(rep(n, B), sample, replace = replace, size = m)
 
   out <- tibble::tibble(
     b = as.integer(paste0(1:length(indices))),
@@ -171,6 +173,7 @@ fit_reg <- function(mod_fit, data, weights = NULL) {
 #' @param B Bootstrap repetitions or number of bootstrap samples to be drawn.
 #' @param m Number of observations to be sampled with replacement from the
 #'   dataset for each bootstrap repetition.
+#' @param replace TODO: ADD
 #'
 #' @return A list containing the following elements.
 #'   \code{var_type}: The type of estimator for the variance of the coefficients
@@ -207,7 +210,7 @@ fit_reg <- function(mod_fit, data, weights = NULL) {
 #'
 #' print(out)
 #' }
-comp_boot_emp <- function(mod_fit, B = 100, m = NULL) {
+comp_boot_emp <- function(mod_fit, B = 100, m = NULL, replace = TRUE) {
   assertthat::assert_that(all("lm" == class(mod_fit)) | any("glm" == class(mod_fit)),
                           msg = glue::glue("mod_fit must only be of class lm or glm")
   )
@@ -219,20 +222,26 @@ comp_boot_emp <- function(mod_fit, B = 100, m = NULL) {
     m <- n
   }
 
+  assertthat::assert_that(replace || (!replace & m <= n),
+                            msg = glue::glue("m must be less or equal to n for subsampling")
+  )
+
+  boot_type <- ifelse(replace, 'emp', 'sub')
+
   boot_out <- lapply(
     1:B,
     function(x) {
       fit_reg(
         mod_fit = mod_fit,
-        data = comp_boot_emp_samples(data, B = 1, m)$data[[1]]
+        data = comp_boot_emp_samples(data, B = 1, m = m, replace = replace)$data[[1]]
       )
     }
   )
 
   cov_mat <- boot_out %>%
     purrr::map(~ .x %>% dplyr::pull(estimate)) %>%
     dplyr::bind_rows(data = ., .id = NULL) %>%
-    stats::cov(x = .) * m / n
+    stats::cov(x = .) * m / n # check in case of subsampling
 
   boot_out <- boot_out %>%
     dplyr::bind_rows(.id = "b") %>%
@@ -243,12 +252,12 @@ comp_boot_emp <- function(mod_fit, B = 100, m = NULL) {
   summary_boot <- get_boot_summary(
     mod_fit = mod_fit,
     boot_out = boot_out,
-    boot_type = "emp"
+    boot_type = boot_type # check if subsampling needs any modification
   )
 
   out <- list(
-    var_type = "boot_emp",
-    var_type_abb = "emp",
+    var_type = glue::glue("boot_{boot_type}"),
+    var_type_abb = boot_type,
     var_summary = summary_boot,
     var_assumptions = c(
       glue::glue("Observations are assumed to be independent",

diff --git a/R/maars-lm.R b/R/maars-lm.R
@@ -16,6 +16,7 @@
 #'   in \code{list(B = -15, m = -20)} will pass this function without errors,
 #'   these will be addressed explicitly in \code{\link{comp_boot_emp}} as
 #'   invalid inputs.
+#' @param boot_sub (list) TODO: ADD
 #' @param boot_mul (list) : In the case of multiplier bootstrap the expected
 #'   input is of the form \code{list(B = 10, weights_type = "rademacher")}.
 #'   Here the named element \code{weights_type} is optional
@@ -76,14 +77,19 @@
 #' }
 comp_var <- function(mod_fit,
                      boot_emp = NULL,
+                     boot_sub = NULL,
                      boot_res = NULL,
                      boot_mul = NULL) {
   # the following condition will need to be revised once we introduce glm
   if (all(c("maars_lm", "lm") %in% class(mod_fit))) {
     attr(mod_fit, "class") <- "lm"
   }
 
-  out_var <- comp_mms_var(mod_fit, boot_emp, boot_res, boot_mul)
+  out_var <- comp_mms_var(mod_fit = mod_fit,
+                          boot_emp = boot_emp,
+                          boot_sub = boot_sub,
+                          boot_res = boot_res,
+                          boot_mul = boot_mul)
   mod_fit[["var"]] <- out_var
 
   class(mod_fit) <- c("maars_lm", "lm")
@@ -165,6 +171,8 @@ get_mms_summary_split_cli <- function(title,
 #'   \code{FALSE} to exclude this output from the request.
 #' @param boot_emp (logical) : \code{TRUE} if empirical bootstrap standard error
 #'   output is required, \code{FALSE} to exclude this output from the request.
+#' @param boot_sub (logical) : \code{TRUE} if subsampling standard error
+#'   output is required, \code{FALSE} to exclude this output from the request.
 #' @param boot_res (logical) : \code{TRUE} if residual bootstrap standard error
 #'   output is required, \code{FALSE} to exclude this output from the request.
 #' @param boot_mul (logical) : \code{TRUE} if multiplier bootstrap standard error
@@ -178,11 +186,12 @@ get_mms_summary_split_cli <- function(title,
 #' @method summary maars_lm
 #' @export
 summary.maars_lm <- function(object,
-                             sand = FALSE,
-                             boot_emp = FALSE,
-                             boot_res = FALSE,
-                             boot_mul = FALSE,
-                             well_specified = FALSE,
+                             sand = NULL,
+                             boot_emp = NULL,
+                             boot_sub = NULL,
+                             boot_mul = NULL,
+                             boot_res = NULL,
+                             well_specified = NULL,
                              digits = 3,
                              ...) {
 
@@ -193,6 +202,7 @@ summary.maars_lm <- function(object,
     mod_fit = object,
     sand = sand,
     boot_emp = boot_emp,
+    boot_sub = boot_sub,
     boot_res = boot_res,
     boot_mul = boot_mul,
     well_specified = well_specified
@@ -598,6 +608,8 @@ get_mms_summary_confint_split_cli <- function(title,
 #'   \code{FALSE} to exclude this output from the request.
 #' @param boot_emp (logical) : \code{TRUE} if empirical bootstrap standard error
 #'   output is required, \code{FALSE} to exclude this output from the request.
+#' @param boot_sub (logical) : \code{TRUE} if subsampling standard error
+#'   output is required, \code{FALSE} to exclude this output from the request.
 #' @param boot_res (logical) : \code{TRUE} if residual bootstrap standard error
 #'   output is required, \code{FALSE} to exclude this output from the request.
 #' @param boot_mul (logical) : \code{TRUE} if multiplier bootstrap standard error
@@ -614,8 +626,10 @@ get_mms_summary_confint_split_cli <- function(title,
 confint.maars_lm <- function(object,
                              parm = NULL,
                              level = 0.95,
+                             # TODO: to be changed
                              sand = TRUE,
                              boot_emp = FALSE,
+                             boot_sub = FALSE,
                              boot_res = FALSE,
                              boot_mul = FALSE,
                              well_specified = FALSE,
@@ -640,6 +654,7 @@ confint.maars_lm <- function(object,
     mod_fit = object,
     sand = sand,
     boot_emp = boot_emp,
+    boot_sub = boot_sub,
     boot_res = boot_res,
     boot_mul = boot_mul,
     well_specified = well_specified

diff --git a/R/ols-confint.R b/R/ols-confint.R
@@ -15,6 +15,8 @@
 #'   \code{FALSE} to exclude this output from the request
 #' @param boot_emp (logical) : \code{TRUE} if empirical bootstrap standard error
 #'   output is required, \code{FALSE} to exclude this output from the request
+#' @param boot_sub (logical) : \code{TRUE} if subsampling standard error
+#'   output is required, \code{FALSE} to exclude this output from the request
 #' @param boot_res (logical) : \code{TRUE} if residual bootstrap standard error
 #'   output is required, \code{FALSE} to exclude this output from the request
 #' @param boot_mul (logical) : \code{TRUE} if multiplier bootstrap standard error
@@ -40,11 +42,12 @@
 get_confint <- function(mod_fit,
                         parm = NULL,
                         level = 0.95,
-                        sand = TRUE,
-                        boot_emp = FALSE,
-                        boot_mul = FALSE,
-                        boot_res = FALSE,
-                        well_specified = FALSE) {
+                        sand = NULL,
+                        boot_emp = NULL,
+                        boot_sub = NULL,
+                        boot_mul = NULL,
+                        boot_res = NULL,
+                        well_specified = NULL) {
 
   # Check parm is NULL valued
   # TODO: Allow this to be a vector of numbers or a vector of names to filter
@@ -65,6 +68,7 @@ get_confint <- function(mod_fit,
     mod_fit = mod_fit,
     sand = sand,
     boot_emp = boot_emp,
+    boot_sub = boot_sub,
     boot_res = boot_res,
     boot_mul = boot_mul,
     well_specified = well_specified

diff --git a/R/ols-sandwich-lm-var.R b/R/ols-sandwich-lm-var.R
@@ -141,7 +141,7 @@ comp_lm_var <- function(mod_fit) {
     var_type_abb = "lm",
     var_summary = summary_lm,
     var_assumptions = assumptions_lm,
-    cov_mat = NULL
+    cov_mat = vcov(mod_fit)
   )
 
   return(out)