add more unit tests small fixes regarding estimation with pop totals

ncn-foreigners · Oct 4, 2024 · c3a4d53 · c3a4d53
1 parent 3a13951
commit c3a4d53
Show file tree

Hide file tree

Showing 13 changed files with 981 additions and 142 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -45,7 +45,8 @@ Imports:
     nleqslv,
     doParallel,
     foreach,
-    parallel
+    parallel,
+    formula.tools
 LinkingTo: 
     Rcpp,
     RcppArmadillo

diff --git a/NAMESPACE b/NAMESPACE
@@ -34,15 +34,16 @@ importFrom(Rcpp,sourceCpp)
 importFrom(doParallel,registerDoParallel)
 importFrom(foreach,"%dopar%")
 importFrom(foreach,foreach)
-importFrom(formula.tools,lhs)
-importFrom(formula.tools,rhs)
+importFrom(formula.tools,lhs.vars)
+importFrom(formula.tools,rhs.vars)
 importFrom(maxLik,maxLik)
 importFrom(ncvreg,cv.ncvreg)
 importFrom(nleqslv,nleqslv)
 importFrom(parallel,makeCluster)
 importFrom(parallel,stopCluster)
 importFrom(stats,AIC)
 importFrom(stats,BIC)
+importFrom(stats,aggregate)
 importFrom(stats,as.formula)
 importFrom(stats,binomial)
 importFrom(stats,coef)

diff --git a/R/internals.R b/R/internals.R
@@ -151,20 +151,29 @@ get_method <- function(method) {
 }
 
 ff <- function(formula) {
-  fff <- as.character(formula)
-  f <- strsplit(fff[2], "\\s*\\+\\s*")[[1]]
-  outcome_formulas <- list()
-  if (any(duplicated(f))) {
-    warning("No unique names of the outcome variables in formula. The error has been corrected")
-    f <- unique(f)
-  }
-  l <- length(f)
-  for (i in 1:l) {
-    outcome_formulas[[i]] <- as.formula(paste(f[i], fff[3], sep = " ~ "))
+  formula_string <- paste(deparse(formula), collapse = " ")
+  formula_parts <- strsplit(formula_string, "~")[[1]]
+  if (length(formula_parts) != 2) {
+    stop("The formula must contain exactly one '~' operator.")
   }
 
+  lhs <- trimws(formula_parts[1])
+  rhs <- trimws(formula_parts[2])
+
+  dependent_vars <- strsplit(lhs, "\\s*\\+\\s*")[[1]]
+  independent_vars <- strsplit(rhs, "\\s*\\+\\s*")[[1]]
+
+  if (any(duplicated(dependent_vars))) {
+    warning("Duplicate dependent variable names detected. They have been made unique.")
+    dependent_vars <- unique(dependent_vars)
+  }
+  outcome_formulas <- vector("list", length(dependent_vars))
+  l <- length(dependent_vars)
+  for (i in seq_along(dependent_vars)) {
+    outcome_formulas[[i]] <- reformulate(termlabels = independent_vars, response = dependent_vars[i])
+  }
   list(
-    f = f,
+    f = dependent_vars,
     outcomes = outcome_formulas,
     l = l
   )

diff --git a/R/nonprobDR.R b/R/nonprobDR.R
@@ -521,6 +521,9 @@ nonprobDR <- function(selection,
       X_rand <- OutcomeModel$X_rand
       nons_names <- OutcomeModel$nons_names
       y_nons <- OutcomeModel$y_nons
+      n_nons <- nrow(X_nons)
+      R <- rep(1, n_nons)
+      loc_nons <- which(R == 1)
 
       if (var_selection == TRUE) {
         nlambda <- control_outcome$nlambda
@@ -559,17 +562,19 @@ nonprobDR <- function(selection,
           colnames(X) <- c("(Intercept)", colnames(Xsel))
           OutcomeModel$X_nons <- SelectionModel$X_nons <- X[loc_nons, ]
           SelectionModel$pop_totals <- c(SelectionModel$pop_totals[1], SelectionModel$pop_totals[idx + 1])
+          pop_totals <- c(pop_totals[1], pop_totals[idx+1])
         } else if (control_inference$bias_inf == "div") {
           X_outcome <- as.matrix(X[, beta_selected[-1] + 1, drop = FALSE])
           Xsel <- X_selection <- as.matrix(X[, theta_selected[-1] + 1, drop = FALSE])
           OutcomeModel$X_nons <- cbind(1, X_outcome[loc_nons, ])
-          OutcomeModel$X_rand <- cbind(1, X_outcome[loc_rand, ])
-          colnames(OutcomeModel$X_nons) <- colnames(OutcomeModel$X_rand) <- c("(Intercept)", colnames(X_outcome))
+          colnames(OutcomeModel$X_nons) <- c("(Intercept)", colnames(X_outcome))
           SelectionModel$pop_totals <- c(SelectionModel$pop_totals[1], SelectionModel$pop_totals[theta_selected[-1] + 1])
           SelectionModel$X_nons <- cbind(1, X_selection[loc_nons, ])
           X <- cbind(1, Xsel)
           colnames(X) <- colnames(SelectionModel$X_nons) <- c("(Intercept)", colnames(Xsel))
+          pop_totals <- c(pop_totals[1], pop_totals[theta_selected[-1] + 1])
         }
+        SelectionModel$total_names <- names(SelectionModel$pop_totals)
       }
       prob_pop_totals <- SelectionModel$pop_totals
 

diff --git a/R/nonprobMI.R b/R/nonprobMI.R
@@ -268,6 +268,7 @@ nonprobMI <- function(outcome,
       n_rand <- 0
       weights_rand <- NULL
       n_nons <- nrow(X_nons)
+      R <- rep(1, n_nons)
 
       ############## WORKING VERSION
       if (var_selection == TRUE) {
@@ -316,7 +317,7 @@ nonprobMI <- function(outcome,
         control = control_outcome,
         n_nons = n_nons,
         n_rand = n_rand,
-        model_frame = OutcomeModel$model_frame_rand,
+        model_frame = Model$model_frame_rand,
         vars_selection = control_inference$vars_selection,
         pop_totals = pop_totals
       )

diff --git a/R/simple_methods.R b/R/simple_methods.R
@@ -328,10 +328,10 @@ deviance.nonprobsvy <- function(object,
 #'
 #' @method total nonprobsvy
 #' @return A vector of estimated totals of the given covariates in subgroups
-#' @importFrom formula.tools lhs
-#' @importFrom formula.tools rhs
+#' @importFrom formula.tools lhs.vars
+#' @importFrom formula.tools rhs.vars
+#' @importFrom stats aggregate
 total.nonprobsvy <- function(x, nonprob) {
-  # Rozbijanie formuły
   variables <- lhs.vars(x)
   groups <- rhs.vars(x)
 
@@ -353,8 +353,9 @@ total.nonprobsvy <- function(x, nonprob) {
 #'
 #' @method mean nonprobsvy
 #' @return A vector of estimated means of the given covariates in subgroups
-#' @importFrom formula.tools lhs
-#' @importFrom formula.tools rhs
+#' @importFrom formula.tools lhs.vars
+#' @importFrom formula.tools rhs.vars
+#' @importFrom stats aggregate
 mean.nonprobsvy <- function(x, nonprob) {
   variables <- lhs.vars(x)
   groups <- rhs.vars(x)
@@ -372,58 +373,64 @@ mean.nonprobsvy <- function(x, nonprob) {
   }
   mean_value
 }
-#' @title Median values of covariates in subgroups
-#'
-#' @param x - formula
-#' @param nonprob - object of nonprobsvy class
-#'
-#' @method median nonprobsvy
-#' @return A vector of estimated medians of the given covariates in subgroups
-#' @importFrom formula.tools lhs
-#' @importFrom formula.tools rhs
-median.nonprobsvy <- function(x, nonprob) {
-  variables <- lhs.vars(x)
-  groups <- rhs.vars(x)
-
-  if (is.null(variables)) {
-    data <- nonprob$data[which(nonprob$R == 1), groups, drop = FALSE]
-    median_value <- tapply(nonprob$weights[which(nonprob$R == 1)], data, median)
-  } else {
-
-    data <- nonprob$data[which(nonprob$R == 1), c(variables, groups)]
-    weights <- nonprob$weights[which(nonprob$R == 1)]
-
-    median_value <- aggregate(data[, variables], by = data[, groups, drop = FALSE],
-                              FUN = function(y, w) weighted.median(y, w = w[1:length(y)]),
-                              w = weights)
-  }
-  median_value
-}
-#' @title Quantile values of covariates in subgroups
-#'
-#' @param x - formula
-#' @param nonprob - object of nonprobsvy class
-#' @param probs - probability
-#'
-#' @method quantile nonprobsvy
-#' @return A vector of estimated quantiles if the given covariates in subgroups
-#' @importFrom formula.tools lhs
-#' @importFrom formula.tools rhs
-quantile.nonprobsvy <- function(x, nonprob, probs = 0.5) {
-  variables <- lhs.vars(x)
-  groups <- rhs.vars(x)
-
-  if (is.null(variables)) {
-    data <- nonprob$data[which(nonprob$R == 1), groups, drop = FALSE]
-    percentile_value <- tapply(nonprob$weights[which(nonprob$R == 1)], data, function(y) quantile(y, probs = probs))
-  } else {
-
-    data <- nonprob$data[which(nonprob$R == 1), c(variables, groups)]
-    weights <- nonprob$weights[which(nonprob$R == 1)]
-
-    percentile_value <- aggregate(data[, variables], by = data[, groups, drop = FALSE],
-                                  FUN = function(y, w) weighted.quantile(y, w = w[1:length(y)], probs = probs),
-                                  w = weights)
-  }
-  percentile_value
-}
+# @title Median values of covariates in subgroups
+#
+# @param x - formula
+# @param nonprob - object of nonprobsvy class
+#
+# @method median nonprobsvy
+# @return A vector of estimated medians of the given covariates in subgroups
+# @importFrom formula.tools lhs.vars
+# @importFrom formula.tools rhs.vars
+# @importFrom spatstat weighted.median
+# @importFrom stats aggregate
+# @importFrom stats median
+# median.nonprobsvy <- function(x, nonprob) {
+#   variables <- lhs.vars(x)
+#   groups <- rhs.vars(x)
+#
+#   if (is.null(variables)) {
+#     data <- nonprob$data[which(nonprob$R == 1), groups, drop = FALSE]
+#     median_value <- tapply(nonprob$weights[which(nonprob$R == 1)], data, median)
+#   } else {
+#
+#     data <- nonprob$data[which(nonprob$R == 1), c(variables, groups)]
+#     weights <- nonprob$weights[which(nonprob$R == 1)]
+#
+#     median_value <- aggregate(data[, variables], by = data[, groups, drop = FALSE],
+#                               FUN = function(y, w) weighted.median(y, w = w[1:length(y)]),
+#                               w = weights)
+#   }
+#   median_value
+# }
+# @title Quantile values of covariates in subgroups
+#
+# @param x - formula
+# @param nonprob - object of nonprobsvy class
+# @param probs - probability
+#
+# @method quantile nonprobsvy
+# @return A vector of estimated quantiles if the given covariates in subgroups
+# @importFrom formula.tools lhs.vars
+# @importFrom formula.tools rhs.vars
+# @importFrom spatstat weighted.quantile
+# @importFrom stats aggregate
+# @importFrom stats quantile
+# quantile.nonprobsvy <- function(x, nonprob, probs = 0.5) {
+#   variables <- lhs.vars(x)
+#   groups <- rhs.vars(x)
+#
+#   if (is.null(variables)) {
+#     data <- nonprob$data[which(nonprob$R == 1), groups, drop = FALSE]
+#     percentile_value <- tapply(nonprob$weights[which(nonprob$R == 1)], data, function(y) quantile(y, probs = probs))
+#   } else {
+#
+#     data <- nonprob$data[which(nonprob$R == 1), c(variables, groups)]
+#     weights <- nonprob$weights[which(nonprob$R == 1)]
+#
+#     percentile_value <- aggregate(data[, variables], by = data[, groups, drop = FALSE],
+#                                   FUN = function(y, w) weighted.quantile(y, w = w[1:length(y)], probs = probs),
+#                                   w = weights)
+#   }
+#   percentile_value
+# }
diff --git a/inst/tinytest/test-2-ipw-totals.R b/inst/tinytest/test-2-ipw-totals.R
@@ -1,4 +1,4 @@
-if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) {
+# if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) {
 
   library(sampling)
   library(survey)
@@ -505,7 +505,7 @@ if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")
   #### variable selection  ------------------------------------------------------------------
   ##### one target variable  ----------------------------------------------------------------
 
-  if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) {
+  # if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) {
 
     ## y_11
     expect_silent(
@@ -625,7 +625,7 @@ if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")
     #                             verbose = T)
     # )
 
-  }
+  # }
 
 
   # check probit ----------------------------------------------------------------------------
@@ -811,7 +811,7 @@ if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")
                               row.names = c("Y_11", "Y_12", "Y_21", "Y_22")))
 
 
-  if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) {
+  # if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) {
     #### variable selection  ------------------------------------------------------------------
     ##### one target variable  ----------------------------------------------------------------
 
@@ -933,7 +933,7 @@ if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")
     #                             verbose = T)
     # )
 
-  }
+  # }
 
   ## non-linear case ------------------------------------------------------------------------
   #### correctly specified variables --------------------------------------------------------
@@ -1119,7 +1119,7 @@ if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")
                               row.names = c("Y_11", "Y_12", "Y_21", "Y_22")))
 
 
-  if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) {
+  # if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) {
     #### variable selection  ------------------------------------------------------------------
     ##### one target variable  ----------------------------------------------------------------
 
@@ -1241,7 +1241,7 @@ if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")
     #                             verbose = T)
     # )
 
-  }
+  # }
 
   # check cloglog -----------------------------------------------------
   ## linear case ----------------------------------------------------------------------------
@@ -1752,4 +1752,4 @@ if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")
     # # expect_true(y22_corr_scad$confidence_interval$lower_bound < mean(Y_22) &
     # #               y22_corr_scad$confidence_interval$upper_bound > mean(Y_22)) ## conf int
     # expect_true(NROW(y22_corr_scad$selection$coefficients) == 2)
-}
+# }