From 3680870137d3eedad064b64bc53e416ed277f74c Mon Sep 17 00:00:00 2001 From: LukaszChrostowski Date: Wed, 3 Apr 2024 18:05:14 +0100 Subject: [PATCH 01/24] add version working properly on M1 processors --- src/Makevars | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Makevars b/src/Makevars index f09302a..b6de604 100644 --- a/src/Makevars +++ b/src/Makevars @@ -15,5 +15,5 @@ ## set the appropriate value, possibly CXX17. # CXX_STD = CXX11 -PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) -PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) +# PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) +# PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) From 0bf7ac8715a77e6ae58c727623eb54779d0a910d Mon Sep 17 00:00:00 2001 From: LukaszChrostowski Date: Thu, 23 May 2024 10:48:15 +0100 Subject: [PATCH 02/24] fix issue #48 --- R/data_manip.R | 2 +- README.Rmd | 3 +++ README.md | 4 ++++ src/Makevars | 4 ++-- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/R/data_manip.R b/R/data_manip.R index e652330..b165f88 100644 --- a/R/data_manip.R +++ b/R/data_manip.R @@ -25,7 +25,7 @@ model_frame <- function(formula, data, weights = NULL, svydesign = NULL, pop_tot } else { design_to_frame <- svydesign$variables names_rand <- all.vars(formula[-2]) - model_Frame_rand <- design_to_frame[, names_rand] + model_Frame_rand <- design_to_frame[, names_rand, drop = FALSE] nons_names_rand <- attr(attr(model.frame(formula[-2], design_to_frame), "terms"), "term.labels") diff --git a/README.Rmd b/README.Rmd index cc4e382..5ba683a 100644 --- a/README.Rmd +++ b/README.Rmd @@ -29,7 +29,10 @@ coverage](https://codecov.io/gh/ncn-foreigners/nonprobsvy/branch/main/graph/badg [![CRAN status](https://www.r-pkg.org/badges/version/nonprobsvy)](https://CRAN.R-project.org/package=nonprobsvy) [![CRAN +downloads](https://cranlogs.r-pkg.org/badges/grand-total/nonprobsvy)](https://cran.r-project.org/package=nonprobsvy) +[![CRAN downloads](https://cranlogs.r-pkg.org/badges/nonprobsvy)](https://cran.r-project.org/package=nonprobsvy) +[![Mentioned in Awesome Official Statistics ](https://awesome.re/mentioned-badge.svg)](http://www.awesomeofficialstatistics.org) diff --git a/README.md b/README.md index d442f2d..5a46742 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,11 @@ coverage](https://codecov.io/gh/ncn-foreigners/nonprobsvy/branch/main/graph/badg [![CRAN status](https://www.r-pkg.org/badges/version/nonprobsvy)](https://CRAN.R-project.org/package=nonprobsvy) [![CRAN +downloads](https://cranlogs.r-pkg.org/badges/grand-total/nonprobsvy)](https://cran.r-project.org/package=nonprobsvy) +[![CRAN downloads](https://cranlogs.r-pkg.org/badges/nonprobsvy)](https://cran.r-project.org/package=nonprobsvy) +[![Mentioned in Awesome Official +Statistics](https://awesome.re/mentioned-badge.svg)](http://www.awesomeofficialstatistics.org) diff --git a/src/Makevars b/src/Makevars index b6de604..f09302a 100644 --- a/src/Makevars +++ b/src/Makevars @@ -15,5 +15,5 @@ ## set the appropriate value, possibly CXX17. # CXX_STD = CXX11 -# PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) -# PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) +PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) +PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) From 311fb0d80033acfff679291003eaef799d615f03 Mon Sep 17 00:00:00 2001 From: LukaszChrostowski Date: Wed, 29 May 2024 17:29:51 +0100 Subject: [PATCH 03/24] close #49 --- .Rapp.history | 0 R/bias_correction_ipw.R | 28 ++++++++++++++++++++++------ R/control_selection.R | 15 +++++++++++++-- R/gee_ipw.R | 6 ++++++ R/nonprobDR.R | 11 ++++++++++- R/nonprobIPW.R | 6 ++++++ R/theta_funcs.R | 12 ++++++++---- man/controlSel.Rd | 11 ++++++++++- 8 files changed, 75 insertions(+), 14 deletions(-) create mode 100644 .Rapp.history diff --git a/.Rapp.history b/.Rapp.history new file mode 100644 index 0000000..e69de29 diff --git a/R/bias_correction_ipw.R b/R/bias_correction_ipw.R index 5905422..919530f 100644 --- a/R/bias_correction_ipw.R +++ b/R/bias_correction_ipw.R @@ -1,5 +1,20 @@ # bias correction -mm <- function(X, y, weights, weights_rand, R, n_nons, n_rand, method_selection, family, start_selection, start_outcome, boot = FALSE) { +mm <- function(X, + y, + weights, + weights_rand, + R, + n_nons, + n_rand, + method_selection, + family, + start_selection, + start_outcome, + maxit, + nleqslv_method, + nleqslv_global, + nleqslv_xscalm, + boot = FALSE) { method_selection_function <- paste(method_selection, "_model_nonprobsvy", sep = "") method <- get_method(method_selection_function) inv_link <- method$make_link_inv @@ -63,13 +78,14 @@ mm <- function(X, y, weights, weights_rand, R, n_nons, n_rand, method_selection, # par_sel <- multiroot$par ######### NLESQLV multiroot <- nleqslv::nleqslv( - x = par0, # TODO add user-specified parameters to control functions + x = par0, fn = u_theta_beta_dr, - method = "Newton", # TODO consider the method Broyden - global = "dbldog", # c("dbldog", "pwldog", cline", "qline", "gline", "hook", "none") - xscalm = "auto", # c("fixed","auto") + method = nleqslv_method, + global = nleqslv_global, + xscalm = nleqslv_xscalm, jacobian = TRUE, - control = list(scalex = rep(1, length(par0))), # TODO algorithm did not converge in maxit iterations for cloglog + control = list(scalex = rep(1, length(par0)), + maxit = maxit), # TODO algorithm did not converge in maxit iterations for cloglog R = R, X = X, y = y, diff --git a/R/control_selection.R b/R/control_selection.R index 6441396..a5cfd19 100644 --- a/R/control_selection.R +++ b/R/control_selection.R @@ -36,6 +36,9 @@ #' \item if \code{naive} then start consists of a vector which has the value of an estimated parameter for one-dimensional data (on intercept) and 0 for the rest. #' \item if \code{zero} then start is a vector of zeros. #' } +#' @param nleqslv_method The method that will be passed to [nleqslv::nleqslv()] function. +#' @param nleqslv_global The global strategy that will be passed to [nleqslv::nleqslv()] function. +#' @param nleqslv_xscalm The type of x scaling that will be passed to [nleqslv::nleqslv()] function. #' #' @return List with selected parameters. #' @@ -64,7 +67,12 @@ controlSel <- function(method = "glm.fit", # perhaps another control function fo nlambda = 50, nfolds = 10, print_level = 0, - start_type = c("glm", "naive", "zero")) { + start_type = c("glm", "naive", "zero"), + nleqslv_method = c("Broyden", "Newton"), + nleqslv_global = c("dbldog", "pwldog", + "cline", "qline", "gline", "hook", "none"), + nleqslv_xscalm = c("fixed","auto") + ) { list( epsilon = epsilon, maxit = maxit, @@ -84,6 +92,9 @@ controlSel <- function(method = "glm.fit", # perhaps another control function fo nfolds = nfolds, lambda = lambda, print_level = print_level, - start_type = if (missing(start_type)) "naive" else start_type + start_type = if (missing(start_type)) "naive" else start_type, + nleqslv_method = if (missing(nleqslv_method)) "Newton" else nleqslv_method, + nleqslv_global = if (missing(nleqslv_global)) "dbldog" else nleqslv_global, + nleqslv_xscalm = if (missing(nleqslv_xscalm)) "auto" else nleqslv_xscalm ) } diff --git a/R/gee_ipw.R b/R/gee_ipw.R index 8dde5ab..707f288 100644 --- a/R/gee_ipw.R +++ b/R/gee_ipw.R @@ -111,6 +111,9 @@ gee <- function(...) { h = h, method_selection = method_selection, maxit = maxit, + nleqslv_method = control_selection$nleqslv_method, + nleqslv_global = control_selection$nleqslv_global, + nleqslv_xscalm = control_selection$nleqslv_xscalm, start = 0 )$theta_h) start <- c(start_h, rep(0, ncol(X) - 1)) @@ -127,6 +130,9 @@ gee <- function(...) { h = h, method_selection = method_selection, maxit = maxit, + nleqslv_method = control_selection$nleqslv_method, + nleqslv_global = control_selection$nleqslv_method, + nleqslv_xscalm = control_selection$nleqslv_method, start = start ) theta_hat <- h_object$theta_h diff --git a/R/nonprobDR.R b/R/nonprobDR.R index 34f0d30..b1e072e 100644 --- a/R/nonprobDR.R +++ b/R/nonprobDR.R @@ -311,7 +311,10 @@ nonprobDR <- function(selection, method_selection = method_selection, family = family_nonprobsvy, start_selection = start_selection, - start_outcome = start_outcome + start_outcome = start_outcome, + nleqslv_method = control_selection$nleqslv_method, + nleqslv_global = control_selection$nleqslv_global, + nleqslv_xscalm = control_selection$nleqslv_xscalm, ) selection_model <- estimation_model$selection @@ -557,6 +560,9 @@ nonprobDR <- function(selection, method_selection = method_selection, start = 0, maxit = maxit, + nleqslv_method = control_selection$nleqslv_method, + nleqslv_global = control_selection$nleqslv_global, + nleqslv_xscalm = control_selection$nleqslv_xscalm, pop_totals = SelectionModel$pop_totals[1] )$theta_h) start_selection <- c(start_h, rep(0, ncol(X) - 1)) @@ -572,6 +578,9 @@ nonprobDR <- function(selection, method_selection = method_selection, start = start_selection, maxit = maxit, + nleqslv_method = control_selection$nleqslv_method, + nleqslv_global = control_selection$nleqslv_global, + nleqslv_xscalm = control_selection$nleqslv_xscalm, pop_totals = SelectionModel$pop_totals ) theta_hat <- h_object$theta_h diff --git a/R/nonprobIPW.R b/R/nonprobIPW.R index 71c65ed..cb70844 100644 --- a/R/nonprobIPW.R +++ b/R/nonprobIPW.R @@ -294,6 +294,9 @@ nonprobIPW <- function(selection, method_selection = method_selection, start = 0, maxit = maxit, + nleqslv_method = control_selection$nleqslv_method, + nleqslv_global = control_selection$nleqslv_global, + nleqslv_xscalm = control_selection$nleqslv_xscalm, pop_totals = pop_totals[1] )$theta_h) start_selection <- c(start_h, rep(0, ncol(X) - 1)) @@ -309,6 +312,9 @@ nonprobIPW <- function(selection, method_selection = method_selection, start = start_selection, maxit = maxit, + nleqslv_method = control_selection$nleqslv_method, + nleqslv_global = control_selection$nleqslv_global, + nleqslv_xscalm = control_selection$nleqslv_xscalm, pop_totals = pop_totals ) # theta_h estimation for h_x == 2 is equal to the main method for theta estimation diff --git a/R/theta_funcs.R b/R/theta_funcs.R index 40e9fc1..424b0e3 100644 --- a/R/theta_funcs.R +++ b/R/theta_funcs.R @@ -87,9 +87,12 @@ theta_h_estimation <- function(R, h, method_selection, maxit, + nleqslv_method, + nleqslv_global, + nleqslv_xscalm, start = NULL, pop_totals = NULL, - pop_means = NULL) { # TODO with BERENZ recommendation + pop_means = NULL) { p <- ncol(X) # if (is.null(pop_totals) & is.null(pop_means)) { @@ -157,7 +160,8 @@ theta_h_estimation <- function(R, method = "Newton", # TODO consider the methods global = "dbldog", # qline", xscalm = "auto", - jacobian = TRUE + jacobian = TRUE, + control = list(maxit = maxit) ) } else { root <- nleqslv::nleqslv( @@ -167,8 +171,8 @@ theta_h_estimation <- function(R, global = "dbldog", # qline", xscalm = "auto", jacobian = TRUE, - jac = u_theta_der - # control = list(sigma = 0.1, trace = 1) + jac = u_theta_der, + control = list(maxit = maxit) ) } theta_root <- root$x diff --git a/man/controlSel.Rd b/man/controlSel.Rd index 966d21f..f151a26 100644 --- a/man/controlSel.Rd +++ b/man/controlSel.Rd @@ -24,7 +24,10 @@ controlSel( nlambda = 50, nfolds = 10, print_level = 0, - start_type = c("glm", "naive", "zero") + start_type = c("glm", "naive", "zero"), + nleqslv_method = c("Broyden", "Newton"), + nleqslv_global = c("dbldog", "pwldog", "cline", "qline", "gline", "hook", "none"), + nleqslv_xscalm = c("fixed", "auto") ) } \arguments{ @@ -81,6 +84,12 @@ controlSel( \item if \code{zero} then start is a vector of zeros. } }} + +\item{nleqslv_method}{The method that will be passed to \code{\link[nleqslv:nleqslv]{nleqslv::nleqslv()}} function.} + +\item{nleqslv_global}{The global strategy that will be passed to \code{\link[nleqslv:nleqslv]{nleqslv::nleqslv()}} function.} + +\item{nleqslv_xscalm}{The type of x scaling that will be passed to \code{\link[nleqslv:nleqslv]{nleqslv::nleqslv()}} function.} } \value{ List with selected parameters. From f0b25a56a8a6de7e1a40d1f9ccd44d5f913edeea Mon Sep 17 00:00:00 2001 From: LukaszChrostowski Date: Thu, 30 May 2024 11:01:12 +0100 Subject: [PATCH 04/24] close #50 --- NAMESPACE | 2 ++ R/main_function_documentation.R | 2 ++ R/nonprobDR.R | 12 +++++++++++- R/nonprobIPW.R | 9 +++++++++ R/nonprobMI.R | 10 ++++++++++ R/prints.R | 5 +++++ R/summary.R | 2 ++ man/nonprob.Rd | 2 ++ 8 files changed, 43 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 6194a87..6c7a33e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -71,6 +71,7 @@ importFrom(stats,pt) importFrom(stats,qnorm) importFrom(stats,rbinom) importFrom(stats,rchisq) +importFrom(stats,reformulate) importFrom(stats,residuals) importFrom(stats,rexp) importFrom(stats,rnorm) @@ -86,6 +87,7 @@ importFrom(stats,weighted.mean) importFrom(survey,as.svrepdesign) importFrom(survey,svymean) importFrom(survey,svyrecvar) +importFrom(survey,svytotal) importFrom(utils,setTxtProgressBar) importFrom(utils,txtProgressBar) useDynLib(nonprobsvy) diff --git a/R/main_function_documentation.R b/R/main_function_documentation.R index 8bb6f6f..bf31968 100644 --- a/R/main_function_documentation.R +++ b/R/main_function_documentation.R @@ -191,6 +191,7 @@ NULL #' \item{\code{nonprob_size} -- size of non-probability sample} #' \item{\code{prob_size} -- size of probability sample} #' \item{\code{pop_size} -- estimated population size derived from estimated weights (non-probability sample) or known design weights (probability sample)} +#' \item{\code{pop_totals} -- the total values of the auxiliary variables derived from a probability sample or vector of total/mean values.} #' \item{\code{outcome} -- list containing information about the fitting of the mass imputation model, in the case of regression model the object containing the list returned by #' [stats::glm()], in the case of the nearest neighbour imputation the object containing list returned by [RANN::nn2()]. If `bias_correction` in [controlInf()] is set to `TRUE`, the estimation is based on #' the joint estimating equations for the `selection` and `outcome` model and therefore, the list is different from the one returned by the [stats::glm()] function and contains elements such as @@ -223,6 +224,7 @@ NULL #' \item{\code{aic} -- A version of Akaike's An Information Criterion, minus twice the maximized log-likelihood plus twice the number of parameters.} #' \item{\code{weights} -- vector of estimated weights for non-probability sample.} #' \item{\code{prior.weights} -- the weights initially supplied, a vector of 1s if none were.} +#' \item{\code{est_totals} -- the estimated total values of auxiliary variables derived from a non-probability sample.} #' \item{\code{formula} -- the formula supplied.} #' \item{\code{df_residual} -- the residual degrees of freedom.} #' \item{\code{log_likelihood} -- value of log-likelihood function if `mle` method, in the other case `NA`.} diff --git a/R/nonprobDR.R b/R/nonprobDR.R index b1e072e..eb21968 100644 --- a/R/nonprobDR.R +++ b/R/nonprobDR.R @@ -6,6 +6,8 @@ #' @importFrom stats qnorm #' @importFrom stats binomial #' @importFrom stats terms +#' @importFrom stats reformulate +#' @importFrom survey svytotal #' @importFrom ncvreg cv.ncvreg #' @importFrom MASS ginv #' @import Rcpp @@ -81,6 +83,8 @@ nonprobDR <- function(selection, data = data, svydesign = svydesign ) + prob_totals <- svytotal(selection, svydesign) + prob_pop_totals <- c(sum(weights(svydesign)), prob_totals) # if (all(svydesign$prob) == 1) { # TODO # if (!is.null(pop_size)) { # ps_rand <- rep(sum(svydesign$prob)/pop_size, length(svydesign$prob)) @@ -497,6 +501,7 @@ nonprobDR <- function(selection, } } else if ((!is.null(pop_totals) || !is.null(pop_means)) && is.null(svydesign)) { # model for outcome formula + # TODO add pop_means ? to check OutcomeModel <- model_frame(formula = outcome, data = data, pop_totals = pop_totals) X_nons <- OutcomeModel$X_nons X_rand <- OutcomeModel$X_rand @@ -540,6 +545,7 @@ nonprobDR <- function(selection, X_nons <- OutcomeModel$X_nons <- SelectionModel$X_nons <- X[loc_nons, ] SelectionModel$pop_totals <- c(SelectionModel$pop_totals[1], SelectionModel$pop_totals[idx + 1]) } + prob_pop_totals <- SelectionModel$pop_totals if (is.null(start_selection)) { if (control_selection$start_type == "glm") { @@ -660,7 +666,7 @@ nonprobDR <- function(selection, ys[[k]] <- as.numeric(y_nons) if (se) { - if (var_method == "analytic") { # TODO add estimator variance with model containg pop_totals to internal_varDR function + if (var_method == "analytic") { # TODO add estimator variance with model containing pop_totals to internal_varDR function var_obj <- internal_varDR( OutcomeModel = OutcomeModel, # consider add selection argument instead of separate arguments for selection objects SelectionModel = SelectionModel, @@ -809,6 +815,8 @@ nonprobDR <- function(selection, if (is.null(pop_size)) pop_size <- N_nons names(pop_size) <- "pop_size" names(ys) <- all.vars(outcome_init[[2]]) + est_totals <- colSums(SelectionModel$X_nons*as.vector(weights_nons)) + names(prob_pop_totals) <- colnames(SelectionModel$X_nons) boot_sample <- if (control_inference$var_method == "bootstrap" & control_inference$keep_boot) { stat @@ -829,6 +837,7 @@ nonprobDR <- function(selection, aic = selection_model$aic, weights = as.vector(weights_nons), prior.weights = weights, + est_totals = est_totals, formula = selection, df_residual = selection_model$df_residual, log_likelihood = selection_model$log_likelihood, @@ -859,6 +868,7 @@ nonprobDR <- function(selection, nonprob_size = n_nons, prob_size = n_rand, pop_size = pop_size, + pop_totals = prob_pop_totals, outcome = OutcomeList, selection = SelectionList, boot_sample = boot_sample diff --git a/R/nonprobIPW.R b/R/nonprobIPW.R index cb70844..94b831b 100644 --- a/R/nonprobIPW.R +++ b/R/nonprobIPW.R @@ -5,6 +5,8 @@ #' @importFrom stats qnorm #' @importFrom stats as.formula #' @importFrom stats terms +#' @importFrom stats reformulate +#' @importFrom survey svytotal #' @importFrom MASS ginv #' @import Rcpp #' @importFrom Rcpp evalCpp @@ -66,6 +68,8 @@ nonprobIPW <- function(selection, data = data, svydesign = svydesign ) + prob_totals <- svytotal(selection, svydesign) + prob_pop_totals <- c(sum(weights(svydesign)), prob_totals) # y_nons <- model$y_nons X_nons <- model$X_nons @@ -275,6 +279,7 @@ nonprobIPW <- function(selection, pop_totals <- model$pop_totals[idx] } + prob_pop_totals <- pop_totals if (is.null(start_selection)) { if (control_selection$start_type == "glm") { start_selection <- start_fit( @@ -535,6 +540,8 @@ nonprobIPW <- function(selection, if (is.null(pop_size)) pop_size <- N # estimated pop_size names(pop_size) <- "pop_size" names(ys) <- all.vars(outcome_init[[2]]) + est_totals <- colSums(X_nons*as.vector(weights_nons)) + names(prob_pop_totals) <- colnames(X_nons) boot_sample <- if (control_inference$var_method == "bootstrap" & control_inference$keep_boot) { boot_obj$stat @@ -555,6 +562,7 @@ nonprobIPW <- function(selection, aic = selection_model$aic, weights = as.vector(weights_nons), prior.weights = weights, + est_totals = est_totals, formula = selection, df_residual = selection_model$df_residual, log_likelihood = selection_model$log_likelihood, @@ -581,6 +589,7 @@ nonprobIPW <- function(selection, nonprob_size = n_nons, prob_size = n_rand, pop_size = pop_size, + pop_totals = prob_pop_totals, selection = SelectionList, boot_sample = boot_sample ), diff --git a/R/nonprobMI.R b/R/nonprobMI.R index fecdc70..d845fcb 100644 --- a/R/nonprobMI.R +++ b/R/nonprobMI.R @@ -8,6 +8,8 @@ #' @importFrom RANN nn2 #' @importFrom ncvreg cv.ncvreg #' @importFrom stats terms +#' @importFrom stats reformulate +#' @importFrom survey svytotal #' @import Rcpp #' @importFrom Rcpp evalCpp @@ -52,6 +54,11 @@ nonprobMI <- function(outcome, if (control_inference$var_method == "bootstrap") { stat <- matrix(nrow = control_inference$num_boot, ncol = outcomes$l) } + terms_obj <- terms(outcome) + if (is.null(pop_totals) && !is.null(svydesign)) { + prob_totals <- svytotal(reformulate(attr(terms_obj, "term.labels")), svydesign) + prob_pop_totals <- c(sum(weights(svydesign)), prob_totals) + } for (k in 1:outcomes$l) { if (control_outcome$pmm_k_choice == "min_var" & method_outcome == "pmm") { @@ -292,6 +299,7 @@ nonprobMI <- function(outcome, X <- X_nons pop_totals <- pop_totals[beta_selected + 1] } + prob_pop_totals <- pop_totals ####################### method_outcome_nonprobsvy <- paste(method_outcome, "_nonprobsvy", sep = "") @@ -452,6 +460,7 @@ nonprobMI <- function(outcome, names(OutcomeList) <- outcomes$f names(pop_size) <- "pop_size" names(ys) <- all.vars(outcome_init[[2]]) + names(prob_pop_totals) <- colnames(X_nons) boot_sample <- if (control_inference$var_method == "bootstrap" & control_inference$keep_boot) { list(stat = stat, comp2 = boot_obj$comp2) @@ -474,6 +483,7 @@ nonprobMI <- function(outcome, nonprob_size = n_nons, prob_size = n_rand, pop_size = pop_size, + pop_totals = prob_pop_totals, outcome = OutcomeList, boot_sample = boot_sample ), diff --git a/R/prints.R b/R/prints.R index 3fdbba5..f0bffe9 100644 --- a/R/prints.R +++ b/R/prints.R @@ -115,6 +115,11 @@ print.summary_nonprobsvy <- function(x, cat("-------------------------\n\n") + cat("Estimation Quality:\n") + print(x$est_totals - x$totals) + + cat("-------------------------\n\n") + cat("Residuals:\n") print(summary(x$residuals$selection)) diff --git a/R/summary.R b/R/summary.R index e14886a..05b37ef 100644 --- a/R/summary.R +++ b/R/summary.R @@ -112,6 +112,7 @@ summary.nonprobsvy <- function(object, ), sample_size = nobs(object, ...), population_size = pop.size(object, ...), + totals = object$pop_totals, test = test, control = object$control, model = switch(class(object)[2], @@ -125,6 +126,7 @@ summary.nonprobsvy <- function(object, likelihood = ifelse(class(object)[2] %in% c("nonprobsvy_dr", "nonprobsvy_ipw"), object$selection$log_likelihood, "no value for the selected method"), df_residual = ifelse(class(object)[2] %in% c("nonprobsvy_dr", "nonprobsvy_ipw"), object$selection$df_residual, "no value for the selected method"), weights = summary(object$weights), + est_totals = ifelse(class(object)[2] %in% c("nonprobsvy_dr", "nonprobsvy_ipw"), object$selection$est_totals, "no value for the selected method"), coef = cf, std_err = se, w_val = wald_test_stat, diff --git a/man/nonprob.Rd b/man/nonprob.Rd index 8d0fa7e..87972e1 100644 --- a/man/nonprob.Rd +++ b/man/nonprob.Rd @@ -100,6 +100,7 @@ with type \code{list} containing:\cr \item{\code{nonprob_size} -- size of non-probability sample} \item{\code{prob_size} -- size of probability sample} \item{\code{pop_size} -- estimated population size derived from estimated weights (non-probability sample) or known design weights (probability sample)} +\item{\code{pop_totals} -- the total values of the auxiliary variables derived from a probability sample or vector of total/mean values.} \item{\code{outcome} -- list containing information about the fitting of the mass imputation model, in the case of regression model the object containing the list returned by \code{\link[stats:glm]{stats::glm()}}, in the case of the nearest neighbour imputation the object containing list returned by \code{\link[RANN:nn2]{RANN::nn2()}}. If \code{bias_correction} in \code{\link[=controlInf]{controlInf()}} is set to \code{TRUE}, the estimation is based on the joint estimating equations for the \code{selection} and \code{outcome} model and therefore, the list is different from the one returned by the \code{\link[stats:glm]{stats::glm()}} function and contains elements such as @@ -132,6 +133,7 @@ In addition, if the variable selection model for the outcome variable is fitting \item{\code{aic} -- A version of Akaike's An Information Criterion, minus twice the maximized log-likelihood plus twice the number of parameters.} \item{\code{weights} -- vector of estimated weights for non-probability sample.} \item{\code{prior.weights} -- the weights initially supplied, a vector of 1s if none were.} +\item{\code{est_totals} -- the estimated total values of auxiliary variables derived from a non-probability sample.} \item{\code{formula} -- the formula supplied.} \item{\code{df_residual} -- the residual degrees of freedom.} \item{\code{log_likelihood} -- value of log-likelihood function if \code{mle} method, in the other case \code{NA}.} From f0eb12b75e4fa4f0382096e05ab47fa380a4b15f Mon Sep 17 00:00:00 2001 From: LukaszChrostowski Date: Thu, 30 May 2024 13:57:21 +0100 Subject: [PATCH 05/24] small change regarding #50 --- R/prints.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/prints.R b/R/prints.R index f0bffe9..470f609 100644 --- a/R/prints.R +++ b/R/prints.R @@ -115,7 +115,7 @@ print.summary_nonprobsvy <- function(x, cat("-------------------------\n\n") - cat("Estimation Quality:\n") + cat("Estimation balance:\n") print(x$est_totals - x$totals) cat("-------------------------\n\n") From 9c41bd18e616a44b91ab4162d5587816ad1f8f47 Mon Sep 17 00:00:00 2001 From: LukaszChrostowski Date: Thu, 30 May 2024 17:30:42 +0100 Subject: [PATCH 06/24] fix for calibrated estimation regarding #50 --- R/summary.R | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/R/summary.R b/R/summary.R index 05b37ef..f80ea35 100644 --- a/R/summary.R +++ b/R/summary.R @@ -102,6 +102,11 @@ summary.nonprobsvy <- function(object, } else { se_mean <- NULL } + if (class(object)[2] %in% c("nonprobsvy_dr", "nonprobsvy_ipw")) { + est_totals <- object$selection$est_totals + } else { + est_totals <- "no value for the selected method" + } res <- structure( list( call = object$call, @@ -126,7 +131,7 @@ summary.nonprobsvy <- function(object, likelihood = ifelse(class(object)[2] %in% c("nonprobsvy_dr", "nonprobsvy_ipw"), object$selection$log_likelihood, "no value for the selected method"), df_residual = ifelse(class(object)[2] %in% c("nonprobsvy_dr", "nonprobsvy_ipw"), object$selection$df_residual, "no value for the selected method"), weights = summary(object$weights), - est_totals = ifelse(class(object)[2] %in% c("nonprobsvy_dr", "nonprobsvy_ipw"), object$selection$est_totals, "no value for the selected method"), + est_totals = est_totals, coef = cf, std_err = se, w_val = wald_test_stat, From 2fa354cbf0f3ababa092f7c5226f2a170ddaa83f Mon Sep 17 00:00:00 2001 From: Piotr Chlebicki Date: Fri, 31 May 2024 10:54:26 +0200 Subject: [PATCH 07/24] Added ni exact se --- R/nn.R | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++ R/pmm.R | 4 ++-- R/varianceMI.R | 16 +++++++++++++++ 3 files changed, 72 insertions(+), 2 deletions(-) diff --git a/R/nn.R b/R/nn.R index 067a9bd..2e827ba 100644 --- a/R/nn.R +++ b/R/nn.R @@ -91,3 +91,57 @@ nonprobMI_nn <- function(data, ) model_nn } + + +nn_exact <- function(pi_ij, + weights_rand, + n_nons, + y, + X_nons, + X_rand, + k, + #control, + N) { + # if (isTRUE("ppsmat" %in% class(pi_ij))) { + # pi_ij <- pi_ij$pij + # } + # # if (!is.null(svydesign$dcheck[[1]]$dcheck)) { + # # pi_ij <- svydesign$dcheck[[1]]$dcheck + # # } + # if (is.null(pi_ij)) { + # pi_ij <- outer(1 / weights_rand, 1 / weights_rand) * ( + # 1 - outer(1 - 1 / weights_rand, 1 - 1 / weights_rand) / + # sum(1 - 1 / weights_rand)) + # } + # # if (!is.matrix(pi_ij)) { + # # + # # } + # add variable for loop size to control + loop_size <- 50 + + dd <- vector(mode = "numeric", length = loop_size) + for (jj in 1:loop_size) { + + boot_samp <- sample(1:n_nons, size = n_nons, replace = TRUE) + # boot_samp <- sample(1:n_rand, size = n_rand, replace = TRUE) + y_nons_b <- y[boot_samp] + x_nons_b <- X_nons[boot_samp, , drop = FALSE] + + YY <- nonprobMI_nn( + data = x_nons_b, + query = X_rand, + k = k, + searchtype = "standard", + treetype = "kd" + #TODO:: add control options + #treetype = control$treetype, + #searchtype = control$searchtype + ) + + dd[jj] <- weighted.mean( + apply(YY$nn.idx, 1, FUN = \(x) mean(y_nons_b[x])), + weights_rand + ) + } + var(dd) +} diff --git a/R/pmm.R b/R/pmm.R index 219febb..8ac4924 100644 --- a/R/pmm.R +++ b/R/pmm.R @@ -172,8 +172,8 @@ pmm_exact <- function(pi_ij, n_nons, y, pmm_reg_engine, - stats, - glm, + #stats, #why is this here? + #glm, #why is this here? model_obj, svydesign, predictive_match, diff --git a/R/varianceMI.R b/R/varianceMI.R index 8716847..b4cd317 100644 --- a/R/varianceMI.R +++ b/R/varianceMI.R @@ -37,6 +37,22 @@ internal_varMI <- function(svydesign, sigma_hat <- mean((y - y_pred)^2) # family_nonprobsvy$variance(mu = y_pred, y = y) est_ps <- n_nons / N var_nonprob <- n_rand / N^2 * (1 - est_ps) / est_ps * sigma_hat + + if (pmm_exact_se) { + var_nonprob <- nn_exact( + pi_ij = pi_ij, + weights_rand = weights_rand, + n_nons = n_nons, + y = y, + X_nons = X_nons, + X_rand = X_rand, + k = k, + # TODO:: add control here + #control = control + N = N + ) + } + } else if (method == "glm") { # TODO add variance for count binary outcome variable control_outcome$method beta <- parameters[, 1] From 4d4ebbb1e8d1b4144e10e6551c71c652214a0a8b Mon Sep 17 00:00:00 2001 From: LukaszChrostowski Date: Fri, 31 May 2024 12:28:10 +0100 Subject: [PATCH 08/24] summary print edit --- R/prints.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/prints.R b/R/prints.R index 470f609..02d132d 100644 --- a/R/prints.R +++ b/R/prints.R @@ -115,7 +115,7 @@ print.summary_nonprobsvy <- function(x, cat("-------------------------\n\n") - cat("Estimation balance:\n") + cat("Covariate balance:\n") print(x$est_totals - x$totals) cat("-------------------------\n\n") From 342cc3b00d4009656946682d1ef83f2c8efdbf8a Mon Sep 17 00:00:00 2001 From: LukaszChrostowski Date: Wed, 5 Jun 2024 20:52:46 +0100 Subject: [PATCH 09/24] add formula and data to glm.fit object --- R/glm.R | 9 ++ README.Rmd | 290 +++++++--------------------------------------- README.md | 329 +++++++---------------------------------------------- 3 files changed, 90 insertions(+), 538 deletions(-) diff --git a/R/glm.R b/R/glm.R index 9cb4c08..f588957 100644 --- a/R/glm.R +++ b/R/glm.R @@ -68,6 +68,15 @@ glm_nonprobsvy <- function(outcome, y_rand_pred <- family_nonprobsvy$linkinv(eta) y_nons_pred <- model$fitted.values + # artificial formula and data created for pmm_extact_se + predictors <- colnames(X_nons)[-1] + outcome_name <- names(model_frame)[1] + formula_str <- paste(outcome_name, "~", paste(predictors, collapse = " + ")) + formula <- as.formula(formula_str) + model$formula <- formula + model$data <- as.data.frame(cbind(y_nons, X_nons[,-1, drop = FALSE])) + colnames(model$data) <- c(outcome_name, predictors) + model_out <- list( glm = model, glm_summary = model_summ diff --git a/README.Rmd b/README.Rmd index 5ba683a..13ff8ae 100644 --- a/README.Rmd +++ b/README.Rmd @@ -32,7 +32,8 @@ status](https://www.r-pkg.org/badges/version/nonprobsvy)](https://CRAN.R-project downloads](https://cranlogs.r-pkg.org/badges/grand-total/nonprobsvy)](https://cran.r-project.org/package=nonprobsvy) [![CRAN downloads](https://cranlogs.r-pkg.org/badges/nonprobsvy)](https://cran.r-project.org/package=nonprobsvy) -[![Mentioned in Awesome Official Statistics ](https://awesome.re/mentioned-badge.svg)](http://www.awesomeofficialstatistics.org) +[![Mentioned in Awesome Official +Statistics](https://awesome.re/mentioned-badge.svg)](http://www.awesomeofficialstatistics.org) @@ -72,20 +73,20 @@ Details on use of the package be found: ## Installation -You can install the recent version of `nonprobsvy` package from main branch -[Github](https://github.com/ncn-foreigners/nonprobsvy) with: +You can install the recent version of `nonprobsvy` package from main +branch [Github](https://github.com/ncn-foreigners/nonprobsvy) with: ```{r, eval=FALSE} remotes::install_github("ncn-foreigners/nonprobsvy") ``` -or install the stable version from [CRAN](https://CRAN.R-project.org/package=nonprobsvy) +or install the stable version from +[CRAN](https://CRAN.R-project.org/package=nonprobsvy) ```{r, eval=FALSE} install.packages("nonprobsvy") ``` - or development version from the `dev` branch ```{r, eval=FALSE} @@ -101,7 +102,7 @@ available for both sources while $Y$ and $\boldsymbol{d}$ (or $\boldsymbol{w}$) is present only in probability sample. | Sample | | Auxiliary variables $\boldsymbol{X}$ | Target variable $Y$ | Design ($\boldsymbol{d}$) or calibrated ($\boldsymbol{w}$) weights | -|---------------|--------------:|:-------------:|:-------------:|:-------------:| +|------------|-----------:|:----------:|:----------:|:---------------------:| | $S_A$ (non-probability) | 1 | $\checkmark$ | $\checkmark$ | ? | | | ... | $\checkmark$ | $\checkmark$ | ? | | | $n_A$ | $\checkmark$ | $\checkmark$ | ? | @@ -109,7 +110,6 @@ $\boldsymbol{w}$) is present only in probability sample. | | ... | $\checkmark$ | ? | $\checkmark$ | | | $n_A+n_B$ | $\checkmark$ | ? | $\checkmark$ | - ## Basic functionalities Suppose $Y$ is the target variable, $\boldsymbol{X}$ is a matrix of @@ -136,253 +136,41 @@ possible scenarios: ### When unit-level data is available for non-probability survey only - - - - - - - - - - - - - - - - - - -
Estimator Example code
-Mass imputation based on regression imputation - -```{r, eval = FALSE} -nonprob( - outcome = y ~ x1 + x2 + ... + xk, - data = nonprob, - pop_totals = c(`(Intercept)`= N, - x1 = tau_x1, - x2 = tau_x2, - ..., - xk = tau_xk), - method_outcome = "glm", - family_outcome = "gaussian" -) -``` -
-Inverse probability weighting - -```{r, eval = FALSE} -nonprob( - selection = ~ x1 + x2 + ... + xk, - target = ~ y, - data = nonprob, - pop_totals = c(`(Intercept)` = N, - x1 = tau_x1, - x2 = tau_x2, - ..., - xk = tau_xk), - method_selection = "logit" -) -``` -
-Inverse probability weighting with calibration constraint - -```{r, eval = FALSE} -nonprob( - selection = ~ x1 + x2 + ... + xk, - target = ~ y, - data = nonprob, - pop_totals = c(`(Intercept)`= N, - x1 = tau_x1, - x2 = tau_x2, - ..., - xk = tau_xk), - method_selection = "logit", - control_selection = controlSel(est_method_sel = "gee", h = 1) -) -``` -
-Doubly robust estimator - -```{r, eval = FALSE} -nonprob( - selection = ~ x1 + x2 + ... + xk, - outcome = y ~ x1 + x2 + …, + xk, - pop_totals = c(`(Intercept)` = N, - x1 = tau_x1, - x2 = tau_x2, - ..., - xk = tau_xk), - svydesign = prob, - method_outcome = "glm", - family_outcome = "gaussian" -) -``` -
+| Estimator | Example code | +|-----------------------------------|-----------------------------------| +| | | +| Mass imputation based on regression imputation | \`\`\`{r, eval = FALSE} nonprob( outcome = y \~ x1 + x2 + \... + xk, data = nonprob, pop_totals = c(\`(Intercept)\`= N, x1 = tau_x1, x2 = tau_x2, \..., xk = tau_xk), method_outcome = "glm", family_outcome = "gaussian" ) \`\`\` | +| | | +| Inverse probability weighting | \`\`\`{r, eval = FALSE} nonprob( selection = \~ x1 + x2 + \... + xk, target = \~ y, data = nonprob, pop_totals = c(\`(Intercept)\` = N, x1 = tau_x1, x2 = tau_x2, \..., xk = tau_xk), method_selection = "logit" ) \`\`\` | +| | | +| Inverse probability weighting with calibration constraint | \`\`\`{r, eval = FALSE} nonprob( selection = \~ x1 + x2 + \... + xk, target = \~ y, data = nonprob, pop_totals = c(\`(Intercept)\`= N, x1 = tau_x1, x2 = tau_x2, \..., xk = tau_xk), method_selection = "logit", control_selection = controlSel(est_method_sel = "gee", h = 1) ) \`\`\` | +| | | +| Doubly robust estimator | \`\`\`{r, eval = FALSE} nonprob( selection = \~ x1 + x2 + \... + xk, outcome = y \~ x1 + x2 + ..., + xk, pop_totals = c(\`(Intercept)\` = N, x1 = tau_x1, x2 = tau_x2, \..., xk = tau_xk), svydesign = prob, method_outcome = "glm", family_outcome = "gaussian" ) \`\`\` | +| | | ### When unit-level data are available for both surveys - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Estimator Example code
-Mass imputation based on regression imputation - -```{r, eval = FALSE} -nonprob( - outcome = y ~ x1 + x2 + ... + xk, - data = nonprob, - svydesign = prob, - method_outcome = "glm", - family_outcome = "gaussian" -) -``` -
-Mass imputation based on nearest neighbour imputation - -```{r, eval = FALSE} -nonprob( - outcome = y ~ x1 + x2 + ... + xk, - data = nonprob, - svydesign = prob, - method_outcome = "nn", - family_outcome = "gaussian", - control_outcome = controlOutcome(k = 2) -) -``` -
-Mass imputation based on predictive mean matching - -```{r, eval = FALSE} -nonprob( - outcome = y ~ x1 + x2 + ... + xk, - data = nonprob, - svydesign = prob, - method_outcome = "pmm", - family_outcome = "gaussian" -) -``` -
-Mass imputation based on regression imputation with variable selection (LASSO) - -```{r, eval = FALSE} -nonprob( - outcome = y ~ x1 + x2 + ... + xk, - data = nonprob, - svydesign = prob, - method_outcome = "pmm", - family_outcome = "gaussian", - control_outcome = controlOut(penalty = "lasso"), - control_inference = controlInf(vars_selection = TRUE) -) -``` -
-Inverse probability weighting - -```{r, eval = FALSE} -nonprob( - selection = ~ x1 + x2 + ... + xk, - target = ~ y, - data = nonprob, - svydesign = prob, - method_selection = "logit" -) -``` -
-Inverse probability weighting with calibration constraint - -```{r, eval = FALSE} -nonprob( - selection = ~ x1 + x2 + ... + xk, - target = ~ y, - data = nonprob, - svydesign = prob, - method_selection = "logit", - control_selection = controlSel(est_method_sel = "gee", h = 1) -) -``` -
-Inverse probability weighting with calibration constraint with variable selection (SCAD) - -```{r, eval = FALSE} -nonprob( - selection = ~ x1 + x2 + ... + xk, - target = ~ y, - data = nonprob, - svydesign = prob, - method_outcome = "pmm", - family_outcome = "gaussian", - control_inference = controlInf(vars_selection = TRUE) -) -``` -
-Doubly robust estimator - -```{r, eval = FALSE} -nonprob( - selection = ~ x1 + x2 + ... + xk, - outcome = y ~ x1 + x2 + ... + xk, - data = nonprob, - svydesign = prob, - method_outcome = "glm", - family_outcome = "gaussian" -) -``` -
-Doubly robust estimator with variable selection (SCAD) and bias minimization - -```{r, eval = FALSE} -nonprob( - selection = ~ x1 + x2 + ... + xk, - outcome = y ~ x1 + x2 + ... + xk, - data = nonprob, - svydesign = prob, - method_outcome = "glm", - family_outcome = "gaussian", - control_inference = controlInf( - vars_selection = TRUE, - bias_correction = TRUE - ) -) -``` -
+| Estimator | Example code | +|-----------------------------------|-----------------------------------| +| | | +| Mass imputation based on regression imputation | \`\`\`{r, eval = FALSE} nonprob( outcome = y \~ x1 + x2 + \... + xk, data = nonprob, svydesign = prob, method_outcome = "glm", family_outcome = "gaussian" ) \`\`\` | +| | | +| Mass imputation based on nearest neighbour imputation | \`\`\`{r, eval = FALSE} nonprob( outcome = y \~ x1 + x2 + \... + xk, data = nonprob, svydesign = prob, method_outcome = "nn", family_outcome = "gaussian", control_outcome = controlOut(k = 2) ) \`\`\` | +| | | +| Mass imputation based on predictive mean matching | \`\`\`{r, eval = FALSE} nonprob( outcome = y \~ x1 + x2 + \... + xk, data = nonprob, svydesign = prob, method_outcome = "pmm", family_outcome = "gaussian" ) \`\`\` | +| | | +| Mass imputation based on regression imputation with variable selection (LASSO) | \`\`\`{r, eval = FALSE} nonprob( outcome = y \~ x1 + x2 + \... + xk, data = nonprob, svydesign = prob, method_outcome = "pmm", family_outcome = "gaussian", control_outcome = controlOut(penalty = "lasso"), control_inference = controlInf(vars_selection = TRUE) ) \`\`\` | +| | | +| Inverse probability weighting | \`\`\`{r, eval = FALSE} nonprob( selection = \~ x1 + x2 + \... + xk, target = \~ y, data = nonprob, svydesign = prob, method_selection = "logit" ) \`\`\` | +| | | +| Inverse probability weighting with calibration constraint | \`\`\`{r, eval = FALSE} nonprob( selection = \~ x1 + x2 + \... + xk, target = \~ y, data = nonprob, svydesign = prob, method_selection = "logit", control_selection = controlSel(est_method_sel = "gee", h = 1) ) \`\`\` | +| | | +| Inverse probability weighting with calibration constraint with variable selection (SCAD) | \`\`\`{r, eval = FALSE} nonprob( selection = \~ x1 + x2 + \... + xk, target = \~ y, data = nonprob, svydesign = prob, method_outcome = "pmm", family_outcome = "gaussian", control_inference = controlInf(vars_selection = TRUE) ) \`\`\` | +| | | +| Doubly robust estimator | \`\`\`{r, eval = FALSE} nonprob( selection = \~ x1 + x2 + \... + xk, outcome = y \~ x1 + x2 + \... + xk, data = nonprob, svydesign = prob, method_outcome = "glm", family_outcome = "gaussian" ) \`\`\` | +| | | +| Doubly robust estimator with variable selection (SCAD) and bias minimization | \`\`\`{r, eval = FALSE} nonprob( selection = \~ x1 + x2 + \... + xk, outcome = y \~ x1 + x2 + \... + xk, data = nonprob, svydesign = prob, method_outcome = "glm", family_outcome = "gaussian", control_inference = controlInf( vars_selection = TRUE, bias_correction = TRUE ) ) \`\`\` | +| | | ## Examples diff --git a/README.md b/README.md index 5a46742..3188d19 100644 --- a/README.md +++ b/README.md @@ -119,296 +119,41 @@ possible scenarios: ### When unit-level data is available for non-probability survey only - - - - - - - - - - - - - - - - - - - - - -
-Estimator - -Example code -
-Mass imputation based on regression imputation - - -``` r -nonprob( - outcome = y ~ x1 + x2 + ... + xk, - data = nonprob, - pop_totals = c(`(Intercept)`= N, - x1 = tau_x1, - x2 = tau_x2, - ..., - xk = tau_xk), - method_outcome = "glm", - family_outcome = "gaussian" -) -``` - -
-Inverse probability weighting - - -``` r -nonprob( - selection = ~ x1 + x2 + ... + xk, - target = ~ y, - data = nonprob, - pop_totals = c(`(Intercept)` = N, - x1 = tau_x1, - x2 = tau_x2, - ..., - xk = tau_xk), - method_selection = "logit" -) -``` - -
-Inverse probability weighting with calibration constraint - - -``` r -nonprob( - selection = ~ x1 + x2 + ... + xk, - target = ~ y, - data = nonprob, - pop_totals = c(`(Intercept)`= N, - x1 = tau_x1, - x2 = tau_x2, - ..., - xk = tau_xk), - method_selection = "logit", - control_selection = controlSel(est_method_sel = "gee", h = 1) -) -``` - -
-Doubly robust estimator - - -``` r -nonprob( - selection = ~ x1 + x2 + ... + xk, - outcome = y ~ x1 + x2 + …, + xk, - pop_totals = c(`(Intercept)` = N, - x1 = tau_x1, - x2 = tau_x2, - ..., - xk = tau_xk), - svydesign = prob, - method_outcome = "glm", - family_outcome = "gaussian" -) -``` - -
+| Estimator | Example code | +|-----------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| | | +| Mass imputation based on regression imputation | \`\`\`{r, eval = FALSE} nonprob( outcome = y ~ x1 + x2 + ... + xk, data = nonprob, pop_totals = c(\`(Intercept)\`= N, x1 = tau_x1, x2 = tau_x2, ..., xk = tau_xk), method_outcome = “glm”, family_outcome = “gaussian” ) \`\`\` | +| | | +| Inverse probability weighting | \`\`\`{r, eval = FALSE} nonprob( selection = ~ x1 + x2 + ... + xk, target = ~ y, data = nonprob, pop_totals = c(\`(Intercept)\` = N, x1 = tau_x1, x2 = tau_x2, ..., xk = tau_xk), method_selection = “logit” ) \`\`\` | +| | | +| Inverse probability weighting with calibration constraint | \`\`\`{r, eval = FALSE} nonprob( selection = ~ x1 + x2 + ... + xk, target = ~ y, data = nonprob, pop_totals = c(\`(Intercept)\`= N, x1 = tau_x1, x2 = tau_x2, ..., xk = tau_xk), method_selection = “logit”, control_selection = controlSel(est_method_sel = “gee”, h = 1) ) \`\`\` | +| | | +| Doubly robust estimator | \`\`\`{r, eval = FALSE} nonprob( selection = ~ x1 + x2 + ... + xk, outcome = y ~ x1 + x2 + …, + xk, pop_totals = c(\`(Intercept)\` = N, x1 = tau_x1, x2 = tau_x2, ..., xk = tau_xk), svydesign = prob, method_outcome = “glm”, family_outcome = “gaussian” ) \`\`\` | +| | | ### When unit-level data are available for both surveys - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Estimator - -Example code -
-Mass imputation based on regression imputation - - -``` r -nonprob( - outcome = y ~ x1 + x2 + ... + xk, - data = nonprob, - svydesign = prob, - method_outcome = "glm", - family_outcome = "gaussian" -) -``` - -
-Mass imputation based on nearest neighbour imputation - - -``` r -nonprob( - outcome = y ~ x1 + x2 + ... + xk, - data = nonprob, - svydesign = prob, - method_outcome = "nn", - family_outcome = "gaussian", - control_outcome = controlOutcome(k = 2) -) -``` - -
-Mass imputation based on predictive mean matching - - -``` r -nonprob( - outcome = y ~ x1 + x2 + ... + xk, - data = nonprob, - svydesign = prob, - method_outcome = "pmm", - family_outcome = "gaussian" -) -``` - -
-Mass imputation based on regression imputation with variable selection -(LASSO) - - -``` r -nonprob( - outcome = y ~ x1 + x2 + ... + xk, - data = nonprob, - svydesign = prob, - method_outcome = "pmm", - family_outcome = "gaussian", - control_outcome = controlOut(penalty = "lasso"), - control_inference = controlInf(vars_selection = TRUE) -) -``` - -
-Inverse probability weighting - - -``` r -nonprob( - selection = ~ x1 + x2 + ... + xk, - target = ~ y, - data = nonprob, - svydesign = prob, - method_selection = "logit" -) -``` - -
-Inverse probability weighting with calibration constraint - - -``` r -nonprob( - selection = ~ x1 + x2 + ... + xk, - target = ~ y, - data = nonprob, - svydesign = prob, - method_selection = "logit", - control_selection = controlSel(est_method_sel = "gee", h = 1) -) -``` - -
-Inverse probability weighting with calibration constraint with variable -selection (SCAD) - - -``` r -nonprob( - selection = ~ x1 + x2 + ... + xk, - target = ~ y, - data = nonprob, - svydesign = prob, - method_outcome = "pmm", - family_outcome = "gaussian", - control_inference = controlInf(vars_selection = TRUE) -) -``` - -
-Doubly robust estimator - - -``` r -nonprob( - selection = ~ x1 + x2 + ... + xk, - outcome = y ~ x1 + x2 + ... + xk, - data = nonprob, - svydesign = prob, - method_outcome = "glm", - family_outcome = "gaussian" -) -``` - -
-Doubly robust estimator with variable selection (SCAD) and bias -minimization - - -``` r -nonprob( - selection = ~ x1 + x2 + ... + xk, - outcome = y ~ x1 + x2 + ... + xk, - data = nonprob, - svydesign = prob, - method_outcome = "glm", - family_outcome = "gaussian", - control_inference = controlInf( - vars_selection = TRUE, - bias_correction = TRUE - ) -) -``` - -
+| Estimator | Example code | +|------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| | | +| Mass imputation based on regression imputation | \`\`\`{r, eval = FALSE} nonprob( outcome = y ~ x1 + x2 + ... + xk, data = nonprob, svydesign = prob, method_outcome = “glm”, family_outcome = “gaussian” ) \`\`\` | +| | | +| Mass imputation based on nearest neighbour imputation | \`\`\`{r, eval = FALSE} nonprob( outcome = y ~ x1 + x2 + ... + xk, data = nonprob, svydesign = prob, method_outcome = “nn”, family_outcome = “gaussian”, control_outcome = controlOut(k = 2) ) \`\`\` | +| | | +| Mass imputation based on predictive mean matching | \`\`\`{r, eval = FALSE} nonprob( outcome = y ~ x1 + x2 + ... + xk, data = nonprob, svydesign = prob, method_outcome = “pmm”, family_outcome = “gaussian” ) \`\`\` | +| | | +| Mass imputation based on regression imputation with variable selection (LASSO) | \`\`\`{r, eval = FALSE} nonprob( outcome = y ~ x1 + x2 + ... + xk, data = nonprob, svydesign = prob, method_outcome = “pmm”, family_outcome = “gaussian”, control_outcome = controlOut(penalty = “lasso”), control_inference = controlInf(vars_selection = TRUE) ) \`\`\` | +| | | +| Inverse probability weighting | \`\`\`{r, eval = FALSE} nonprob( selection = ~ x1 + x2 + ... + xk, target = ~ y, data = nonprob, svydesign = prob, method_selection = “logit” ) \`\`\` | +| | | +| Inverse probability weighting with calibration constraint | \`\`\`{r, eval = FALSE} nonprob( selection = ~ x1 + x2 + ... + xk, target = ~ y, data = nonprob, svydesign = prob, method_selection = “logit”, control_selection = controlSel(est_method_sel = “gee”, h = 1) ) \`\`\` | +| | | +| Inverse probability weighting with calibration constraint with variable selection (SCAD) | \`\`\`{r, eval = FALSE} nonprob( selection = ~ x1 + x2 + ... + xk, target = ~ y, data = nonprob, svydesign = prob, method_outcome = “pmm”, family_outcome = “gaussian”, control_inference = controlInf(vars_selection = TRUE) ) \`\`\` | +| | | +| Doubly robust estimator | \`\`\`{r, eval = FALSE} nonprob( selection = ~ x1 + x2 + ... + xk, outcome = y ~ x1 + x2 + ... + xk, data = nonprob, svydesign = prob, method_outcome = “glm”, family_outcome = “gaussian” ) \`\`\` | +| | | +| Doubly robust estimator with variable selection (SCAD) and bias minimization | \`\`\`{r, eval = FALSE} nonprob( selection = ~ x1 + x2 + ... + xk, outcome = y ~ x1 + x2 + ... + xk, data = nonprob, svydesign = prob, method_outcome = “glm”, family_outcome = “gaussian”, control_inference = controlInf( vars_selection = TRUE, bias_correction = TRUE ) ) \`\`\` | +| | | ## Examples @@ -503,6 +248,11 @@ summary(result_dr) #> 1.000 1.071 1.313 1.479 1.798 2.647 #> ------------------------- #> +#> Covariate balance: +#> (Intercept) x2 +#> 25062.8473 -517.5862 +#> ------------------------- +#> #> Residuals: #> Min. 1st Qu. Median Mean 3rd Qu. Max. #> -0.99999 0.06603 0.23778 0.26046 0.44358 0.62222 @@ -609,6 +359,11 @@ summary(result_ipw) #> 1.000 1.071 1.313 1.479 1.798 2.647 #> ------------------------- #> +#> Covariate balance: +#> (Intercept) x2 +#> 25062.8473 -517.5862 +#> ------------------------- +#> #> Residuals: #> Min. 1st Qu. Median Mean 3rd Qu. Max. #> -0.99999 0.06603 0.23778 0.26046 0.44358 0.62222 From 64a36ed7d3a61d26dfe6a616aac0698d0b50cf3a Mon Sep 17 00:00:00 2001 From: LukaszChrostowski Date: Wed, 12 Jun 2024 10:34:35 +0100 Subject: [PATCH 10/24] close #52 --- R/control_outcome.R | 4 +- R/pmm.R | 92 ++++----- README.Rmd | 277 +++++++++++++++++++++++---- README.md | 319 ++++++++++++++++++++++++++++---- inst/tinytest/test_nonprobsvy.R | 6 +- man/controlOut.Rd | 4 +- 6 files changed, 585 insertions(+), 117 deletions(-) diff --git a/R/control_outcome.R b/R/control_outcome.R index e37d010..ed4ea58 100644 --- a/R/control_outcome.R +++ b/R/control_outcome.R @@ -16,8 +16,8 @@ #' @param predictive_match (Only for predictive mean matching) #' Indicates how to select 'closest' unit from nonprobability sample for each #' unit in probability sample. Either \code{1} (default) or \code{2} where -#' \code{1} is matching by minimizing distance between \mjseqn{\hat{y}_{i}} for -#' \mjseqn{i \in S_{A}} and \mjseqn{y_{j}} for \mjseqn{j \in S_{B}} and \code{2} +#' \code{2} is matching by minimizing distance between \mjseqn{\hat{y}_{i}} for +#' \mjseqn{i \in S_{A}} and \mjseqn{y_{j}} for \mjseqn{j \in S_{B}} and \code{1} #' is matching by minimizing distance between \mjseqn{\hat{y}_{i}} for #' \mjseqn{i \in S_{A}} and \mjseqn{\hat{y}_{i}} for \mjseqn{i \in S_{A}}. #' @param pmm_weights (Only for predictive mean matching) diff --git a/R/pmm.R b/R/pmm.R index 8ac4924..f486ed2 100644 --- a/R/pmm.R +++ b/R/pmm.R @@ -65,38 +65,44 @@ pmm_nonprobsvy <- function(outcome, # ) # add protection for very low values in weighting + # TODO issue #52 switch(control$predictive_match, { # 1 if (is.null(pop_totals)) { model_rand <- nonprobMI_nn( - data = y_nons, + data = glm_object$y_nons_pred, query = glm_object$y_rand_pred, k = control$k, treetype = control$treetype, searchtype = control$searchtype ) + y_rand_pred <- apply(model_rand$nn.idx, 1, + FUN = \(x) mean(y_nons[x]) + # FUN=\(x) mean(sample_nonprob$short_[x]) + ) + switch(control$pmm_weights, - "none" = { - y_rand_pred <- apply(model_rand$nn.idx, 1, - FUN = \(x) mean(y_nons[x]) - # FUN=\(x) mean(sample_nonprob$short_[x]) - ) - }, - "prop_dist" = { - # TODO:: these weights will need to be saved for variance estimation - y_rand_pred <- sapply(1:NROW(model_rand$nn.idx), - FUN = \(x) weighted.mean(y_nons[model_rand$nn.idx[x, ]], - w = 1 / model_rand$nn.dist[x, ] - ) - # FUN=\(x) mean(sample_nonprob$short_[x]) - ) - } + "none" = { + y_rand_pred <- apply(model_rand$nn.idx, 1, + FUN = \(x) mean(y_nons[x]) + # FUN=\(x) mean(sample_nonprob$short_[x]) + ) + }, + "prop_dist" = { + # TODO:: these weights will need to be saved for variance estimation + y_rand_pred <- sapply(1:NROW(model_rand$nn.idx), + FUN = \(x) weighted.mean(y_nons[model_rand$nn.idx[x, ]], + w = 1 / model_rand$nn.dist[x, ] + ) + # FUN=\(x) mean(sample_nonprob$short_[x]) + ) + } ) } else { # I'm not touching this model_rand <- nonprobMI_nn( - data = y_nons, + data = glm_object$y_nons_pred, query = glm_object$y_rand_pred, k = control$k, treetype = control$treetype, @@ -108,39 +114,34 @@ pmm_nonprobsvy <- function(outcome, { # 2 if (is.null(pop_totals)) { model_rand <- nonprobMI_nn( - data = glm_object$y_nons_pred, + data = y_nons, query = glm_object$y_rand_pred, k = control$k, treetype = control$treetype, searchtype = control$searchtype ) - y_rand_pred <- apply(model_rand$nn.idx, 1, - FUN = \(x) mean(y_nons[x]) - # FUN=\(x) mean(sample_nonprob$short_[x]) - ) - switch(control$pmm_weights, - "none" = { - y_rand_pred <- apply(model_rand$nn.idx, 1, - FUN = \(x) mean(y_nons[x]) - # FUN=\(x) mean(sample_nonprob$short_[x]) - ) - }, - "prop_dist" = { - # TODO:: these weights will need to be saved for variance estimation - y_rand_pred <- sapply(1:NROW(model_rand$nn.idx), - FUN = \(x) weighted.mean(y_nons[model_rand$nn.idx[x, ]], - w = 1 / model_rand$nn.dist[x, ] - ) - # FUN=\(x) mean(sample_nonprob$short_[x]) - ) - } + "none" = { + y_rand_pred <- apply(model_rand$nn.idx, 1, + FUN = \(x) mean(y_nons[x]) + # FUN=\(x) mean(sample_nonprob$short_[x]) + ) + }, + "prop_dist" = { + # TODO:: these weights will need to be saved for variance estimation + y_rand_pred <- sapply(1:NROW(model_rand$nn.idx), + FUN = \(x) weighted.mean(y_nons[model_rand$nn.idx[x, ]], + w = 1 / model_rand$nn.dist[x, ] + ) + # FUN=\(x) mean(sample_nonprob$short_[x]) + ) + } ) } else { # I'm not touching this model_rand <- nonprobMI_nn( - data = glm_object$y_nons_pred, + data = y_nons, query = glm_object$y_rand_pred, k = control$k, treetype = control$treetype, @@ -231,10 +232,15 @@ pmm_exact <- function(pi_ij, } } + # TODO issue #52 YY <- switch(predictive_match, { nonprobMI_nn( - data = y_nons_b, + data = predict( + reg_object_boot, + newdata = model_obj$model$glm_object$data[boot_samp, , drop = FALSE], + type = "response" + ), query = XX, k = k, searchtype = "standard", @@ -243,11 +249,7 @@ pmm_exact <- function(pi_ij, }, { nonprobMI_nn( - data = predict( - reg_object_boot, - newdata = model_obj$model$glm_object$data[boot_samp, , drop = FALSE], - type = "response" - ), + data = y_nons_b, query = XX, k = k, searchtype = "standard", diff --git a/README.Rmd b/README.Rmd index 13ff8ae..b9b5f7c 100644 --- a/README.Rmd +++ b/README.Rmd @@ -136,42 +136,253 @@ possible scenarios: ### When unit-level data is available for non-probability survey only -| Estimator | Example code | -|-----------------------------------|-----------------------------------| -| | | -| Mass imputation based on regression imputation | \`\`\`{r, eval = FALSE} nonprob( outcome = y \~ x1 + x2 + \... + xk, data = nonprob, pop_totals = c(\`(Intercept)\`= N, x1 = tau_x1, x2 = tau_x2, \..., xk = tau_xk), method_outcome = "glm", family_outcome = "gaussian" ) \`\`\` | -| | | -| Inverse probability weighting | \`\`\`{r, eval = FALSE} nonprob( selection = \~ x1 + x2 + \... + xk, target = \~ y, data = nonprob, pop_totals = c(\`(Intercept)\` = N, x1 = tau_x1, x2 = tau_x2, \..., xk = tau_xk), method_selection = "logit" ) \`\`\` | -| | | -| Inverse probability weighting with calibration constraint | \`\`\`{r, eval = FALSE} nonprob( selection = \~ x1 + x2 + \... + xk, target = \~ y, data = nonprob, pop_totals = c(\`(Intercept)\`= N, x1 = tau_x1, x2 = tau_x2, \..., xk = tau_xk), method_selection = "logit", control_selection = controlSel(est_method_sel = "gee", h = 1) ) \`\`\` | -| | | -| Doubly robust estimator | \`\`\`{r, eval = FALSE} nonprob( selection = \~ x1 + x2 + \... + xk, outcome = y \~ x1 + x2 + ..., + xk, pop_totals = c(\`(Intercept)\` = N, x1 = tau_x1, x2 = tau_x2, \..., xk = tau_xk), svydesign = prob, method_outcome = "glm", family_outcome = "gaussian" ) \`\`\` | -| | | + + + + + + + + + + + + + + + + + + +
Estimator Example code
+Mass imputation based on regression imputation + +```{r, eval = FALSE} +nonprob( + outcome = y ~ x1 + x2 + ... + xk, + data = nonprob, + pop_totals = c(`(Intercept)`= N, + x1 = tau_x1, + x2 = tau_x2, + ..., + xk = tau_xk), + method_outcome = "glm", + family_outcome = "gaussian" +) +``` +
+Inverse probability weighting + +```{r, eval = FALSE} +nonprob( + selection = ~ x1 + x2 + ... + xk, + target = ~ y, + data = nonprob, + pop_totals = c(`(Intercept)` = N, + x1 = tau_x1, + x2 = tau_x2, + ..., + xk = tau_xk), + method_selection = "logit" +) +``` +
+Inverse probability weighting with calibration constraint + +```{r, eval = FALSE} +nonprob( + selection = ~ x1 + x2 + ... + xk, + target = ~ y, + data = nonprob, + pop_totals = c(`(Intercept)`= N, + x1 = tau_x1, + x2 = tau_x2, + ..., + xk = tau_xk), + method_selection = "logit", + control_selection = controlSel(est_method_sel = "gee", h = 1) +) +``` +
+Doubly robust estimator + +```{r, eval = FALSE} +nonprob( + selection = ~ x1 + x2 + ... + xk, + outcome = y ~ x1 + x2 + …, + xk, + pop_totals = c(`(Intercept)` = N, + x1 = tau_x1, + x2 = tau_x2, + ..., + xk = tau_xk), + svydesign = prob, + method_outcome = "glm", + family_outcome = "gaussian" +) +``` +
### When unit-level data are available for both surveys -| Estimator | Example code | -|-----------------------------------|-----------------------------------| -| | | -| Mass imputation based on regression imputation | \`\`\`{r, eval = FALSE} nonprob( outcome = y \~ x1 + x2 + \... + xk, data = nonprob, svydesign = prob, method_outcome = "glm", family_outcome = "gaussian" ) \`\`\` | -| | | -| Mass imputation based on nearest neighbour imputation | \`\`\`{r, eval = FALSE} nonprob( outcome = y \~ x1 + x2 + \... + xk, data = nonprob, svydesign = prob, method_outcome = "nn", family_outcome = "gaussian", control_outcome = controlOut(k = 2) ) \`\`\` | -| | | -| Mass imputation based on predictive mean matching | \`\`\`{r, eval = FALSE} nonprob( outcome = y \~ x1 + x2 + \... + xk, data = nonprob, svydesign = prob, method_outcome = "pmm", family_outcome = "gaussian" ) \`\`\` | -| | | -| Mass imputation based on regression imputation with variable selection (LASSO) | \`\`\`{r, eval = FALSE} nonprob( outcome = y \~ x1 + x2 + \... + xk, data = nonprob, svydesign = prob, method_outcome = "pmm", family_outcome = "gaussian", control_outcome = controlOut(penalty = "lasso"), control_inference = controlInf(vars_selection = TRUE) ) \`\`\` | -| | | -| Inverse probability weighting | \`\`\`{r, eval = FALSE} nonprob( selection = \~ x1 + x2 + \... + xk, target = \~ y, data = nonprob, svydesign = prob, method_selection = "logit" ) \`\`\` | -| | | -| Inverse probability weighting with calibration constraint | \`\`\`{r, eval = FALSE} nonprob( selection = \~ x1 + x2 + \... + xk, target = \~ y, data = nonprob, svydesign = prob, method_selection = "logit", control_selection = controlSel(est_method_sel = "gee", h = 1) ) \`\`\` | -| | | -| Inverse probability weighting with calibration constraint with variable selection (SCAD) | \`\`\`{r, eval = FALSE} nonprob( selection = \~ x1 + x2 + \... + xk, target = \~ y, data = nonprob, svydesign = prob, method_outcome = "pmm", family_outcome = "gaussian", control_inference = controlInf(vars_selection = TRUE) ) \`\`\` | -| | | -| Doubly robust estimator | \`\`\`{r, eval = FALSE} nonprob( selection = \~ x1 + x2 + \... + xk, outcome = y \~ x1 + x2 + \... + xk, data = nonprob, svydesign = prob, method_outcome = "glm", family_outcome = "gaussian" ) \`\`\` | -| | | -| Doubly robust estimator with variable selection (SCAD) and bias minimization | \`\`\`{r, eval = FALSE} nonprob( selection = \~ x1 + x2 + \... + xk, outcome = y \~ x1 + x2 + \... + xk, data = nonprob, svydesign = prob, method_outcome = "glm", family_outcome = "gaussian", control_inference = controlInf( vars_selection = TRUE, bias_correction = TRUE ) ) \`\`\` | -| | | - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Estimator Example code
+Mass imputation based on regression imputation + +```{r, eval = FALSE} +nonprob( + outcome = y ~ x1 + x2 + ... + xk, + data = nonprob, + svydesign = prob, + method_outcome = "glm", + family_outcome = "gaussian" +) +``` +
+Mass imputation based on nearest neighbour imputation + +```{r, eval = FALSE} +nonprob( + outcome = y ~ x1 + x2 + ... + xk, + data = nonprob, + svydesign = prob, + method_outcome = "nn", + family_outcome = "gaussian", + control_outcome = controlOutcome(k = 2) +) +``` +
+Mass imputation based on predictive mean matching + +```{r, eval = FALSE} +nonprob( + outcome = y ~ x1 + x2 + ... + xk, + data = nonprob, + svydesign = prob, + method_outcome = "pmm", + family_outcome = "gaussian" +) +``` +
+Mass imputation based on regression imputation with variable selection (LASSO) + +```{r, eval = FALSE} +nonprob( + outcome = y ~ x1 + x2 + ... + xk, + data = nonprob, + svydesign = prob, + method_outcome = "pmm", + family_outcome = "gaussian", + control_outcome = controlOut(penalty = "lasso"), + control_inference = controlInf(vars_selection = TRUE) +) +``` +
+Inverse probability weighting + +```{r, eval = FALSE} +nonprob( + selection = ~ x1 + x2 + ... + xk, + target = ~ y, + data = nonprob, + svydesign = prob, + method_selection = "logit" +) +``` +
+Inverse probability weighting with calibration constraint + +```{r, eval = FALSE} +nonprob( + selection = ~ x1 + x2 + ... + xk, + target = ~ y, + data = nonprob, + svydesign = prob, + method_selection = "logit", + control_selection = controlSel(est_method_sel = "gee", h = 1) +) +``` +
+Inverse probability weighting with calibration constraint with variable selection (SCAD) + +```{r, eval = FALSE} +nonprob( + selection = ~ x1 + x2 + ... + xk, + target = ~ y, + data = nonprob, + svydesign = prob, + method_outcome = "pmm", + family_outcome = "gaussian", + control_inference = controlInf(vars_selection = TRUE) +) +``` +
+Doubly robust estimator + +```{r, eval = FALSE} +nonprob( + selection = ~ x1 + x2 + ... + xk, + outcome = y ~ x1 + x2 + ... + xk, + data = nonprob, + svydesign = prob, + method_outcome = "glm", + family_outcome = "gaussian" +) +``` +
+Doubly robust estimator with variable selection (SCAD) and bias minimization + +```{r, eval = FALSE} +nonprob( + selection = ~ x1 + x2 + ... + xk, + outcome = y ~ x1 + x2 + ... + xk, + data = nonprob, + svydesign = prob, + method_outcome = "glm", + family_outcome = "gaussian", + control_inference = controlInf( + vars_selection = TRUE, + bias_correction = TRUE + ) +) +``` +
## Examples Simulate example data from the following paper: Kim, Jae Kwang, and diff --git a/README.md b/README.md index 3188d19..79bd182 100644 --- a/README.md +++ b/README.md @@ -119,41 +119,296 @@ possible scenarios: ### When unit-level data is available for non-probability survey only -| Estimator | Example code | -|-----------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| | | -| Mass imputation based on regression imputation | \`\`\`{r, eval = FALSE} nonprob( outcome = y ~ x1 + x2 + ... + xk, data = nonprob, pop_totals = c(\`(Intercept)\`= N, x1 = tau_x1, x2 = tau_x2, ..., xk = tau_xk), method_outcome = “glm”, family_outcome = “gaussian” ) \`\`\` | -| | | -| Inverse probability weighting | \`\`\`{r, eval = FALSE} nonprob( selection = ~ x1 + x2 + ... + xk, target = ~ y, data = nonprob, pop_totals = c(\`(Intercept)\` = N, x1 = tau_x1, x2 = tau_x2, ..., xk = tau_xk), method_selection = “logit” ) \`\`\` | -| | | -| Inverse probability weighting with calibration constraint | \`\`\`{r, eval = FALSE} nonprob( selection = ~ x1 + x2 + ... + xk, target = ~ y, data = nonprob, pop_totals = c(\`(Intercept)\`= N, x1 = tau_x1, x2 = tau_x2, ..., xk = tau_xk), method_selection = “logit”, control_selection = controlSel(est_method_sel = “gee”, h = 1) ) \`\`\` | -| | | -| Doubly robust estimator | \`\`\`{r, eval = FALSE} nonprob( selection = ~ x1 + x2 + ... + xk, outcome = y ~ x1 + x2 + …, + xk, pop_totals = c(\`(Intercept)\` = N, x1 = tau_x1, x2 = tau_x2, ..., xk = tau_xk), svydesign = prob, method_outcome = “glm”, family_outcome = “gaussian” ) \`\`\` | -| | | + + + + + + + + + + + + + + + + + + + + + +
+Estimator + +Example code +
+Mass imputation based on regression imputation + + +``` r +nonprob( + outcome = y ~ x1 + x2 + ... + xk, + data = nonprob, + pop_totals = c(`(Intercept)`= N, + x1 = tau_x1, + x2 = tau_x2, + ..., + xk = tau_xk), + method_outcome = "glm", + family_outcome = "gaussian" +) +``` + +
+Inverse probability weighting + + +``` r +nonprob( + selection = ~ x1 + x2 + ... + xk, + target = ~ y, + data = nonprob, + pop_totals = c(`(Intercept)` = N, + x1 = tau_x1, + x2 = tau_x2, + ..., + xk = tau_xk), + method_selection = "logit" +) +``` + +
+Inverse probability weighting with calibration constraint + + +``` r +nonprob( + selection = ~ x1 + x2 + ... + xk, + target = ~ y, + data = nonprob, + pop_totals = c(`(Intercept)`= N, + x1 = tau_x1, + x2 = tau_x2, + ..., + xk = tau_xk), + method_selection = "logit", + control_selection = controlSel(est_method_sel = "gee", h = 1) +) +``` + +
+Doubly robust estimator + + +``` r +nonprob( + selection = ~ x1 + x2 + ... + xk, + outcome = y ~ x1 + x2 + …, + xk, + pop_totals = c(`(Intercept)` = N, + x1 = tau_x1, + x2 = tau_x2, + ..., + xk = tau_xk), + svydesign = prob, + method_outcome = "glm", + family_outcome = "gaussian" +) +``` + +
### When unit-level data are available for both surveys -| Estimator | Example code | -|------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| | | -| Mass imputation based on regression imputation | \`\`\`{r, eval = FALSE} nonprob( outcome = y ~ x1 + x2 + ... + xk, data = nonprob, svydesign = prob, method_outcome = “glm”, family_outcome = “gaussian” ) \`\`\` | -| | | -| Mass imputation based on nearest neighbour imputation | \`\`\`{r, eval = FALSE} nonprob( outcome = y ~ x1 + x2 + ... + xk, data = nonprob, svydesign = prob, method_outcome = “nn”, family_outcome = “gaussian”, control_outcome = controlOut(k = 2) ) \`\`\` | -| | | -| Mass imputation based on predictive mean matching | \`\`\`{r, eval = FALSE} nonprob( outcome = y ~ x1 + x2 + ... + xk, data = nonprob, svydesign = prob, method_outcome = “pmm”, family_outcome = “gaussian” ) \`\`\` | -| | | -| Mass imputation based on regression imputation with variable selection (LASSO) | \`\`\`{r, eval = FALSE} nonprob( outcome = y ~ x1 + x2 + ... + xk, data = nonprob, svydesign = prob, method_outcome = “pmm”, family_outcome = “gaussian”, control_outcome = controlOut(penalty = “lasso”), control_inference = controlInf(vars_selection = TRUE) ) \`\`\` | -| | | -| Inverse probability weighting | \`\`\`{r, eval = FALSE} nonprob( selection = ~ x1 + x2 + ... + xk, target = ~ y, data = nonprob, svydesign = prob, method_selection = “logit” ) \`\`\` | -| | | -| Inverse probability weighting with calibration constraint | \`\`\`{r, eval = FALSE} nonprob( selection = ~ x1 + x2 + ... + xk, target = ~ y, data = nonprob, svydesign = prob, method_selection = “logit”, control_selection = controlSel(est_method_sel = “gee”, h = 1) ) \`\`\` | -| | | -| Inverse probability weighting with calibration constraint with variable selection (SCAD) | \`\`\`{r, eval = FALSE} nonprob( selection = ~ x1 + x2 + ... + xk, target = ~ y, data = nonprob, svydesign = prob, method_outcome = “pmm”, family_outcome = “gaussian”, control_inference = controlInf(vars_selection = TRUE) ) \`\`\` | -| | | -| Doubly robust estimator | \`\`\`{r, eval = FALSE} nonprob( selection = ~ x1 + x2 + ... + xk, outcome = y ~ x1 + x2 + ... + xk, data = nonprob, svydesign = prob, method_outcome = “glm”, family_outcome = “gaussian” ) \`\`\` | -| | | -| Doubly robust estimator with variable selection (SCAD) and bias minimization | \`\`\`{r, eval = FALSE} nonprob( selection = ~ x1 + x2 + ... + xk, outcome = y ~ x1 + x2 + ... + xk, data = nonprob, svydesign = prob, method_outcome = “glm”, family_outcome = “gaussian”, control_inference = controlInf( vars_selection = TRUE, bias_correction = TRUE ) ) \`\`\` | -| | | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+Estimator + +Example code +
+Mass imputation based on regression imputation + + +``` r +nonprob( + outcome = y ~ x1 + x2 + ... + xk, + data = nonprob, + svydesign = prob, + method_outcome = "glm", + family_outcome = "gaussian" +) +``` + +
+Mass imputation based on nearest neighbour imputation + + +``` r +nonprob( + outcome = y ~ x1 + x2 + ... + xk, + data = nonprob, + svydesign = prob, + method_outcome = "nn", + family_outcome = "gaussian", + control_outcome = controlOutcome(k = 2) +) +``` + +
+Mass imputation based on predictive mean matching + + +``` r +nonprob( + outcome = y ~ x1 + x2 + ... + xk, + data = nonprob, + svydesign = prob, + method_outcome = "pmm", + family_outcome = "gaussian" +) +``` + +
+Mass imputation based on regression imputation with variable selection +(LASSO) + + +``` r +nonprob( + outcome = y ~ x1 + x2 + ... + xk, + data = nonprob, + svydesign = prob, + method_outcome = "pmm", + family_outcome = "gaussian", + control_outcome = controlOut(penalty = "lasso"), + control_inference = controlInf(vars_selection = TRUE) +) +``` + +
+Inverse probability weighting + + +``` r +nonprob( + selection = ~ x1 + x2 + ... + xk, + target = ~ y, + data = nonprob, + svydesign = prob, + method_selection = "logit" +) +``` + +
+Inverse probability weighting with calibration constraint + + +``` r +nonprob( + selection = ~ x1 + x2 + ... + xk, + target = ~ y, + data = nonprob, + svydesign = prob, + method_selection = "logit", + control_selection = controlSel(est_method_sel = "gee", h = 1) +) +``` + +
+Inverse probability weighting with calibration constraint with variable +selection (SCAD) + + +``` r +nonprob( + selection = ~ x1 + x2 + ... + xk, + target = ~ y, + data = nonprob, + svydesign = prob, + method_outcome = "pmm", + family_outcome = "gaussian", + control_inference = controlInf(vars_selection = TRUE) +) +``` + +
+Doubly robust estimator + + +``` r +nonprob( + selection = ~ x1 + x2 + ... + xk, + outcome = y ~ x1 + x2 + ... + xk, + data = nonprob, + svydesign = prob, + method_outcome = "glm", + family_outcome = "gaussian" +) +``` + +
+Doubly robust estimator with variable selection (SCAD) and bias +minimization + + +``` r +nonprob( + selection = ~ x1 + x2 + ... + xk, + outcome = y ~ x1 + x2 + ... + xk, + data = nonprob, + svydesign = prob, + method_outcome = "glm", + family_outcome = "gaussian", + control_inference = controlInf( + vars_selection = TRUE, + bias_correction = TRUE + ) +) +``` + +
## Examples diff --git a/inst/tinytest/test_nonprobsvy.R b/inst/tinytest/test_nonprobsvy.R index 700901b..50d80b2 100644 --- a/inst/tinytest/test_nonprobsvy.R +++ b/inst/tinytest/test_nonprobsvy.R @@ -239,13 +239,13 @@ expect_silent( ) expect_equivalent( test3apmm$output$mean, - 5.086964, + 5.027936, tolerance = .01 ) expect_true( - (test3apmm$confidence_interval[1] < 5.086964) & - (5.086964 < test3ann$confidence_interval[2]) + (test3apmm$confidence_interval[1] < 5.027936) & + (5.027936 < test3ann$confidence_interval[2]) ) ## bootstrap diff --git a/man/controlOut.Rd b/man/controlOut.Rd index 5d609df..4230429 100644 --- a/man/controlOut.Rd +++ b/man/controlOut.Rd @@ -51,8 +51,8 @@ controlOut( \item{predictive_match}{(Only for predictive mean matching) Indicates how to select 'closest' unit from nonprobability sample for each unit in probability sample. Either \code{1} (default) or \code{2} where -\code{1} is matching by minimizing distance between \mjseqn{\hat{y}_{i}} for -\mjseqn{i \in S_{A}} and \mjseqn{y_{j}} for \mjseqn{j \in S_{B}} and \code{2} +\code{2} is matching by minimizing distance between \mjseqn{\hat{y}_{i}} for +\mjseqn{i \in S_{A}} and \mjseqn{y_{j}} for \mjseqn{j \in S_{B}} and \code{1} is matching by minimizing distance between \mjseqn{\hat{y}_{i}} for \mjseqn{i \in S_{A}} and \mjseqn{\hat{y}_{i}} for \mjseqn{i \in S_{A}}.} From 74dc1f4e27e12dbaa820224b0e03fae4896b1d46 Mon Sep 17 00:00:00 2001 From: LukaszChrostowski Date: Wed, 12 Jun 2024 15:52:00 +0100 Subject: [PATCH 11/24] small changes and NEWS.md update --- NEWS.md | 12 +++++++++++- R/control_inference.R | 14 +++++++------- R/nonprobMI.R | 8 ++++---- R/varianceMI.R | 6 +++--- man/controlInf.Rd | 8 ++++---- 5 files changed, 29 insertions(+), 19 deletions(-) diff --git a/NEWS.md b/NEWS.md index 4438cb8..fd5d398 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,14 @@ -## version 0.1.0 +# nonprobsvy 0.1.1 + +- bugfixes + - bug Fix occuring when estimation was based on auxiliary variable, which led to compression of the data from the frame to the vector. + - bug Fix related to not passing `maxit` argument from `controlSel` function to internally used `nleqslv` function +- features + - add information to `summary` about quality of estimation basing on difference between estimated and known total values of auxiliary variables + - add estimation of exact standard error for k-nearest neighbor estimator. + - add breaking change to `controlOut` function by switching values for `predictive_match` argument. From now on, the `predictive_match = 1` means $\hat{y}-\hat{y}$ in predictive mean matching imputation and `predictive_match = 2` corresponds to $\hat{y}-y$ matching. + +## nonprobsvy 0.1.0 ------------------------------------------------------------------------ diff --git a/R/control_inference.R b/R/control_inference.R index b0290cd..a2d1199 100644 --- a/R/control_inference.R +++ b/R/control_inference.R @@ -17,9 +17,9 @@ #' @param cores Number of cores in parallel computing. #' @param keep_boot Logical indicating whether statistics from bootstrap should be kept. #' By default set to \code{TRUE} -#' @param pmm_exact_se Logical value indicating whether to compute the exact -#' standard error estimate for \code{pmm} estimator. The variance estimator for -#' estimation based on \code{pmm} can be decomposed into three parts, with the +#' @param nn_exact_se Logical value indicating whether to compute the exact +#' standard error estimate for \code{nn} or \code{pmm} estimator. The variance estimator for +#' estimation based on \code{nn} or \code{pmm} can be decomposed into three parts, with the #' third being computed using covariance between imputed values for units in #' probability sample using predictive matches from non-probability sample. #' In most situations this term is negligible and is very computationally @@ -51,7 +51,7 @@ controlInf <- function(vars_selection = FALSE, alpha = 0.05, cores = 1, keep_boot, - pmm_exact_se = FALSE, + nn_exact_se = FALSE, pi_ij) { list( vars_selection = if (missing(vars_selection)) FALSE else vars_selection, @@ -71,10 +71,10 @@ controlInf <- function(vars_selection = FALSE, keep_boot } }, - pmm_exact_se = if (!is.logical(pmm_exact_se) & length(pmm_exact_se) == 1) { - stop("Argument pmm_exact_se must be a logical scalar") + nn_exact_se = if (!is.logical(nn_exact_se) & length(nn_exact_se) == 1) { + stop("Argument nn_exact_se must be a logical scalar") } else { - pmm_exact_se + nn_exact_se }, pi_ij = if (missing(pi_ij)) NULL else pi_ij ) diff --git a/R/nonprobMI.R b/R/nonprobMI.R index d845fcb..5eeba61 100644 --- a/R/nonprobMI.R +++ b/R/nonprobMI.R @@ -139,7 +139,7 @@ nonprobMI <- function(outcome, pop_totals = pop_totals, k = control_outcome$k, predictive_match = control_outcome$predictive_match, - pmm_exact_se = control_inference$pmm_exact_se, + nn_exact_se = control_inference$nn_exact_se, pmm_reg_engine = control_outcome$pmm_reg_engine, pi_ij = control_inference$pi_ij ) @@ -330,8 +330,8 @@ nonprobMI <- function(outcome, stop("Please, provide svydesign object or pop_totals/pop_means.") } - if (isTRUE(attr(model_obj$model, "method") == "pmm") & !(control_inference$pmm_exact_se)) { - # if not pmm_exact_se then this can be dropped + if (isTRUE(attr(model_obj$model, "method") == "pmm") & !(control_inference$nn_exact_se)) { + # if not nn_exact_se then this can be dropped model_obj$model$glm_obj <- NULL } @@ -356,7 +356,7 @@ nonprobMI <- function(outcome, # we should probably just pass full control list k = control_outcome$k, predictive_match = control_outcome$predictive_match, - pmm_exact_se = control_inference$pmm_exact_se, + nn_exact_se = control_inference$nn_exact_se, pmm_reg_engine = control_outcome$pmm_reg_engine, pi_ij = control_inference$pi_ij ) diff --git a/R/varianceMI.R b/R/varianceMI.R index b4cd317..dd01cd5 100644 --- a/R/varianceMI.R +++ b/R/varianceMI.R @@ -19,7 +19,7 @@ internal_varMI <- function(svydesign, pop_totals, k, predictive_match, - pmm_exact_se, + nn_exact_se, pmm_reg_engine, pi_ij) { parameters <- model_obj$parameters @@ -38,7 +38,7 @@ internal_varMI <- function(svydesign, est_ps <- n_nons / N var_nonprob <- n_rand / N^2 * (1 - est_ps) / est_ps * sigma_hat - if (pmm_exact_se) { + if (nn_exact_se) { var_nonprob <- nn_exact( pi_ij = pi_ij, weights_rand = weights_rand, @@ -76,7 +76,7 @@ internal_varMI <- function(svydesign, # An option in controlInf controls this # Maybe add a warning/message if this computation is omited - if (pmm_exact_se) { + if (nn_exact_se) { var_nonprob <- pmm_exact( pi_ij = pi_ij, weights_rand = weights_rand, diff --git a/man/controlInf.Rd b/man/controlInf.Rd index 98c72a0..7ed1c89 100644 --- a/man/controlInf.Rd +++ b/man/controlInf.Rd @@ -15,7 +15,7 @@ controlInf( alpha = 0.05, cores = 1, keep_boot, - pmm_exact_se = FALSE, + nn_exact_se = FALSE, pi_ij ) } @@ -45,9 +45,9 @@ selection and outcome model. \item{keep_boot}{Logical indicating whether statistics from bootstrap should be kept. By default set to \code{TRUE}} -\item{pmm_exact_se}{Logical value indicating whether to compute the exact -standard error estimate for \code{pmm} estimator. The variance estimator for -estimation based on \code{pmm} can be decomposed into three parts, with the +\item{nn_exact_se}{Logical value indicating whether to compute the exact +standard error estimate for \code{nn} or \code{pmm} estimator. The variance estimator for +estimation based on \code{nn} or \code{pmm} can be decomposed into three parts, with the third being computed using covariance between imputed values for units in probability sample using predictive matches from non-probability sample. In most situations this term is negligible and is very computationally From c751c47d42bc0f2402bd94989dcdf52ba47b1e33 Mon Sep 17 00:00:00 2001 From: LukaszChrostowski Date: Wed, 10 Jul 2024 13:26:52 +0200 Subject: [PATCH 12/24] bugFix and add data manipulation changes for polynomial equations --- R/boot_mi.R | 5 ++--- R/data_manip.R | 25 ++++++++++++++++++------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/R/boot_mi.R b/R/boot_mi.R index b5769f4..913b53b 100644 --- a/R/boot_mi.R +++ b/R/boot_mi.R @@ -46,10 +46,9 @@ bootMI <- function(X_rand, predictive_match <- control_outcome$predictive_match - pmm_exact_se <- control_inference$pmm_exact_se + nn_exact_se <- control_inference$nn_exact_se pmm_reg_engine <- control_outcome$pmm_reg_engine pi_ij <- control_inference$pi_ij - pmm_exact_se <- control_inference$pmm_exact_se comp2_stat <- numeric(length = num_boot) if (is.null(pop_totals)) { @@ -399,7 +398,7 @@ bootMI <- function(X_rand, } # mu_hat_boot <- mean(mu_hats) if (method == "pmm") { - if (pmm_exact_se) { + if (nn_exact_se) { comp2 <- pmm_exact(pi_ij, weights_rand, n_nons = n_nons, diff --git a/R/data_manip.R b/R/data_manip.R index b165f88..a52e250 100644 --- a/R/data_manip.R +++ b/R/data_manip.R @@ -5,9 +5,11 @@ model_frame <- function(formula, data, weights = NULL, svydesign = NULL, pop_tot model_Frame <- model.frame(formula, data) y_nons <- model.response(model_Frame) outcome_name <- names(model_Frame)[1] - mt <- attr(model_Frame, "terms") - nons_names <- attr(mt, "term.labels") # colnames(get_all_vars(formula, data)) names of variables of nonprobability sample terms(formula, data = data) - ##### Model frame for probability sample ##### + + mt <- terms(formula) # attr(model_Frame, "terms") + nons_names <- all.vars(as.formula(paste("~", paste(attr(mt, "term.labels"), collapse=" + ")))) + #nons_names <- attr(mt, "term.labels") # colnames(get_all_vars(formula, data)) names of variables of nonprobability sample terms(formula, data = data) + ##### Model frame for probability sample ##### if (outcome_name %in% colnames(svydesign$variables)) { # design_to_frame <- svydesign$variables # design_to_frame[, outcome_name][is.na(design_to_frame[, outcome_name])] <- 0 # replace NA in dependent outcome with 0 @@ -19,16 +21,25 @@ model_frame <- function(formula, data, weights = NULL, svydesign = NULL, pop_tot design_to_frame <- svydesign$variables design_to_frame[, outcome_name][is.na(design_to_frame[, outcome_name])] <- 0 # replace NA in dependent outcome with 0 names_rand <- all.vars(formula) - model_Frame_rand <- design_to_frame[, names_rand] + model_Frame_rand <- design_to_frame[, names_rand, drop = FALSE] - nons_names_rand <- attr(attr(model.frame(formula, design_to_frame), "terms"), "term.labels") + nons_names_rand <- intersect(names_rand, colnames(design_to_frame)) } else { design_to_frame <- svydesign$variables - names_rand <- all.vars(formula[-2]) + ## + terms_object <- terms(formula) + #names_rand <- all.vars(terms_object) + names_rand <- all.vars(as.formula(paste("~", paste(attr(terms_object, "term.labels"), collapse=" + ")))) + ## + # names_rand <- all.vars(formula[-2]) # old + ## + model_Frame_rand <- design_to_frame[, names_rand, drop = FALSE] + nons_names_rand <- intersect(names_rand, colnames(design_to_frame)) - nons_names_rand <- attr(attr(model.frame(formula[-2], design_to_frame), "terms"), "term.labels") + # nons_names_rand <- attr(attr(model.frame(formula[-2], design_to_frame), "terms"), "term.labels") # old + # old # model_Frame_rand <- model.frame(formula[-2], svydesign$variables) # mt_rand <- attr(model_Frame_rand, "terms") # nons_names_rand <- attr(mt_rand, "term.labels") From a449ff4948df6f920792f8eb1fc92d962ff4ba27 Mon Sep 17 00:00:00 2001 From: LukaszChrostowski Date: Wed, 10 Jul 2024 13:47:40 +0200 Subject: [PATCH 13/24] temporary change for simulations --- src/Makevars | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Makevars b/src/Makevars index f09302a..3d1e654 100644 --- a/src/Makevars +++ b/src/Makevars @@ -15,5 +15,5 @@ ## set the appropriate value, possibly CXX17. # CXX_STD = CXX11 -PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) -PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) +#PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) +#PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) From 42e7db4645e2a3258c0531a5e878597e784084ef Mon Sep 17 00:00:00 2001 From: LukaszChrostowski Date: Sun, 25 Aug 2024 12:55:02 +0200 Subject: [PATCH 14/24] small changes --- R/main_function_documentation.R | 4 ++-- R/nonprobDR.R | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/R/main_function_documentation.R b/R/main_function_documentation.R index bf31968..17cb042 100644 --- a/R/main_function_documentation.R +++ b/R/main_function_documentation.R @@ -11,8 +11,8 @@ NULL #' Yang et al. (2020), Wu (2022) and use the [Lumley 2004](https://CRAN.R-project.org/package=survey) `survey` package for inference. #' #' It provides propensity score weighting (e.g. with calibration constraints), mass imputation (e.g. nearest neighbor) and -#' doubly robust estimators that take into account minimisation of the asymptotic bias of the population mean estimators, -#' variable selection or overlap between probability and non-probability samples. +#' doubly robust estimators that take into account minimisation of the asymptotic bias of the population mean estimators or +#' variable selection. #' The package uses `survey` package functionality when a probability sample is available. #' #' diff --git a/R/nonprobDR.R b/R/nonprobDR.R index eb21968..8f64338 100644 --- a/R/nonprobDR.R +++ b/R/nonprobDR.R @@ -319,6 +319,7 @@ nonprobDR <- function(selection, nleqslv_method = control_selection$nleqslv_method, nleqslv_global = control_selection$nleqslv_global, nleqslv_xscalm = control_selection$nleqslv_xscalm, + maxit = maxit ) selection_model <- estimation_model$selection From a819c6062251fa72cc41e1b7c67821c0544a6d03 Mon Sep 17 00:00:00 2001 From: LukaszChrostowski Date: Sun, 15 Sep 2024 11:38:40 +0200 Subject: [PATCH 15/24] add div version for variable selection and small changes in output --- R/main_function_documentation.R | 1 + R/nonprobDR.R | 60 +++++++++++++++++++++++---------- R/nonprobIPW.R | 1 + R/nonprobMI.R | 1 + R/prints.R | 2 +- man/nonprob.Rd | 5 +-- 6 files changed, 50 insertions(+), 20 deletions(-) diff --git a/R/main_function_documentation.R b/R/main_function_documentation.R index 17cb042..3fcb96d 100644 --- a/R/main_function_documentation.R +++ b/R/main_function_documentation.R @@ -182,6 +182,7 @@ NULL #' \itemize{ #' \item{\code{X} -- model matrix containing data from probability and non-probability samples if specified at a function call.} #' \item{\code{y}} -- list of vector of outcome variables if specified at a function call. +#' \item{\code{R}} -- vector indicating the probablistic (0) or non-probablistic (1) units in the matrix X. #' \item{\code{prob} -- vector of estimated propensity scores for non-probability sample.} #' \item{\code{weights} -- vector of estimated weights for non-probability sample.} #' \item{\code{control} -- list of control functions.} diff --git a/R/nonprobDR.R b/R/nonprobDR.R index 8f64338..c547dd4 100644 --- a/R/nonprobDR.R +++ b/R/nonprobDR.R @@ -408,13 +408,26 @@ nonprobDR <- function(selection, lambda_outcome <- beta$lambda lambda_min_outcome <- beta$lambda.min - idx <- sort(unique(c(beta_selected[-1], theta_selected[-1]))) # excluding intercepts - psel <- length(idx) - Xsel <- as.matrix(X[, idx + 1, drop = FALSE]) - X <- cbind(1, Xsel) - colnames(X) <- c("(Intercept)", colnames(Xsel)) - OutcomeModel$X_nons <- SelectionModel$X_nons <- X[loc_nons, ] - OutcomeModel$X_rand <- SelectionModel$X_rand <- X[loc_rand, ] + + if (control_inference$bias_inf == "union") { + idx <- sort(unique(c(beta_selected[-1], theta_selected[-1]))) # excluding intercepts + psel <- length(idx) + Xsel <- as.matrix(X[, idx + 1, drop = FALSE]) + X <- cbind(1, Xsel) + colnames(X) <- c("(Intercept)", colnames(Xsel)) + OutcomeModel$X_nons <- SelectionModel$X_nons <- X[loc_nons, ] + OutcomeModel$X_rand <- SelectionModel$X_rand <- X[loc_rand, ] + } else if (control_inference$bias_inf == "div") { + X_outcome <- as.matrix(X[, beta_selected[-1] + 1, drop = FALSE]) + Xsel <- X_selection <- as.matrix(X[, theta_selected[-1] + 1, drop = FALSE]) + OutcomeModel$X_nons <- cbind(1, X_outcome[loc_nons, ]) + OutcomeModel$X_rand <- cbind(1, X_outcome[loc_rand, ]) + colnames(OutcomeModel$X_nons) <- colnames(OutcomeModel$X_rand) <- c("(Intercept)", colnames(X_outcome)) + SelectionModel$X_nons <- cbind(1, X_selection[loc_nons, ]) + SelectionModel$X_rand <- cbind(1, X_selection[loc_rand, ]) + X <- cbind(1, Xsel) + colnames(X) <- colnames(SelectionModel$X_nons) <- colnames(SelectionModel$X_rand) <- c("(Intercept)", colnames(Xsel)) + } } model_sel <- internal_selection( @@ -538,13 +551,25 @@ nonprobDR <- function(selection, # Estimating theta, beta parameters using selected variables # beta_selected <- beta_selected[-1] - 1 - idx <- sort(unique(c(beta_selected[-1], theta_selected[-1]))) # excluding intercepts - psel <- length(idx) - Xsel <- as.matrix(X[, idx + 1, drop = FALSE]) - X <- cbind(1, Xsel) - colnames(X) <- c("(Intercept)", colnames(Xsel)) - X_nons <- OutcomeModel$X_nons <- SelectionModel$X_nons <- X[loc_nons, ] - SelectionModel$pop_totals <- c(SelectionModel$pop_totals[1], SelectionModel$pop_totals[idx + 1]) + if (control_inference$bias_inf == "union") { + idx <- sort(unique(c(beta_selected[-1], theta_selected[-1]))) # excluding intercepts + psel <- length(idx) + Xsel <- as.matrix(X[, idx + 1, drop = FALSE]) + X <- cbind(1, Xsel) + colnames(X) <- c("(Intercept)", colnames(Xsel)) + OutcomeModel$X_nons <- SelectionModel$X_nons <- X[loc_nons, ] + SelectionModel$pop_totals <- c(SelectionModel$pop_totals[1], SelectionModel$pop_totals[idx + 1]) + } else if (control_inference$bias_inf == "div") { + X_outcome <- as.matrix(X[, beta_selected[-1] + 1, drop = FALSE]) + Xsel <- X_selection <- as.matrix(X[, theta_selected[-1] + 1, drop = FALSE]) + OutcomeModel$X_nons <- cbind(1, X_outcome[loc_nons, ]) + OutcomeModel$X_rand <- cbind(1, X_outcome[loc_rand, ]) + colnames(OutcomeModel$X_nons) <- colnames(OutcomeModel$X_rand) <- c("(Intercept)", colnames(X_outcome)) + SelectionModel$pop_totals <- c(SelectionModel$pop_totals[1], SelectionModel$pop_totals[theta_selected[-1] + 1]) + SelectionModel$X_nons <- cbind(1, X_selection[loc_nons, ]) + X <- cbind(1, Xsel) + colnames(X) <- colnames(SelectionModel$X_nons) <- c("(Intercept)", colnames(Xsel)) + } } prob_pop_totals <- SelectionModel$pop_totals @@ -644,9 +669,9 @@ nonprobDR <- function(selection, weights = weights, family_outcome = family_outcome, start_outcome = start_outcome, - X_nons = X_nons, - y_nons = y_nons, - X_rand = X_rand, + X_nons = OutcomeModel$X_nons, + y_nons = OutcomeModel$y_nons, + X_rand = OutcomeModel$X_rand, control = control_outcome, n_nons = n_nons, n_rand = n_rand, @@ -856,6 +881,7 @@ nonprobDR <- function(selection, list( X = if (isTRUE(x)) X else NULL, y = if (isTRUE(y)) ys else NULL, + R = R, prob = prop_scores, weights = as.vector(weights_nons), control = list( diff --git a/R/nonprobIPW.R b/R/nonprobIPW.R index 94b831b..eea3b8c 100644 --- a/R/nonprobIPW.R +++ b/R/nonprobIPW.R @@ -577,6 +577,7 @@ nonprobIPW <- function(selection, list( X = if (isTRUE(x)) X else NULL, y = if (isTRUE(y)) ys else NULL, + R = R, prob = prop_scores, weights = as.vector(weights_nons), control = list( diff --git a/R/nonprobMI.R b/R/nonprobMI.R index 5eeba61..c3128b7 100644 --- a/R/nonprobMI.R +++ b/R/nonprobMI.R @@ -473,6 +473,7 @@ nonprobMI <- function(outcome, list( X = if (isTRUE(x)) X else NULL, y = if (isTRUE(y)) ys else NULL, + R = R, control = list( control_outcome = control_outcome, control_inference = control_inference diff --git a/R/prints.R b/R/prints.R index 02d132d..8a9e310 100644 --- a/R/prints.R +++ b/R/prints.R @@ -116,7 +116,7 @@ print.summary_nonprobsvy <- function(x, cat("-------------------------\n\n") cat("Covariate balance:\n") - print(x$est_totals - x$totals) + print(x$est_totals - x$totals[which(!is.na(x$est_totals))]) cat("-------------------------\n\n") diff --git a/man/nonprob.Rd b/man/nonprob.Rd index 87972e1..b9cb25c 100644 --- a/man/nonprob.Rd +++ b/man/nonprob.Rd @@ -91,6 +91,7 @@ with type \code{list} containing:\cr \itemize{ \item{\code{X} -- model matrix containing data from probability and non-probability samples if specified at a function call.} \item{\code{y}} -- list of vector of outcome variables if specified at a function call. +\item{\code{R}} -- vector indicating the probablistic (0) or non-probablistic (1) units in the matrix X. \item{\code{prob} -- vector of estimated propensity scores for non-probability sample.} \item{\code{weights} -- vector of estimated weights for non-probability sample.} \item{\code{control} -- list of control functions.} @@ -154,8 +155,8 @@ The package implements state-of-the-art approaches recently proposed in the lite Yang et al. (2020), Wu (2022) and use the \href{https://CRAN.R-project.org/package=survey}{Lumley 2004} \code{survey} package for inference. It provides propensity score weighting (e.g. with calibration constraints), mass imputation (e.g. nearest neighbor) and -doubly robust estimators that take into account minimisation of the asymptotic bias of the population mean estimators, -variable selection or overlap between probability and non-probability samples. +doubly robust estimators that take into account minimisation of the asymptotic bias of the population mean estimators or +variable selection. The package uses \code{survey} package functionality when a probability sample is available. } \details{ From 0c3ed3b1e099975825d58a52a07fc00faf991060 Mon Sep 17 00:00:00 2001 From: LukaszChrostowski Date: Sun, 15 Sep 2024 14:33:35 +0200 Subject: [PATCH 16/24] add methods to estimate means/totals/medians/quantiles in subgroups --- NAMESPACE | 2 + R/nonprobDR.R | 1 + R/nonprobIPW.R | 2 + R/nonprobMI.R | 4 ++ R/simple_methods.R | 106 +++++++++++++++++++++++++++++++++++++ man/mean.nonprobsvy.Rd | 23 ++++++++ man/median.nonprobsvy.Rd | 23 ++++++++ man/quantile.nonprobsvy.Rd | 27 ++++++++++ man/total.nonprobsvy.Rd | 23 ++++++++ 9 files changed, 211 insertions(+) create mode 100644 man/mean.nonprobsvy.Rd create mode 100644 man/median.nonprobsvy.Rd create mode 100644 man/quantile.nonprobsvy.Rd create mode 100644 man/total.nonprobsvy.Rd diff --git a/NAMESPACE b/NAMESPACE index 6c7a33e..427d3a3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -34,6 +34,8 @@ importFrom(Rcpp,sourceCpp) importFrom(doParallel,registerDoParallel) importFrom(foreach,"%dopar%") importFrom(foreach,foreach) +importFrom(formula.tools,lhs) +importFrom(formula.tools,rhs) importFrom(maxLik,maxLik) importFrom(ncvreg,cv.ncvreg) importFrom(nleqslv,nleqslv) diff --git a/R/nonprobDR.R b/R/nonprobDR.R index c547dd4..61d9910 100644 --- a/R/nonprobDR.R +++ b/R/nonprobDR.R @@ -879,6 +879,7 @@ nonprobDR <- function(selection, structure( list( + data = data, X = if (isTRUE(x)) X else NULL, y = if (isTRUE(y)) ys else NULL, R = R, diff --git a/R/nonprobIPW.R b/R/nonprobIPW.R index eea3b8c..a40aed0 100644 --- a/R/nonprobIPW.R +++ b/R/nonprobIPW.R @@ -575,6 +575,7 @@ nonprobIPW <- function(selection, structure( list( + data = data, X = if (isTRUE(x)) X else NULL, y = if (isTRUE(y)) ys else NULL, R = R, @@ -591,6 +592,7 @@ nonprobIPW <- function(selection, prob_size = n_rand, pop_size = pop_size, pop_totals = prob_pop_totals, + outcome = NULL, selection = SelectionList, boot_sample = boot_sample ), diff --git a/R/nonprobMI.R b/R/nonprobMI.R index c3128b7..17c12ed 100644 --- a/R/nonprobMI.R +++ b/R/nonprobMI.R @@ -471,9 +471,12 @@ nonprobMI <- function(outcome, structure( list( + data = data, X = if (isTRUE(x)) X else NULL, y = if (isTRUE(y)) ys else NULL, R = R, + prob = NULL, + weights = NULL, control = list( control_outcome = control_outcome, control_inference = control_inference @@ -486,6 +489,7 @@ nonprobMI <- function(outcome, pop_size = pop_size, pop_totals = prob_pop_totals, outcome = OutcomeList, + selection = NULL, boot_sample = boot_sample ), class = c("nonprobsvy", "nonprobsvy_mi") diff --git a/R/simple_methods.R b/R/simple_methods.R index 983fb1d..c502f42 100644 --- a/R/simple_methods.R +++ b/R/simple_methods.R @@ -321,3 +321,109 @@ deviance.nonprobsvy <- function(object, if (class(object)[2] == "nonprobsvy_dr") res <- c("selection" = res_sel, "outcome" = res_out) res } +#' @title Total values of covariates in subgroups +#' +#' @param x - formula +#' @param nonprob - object of nonprobsvy class +#' +#' @method total nonprobsvy +#' @return A vector of estimated totals of the given covariates in subgroups +#' @importFrom formula.tools lhs +#' @importFrom formula.tools rhs +total.nonprobsvy <- function(x, nonprob) { + # Rozbijanie formuły + variables <- lhs.vars(x) + groups <- rhs.vars(x) + + if (is.null(variables)) { + data <- nonprob$data[which(nonprob$R == 1), groups, drop = FALSE] + total <- tapply(nonprob$weights[which(nonprob$R == 1)], data, sum) + } else { + data <- nonprob$data[which(nonprob$R == 1), c(variables, groups)] + weights <- nonprob$weights[which(nonprob$R == 1)] + + total <- aggregate(data[, variables] * weights, by = data[, groups, drop = FALSE], sum) + } + total +} +#' @title Mean values of covariates in subgroups +#' +#' @param x - formula +#' @param nonprob - object of nonprobsvy class +#' +#' @method mean nonprobsvy +#' @return A vector of estimated means of the given covariates in subgroups +#' @importFrom formula.tools lhs +#' @importFrom formula.tools rhs +mean.nonprobsvy <- function(x, nonprob) { + variables <- lhs.vars(x) + groups <- rhs.vars(x) + + if (is.null(variables)) { + data <- nonprob$data[which(nonprob$R == 1), groups, drop = FALSE] + mean_value <- tapply(nonprob$weights[which(nonprob$R == 1)], data, mean) # to consider + } else { + data <- nonprob$data[which(nonprob$R == 1), c(variables, groups)] + weights <- nonprob$weights[which(nonprob$R == 1)] + + mean_value <- aggregate(data[, variables], by = data[, groups, drop = FALSE], + FUN = function(y, w) weighted.mean(y, w = w[1:length(y)]), + w = weights) + } + mean_value +} +#' @title Median values of covariates in subgroups +#' +#' @param x - formula +#' @param nonprob - object of nonprobsvy class +#' +#' @method median nonprobsvy +#' @return A vector of estimated medians of the given covariates in subgroups +#' @importFrom formula.tools lhs +#' @importFrom formula.tools rhs +median.nonprobsvy <- function(x, nonprob) { + variables <- lhs.vars(x) + groups <- rhs.vars(x) + + if (is.null(variables)) { + data <- nonprob$data[which(nonprob$R == 1), groups, drop = FALSE] + median_value <- tapply(nonprob$weights[which(nonprob$R == 1)], data, median) + } else { + + data <- nonprob$data[which(nonprob$R == 1), c(variables, groups)] + weights <- nonprob$weights[which(nonprob$R == 1)] + + median_value <- aggregate(data[, variables], by = data[, groups, drop = FALSE], + FUN = function(y, w) weighted.median(y, w = w[1:length(y)]), + w = weights) + } + median_value +} +#' @title Quantile values of covariates in subgroups +#' +#' @param x - formula +#' @param nonprob - object of nonprobsvy class +#' @param probs - probability +#' +#' @method quantile nonprobsvy +#' @return A vector of estimated quantiles if the given covariates in subgroups +#' @importFrom formula.tools lhs +#' @importFrom formula.tools rhs +quantile.nonprobsvy <- function(x, nonprob, probs = 0.5) { + variables <- lhs.vars(x) + groups <- rhs.vars(x) + + if (is.null(variables)) { + data <- nonprob$data[which(nonprob$R == 1), groups, drop = FALSE] + percentile_value <- tapply(nonprob$weights[which(nonprob$R == 1)], data, function(y) quantile(y, probs = probs)) + } else { + + data <- nonprob$data[which(nonprob$R == 1), c(variables, groups)] + weights <- nonprob$weights[which(nonprob$R == 1)] + + percentile_value <- aggregate(data[, variables], by = data[, groups, drop = FALSE], + FUN = function(y, w) weighted.quantile(y, w = w[1:length(y)], probs = probs), + w = weights) + } + percentile_value +} diff --git a/man/mean.nonprobsvy.Rd b/man/mean.nonprobsvy.Rd new file mode 100644 index 0000000..4248b7f --- /dev/null +++ b/man/mean.nonprobsvy.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/simple_methods.R +\name{mean.nonprobsvy} +\alias{mean.nonprobsvy} +\title{Mean values of covariates in subgroups} +\usage{ +\method{mean}{nonprobsvy}(x, nonprob) +} +\arguments{ +\item{x}{\itemize{ +\item formula +}} + +\item{nonprob}{\itemize{ +\item object of nonprobsvy class +}} +} +\value{ +A vector of estimated means of the given covariates in subgroups +} +\description{ +Mean values of covariates in subgroups +} diff --git a/man/median.nonprobsvy.Rd b/man/median.nonprobsvy.Rd new file mode 100644 index 0000000..653ef0b --- /dev/null +++ b/man/median.nonprobsvy.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/simple_methods.R +\name{median.nonprobsvy} +\alias{median.nonprobsvy} +\title{Median values of covariates in subgroups} +\usage{ +\method{median}{nonprobsvy}(x, nonprob) +} +\arguments{ +\item{x}{\itemize{ +\item formula +}} + +\item{nonprob}{\itemize{ +\item object of nonprobsvy class +}} +} +\value{ +A vector of estimated medians of the given covariates in subgroups +} +\description{ +Median values of covariates in subgroups +} diff --git a/man/quantile.nonprobsvy.Rd b/man/quantile.nonprobsvy.Rd new file mode 100644 index 0000000..5f23775 --- /dev/null +++ b/man/quantile.nonprobsvy.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/simple_methods.R +\name{quantile.nonprobsvy} +\alias{quantile.nonprobsvy} +\title{Quantile values of covariates in subgroups} +\usage{ +\method{quantile}{nonprobsvy}(x, nonprob, probs = 0.5) +} +\arguments{ +\item{x}{\itemize{ +\item formula +}} + +\item{nonprob}{\itemize{ +\item object of nonprobsvy class +}} + +\item{probs}{\itemize{ +\item probability +}} +} +\value{ +A vector of estimated quantiles if the given covariates in subgroups +} +\description{ +Quantile values of covariates in subgroups +} diff --git a/man/total.nonprobsvy.Rd b/man/total.nonprobsvy.Rd new file mode 100644 index 0000000..d509a22 --- /dev/null +++ b/man/total.nonprobsvy.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/simple_methods.R +\name{total.nonprobsvy} +\alias{total.nonprobsvy} +\title{Total values of covariates in subgroups} +\usage{ +\method{total}{nonprobsvy}(x, nonprob) +} +\arguments{ +\item{x}{\itemize{ +\item formula +}} + +\item{nonprob}{\itemize{ +\item object of nonprobsvy class +}} +} +\value{ +A vector of estimated totals of the given covariates in subgroups +} +\description{ +Total values of covariates in subgroups +} From 3a13951deaccb7d50c71864cd8c40e549195da55 Mon Sep 17 00:00:00 2001 From: LukaszChrostowski Date: Sat, 28 Sep 2024 13:11:31 +0200 Subject: [PATCH 17/24] update NEWS --- NEWS.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/NEWS.md b/NEWS.md index fd5d398..fbb5c38 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,10 +3,13 @@ - bugfixes - bug Fix occuring when estimation was based on auxiliary variable, which led to compression of the data from the frame to the vector. - bug Fix related to not passing `maxit` argument from `controlSel` function to internally used `nleqslv` function + - bug Fix related to storing `vector` in `model_frame` when predicting `y_hat` in mass imputation `glm` model when X is based in one auxiliary variable only - fix provided converting it to `data.frame` object. - features - add information to `summary` about quality of estimation basing on difference between estimated and known total values of auxiliary variables - add estimation of exact standard error for k-nearest neighbor estimator. - add breaking change to `controlOut` function by switching values for `predictive_match` argument. From now on, the `predictive_match = 1` means $\hat{y}-\hat{y}$ in predictive mean matching imputation and `predictive_match = 2` corresponds to $\hat{y}-y$ matching. + - add methods for estimation of means and totals in subsets/groups with its variances. + - implement `div` option when variable selection (more in documentation) for doubly robust estimation. ## nonprobsvy 0.1.0 From c3a4d53b39ebe851ac1857206d2b4ef6807beba5 Mon Sep 17 00:00:00 2001 From: LukaszChrostowski Date: Fri, 4 Oct 2024 18:22:42 +0200 Subject: [PATCH 18/24] add more unit tests small fixes regarding estimation with pop totals --- DESCRIPTION | 3 +- NAMESPACE | 5 +- R/internals.R | 31 ++- R/nonprobDR.R | 9 +- R/nonprobMI.R | 3 +- R/simple_methods.R | 127 +++++------ inst/tinytest/test-2-ipw-totals.R | 16 +- inst/tinytest/test-3-mi-totals.R | 238 ++++++++++++++++++++ inst/tinytest/test-4-dr-totals.R | 269 +++++++++++++++++++++++ inst/tinytest/test_nonprobsvy.R | 60 +++++- inst/tinytest/test_nonprobsvy_main.R | 312 +++++++++++++++++++++++++++ man/median.nonprobsvy.Rd | 23 -- man/quantile.nonprobsvy.Rd | 27 --- 13 files changed, 981 insertions(+), 142 deletions(-) create mode 100644 inst/tinytest/test-3-mi-totals.R create mode 100644 inst/tinytest/test-4-dr-totals.R delete mode 100644 man/median.nonprobsvy.Rd delete mode 100644 man/quantile.nonprobsvy.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 34b80db..080ce4c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -45,7 +45,8 @@ Imports: nleqslv, doParallel, foreach, - parallel + parallel, + formula.tools LinkingTo: Rcpp, RcppArmadillo diff --git a/NAMESPACE b/NAMESPACE index 427d3a3..2ec47a4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -34,8 +34,8 @@ importFrom(Rcpp,sourceCpp) importFrom(doParallel,registerDoParallel) importFrom(foreach,"%dopar%") importFrom(foreach,foreach) -importFrom(formula.tools,lhs) -importFrom(formula.tools,rhs) +importFrom(formula.tools,lhs.vars) +importFrom(formula.tools,rhs.vars) importFrom(maxLik,maxLik) importFrom(ncvreg,cv.ncvreg) importFrom(nleqslv,nleqslv) @@ -43,6 +43,7 @@ importFrom(parallel,makeCluster) importFrom(parallel,stopCluster) importFrom(stats,AIC) importFrom(stats,BIC) +importFrom(stats,aggregate) importFrom(stats,as.formula) importFrom(stats,binomial) importFrom(stats,coef) diff --git a/R/internals.R b/R/internals.R index ceeb111..714dc4d 100644 --- a/R/internals.R +++ b/R/internals.R @@ -151,20 +151,29 @@ get_method <- function(method) { } ff <- function(formula) { - fff <- as.character(formula) - f <- strsplit(fff[2], "\\s*\\+\\s*")[[1]] - outcome_formulas <- list() - if (any(duplicated(f))) { - warning("No unique names of the outcome variables in formula. The error has been corrected") - f <- unique(f) - } - l <- length(f) - for (i in 1:l) { - outcome_formulas[[i]] <- as.formula(paste(f[i], fff[3], sep = " ~ ")) + formula_string <- paste(deparse(formula), collapse = " ") + formula_parts <- strsplit(formula_string, "~")[[1]] + if (length(formula_parts) != 2) { + stop("The formula must contain exactly one '~' operator.") } + lhs <- trimws(formula_parts[1]) + rhs <- trimws(formula_parts[2]) + + dependent_vars <- strsplit(lhs, "\\s*\\+\\s*")[[1]] + independent_vars <- strsplit(rhs, "\\s*\\+\\s*")[[1]] + + if (any(duplicated(dependent_vars))) { + warning("Duplicate dependent variable names detected. They have been made unique.") + dependent_vars <- unique(dependent_vars) + } + outcome_formulas <- vector("list", length(dependent_vars)) + l <- length(dependent_vars) + for (i in seq_along(dependent_vars)) { + outcome_formulas[[i]] <- reformulate(termlabels = independent_vars, response = dependent_vars[i]) + } list( - f = f, + f = dependent_vars, outcomes = outcome_formulas, l = l ) diff --git a/R/nonprobDR.R b/R/nonprobDR.R index 61d9910..2686013 100644 --- a/R/nonprobDR.R +++ b/R/nonprobDR.R @@ -521,6 +521,9 @@ nonprobDR <- function(selection, X_rand <- OutcomeModel$X_rand nons_names <- OutcomeModel$nons_names y_nons <- OutcomeModel$y_nons + n_nons <- nrow(X_nons) + R <- rep(1, n_nons) + loc_nons <- which(R == 1) if (var_selection == TRUE) { nlambda <- control_outcome$nlambda @@ -559,17 +562,19 @@ nonprobDR <- function(selection, colnames(X) <- c("(Intercept)", colnames(Xsel)) OutcomeModel$X_nons <- SelectionModel$X_nons <- X[loc_nons, ] SelectionModel$pop_totals <- c(SelectionModel$pop_totals[1], SelectionModel$pop_totals[idx + 1]) + pop_totals <- c(pop_totals[1], pop_totals[idx+1]) } else if (control_inference$bias_inf == "div") { X_outcome <- as.matrix(X[, beta_selected[-1] + 1, drop = FALSE]) Xsel <- X_selection <- as.matrix(X[, theta_selected[-1] + 1, drop = FALSE]) OutcomeModel$X_nons <- cbind(1, X_outcome[loc_nons, ]) - OutcomeModel$X_rand <- cbind(1, X_outcome[loc_rand, ]) - colnames(OutcomeModel$X_nons) <- colnames(OutcomeModel$X_rand) <- c("(Intercept)", colnames(X_outcome)) + colnames(OutcomeModel$X_nons) <- c("(Intercept)", colnames(X_outcome)) SelectionModel$pop_totals <- c(SelectionModel$pop_totals[1], SelectionModel$pop_totals[theta_selected[-1] + 1]) SelectionModel$X_nons <- cbind(1, X_selection[loc_nons, ]) X <- cbind(1, Xsel) colnames(X) <- colnames(SelectionModel$X_nons) <- c("(Intercept)", colnames(Xsel)) + pop_totals <- c(pop_totals[1], pop_totals[theta_selected[-1] + 1]) } + SelectionModel$total_names <- names(SelectionModel$pop_totals) } prob_pop_totals <- SelectionModel$pop_totals diff --git a/R/nonprobMI.R b/R/nonprobMI.R index 17c12ed..d1c2258 100644 --- a/R/nonprobMI.R +++ b/R/nonprobMI.R @@ -268,6 +268,7 @@ nonprobMI <- function(outcome, n_rand <- 0 weights_rand <- NULL n_nons <- nrow(X_nons) + R <- rep(1, n_nons) ############## WORKING VERSION if (var_selection == TRUE) { @@ -316,7 +317,7 @@ nonprobMI <- function(outcome, control = control_outcome, n_nons = n_nons, n_rand = n_rand, - model_frame = OutcomeModel$model_frame_rand, + model_frame = Model$model_frame_rand, vars_selection = control_inference$vars_selection, pop_totals = pop_totals ) diff --git a/R/simple_methods.R b/R/simple_methods.R index c502f42..1a937fb 100644 --- a/R/simple_methods.R +++ b/R/simple_methods.R @@ -328,10 +328,10 @@ deviance.nonprobsvy <- function(object, #' #' @method total nonprobsvy #' @return A vector of estimated totals of the given covariates in subgroups -#' @importFrom formula.tools lhs -#' @importFrom formula.tools rhs +#' @importFrom formula.tools lhs.vars +#' @importFrom formula.tools rhs.vars +#' @importFrom stats aggregate total.nonprobsvy <- function(x, nonprob) { - # Rozbijanie formuły variables <- lhs.vars(x) groups <- rhs.vars(x) @@ -353,8 +353,9 @@ total.nonprobsvy <- function(x, nonprob) { #' #' @method mean nonprobsvy #' @return A vector of estimated means of the given covariates in subgroups -#' @importFrom formula.tools lhs -#' @importFrom formula.tools rhs +#' @importFrom formula.tools lhs.vars +#' @importFrom formula.tools rhs.vars +#' @importFrom stats aggregate mean.nonprobsvy <- function(x, nonprob) { variables <- lhs.vars(x) groups <- rhs.vars(x) @@ -372,58 +373,64 @@ mean.nonprobsvy <- function(x, nonprob) { } mean_value } -#' @title Median values of covariates in subgroups -#' -#' @param x - formula -#' @param nonprob - object of nonprobsvy class -#' -#' @method median nonprobsvy -#' @return A vector of estimated medians of the given covariates in subgroups -#' @importFrom formula.tools lhs -#' @importFrom formula.tools rhs -median.nonprobsvy <- function(x, nonprob) { - variables <- lhs.vars(x) - groups <- rhs.vars(x) - - if (is.null(variables)) { - data <- nonprob$data[which(nonprob$R == 1), groups, drop = FALSE] - median_value <- tapply(nonprob$weights[which(nonprob$R == 1)], data, median) - } else { - - data <- nonprob$data[which(nonprob$R == 1), c(variables, groups)] - weights <- nonprob$weights[which(nonprob$R == 1)] - - median_value <- aggregate(data[, variables], by = data[, groups, drop = FALSE], - FUN = function(y, w) weighted.median(y, w = w[1:length(y)]), - w = weights) - } - median_value -} -#' @title Quantile values of covariates in subgroups -#' -#' @param x - formula -#' @param nonprob - object of nonprobsvy class -#' @param probs - probability -#' -#' @method quantile nonprobsvy -#' @return A vector of estimated quantiles if the given covariates in subgroups -#' @importFrom formula.tools lhs -#' @importFrom formula.tools rhs -quantile.nonprobsvy <- function(x, nonprob, probs = 0.5) { - variables <- lhs.vars(x) - groups <- rhs.vars(x) - - if (is.null(variables)) { - data <- nonprob$data[which(nonprob$R == 1), groups, drop = FALSE] - percentile_value <- tapply(nonprob$weights[which(nonprob$R == 1)], data, function(y) quantile(y, probs = probs)) - } else { - - data <- nonprob$data[which(nonprob$R == 1), c(variables, groups)] - weights <- nonprob$weights[which(nonprob$R == 1)] - - percentile_value <- aggregate(data[, variables], by = data[, groups, drop = FALSE], - FUN = function(y, w) weighted.quantile(y, w = w[1:length(y)], probs = probs), - w = weights) - } - percentile_value -} +# @title Median values of covariates in subgroups +# +# @param x - formula +# @param nonprob - object of nonprobsvy class +# +# @method median nonprobsvy +# @return A vector of estimated medians of the given covariates in subgroups +# @importFrom formula.tools lhs.vars +# @importFrom formula.tools rhs.vars +# @importFrom spatstat weighted.median +# @importFrom stats aggregate +# @importFrom stats median +# median.nonprobsvy <- function(x, nonprob) { +# variables <- lhs.vars(x) +# groups <- rhs.vars(x) +# +# if (is.null(variables)) { +# data <- nonprob$data[which(nonprob$R == 1), groups, drop = FALSE] +# median_value <- tapply(nonprob$weights[which(nonprob$R == 1)], data, median) +# } else { +# +# data <- nonprob$data[which(nonprob$R == 1), c(variables, groups)] +# weights <- nonprob$weights[which(nonprob$R == 1)] +# +# median_value <- aggregate(data[, variables], by = data[, groups, drop = FALSE], +# FUN = function(y, w) weighted.median(y, w = w[1:length(y)]), +# w = weights) +# } +# median_value +# } +# @title Quantile values of covariates in subgroups +# +# @param x - formula +# @param nonprob - object of nonprobsvy class +# @param probs - probability +# +# @method quantile nonprobsvy +# @return A vector of estimated quantiles if the given covariates in subgroups +# @importFrom formula.tools lhs.vars +# @importFrom formula.tools rhs.vars +# @importFrom spatstat weighted.quantile +# @importFrom stats aggregate +# @importFrom stats quantile +# quantile.nonprobsvy <- function(x, nonprob, probs = 0.5) { +# variables <- lhs.vars(x) +# groups <- rhs.vars(x) +# +# if (is.null(variables)) { +# data <- nonprob$data[which(nonprob$R == 1), groups, drop = FALSE] +# percentile_value <- tapply(nonprob$weights[which(nonprob$R == 1)], data, function(y) quantile(y, probs = probs)) +# } else { +# +# data <- nonprob$data[which(nonprob$R == 1), c(variables, groups)] +# weights <- nonprob$weights[which(nonprob$R == 1)] +# +# percentile_value <- aggregate(data[, variables], by = data[, groups, drop = FALSE], +# FUN = function(y, w) weighted.quantile(y, w = w[1:length(y)], probs = probs), +# w = weights) +# } +# percentile_value +# } diff --git a/inst/tinytest/test-2-ipw-totals.R b/inst/tinytest/test-2-ipw-totals.R index 6cd8df2..932802c 100644 --- a/inst/tinytest/test-2-ipw-totals.R +++ b/inst/tinytest/test-2-ipw-totals.R @@ -1,4 +1,4 @@ -if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { +# if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { library(sampling) library(survey) @@ -505,7 +505,7 @@ if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true") #### variable selection ------------------------------------------------------------------ ##### one target variable ---------------------------------------------------------------- - if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { + # if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { ## y_11 expect_silent( @@ -625,7 +625,7 @@ if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true") # verbose = T) # ) - } + # } # check probit ---------------------------------------------------------------------------- @@ -811,7 +811,7 @@ if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true") row.names = c("Y_11", "Y_12", "Y_21", "Y_22"))) - if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { + # if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { #### variable selection ------------------------------------------------------------------ ##### one target variable ---------------------------------------------------------------- @@ -933,7 +933,7 @@ if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true") # verbose = T) # ) - } + # } ## non-linear case ------------------------------------------------------------------------ #### correctly specified variables -------------------------------------------------------- @@ -1119,7 +1119,7 @@ if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true") row.names = c("Y_11", "Y_12", "Y_21", "Y_22"))) - if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { + # if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { #### variable selection ------------------------------------------------------------------ ##### one target variable ---------------------------------------------------------------- @@ -1241,7 +1241,7 @@ if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true") # verbose = T) # ) - } + # } # check cloglog ----------------------------------------------------- ## linear case ---------------------------------------------------------------------------- @@ -1752,4 +1752,4 @@ if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true") # # expect_true(y22_corr_scad$confidence_interval$lower_bound < mean(Y_22) & # # y22_corr_scad$confidence_interval$upper_bound > mean(Y_22)) ## conf int # expect_true(NROW(y22_corr_scad$selection$coefficients) == 2) -} +# } diff --git a/inst/tinytest/test-3-mi-totals.R b/inst/tinytest/test-3-mi-totals.R new file mode 100644 index 0000000..2a2dcc7 --- /dev/null +++ b/inst/tinytest/test-3-mi-totals.R @@ -0,0 +1,238 @@ +# Load necessary libraries +library(sampling) +library(survey) + +# Generation of data ---------------------------------------------------------------------- +set.seed(2024) +source("test-1-generate-data.R") # Adjust the path as needed + +# Check Mass Imputation (MI) Estimator ----------------------------------------------------- + +## Linear case ------------------------------------------------------------------------------ + +#### Correctly specified variables --------------------------------------------------------- + +##### One target variable ------------------------------------------------------------------ + +# For Y_11 +expect_silent( + y11_mi_corr_one <- nonprob( + outcome = Y_11 ~ X1 + X2 + X3 + X4, + data = sample_B1, + pop_totals = X_totals[1:5], + method_outcome = "glm", + family_outcome = "gaussian" + ) +) + +expect_equal(y11_mi_corr_one$output$mean, 2.054878, tolerance = 0.0001) # Adjusted expected mean +expect_equal(y11_mi_corr_one$output$SE, 0.058286, tolerance = 0.0001) # Adjusted expected SE +expect_true(y11_mi_corr_one$confidence_interval$lower_bound < mean(Y_11) & + y11_mi_corr_one$confidence_interval$upper_bound > mean(Y_11)) # Confidence interval test + +# For Y_12 +expect_silent( + y12_mi_corr_one <- nonprob( + outcome = Y_12 ~ X1 + X2 + X3 + X4, + data = sample_B1, + pop_totals = X_totals[1:5], + method_outcome = "glm", + family_outcome = "gaussian" + ) +) + +expect_equal(y12_mi_corr_one$output$mean, 7.465879, tolerance = 0.0001) +expect_equal(y12_mi_corr_one$output$SE, 0.2523682, tolerance = 0.0001) +# TODO +# expect_true(y12_mi_corr_one$confidence_interval$lower_bound < mean(Y_12) & +# y12_mi_corr_one$confidence_interval$upper_bound > mean(Y_12)) + +# For Y_21 (Binary outcome) +expect_silent( + y21_mi_corr_one <- nonprob( + outcome = Y_21 ~ X1 + X2 + X3 + X4, + data = sample_B1, + pop_totals = X_totals[1:5], + method_outcome = "glm", + family_outcome = "binomial" + ) +) + +expect_equal(y21_mi_corr_one$output$mean, 0.6706463, tolerance = 0.0001) +expect_equal(y21_mi_corr_one$output$SE, 0.01796104, tolerance = 0.0001) +expect_true(y21_mi_corr_one$confidence_interval$lower_bound < mean(Y_21) & + y21_mi_corr_one$confidence_interval$upper_bound > mean(Y_21)) + +# For Y_22 (Binary outcome) +expect_silent( + y22_mi_corr_one <- nonprob( + outcome = Y_22 ~ X1 + X2 + X3 + X4, + data = sample_B1, + pop_totals = X_totals[1:5], + method_outcome = "glm", + family_outcome = "binomial" + ) +) + +expect_equal(y22_mi_corr_one$output$mean, 0.653369, tolerance = 0.0001) +expect_equal(y22_mi_corr_one$output$SE, 0.01654733, tolerance = 0.0001) +expect_true(y22_mi_corr_one$confidence_interval$lower_bound < mean(Y_22) & + y22_mi_corr_one$confidence_interval$upper_bound > mean(Y_22)) + +##### All target variables ----------------------------------------------------------------- + +#### All X variables ----------------------------------------------------------------------- + +##### One target variable ------------------------------------------------------------------ + +# For Y_11 with all X variables +expect_silent( + y11_mi_corr_all <- nonprob( + outcome = as.formula(paste('Y_11', X_formula)), + data = sample_B1, # Include all X variables + pop_totals = X_totals, + method_outcome = "glm", + family_outcome = "gaussian" + ) +) + +expect_equal(y11_mi_corr_all$output$mean, 2.002616, tolerance = 0.0001) +expect_equal(y11_mi_corr_all$output$SE, 0.03326564, tolerance = 0.0001) +expect_true(y11_mi_corr_all$confidence_interval$lower_bound < mean(Y_11) & + y11_mi_corr_all$confidence_interval$upper_bound > mean(Y_11)) + +# For Y_12 with all X variables +expect_silent( + y12_mi_corr_all <- nonprob( + outcome = as.formula(paste('Y_12', X_formula)), + data = sample_B1, + pop_totals = X_totals, + method_outcome = "glm", + family_outcome = "gaussian" + ) +) + +expect_equal(y12_mi_corr_all$output$mean, 7.425809, tolerance = 0.0001) +expect_equal(y12_mi_corr_all$output$SE, 0.2454465, tolerance = 0.0001) +# TODO +# expect_true(y12_mi_corr_all$confidence_interval$lower_bound < mean(Y_12) & +# y12_mi_corr_all$confidence_interval$upper_bound > mean(Y_12)) + +# For Y_21 with all X variables +expect_silent( + y21_mi_corr_all <- nonprob( + outcome = as.formula(paste('Y_21', X_formula)), + data = sample_B1, + pop_totals = X_totals, + method_outcome = "glm", + family_outcome = "binomial" + ) +) + +expect_equal(y21_mi_corr_all$output$mean, 0.7112867, tolerance = 0.0001) +expect_equal(y21_mi_corr_all$output$SE, 0.01992489, tolerance = 0.0001) +# TODO +# expect_true(y21_mi_corr_all$confidence_interval$lower_bound < mean(Y_21) & +# y21_mi_corr_all$confidence_interval$upper_bound > mean(Y_21)) + +# For Y_22 with all X variables +expect_silent( + y22_mi_corr_all <- nonprob( + outcome = as.formula(paste('Y_22', X_formula)), + data = sample_B1, + pop_totals = X_totals, + method_outcome = "glm", + family_outcome = "binomial" + ) +) + +expect_equal(y22_mi_corr_all$output$mean, 0.7836929, tolerance = 0.0001) +expect_equal(y22_mi_corr_all$output$SE, 0.01921207, tolerance = 0.0001) +# TODO +# expect_true(y22_mi_corr_all$confidence_interval$lower_bound < mean(Y_22) & +# y22_mi_corr_all$confidence_interval$upper_bound > mean(Y_22)) + +##### All target variables ----------------------------------------------------------------- + +#### Variable Selection -------------------------------------------------------------------- + +##### One target variable ------------------------------------------------------------------ + +# For Y_11 with SCAD penalty +expect_silent( + y11_mi_scad <- nonprob( + outcome = Y_11 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10, + data = sample_B1, + pop_totals = X_totals[1:11], + method_outcome = "glm", + family_outcome = "gaussian", + control_outcome = controlOut(penalty = "SCAD", nfolds = 5), + control_inference = controlInf(vars_selection = TRUE) + ) +) + +expect_equal(y11_mi_scad$output$mean, 2.00172, tolerance = 0.0001) +expect_equal(y11_mi_scad$output$SE, 0.03371772, tolerance = 0.0001) +expect_true(y11_mi_scad$confidence_interval$lower_bound < mean(Y_11) & + y11_mi_scad$confidence_interval$upper_bound > mean(Y_11)) +expect_true(NROW(y11_mi_scad$outcome$coefficients) < 11) # Number of selected variables + +# For Y_12 with SCAD penalty +expect_silent( + y12_mi_scad <- nonprob( + outcome = Y_12 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10, + data = sample_B1, + pop_totals = X_totals[1:11], + method_outcome = "glm", + family_outcome = "gaussian", + control_outcome = controlOut(penalty = "SCAD", nfolds = 5), + control_inference = controlInf(vars_selection = TRUE) + ) +) + +expect_equal(y12_mi_scad$output$mean, 7.48752, tolerance = 0.0001) +expect_equal(y12_mi_scad$output$SE, 0.2202497, tolerance = 0.0001) +# TODO +# expect_true(y12_mi_scad$confidence_interval$lower_bound < mean(Y_12) & +# y12_mi_scad$confidence_interval$upper_bound > mean(Y_12)) +expect_true(NROW(y12_mi_scad$outcome$coefficients) < 11) + +# For Y_21 with SCAD penalty +expect_silent( + y21_mi_scad <- nonprob( + outcome = Y_21 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10, + data = sample_B1, + pop_totals = X_totals[1:11], + method_outcome = "glm", + family_outcome = "binomial", + control_outcome = controlOut(penalty = "SCAD", nfolds = 5), + control_inference = controlInf(vars_selection = TRUE) + ) +) + +expect_equal(y21_mi_scad$output$mean, 0.7158588, tolerance = 0.0001) +expect_equal(y21_mi_scad$output$SE, 0.0141177, tolerance = 0.0001) +# TODO +# expect_true(y21_mi_scad$confidence_interval$lower_bound < mean(Y_21) & +# y21_mi_scad$confidence_interval$upper_bound > mean(Y_21)) +expect_true(NROW(y21_mi_scad$outcome$coefficients) < 11) + +# For Y_22 with SCAD penalty +expect_silent( + y22_mi_scad <- nonprob( + outcome = Y_22 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10, + data = sample_B1, + pop_totals = X_totals[1:11], + method_outcome = "glm", + family_outcome = "binomial", + control_outcome = controlOut(penalty = "SCAD", nfolds = 5), + control_inference = controlInf(vars_selection = TRUE) + ) +) + +expect_equal(y22_mi_scad$output$mean, 0.7870636, tolerance = 0.0001) +expect_equal(y22_mi_scad$output$SE, 0.01550724, tolerance = 0.0001) +# TODO +# expect_true(y22_mi_scad$confidence_interval$lower_bound < mean(Y_22) & +# y22_mi_scad$confidence_interval$upper_bound > mean(Y_22)) +expect_true(NROW(y22_mi_scad$outcome$coefficients) < 11) diff --git a/inst/tinytest/test-4-dr-totals.R b/inst/tinytest/test-4-dr-totals.R new file mode 100644 index 0000000..cee8879 --- /dev/null +++ b/inst/tinytest/test-4-dr-totals.R @@ -0,0 +1,269 @@ +# Load necessary libraries +library(sampling) +library(survey) + +# Generation of data ---------------------------------------------------------------------- +set.seed(2024) +source("test-1-generate-data.R") # Adjust the path as needed + +# Check Doubly Robust (DR) Estimator ------------------------------------------------------- + +## Linear case ----------------------------------------------------------------------------- + +#### Correctly specified variables --------------------------------------------------------- + +##### One target variable ------------------------------------------------------------------ + +# For Y_11 +expect_silent( + y11_dr_corr_one <- nonprob( + selection = ~ X1 + X2 + X3 + X4, + outcome = Y_11 ~ X1 + X2 + X3 + X4, + data = sample_B1, + pop_totals = X_totals[1:5], + method_selection = "logit", + method_outcome = "glm", + family_outcome = "gaussian" + ) +) + +expect_equal(y11_dr_corr_one$output$mean, 2.17757, tolerance = 0.0001) # Adjusted expected mean +expect_equal(y11_dr_corr_one$output$SE, 0.09009875, tolerance = 0.0001) # Adjusted expected SE +# TODO +# expect_true(y11_dr_corr_one$confidence_interval$lower_bound < mean(Y_11) & +# y11_dr_corr_one$confidence_interval$upper_bound > mean(Y_11)) # Confidence interval test + +# For Y_12 +expect_silent( + y12_dr_corr_one <- nonprob( + selection = ~ X1 + X2 + X3 + X4, + outcome = Y_12 ~ X1 + X2 + X3 + X4, + data = sample_B1, + pop_totals = X_totals[1:5], + method_selection = "logit", + method_outcome = "glm", + family_outcome = "gaussian" + ) +) + +expect_equal(y12_dr_corr_one$output$mean, 7.168049, tolerance = 0.0001) +expect_equal(y12_dr_corr_one$output$SE, 0.5737179, tolerance = 0.0001) +expect_true(y12_dr_corr_one$confidence_interval$lower_bound < mean(Y_12) & + y12_dr_corr_one$confidence_interval$upper_bound > mean(Y_12)) + +# For Y_21 (Binary outcome) +expect_silent( + y21_dr_corr_one <- nonprob( + selection = ~ X1 + X2 + X3 + X4, + outcome = Y_21 ~ X1 + X2 + X3 + X4, + data = sample_B1, + pop_totals = X_totals[1:5], + method_selection = "logit", + method_outcome = "glm", + family_outcome = "binomial" + ) +) + +expect_equal(y21_dr_corr_one$output$mean, 0.6969967, tolerance = 0.0001) +expect_equal(y21_dr_corr_one$output$SE, 0.02966577, tolerance = 0.0001) +expect_true(y21_dr_corr_one$confidence_interval$lower_bound < mean(Y_21) & + y21_dr_corr_one$confidence_interval$upper_bound > mean(Y_21)) + +# For Y_22 (Binary outcome) +expect_silent( + y22_dr_corr_one <- nonprob( + selection = ~ X1 + X2 + X3 + X4, + outcome = Y_22 ~ X1 + X2 + X3 + X4, + data = sample_B1, + pop_totals = X_totals[1:5], + method_selection = "logit", + method_outcome = "glm", + family_outcome = "binomial" + ) +) + +expect_equal(y22_dr_corr_one$output$mean, 0.62005, tolerance = 0.0001) +expect_equal(y22_dr_corr_one$output$SE, 0.05059186, tolerance = 0.0001) +expect_true(y22_dr_corr_one$confidence_interval$lower_bound < mean(Y_22) & + y22_dr_corr_one$confidence_interval$upper_bound > mean(Y_22)) + +##### All target variables ----------------------------------------------------------------- + +#### All X variables ----------------------------------------------------------------------- + +##### One target variable ------------------------------------------------------------------ + +# For Y_11 with all X variables +expect_silent( + y11_dr_corr_all <- nonprob( + selection = X_formula, + outcome = as.formula(paste('Y_11', X_formula)), + data = sample_B1, + pop_totals = X_totals, + method_selection = "logit", + method_outcome = "glm", + family_outcome = "gaussian" + ) +) + +expect_equal(y11_dr_corr_all$output$mean, 2.005841, tolerance = 0.0001) +expect_equal(y11_dr_corr_all$output$SE, 0.04629972, tolerance = 0.0001) +expect_true(y11_dr_corr_all$confidence_interval$lower_bound < mean(Y_11) & + y11_dr_corr_all$confidence_interval$upper_bound > mean(Y_11)) + +# For Y_12 with all X variables +expect_silent( + y12_dr_corr_all <- nonprob( + selection = X_formula, + outcome = as.formula(paste('Y_12', X_formula)), + data = sample_B1, + pop_totals = X_totals, + method_selection = "logit", + method_outcome = "glm", + family_outcome = "gaussian" + ) +) + +expect_equal(y12_dr_corr_all$output$mean, 6.681308, tolerance = 0.0001) +expect_equal(y12_dr_corr_all$output$SE, 0.3886612, tolerance = 0.0001) +expect_true(y12_dr_corr_all$confidence_interval$lower_bound < mean(Y_12) & + y12_dr_corr_all$confidence_interval$upper_bound > mean(Y_12)) + +# For Y_21 with all X variables +expect_silent( + y21_dr_corr_all <- nonprob( + selection = X_formula, + outcome = as.formula(paste('Y_21', X_formula)), + data = sample_B1, + pop_totals = X_totals, + method_selection = "logit", + method_outcome = "glm", + family_outcome = "binomial" + ) +) + +expect_equal(y21_dr_corr_all$output$mean, 0.7233015, tolerance = 0.0001) +expect_equal(y21_dr_corr_all$output$SE, 0.0192478, tolerance = 0.0001) +# TODO +# expect_true(y21_dr_corr_all$confidence_interval$lower_bound < mean(Y_21) & +# y21_dr_corr_all$confidence_interval$upper_bound > mean(Y_21)) + +# For Y_22 with all X variables +expect_silent( + y22_dr_corr_all <- nonprob( + selection = X_formula, + outcome = as.formula(paste('Y_22', X_formula)), + data = sample_B1, + pop_totals = X_totals, + method_selection = "logit", + method_outcome = "glm", + family_outcome = "binomial" + ) +) + +expect_equal(y22_dr_corr_all$output$mean, 0.7668045, tolerance = 0.0001) +expect_equal(y22_dr_corr_all$output$SE, 0.02748494, tolerance = 0.0001) +# TODO +# expect_true(y22_dr_corr_all$confidence_interval$lower_bound < mean(Y_22) & +# y22_dr_corr_all$confidence_interval$upper_bound > mean(Y_22)) + +##### All target variables ----------------------------------------------------------------- + + +#### Variable Selection -------------------------------------------------------------------- + +##### One target variable ------------------------------------------------------------------ + +# For Y_11 with SCAD penalty +expect_silent( + y11_dr_scad <- nonprob( + selection = ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10, + outcome = Y_11 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10, + data = sample_B1, + pop_totals = X_totals[1:11], + method_selection = "logit", + method_outcome = "glm", + family_outcome = "gaussian", + control_selection = controlSel(penalty = "SCAD", nfolds = 5), + control_outcome = controlOut(penalty = "SCAD", nfolds = 5), + control_inference = controlInf(vars_selection = TRUE) + ) +) + +expect_equal(y11_dr_scad$output$mean, 1.975596, tolerance = 0.0001) +expect_equal(y11_dr_scad$output$SE, 0.03277463, tolerance = 0.0001) +expect_true(y11_dr_scad$confidence_interval$lower_bound < mean(Y_11) & + y11_dr_scad$confidence_interval$upper_bound > mean(Y_11)) +expect_true(NROW(y11_dr_scad$selection$coefficients) < 11) +expect_true(NROW(y11_dr_scad$outcome$coefficients) < 11) + +# For Y_12 with SCAD penalty +expect_silent( + y12_dr_scad <- nonprob( + selection = ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10, + outcome = Y_12 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10, + data = sample_B1, + pop_totals = X_totals[1:11], + method_selection = "logit", + method_outcome = "glm", + family_outcome = "gaussian", + control_selection = controlSel(penalty = "SCAD", nfolds = 5), + control_outcome = controlOut(penalty = "SCAD", nfolds = 5), + control_inference = controlInf(vars_selection = TRUE) + ) +) + +expect_equal(y12_dr_scad$output$mean, 6.730679, tolerance = 0.0001) +expect_equal(y12_dr_scad$output$SE, 0.5256198, tolerance = 0.0001) +expect_true(y12_dr_scad$confidence_interval$lower_bound < mean(Y_12) & + y12_dr_scad$confidence_interval$upper_bound > mean(Y_12)) +expect_true(NROW(y12_dr_scad$selection$coefficients) < 11) +expect_true(NROW(y12_dr_scad$outcome$coefficients) < 11) + +# For Y_21 with SCAD penalty +expect_silent( + y21_dr_scad <- nonprob( + selection = ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10, + outcome = Y_21 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10, + data = sample_B1, + pop_totals = X_totals[1:11], + method_selection = "logit", + method_outcome = "glm", + family_outcome = "binomial", + control_selection = controlSel(penalty = "SCAD", nfolds = 5), + control_outcome = controlOut(penalty = "SCAD", nfolds = 5), + control_inference = controlInf(vars_selection = TRUE) + ) +) + +expect_equal(y21_dr_scad$output$mean, 0.7286048, tolerance = 0.0001) +expect_equal(y21_dr_scad$output$SE, 0.01451463, tolerance = 0.0001) +# TODO +# expect_true(y21_dr_scad$confidence_interval$lower_bound < mean(Y_21) & +# y21_dr_scad$confidence_interval$upper_bound > mean(Y_21)) +expect_true(NROW(y21_dr_scad$selection$coefficients) < 11) +expect_true(NROW(y21_dr_scad$outcome$coefficients) < 11) + +# For Y_22 with SCAD penalty +expect_silent( + y22_dr_scad <- nonprob( + selection = ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10, + outcome = Y_22 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10, + data = sample_B1, + pop_totals = X_totals[1:11], + method_selection = "logit", + method_outcome = "glm", + family_outcome = "binomial", + control_selection = controlSel(penalty = "SCAD", nfolds = 5), + control_outcome = controlOut(penalty = "SCAD", nfolds = 5), + control_inference = controlInf(vars_selection = TRUE) + ) +) + +expect_equal(y22_dr_scad$output$mean, 0.7644814, tolerance = 0.0001) +expect_equal(y22_dr_scad$output$SE, 0.01492712, tolerance = 0.0001) +# TODO +# expect_true(y22_dr_scad$confidence_interval$lower_bound < mean(Y_22) & +# y22_dr_scad$confidence_interval$upper_bound > mean(Y_22)) +expect_true(NROW(y22_dr_scad$selection$coefficients) < 11) +expect_true(NROW(y22_dr_scad$outcome$coefficients) < 11) diff --git a/inst/tinytest/test_nonprobsvy.R b/inst/tinytest/test_nonprobsvy.R index 50d80b2..1f2b85c 100644 --- a/inst/tinytest/test_nonprobsvy.R +++ b/inst/tinytest/test_nonprobsvy.R @@ -252,7 +252,7 @@ expect_true( # These tests are only supposed to be run on developer's machine and # package GitHub page not on CRAN (they take too long) -if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { +# if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { expect_silent( test1a_bootstrap <- nonprob(selection = ~ x, @@ -260,8 +260,7 @@ if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true") data = source_nonprob_p, method_selection = "logit", svydesign = svy_a, - control_inference = controlInf(var_method = "bootstrap", cores = 1), - verbose = TRUE) + control_inference = controlInf(var_method = "bootstrap", cores = 1)) ) @@ -271,8 +270,7 @@ if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true") data = source_nonprob_p, method_selection = "logit", svydesign = svy_a, - control_inference = controlInf(var_method = "bootstrap", cores = 1), - verbose = TRUE) + control_inference = controlInf(var_method = "bootstrap", cores = 1)) ) @@ -280,11 +278,59 @@ if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true") test3a_bootstrap <- nonprob(outcome = y1 ~ x, data = source_nonprob_p, svydesign = svy_a, + control_inference = controlInf(var_method = "bootstrap", cores = 1)) + ) + + expect_silent( + test2a_bootstrap_nn <- nonprob(selection = ~ x, + outcome = y1 ~ x, + data = source_nonprob_p, + method_selection = "logit", + method_outcome = "nn", + svydesign = svy_a, + control_inference = controlInf(var_method = "bootstrap", cores = 1)) + ) + + + expect_silent( + test3a_bootstrap_nn <- nonprob(outcome = y1 ~ x, + data = source_nonprob_p, + method_outcome = "nn", + svydesign = svy_a, + control_inference = controlInf(var_method = "bootstrap", cores = 1)) + ) + + expect_silent( + test2a_bootstrap_pmm <- nonprob(selection = ~ x, + outcome = y1 ~ x, + data = source_nonprob_p, + method_selection = "logit", + method_outcome = "pmm", + svydesign = svy_a, + control_inference = controlInf(var_method = "bootstrap", cores = 1)) + ) + + + expect_silent( + test3a_bootstrap_pmm <- nonprob(outcome = y1 ~ x, + data = source_nonprob_p, + method_outcome = "pmm", + svydesign = svy_a, + control_inference = controlInf(var_method = "bootstrap", cores = 1), + control_outcome = controlOut(predictive_match = 1)) + ) + + expect_silent( + test3a_bootstrap_pmm_2 <- nonprob(outcome = y1 ~ x, + data = source_nonprob_p, + method_outcome = "pmm", + svydesign = svy_a, control_inference = controlInf(var_method = "bootstrap", cores = 1), - verbose = TRUE) + control_outcome = controlOut(predictive_match = 2)) ) + expect_equivalent(test1a$output$mean, test1a_bootstrap$output$mean, tolerance = 0.1) # expect_equivalent(test1a$output$SE, test1a_bootstrap$output$SE, tolerance = 0.1) to check @@ -293,4 +339,4 @@ if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true") expect_equivalent(test3a$output$mean, test3a_bootstrap$output$mean, tolerance = 0.1) expect_equivalent(test3a$output$SE, test3a_bootstrap$output$SE, tolerance = 0.1) -} +# } diff --git a/inst/tinytest/test_nonprobsvy_main.R b/inst/tinytest/test_nonprobsvy_main.R index c326fd7..d4eb890 100644 --- a/inst/tinytest/test_nonprobsvy_main.R +++ b/inst/tinytest/test_nonprobsvy_main.R @@ -62,6 +62,63 @@ expect_true( (test_ipw_2$confidence_interval[1] < 0.621989) & (0.621989 < test_ipw_2$confidence_interval[2]) ) +# SCAD +expect_silent( + test_ipw_1_scad <- nonprob(selection = ~ klasa_pr + sek + zawod_kod2 + woj, + target = ~ jedna_zmiana , + data = cbop_df_long, + svydesign = popyt_svy, + method_selection = "logit", + control_inference = controlInf(vars_selection = TRUE), + control_selection = controlSel(penalty = "SCAD")) +) + +expect_equivalent(test_ipw_1_scad$output$mean, + 0.6219899, # TODO, + tolerance = 0.1) + +expect_true( + (test_ipw_1_scad$confidence_interval[1] < 0.6219899) & (0.6219899 < test_ipw_1_scad$confidence_interval[2]) +) + +# LASSO +expect_silent( + test_ipw_1_lasso <- nonprob(selection = ~ klasa_pr + sek + zawod_kod2 + woj, + target = ~ jedna_zmiana , + data = cbop_df_long, + svydesign = popyt_svy, + method_selection = "logit", + control_inference = controlInf(vars_selection = TRUE), + control_selection = controlSel(penalty = "lasso")) + ) + + expect_equivalent(test_ipw_1_lasso$output$mean, + 0.6191018, # TODO, + tolerance = 0.1) + + expect_true( + (test_ipw_1_lasso$confidence_interval[1] < 0.6191018) & (0.6191018 < test_ipw_1$confidence_interval[2]) + ) + +# MCP +expect_silent( + test_ipw_1_mcp <- nonprob(selection = ~ klasa_pr + sek + zawod_kod2 + woj, + target = ~ jedna_zmiana , + data = cbop_df_long, + svydesign = popyt_svy, + method_selection = "logit", + control_inference = controlInf(vars_selection = TRUE), + control_selection = controlSel(penalty = "MCP")) +) + +expect_equivalent(test_ipw_1_mcp$output$mean, + 0.621989, # TODO, + tolerance = 0.1) + +expect_true( + (test_ipw_1_mcp$confidence_interval[1] < 0.6191018) & (0.6191018 < test_ipw_1$confidence_interval[2]) +) + # DR expect_silent( test_dr_1 <- nonprob(outcome = jedna_zmiana ~ klasa_pr + sek + zawod_kod2 + woj, @@ -95,7 +152,117 @@ expect_true( (0.6747754 < test_dr_2$confidence_interval[2]) ) +# PMM +test_dr_1_pmm <- nonprob(outcome = jedna_zmiana ~ klasa_pr + sek + zawod_kod2 + woj, + selection = ~ klasa_pr + sek + zawod_kod2 + woj, + data = cbop_df_long, + svydesign = popyt_svy, + method_outcome = "pmm") + +expect_equivalent(test_dr_1_pmm$output$mean, + 0.6490007, + tolerance = 0.1) + +expect_true( + (test_dr_1_pmm$confidence_interval[1] < 0.6490007) & + (0.6490007< test_dr_1_pmm$confidence_interval[2]) +) +# NN +test_dr_1_nn <- nonprob(outcome = jedna_zmiana ~ klasa_pr + sek + zawod_kod2 + woj, + selection = ~ klasa_pr + sek + zawod_kod2 + woj, + data = cbop_df_long, + svydesign = popyt_svy, + method_outcome = "nn") + +expect_equivalent(test_dr_1_nn$output$mean, + 0.6003411, + tolerance = 0.1) + +expect_true( + (test_dr_1_nn$confidence_interval[1] < 0.6003411) & + (0.6003411< test_dr_1_nn$confidence_interval[2]) +) + +# bias minimization +expect_silent( + test_dr_1_bm <- nonprob(outcome = jedna_zmiana ~ klasa_pr + sek + zawod_kod2 + woj, + selection = ~ klasa_pr + sek + zawod_kod2 + woj, + data = cbop_df_long, + svydesign = popyt_svy, + control_inference = controlInf(bias_correction = TRUE)) +) + +expect_equivalent(test_dr_1_bm$output$mean, + 0.6064716, + tolerance = 0.1) + +expect_true( + (test_dr_1_bm$confidence_interval[1] < 0.6064716) & + (0.6064716 < test_dr_1_bm$confidence_interval[2]) +) + +# SCAD +expect_silent( + test_dr_1_scad <- nonprob(outcome = jedna_zmiana ~ klasa_pr + sek + zawod_kod2 + woj, + selection = ~ klasa_pr + sek + zawod_kod2 + woj, + data = cbop_df_long, + svydesign = popyt_svy, + control_inference = controlInf(vars_selection = TRUE), + control_selection = controlSel(penalty = "SCAD"), + control_outcome = controlOut(penalty = "SCAD")) +) + +expect_equivalent(test_dr_1_scad$output$mean, + 0.604803, + tolerance = 0.1) + +expect_true( + (test_dr_1_scad$confidence_interval[1] < 0.604803) & + (0.604803 < test_dr_1_scad$confidence_interval[2]) +) + +# LASSO +expect_silent( + test_dr_1_lasso <- nonprob(outcome = jedna_zmiana ~ klasa_pr + sek + zawod_kod2 + woj, + selection = ~ klasa_pr + sek + zawod_kod2 + woj, + data = cbop_df_long, + svydesign = popyt_svy, + control_inference = controlInf(vars_selection = TRUE), + control_selection = controlSel(penalty = "lasso"), + control_outcome = controlOut(penalty = "lasso")) +) + +expect_equivalent(test_dr_1_lasso$output$mean, + 0.604803, + tolerance = 0.1) + +expect_true( + (test_dr_1_lasso$confidence_interval[1] < 0.604803) & + (0.604803 < test_dr_1_lasso$confidence_interval[2]) +) + +# MCP +expect_silent( + test_dr_1_mcp <- nonprob(outcome = jedna_zmiana ~ klasa_pr + sek + zawod_kod2 + woj, + selection = ~ klasa_pr + sek + zawod_kod2 + woj, + data = cbop_df_long, + svydesign = popyt_svy, + control_inference = controlInf(vars_selection = TRUE), + control_selection = controlSel(penalty = "MCP"), + control_outcome = controlOut(penalty = "MCP")) +) + +expect_equivalent(test_dr_1_mcp$output$mean, + 0.604803, + tolerance = 0.1) + +expect_true( + (test_dr_1_mcp$confidence_interval[1] < 0.604803) & + (0.604803< test_dr_1_mcp$confidence_interval[2]) +) + # MI +# GLM test_mi_1 <- nonprob(outcome = jedna_zmiana ~ klasa_pr + sek + zawod_kod2 + woj, data = cbop_df_long, svydesign = popyt_svy, @@ -116,3 +283,148 @@ expect_true( (test_mi_1$confidence_interval[1] < 0.6107926) & (0.6107926 < test_mi_1$confidence_interval[2]) ) + +#NN +test_mi_1_nn <- nonprob(outcome = jedna_zmiana ~ klasa_pr + sek + zawod_kod2 + woj, + data = cbop_df_long, + svydesign = popyt_svy, + method_outcome = "nn") + +expect_silent( + test_mi_1_nn <- nonprob(outcome = jedna_zmiana ~ klasa_pr + sek + zawod_kod2 + woj, + data = cbop_df_long, + svydesign = popyt_svy, + method_outcome = "nn") +) + +expect_equivalent(test_mi_1_nn$output$mean, + 0.6410426, + tolerance = 0.1) + +expect_true( + (test_mi_1_nn$confidence_interval[1] < 0.6410426) & + (0.6410426 < test_mi_1_nn$confidence_interval[2]) +) + +# PMM +test_mi_1_pmm <- nonprob(outcome = jedna_zmiana ~ klasa_pr + sek + zawod_kod2 + woj, + data = cbop_df_long, + svydesign = popyt_svy, + method_outcome = "pmm") + +expect_silent( + test_mi_1_pmm <- nonprob(outcome = jedna_zmiana ~ klasa_pr + sek + zawod_kod2 + woj, + data = cbop_df_long, + svydesign = popyt_svy, + method_outcome = "pmm") +) + +expect_equivalent(test_mi_1_pmm$output$mean, + 0.6490007, + tolerance = 0.1) + +expect_true( + (test_mi_1_pmm$confidence_interval[1] < 0.6490007) & + (0.6490007 < test_mi_1_pmm$confidence_interval[2]) +) + +test_mi_1_pmm_2 <- nonprob(outcome = jedna_zmiana ~ klasa_pr + sek + zawod_kod2 + woj, + data = cbop_df_long, + svydesign = popyt_svy, + method_outcome = "pmm", + control_outcome = controlOut(predictive_match = 2)) + +expect_silent( + test_mi_1_pmm_2 <- nonprob(outcome = jedna_zmiana ~ klasa_pr + sek + zawod_kod2 + woj, + data = cbop_df_long, + svydesign = popyt_svy, + method_outcome = "pmm", + control_outcome = controlOut(predictive_match = 2)) +) + +expect_equivalent(test_mi_1_pmm_2$output$mean, + 0.7553603, + tolerance = 0.1) + +expect_true( + (test_mi_1_pmm_2$confidence_interval[1] < 0.7553603) & + (0.7553603 < test_mi_1_pmm_2$confidence_interval[2]) +) + +# SCAD +test_mi_1_scad <- nonprob(outcome = jedna_zmiana ~ klasa_pr + sek + zawod_kod2 + woj, + data = cbop_df_long, + svydesign = popyt_svy, + method_outcome = "glm", + control_inference = controlInf(vars_selection = TRUE), + control_outcome = controlOut(penalty = "SCAD")) + +expect_silent( + test_mi_1_scad <- nonprob(outcome = jedna_zmiana ~ klasa_pr + sek + zawod_kod2 + woj, + data = cbop_df_long, + svydesign = popyt_svy, + method_outcome = "glm", + control_inference = controlInf(vars_selection = TRUE), + control_outcome = controlOut(penalty = "SCAD")) +) + +expect_equivalent(test_mi_1_scad$output$mean, + 0.6107926, + tolerance = 0.1) + +expect_true( + (test_mi_1_scad$confidence_interval[1] < 0.6107926) & + (0.6107926 < test_mi_1_scad$confidence_interval[2]) +) + +# LASSO +test_mi_1_lasso <- nonprob(outcome = jedna_zmiana ~ klasa_pr + sek + zawod_kod2 + woj, + data = cbop_df_long, + svydesign = popyt_svy, + method_outcome = "glm", + control_inference = controlInf(vars_selection = TRUE), + control_outcome = controlOut(penalty = "lasso")) + +expect_silent( + test_mi_1_lasso <- nonprob(outcome = jedna_zmiana ~ klasa_pr + sek + zawod_kod2 + woj, + data = cbop_df_long, + svydesign = popyt_svy, + method_outcome = "glm", + control_inference = controlInf(vars_selection = TRUE), + control_outcome = controlOut(penalty = "lasso")) +) + +expect_equivalent(test_mi_1_lasso$output$mean, + 0.6107926, + tolerance = 0.1) + +expect_true( + (test_mi_1_lasso$confidence_interval[1] < 0.6107926) & + (0.6107926 < test_mi_1_lasso$confidence_interval[2]) +) + +# MCP +test_mi_1_mcp <- nonprob(outcome = jedna_zmiana ~ klasa_pr + sek + zawod_kod2 + woj, + data = cbop_df_long, + svydesign = popyt_svy, + method_outcome = "glm", + control_inference = controlInf(vars_selection = TRUE), + control_outcome = controlOut(penalty = "MCP")) + +expect_silent( + test_mi_1_mcp <- nonprob(outcome = jedna_zmiana ~ klasa_pr + sek + zawod_kod2 + woj, + data = cbop_df_long, + svydesign = popyt_svy, + method_outcome = "glm", + control_inference = controlInf(vars_selection = TRUE), + control_outcome = controlOut(penalty = "MCP")) +) + +expect_equivalent(test_mi_1_mcp$output$mean, + 0.6107926, + tolerance = 0.1) + +expect_true( + (test_mi_1_mcp$confidence_interval[1] < 0.6107926) & + (0.6107926 < test_mi_1_mcp$confidence_interval[2]) +) diff --git a/man/median.nonprobsvy.Rd b/man/median.nonprobsvy.Rd deleted file mode 100644 index 653ef0b..0000000 --- a/man/median.nonprobsvy.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/simple_methods.R -\name{median.nonprobsvy} -\alias{median.nonprobsvy} -\title{Median values of covariates in subgroups} -\usage{ -\method{median}{nonprobsvy}(x, nonprob) -} -\arguments{ -\item{x}{\itemize{ -\item formula -}} - -\item{nonprob}{\itemize{ -\item object of nonprobsvy class -}} -} -\value{ -A vector of estimated medians of the given covariates in subgroups -} -\description{ -Median values of covariates in subgroups -} diff --git a/man/quantile.nonprobsvy.Rd b/man/quantile.nonprobsvy.Rd deleted file mode 100644 index 5f23775..0000000 --- a/man/quantile.nonprobsvy.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/simple_methods.R -\name{quantile.nonprobsvy} -\alias{quantile.nonprobsvy} -\title{Quantile values of covariates in subgroups} -\usage{ -\method{quantile}{nonprobsvy}(x, nonprob, probs = 0.5) -} -\arguments{ -\item{x}{\itemize{ -\item formula -}} - -\item{nonprob}{\itemize{ -\item object of nonprobsvy class -}} - -\item{probs}{\itemize{ -\item probability -}} -} -\value{ -A vector of estimated quantiles if the given covariates in subgroups -} -\description{ -Quantile values of covariates in subgroups -} From 37ae2bdf77da812a51328cfd3f3aff9485ade39b Mon Sep 17 00:00:00 2001 From: LukaszChrostowski Date: Fri, 4 Oct 2024 18:28:33 +0200 Subject: [PATCH 19/24] edit badge for codecove --- README.Rmd | 3 +-- README.md | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/README.Rmd b/README.Rmd index b9b5f7c..07baa5b 100644 --- a/README.Rmd +++ b/README.Rmd @@ -23,8 +23,7 @@ knitr::opts_chunk$set( [![R-CMD-check](https://github.com/ncn-foreigners/nonprobsvy/workflows/R-CMD-check/badge.svg)](https://github.com/ncn-foreigners/nonprobsvy/actions) -[![Codecov test -coverage](https://codecov.io/gh/ncn-foreigners/nonprobsvy/branch/main/graph/badge.svg)](https://app.codecov.io/gh/ncn-foreigners/nonprobsvy?branch=main) +[![codecov](https://codecov.io/gh/ncn-foreigners/nonprobsvy/branch/dev/graph/badge.svg?token=TK9GDFOJF5)](https://codecov.io/gh/ncn-foreigners/nonprobsvy) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10280114.svg)](https://doi.org/10.5281/zenodo.10280114) [![CRAN status](https://www.r-pkg.org/badges/version/nonprobsvy)](https://CRAN.R-project.org/package=nonprobsvy) diff --git a/README.md b/README.md index 79bd182..35d795b 100644 --- a/README.md +++ b/README.md @@ -6,8 +6,7 @@ [![R-CMD-check](https://github.com/ncn-foreigners/nonprobsvy/workflows/R-CMD-check/badge.svg)](https://github.com/ncn-foreigners/nonprobsvy/actions) -[![Codecov test -coverage](https://codecov.io/gh/ncn-foreigners/nonprobsvy/branch/main/graph/badge.svg)](https://app.codecov.io/gh/ncn-foreigners/nonprobsvy?branch=main) +[![codecov](https://codecov.io/gh/ncn-foreigners/nonprobsvy/branch/dev/graph/badge.svg?token=TK9GDFOJF5)](https://codecov.io/gh/ncn-foreigners/nonprobsvy) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10280114.svg)](https://doi.org/10.5281/zenodo.10280114) [![CRAN status](https://www.r-pkg.org/badges/version/nonprobsvy)](https://CRAN.R-project.org/package=nonprobsvy) From 2a5aca618305f5636df9b50dc59111da21383eeb Mon Sep 17 00:00:00 2001 From: LukaszChrostowski Date: Fri, 4 Oct 2024 18:47:19 +0200 Subject: [PATCH 20/24] add dev branch to workflow --- .github/workflows/R-CMD-check.yaml | 4 ++-- .github/workflows/pkgdown.yaml | 4 ++-- .github/workflows/test-coverage.yaml | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index a1e9e7c..57d153e 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -2,9 +2,9 @@ # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: [main, master, package-development] + branches: [main, master, dev] pull_request: - branches: [main, master, package-development] + branches: [main, master, dev] name: R-CMD-check diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index ed7650c..1661077 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -2,9 +2,9 @@ # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: [main, master] + branches: [main, master, dev] pull_request: - branches: [main, master] + branches: [main, master, dev] release: types: [published] workflow_dispatch: diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 9da2528..8e2e127 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -2,9 +2,9 @@ # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: [main, master] + branches: [main, master, dev] pull_request: - branches: [main, master] + branches: [main, master, dev] name: test-coverage From f33fa1a9bccbf39dc2b514162ffe6fe835dd276c Mon Sep 17 00:00:00 2001 From: LukaszChrostowski Date: Sun, 27 Oct 2024 15:35:47 +0100 Subject: [PATCH 21/24] work on estimating in groups/subsets - IPW done (no bigger tests), still much work TODO for MI and DR - moved to seperated branch --- R/nonprobDR.R | 13 ++++- R/nonprobIPW.R | 15 ++++- R/nonprobMI.R | 5 +- R/simple_methods.R | 140 +++++++++++++++++++++++++++++++++++++++++---- R/varianceIPW.R | 1 - README.Rmd | 2 +- README.md | 3 +- 7 files changed, 161 insertions(+), 18 deletions(-) diff --git a/R/nonprobDR.R b/R/nonprobDR.R index 2686013..7b1c709 100644 --- a/R/nonprobDR.R +++ b/R/nonprobDR.R @@ -453,7 +453,7 @@ nonprobDR <- function(selection, method_selection = method_selection ) theta_hat <- selection_model$theta_hat - # grad <- est_method_obj$grad + grad <- selection_model$grad hess <- selection_model$hess ps_nons <- selection_model$ps_nons est_ps_rand <- selection_model$est_ps_rand @@ -495,6 +495,7 @@ nonprobDR <- function(selection, # beta_statistics <- model_obj$parameters sigma <- NULL OutcomeList[[k]] <- model_obj$model + OutcomeList[[k]]$model_frame <- OutcomeModel$model_frame y_nons <- OutcomeModel$y_nons mu_hat <- mu_hatDR( @@ -689,6 +690,7 @@ nonprobDR <- function(selection, y_nons_pred <- model_obj$y_nons_pred parameters <- model_obj$parameters OutcomeList[[k]] <- model_obj$model + OutcomeList[[k]]$model_frame <- OutcomeModel$model_frame_rand mu_hat <- 1 / N_nons * sum((1 / ps_nons) * (weights * (OutcomeModel$y_nons - y_nons_pred))) + y_rand_pred } else { @@ -872,6 +874,12 @@ nonprobDR <- function(selection, formula = selection, df_residual = selection_model$df_residual, log_likelihood = selection_model$log_likelihood, + hessian = hess, + gradient = grad, + method = est_method, + prob_der = ps_nons_der, + prob_rand = ps_rand, + prob_rand_est = est_ps_rand, cve = if (control_inference$vars_selection == TRUE) { cve_selection } else { @@ -904,7 +912,8 @@ nonprobDR <- function(selection, pop_totals = prob_pop_totals, outcome = OutcomeList, selection = SelectionList, - boot_sample = boot_sample + boot_sample = boot_sample, + svydesign = if(is.null(pop_totals)) svydesign else NULL # TODO to customize if pop_totals only ), class = c("nonprobsvy", "nonprobsvy_dr") ) diff --git a/R/nonprobIPW.R b/R/nonprobIPW.R index a40aed0..6a2b95d 100644 --- a/R/nonprobIPW.R +++ b/R/nonprobIPW.R @@ -169,7 +169,7 @@ nonprobIPW <- function(selection, method_selection = method_selection ) theta_hat <- selection_model$theta_hat - # grad <- est_method_obj$grad + grad <- selection_model$grad hess <- selection_model$hess var_cov1 <- selection_model$var_cov1 var_cov2 <- selection_model$var_cov2 @@ -563,9 +563,19 @@ nonprobIPW <- function(selection, weights = as.vector(weights_nons), prior.weights = weights, est_totals = est_totals, + pop_totals = pop_totals, formula = selection, df_residual = selection_model$df_residual, log_likelihood = selection_model$log_likelihood, + method_selection = method_selection, + hessian = hess, + gradient = grad, + method = est_method, + prob_der = ps_nons_der, + prob_rand = ps_rand, + prob_rand_est = est_ps_rand, + prob_rand_est_der = est_ps_rand_der, + h_function = h, cve = if (control_inference$vars_selection == TRUE) { cve_selection } else { @@ -594,7 +604,8 @@ nonprobIPW <- function(selection, pop_totals = prob_pop_totals, outcome = NULL, selection = SelectionList, - boot_sample = boot_sample + boot_sample = boot_sample, + svydesign = if(is.null(pop_totals)) svydesign else NULL # TODO to customize if pop_totals only ), class = c("nonprobsvy", "nonprobsvy_ipw") ) diff --git a/R/nonprobMI.R b/R/nonprobMI.R index d1c2258..bb45c4e 100644 --- a/R/nonprobMI.R +++ b/R/nonprobMI.R @@ -244,6 +244,7 @@ nonprobMI <- function(outcome, y_nons_pred <- model_obj$y_nons_pred # parameters <- model_obj$parameters OutcomeList[[k]] <- model_obj$model + OutcomeList[[k]]$model_frame <- OutcomeModel$model_frame_rand # updating probability sample by adding y_hat variable svydesign <- stats::update(svydesign, @@ -326,6 +327,7 @@ nonprobMI <- function(outcome, y_nons_pred <- model_obj$y_nons_pred parameters <- model_obj$parameters OutcomeList[[k]] <- model_obj$model + OutcomeList[[k]]$model_frame <- Model$model_frame_rand mu_hat <- y_rand_pred } else { stop("Please, provide svydesign object or pop_totals/pop_means.") @@ -491,7 +493,8 @@ nonprobMI <- function(outcome, pop_totals = prob_pop_totals, outcome = OutcomeList, selection = NULL, - boot_sample = boot_sample + boot_sample = boot_sample, + svydesign = if(is.null(pop_totals)) svydesign else NULL # TODO to customize if pop_totals only ), class = c("nonprobsvy", "nonprobsvy_mi") ) diff --git a/R/simple_methods.R b/R/simple_methods.R index 1a937fb..6ae4aab 100644 --- a/R/simple_methods.R +++ b/R/simple_methods.R @@ -334,7 +334,7 @@ deviance.nonprobsvy <- function(object, total.nonprobsvy <- function(x, nonprob) { variables <- lhs.vars(x) groups <- rhs.vars(x) - + # TODO variance implementation if (is.null(variables)) { data <- nonprob$data[which(nonprob$R == 1), groups, drop = FALSE] total <- tapply(nonprob$weights[which(nonprob$R == 1)], data, sum) @@ -357,21 +357,141 @@ total.nonprobsvy <- function(x, nonprob) { #' @importFrom formula.tools rhs.vars #' @importFrom stats aggregate mean.nonprobsvy <- function(x, nonprob) { + # TODO variance for MI + # TODO all implementation for DR variables <- lhs.vars(x) groups <- rhs.vars(x) - + class_nonprob <- class(nonprob)[2] + if (!is.null(variables) & class_nonprob != 'nonprobsvy_ipw') + stop("DR or MI estimators only for y variable in subgroups. Recommended formula definition + is something like ~ x + y where x and y are grouping variables") if (is.null(variables)) { - data <- nonprob$data[which(nonprob$R == 1), groups, drop = FALSE] - mean_value <- tapply(nonprob$weights[which(nonprob$R == 1)], data, mean) # to consider + # data <- nonprob$data[, groups, drop = FALSE] + data <- model.matrix(as.formula(paste(x, "- 1")), data = nonprob$data) + if (class_nonprob == "nonprobsvy_ipw") { + mean_value <- sapply(as.data.frame(data), function(col) weighted.mean(col, nonprob$weights)) + ys <- data + } else { + # MI TODO for more than one y + if (class_nonprob == "nonprobsvy_mi") { + data_nonprob <- split(cbind(nonprob$data, nonprob$outcome[[1]]$fitted.values), x) + data_prob <- split(cbind(nonprob$svydesign$variables, nonprob$svydesign$prob), x) + + # X_nons <- model.matrix(delete.response(terms(nonprob$outcome[[1]]$formula)), data_nonprob) + # X_rand <- model.matrix(delete.response(terms(nonprob$outcome[[1]]$formula)), data_prob) + + mean_value <- aggregate(formula(paste("y_hat_MI", x)), + data = nonprob$svydesign$variables, + FUN = function(y, w) weighted.mean(y, w = w[1:length(y)]), + w = 1 / nonprob$svydesign$prob) + + } else if (class_nonprob == "nonprobsvy_dr") { + mean_value <- list() + for (i in 1:p) { + mean_value[[i]] <- mu_hatDR( + y = , + y_nons = , + y_rand = , + weights = , # TODO + weights_nons = , + weights_rand = , + N_nons = , + N_rand = ) + } + } + } } else { - data <- nonprob$data[which(nonprob$R == 1), c(variables, groups)] - weights <- nonprob$weights[which(nonprob$R == 1)] + data <- nonprob$data[, c(variables, groups)] + weights <- nonprob$weights + if (class_nonprob == "nonprobsvy_ipw") { + mean_value <- aggregate(data[, variables], by = data[, groups, drop = FALSE], + FUN = function(y, w) weighted.mean(y, w = w[1:length(y)]), + w = weights) + } + } + p <- length(mean_value) + variances <- numeric(length = p) - mean_value <- aggregate(data[, variables], by = data[, groups, drop = FALSE], - FUN = function(y, w) weighted.mean(y, w = w[1:length(y)]), - w = weights) + for (i in 1:p) { + if (class_nonprob == "nonprobsvy_ipw") { + yy <- ys[,i] + mu <- mean_value[i] + var <- internal_varIPW(svydesign = nonprob$svydesign, + X_nons = nonprob$X[nonprob$R == 1, ], + X_rand = nonprob$X[nonprob$R == 0, ], + y_nons = yy, + weights = nonprob$selection$prior.weights, + ps_nons = nonprob$prob[nonprob$R == 1], + mu_hat = mu, + hess = nonprob$selection$hessian, + ps_nons_der = as.vector(nonprob$selection$prob_der), + N = nonprob$pop_size, + est_ps_rand = as.vector(nonprob$selection$prob_rand_est), # TODO + ps_rand = nonprob$svydesign$prob_rand, + est_ps_rand_der = as.vector(nonprob$selection$prob_rand_est_der), # TODO + n_rand = nonprob$prob_size, + pop_size = nonprob$pop_size, + pop_totals = nonprob$selection$pop_totals, + method_selection = nonprob$selection$method_selection, + est_method = nonprob$selection$method, + theta = nonprob$selection$coefficients, + h = nonprob$selection$h_function, + verbose = FALSE, + var_cov1 = nonprob$selection$link$variance_covariance1, + var_cov2 = nonprob$selection$link$variance_covariance2 + ) + } else if (nonprob_class == "nonprobsvy_mi") { + + var <- internal_varMI(svydesign = nonprob$svydesign, + X_nons = nonprob$X[nonprob$R == 1, ], # subset by x formula + X_rand = nonprob$X[nonprob$R == 0, ], # subset by x formula + y = ys_nons[[i]]$single_shift, # TODO y_nons + y_pred = ys_pred_nons[[i]]$single_shift, # TODO + weights_rand = 1 / nonprob$svydesig$prob, # subset by x formula + method = method_outcome, + n_rand = nonprob$prob_size, + n_nons = nonprob$nonprob_size, + N = nonprob$pop_size, + family = nonprob$outcome$family, + model_obj = model_objects, # TODO + pop_totals = nonprob$pop_totals, + # we should probably just pass full control list + k = nonprob$control$control_outcome$k, + predictive_match = nonprob$control$control_outcome$predictive_match, + nn_exact_se = nonprob$control$control_inference$nn_exact_se, + pmm_reg_engine = nonprob$control$control_outcome$pmm_reg_engine, + pi_ij = nonprob$control$control_inference$pi_ij + ) + } else if (nonprob_class == "nonprobsvy_dr") { + var <- internal_varDR(OutcomeModel = OutcomeModel, + SelectionModel = SelectionModel, + y_nons_pred = ys_nons_pred, + weights = 1, + weights_rand = 1 / nonprob$svydesign$prob, + method_selection = method_selection, + control_selection = nonprob$control$control_selection, + ps_nons = nonprob$prob, + theta = nonprob$selection$coefficients, + hess = nonprob$selection$hessian, + ps_nons_der = nonprob$selection$ps_nons_der, + est_ps_rand = nonprob$selection$prob_rand_est, + y_rand_pred = ys_pred, + N_nons = nonprob$pop_size, + est_ps_rand_der = nonprob$selection$prob_rand_est_der, + svydesign = nonprob$svydesign, + est_method = nonprob$selection$method, + h = nonprob$selection$h_function, + pop_totals = nonprob$pop_totals, + sigma = nonprob$outcome$sigma, # TODO + bias_correction = nonprob$control$control_inference$bias_corr, + verbose = FALSE + ) + } + variances[i] <- var$var } - mean_value + res <- data.frame(mean = mean_value, + se = sqrt(variances)) # perhaps convert to data.frame + res } # @title Median values of covariates in subgroups # diff --git a/R/varianceIPW.R b/R/varianceIPW.R index aae869d..07ac9d8 100644 --- a/R/varianceIPW.R +++ b/R/varianceIPW.R @@ -70,7 +70,6 @@ internal_varIPW <- function(svydesign, psd = est_ps_rand_der ) - # variance-covariance matrix for set of parameters (mu_hat and theta_hat) V_mx_nonprob <- sparse_mx %*% V1 %*% t(as.matrix(sparse_mx)) # nonprobability component V_mx_prob <- sparse_mx %*% V2 %*% t(as.matrix(sparse_mx)) # probability component diff --git a/README.Rmd b/README.Rmd index 07baa5b..c2f4d59 100644 --- a/README.Rmd +++ b/README.Rmd @@ -23,7 +23,7 @@ knitr::opts_chunk$set( [![R-CMD-check](https://github.com/ncn-foreigners/nonprobsvy/workflows/R-CMD-check/badge.svg)](https://github.com/ncn-foreigners/nonprobsvy/actions) -[![codecov](https://codecov.io/gh/ncn-foreigners/nonprobsvy/branch/dev/graph/badge.svg?token=TK9GDFOJF5)](https://codecov.io/gh/ncn-foreigners/nonprobsvy) +[![Codecov test coverage](https://codecov.io/gh/ncn-foreigners/nonprobsvy/branch/main/graph/badge.svg)](https://app.codecov.io/gh/ncn-foreigners/nonprobsvy?branch=main) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10280114.svg)](https://doi.org/10.5281/zenodo.10280114) [![CRAN status](https://www.r-pkg.org/badges/version/nonprobsvy)](https://CRAN.R-project.org/package=nonprobsvy) diff --git a/README.md b/README.md index 35d795b..79bd182 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,8 @@ [![R-CMD-check](https://github.com/ncn-foreigners/nonprobsvy/workflows/R-CMD-check/badge.svg)](https://github.com/ncn-foreigners/nonprobsvy/actions) -[![codecov](https://codecov.io/gh/ncn-foreigners/nonprobsvy/branch/dev/graph/badge.svg?token=TK9GDFOJF5)](https://codecov.io/gh/ncn-foreigners/nonprobsvy) +[![Codecov test +coverage](https://codecov.io/gh/ncn-foreigners/nonprobsvy/branch/main/graph/badge.svg)](https://app.codecov.io/gh/ncn-foreigners/nonprobsvy?branch=main) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10280114.svg)](https://doi.org/10.5281/zenodo.10280114) [![CRAN status](https://www.r-pkg.org/badges/version/nonprobsvy)](https://CRAN.R-project.org/package=nonprobsvy) From 5101e0500ce75b069cbdbd38f92109866e8ff4e4 Mon Sep 17 00:00:00 2001 From: LukaszChrostowski Date: Sun, 27 Oct 2024 16:48:14 +0100 Subject: [PATCH 22/24] Initial cleaning before going on CRAN --- DESCRIPTION | 3 +- NAMESPACE | 3 - NEWS.md | 1 - R/simple_methods.R | 233 ------------------------------ inst/tinytest/test-2-ipw-totals.R | 15 +- inst/tinytest/test-3-mi-totals.R | 9 +- inst/tinytest/test-4-dr-totals.R | 8 +- inst/tinytest/test_nonprobsvy.R | 4 +- man/mean.nonprobsvy.Rd | 23 --- man/total.nonprobsvy.Rd | 23 --- src/Makevars | 4 +- 11 files changed, 19 insertions(+), 307 deletions(-) delete mode 100644 man/mean.nonprobsvy.Rd delete mode 100644 man/total.nonprobsvy.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 080ce4c..34b80db 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -45,8 +45,7 @@ Imports: nleqslv, doParallel, foreach, - parallel, - formula.tools + parallel LinkingTo: Rcpp, RcppArmadillo diff --git a/NAMESPACE b/NAMESPACE index 2ec47a4..6c7a33e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -34,8 +34,6 @@ importFrom(Rcpp,sourceCpp) importFrom(doParallel,registerDoParallel) importFrom(foreach,"%dopar%") importFrom(foreach,foreach) -importFrom(formula.tools,lhs.vars) -importFrom(formula.tools,rhs.vars) importFrom(maxLik,maxLik) importFrom(ncvreg,cv.ncvreg) importFrom(nleqslv,nleqslv) @@ -43,7 +41,6 @@ importFrom(parallel,makeCluster) importFrom(parallel,stopCluster) importFrom(stats,AIC) importFrom(stats,BIC) -importFrom(stats,aggregate) importFrom(stats,as.formula) importFrom(stats,binomial) importFrom(stats,coef) diff --git a/NEWS.md b/NEWS.md index fbb5c38..c11f086 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,7 +8,6 @@ - add information to `summary` about quality of estimation basing on difference between estimated and known total values of auxiliary variables - add estimation of exact standard error for k-nearest neighbor estimator. - add breaking change to `controlOut` function by switching values for `predictive_match` argument. From now on, the `predictive_match = 1` means $\hat{y}-\hat{y}$ in predictive mean matching imputation and `predictive_match = 2` corresponds to $\hat{y}-y$ matching. - - add methods for estimation of means and totals in subsets/groups with its variances. - implement `div` option when variable selection (more in documentation) for doubly robust estimation. ## nonprobsvy 0.1.0 diff --git a/R/simple_methods.R b/R/simple_methods.R index 6ae4aab..983fb1d 100644 --- a/R/simple_methods.R +++ b/R/simple_methods.R @@ -321,236 +321,3 @@ deviance.nonprobsvy <- function(object, if (class(object)[2] == "nonprobsvy_dr") res <- c("selection" = res_sel, "outcome" = res_out) res } -#' @title Total values of covariates in subgroups -#' -#' @param x - formula -#' @param nonprob - object of nonprobsvy class -#' -#' @method total nonprobsvy -#' @return A vector of estimated totals of the given covariates in subgroups -#' @importFrom formula.tools lhs.vars -#' @importFrom formula.tools rhs.vars -#' @importFrom stats aggregate -total.nonprobsvy <- function(x, nonprob) { - variables <- lhs.vars(x) - groups <- rhs.vars(x) - # TODO variance implementation - if (is.null(variables)) { - data <- nonprob$data[which(nonprob$R == 1), groups, drop = FALSE] - total <- tapply(nonprob$weights[which(nonprob$R == 1)], data, sum) - } else { - data <- nonprob$data[which(nonprob$R == 1), c(variables, groups)] - weights <- nonprob$weights[which(nonprob$R == 1)] - - total <- aggregate(data[, variables] * weights, by = data[, groups, drop = FALSE], sum) - } - total -} -#' @title Mean values of covariates in subgroups -#' -#' @param x - formula -#' @param nonprob - object of nonprobsvy class -#' -#' @method mean nonprobsvy -#' @return A vector of estimated means of the given covariates in subgroups -#' @importFrom formula.tools lhs.vars -#' @importFrom formula.tools rhs.vars -#' @importFrom stats aggregate -mean.nonprobsvy <- function(x, nonprob) { - # TODO variance for MI - # TODO all implementation for DR - variables <- lhs.vars(x) - groups <- rhs.vars(x) - class_nonprob <- class(nonprob)[2] - if (!is.null(variables) & class_nonprob != 'nonprobsvy_ipw') - stop("DR or MI estimators only for y variable in subgroups. Recommended formula definition - is something like ~ x + y where x and y are grouping variables") - if (is.null(variables)) { - # data <- nonprob$data[, groups, drop = FALSE] - data <- model.matrix(as.formula(paste(x, "- 1")), data = nonprob$data) - if (class_nonprob == "nonprobsvy_ipw") { - mean_value <- sapply(as.data.frame(data), function(col) weighted.mean(col, nonprob$weights)) - ys <- data - } else { - # MI TODO for more than one y - if (class_nonprob == "nonprobsvy_mi") { - data_nonprob <- split(cbind(nonprob$data, nonprob$outcome[[1]]$fitted.values), x) - data_prob <- split(cbind(nonprob$svydesign$variables, nonprob$svydesign$prob), x) - - # X_nons <- model.matrix(delete.response(terms(nonprob$outcome[[1]]$formula)), data_nonprob) - # X_rand <- model.matrix(delete.response(terms(nonprob$outcome[[1]]$formula)), data_prob) - - mean_value <- aggregate(formula(paste("y_hat_MI", x)), - data = nonprob$svydesign$variables, - FUN = function(y, w) weighted.mean(y, w = w[1:length(y)]), - w = 1 / nonprob$svydesign$prob) - - } else if (class_nonprob == "nonprobsvy_dr") { - mean_value <- list() - for (i in 1:p) { - mean_value[[i]] <- mu_hatDR( - y = , - y_nons = , - y_rand = , - weights = , # TODO - weights_nons = , - weights_rand = , - N_nons = , - N_rand = ) - } - } - } - } else { - data <- nonprob$data[, c(variables, groups)] - weights <- nonprob$weights - if (class_nonprob == "nonprobsvy_ipw") { - mean_value <- aggregate(data[, variables], by = data[, groups, drop = FALSE], - FUN = function(y, w) weighted.mean(y, w = w[1:length(y)]), - w = weights) - } - } - p <- length(mean_value) - variances <- numeric(length = p) - - for (i in 1:p) { - if (class_nonprob == "nonprobsvy_ipw") { - yy <- ys[,i] - mu <- mean_value[i] - var <- internal_varIPW(svydesign = nonprob$svydesign, - X_nons = nonprob$X[nonprob$R == 1, ], - X_rand = nonprob$X[nonprob$R == 0, ], - y_nons = yy, - weights = nonprob$selection$prior.weights, - ps_nons = nonprob$prob[nonprob$R == 1], - mu_hat = mu, - hess = nonprob$selection$hessian, - ps_nons_der = as.vector(nonprob$selection$prob_der), - N = nonprob$pop_size, - est_ps_rand = as.vector(nonprob$selection$prob_rand_est), # TODO - ps_rand = nonprob$svydesign$prob_rand, - est_ps_rand_der = as.vector(nonprob$selection$prob_rand_est_der), # TODO - n_rand = nonprob$prob_size, - pop_size = nonprob$pop_size, - pop_totals = nonprob$selection$pop_totals, - method_selection = nonprob$selection$method_selection, - est_method = nonprob$selection$method, - theta = nonprob$selection$coefficients, - h = nonprob$selection$h_function, - verbose = FALSE, - var_cov1 = nonprob$selection$link$variance_covariance1, - var_cov2 = nonprob$selection$link$variance_covariance2 - ) - } else if (nonprob_class == "nonprobsvy_mi") { - - var <- internal_varMI(svydesign = nonprob$svydesign, - X_nons = nonprob$X[nonprob$R == 1, ], # subset by x formula - X_rand = nonprob$X[nonprob$R == 0, ], # subset by x formula - y = ys_nons[[i]]$single_shift, # TODO y_nons - y_pred = ys_pred_nons[[i]]$single_shift, # TODO - weights_rand = 1 / nonprob$svydesig$prob, # subset by x formula - method = method_outcome, - n_rand = nonprob$prob_size, - n_nons = nonprob$nonprob_size, - N = nonprob$pop_size, - family = nonprob$outcome$family, - model_obj = model_objects, # TODO - pop_totals = nonprob$pop_totals, - # we should probably just pass full control list - k = nonprob$control$control_outcome$k, - predictive_match = nonprob$control$control_outcome$predictive_match, - nn_exact_se = nonprob$control$control_inference$nn_exact_se, - pmm_reg_engine = nonprob$control$control_outcome$pmm_reg_engine, - pi_ij = nonprob$control$control_inference$pi_ij - ) - } else if (nonprob_class == "nonprobsvy_dr") { - var <- internal_varDR(OutcomeModel = OutcomeModel, - SelectionModel = SelectionModel, - y_nons_pred = ys_nons_pred, - weights = 1, - weights_rand = 1 / nonprob$svydesign$prob, - method_selection = method_selection, - control_selection = nonprob$control$control_selection, - ps_nons = nonprob$prob, - theta = nonprob$selection$coefficients, - hess = nonprob$selection$hessian, - ps_nons_der = nonprob$selection$ps_nons_der, - est_ps_rand = nonprob$selection$prob_rand_est, - y_rand_pred = ys_pred, - N_nons = nonprob$pop_size, - est_ps_rand_der = nonprob$selection$prob_rand_est_der, - svydesign = nonprob$svydesign, - est_method = nonprob$selection$method, - h = nonprob$selection$h_function, - pop_totals = nonprob$pop_totals, - sigma = nonprob$outcome$sigma, # TODO - bias_correction = nonprob$control$control_inference$bias_corr, - verbose = FALSE - ) - } - variances[i] <- var$var - } - res <- data.frame(mean = mean_value, - se = sqrt(variances)) # perhaps convert to data.frame - res -} -# @title Median values of covariates in subgroups -# -# @param x - formula -# @param nonprob - object of nonprobsvy class -# -# @method median nonprobsvy -# @return A vector of estimated medians of the given covariates in subgroups -# @importFrom formula.tools lhs.vars -# @importFrom formula.tools rhs.vars -# @importFrom spatstat weighted.median -# @importFrom stats aggregate -# @importFrom stats median -# median.nonprobsvy <- function(x, nonprob) { -# variables <- lhs.vars(x) -# groups <- rhs.vars(x) -# -# if (is.null(variables)) { -# data <- nonprob$data[which(nonprob$R == 1), groups, drop = FALSE] -# median_value <- tapply(nonprob$weights[which(nonprob$R == 1)], data, median) -# } else { -# -# data <- nonprob$data[which(nonprob$R == 1), c(variables, groups)] -# weights <- nonprob$weights[which(nonprob$R == 1)] -# -# median_value <- aggregate(data[, variables], by = data[, groups, drop = FALSE], -# FUN = function(y, w) weighted.median(y, w = w[1:length(y)]), -# w = weights) -# } -# median_value -# } -# @title Quantile values of covariates in subgroups -# -# @param x - formula -# @param nonprob - object of nonprobsvy class -# @param probs - probability -# -# @method quantile nonprobsvy -# @return A vector of estimated quantiles if the given covariates in subgroups -# @importFrom formula.tools lhs.vars -# @importFrom formula.tools rhs.vars -# @importFrom spatstat weighted.quantile -# @importFrom stats aggregate -# @importFrom stats quantile -# quantile.nonprobsvy <- function(x, nonprob, probs = 0.5) { -# variables <- lhs.vars(x) -# groups <- rhs.vars(x) -# -# if (is.null(variables)) { -# data <- nonprob$data[which(nonprob$R == 1), groups, drop = FALSE] -# percentile_value <- tapply(nonprob$weights[which(nonprob$R == 1)], data, function(y) quantile(y, probs = probs)) -# } else { -# -# data <- nonprob$data[which(nonprob$R == 1), c(variables, groups)] -# weights <- nonprob$weights[which(nonprob$R == 1)] -# -# percentile_value <- aggregate(data[, variables], by = data[, groups, drop = FALSE], -# FUN = function(y, w) weighted.quantile(y, w = w[1:length(y)], probs = probs), -# w = weights) -# } -# percentile_value -# } diff --git a/inst/tinytest/test-2-ipw-totals.R b/inst/tinytest/test-2-ipw-totals.R index 932802c..5a593e3 100644 --- a/inst/tinytest/test-2-ipw-totals.R +++ b/inst/tinytest/test-2-ipw-totals.R @@ -1,5 +1,3 @@ -# if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { - library(sampling) library(survey) @@ -505,7 +503,7 @@ #### variable selection ------------------------------------------------------------------ ##### one target variable ---------------------------------------------------------------- - # if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { + if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { ## y_11 expect_silent( @@ -625,7 +623,7 @@ # verbose = T) # ) - # } + } # check probit ---------------------------------------------------------------------------- @@ -811,7 +809,7 @@ row.names = c("Y_11", "Y_12", "Y_21", "Y_22"))) - # if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { + if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { #### variable selection ------------------------------------------------------------------ ##### one target variable ---------------------------------------------------------------- @@ -933,7 +931,7 @@ # verbose = T) # ) - # } + } ## non-linear case ------------------------------------------------------------------------ #### correctly specified variables -------------------------------------------------------- @@ -1119,7 +1117,7 @@ row.names = c("Y_11", "Y_12", "Y_21", "Y_22"))) - # if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { + if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { #### variable selection ------------------------------------------------------------------ ##### one target variable ---------------------------------------------------------------- @@ -1241,7 +1239,7 @@ # verbose = T) # ) - # } + } # check cloglog ----------------------------------------------------- ## linear case ---------------------------------------------------------------------------- @@ -1752,4 +1750,3 @@ # # expect_true(y22_corr_scad$confidence_interval$lower_bound < mean(Y_22) & # # y22_corr_scad$confidence_interval$upper_bound > mean(Y_22)) ## conf int # expect_true(NROW(y22_corr_scad$selection$coefficients) == 2) -# } diff --git a/inst/tinytest/test-3-mi-totals.R b/inst/tinytest/test-3-mi-totals.R index 2a2dcc7..f901356 100644 --- a/inst/tinytest/test-3-mi-totals.R +++ b/inst/tinytest/test-3-mi-totals.R @@ -43,7 +43,6 @@ expect_silent( expect_equal(y12_mi_corr_one$output$mean, 7.465879, tolerance = 0.0001) expect_equal(y12_mi_corr_one$output$SE, 0.2523682, tolerance = 0.0001) -# TODO # expect_true(y12_mi_corr_one$confidence_interval$lower_bound < mean(Y_12) & # y12_mi_corr_one$confidence_interval$upper_bound > mean(Y_12)) @@ -88,7 +87,7 @@ expect_true(y22_mi_corr_one$confidence_interval$lower_bound < mean(Y_22) & # For Y_11 with all X variables expect_silent( y11_mi_corr_all <- nonprob( - outcome = as.formula(paste('Y_11', X_formula)), + outcome = as.formula(paste('Y_11 ~ ', as.character(X_formula)[2])), data = sample_B1, # Include all X variables pop_totals = X_totals, method_outcome = "glm", @@ -104,7 +103,7 @@ expect_true(y11_mi_corr_all$confidence_interval$lower_bound < mean(Y_11) & # For Y_12 with all X variables expect_silent( y12_mi_corr_all <- nonprob( - outcome = as.formula(paste('Y_12', X_formula)), + outcome = as.formula(paste('Y_12 ~ ', as.character(X_formula)[2])), data = sample_B1, pop_totals = X_totals, method_outcome = "glm", @@ -121,7 +120,7 @@ expect_equal(y12_mi_corr_all$output$SE, 0.2454465, tolerance = 0.0001) # For Y_21 with all X variables expect_silent( y21_mi_corr_all <- nonprob( - outcome = as.formula(paste('Y_21', X_formula)), + outcome = as.formula(paste('Y_21 ~ ', as.character(X_formula)[2])), data = sample_B1, pop_totals = X_totals, method_outcome = "glm", @@ -138,7 +137,7 @@ expect_equal(y21_mi_corr_all$output$SE, 0.01992489, tolerance = 0.0001) # For Y_22 with all X variables expect_silent( y22_mi_corr_all <- nonprob( - outcome = as.formula(paste('Y_22', X_formula)), + outcome = as.formula(paste('Y_22 ~ ', as.character(X_formula)[2])), data = sample_B1, pop_totals = X_totals, method_outcome = "glm", diff --git a/inst/tinytest/test-4-dr-totals.R b/inst/tinytest/test-4-dr-totals.R index cee8879..f826d6e 100644 --- a/inst/tinytest/test-4-dr-totals.R +++ b/inst/tinytest/test-4-dr-totals.R @@ -97,7 +97,7 @@ expect_true(y22_dr_corr_one$confidence_interval$lower_bound < mean(Y_22) & expect_silent( y11_dr_corr_all <- nonprob( selection = X_formula, - outcome = as.formula(paste('Y_11', X_formula)), + outcome = as.formula(paste('Y_11 ~ ', as.character(X_formula)[2])), data = sample_B1, pop_totals = X_totals, method_selection = "logit", @@ -115,7 +115,7 @@ expect_true(y11_dr_corr_all$confidence_interval$lower_bound < mean(Y_11) & expect_silent( y12_dr_corr_all <- nonprob( selection = X_formula, - outcome = as.formula(paste('Y_12', X_formula)), + outcome = as.formula(paste('Y_12 ~ ', as.character(X_formula)[2])), data = sample_B1, pop_totals = X_totals, method_selection = "logit", @@ -133,7 +133,7 @@ expect_true(y12_dr_corr_all$confidence_interval$lower_bound < mean(Y_12) & expect_silent( y21_dr_corr_all <- nonprob( selection = X_formula, - outcome = as.formula(paste('Y_21', X_formula)), + outcome = as.formula(paste('Y_21 ~ ', as.character(X_formula)[2])), data = sample_B1, pop_totals = X_totals, method_selection = "logit", @@ -152,7 +152,7 @@ expect_equal(y21_dr_corr_all$output$SE, 0.0192478, tolerance = 0.0001) expect_silent( y22_dr_corr_all <- nonprob( selection = X_formula, - outcome = as.formula(paste('Y_22', X_formula)), + outcome = as.formula(paste('Y_22 ~ ', as.character(X_formula)[2])), data = sample_B1, pop_totals = X_totals, method_selection = "logit", diff --git a/inst/tinytest/test_nonprobsvy.R b/inst/tinytest/test_nonprobsvy.R index 1f2b85c..28ab91f 100644 --- a/inst/tinytest/test_nonprobsvy.R +++ b/inst/tinytest/test_nonprobsvy.R @@ -252,7 +252,7 @@ expect_true( # These tests are only supposed to be run on developer's machine and # package GitHub page not on CRAN (they take too long) -# if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { +if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { expect_silent( test1a_bootstrap <- nonprob(selection = ~ x, @@ -339,4 +339,4 @@ expect_true( expect_equivalent(test3a$output$mean, test3a_bootstrap$output$mean, tolerance = 0.1) expect_equivalent(test3a$output$SE, test3a_bootstrap$output$SE, tolerance = 0.1) -# } +} diff --git a/man/mean.nonprobsvy.Rd b/man/mean.nonprobsvy.Rd deleted file mode 100644 index 4248b7f..0000000 --- a/man/mean.nonprobsvy.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/simple_methods.R -\name{mean.nonprobsvy} -\alias{mean.nonprobsvy} -\title{Mean values of covariates in subgroups} -\usage{ -\method{mean}{nonprobsvy}(x, nonprob) -} -\arguments{ -\item{x}{\itemize{ -\item formula -}} - -\item{nonprob}{\itemize{ -\item object of nonprobsvy class -}} -} -\value{ -A vector of estimated means of the given covariates in subgroups -} -\description{ -Mean values of covariates in subgroups -} diff --git a/man/total.nonprobsvy.Rd b/man/total.nonprobsvy.Rd deleted file mode 100644 index d509a22..0000000 --- a/man/total.nonprobsvy.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/simple_methods.R -\name{total.nonprobsvy} -\alias{total.nonprobsvy} -\title{Total values of covariates in subgroups} -\usage{ -\method{total}{nonprobsvy}(x, nonprob) -} -\arguments{ -\item{x}{\itemize{ -\item formula -}} - -\item{nonprob}{\itemize{ -\item object of nonprobsvy class -}} -} -\value{ -A vector of estimated totals of the given covariates in subgroups -} -\description{ -Total values of covariates in subgroups -} diff --git a/src/Makevars b/src/Makevars index 3d1e654..f09302a 100644 --- a/src/Makevars +++ b/src/Makevars @@ -15,5 +15,5 @@ ## set the appropriate value, possibly CXX17. # CXX_STD = CXX11 -#PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) -#PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) +PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) +PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) From 6c64da2efbd7332cb4ae73d27b3a7447e6605b00 Mon Sep 17 00:00:00 2001 From: LukaszChrostowski Date: Sun, 27 Oct 2024 20:54:04 +0100 Subject: [PATCH 23/24] improvements before CRAN submit --- .github/workflows/test-coverage.yaml | 1 + R/bias_correction_ipw.R | 6 ++- R/control_selection.R | 9 ++-- R/data_manip.R | 10 ++-- R/glm.R | 2 +- R/nn.R | 9 ++-- R/nonprobDR.R | 6 +-- R/nonprobIPW.R | 4 +- R/nonprobMI.R | 2 +- R/pmm.R | 68 ++++++++++++++-------------- R/theta_funcs.R | 1 - R/varianceMI.R | 3 +- 12 files changed, 61 insertions(+), 60 deletions(-) diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 8e2e127..998794d 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -13,6 +13,7 @@ jobs: runs-on: ubuntu-latest env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + TEST_NONPROBSVY_MULTICORE_DEVELOPER: "true" steps: - uses: actions/checkout@v3 diff --git a/R/bias_correction_ipw.R b/R/bias_correction_ipw.R index 919530f..ca10aad 100644 --- a/R/bias_correction_ipw.R +++ b/R/bias_correction_ipw.R @@ -84,8 +84,10 @@ mm <- function(X, global = nleqslv_global, xscalm = nleqslv_xscalm, jacobian = TRUE, - control = list(scalex = rep(1, length(par0)), - maxit = maxit), # TODO algorithm did not converge in maxit iterations for cloglog + control = list( + scalex = rep(1, length(par0)), + maxit = maxit + ), # TODO algorithm did not converge in maxit iterations for cloglog R = R, X = X, y = y, diff --git a/R/control_selection.R b/R/control_selection.R index a5cfd19..940a316 100644 --- a/R/control_selection.R +++ b/R/control_selection.R @@ -69,10 +69,11 @@ controlSel <- function(method = "glm.fit", # perhaps another control function fo print_level = 0, start_type = c("glm", "naive", "zero"), nleqslv_method = c("Broyden", "Newton"), - nleqslv_global = c("dbldog", "pwldog", - "cline", "qline", "gline", "hook", "none"), - nleqslv_xscalm = c("fixed","auto") - ) { + nleqslv_global = c( + "dbldog", "pwldog", + "cline", "qline", "gline", "hook", "none" + ), + nleqslv_xscalm = c("fixed", "auto")) { list( epsilon = epsilon, maxit = maxit, diff --git a/R/data_manip.R b/R/data_manip.R index a52e250..0ced926 100644 --- a/R/data_manip.R +++ b/R/data_manip.R @@ -7,9 +7,9 @@ model_frame <- function(formula, data, weights = NULL, svydesign = NULL, pop_tot outcome_name <- names(model_Frame)[1] mt <- terms(formula) # attr(model_Frame, "terms") - nons_names <- all.vars(as.formula(paste("~", paste(attr(mt, "term.labels"), collapse=" + ")))) - #nons_names <- attr(mt, "term.labels") # colnames(get_all_vars(formula, data)) names of variables of nonprobability sample terms(formula, data = data) - ##### Model frame for probability sample ##### + nons_names <- all.vars(as.formula(paste("~", paste(attr(mt, "term.labels"), collapse = " + ")))) + # nons_names <- attr(mt, "term.labels") # colnames(get_all_vars(formula, data)) names of variables of nonprobability sample terms(formula, data = data) + ##### Model frame for probability sample ##### if (outcome_name %in% colnames(svydesign$variables)) { # design_to_frame <- svydesign$variables # design_to_frame[, outcome_name][is.na(design_to_frame[, outcome_name])] <- 0 # replace NA in dependent outcome with 0 @@ -28,8 +28,8 @@ model_frame <- function(formula, data, weights = NULL, svydesign = NULL, pop_tot design_to_frame <- svydesign$variables ## terms_object <- terms(formula) - #names_rand <- all.vars(terms_object) - names_rand <- all.vars(as.formula(paste("~", paste(attr(terms_object, "term.labels"), collapse=" + ")))) + # names_rand <- all.vars(terms_object) + names_rand <- all.vars(as.formula(paste("~", paste(attr(terms_object, "term.labels"), collapse = " + ")))) ## # names_rand <- all.vars(formula[-2]) # old ## diff --git a/R/glm.R b/R/glm.R index f588957..6d0cf2e 100644 --- a/R/glm.R +++ b/R/glm.R @@ -74,7 +74,7 @@ glm_nonprobsvy <- function(outcome, formula_str <- paste(outcome_name, "~", paste(predictors, collapse = " + ")) formula <- as.formula(formula_str) model$formula <- formula - model$data <- as.data.frame(cbind(y_nons, X_nons[,-1, drop = FALSE])) + model$data <- as.data.frame(cbind(y_nons, X_nons[, -1, drop = FALSE])) colnames(model$data) <- c(outcome_name, predictors) model_out <- list( diff --git a/R/nn.R b/R/nn.R index 2e827ba..ff73a1a 100644 --- a/R/nn.R +++ b/R/nn.R @@ -100,7 +100,7 @@ nn_exact <- function(pi_ij, X_nons, X_rand, k, - #control, + # control, N) { # if (isTRUE("ppsmat" %in% class(pi_ij))) { # pi_ij <- pi_ij$pij @@ -121,7 +121,6 @@ nn_exact <- function(pi_ij, dd <- vector(mode = "numeric", length = loop_size) for (jj in 1:loop_size) { - boot_samp <- sample(1:n_nons, size = n_nons, replace = TRUE) # boot_samp <- sample(1:n_rand, size = n_rand, replace = TRUE) y_nons_b <- y[boot_samp] @@ -133,9 +132,9 @@ nn_exact <- function(pi_ij, k = k, searchtype = "standard", treetype = "kd" - #TODO:: add control options - #treetype = control$treetype, - #searchtype = control$searchtype + # TODO:: add control options + # treetype = control$treetype, + # searchtype = control$searchtype ) dd[jj] <- weighted.mean( diff --git a/R/nonprobDR.R b/R/nonprobDR.R index 7b1c709..6b8a3e1 100644 --- a/R/nonprobDR.R +++ b/R/nonprobDR.R @@ -563,7 +563,7 @@ nonprobDR <- function(selection, colnames(X) <- c("(Intercept)", colnames(Xsel)) OutcomeModel$X_nons <- SelectionModel$X_nons <- X[loc_nons, ] SelectionModel$pop_totals <- c(SelectionModel$pop_totals[1], SelectionModel$pop_totals[idx + 1]) - pop_totals <- c(pop_totals[1], pop_totals[idx+1]) + pop_totals <- c(pop_totals[1], pop_totals[idx + 1]) } else if (control_inference$bias_inf == "div") { X_outcome <- as.matrix(X[, beta_selected[-1] + 1, drop = FALSE]) Xsel <- X_selection <- as.matrix(X[, theta_selected[-1] + 1, drop = FALSE]) @@ -848,7 +848,7 @@ nonprobDR <- function(selection, if (is.null(pop_size)) pop_size <- N_nons names(pop_size) <- "pop_size" names(ys) <- all.vars(outcome_init[[2]]) - est_totals <- colSums(SelectionModel$X_nons*as.vector(weights_nons)) + est_totals <- colSums(SelectionModel$X_nons * as.vector(weights_nons)) names(prob_pop_totals) <- colnames(SelectionModel$X_nons) boot_sample <- if (control_inference$var_method == "bootstrap" & control_inference$keep_boot) { @@ -913,7 +913,7 @@ nonprobDR <- function(selection, outcome = OutcomeList, selection = SelectionList, boot_sample = boot_sample, - svydesign = if(is.null(pop_totals)) svydesign else NULL # TODO to customize if pop_totals only + svydesign = if (is.null(pop_totals)) svydesign else NULL # TODO to customize if pop_totals only ), class = c("nonprobsvy", "nonprobsvy_dr") ) diff --git a/R/nonprobIPW.R b/R/nonprobIPW.R index 6a2b95d..10913e3 100644 --- a/R/nonprobIPW.R +++ b/R/nonprobIPW.R @@ -540,7 +540,7 @@ nonprobIPW <- function(selection, if (is.null(pop_size)) pop_size <- N # estimated pop_size names(pop_size) <- "pop_size" names(ys) <- all.vars(outcome_init[[2]]) - est_totals <- colSums(X_nons*as.vector(weights_nons)) + est_totals <- colSums(X_nons * as.vector(weights_nons)) names(prob_pop_totals) <- colnames(X_nons) boot_sample <- if (control_inference$var_method == "bootstrap" & control_inference$keep_boot) { @@ -605,7 +605,7 @@ nonprobIPW <- function(selection, outcome = NULL, selection = SelectionList, boot_sample = boot_sample, - svydesign = if(is.null(pop_totals)) svydesign else NULL # TODO to customize if pop_totals only + svydesign = if (is.null(pop_totals)) svydesign else NULL # TODO to customize if pop_totals only ), class = c("nonprobsvy", "nonprobsvy_ipw") ) diff --git a/R/nonprobMI.R b/R/nonprobMI.R index bb45c4e..acb18aa 100644 --- a/R/nonprobMI.R +++ b/R/nonprobMI.R @@ -494,7 +494,7 @@ nonprobMI <- function(outcome, outcome = OutcomeList, selection = NULL, boot_sample = boot_sample, - svydesign = if(is.null(pop_totals)) svydesign else NULL # TODO to customize if pop_totals only + svydesign = if (is.null(pop_totals)) svydesign else NULL # TODO to customize if pop_totals only ), class = c("nonprobsvy", "nonprobsvy_mi") ) diff --git a/R/pmm.R b/R/pmm.R index f486ed2..e26782d 100644 --- a/R/pmm.R +++ b/R/pmm.R @@ -78,26 +78,26 @@ pmm_nonprobsvy <- function(outcome, ) y_rand_pred <- apply(model_rand$nn.idx, 1, - FUN = \(x) mean(y_nons[x]) - # FUN=\(x) mean(sample_nonprob$short_[x]) + FUN = \(x) mean(y_nons[x]) + # FUN=\(x) mean(sample_nonprob$short_[x]) ) switch(control$pmm_weights, - "none" = { - y_rand_pred <- apply(model_rand$nn.idx, 1, - FUN = \(x) mean(y_nons[x]) - # FUN=\(x) mean(sample_nonprob$short_[x]) - ) - }, - "prop_dist" = { - # TODO:: these weights will need to be saved for variance estimation - y_rand_pred <- sapply(1:NROW(model_rand$nn.idx), - FUN = \(x) weighted.mean(y_nons[model_rand$nn.idx[x, ]], - w = 1 / model_rand$nn.dist[x, ] - ) - # FUN=\(x) mean(sample_nonprob$short_[x]) - ) - } + "none" = { + y_rand_pred <- apply(model_rand$nn.idx, 1, + FUN = \(x) mean(y_nons[x]) + # FUN=\(x) mean(sample_nonprob$short_[x]) + ) + }, + "prop_dist" = { + # TODO:: these weights will need to be saved for variance estimation + y_rand_pred <- sapply(1:NROW(model_rand$nn.idx), + FUN = \(x) weighted.mean(y_nons[model_rand$nn.idx[x, ]], + w = 1 / model_rand$nn.dist[x, ] + ) + # FUN=\(x) mean(sample_nonprob$short_[x]) + ) + } ) } else { # I'm not touching this @@ -122,21 +122,21 @@ pmm_nonprobsvy <- function(outcome, ) switch(control$pmm_weights, - "none" = { - y_rand_pred <- apply(model_rand$nn.idx, 1, - FUN = \(x) mean(y_nons[x]) - # FUN=\(x) mean(sample_nonprob$short_[x]) - ) - }, - "prop_dist" = { - # TODO:: these weights will need to be saved for variance estimation - y_rand_pred <- sapply(1:NROW(model_rand$nn.idx), - FUN = \(x) weighted.mean(y_nons[model_rand$nn.idx[x, ]], - w = 1 / model_rand$nn.dist[x, ] - ) - # FUN=\(x) mean(sample_nonprob$short_[x]) - ) - } + "none" = { + y_rand_pred <- apply(model_rand$nn.idx, 1, + FUN = \(x) mean(y_nons[x]) + # FUN=\(x) mean(sample_nonprob$short_[x]) + ) + }, + "prop_dist" = { + # TODO:: these weights will need to be saved for variance estimation + y_rand_pred <- sapply(1:NROW(model_rand$nn.idx), + FUN = \(x) weighted.mean(y_nons[model_rand$nn.idx[x, ]], + w = 1 / model_rand$nn.dist[x, ] + ) + # FUN=\(x) mean(sample_nonprob$short_[x]) + ) + } ) } else { # I'm not touching this @@ -173,8 +173,8 @@ pmm_exact <- function(pi_ij, n_nons, y, pmm_reg_engine, - #stats, #why is this here? - #glm, #why is this here? + # stats, #why is this here? + # glm, #why is this here? model_obj, svydesign, predictive_match, diff --git a/R/theta_funcs.R b/R/theta_funcs.R index 424b0e3..70ba214 100644 --- a/R/theta_funcs.R +++ b/R/theta_funcs.R @@ -93,7 +93,6 @@ theta_h_estimation <- function(R, start = NULL, pop_totals = NULL, pop_means = NULL) { - p <- ncol(X) # if (is.null(pop_totals) & is.null(pop_means)) { # if (is.null(start)) { diff --git a/R/varianceMI.R b/R/varianceMI.R index dd01cd5..4675c38 100644 --- a/R/varianceMI.R +++ b/R/varianceMI.R @@ -48,11 +48,10 @@ internal_varMI <- function(svydesign, X_rand = X_rand, k = k, # TODO:: add control here - #control = control + # control = control N = N ) } - } else if (method == "glm") { # TODO add variance for count binary outcome variable control_outcome$method beta <- parameters[, 1] From 915c5f38a12bc1540bf5b66fd569b9123685bb47 Mon Sep 17 00:00:00 2001 From: LukaszChrostowski Date: Tue, 12 Nov 2024 19:27:34 +0100 Subject: [PATCH 24/24] small changes in documentation before CRAN submission --- NEWS.md | 3 ++ R/main_function_documentation.R | 69 +++++++++++++++++++-------------- R/summary.R | 2 +- man/nonprob.Rd | 69 +++++++++++++++++++-------------- man/summary.nonprobsvy.Rd | 2 +- 5 files changed, 83 insertions(+), 62 deletions(-) diff --git a/NEWS.md b/NEWS.md index c11f086..868c337 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,6 +9,9 @@ - add estimation of exact standard error for k-nearest neighbor estimator. - add breaking change to `controlOut` function by switching values for `predictive_match` argument. From now on, the `predictive_match = 1` means $\hat{y}-\hat{y}$ in predictive mean matching imputation and `predictive_match = 2` corresponds to $\hat{y}-y$ matching. - implement `div` option when variable selection (more in documentation) for doubly robust estimation. + - add more insights to `nonprob` output such as gradient, hessian and jacobian derived from IPW estimation for `mle` and `gee` methods when `IPW` or `DR` model executed. + - add estimated inclusion probabilities and its derivatives for probability and non-probability samples to `nonprob` output when `IPW` or `DR` model executed. + - add `model_frame` matrix data from probability sample used for mass imputation to `nonprob` when `MI` or `DR` model executed. ## nonprobsvy 0.1.0 diff --git a/R/main_function_documentation.R b/R/main_function_documentation.R index 3fcb96d..c16d6ce 100644 --- a/R/main_function_documentation.R +++ b/R/main_function_documentation.R @@ -8,9 +8,9 @@ NULL #' The function allows you to estimate the population mean with access to a reference probability sample, as well as sums and means of covariates. #' #' The package implements state-of-the-art approaches recently proposed in the literature: Chen et al. (2020), -#' Yang et al. (2020), Wu (2022) and use the [Lumley 2004](https://CRAN.R-project.org/package=survey) `survey` package for inference. +#' Yang et al. (2020), Wu (2022) and uses the [Lumley 2004](https://CRAN.R-project.org/package=survey) `survey` package for inference. #' -#' It provides propensity score weighting (e.g. with calibration constraints), mass imputation (e.g. nearest neighbor) and +#' It provides propensity score weighting (e.g. with calibration constraints), mass imputation (e.g. nearest neighbour) and #' doubly robust estimators that take into account minimisation of the asymptotic bias of the population mean estimators or #' variable selection. #' The package uses `survey` package functionality when a probability sample is available. @@ -24,19 +24,19 @@ NULL #' @param pop_totals an optional `named vector` with population totals of the covariates. #' @param pop_means an optional `named vector` with population means of the covariates. #' @param pop_size an optional `double` with population size. -#' @param method_selection a `character` with method for propensity scores estimation -#' @param method_outcome a `character` with method for response variable estimation +#' @param method_selection a `character` with method for propensity scores estimation. +#' @param method_outcome a `character` with method for response variable estimation. #' @param family_outcome a `character` string describing the error distribution and link function to be used in the model. Default is "gaussian". Currently supports: gaussian with identity link, poisson and binomial. #' @param subset an optional `vector` specifying a subset of observations to be used in the fitting process. #' @param strata an optional `vector` specifying strata. -#' @param weights an optional `vector` of prior weights to be used in the fitting process. Should be NULL or a numeric vector. It is assumed that this vector contains frequency or analytic weights +#' @param weights an optional `vector` of prior weights to be used in the fitting process. Should be NULL or a numeric vector. It is assumed that this vector contains frequency or analytic weights. #' @param na_action a function which indicates what should happen when the data contain `NAs`. -#' @param control_selection a `list` indicating parameters to use in fitting selection model for propensity scores -#' @param control_outcome a `list` indicating parameters to use in fitting model for outcome variable -#' @param control_inference a `list` indicating parameters to use in inference based on probability and non-probability samples, contains parameters such as estimation method or variance method -#' @param start_selection an optional `vector` with starting values for the parameters of the selection equation -#' @param start_outcome an optional `vector` with starting values for the parameters of the outcome equation -#' @param verbose verbose, numeric +#' @param control_selection a `list` indicating parameters to use in fitting selection model for propensity scores. +#' @param control_outcome a `list` indicating parameters to use in fitting model for outcome variable. +#' @param control_inference a `list` indicating parameters to use in inference based on probability and non-probability samples, contains parameters such as estimation method or variance method. +#' @param start_selection an optional `vector` with starting values for the parameters of the selection equation. +#' @param start_outcome an optional `vector` with starting values for the parameters of the outcome equation. +#' @param verbose verbose, numeric. #' @param x Logical value indicating whether to return model matrix of covariates as a part of output. #' @param y Logical value indicating whether to return vector of outcome variable as a part of output. #' @param se Logical value indicating whether to calculate and return standard error of estimated mean. @@ -188,25 +188,26 @@ NULL #' \item{\code{control} -- list of control functions.} #' \item{\code{output} -- output of the model with information on the estimated population mean and standard errors.} #' \item{\code{SE} -- standard error of the estimator of the population mean, divided into errors from probability and non-probability samples.} -#' \item{\code{confidence_interval} -- confidence interval of population mean estimator} -#' \item{\code{nonprob_size} -- size of non-probability sample} -#' \item{\code{prob_size} -- size of probability sample} -#' \item{\code{pop_size} -- estimated population size derived from estimated weights (non-probability sample) or known design weights (probability sample)} +#' \item{\code{confidence_interval} -- confidence interval of population mean estimator.} +#' \item{\code{nonprob_size} -- size of non-probability sample.} +#' \item{\code{prob_size} -- size of probability sample.} +#' \item{\code{pop_size} -- estimated population size derived from estimated weights (non-probability sample) or known design weights (probability sample).} #' \item{\code{pop_totals} -- the total values of the auxiliary variables derived from a probability sample or vector of total/mean values.} #' \item{\code{outcome} -- list containing information about the fitting of the mass imputation model, in the case of regression model the object containing the list returned by #' [stats::glm()], in the case of the nearest neighbour imputation the object containing list returned by [RANN::nn2()]. If `bias_correction` in [controlInf()] is set to `TRUE`, the estimation is based on #' the joint estimating equations for the `selection` and `outcome` model and therefore, the list is different from the one returned by the [stats::glm()] function and contains elements such as #' \itemize{ -#' \item{\code{coefficients} -- estimated coefficients of the regression model} -#' \item{\code{std_err} -- standard errors of the estimated coefficients} -#' \item{\code{residuals} -- The response residuals} -#' \item{\code{variance_covariance} -- The variance-covariance matrix of the coefficient estimates} -#' \item{\code{df_residual} -- The degrees of freedom for residuals} -#' \item{\code{family} -- specifies the error distribution and link function to be used in the model} -#' \item{\code{fitted.values} -- The predicted values of the response variable based on the fitted model} -#' \item{\code{linear.predictors} -- The linear fit on link scale} -#' \item{\code{X} -- The design matrix} -#' \item{\code{method} -- set on `glm`, since the regression method} +#' \item{\code{coefficients} -- estimated coefficients of the regression model.} +#' \item{\code{std_err} -- standard errors of the estimated coefficients.} +#' \item{\code{residuals} -- The response residuals.} +#' \item{\code{variance_covariance} -- The variance-covariance matrix of the coefficient estimates.} +#' \item{\code{df_residual} -- The degrees of freedom for residuals.} +#' \item{\code{family} -- specifies the error distribution and link function to be used in the model.} +#' \item{\code{fitted.values} -- The predicted values of the response variable based on the fitted model.} +#' \item{\code{linear.predictors} -- The linear fit on link scale.} +#' \item{\code{X} -- The design matrix.} +#' \item{\code{method} -- set on `glm`, since the regression method.} +#' \item{\code{model_frame} -- Matrix of data from probability sample used for mass imputation.} #' } #' } #' In addition, if the variable selection model for the outcome variable is fitting, the list includes the @@ -215,22 +216,30 @@ NULL #' } #' \item{\code{selection} -- list containing information about fitting of propensity score model, such as #' \itemize{ -#' \item{\code{coefficients} -- a named vector of coefficients} -#' \item{\code{std_err} -- standard errors of the estimated model coefficients} -#' \item{\code{residuals} -- the response residuals} -#' \item{\code{variance} -- the root mean square error} +#' \item{\code{coefficients} -- a named vector of coefficients.} +#' \item{\code{std_err} -- standard errors of the estimated model coefficients.} +#' \item{\code{residuals} -- the response residuals.} +#' \item{\code{variance} -- the root mean square error.} #' \item{\code{fitted_values} -- the fitted mean values, obtained by transforming the linear predictors by the inverse of the link function.} #' \item{\code{link} -- the `link` object used.} #' \item{\code{linear_predictors} -- the linear fit on link scale.} #' \item{\code{aic} -- A version of Akaike's An Information Criterion, minus twice the maximized log-likelihood plus twice the number of parameters.} #' \item{\code{weights} -- vector of estimated weights for non-probability sample.} #' \item{\code{prior.weights} -- the weights initially supplied, a vector of 1s if none were.} -#' \item{\code{est_totals} -- the estimated total values of auxiliary variables derived from a non-probability sample.} +#' \item{\code{est_totals} -- the estimated total values of auxiliary variables derived from a non-probability sample}. #' \item{\code{formula} -- the formula supplied.} #' \item{\code{df_residual} -- the residual degrees of freedom.} #' \item{\code{log_likelihood} -- value of log-likelihood function if `mle` method, in the other case `NA`.} #' \item{\code{cve} -- the error for each value of the `lambda`, averaged across the cross-validation folds for the variable selection model #' when the propensity score model is fitting. Returned only if selection of variables for the model is used.} +#' \item{\code{method_selection} -- Link function, e.g. `logit`, `cloglog` or `probit`.} +#' \item{\code{hessian} -- Hessian Gradient of the log-likelihood function from `mle` method}. +#' \item{\code{gradient} -- Gradient of the log-likelihood function from `mle` method.} +#' \item{\code{method} -- An estimation method for selection model, e.g. `mle` or `gee`.} +#' \item{\code{prob_der} -- Derivative of the inclusion probability function for units in a non--probability sample.} +#' \item{\code{prob_rand} -- Inclusion probabilities for unit from a probabiliy sample from `svydesign` object.} +#' \item{\code{prob_rand_est} -- Inclusion probabilites to a non--probabiliy sample for unit from probability sample.} +#' \item{\code{prob_rand_est_der} -- Derivative of the inclusion probabilites to a non--probabiliy sample for unit from probability sample.} #' } #' } #' \item{\code{stat} -- matrix of the estimated population means in each bootstrap iteration. diff --git a/R/summary.R b/R/summary.R index f80ea35..1d3b9ec 100644 --- a/R/summary.R +++ b/R/summary.R @@ -15,7 +15,7 @@ #' \item \code{call} -- A call which created \code{object}. #' \item \code{pop_total} -- A list containing information about the estimated population mean, its standard error and confidence interval. #' \item \code{sample_size} -- The size of the samples used in the model. -#' \item \code{population_size} -- The estimated size of the population from which the nonoprobability sample was drawn. +#' \item \code{population_size} -- The estimated size of the population from which the non--probability sample was drawn. #' \item \code{test} -- Type of statistical test performed. #' \item \code{control} -- A List of control parameters used in fitting the model. #' \item \code{model} -- A descriptive name of the model used, e.g., "Doubly-Robust", "Inverse probability weighted", or "Mass Imputation". diff --git a/man/nonprob.Rd b/man/nonprob.Rd index b9cb25c..358e7d0 100644 --- a/man/nonprob.Rd +++ b/man/nonprob.Rd @@ -49,9 +49,9 @@ nonprob( \item{pop_size}{an optional \code{double} with population size.} -\item{method_selection}{a \code{character} with method for propensity scores estimation} +\item{method_selection}{a \code{character} with method for propensity scores estimation.} -\item{method_outcome}{a \code{character} with method for response variable estimation} +\item{method_outcome}{a \code{character} with method for response variable estimation.} \item{family_outcome}{a \code{character} string describing the error distribution and link function to be used in the model. Default is "gaussian". Currently supports: gaussian with identity link, poisson and binomial.} @@ -59,21 +59,21 @@ nonprob( \item{strata}{an optional \code{vector} specifying strata.} -\item{weights}{an optional \code{vector} of prior weights to be used in the fitting process. Should be NULL or a numeric vector. It is assumed that this vector contains frequency or analytic weights} +\item{weights}{an optional \code{vector} of prior weights to be used in the fitting process. Should be NULL or a numeric vector. It is assumed that this vector contains frequency or analytic weights.} \item{na_action}{a function which indicates what should happen when the data contain \code{NAs}.} -\item{control_selection}{a \code{list} indicating parameters to use in fitting selection model for propensity scores} +\item{control_selection}{a \code{list} indicating parameters to use in fitting selection model for propensity scores.} -\item{control_outcome}{a \code{list} indicating parameters to use in fitting model for outcome variable} +\item{control_outcome}{a \code{list} indicating parameters to use in fitting model for outcome variable.} -\item{control_inference}{a \code{list} indicating parameters to use in inference based on probability and non-probability samples, contains parameters such as estimation method or variance method} +\item{control_inference}{a \code{list} indicating parameters to use in inference based on probability and non-probability samples, contains parameters such as estimation method or variance method.} -\item{start_selection}{an optional \code{vector} with starting values for the parameters of the selection equation} +\item{start_selection}{an optional \code{vector} with starting values for the parameters of the selection equation.} -\item{start_outcome}{an optional \code{vector} with starting values for the parameters of the outcome equation} +\item{start_outcome}{an optional \code{vector} with starting values for the parameters of the outcome equation.} -\item{verbose}{verbose, numeric} +\item{verbose}{verbose, numeric.} \item{x}{Logical value indicating whether to return model matrix of covariates as a part of output.} @@ -97,25 +97,26 @@ with type \code{list} containing:\cr \item{\code{control} -- list of control functions.} \item{\code{output} -- output of the model with information on the estimated population mean and standard errors.} \item{\code{SE} -- standard error of the estimator of the population mean, divided into errors from probability and non-probability samples.} -\item{\code{confidence_interval} -- confidence interval of population mean estimator} -\item{\code{nonprob_size} -- size of non-probability sample} -\item{\code{prob_size} -- size of probability sample} -\item{\code{pop_size} -- estimated population size derived from estimated weights (non-probability sample) or known design weights (probability sample)} +\item{\code{confidence_interval} -- confidence interval of population mean estimator.} +\item{\code{nonprob_size} -- size of non-probability sample.} +\item{\code{prob_size} -- size of probability sample.} +\item{\code{pop_size} -- estimated population size derived from estimated weights (non-probability sample) or known design weights (probability sample).} \item{\code{pop_totals} -- the total values of the auxiliary variables derived from a probability sample or vector of total/mean values.} \item{\code{outcome} -- list containing information about the fitting of the mass imputation model, in the case of regression model the object containing the list returned by \code{\link[stats:glm]{stats::glm()}}, in the case of the nearest neighbour imputation the object containing list returned by \code{\link[RANN:nn2]{RANN::nn2()}}. If \code{bias_correction} in \code{\link[=controlInf]{controlInf()}} is set to \code{TRUE}, the estimation is based on the joint estimating equations for the \code{selection} and \code{outcome} model and therefore, the list is different from the one returned by the \code{\link[stats:glm]{stats::glm()}} function and contains elements such as \itemize{ -\item{\code{coefficients} -- estimated coefficients of the regression model} -\item{\code{std_err} -- standard errors of the estimated coefficients} -\item{\code{residuals} -- The response residuals} -\item{\code{variance_covariance} -- The variance-covariance matrix of the coefficient estimates} -\item{\code{df_residual} -- The degrees of freedom for residuals} -\item{\code{family} -- specifies the error distribution and link function to be used in the model} -\item{\code{fitted.values} -- The predicted values of the response variable based on the fitted model} -\item{\code{linear.predictors} -- The linear fit on link scale} -\item{\code{X} -- The design matrix} -\item{\code{method} -- set on \code{glm}, since the regression method} +\item{\code{coefficients} -- estimated coefficients of the regression model.} +\item{\code{std_err} -- standard errors of the estimated coefficients.} +\item{\code{residuals} -- The response residuals.} +\item{\code{variance_covariance} -- The variance-covariance matrix of the coefficient estimates.} +\item{\code{df_residual} -- The degrees of freedom for residuals.} +\item{\code{family} -- specifies the error distribution and link function to be used in the model.} +\item{\code{fitted.values} -- The predicted values of the response variable based on the fitted model.} +\item{\code{linear.predictors} -- The linear fit on link scale.} +\item{\code{X} -- The design matrix.} +\item{\code{method} -- set on \code{glm}, since the regression method.} +\item{\code{model_frame} -- Matrix of data from probability sample used for mass imputation.} } } In addition, if the variable selection model for the outcome variable is fitting, the list includes the @@ -124,22 +125,30 @@ In addition, if the variable selection model for the outcome variable is fitting } \item{\code{selection} -- list containing information about fitting of propensity score model, such as \itemize{ -\item{\code{coefficients} -- a named vector of coefficients} -\item{\code{std_err} -- standard errors of the estimated model coefficients} -\item{\code{residuals} -- the response residuals} -\item{\code{variance} -- the root mean square error} +\item{\code{coefficients} -- a named vector of coefficients.} +\item{\code{std_err} -- standard errors of the estimated model coefficients.} +\item{\code{residuals} -- the response residuals.} +\item{\code{variance} -- the root mean square error.} \item{\code{fitted_values} -- the fitted mean values, obtained by transforming the linear predictors by the inverse of the link function.} \item{\code{link} -- the \code{link} object used.} \item{\code{linear_predictors} -- the linear fit on link scale.} \item{\code{aic} -- A version of Akaike's An Information Criterion, minus twice the maximized log-likelihood plus twice the number of parameters.} \item{\code{weights} -- vector of estimated weights for non-probability sample.} \item{\code{prior.weights} -- the weights initially supplied, a vector of 1s if none were.} -\item{\code{est_totals} -- the estimated total values of auxiliary variables derived from a non-probability sample.} +\item{\code{est_totals} -- the estimated total values of auxiliary variables derived from a non-probability sample}. \item{\code{formula} -- the formula supplied.} \item{\code{df_residual} -- the residual degrees of freedom.} \item{\code{log_likelihood} -- value of log-likelihood function if \code{mle} method, in the other case \code{NA}.} \item{\code{cve} -- the error for each value of the \code{lambda}, averaged across the cross-validation folds for the variable selection model when the propensity score model is fitting. Returned only if selection of variables for the model is used.} +\item{\code{method_selection} -- Link function, e.g. \code{logit}, \code{cloglog} or \code{probit}.} +\item{\code{hessian} -- Hessian Gradient of the log-likelihood function from \code{mle} method}. +\item{\code{gradient} -- Gradient of the log-likelihood function from \code{mle} method.} +\item{\code{method} -- An estimation method for selection model, e.g. \code{mle} or \code{gee}.} +\item{\code{prob_der} -- Derivative of the inclusion probability function for units in a non--probability sample.} +\item{\code{prob_rand} -- Inclusion probabilities for unit from a probabiliy sample from \code{svydesign} object.} +\item{\code{prob_rand_est} -- Inclusion probabilites to a non--probabiliy sample for unit from probability sample.} +\item{\code{prob_rand_est_der} -- Derivative of the inclusion probabilites to a non--probabiliy sample for unit from probability sample.} } } \item{\code{stat} -- matrix of the estimated population means in each bootstrap iteration. @@ -152,9 +161,9 @@ Returned only if a bootstrap method is used to estimate the variance and \code{k The function allows you to estimate the population mean with access to a reference probability sample, as well as sums and means of covariates. The package implements state-of-the-art approaches recently proposed in the literature: Chen et al. (2020), -Yang et al. (2020), Wu (2022) and use the \href{https://CRAN.R-project.org/package=survey}{Lumley 2004} \code{survey} package for inference. +Yang et al. (2020), Wu (2022) and uses the \href{https://CRAN.R-project.org/package=survey}{Lumley 2004} \code{survey} package for inference. -It provides propensity score weighting (e.g. with calibration constraints), mass imputation (e.g. nearest neighbor) and +It provides propensity score weighting (e.g. with calibration constraints), mass imputation (e.g. nearest neighbour) and doubly robust estimators that take into account minimisation of the asymptotic bias of the population mean estimators or variable selection. The package uses \code{survey} package functionality when a probability sample is available. diff --git a/man/summary.nonprobsvy.Rd b/man/summary.nonprobsvy.Rd index c138af7..b47af02 100644 --- a/man/summary.nonprobsvy.Rd +++ b/man/summary.nonprobsvy.Rd @@ -27,7 +27,7 @@ An object of \code{summary_nonprobsvy} class containing: \item \code{call} -- A call which created \code{object}. \item \code{pop_total} -- A list containing information about the estimated population mean, its standard error and confidence interval. \item \code{sample_size} -- The size of the samples used in the model. -\item \code{population_size} -- The estimated size of the population from which the nonoprobability sample was drawn. +\item \code{population_size} -- The estimated size of the population from which the non--probability sample was drawn. \item \code{test} -- Type of statistical test performed. \item \code{control} -- A List of control parameters used in fitting the model. \item \code{model} -- A descriptive name of the model used, e.g., "Doubly-Robust", "Inverse probability weighted", or "Mass Imputation".