diff --git a/.gitignore b/.gitignore
index 85be987..0d8333f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,4 +10,5 @@ playground
 src/*.o
 src/*.so
 src/Makevars
+src/*.dll
 docs
diff --git a/DESCRIPTION b/DESCRIPTION
index 99a2fc2..7306ef7 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: nonprobsvy
 Type: Package
-Title: Package for Inference Based on Nonprobability Samples
+Title: Package for Inference Based on Non-Probability Samples
 Version: 0.1.0
 Authors@R:
     c(person(given = "Łukasz",
@@ -16,12 +16,7 @@ Authors@R:
            family = "Chlebicki",
            role = "ctb",
            email = "piochl@st.amu.edu.pl"))
-Description: An R package for statistical inference with non-probability samples when auxiliary information
-    from external sources such as probability samples or population totals or means is available. Details can be found
-    in: Wu et al. (2020) , Kim et al. (2021) ,
-    Wu et al. (2023) ,
-    Kim et al. (2021) ,
-    Kim et al. (2020) .
+Description: An R package for statistical inference with non-probability samples when auxiliary information from external sources such as probability samples or population totals or means is available. Details can be found in: Wu et al. (2020) , Kim et al. (2021) , Wu et al. (2023) , Kim et al. (2021) , Kim et al. (2020) .
 License: MIT + file LICENSE
 Encoding: UTF-8
 LazyData: true
@@ -35,7 +30,8 @@ Suggests:
     covr,
     sampling,
     spelling
-Depends: survey
+Depends:
+    survey
 Imports:
     maxLik,
     stats,
@@ -44,12 +40,13 @@ Imports:
     ncvreg,
     mathjaxr,
     RANN,
-    Rcpp,
+    Rcpp (>= 1.0.12),
     nleqslv,
-    doParallel,
+    doSNOW,
+    progress,
     foreach,
     parallel
 LinkingTo:
-    RcppArmadillo,
-    Rcpp
+    Rcpp,
+    RcppArmadillo
 Language: en-US
diff --git a/NAMESPACE b/NAMESPACE
index a7283e8..f1667ba 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -30,7 +30,8 @@ importFrom(Matrix,Diagonal)
 importFrom(Matrix,Matrix)
 importFrom(RANN,nn2)
 importFrom(Rcpp,evalCpp)
-importFrom(doParallel,registerDoParallel)
+importFrom(Rcpp,sourceCpp)
+importFrom(doSNOW,registerDoSNOW)
 importFrom(foreach,"%dopar%")
 importFrom(foreach,foreach)
 importFrom(maxLik,maxLik)
@@ -47,6 +48,7 @@ importFrom(stats,confint)
 importFrom(stats,contrasts)
 importFrom(stats,cooks.distance)
 importFrom(stats,cor)
+importFrom(stats,cov)
 importFrom(stats,delete.response)
 importFrom(stats,deviance)
 importFrom(stats,dnorm)
@@ -54,12 +56,15 @@ importFrom(stats,get_all_vars)
 importFrom(stats,glm.fit)
 importFrom(stats,hatvalues)
 importFrom(stats,lm.fit)
+importFrom(stats,loess)
+importFrom(stats,loess.control)
 importFrom(stats,logLik)
 importFrom(stats,model.frame)
 importFrom(stats,model.matrix)
 importFrom(stats,model.response)
 importFrom(stats,nobs)
 importFrom(stats,pnorm)
+importFrom(stats,predict)
 importFrom(stats,predict.glm)
 importFrom(stats,printCoefmat)
 importFrom(stats,pt)
@@ -75,8 +80,12 @@ importFrom(stats,summary.glm)
 importFrom(stats,terms)
 importFrom(stats,uniroot)
 importFrom(stats,update)
+importFrom(stats,var)
 importFrom(stats,vcov)
 importFrom(stats,weighted.mean)
 importFrom(survey,as.svrepdesign)
+importFrom(survey,svymean)
 importFrom(survey,svyrecvar)
+importFrom(utils,setTxtProgressBar)
+importFrom(utils,txtProgressBar)
-useDynLib(nonprobsvy)
+useDynLib(nonprobsvy, .registration = TRUE)
diff --git a/NEWS.md b/NEWS.md
index f93200b..8440f32 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -6,7 +6,7 @@
 - implemented population mean estimation using doubly robust, inverse probability weighting and mass imputation methods
 - implemented inverse probability weighting models with Maximum Likelihood Estimation and Generalized Estimating Equations methods
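Note on the new `utils` imports above: `txtProgressBar`/`setTxtProgressBar` back the `verbose` option of the serial bootstrap loops added later in this changeset. A minimal, self-contained sketch of that pattern (the `Sys.sleep()` stands in for one bootstrap replicate):

```r
# Verbose-progress pattern used by the serial bootstrap loops (bootDR, bootIPW)
num_boot <- 20
pb <- utils::txtProgressBar(min = 0, max = num_boot, style = 3)
for (k in seq_len(num_boot)) {
  Sys.sleep(0.01)                  # stand-in for one bootstrap replicate
  utils::setTxtProgressBar(pb, k)  # advance the bar to iteration k
}
close(pb)
```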
with `logit`, `complementary log-log` and `probit` link functions. -- implemented `generalized linear models` and `nearest neighbours` methods for Mass Imputation +- implemented `generalized linear models`, `nearest neighbours` and `predictive mean matching` methods for Mass Imputation - implemented estimation methods when vector of population means/totals is available - implemented variables selection with `SCAD`, `LASSO` and `MCP` penalization equations - implemented `analytic` and `bootstrap` (with parallel computation) variance for described estimators diff --git a/R/EstimationMethods.R b/R/EstimationMethods.R deleted file mode 100644 index 9043ec6..0000000 --- a/R/EstimationMethods.R +++ /dev/null @@ -1,676 +0,0 @@ -# Internal functions for propensity score estimation models - -# Object with output parameters for Maximum likelihood Estimation for propensity scores -mle <- function(...) { - estimation_model <- function(model, method_selection, ...) { - method <- model$method - dinv_link <- method$make_link_inv_der - maxLik_nons_obj <- model$maxLik_nons_obj - log_likelihood <- maxLik_nons_obj$log_l # maximum of the loglikelihood function - theta_hat <- model$theta - - ps_nons <- model$ps - ps_nons_der <- model$ps_der - est_ps_rand <- model$ps_rand - est_ps_rand_der <- model$ps_rand_der - hess <- maxLik_nons_obj$hess - grad <- maxLik_nons_obj$grad - var_cov1 <- model$var_cov1 - var_cov2 <- model$var_cov2 - df_residual <- model$df_residual - variance_covariance <- solve(-hess) # MASS::ginv # variance-covariance matrix of estimated parameters - eta <- c(model$eta_rand, model$eta_nons) - aic <- 2 * (length(theta_hat) - log_likelihood) - residuals <- model$residuals - variance <- as.vector(model$variance) - deviance <- model$deviance - deviance_null <- model$deviance_null - - list( - theta_hat = theta_hat, - grad = grad, - hess = hess, - var_cov1 = var_cov1, - var_cov2 = var_cov2, - ps_nons = ps_nons, - est_ps_rand = est_ps_rand, - ps_nons_der = ps_nons_der, - est_ps_rand_der = est_ps_rand_der, - variance_covariance = variance_covariance, - log_likelihood = log_likelihood, - df_residual = df_residual, - eta = eta, - aic = aic, - variance = variance, - residuals = residuals, - method = method - ) - } - - make_t <- function(X, ps, psd, b, y_rand, y_nons, h, N, method_selection, weights, weights_sum) { - method <- get_method(method_selection) - t <- method$t_vec( - X = X, - ps = ps, - psd = psd, - b = b, - y_rand = y_rand, - y_nons = y_nons, - N = N, - weights = weights - ) - t - } - - make_var_nonprob <- function(ps, psd, y, y_pred, h_n, X, b, N, h, method_selection, weights = weights, weights_sum, pop_totals = NULL) { - method <- get_method(method_selection) - var_nonprob <- method$var_nonprob( - ps = ps, - psd = psd, - y = y, - y_pred = y_pred, - h_n = h_n, - X = X, - b = b, - N = N, - weights = weights - ) - as.numeric(var_nonprob) - } - - model_selection <- function(X, - X_nons, - X_rand, - weights, - weights_rand, - R, - method_selection, - optim_method, - h = h, - est_method, - maxit, - control_selection, - start, - varcov = FALSE, - ...) 
{ - method_selection_function <- paste(method_selection, "_model_nonprobsvy", sep = "") - method <- get_method(method = method_selection_function) - max_lik <- method$make_max_lik # function for propensity score estimation - loglike <- method$make_log_like - gradient <- method$make_gradient - hessian <- method$make_hessian - inv_link <- method$make_link_inv - dinv_link <- method$make_link_inv_der - - # initial values for propensity score estimation - if (is.null(start)) { - if (control_selection$start_type == "glm") { - start <- start_fit( - X = X, - R = R, - weights = weights, - weights_rand = weights_rand, - method_selection = method_selection - ) - } else if (control_selection$start_type == "naive") { - intercept_start <- suppressWarnings(max_lik( - X_nons = X_nons[, 1, drop = FALSE], - X_rand = X_rand[, 1, drop = FALSE], - weights = weights, - weights_rand = weights_rand, - start = 0, - control = control_selection - )$theta_hat) - start <- c(intercept_start, rep(0, ncol(X_nons) - 1)) - } - } - - df_reduced <- nrow(X) - length(start) - - maxLik_nons_obj <- max_lik( - X_nons = X_nons, - X_rand = X_rand, - weights = weights, - weights_rand = weights_rand, - start = start, - control = control_selection - ) - - #### deviance - # # null model - # max_lik_null <- max_lik(X_nons = rep(1, nrow(X_nons)), - # X_rand = rep(1, nrow(X_rand)), - # weights = weights, - # weights_rand = weights_rand, - # start = 0, - # control = control_selection) - # log_lik_null <- max_lik_null$log_l - # - # # saturated model - # sats <- factor(1:length(R)) - # df_sat <- data.frame(R = R, sats = sats) - # mod_sat <- model.frame(R ~ sats, data = df_sat) - # X_sat <- model.matrix(mod_sat) - # max_lik_sat <- max_lik(X_nons = X_sat[which(R == 1), ,drop = FALSE], - # X_rand = X_sat[which(R == 0), ,drop = FALSE], - # weights = weights, - # weights_rand = weights_rand, - # start = rep(0, ncol(X_sat)), - # control = control_selection) - # log_lik_sat <- max_lik_sat$log_l - # - # # null deviance - # deviance_null <- log_lik_sat - log_lik_null - # - # # deviance - # deviance <- log_lik_sat - maxLik_nons_obj$log_l - - theta <- maxLik_nons_obj$theta_hat - eta_nons <- theta %*% t(X_nons) - eta_rand <- theta %*% t(X_rand) - - ps_nons <- inv_link(eta_nons) - est_ps_rand <- inv_link(eta_rand) - - ps_nons_der <- dinv_link(eta_nons) - est_ps_rand_der <- dinv_link(eta_rand) - - resids <- R - c(est_ps_rand, ps_nons) - - variance <- (t(resids) %*% resids) / df_reduced - - list( - maxLik_nons_obj = maxLik_nons_obj, - theta = theta, - ps = ps_nons, - ps_der = ps_nons_der, - ps_rand = est_ps_rand, - ps_rand_der = est_ps_rand_der, - var_cov1 = ifelse(varcov, method$variance_covariance1, "No variance-covariance matrix"), - var_cov2 = ifelse(varcov, method$variance_covariance2, "No variance-covariance matrix"), - df_residual = df_reduced, - eta_nons = eta_nons, - eta_rand = eta_rand, - residuals = resids, - variance = variance, - method = method - ) - } - structure( - list( - estimation_model = estimation_model, - make_t = make_t, - make_var_nonprob = make_var_nonprob, - model_selection = model_selection - ), - class = "method" - ) -} - -# Object with output parameters for estimation by Generalized Estimating Equations for propensity scores -gee <- function(...) 
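The `mle()` machinery removed above is not gone: it is still exported to cluster workers by the bootstrap code later in this diff, so it survives in a refactored file. For orientation, a hedged, self-contained sketch of the pseudo-maximum-likelihood step it performs, under a logit link and with `optim()` in place of the package's own `max_lik` helper (toy data; all names illustrative):

```r
# Pseudo-log-likelihood for nonprobability samples (cf. Wu et al. (2020) in DESCRIPTION):
# l(theta) = sum_A eta_i - sum_B d_i * log(1 + exp(eta_i)), maximized over theta.
set.seed(123)
X_nons <- cbind(1, rnorm(200))   # design matrix, nonprobability sample (S_A)
X_rand <- cbind(1, rnorm(500))   # design matrix, probability sample (S_B)
d_rand <- rep(2, 500)            # design weights of the probability sample
nll <- function(theta) {
  eta_nons <- drop(X_nons %*% theta)
  eta_rand <- drop(X_rand %*% theta)
  -(sum(eta_nons) - sum(d_rand * log1p(exp(eta_rand))))
}
fit <- optim(c(0, 0), nll, method = "BFGS", hessian = TRUE)
theta_hat  <- fit$par
ps_nons    <- plogis(drop(X_nons %*% theta_hat))  # estimated propensity scores
vcov_theta <- solve(fit$hessian)                  # plays the role of solve(-hess) above
log_lik    <- -fit$value
aic        <- 2 * (length(theta_hat) - log_lik)   # AIC as computed in estimation_model()
```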
{ - estimation_model <- function(model, method_selection) { - method <- model$method - theta_hat <- model$theta_hat - hess <- model$hess - grad <- model$grad - ps_nons <- model$ps_nons - est_ps_rand <- model$est_ps_rand - ps_nons_der <- model$ps_nons_der - est_ps_rand_der <- model$est_ps_rand_der - var_cov1 <- model$var_cov1 - var_cov2 <- model$var_cov2 - df_residual <- model$df_residual - variance_covariance <- model$variance_covariance # variance-covariance matrix of estimated parameters - eta <- c(model$eta_rand, model$eta_nons) - residuals <- model$residuals - variance <- as.vector(model$variance) - - list( - theta_hat = theta_hat, - grad = grad, - hess = hess, - var_cov1 = var_cov1, - var_cov2 = var_cov2, - ps_nons = ps_nons, - est_ps_rand = est_ps_rand, - ps_nons_der = ps_nons_der, - est_ps_rand_der = est_ps_rand_der, - variance_covariance = variance_covariance, - df_residual = df_residual, - log_likelihood = NA, - eta = eta, - aic = NA, - variance = variance, - residuals = residuals, - method = method - ) - } - - make_t <- function(X, ps, psd, b, y_rand, y_nons, h, N, method_selection, weights) { - if (h == 1) { - t <- X %*% t(as.matrix(b)) + y_rand - 1 / N * sum(weights * y_nons) - } else if (h == 2) { - t <- as.vector(ps) * X %*% t(as.matrix(b)) + y_rand - 1 / N * sum(weights * y_nons) - } - t - } - - make_var_nonprob <- function(ps, psd, y, y_pred, h_n, X, b, N, h, method_selection, weights, pop_totals) { - if (!is.null(pop_totals)) h <- 1 # perhaps to remove, just check if appropriate var is calculated - if (h == 2) { - var_nonprob <- 1 / N^2 * sum((1 - ps) * ((weights * (y - y_pred - h_n) / ps) - b %*% t(X))^2) - } else if (h == 1) { - var_nonprob <- 1 / N^2 * sum((1 - ps) * ((weights * (y - y_pred - h_n) - b %*% t(X)) / ps)^2) - } - as.numeric(var_nonprob) - } - - model_selection <- function(X, - X_nons, - X_rand, - weights, - weights_rand, - R, - method_selection, - optim_method, - h = h, - est_method, - maxit, - control_selection, - start, - varcov = FALSE, - ...) 
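The `gee()` object whose `model_selection` continues below delegates to `theta_h_estimation()`, which is not defined in this excerpt; the two variance formulas in `make_var_nonprob` above correspond to two choices of the h(·) function. A hedged toy sketch of the estimating-equation step, with one common choice h(x) = x and a logit link (`nleqslv` is already an Import):

```r
# Solve U(theta) = sum_A h(x_i) - sum_B d_i * pi(x_i; theta) * h(x_i) = 0, h(x) = x
library(nleqslv)
set.seed(123)
X_nons <- cbind(1, rnorm(200))   # nonprobability sample (S_A)
X_rand <- cbind(1, rnorm(500))   # probability sample (S_B)
d_rand <- rep(2, 500)            # design weights of S_B
u_theta <- function(theta) {
  ps_rand <- plogis(drop(X_rand %*% theta))
  colSums(X_nons) - colSums(d_rand * ps_rand * X_rand)
}
fit <- nleqslv(c(0, 0), u_theta, method = "Newton", jacobian = TRUE)
theta_hat <- fit$x
ps_nons   <- plogis(drop(X_nons %*% theta_hat))   # propensity scores for S_A
```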
{ - method_selection_function <- paste(method_selection, "_model_nonprobsvy", sep = "") - method <- get_method(method = method_selection_function) - inv_link <- method$make_link_inv - - if (is.null(start)) { - if (control_selection$start_type == "glm") { - # start <- start_fit(X = X, # <--- does not work with pop_totals - # R = R, - # weights = weights, - # weights_rand = weights_rand, - # method_selection = method_selection) - - # TODO to test - start_to_gee <- start_fit( - X = X, # <--- does not work with pop_totals - R = R, - weights = weights, - weights_rand = weights_rand, - method_selection = method_selection - ) - start <- method$make_max_lik( - X_nons = X_nons, - X_rand = X_rand, - weights = weights, - weights_rand = weights_rand, - start = start_to_gee, - control = control_selection - )$theta_hat - #### - } else if (control_selection$start_type == "naive") { - start_h <- suppressWarnings(theta_h_estimation( - R = R, - X = X[, 1, drop = FALSE], - weights_rand = weights_rand, - weights = weights, - h = h, - method_selection = method_selection, - maxit = maxit, - start = 0 - )$theta_h) - start <- c(start_h, rep(0, ncol(X) - 1)) - } - } - - - h_object <- theta_h_estimation( - R = R, - X = X, - weights_rand = weights_rand, - weights = weights, - h = h, - method_selection = method_selection, - maxit = maxit, - start = start - ) - theta_hat <- h_object$theta_h - hess <- h_object$hess - grad <- h_object$grad - eta_nons <- theta_hat %*% t(as.matrix(X_nons)) - eta_rand <- theta_hat %*% t(as.matrix(X_rand)) - ps_nons <- inv_link(eta_nons) - est_ps_rand <- inv_link(eta_rand) - variance_covariance <- solve(-hess) - resids <- R - c(est_ps_rand, ps_nons) - - df_reduced <- nrow(X) - length(theta_hat) - variance <- as.vector((t(resids) %*% resids) / df_reduced) - - if (method_selection == "probit") { # for probit model, propensity score derivative is required - dinv_link <- method$make_link_inv_der - ps_nons_der <- dinv_link(theta_hat %*% t(as.matrix(X_nons))) - est_ps_rand_der <- dinv_link(theta_hat %*% t(as.matrix(X_rand))) - } - - list( - theta_hat = theta_hat, - hess = hess, - grad = grad, - ps_nons = ps_nons, - est_ps_rand = est_ps_rand, - ps_nons_der = ifelse(method_selection == "probit", ps_nons_der, NA), - est_ps_rand_der = ifelse(method_selection == "probit", est_ps_rand_der, NA), - variance_covariance = variance_covariance, - var_cov1 = ifelse(varcov, method$variance_covariance1, "No variance-covariance matrix"), - var_cov2 = ifelse(varcov, method$variance_covariance2, "No variance-covariance matrix"), - df_residual = df_reduced, - eta_nons = eta_nons, - eta_rand = eta_rand, - residuals = resids, - method = method - ) - } - - structure( - list( - estimation_model = estimation_model, - make_t = make_t, - make_var_nonprob = make_var_nonprob, - model_selection = model_selection - ), - class = "method" - ) -} - -# bias correction -mm <- function(X, y, weights, weights_rand, R, n_nons, n_rand, method_selection, family, start_selection, start_outcome, boot = FALSE) { - method_selection_function <- paste(method_selection, "_model_nonprobsvy", sep = "") - method <- get_method(method_selection_function) - inv_link <- method$make_link_inv - dinv_link <- method$make_link_inv_der - - loc_nons <- which(R == 1) - loc_rand <- which(R == 0) - - start <- c(start_outcome, start_selection) # TODO consider add info/error for end-user if one of starts provided only - - p <- ncol(X) - if (is.null(start)) { # TODO add default start - par0 <- rep(0, 2 * p) - } else { - par0 <- start - } - prior_weights <- 
c(weights_rand, weights) - - multiroot <- nleqslv::nleqslv( - x = par0, # TODO add user-specified parameters to control functions - fn = u_theta_beta_dr, - method = "Newton", # TODO consider the method Broyden - global = "qline", # c("dbldog", "pwldog", cline", "qline", "gline", "hook", "none") - xscalm = "fixed", # c("fixed","auto") - jacobian = TRUE, - control = list(scalex = rep(1, length(par0))), # TODO algorithm did not converge in maxit iterations for cloglog - R = R, - X = X, - y = y, - weights = prior_weights, - method_selection = method_selection, - family_nonprobsvy = family - ) - par_sel <- multiroot$x - if (multiroot$termcd %in% c(2:7, -10)) { - switch(as.character(multiroot$termcd), - "2" = warning("Relatively convergent algorithm when fitting selection model by nleqslv, but user must check if function values are acceptably small."), - "3" = warning("Algorithm did not find suitable point - has stalled cannot find an acceptable new point when fitting selection model by nleqslv."), - "4" = warning("Iteration limit exceeded when fitting selection model by nleqslv."), - "5" = warning("ill-conditioned Jacobian when fitting selection model by nleqslv."), - "6" = warning("Jacobian is singular when fitting selection model by nleqslv."), - "7" = warning("Jacobian is unusable when fitting selection model by nleqslv."), - "-10" = warning("user specified Jacobian is incorrect when fitting selection model by nleqslv.") - ) - } - - theta_hat <- par_sel[1:(p)] - beta_hat <- par_sel[(p + 1):(2 * p)] - names(theta_hat) <- names(beta_hat) <- colnames(X) - df_residual <- nrow(X) - length(theta_hat) - - # selection parameters - ps <- inv_link(theta_hat %*% t(X)) # inv_link(as.vector(X_design %*% as.matrix(theta_hat))) - eta_sel <- theta_hat %*% t(X) - ps_der <- dinv_link(eta_sel) - ps_nons <- ps[loc_nons] - est_ps_rand <- ps[loc_rand] - ps_nons_der <- ps_der[loc_nons] - weights_nons <- 1 / ps_nons - resids <- R - c(est_ps_rand, ps_nons) - variance <- as.vector((t(resids) %*% resids) / df_residual) - - if (!boot) { - N_nons <- sum(weights * weights_nons) - # variance-covariance matrix for selection model toFix - V <- Matrix::Diagonal(n = length(ps), x = ps * (1 - ps)) - vcov_selection <- solve(t(X) %*% V %*% X) - # vcov_selection <- matrix(0, nrow = nrow(X_design), ncol = ncol(X_design)) - theta_errors <- sqrt(diag(vcov_selection)) - } - - eta_out <- as.vector(beta_hat %*% t(X)) - y_hat <- family$linkinv(eta_out) - y_rand_pred <- y_hat[loc_rand] - y_nons_pred <- y_hat[loc_nons] - - if (!boot) { - # sigma_nons <- family$variance(mu = y_nons_pred, y = y[loc_nons]) - # sigma_rand <- family$variance(mu = y_rand_pred, y = y[loc_rand]) - sigma_nons <- family$variance(mu = y_nons_pred) - sigma_rand <- family$variance(mu = y_rand_pred) - residuals <- family$residuals(mu = y_nons_pred, y = y[loc_nons]) - } - - if (!boot) { - # variance-covariance matrix for outcome model - # vcov_outcome <- solve(t(X_design) %*% diag(sigma) %*% X_design) - vcov_outcome <- solve(t(X[loc_nons, ]) %*% (sigma_nons * X[loc_nons, ])) - beta_errors <- sqrt(diag(vcov_outcome)) - } - - # grad = multiroot$f.root[(p+1):(2*p)] - if (!boot) { - hess <- NA - selection <- list( - theta_hat = theta_hat, # TODO list as close as possible to SelecttionList - grad = multiroot$fvec[1:(p)], - hess = hess, # TODO - ps_nons = ps_nons, - est_ps_rand = est_ps_rand, - variance_covariance = vcov_selection, - df_residual = df_residual, - log_likelihood = NA, - eta = eta_sel, - aic = NA, - residuals = resids, - variance = variance, - method = method - 
) - - outcome <- list( - coefficients = beta_hat, # TODO list as close as possible to glm - std_err = beta_errors, - variance_covariance = vcov_outcome, - df_residual = df_residual, - family = list( - mu = y_nons_pred, - variance = sigma_nons, - family = family$family - ), - residuals = residuals, - fitted.values = y_nons_pred, - sigma_rand = sigma_rand, - y_rand_pred = y_rand_pred, - y_nons_pred = y_nons_pred, - linear.predictors = eta_out[loc_nons], - X = X[loc_nons, ] - ) - } else { - selection <- list( - coefficients = theta_hat, # TODO list as close as possible to SelecttionList - ps_nons = ps_nons - ) - outcome <- list( - coefficients = beta_hat, - y_rand_pred = y_rand_pred, # TODO list as close as possible to SelecttionList - y_nons_pred = y_nons_pred - ) - } - - list( - selection = selection, - outcome = outcome - ) -} - -##### helpers ######## - -# joint score equation for theta and beta, used in estimation when variable selections -u_theta_beta_dr <- function(par, - R, - X, - y, - weights, - method_selection, - family_nonprobsvy) { - method_selection <- paste(method_selection, "_model_nonprobsvy", sep = "") - method <- get_method(method_selection) - - inv_link <- method$make_link_inv - inv_link_rev <- method$make_link_inv_rev - - p <- ncol(X) - theta <- par[1:(p)] - beta <- par[(p + 1):(2 * p)] - eta_pi <- X %*% theta - ps <- inv_link(eta_pi) - y[which(is.na(y))] <- 0 - ps <- as.vector(ps) - - eta <- X %*% beta - mu <- family_nonprobsvy$linkinv(eta) - mu_der <- as.vector(family_nonprobsvy$mu.eta(eta)) - res <- family_nonprobsvy$residuals(mu = mu, y = y) - mu_der <- 1 - - n <- length(R) - R_rand <- 1 - R - - utb <- c( - apply(X * R / ps * mu_der * weights - X * R_rand * weights * mu_der, 2, sum), - apply(X * R * weights * as.vector(-inv_link_rev(eta_pi)) * res, 2, sum) - ) / n - - utb -} - - -u_theta_ipw <- function(par, - R, - X, - y, - weights, - method_selection) { # TODO - - method_selection <- paste(method_selection, "_model_nonprobsvy", sep = "") - method <- get_method(method_selection) - inv_link_rev <- method$make_link_inv_rev - inv_link <- method$make_link_inv - - p <- ncol(X) - theta <- par - X0 <- cbind(1, X) - eta_pi <- X0 %*% theta - y[which(is.na(y))] <- 0 - - R_rand <- 1 - R - loc_nons <- which(R == 1) - loc_rand <- which(R == 0) - n <- length(R) - y_mean <- mean(y[loc_nons]) - - # UTB <- apply(X0 * (R * as.vector(inv_link(eta_pi)) - y), 2, sum)/n # TODO - UTB <- apply(X0 * (R / as.vector(inv_link(eta_pi)) * y - R * y) * as.vector(inv_link_rev(eta_pi)), 2, sum) # TODO - - UTB -} - -u_beta_mi <- function(par, - R, - X, - y, - weights, - family_nonprobsvy) { # TODO - - if (is.character(family_nonprobsvy)) { - family_nonprobsvy <- paste(family_nonprobsvy, "_nonprobsvy", sep = "") - family_nonprobsvy <- get(family_nonprobsvy, mode = "function", envir = parent.frame()) - family_nonprobsvy <- family_nonprobsvy() - } - - p <- ncol(X) - beta <- par - eta <- X %*% beta - mu <- family_nonprobsvy$mu(eta) - mu_der <- family_nonprobsvy$mu_der(mu) - - n <- length(R) - R_rand <- 1 - R - loc_nons <- which(R == 1) - loc_rand <- which(R == 0) - y_mean <- mean(y[loc_nons]) - - UTB <- apply(X * y - X * R_rand * weights * as.vector(mu), 2, sum) - UTB -} - -# TODO Jacobian of the estimating equations for dr method -u_theta_beta_dr_jacob <- function(par, - R, - X, - y, - weights, - method_selection, - family_nonprobsvy) { - method_selection <- paste(method_selection, "_model_nonprobsvy", sep = "") - method <- get_method(method_selection) - - inv_link <- method$make_link_inv - inv_link_rev 
<- method$make_link_inv_rev - dinv_link_rev <- method$make_link_inv_rev_de - - p <- ncol(X) - theta <- par[1:(p + 1)] - beta <- par[(p + 2):(2 * p + 2)] - X0 <- cbind(1, X) - eta_pi <- X0 %*% theta - ps <- inv_link(eta_pi) - y[which(is.na(y))] <- 0 - ps <- as.vector(ps) - - eta <- X0 %*% beta - mu <- family_nonprobsvy$mu(eta) - mu_der <- family_nonprobsvy$mu_der(mu) - mu_der2 <- family_nonprobsvy$mu_der2(mu) - res <- family_nonprobsvy$residuals(mu = mu, y = y) - n <- length(R) - R_rand <- 1 - R - - jac <- c( - apply(-X0 * R * weights * as.vector(inv_link_rev(eta_pi)) * mu_der, 2, sum), - apply(X0 * R / ps * mu_der2 * weights - X0 * R_rand * weights * mu_der2, 2, sum), - apply(X0 * R * weights * as.vector(dinv_link_rev(eta_pi)) * res * X0, 2, sum), - apply(X0 * R * weights * as.vector(inv_link_rev(eta_pi)) * mu_der, 2, sum) - ) / n - jac -} diff --git a/R/OutcomeMethods.R b/R/OutcomeMethods.R deleted file mode 100644 index afcba49..0000000 --- a/R/OutcomeMethods.R +++ /dev/null @@ -1,291 +0,0 @@ -# Internal functions for mass imputation models -#' @importFrom stats predict.glm -#' @importFrom stats glm.fit -#' @importFrom stats summary.glm -glm_nonprobsvy <- function(outcome, - data, - weights, - family_outcome, - start_outcome, - X_nons, - y_nons, - X_rand, - control, - n_nons, - n_rand, - model_frame, - vars_selection, - pop_totals) { - if (is.character(family_outcome)) { - family_nonprobsvy <- paste(family_outcome, "_nonprobsvy", sep = "") - family_nonprobsvy <- get(family_nonprobsvy, mode = "function", envir = parent.frame()) - family_nonprobsvy <- family_nonprobsvy() - } - if (vars_selection == FALSE) { - # Estimation for outcome model - model_out <- internal_outcome( - outcome = outcome, - data = data, - weights = weights, - family_outcome = family_outcome, - start_outcome = start_outcome - ) - - model_nons_coefs <- model_out$glm$coefficients - parameters <- model_out$glm_summary$coefficients - - if (is.null(pop_totals)) { - y_rand_pred <- stats::predict.glm(model_out$glm, newdata = model_frame, type = "response") - } else { - eta <- pop_totals %*% model_nons_coefs / pop_totals[1] - y_rand_pred <- family_nonprobsvy$linkinv(eta) - } - y_nons_pred <- model_out$glm$fitted.values - } else { - model <- stats::glm.fit( - x = X_nons, - y = y_nons, - weights = weights, - family = get_method(family_outcome), - start = start_outcome, - control = list( - control$epsilon, - control$maxit, - control$trace - ), - intercept = FALSE - ) - model_summ <- stats::summary.glm(model) - parameters <- model_summ$coefficients - model_nons_coefs <- model$coefficients - if (is.null(pop_totals)) { - eta <- X_rand %*% model_nons_coefs - } else { - eta <- pop_totals %*% model_nons_coefs / pop_totals[1] - } - y_rand_pred <- family_nonprobsvy$linkinv(eta) - y_nons_pred <- model$fitted.values - - model_out <- list( - glm = model, - glm_summary = model_summ - ) - } - model_out$glm$std_err <- parameters[, 2] - names(model_out$glm$std_err) <- names(model_out$glm$coefficients) - - list( - model = model_out$glm, - y_rand_pred = y_rand_pred, - y_nons_pred = y_nons_pred, - parameters = parameters - ) -} - -nn_nonprobsvy <- function(outcome, - data, - weights, - family_outcome, - X_nons, - y_nons, - X_rand, - control, - n_nons, - n_rand, - vars_selection, - pop_totals, - model_frame = NULL, - start_outcome = NULL) { # TODO consider add data standardization before modelling - - model_nons <- nonprobMI_nn( - data = X_nons, - query = X_nons, - k = control$k, - treetype = control$treetype, - searchtype = control$searchtype - ) - 
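To make the mass-imputation flow of `glm_nonprobsvy()` above concrete, here is a hedged, self-contained sketch: fit the outcome model on the nonprobability sample, predict onto the probability sample, and take the design-weighted mean of the predictions (toy data; names illustrative):

```r
# Mass imputation via GLM, mirroring glm_nonprobsvy()
set.seed(123)
nons <- data.frame(x = rnorm(200))
nons$y <- 1 + 2 * nons$x + rnorm(200)     # outcome observed only in S_A
rand <- data.frame(x = rnorm(500))        # probability sample, y unobserved
d_rand <- rep(2, 500)                     # design weights of S_B
fit <- stats::glm(y ~ x, data = nons, family = gaussian())
y_rand_pred <- stats::predict.glm(fit, newdata = rand, type = "response")
mu_hat_MI <- sum(d_rand * y_rand_pred) / sum(d_rand)  # MI estimate of the mean
# With only population totals t = (N, sum of x), the code above instead uses
# eta <- (t %*% coef(fit)) / t[1] and applies the inverse link to eta.
```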
if (is.null(pop_totals)) { - model_rand <- nonprobMI_nn( - data = X_nons, - query = X_rand, - k = control$k, - treetype = control$treetype, - searchtype = control$searchtype - ) - y_rand_pred <- vector(mode = "numeric", length = n_rand) - y_nons_pred <- vector(mode = "numeric", length = n_nons) - parameters <- "Non-parametric method for outcome model" - - y_rand_pred <- apply(model_rand$nn.idx, 1, - FUN = \(x) mean(y_nons[x]) - # FUN=\(x) mean(sample_nonprob$short_[x]) - ) - - y_nons_pred <- apply(model_nons$nn.idx, 1, - FUN = \(x) mean(y_nons[x]) - # FUN=\(x) mean(sample_nonprob$short_[x]) - ) - } else { - model_rand <- nonprobMI_nn( - data = X_nons, - query = t(as.matrix(pop_totals / pop_totals[1])), - k = control$k, - treetype = control$treetype, - searchtype = control$searchtype - ) - y_rand_pred <- vector(mode = "numeric", length = 1) - y_nons_pred <- vector(mode = "numeric", length = n_nons) - parameters <- "Non-parametric method for outcome model" - - y_rand_pred <- mean(y_nons[model_rand$nn.idx]) - y_nons_pred <- apply(model_nons$nn.idx, 1, - FUN = \(x) mean(y_nons[x]) - # FUN=\(x) mean(sample_nonprob$short_[x]) - ) - } - - model_out <- list( - model_nons = model_nons, - model_rand = model_rand - ) - list( - model = model_out, - y_rand_pred = y_rand_pred, - y_nons_pred = y_nons_pred, - parameters = parameters - ) -} - -pmm_nonprobsvy <- function(outcome, - data, - weights, - family_outcome, - start_outcome, - X_nons, - y_nons, - X_rand, - control, - n_nons, - n_rand, - vars_selection, - pop_totals, - model_frame) { - glm_object <- glm_nonprobsvy(outcome, - data, - weights, - family_outcome, - start_outcome = start_outcome, - X_nons, - y_nons, - X_rand, - control, - n_nons, - n_rand, - model_frame, - vars_selection, - pop_totals - ) - - model_nons <- nonprobMI_nn( - data = glm_object$y_nons_pred, - query = glm_object$y_nons_pred, - k = control$k, - treetype = control$treetype, - searchtype = control$searchtype - ) - - y_nons_pred <- apply(model_nons$nn.idx, 1, - FUN = \(x) mean(y_nons[x]) - # FUN=\(x) mean(sample_nonprob$short_[x]) - ) - - if (is.null(pop_totals)) { - model_rand <- nonprobMI_nn( - data = glm_object$y_nons_pred, - query = glm_object$y_rand_pred, - k = control$k, - treetype = control$treetype, - searchtype = control$searchtype - ) - - y_rand_pred <- apply(model_rand$nn.idx, 1, - FUN = \(x) mean(y_nons[x]) - # FUN=\(x) mean(sample_nonprob$short_[x]) - ) - } else { - model_rand <- nonprobMI_nn( - data = glm_object$y_nons_pred, - query = glm_object$y_rand_pred, - k = control$k, - treetype = control$treetype, - searchtype = control$searchtype - ) - y_rand_pred <- mean(y_nons[model_rand$nn.idx]) - } - - model_out <- list( - model_nons = model_nons, - model_rand = model_rand - ) - list( - model = model_out, - y_rand_pred = y_rand_pred, - y_nons_pred = y_nons_pred, - parameters = glm_object$parameters - ) -} - -nonprobMI_fit <- function(outcome, - data, - weights, - svydesign, - family_outcome, - start, - control_outcome = controlOut(), - verbose, - model, - x, - y) { - family <- family_outcome - - if (is.character(family)) { - family <- get(family, mode = "function", envir = parent.frame()) - } - if (is.function(family)) { - family <- family() - } - data$weights <- weights # TODO just for now, find more efficient way - model_nons <- stats::glm( - formula = outcome, - data = data, - weights = weights, - family = family, - start = start, - control = list( - control_outcome$epsilon, - control_outcome$maxit, - control_outcome$trace - ) - ) - - model_nons -} - -nonprobMI_nn <- 
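The predictive mean matching announced in NEWS.md is wired up in `pmm_nonprobsvy()` above: match probability-sample units to nonprobability donors on the *predicted* outcome, then impute each unit with the mean observed outcome of its k nearest donors. A minimal self-contained sketch of that matching step with `RANN::nn2()` (stand-in predictions replace the GLM fit):

```r
# Predictive mean matching step, mirroring pmm_nonprobsvy()
library(RANN)
set.seed(123)
y_nons      <- rnorm(200, mean = 10)          # observed outcomes in S_A (donors)
y_nons_pred <- y_nons + rnorm(200, sd = 0.5)  # stand-in for glm fitted values
y_rand_pred <- rnorm(500, mean = 10)          # stand-in for predictions on S_B
k <- 5
nn <- RANN::nn2(
  data  = as.matrix(y_nons_pred),  # donor pool: predicted y in S_A
  query = as.matrix(y_rand_pred),  # recipients: predicted y in S_B
  k = k, treetype = "kd", searchtype = "standard"
)
y_rand_imp <- apply(nn$nn.idx, 1, function(idx) mean(y_nons[idx]))
```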
function(data, - query, - k, - treetype, - searchtype, - radius = 0, - eps = 0) { - model_nn <- RANN::nn2( - data = data, - query = query, - k = k, - treetype = treetype, - searchtype = searchtype, - radius = radius, - eps = eps - ) - model_nn -} diff --git a/R/RcppExports.R b/R/RcppExports.R index ddf099c..24e9aa1 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -2,6 +2,6 @@ # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 cv_nonprobsvy_rcpp <- function(X, R, weights_X, method_selection, h, maxit, eps, lambda_min, nlambda, nfolds, penalty, a, pop_totals, verbose, lambda = -1) { - .Call('_nonprobsvy_cv_nonprobsvy_rcpp', PACKAGE = 'nonprobsvy', X, R, weights_X, method_selection, h, maxit, eps, lambda_min, nlambda, nfolds, penalty, a, pop_totals, verbose, lambda) + .Call(`_nonprobsvy_cv_nonprobsvy_rcpp`, X, R, weights_X, method_selection, h, maxit, eps, lambda_min, nlambda, nfolds, penalty, a, pop_totals, verbose, lambda) } diff --git a/R/beta_funcs.R b/R/beta_funcs.R new file mode 100644 index 0000000..7db1f32 --- /dev/null +++ b/R/beta_funcs.R @@ -0,0 +1,31 @@ +u_beta_mi <- function(par, + R, + X, + y, + weights, + family_nonprobsvy) { # TODO + + if (is.character(family_nonprobsvy)) { + family_nonprobsvy <- paste(family_nonprobsvy, "_nonprobsvy", sep = "") + family_nonprobsvy <- get(family_nonprobsvy, mode = "function", envir = parent.frame()) + family_nonprobsvy <- family_nonprobsvy() + } + + p <- ncol(X) + beta <- par + eta <- X %*% beta + mu <- family_nonprobsvy$mu.eta(eta) + # mu_der <- family_nonprobsvy$mu.eta2(mu) + + n <- length(R) + R_rand <- 1 - R + loc_nons <- which(R == 1) + loc_rand <- which(R == 0) + y_mean <- mean(y[loc_nons]) + + # UTB <- apply(X * y - X * R_rand * weights * as.vector(mu), 2, sum) + # UTB <- apply(X[loc_rand,] * (weights[loc_rand] * as.vector(mu[loc_rand]) - mean(y)), 2, sum) + # UTB <- apply(X * (weights * as.vector(mu) - mean(y)), 2, sum) + UTB <- apply(X * (R_rand * weights * as.vector(eta) - y_mean), 2, sum) + UTB +} diff --git a/R/bias_correction_ipw.R b/R/bias_correction_ipw.R new file mode 100644 index 0000000..0973a1b --- /dev/null +++ b/R/bias_correction_ipw.R @@ -0,0 +1,192 @@ +# bias correction +mm <- function(X, y, weights, weights_rand, R, n_nons, n_rand, method_selection, family, start_selection, start_outcome, boot = FALSE) { + method_selection_function <- paste(method_selection, "_model_nonprobsvy", sep = "") + method <- get_method(method_selection_function) + inv_link <- method$make_link_inv + dinv_link <- method$make_link_inv_der + + loc_nons <- which(R == 1) + loc_rand <- which(R == 0) + + start <- c(start_outcome, start_selection) # TODO consider add info/error for end-user if one of starts provided only + + p <- ncol(X) + if (is.null(start)) { # TODO add default start + par0 <- rep(0, 2 * p) + } else { + par0 <- start + } + prior_weights <- c(weights_rand, weights) + + # MI - bias correction ######### + # multiroot <- nleqslv::nleqslv( # TODO to fix "Jacobian is completely unusable (all zero entries?)" + # x = rep(0, p), # TODO add user-specified parameters to control functions + # fn = u_beta_mi,# TODO algorithm did not converge in maxit iterations for cloglog + # R = R, + # X = X, + # y = y, + # weights = prior_weights, + # family_nonprobsvy = family + # ) + # print(multiroot$x) + ########## + + # IPW - bias correction ######### + # multiroot <- nleqslv::nleqslv( + # x = rep(0, p), # TODO add user-specified parameters to control functions + # fn = u_theta_ipw, + # method = "Newton", # TODO consider the method Broyden + # 
global = "qline", # c("dbldog", "pwldog", cline", "qline", "gline", "hook", "none") + # xscalm = "fixed", # c("fixed","auto") + # jacobian = TRUE, + # control = list(scalex = rep(1, length(rep(0, p)))), # TODO algorithm did not converge in maxit iterations for cloglog + # R = R, + # X = X, + # y = y, + # weights = weights, + # method_selection = method_selection + # ) + # print(multiroot$x) + ########## + + ######### BB + # multiroot <- nleqslv::nleqslv( + # par = par0, # TODO add user-specified parameters to control functions + # fn = u_theta_beta_dr, + # R = R, + # X = X, + # y = y, + # weights = prior_weights, + # method_selection = method_selection, + # family_nonprobsvy = family + # ) + # par_sel <- multiroot$par + ######### NLESQLV + multiroot <- nleqslv::nleqslv( + x = par0, # TODO add user-specified parameters to control functions + fn = u_theta_beta_dr, + method = "Newton", # TODO consider the method Broyden + global = "dbldog", # c("dbldog", "pwldog", cline", "qline", "gline", "hook", "none") + xscalm = "auto", # c("fixed","auto") + jacobian = TRUE, + control = list(scalex = rep(1, length(par0))), # TODO algorithm did not converge in maxit iterations for cloglog + R = R, + X = X, + y = y, + weights = prior_weights, + method_selection = method_selection, + family_nonprobsvy = family + ) + par_sel <- multiroot$x + if (multiroot$termcd %in% c(2:7, -10)) { + switch(as.character(multiroot$termcd), + "2" = warning("Relatively convergent algorithm when fitting selection model by nleqslv, but user must check if function values are acceptably small."), + "3" = warning("Algorithm did not find suitable point - has stalled cannot find an acceptable new point when fitting selection model by nleqslv."), + "4" = warning("Iteration limit exceeded when fitting selection model by nleqslv."), + "5" = warning("ill-conditioned Jacobian when fitting selection model by nleqslv."), + "6" = warning("Jacobian is singular when fitting selection model by nleqslv."), + "7" = warning("Jacobian is unusable when fitting selection model by nleqslv."), + "-10" = warning("user specified Jacobian is incorrect when fitting selection model by nleqslv.") + ) + } + + theta_hat <- par_sel[1:(p)] + beta_hat <- par_sel[(p + 1):(2 * p)] + names(theta_hat) <- names(beta_hat) <- colnames(X) + df_residual <- nrow(X) - length(theta_hat) + + # selection parameters + ps <- inv_link(theta_hat %*% t(X)) # inv_link(as.vector(X_design %*% as.matrix(theta_hat))) + eta_sel <- theta_hat %*% t(X) + ps_der <- dinv_link(eta_sel) + ps_nons <- ps[loc_nons] + est_ps_rand <- ps[loc_rand] + ps_nons_der <- ps_der[loc_nons] + weights_nons <- 1 / ps_nons + resids <- R - c(est_ps_rand, ps_nons) + variance <- as.vector((t(resids) %*% resids) / df_residual) + + if (!boot) { + N_nons <- sum(weights * weights_nons) + # variance-covariance matrix for selection model toFix + V <- Matrix::Diagonal(n = length(ps), x = ps * (1 - ps)) + vcov_selection <- solve(t(X) %*% V %*% X) + # vcov_selection <- matrix(0, nrow = nrow(X_design), ncol = ncol(X_design)) + theta_errors <- sqrt(diag(vcov_selection)) + } + + eta_out <- as.vector(beta_hat %*% t(X)) + y_hat <- family$linkinv(eta_out) + y_rand_pred <- y_hat[loc_rand] + y_nons_pred <- y_hat[loc_nons] + + if (!boot) { + # sigma_nons <- family$variance(mu = y_nons_pred, y = y[loc_nons]) + # sigma_rand <- family$variance(mu = y_rand_pred, y = y[loc_rand]) + sigma_nons <- family$variance(mu = y_nons_pred) + sigma_rand <- family$variance(mu = y_rand_pred) + residuals <- family$residuals(mu = y_nons_pred, y = 
y[loc_nons]) + } + + if (!boot) { + # variance-covariance matrix for outcome model + # vcov_outcome <- solve(t(X_design) %*% diag(sigma) %*% X_design) + vcov_outcome <- solve(t(X[loc_nons, ]) %*% (sigma_nons * X[loc_nons, ])) + beta_errors <- sqrt(diag(vcov_outcome)) + } + + # grad = multiroot$f.root[(p+1):(2*p)] + if (!boot) { + hess <- NA + selection <- list( + theta_hat = theta_hat, # TODO list as close as possible to SelecttionList + grad = multiroot$fvec[1:(p)], + hess = hess, # TODO + ps_nons = ps_nons, + est_ps_rand = est_ps_rand, + variance_covariance = vcov_selection, + df_residual = df_residual, + log_likelihood = NA, + eta = eta_sel, + aic = NA, + residuals = resids, + variance = variance, + method = method + ) + + outcome <- list( + coefficients = beta_hat, # TODO list as close as possible to glm + std_err = beta_errors, + variance_covariance = vcov_outcome, + df_residual = df_residual, + family = list( + mu = y_nons_pred, + variance = sigma_nons, + family = family$family + ), + residuals = residuals, + fitted.values = y_nons_pred, + sigma_rand = sigma_rand, + y_rand_pred = y_rand_pred, + y_nons_pred = y_nons_pred, + linear.predictors = eta_out[loc_nons], + X = X[loc_nons, ] + ) + } else { + selection <- list( + coefficients = theta_hat, # TODO list as close as possible to SelecttionList + ps_nons = ps_nons + ) + outcome <- list( + coefficients = beta_hat, + y_rand_pred = y_rand_pred, # TODO list as close as possible to SelecttionList + y_nons_pred = y_nons_pred + ) + } + + list( + selection = selection, + outcome = outcome + ) +} + diff --git a/R/boot_dr.R b/R/boot_dr.R new file mode 100644 index 0000000..5b40ff2 --- /dev/null +++ b/R/boot_dr.R @@ -0,0 +1,509 @@ +bootDR <- function(outcome, + data, + svydesign, + SelectionModel, + OutcomeModel, + family_outcome, + method_outcome, + start_outcome, + num_boot, + weights, + weights_rand, + R, + theta_hat, + mu_hat, + method_selection, + start_selection, + control_selection, + control_outcome, + control_inference, + n_nons, + n_rand, + optim_method, + est_method, + h, + maxit, + pop_size, + pop_totals, + pop_means, + bias_correction, + verbose, + ...) 
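`mu_hatDR()` is exported to the cluster workers below but not defined in this excerpt; judging from the `pop_totals` branch of `bootDR()`, it assembles the doubly robust mean as an IPW correction on the nonprobability residuals plus the mass-imputation term from the probability sample. A hedged toy sketch of that decomposition:

```r
# DR mean: IPW-weighted residual term over S_A + design-weighted MI term over S_B
set.seed(123)
y_nons      <- rnorm(200, 10)
y_nons_pred <- y_nons + rnorm(200, sd = 0.5)  # outcome-model predictions on S_A
y_rand_pred <- rnorm(500, 10)                 # outcome-model predictions on S_B
ps     <- runif(200, 0.2, 0.8)                # estimated propensity scores on S_A
w_nons <- 1 / ps                              # IPW weights
d_rand <- rep(2, 500)                         # design weights on S_B
N_nons <- sum(w_nons)
N_rand <- sum(d_rand)
mu_hat_DR <- sum(w_nons * (y_nons - y_nons_pred)) / N_nons +
  sum(d_rand * y_rand_pred) / N_rand
```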
{ + mu_hats <- vector(mode = "numeric", length = num_boot) + k <- 1 + rep_type <- control_inference$rep_type + if (is.character(family_outcome)) { + family_nonprobsvy <- paste(family_outcome, "_nonprobsvy", sep = "") + family_nonprobsvy <- get(family_nonprobsvy, mode = "function", envir = parent.frame()) + family_nonprobsvy <- family_nonprobsvy() + } + family <- family_outcome + if (is.character(family)) { + family <- get(family, mode = "function", envir = parent.frame()) + } + if (is.function(family)) { + family <- family() + } + method_outcome_nonprobsvy <- paste(method_outcome, "_nonprobsvy", sep = "") + MethodOutcome <- get(method_outcome_nonprobsvy, mode = "function", envir = parent.frame()) + + if (bias_correction == TRUE) { + X <- rbind(SelectionModel$X_rand, SelectionModel$X_nons) + p <- ncol(X) + y_rand <- vector(mode = "numeric", length = n_rand) + y <- c(y_rand, OutcomeModel$y_nons) # outcome variable for joint model + var_obj <- bootDR_sel( + X = X, + R = R, + y = y, + svydesign = svydesign, + rep_type = rep_type, + weights = weights, + weights_rand = weights_rand, + method_selection = method_selection, + family_nonprobsvy = family_nonprobsvy, + mu_hat = mu_hat, + n_nons = n_nons, + n_rand = n_rand, + num_boot = num_boot, + start_selection = start_selection, + start_outcome = start_outcome, + verbose = verbose + ) + boot_var <- var_obj$var + mu_hat_boot <- var_obj$mu + } else { + if (verbose) { + pb <- utils::txtProgressBar(min = 0, max = num_boot, style = 3) + } + estimation_method <- get_method(est_method) + if (is.null(pop_totals)) { + rep_weights <- survey::as.svrepdesign(svydesign, type = rep_type, replicates = num_boot)$repweights$weights + N <- sum(weights_rand) + while (k <= num_boot) { + tryCatch( + { + strap_nons <- sample.int(replace = TRUE, n = n_nons, prob = 1 / weights) + # strap_rand <- sample.int(replace = TRUE, n = n_rand, prob = 1/weights_rand) + + # using svy package + strap_rand_svy <- which(rep_weights[, k] != 0) + weights_rand_strap_svy <- rep_weights[, k] * weights_rand + # N_strap <- sum(weights_rand_strap_svy) + # X_rand_strap <- X_rand[strap_rand_svy, , drop = FALSE] + weights_strap_rand <- weights_rand_strap_svy[strap_rand_svy] + + model_obj <- MethodOutcome( + outcome = outcome, + data = data[strap_nons, ], + weights = weights[strap_nons], + family_outcome = family_outcome, + start_outcome = start_outcome, + X_nons = OutcomeModel$X_nons[strap_nons, , drop = FALSE], + y_nons = OutcomeModel$y_nons[strap_nons], + X_rand = OutcomeModel$X_rand[strap_rand_svy, , drop = FALSE], + control = control_outcome, + n_nons = n_nons, + n_rand = n_rand, + model_frame = OutcomeModel$model_frame_rand[strap_rand_svy, ], + vars_selection = control_inference$vars_selection, + pop_totals = pop_totals + ) + + + y_rand_pred <- model_obj$y_rand_pred + y_nons_pred <- model_obj$y_nons_pred + + X_sel <- rbind( + SelectionModel$X_rand[strap_rand_svy, , drop = FALSE], + SelectionModel$X_nons[strap_nons, , drop = FALSE] + ) + n_rand_strap <- nrow(SelectionModel$X_rand[strap_rand_svy, , drop = FALSE]) + + R_nons <- rep(1, n_nons) + R_rand <- rep(0, n_rand_strap) + R <- c(R_rand, R_nons) + + model_sel <- internal_selection( + X = X_sel, + X_nons = SelectionModel$X_nons[strap_nons, , drop = FALSE], + X_rand = SelectionModel$X_rand[strap_rand_svy, , drop = FALSE], + weights = weights[strap_nons], + weights_rand = weights_strap_rand, + R = R, + method_selection = method_selection, + optim_method = optim_method, + h = h, + est_method = est_method, + maxit = maxit, + control_selection = 
control_selection, + start = start_selection + ) + + est_method_obj <- estimation_method$estimation_model( + model = model_sel, + method_selection = method_selection + ) + ps_nons <- est_method_obj$ps_nons + weights_nons <- 1 / ps_nons + N_est_nons <- sum(weights_nons) + N_est_rand <- sum(weights_strap_rand) + + mu_hat_boot <- mu_hatDR( + y = OutcomeModel$y_nons[strap_nons], + y_nons = y_nons_pred, + y_rand = y_rand_pred, + weights = weights[strap_nons], + weights_nons = weights_nons, + weights_rand = weights_strap_rand, + N_nons = N_est_nons, + N_rand = N_est_rand + ) + mu_hats[k] <- mu_hat_boot + if (verbose) { + # info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "") + # print(info) + utils::setTxtProgressBar(pb, k) + } + k <- k + 1 + }, + error = function(e) { + if (verbose) { + info <- paste("An error occurred in ", k, " iteration: ", e$message, sep = "") + print(info) + } + } + ) + } + } else { + while (k <= num_boot) { + tryCatch( + { + strap <- sample.int(replace = TRUE, n = n_nons, prob = 1 / weights) + X_strap_nons <- SelectionModel$X_nons[strap, , drop = FALSE] + y_strap <- OutcomeModel$y_nons[strap] + R_strap <- rep(1, n_nons) + weights_strap <- weights[strap] + n_rand <- 0 + X_strap_rand <- NULL + + h_object_strap <- theta_h_estimation( + R = R_strap, + X = X_strap_nons, + weights = weights_strap, + h = h, + method_selection = method_selection, + maxit = maxit, + pop_totals = pop_totals, + weights_rand = NULL, + start = start_selection + ) + + theta_hat_strap <- h_object_strap$theta_h + method_selection_function <- paste(method_selection, "_model_nonprobsvy", sep = "") + method <- get_method(method_selection_function) + inv_link <- method$make_link_inv + ps_nons_strap <- inv_link(theta_hat_strap %*% t(X_strap_nons)) + weights_nons_strap <- 1 / ps_nons_strap + N_est <- sum(weights_strap * weights_nons_strap) + if (is.null(pop_size)) pop_size <- N_est + + model_obj <- MethodOutcome( + outcome = outcome, + data = data[strap, , drop = FALSE], + weights = weights_strap, + family_outcome = family_outcome, + start_outcome = start_outcome, + X_nons = X_strap_nons, + y_nons = y_strap, + X_rand = X_strap_rand, + control = control_outcome, + n_nons = n_nons, + n_rand = n_rand, + model_frame = OutcomeModel$model_frame_rand, + vars_selection = control_inference$vars_selection, + pop_totals = pop_totals + ) + + y_rand_pred <- model_obj$y_rand_pred + y_nons_pred <- model_obj$y_nons_pred + + mu_hat_boot <- 1 / N_est * sum(weights_nons_strap * (weights_strap * (y_strap - y_nons_pred))) + ifelse(method_outcome == "glm", 1 / pop_size * y_rand_pred, y_rand_pred) + mu_hats[k] <- mu_hat_boot + if (verbose) { + # info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "") + # print(info) + utils::setTxtProgressBar(pb, k) + } + k <- k + 1 + }, + error = function(e) { + if (verbose) { + info <- paste("An error occurred in ", k, " iteration: ", e$message, sep = "") + print(info) + } + } + ) + } + } + # mu_hat_boot <- mean(mu_hats) + boot_var <- 1 / (num_boot - 1) * sum((mu_hats - mu_hat)^2) + } + list( + var = boot_var, + # mu = mu_hat_boot, + stat = mu_hats + ) +} + +#' @importFrom foreach %dopar% +#' @importFrom foreach foreach +#' @importFrom parallel makeCluster +#' @importFrom parallel stopCluster +#' @importFrom doSNOW registerDoSNOW +bootDR_multicore <- function(outcome, + data, + svydesign, + SelectionModel, + OutcomeModel, + family_outcome, + method_outcome, + start_outcome, + num_boot, + weights, + weights_rand, + R, + 
theta_hat, + mu_hat, + method_selection, + control_selection, + start_selection, + control_outcome, + control_inference, + n_nons, + n_rand, + optim_method, + est_method, + h, + maxit, + pop_size, + pop_totals, + pop_means, + bias_correction, + cores, + verbose, + ...) { + # mu_hats <- vector(mode = "numeric", length = num_boot) + # k <- 1 + if (is.character(family_outcome)) { + family_nonprobsvy <- paste(family_outcome, "_nonprobsvy", sep = "") + family_nonprobsvy <- get(family_nonprobsvy, mode = "function", envir = parent.frame()) + family_nonprobsvy <- family_nonprobsvy() + } + family <- family_outcome + if (is.character(family)) { + family <- get(family, mode = "function", envir = parent.frame()) + } + if (is.function(family)) { + family <- family() + } + rep_type <- control_inference$rep_type + method_outcome_nonprobsvy <- paste(method_outcome, "_nonprobsvy", sep = "") + MethodOutcome <- get(method_outcome_nonprobsvy, mode = "function", envir = parent.frame()) + + if (bias_correction == TRUE) { + X <- rbind(SelectionModel$X_rand, SelectionModel$X_nons) + p <- ncol(X) + y_rand <- vector(mode = "numeric", length = n_rand) + y <- c(y_rand, OutcomeModel$y_nons) # outcome variable for joint model + var_obj <- bootDR_sel_multicore( + X = X, + R = R, + y = y, + svydesign = svydesign, + rep_type = rep_type, + weights = weights, + weights_rand = weights_rand, + method_selection = method_selection, + family_nonprobsvy = family_nonprobsvy, + mu_hat = mu_hat, + n_nons = n_nons, + n_rand = n_rand, + num_boot = num_boot, + start_selection = start_selection, + start_outcome = start_outcome, + cores = cores + ) + boot_var <- var_obj$var + mu_hat_boot <- var_obj$mu + } else { + rep_weights <- survey::as.svrepdesign(svydesign, type = rep_type, replicates = num_boot)$repweights$weights + + cl <- parallel::makeCluster(cores) + doSNOW::registerDoSNOW(cl) + on.exit(parallel::stopCluster(cl)) + ## progress bar + if (verbose) { + pb <- progress::progress_bar$new(total = num_boot) + opts <- list(progress = \(n) pb$tick()) + } else { + opts <- NULL + } + parallel::clusterExport(cl = cl, varlist = c( + "internal_selection", "internal_outcome", "logit_model_nonprobsvy", "start_fit", "get_method", "controlSel", "theta_h_estimation", + "mle", "mu_hatDR", "probit_model_nonprobsvy", "cloglog_model_nonprobsvy", "glm_nonprobsvy", "nn_nonprobsvy", "pmm_nonprobsvy", + "gaussian_nonprobsvy", "poisson_nonprobsvy", "binomial_nonprobsvy", "nonprobMI_fit", "controlOut" + )) + if (is.null(pop_totals)) { + N <- sum(weights_rand) + + k <- 1:num_boot + mu_hats <- foreach::`%dopar%`( + obj = foreach::foreach(k = k, .combine = c, .options.snow = opts), + ex = { + estimation_method <- get_method(est_method) + strap_nons <- sample.int(replace = TRUE, n = n_nons, prob = 1 / weights) + # strap_rand <- sample.int(replace = TRUE, n = n_rand, prob = 1/weights_rand) + + # using svy package + strap_rand_svy <- which(rep_weights[, k] != 0) + weights_rand_strap_svy <- rep_weights[, k] * weights_rand + # N_strap <- sum(weights_rand_strap_svy) + # X_rand_strap <- X_rand[strap_rand_svy, , drop = FALSE] + weights_strap_rand <- weights_rand_strap_svy[strap_rand_svy] + + model_obj <- MethodOutcome( + outcome = outcome, + data = data[strap_nons, ], + weights = weights[strap_nons], + family_outcome = family_outcome, + start_outcome = start_outcome, + X_nons = OutcomeModel$X_nons[strap_nons, , drop = FALSE], + y_nons = OutcomeModel$y_nons[strap_nons], + X_rand = OutcomeModel$X_rand[strap_rand_svy, , drop = FALSE], + control = control_outcome, + 
n_nons = n_nons, + n_rand = n_rand, + model_frame = OutcomeModel$model_frame_rand[strap_rand_svy, ], + vars_selection = control_inference$vars_selection, + pop_totals = pop_totals + ) + + + y_rand_pred <- model_obj$y_rand_pred + y_nons_pred <- model_obj$y_nons_pred + + X_sel <- rbind( + SelectionModel$X_rand[strap_rand_svy, , drop = FALSE], + SelectionModel$X_nons[strap_nons, , drop = FALSE] + ) + n_rand_strap <- nrow(SelectionModel$X_rand[strap_rand_svy, , drop = FALSE]) + + R_nons <- rep(1, n_nons) + R_rand <- rep(0, n_rand_strap) + R <- c(R_rand, R_nons) + + model_sel <- internal_selection( + X = X_sel, + X_nons = SelectionModel$X_nons[strap_nons, , drop = FALSE], + X_rand = SelectionModel$X_rand[strap_rand_svy, , drop = FALSE], + weights = weights[strap_nons], + weights_rand = weights_strap_rand, + R = R, + method_selection = method_selection, + optim_method = optim_method, + h = h, + est_method = est_method, + maxit = maxit, + control_selection = control_selection, + start = start_selection + ) + + est_method_obj <- estimation_method$estimation_model( + model = model_sel, + method_selection = method_selection + ) + ps_nons <- est_method_obj$ps_nons + weights_nons <- 1 / ps_nons + N_est_nons <- sum(weights_nons) + N_est_rand <- sum(weights_strap_rand) + + mu_hatDR( + y = OutcomeModel$y_nons[strap_nons], + y_nons = y_nons_pred, + y_rand = y_rand_pred, + weights = weights[strap_nons], + weights_nons = weights_nons, + weights_rand = weights_strap_rand, + N_nons = N_est_nons, + N_rand = N_est_rand + ) + } + ) + } else { + k <- 1:num_boot + mu_hats <- foreach::`%dopar%`( + obj = foreach::foreach(k = k, .combine = c, .options.snow = opts), + ex = { + strap <- sample.int(replace = TRUE, n = n_nons, prob = 1 / weights) + X_nons_strap <- SelectionModel$X_nons[strap, , drop = FALSE] + y_strap <- OutcomeModel$y_nons[strap] + R_strap <- rep(1, n_nons) + weights_strap <- weights[strap] + X_rand_strap <- NULL + + h_object_strap <- theta_h_estimation( + R = R_strap, + X = X_nons_strap, + weights = weights_strap, + h = h, + method_selection = method_selection, + maxit = maxit, + pop_totals = pop_totals, + start = start_selection, + weights_rand = NULL + ) + + theta_hat_strap <- h_object_strap$theta_h + method_selection_function <- paste(method_selection, "_model_nonprobsvy", sep = "") + method <- get_method(method_selection_function) + inv_link <- method$make_link_inv + ps_nons_strap <- inv_link(theta_hat_strap %*% t(X_nons_strap)) + weights_nons_strap <- 1 / ps_nons_strap + N_est <- sum(weights_strap * weights_nons_strap) + if (is.null(pop_size)) pop_size <- N_est + + model_obj <- MethodOutcome( + outcome = outcome, + data = data[strap, , drop = FALSE], + weights = weights_strap, + family_outcome = family_outcome, + start_outcome = start_outcome, + X_nons = X_nons_strap, + y_nons = y_strap, + X_rand = X_rand_strap, + control = control_outcome, + n_nons = n_nons, + n_rand = n_rand, + model_frame = OutcomeModel$model_frame_rand, + vars_selection = control_inference$vars_selection, + pop_totals = pop_totals + ) + + y_rand_pred <- model_obj$y_rand_pred + y_nons_pred <- model_obj$y_nons_pred + + mu_hat_boot <- 1 / N_est * sum(weights_nons_strap * (weights_strap * (y_strap - y_nons_pred))) + ifelse(method_outcome == "glm", 1 / pop_size * y_rand_pred, y_rand_pred) + mu_hat_boot + } + ) + } + # mu_hat_boot <- mean(mu_hats) + boot_var <- 1 / (num_boot - 1) * sum((mu_hats - mu_hat)^2) + } + list( + var = boot_var, + # mu = mu_hat_boot, + stat = mu_hats + ) +} diff --git a/R/boot_dr_sel.R b/R/boot_dr_sel.R new 
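This parallel loop is presumably why DESCRIPTION swaps `doParallel` for `doSNOW` plus `progress`: the snow backend accepts a `.options.snow` progress callback, which `doParallel` has no equivalent for. A minimal, self-contained sketch of the pattern `bootDR_multicore()` uses:

```r
# doSNOW + progress pattern for a parallel bootstrap with a progress bar
library(doSNOW)
library(progress)
library(foreach)
library(parallel)
cl <- parallel::makeCluster(2)
doSNOW::registerDoSNOW(cl)
pb <- progress::progress_bar$new(total = 10)
opts <- list(progress = function(n) pb$tick())  # ticked once per finished task
res <- foreach(k = 1:10, .combine = c, .options.snow = opts) %dopar% {
  mean(rnorm(1e4))   # stand-in for one bootstrap replicate
}
parallel::stopCluster(cl)
```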
file mode 100644 index 0000000..9da3bc0 --- /dev/null +++ b/R/boot_dr_sel.R @@ -0,0 +1,223 @@ +bootDR_sel <- function(X, + R, + y, + svydesign, + weights, + weights_rand, + method_selection, + family_nonprobsvy, + mu_hat, + n_nons, + n_rand, + num_boot, + rep_type, + start_selection, + start_outcome, + verbose) { # TODO function to test + mu_hats <- vector(mode = "numeric", length = num_boot) + k <- 1 + loc_nons <- which(R == 1) + loc_rand <- which(R == 0) + X_nons <- X[loc_nons, , drop = FALSE] + X_rand <- X[loc_rand, , drop = FALSE] + y_nons <- y[loc_nons] + y_rand <- y[loc_rand] + + if (verbose) { + pb <- utils::txtProgressBar(min = 0, max = num_boot, style = 3) + } + + rep_weights <- survey::as.svrepdesign(svydesign, type = rep_type, replicates = num_boot)$repweights$weights + while (k <= num_boot) { + tryCatch( + { + strap_nons <- sample.int(replace = TRUE, n = n_nons, prob = 1 / weights) + # strap_rand <- sample.int(replace = TRUE, n = n_rand, prob = 1/weights_rand) + + weights_nons <- weights[strap_nons] + # weights_rand_strap <- weights_rand[strap_rand] + + # using svy package + strap_rand_svy <- which(rep_weights[, k] != 0) + weights_rand_strap_svy <- rep_weights[, k] * weights_rand + N_strap <- sum(weights_rand_strap_svy) + # X_rand_strap <- X_rand[strap_rand_svy, , drop = FALSE] + weights_strap_rand <- weights_rand_strap_svy[strap_rand_svy] + + + X_strap <- rbind(X_rand[strap_rand_svy, , drop = FALSE], X_nons[strap_nons, , drop = FALSE]) + y_strap <- c(y_rand[strap_rand_svy], y_nons[strap_nons]) + n_rand_strap <- nrow(X_rand[strap_rand_svy, , drop = FALSE]) + + R_nons <- rep(1, n_nons) + R_rand <- rep(0, n_rand_strap) + R <- c(R_rand, R_nons) + + model_strap <- mm( + X = X_strap, + y = y_strap, + weights = weights_nons, + weights_rand = weights_strap_rand, + R = R, # c(R[loc_nons][strap_nons], R[loc_rand][strap_rand]), + n_nons = n_nons, + n_rand = n_rand_strap, + method_selection = method_selection, + family = family_nonprobsvy, + start_outcome = start_outcome, + start_selection = start_selection, + boot = TRUE + ) + + weights_nons_strap <- 1 / model_strap$selection$ps_nons + N_nons <- sum(weights_nons * weights_nons_strap) + N_rand <- sum(weights_strap_rand) + + mu_hat_boot <- mu_hatDR( + y = y_nons[strap_nons], + y_nons = model_strap$outcome$y_nons_pred, + y_rand = model_strap$outcome$y_rand_pred, + weights = weights_nons, + weights_nons = weights_nons_strap, + weights_rand = weights_strap_rand, + N_nons = N_nons, + N_rand = N_rand + ) # DR estimator + mu_hats[k] <- mu_hat_boot + if (verbose) { + # info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "") + # print(info) + utils::setTxtProgressBar(pb, k) + } + k <- k + 1 + }, + error = function(e) { + if (verbose) { + info <- paste("An error occurred in ", k, " iteration: ", e$message, sep = "") + print(info) + } + } + ) + } + # mu_hat_boot <- mean(mu_hats) + boot_var <- 1 / (num_boot - 1) * sum((mu_hats - mu_hat)^2) + list( + var = boot_var, + # mu = mu_hat_boot, + stat = mu_hats + ) +} + + +# multicore +#' @importFrom foreach %dopar% +#' @importFrom foreach foreach +#' @importFrom parallel makeCluster +#' @importFrom parallel stopCluster +#' @importFrom doSNOW registerDoSNOW +bootDR_sel_multicore <- function(X, + svydesign, + R, + y, + weights, + weights_rand, + method_selection, + family_nonprobsvy, + mu_hat, + n_nons, + n_rand, + num_boot, + rep_type, + start_selection, + start_outcome, + cores, + verbose) { # TODO function to test + mu_hats <- vector(mode = "numeric", length = num_boot) 
+ loc_nons <- which(R == 1) + loc_rand <- which(R == 0) + X_nons <- X[loc_nons, , drop = FALSE] + X_rand <- X[loc_rand, , drop = FALSE] + y_nons <- y[loc_nons] + y_rand <- y[loc_rand] + + rep_weights <- survey::as.svrepdesign(svydesign, type = rep_type, replicates = num_boot)$repweights$weights + + cl <- parallel::makeCluster(cores) + doSNOW::registerDoSNOW(cl) + on.exit(parallel::stopCluster(cl)) + ## progress bar + if (verbose) { + pb <- progress::progress_bar$new(total = num_boot) + opts <- list(progress = \(n) pb$tick()) + } else { + opts <- NULL + } + parallel::clusterExport(cl = cl, varlist = c( + "internal_selection", "logit_model_nonprobsvy", "start_fit", "get_method", "controlSel", "mle", + "probit_model_nonprobsvy", "cloglog_model_nonprobsvy", "mm", "u_theta_beta_dr", + "mu_hatDR" + )) + + k <- 1:num_boot + mu_hats <- foreach::`%dopar%`( + obj = foreach::foreach(k = k, .combine = c, .options.snow = opts), + ex = { + strap_nons <- sample.int(replace = TRUE, n = n_nons, prob = 1 / weights) + # strap_rand <- sample.int(replace = TRUE, n = n_rand, prob = 1/weights_rand) + + weights_strap <- weights[strap_nons] + # weights_rand_strap <- weights_rand[strap_rand] + + # using svy package + strap_rand_svy <- which(rep_weights[, k] != 0) + weights_rand_strap_svy <- rep_weights[, k] * weights_rand + # N_strap <- sum(weights_rand_strap_svy) + # X_rand_strap <- X_rand[strap_rand_svy, , drop = FALSE] + weights_strap_rand <- weights_rand_strap_svy[strap_rand_svy] + + X_strap <- rbind(X_rand[strap_rand_svy, , drop = FALSE], X_nons[strap_nons, , drop = FALSE]) + y_strap <- c(y_rand[strap_rand_svy], y_nons[strap_nons]) + n_rand_strap <- nrow(X_rand[strap_rand_svy, , drop = FALSE]) + + R_nons <- rep(1, n_nons) + R_rand <- rep(0, n_rand_strap) + R <- c(R_rand, R_nons) + + model_strap <- mm( + X = X_strap, + y = y_strap, + weights = weights_strap, + weights_rand = weights_strap_rand, + R = R, # c(R[loc_nons][strap_nons], R[loc_rand][strap_rand]), + n_nons = n_nons, + n_rand = n_rand_strap, + method_selection = method_selection, + family = family_nonprobsvy, + start_selection = start_selection, + start_outcome = start_outcome, + boot = TRUE + ) + + weights_nons_strap <- 1 / model_strap$selection$ps_nons + N_nons <- sum(weights_strap * weights_nons_strap) + N_rand <- sum(weights_strap_rand) + + mu_hatDR( + y = y_nons[strap_nons], + y_nons = model_strap$outcome$y_nons_pred, + y_rand = model_strap$outcome$y_rand_pred, + weights = weights_strap, + weights_nons = weights_nons_strap, + weights_rand = weights_strap_rand, + N_nons = N_nons, + N_rand = N_rand + ) # DR estimator + } + ) + # mu_hat_boot <- mean(mu_hats) + boot_var <- 1 / (num_boot - 1) * sum((mu_hats - mu_hat)^2) + list( + var = boot_var, + # mu = mu_hat_boot, + stat = mu_hats + ) +} diff --git a/R/boot_ipw.R b/R/boot_ipw.R new file mode 100644 index 0000000..8dfa6c2 --- /dev/null +++ b/R/boot_ipw.R @@ -0,0 +1,360 @@ +bootIPW <- function(X_rand, + X_nons, + svydesign, + weights, + ys, + R, + theta_hat, + num_boot, + weights_rand, + mu_hats, + method_selection, + start_selection, + n_nons, + n_rand, + optim_method, + est_method, + h, + rep_type, + maxit, + control_inference, + control_selection, + verbose, + pop_size, + pop_totals, + ...) 
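Every bootstrap in this changeset resamples the probability sample through replicate weights from `survey::as.svrepdesign()`. A hedged sketch using survey's bundled `api` data; it mirrors the `$repweights$weights` access used above, which relies on the default compressed representation mapping one row per unit:

```r
# Replicate-weight resampling of the probability sample, one replicate at a time
library(survey)
data(api)                                   # provides apisrs with weights pw
dsgn  <- svydesign(id = ~1, weights = ~pw, data = apisrs)
rep_w <- as.svrepdesign(dsgn, type = "bootstrap", replicates = 5)$repweights$weights
k <- 1                                      # replicate index
strap_rand <- which(rep_w[, k] != 0)        # units present in replicate k
w_strap <- (rep_w[, k] * weights(dsgn))[strap_rand]  # rescaled replicate weights
```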
{
+  if (!is.null(weights_rand)) N <- sum(weights_rand)
+  estimation_method <- get_method(est_method)
+  method_selection_function <- paste(method_selection, "_model_nonprobsvy", sep = "")
+  method <- get_method(method_selection_function)
+  inv_link <- method$make_link_inv
+  k <- 1
+  rep_type <- control_inference$rep_type
+  mu_len <- length(mu_hats)
+  mu_hats_boot <- matrix(nrow = num_boot, ncol = mu_len)
+  boot_vars <- numeric(length = mu_len)
+
+  if (verbose) {
+    pb <- utils::txtProgressBar(min = 0, max = num_boot, style = 3)
+  }
+
+  if (is.null(pop_totals)) {
+    rep_weights <- survey::as.svrepdesign(design = svydesign, type = rep_type, replicates = num_boot)$repweights$weights # TODO: customise to calibrated svydesign
+    while (k <= num_boot) {
+      tryCatch(
+        {
+          strap_nons <- sample.int(replace = TRUE, n = n_nons, prob = 1 / weights)
+
+          # using svy package
+          strap_rand_svy <- which(rep_weights[, k] != 0)
+          weights_rand_strap_svy <- rep_weights[, k] * weights_rand
+          N_strap <- sum(weights_rand_strap_svy)
+          X_rand_strap <- X_rand[strap_rand_svy, , drop = FALSE]
+          weights_strap_rand <- weights_rand_strap_svy[strap_rand_svy]
+
+          # strap_rand <- sample.int(replace = TRUE, n = n_rand, prob = 1/weights_rand)
+          # X_rand_strap <- X_rand[strap_rand, , drop = FALSE]
+
+          X_nons_strap <- X_nons[strap_nons, , drop = FALSE]
+          X <- rbind(X_rand_strap, X_nons_strap)
+          n_rand_strap <- nrow(X_rand_strap)
+
+          R_nons <- rep(1, n_nons)
+          R_rand <- rep(0, n_rand_strap)
+          R <- c(R_rand, R_nons)
+
+          model_sel <- internal_selection(
+            X = X,
+            X_nons = X_nons_strap,
+            X_rand = X_rand_strap,
+            weights = weights[strap_nons],
+            weights_rand = weights_strap_rand,
+            R = R,
+            method_selection = method_selection,
+            optim_method = optim_method,
+            h = h,
+            est_method = est_method,
+            maxit = maxit,
+            control_selection = control_selection,
+            start = start_selection
+          )
+
+          est_method_obj <- estimation_method$estimation_model(
+            model = model_sel,
+            method_selection = method_selection
+          )
+
+          ps_nons <- est_method_obj$ps_nons
+          weights_nons <- 1 / ps_nons
+          N_est_nons <- ifelse(is.null(pop_size), sum(weights[strap_nons] * weights_nons), pop_size)
+
+          for (l in 1:mu_len) {
+            mu_hats_boot[k, l] <- mu_hatIPW(
+              y = ys[[l]][strap_nons],
+              weights = weights[strap_nons],
+              weights_nons = weights_nons,
+              N = N_est_nons
+            ) # IPW estimator
+          }
+          if (verbose) {
+            # info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hats_boot[k,], sep = "")
+            # print(info)
+            utils::setTxtProgressBar(pb, k)
+          }
+          k <- k + 1
+        },
+        error = function(e) {
+          if (verbose) {
+            info <- paste("An error occurred in iteration ", k, ": ", e$message, sep = "")
+            print(info)
+          }
+        }
+      )
+    }
+  } else {
+    while (k <= num_boot) {
+      tryCatch(
+        {
+          strap <- sample.int(replace = TRUE, n = n_nons, prob = 1 / weights)
+
+          X_strap <- X_nons[strap, , drop = FALSE]
+          R_strap <- R[strap]
+          weights_strap <- weights[strap]
+
+          h_object_strap <- theta_h_estimation(
+            R = R_strap,
+            X = X_strap,
+            weights_rand = NULL,
+            weights = weights_strap,
+            h = h,
+            method_selection = method_selection,
+            maxit = maxit,
+            pop_totals = pop_totals,
+            start = start_selection
+          )
+          theta_hat_strap <- h_object_strap$theta_h
+          ps_nons <- inv_link(theta_hat_strap %*% t(X_strap))
+
+          weights_nons <- 1 / ps_nons
+          N_est_nons <- ifelse(is.null(pop_size), sum(weights_strap * weights_nons), pop_size)
+
+          for (l in 1:mu_len) {
+            mu_hats_boot[k, l] <- mu_hatIPW(
+              y = ys[[l]][strap],
+              weights = weights_strap,
+              weights_nons = weights_nons,
+              N = N_est_nons
+            ) # IPW estimator
+          }
+          if (verbose) {
+            # info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hats_boot[k], sep = "")
+            # print(info)
+            utils::setTxtProgressBar(pb, k)
+          }
+          k <- k + 1
+        },
+        error = function(e) {
+          if (verbose) {
+            info <- paste("An error occurred in iteration ", k, ": ", e$message, sep = "")
+            print(info)
+          }
+        }
+      )
+    }
+  }
+  # mu_hats_boot_means <- colMeans(mu_hats_boot)
+  # boot_var <- 1 / (num_boot - 1) * sum((mu_hats - mu_hat_boot)^2)
+  for (l in 1:mu_len) {
+    boot_vars[l] <- 1 / (num_boot - 1) * sum((mu_hats_boot[, l] - mu_hats[l])^2)
+  }
+  if (verbose) {
+    close(pb)
+  }
+  list(
+    var = boot_vars,
+    # mu = mu_hats_boot_means,
+    stat = mu_hats_boot
+  )
+}
+
+# Multicore
+#' @importFrom foreach %dopar%
+#' @importFrom foreach foreach
+#' @importFrom parallel makeCluster
+#' @importFrom parallel stopCluster
+#' @importFrom doSNOW registerDoSNOW
+bootIPW_multicore <- function(X_rand,
+                              X_nons,
+                              svydesign,
+                              weights,
+                              ys,
+                              R,
+                              theta_hat,
+                              num_boot,
+                              weights_rand,
+                              mu_hats,
+                              method_selection,
+                              start_selection,
+                              n_nons,
+                              n_rand,
+                              optim_method,
+                              est_method,
+                              h,
+                              maxit,
+                              control_selection,
+                              control_inference,
+                              cores,
+                              verbose,
+                              pop_size,
+                              pop_totals,
+                              ...) {
+  if (!is.null(weights_rand)) N <- sum(weights_rand)
+  estimation_method <- get_method(est_method)
+  method_selection_function <- paste(method_selection, "_model_nonprobsvy", sep = "")
+  method <- get_method(method_selection_function)
+  inv_link <- method$make_link_inv
+  rep_type <- control_inference$rep_type
+
+  mu_len <- length(mu_hats)
+  mu_hats_boot <- numeric(length = num_boot * mu_len)
+  boot_vars <- numeric(length = mu_len)
+
+  cl <- parallel::makeCluster(cores)
+  doSNOW::registerDoSNOW(cl)
+  on.exit(parallel::stopCluster(cl))
+  ## progress bar
+  if (verbose) {
+    pb <- progress::progress_bar$new(total = num_boot)
+    opts <- list(progress = \(n) pb$tick())
+  } else {
+    opts <- NULL
+  }
+  ###
+  parallel::clusterExport(cl = cl, varlist = c(
+    "internal_selection", "logit_model_nonprobsvy", "start_fit", "get_method", "controlSel",
+    "mle", "mu_hatIPW", "probit_model_nonprobsvy", "cloglog_model_nonprobsvy", "theta_h_estimation"
+  ))
+
+  rep_weights <- survey::as.svrepdesign(svydesign, type = rep_type, replicates = num_boot)$repweights$weights
+
+  k <- 1:num_boot
+  mu_hats_boot <- foreach::`%dopar%`(
+    obj = foreach::foreach(k = k, .combine = c, .options.snow = opts),
+    ex = {
+      if (is.null(pop_totals)) {
+        strap_nons <- sample.int(replace = TRUE, n = n_nons, prob = 1 / weights)
+
+        # using svy package
+        strap_rand_svy <- which(rep_weights[, k] != 0)
+        weights_rand_strap_svy <- rep_weights[, k] * weights_rand
+        N_strap <- sum(weights_rand_strap_svy)
+        X_rand_strap <- X_rand[strap_rand_svy, , drop = FALSE]
+        weights_strap_rand <- weights_rand_strap_svy[strap_rand_svy]
+
+        # strap_rand <- sample.int(replace = TRUE, n = n_rand, prob = 1/weights_rand)
+        # X_rand_strap <- X_rand[strap_rand, , drop = FALSE]
+
+        X_nons_strap <- X_nons[strap_nons, , drop = FALSE]
+        X <- rbind(X_rand_strap, X_nons_strap)
+        n_rand_strap <- nrow(X_rand_strap)
+
+        R_nons <- rep(1, n_nons)
+        R_rand <- rep(0, n_rand_strap)
+        R <- c(R_rand, R_nons)
+
+        model_sel <- internal_selection(
+          X = X,
+          X_nons = X_nons_strap,
+          X_rand = X_rand_strap,
+          weights = weights[strap_nons],
+          weights_rand = weights_strap_rand,
+          R = R,
+          method_selection = method_selection,
+          optim_method = optim_method,
+          h = h,
+          est_method = est_method,
+          maxit = maxit,
+          control_selection = control_selection,
+          start = start_selection
+        )
+
+        est_method_obj <- estimation_method$estimation_model(
+          model = model_sel,
+          method_selection = method_selection
+        )
+
+        ps_nons <- est_method_obj$ps_nons
+        weights_nons <- 1 / ps_nons
+        N_est_nons <- ifelse(is.null(pop_size), sum(weights[strap_nons] * weights_nons), pop_size)
+
+        # mu_hat_boot <- mu_hatIPW(
+        #   y = y[strap_nons],
+        #   weights = weights[strap_nons],
+        #   weights_nons = weights_nons,
+        #   N = N_est_nons
+        # ) # IPW estimator
+
+        mu_hats_this_boot <- numeric(mu_len)
+
+        for (l in 1:mu_len) {
+          mu_hats_this_boot[l] <- mu_hatIPW(
+            y = ys[[l]][strap_nons],
+            weights = weights[strap_nons],
+            weights_nons = weights_nons,
+            N = N_est_nons
+          ) # IPW estimator
+        }
+        mu_hats_this_boot
+      } else {
+        strap <- sample.int(replace = TRUE, n = n_nons, prob = 1 / weights)
+        X_strap <- X_nons[strap, , drop = FALSE]
+        R_strap <- R[strap]
+        weights_strap <- weights[strap]
+
+        h_object_strap <- theta_h_estimation(
+          R = R_strap,
+          X = X_strap,
+          weights_rand = NULL,
+          weights = weights_strap,
+          h = h,
+          method_selection = method_selection,
+          maxit = maxit,
+          pop_totals = pop_totals,
+          start = start_selection
+        )
+        theta_hat_strap <- h_object_strap$theta_h
+        ps_nons <- inv_link(theta_hat_strap %*% t(X_strap))
+
+        weights_nons <- 1 / ps_nons
+        N_est_nons <- ifelse(is.null(pop_size), sum(weights_strap * weights_nons), pop_size)
+
+        # mu_hat_boot <- mu_hatIPW(
+        #   y = y[strap],
+        #   weights = weights_strap,
+        #   weights_nons = weights_nons,
+        #   N = N_est_nons
+        # ) # IPW estimator
+        mu_hats_this_boot <- numeric(mu_len)
+        for (l in 1:mu_len) {
+          mu_hats_this_boot[l] <- mu_hatIPW(
+            y = ys[[l]][strap],
+            weights = weights_strap,
+            weights_nons = weights_nons,
+            N = N_est_nons
+          ) # IPW estimator
+        }
+        # return this replicate's vector of estimates; foreach collects them
+        # across workers (assigning into mu_hats_boot[k, l] here would be lost)
+        mu_hats_this_boot
+      }
+    }
+  )
+  mu_hats_boot <- matrix(mu_hats_boot, nrow = num_boot, ncol = mu_len, byrow = TRUE)
+  # mu_hats_boot_means <- colMeans(mu_hats_boot)
+  # boot_var <- 1 / (num_boot - 1) * sum((mu_hats - mu_hat_boot)^2)
+  for (l in 1:mu_len) {
+    boot_vars[l] <- 1 / (num_boot - 1) * sum((mu_hats_boot[, l] - mu_hats[l])^2)
+  }
+  list(
+    var = boot_vars,
+    # mu = mu_hats_boot_means,
+    stat = mu_hats_boot
+  )
+}
diff --git a/R/boot_mi.R b/R/boot_mi.R
new file mode 100644
index 0000000..d8a3033
--- /dev/null
+++ b/R/boot_mi.R
@@ -0,0 +1,717 @@
+# These functions are only used internally, so there is no need for documenting them
+#' @importFrom survey as.svrepdesign
+#' @importFrom nleqslv nleqslv
+#' @importFrom utils setTxtProgressBar
+#' @importFrom utils txtProgressBar
+
+bootMI <- function(X_rand,
+                   X_nons,
+                   weights,
+                   y,
+                   family_outcome,
+                   start_outcome,
+                   num_boot,
+                   weights_rand,
+                   mu_hat,
+                   svydesign,
+                   model_obj,
+                   rep_type,
+                   method,
+                   control_outcome,
+                   control_inference,
+                   pop_totals,
+                   verbose,
+                   ...) { # TODO: add methods instead of conditions
+  mu_hats <- vector(mode = "numeric", length = num_boot)
+  n_nons <- nrow(X_nons)
+  k <- 1
+  family <- family_outcome
+  if (is.character(family)) {
+    family <- get(family, mode = "function", envir = parent.frame())
+  }
+  if (is.function(family)) {
+    family <- family()
+  }
+
+  if (is.character(family_outcome)) {
+    family_nonprobsvy <- paste(family_outcome, "_nonprobsvy", sep = "")
+    family_nonprobsvy <- get(family_nonprobsvy, mode = "function", envir = parent.frame())
+    family_nonprobsvy <- family_nonprobsvy()
+  }
+
+  if (verbose) {
+    pb <- utils::txtProgressBar(min = 0, max = num_boot, style = 3)
+  }
+
+  predictive_match <- control_outcome$predictive_match
+  pmm_exact_se <- control_inference$pmm_exact_se
+  pmm_reg_engine <- control_outcome$pmm_reg_engine
+  pi_ij <- control_inference$pi_ij
+  comp2_stat <- numeric(length = num_boot)
+
+  if (is.null(pop_totals)) {
+    n_rand <- nrow(X_rand)
+    N <- sum(weights_rand)
+    if (class(svydesign)[1] != "pps") {
+      rep_weights <- survey::as.svrepdesign(svydesign, type = rep_type, replicates = num_boot)$repweights$weights
+    } else {
+      stop("pps bootstrap variance is still in development")
+    }
+    if (method == "glm") {
+      while (k <= num_boot) {
+        tryCatch(
+          {
+            strap <- sample.int(replace = TRUE, n = n_nons, prob = 1 / weights)
+            weights_strap <- weights[strap]
+            X_nons_strap <- X_nons[strap, , drop = FALSE]
+            y_strap <- y[strap]
+
+            # using svy package
+            strap_rand_svy <- which(rep_weights[, k] != 0)
+            weights_rand_strap_svy <- rep_weights[, k] * weights_rand
+            N_strap <- sum(weights_rand_strap_svy)
+            # X_rand_strap <- X_rand[which(rep_weights[,k] != 0),]
+
+            model_strap <- stats::glm.fit(
+              x = X_nons_strap,
+              y = y_strap,
+              weights = weights_strap,
+              family = family,
+              start = start_outcome
+            )
+
+            beta <- model_strap$coefficients
+            eta <- X_rand %*% beta
+            y_strap_rand <- family_nonprobsvy$linkinv(eta)
+
+            # mu_hat_boot <- mu_hatMI(ystrap_rand, weights_rand_strap_svy, N_strap)
+            mu_hat_boot <- weighted.mean(x = y_strap_rand, w = weights_rand_strap_svy)
+            mu_hats[k] <- mu_hat_boot
+            if (verbose) {
+              # info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "")
+              # print(info)
+              utils::setTxtProgressBar(pb, k)
+            }
+            k <- k + 1
+          },
+          error = function(e) {
+            if (verbose) {
+              info <- paste("An error occurred in iteration ", k, ": ", e$message, sep = "")
+              print(info)
+            }
+          }
+        )
+      }
+    } else if (method == "nn") {
+      while (k <= num_boot) {
+        tryCatch(
+          {
+            strap <- sample.int(replace = TRUE, n = n_nons, prob = 1 / weights)
+            weights_strap <- weights[strap]
+            X_nons_strap <- X_nons[strap, , drop = FALSE]
+            y_strap <- y[strap]
+
+            # strap_rand <- sample.int(replace = TRUE, n = n_rand, prob = 1/weights_rand)
+            # weights_rand_strap <- weights_rand[strap_rand]
+            # X_rand_strap <- X_rand[strap_rand, , drop = FALSE]
+            # N_strap <- sum(weights_rand_strap)
+
+            # using svy package
+            strap_rand_svy <- which(rep_weights[, k] != 0)
+            weights_rand_strap_svy <- rep_weights[, k] * weights_rand
+            N_strap <- sum(weights_rand_strap_svy)
+            X_rand_strap <- X_rand[strap_rand_svy, , drop = FALSE]
+            weights_rand_strap <- weights_rand_strap_svy[strap_rand_svy]
+
+            model_rand <- nonprobMI_nn(
+              data = X_nons_strap,
+              query = X_rand_strap,
+              k = control_outcome$k,
+              treetype = control_outcome$treetype,
+              searchtype = control_outcome$searchtype
+            )
+
+            y_rand_strap <- apply(model_rand$nn.idx, 1,
+              FUN = \(x) mean(y_strap[x])
+              # FUN=\(x) mean(sample_nonprob$short_[x])
+            )
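+            # Each row of `model_rand$nn.idx` holds the indices of the
+            # `control_outcome$k` nearest non-probability donors (in covariate
+            # space) for one unit of the resampled probability sample; the
+            # imputed value is the unweighted mean of those donors' outcomes.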
+
+            mu_hat_boot <- weighted.mean(x = y_rand_strap, w = weights_rand_strap)
+            mu_hats[k] <- mu_hat_boot
+            if (verbose) {
+              # info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "")
+              # print(info)
+              utils::setTxtProgressBar(pb, k)
+            }
+            k <- k + 1
+          },
+          error = function(e) {
+            if (verbose) {
+              info <- paste("An error occurred in iteration ", k, ": ", e$message, sep = "")
+              print(info)
+            }
+          }
+        )
+      }
+    } else if (method == "pmm") {
+      while (k <= num_boot) {
+        tryCatch(
+          {
+            strap <- sample.int(replace = TRUE, n = n_nons, prob = 1 / weights)
+            weights_strap <- weights[strap]
+            X_nons_strap <- X_nons[strap, , drop = FALSE]
+            y_strap <- y[strap]
+
+            # strap_rand <- sample.int(replace = TRUE, n = n_rand, prob = 1/weights_rand)
+            # weights_rand_strap <- weights_rand[strap_rand]
+            # X_rand_strap <- X_rand[strap_rand, , drop = FALSE]
+            # N_strap <- sum(weights_rand_strap)
+
+            # using svy package
+            strap_rand_svy <- which(rep_weights[, k] != 0)
+            weights_rand_strap_svy <- rep_weights[, k] * weights_rand
+            N_strap <- sum(weights_rand_strap_svy)
+            X_rand_strap <- X_rand[strap_rand_svy, , drop = FALSE]
+            n_rand_strap <- nrow(X_rand_strap)
+            weights_rand_strap <- weights_rand_strap_svy[strap_rand_svy]
+
+            model_strap <- stats::glm.fit(
+              x = X_nons_strap,
+              y = y_strap,
+              weights = weights_strap,
+              family = family,
+              start = start_outcome
+            )
+
+            beta <- model_strap$coefficients
+            eta_rand <- X_rand_strap %*% beta
+            eta_nons <- X_nons_strap %*% beta
+            y_rand_strap <- family_nonprobsvy$linkinv(eta_rand)
+            y_nons_strap <- family_nonprobsvy$linkinv(eta_nons)
+
+            # match on the observed outcome (option 1) or on its prediction (option 2)
+            model_rand <- switch(control_outcome$predictive_match,
+              { # 1
+                nonprobMI_nn(
+                  data = y_strap,
+                  query = y_rand_strap,
+                  k = control_outcome$k,
+                  treetype = control_outcome$treetype,
+                  searchtype = control_outcome$searchtype
+                )
+              },
+              { # 2
+                nonprobMI_nn(
+                  data = y_nons_strap,
+                  query = y_rand_strap,
+                  k = control_outcome$k,
+                  treetype = control_outcome$treetype,
+                  searchtype = control_outcome$searchtype
+                )
+              }
+            )
+
+            y_rand_strap <- apply(model_rand$nn.idx, 1,
+              FUN = \(x) mean(y_strap[x])
+              # FUN=\(x) mean(sample_nonprob$short_[x])
+            )
+
+            mu_hat_boot <- weighted.mean(x = y_rand_strap, w = weights_rand_strap)
+            mu_hats[k] <- mu_hat_boot
+            if (verbose) {
+              # info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "")
+              # print(info)
+              utils::setTxtProgressBar(pb, k)
+            }
+            # slower option
+            # if (pmm_exact_se) {
+            #   comp2 <- pmm_exact(pi_ij,
+            #                      weights_rand,
+            #                      n_nons = n_nons,
+            #                      y = y,
+            #                      pmm_reg_engine = pmm_reg_engine,
+            #                      model_obj = model_obj,
+            #                      svydesign = svydesign,
+            #                      predictive_match = predictive_match,
+            #                      k = control_inference$k,
+            #                      N = N)
+            #   comp2_stat[k] <- comp2
+            # }
+            k <- k + 1
+          },
+          error = function(e) {
+            if (verbose) {
+              info <- paste("An error occurred in iteration ", k, ": ", e$message, sep = "")
+              print(info)
+            }
+          }
+        )
+      }
+    }
+  } else {
+    N <- pop_totals[1]
+    if (method == "glm") {
+      while (k <= num_boot) {
+        tryCatch(
+          {
+            strap <- sample.int(replace = TRUE, n = n_nons, prob = 1 / weights)
+            weights_strap <- weights[strap]
+            X_nons_strap <- X_nons[strap, , drop = FALSE]
+            y_strap <- y[strap]
+
+            model_strap <- stats::glm.fit(
+              x = X_nons_strap,
+              y = y_strap,
+              weights = weights_strap,
+              family = family,
+              start = start_outcome
+            )
+
+            beta <- model_strap$coefficients
+            eta <- pop_totals %*% beta / N
+            y_strap_rand <- family_nonprobsvy$linkinv(eta)
+
+            # mu_hat_boot <- mu_hatMI(ystrap_rand, weights_rand_strap_svy, N_strap)
+            mu_hat_boot <- as.vector(y_strap_rand)
+            mu_hats[k] <- mu_hat_boot
+            if (verbose) {
+              # info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "")
+              # print(info)
+              utils::setTxtProgressBar(pb, k)
+            }
+            k <- k + 1
+          },
+          error = function(e) {
+            if (verbose) {
+              info <- paste("An error occurred in iteration ", k, ": ", e$message, sep = "")
+              print(info)
+            }
+          }
+        )
+      }
+    } else if (method == "nn") {
+      while (k <= num_boot) {
+        tryCatch(
+          {
+            strap <- sample.int(replace = TRUE, n = n_nons, prob = 1 / weights)
+            weights_strap <- weights[strap]
+            X_nons_strap <- X_nons[strap, , drop = FALSE]
+            y_strap <- y[strap]
+
+            model_rand <- nonprobMI_nn(
+              data = X_nons_strap,
+              query = t(pop_totals / N),
+              k = control_outcome$k,
+              treetype = control_outcome$treetype,
+              searchtype = control_outcome$searchtype
+            )
+            mu_hat_boot <- mean(y_strap[model_rand$nn.idx])
+            mu_hats[k] <- mu_hat_boot
+            if (verbose) {
+              # info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "")
+              # print(info)
+              utils::setTxtProgressBar(pb, k)
+            }
+            k <- k + 1
+          },
+          error = function(e) {
+            if (verbose) {
+              info <- paste("An error occurred in iteration ", k, ": ", e$message, sep = "")
+              print(info)
+            }
+          }
+        )
+      }
+    } else if (method == "pmm") {
+      while (k <= num_boot) {
+        tryCatch(
+          {
+            strap <- sample.int(replace = TRUE, n = n_nons, prob = 1 / weights)
+            weights_strap <- weights[strap]
+            X_nons_strap <- X_nons[strap, , drop = FALSE]
+            y_strap <- y[strap]
+
+            model_strap <- stats::glm.fit(
+              x = X_nons_strap,
+              y = y_strap,
+              weights = weights_strap,
+              family = family,
+              start = start_outcome
+            )
+
+            beta <- model_strap$coefficients
+            eta_rand <- pop_totals %*% beta / N
+            eta_nons <- X_nons_strap %*% beta
+            y_strap_rand <- family_nonprobsvy$linkinv(eta_rand)
+            y_strap_nons <- family_nonprobsvy$linkinv(eta_nons)
+
+            model_rand <- switch(control_outcome$predictive_match,
+              { # 1
+                nonprobMI_nn(
+                  data = y_strap,
+                  query = y_strap_rand,
+                  k = control_outcome$k,
+                  treetype = control_outcome$treetype,
+                  searchtype = control_outcome$searchtype
+                )
+              },
+              { # 2
+                nonprobMI_nn(
+                  data = y_strap_nons,
+                  query = y_strap_rand,
+                  k = control_outcome$k,
+                  treetype = control_outcome$treetype,
+                  searchtype = control_outcome$searchtype
+                )
+              }
+            )
+            #
+            # model_rand <- nonprobMI_nn(
+            #   data = y_strap_nons,
+            #   query = y_strap_rand,
+            #   k = control_outcome$k,
+            #   treetype = control_outcome$treetype,
+            #   searchtype = control_outcome$searchtype
+            # )
+
+            mu_hat_boot <- mean(y_strap[model_rand$nn.idx])
+            mu_hats[k] <- mu_hat_boot
+            if (verbose) {
+              # info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "")
+              # print(info)
+              utils::setTxtProgressBar(pb, k)
+            }
+            k <- k + 1
+          },
+          error = function(e) {
+            if (verbose) {
+              info <- paste("An error occurred in iteration ", k, ": ", e$message, sep = "")
+              print(info)
+            }
+          }
+        )
+      }
+    }
+  }
+  if (verbose) {
+    close(pb)
+  }
+  # mu_hat_boot <- mean(mu_hats)
+  if (method == "pmm") {
+    if (pmm_exact_se) {
+      # additional variance component accounting for the matching step
+      comp2 <- pmm_exact(pi_ij,
+        weights_rand,
+        n_nons = n_nons,
+        y = y,
+        pmm_reg_engine = pmm_reg_engine,
+        model_obj = model_obj,
+        svydesign = svydesign,
+        predictive_match = predictive_match,
+        k = control_inference$k,
+        N = N
+      )
+    } else {
+      comp2 <- 0
+    }
+  } else {
+    comp2 <- 0
+  }
+  boot_var <- 1 / (num_boot - 1) * sum((mu_hats - mu_hat)^2) + comp2
+  list(
+    var = boot_var,
+    # mu = mu_hat_boot,
+    stat = mu_hats,
+    comp2 = comp2
+  )
+}
+
+
+# multicore
+#' @importFrom foreach %dopar%
+#' @importFrom foreach
foreach +#' @importFrom parallel makeCluster +#' @importFrom parallel stopCluster +#' @importFrom doSNOW registerDoSNOW +bootMI_multicore <- function(X_rand, + X_nons, + weights, + y, + family_outcome, + start_outcome, + num_boot, + weights_rand, + mu_hat, + svydesign, + method, + control_outcome, + control_inference, + pop_totals, + cores, + verbose, + ...) { + # mu_hats <- vector(mode = "numeric", length = num_boot) + n_nons <- nrow(X_nons) + family <- family_outcome + if (is.character(family)) { + family <- get(family, mode = "function", envir = parent.frame()) + } + if (is.function(family)) { + family <- family() + } + rep_type <- control_inference$rep_type + + if (is.character(family_outcome)) { + family_nonprobsvy <- paste(family_outcome, "_nonprobsvy", sep = "") + family_nonprobsvy <- get(family_nonprobsvy, mode = "function", envir = parent.frame()) + family_nonprobsvy <- family_nonprobsvy() + } + + cl <- parallel::makeCluster(cores) + doSNOW::registerDoSNOW(cl) + on.exit(parallel::stopCluster(cl)) + + ## progress bar + if (verbose) { + pb <- progress::progress_bar$new(total = num_boot) + opts <- list(progress = \(n) pb$tick()) + } else { + opts <- NULL + } + ### + parallel::clusterExport(cl = cl, varlist = c( + "internal_selection", "logit_model_nonprobsvy", "start_fit", "get_method", "controlSel", + "mle", "mu_hatIPW", "probit_model_nonprobsvy", "cloglog_model_nonprobsvy", "nonprobMI_nn" + )) + + if (is.null(pop_totals)) { + n_rand <- nrow(X_rand) + N <- sum(weights_rand) + if (class(svydesign)[1] != "pps") { + rep_weights <- survey::as.svrepdesign(svydesign, type = rep_type, replicates = num_boot)$repweights$weights + } else { + stop("pps bootstrap variance in development") + } + if (method == "glm") { + k <- 1:num_boot + mu_hats <- foreach::`%dopar%`( + obj = foreach::foreach(k = k, .combine = c, .options.snow = opts), + ex = { + strap <- sample.int(replace = TRUE, n = n_nons, prob = 1 / weights) + weights_strap <- weights[strap] + X_nons_strap <- X_nons[strap, , drop = FALSE] + y_strap <- y[strap] + + # using svy package + strap_rand_svy <- which(rep_weights[, k] != 0) + weights_rand_strap_svy <- rep_weights[, k] * weights_rand + N_strap <- sum(weights_rand_strap_svy) + # X_rand_strap <- X_rand[which(rep_weights[,k] != 0),] + + model_strap <- stats::glm.fit( + x = X_nons_strap, + y = y_strap, + weights = weights_strap, + family = family, + start = start_outcome + ) + + beta <- model_strap$coefficients + eta <- X_rand %*% beta + y_strap_rand <- family_nonprobsvy$linkinv(eta) + weighted.mean(x = y_strap_rand, w = weights_rand_strap_svy) + } + ) + } else if (method == "nn") { + k <- 1:num_boot + mu_hats <- foreach::`%dopar%`( + obj = foreach::foreach(k = k, .combine = c, .options.snow = opts), + ex = { + strap <- sample.int(replace = TRUE, n = n_nons, prob = 1 / weights) + weights_strap <- weights[strap] + X_nons_strap <- X_nons[strap, , drop = FALSE] + y_strap <- y[strap] + + # strap_rand <- sample.int(replace = TRUE, n = n_rand, prob = 1/weights_rand) + # weights_rand_strap <- weights_rand[strap_rand] + # X_rand_strap <- X_rand[strap_rand, , drop = FALSE] + # N_strap <- sum(weights_rand_strap) + + # using svy package + strap_rand_svy <- which(rep_weights[, k] != 0) + weights_rand_strap_svy <- rep_weights[, k] * weights_rand + N_strap <- sum(weights_rand_strap_svy) + X_rand_strap <- X_rand[strap_rand_svy, , drop = FALSE] + weights_strap_rand <- weights_rand_strap_svy[strap_rand_svy] + + model_rand <- nonprobMI_nn( + data = X_nons_strap, + query = X_rand_strap, + k = 
control_outcome$k, + treetype = control_outcome$treetype, + searchtype = control_outcome$searchtype + ) + y_rand_strap <- apply(model_rand$nn.idx, 1, + FUN = \(x) mean(y_strap[x]) + # FUN=\(x) mean(sample_nonprob$short_[x]) + ) + weighted.mean(x = y_rand_strap, w = weights_strap_rand) + } + ) + } else if (method == "pmm") { + k <- 1:num_boot + mu_hats <- foreach::`%dopar%`( + obj = foreach::foreach(k = k, .combine = c, .options.snow = opts), + ex = { + strap <- sample.int(replace = TRUE, n = n_nons, prob = 1 / weights) + weights_strap <- weights[strap] + X_nons_strap <- X_nons[strap, , drop = FALSE] + y_strap <- y[strap] + + # strap_rand <- sample.int(replace = TRUE, n = n_rand, prob = 1/weights_rand) + # weights_rand_strap <- weights_rand[strap_rand] + # X_rand_strap <- X_rand[strap_rand, , drop = FALSE] + # N_strap <- sum(weights_rand_strap) + + # using svy package + strap_rand_svy <- which(rep_weights[, k] != 0) + weights_rand_strap_svy <- rep_weights[, k] * weights_rand + N_strap <- sum(weights_rand_strap_svy) + X_rand_strap <- X_rand[strap_rand_svy, , drop = FALSE] + weights_strap_rand <- weights_rand_strap_svy[weights_rand_strap_svy != 0] + + model_strap <- stats::glm.fit( + x = X_nons_strap, + y = y_strap, + weights = weights_strap, + family = family, + start = start_outcome + ) + + + beta <- model_strap$coefficients + eta_rand <- X_rand_strap %*% beta + eta_nons <- X_nons_strap %*% beta + y_rand_strap <- family_nonprobsvy$linkinv(eta_rand) + y_nons_strap <- family_nonprobsvy$linkinv(eta_nons) + + + model_rand <- switch(control_outcome$predictive_match, + { # 1 + nonprobMI_nn( + data = y_strap, + query = y_rand_strap, + k = control_outcome$k, + treetype = control_outcome$treetype, + searchtype = control_outcome$searchtype + ) + }, + { # 2 + nonprobMI_nn( + data = y_nons_strap, + query = y_rand_strap, + k = control_outcome$k, + treetype = control_outcome$treetype, + searchtype = control_outcome$searchtype + ) + } + ) + + y_rand_strap <- apply(model_rand$nn.idx, 1, + FUN = \(x) mean(y_strap[x]) + # FUN=\(x) mean(sample_nonprob$short_[x]) + ) + weighted.mean(x = y_rand_strap, w = weights_strap_rand) + } + ) + } + } else { + N <- pop_totals[1] + if (method == "glm") { + mu_hats <- foreach::`%dopar%`( + obj = foreach::foreach(k = 1:num_boot, .combine = c, .options.snow = opts), + ex = { + strap <- sample.int(replace = TRUE, n = n_nons, prob = 1 / weights) + weights_strap <- weights[strap] + X_nons_strap <- X_nons[strap, , drop = FALSE] + y_strap <- y[strap] + + model_strap <- stats::glm.fit( + x = X_nons_strap, + y = y_strap, + weights = weights_strap, + family = family, + start = start_outcome + ) + + beta <- model_strap$coefficients + eta <- pop_totals %*% beta / N + y_strap_rand <- family_nonprobsvy$linkinv(eta) + + # mu_hat_boot <- mu_hatMI(ystrap_rand, weights_rand_strap_svy, N_strap) + as.vector(y_strap_rand) + } + ) + } else if (method == "nn") { + mu_hats <- foreach::`%dopar%`( + obj = foreach::foreach(k = 1:num_boot, .combine = c, .options.snow = opts), + ex = { + strap <- sample.int(replace = TRUE, n = n_nons, prob = 1 / weights) + weights_strap <- weights[strap] + X_nons_strap <- X_nons[strap, , drop = FALSE] + y_strap <- y[strap] + + model_rand <- nonprobMI_nn( + data = X_nons_strap, + query = t(pop_totals / N), + k = control_outcome$k, + treetype = control_outcome$treetype, + searchtype = control_outcome$searchtype + ) + mean(y_strap[model_rand$nn.idx]) + } + ) + } else if (method == "pmm") { + mu_hats <- foreach::`%dopar%`( + obj = foreach::foreach(k = 1:num_boot, .combine 
= c, .options.snow = opts),
+        ex = {
+          strap <- sample.int(replace = TRUE, n = n_nons, prob = 1 / weights)
+          weights_strap <- weights[strap]
+          X_nons_strap <- X_nons[strap, , drop = FALSE]
+          y_strap <- y[strap]
+
+          model_strap <- stats::glm.fit(
+            x = X_nons_strap,
+            y = y_strap,
+            weights = weights_strap,
+            family = family,
+            start = start_outcome
+          )
+
+          beta <- model_strap$coefficients
+          eta_rand <- pop_totals %*% beta / N
+          eta_nons <- X_nons_strap %*% beta
+          y_strap_rand <- family_nonprobsvy$linkinv(eta_rand)
+          y_strap_nons <- family_nonprobsvy$linkinv(eta_nons)
+
+          # as in the serial version: match on the observed outcome (1) or on its prediction (2)
+          model_rand <- switch(control_outcome$predictive_match,
+            { # 1
+              nonprobMI_nn(
+                data = y_strap,
+                query = y_strap_rand,
+                k = control_outcome$k,
+                treetype = control_outcome$treetype,
+                searchtype = control_outcome$searchtype
+              )
+            },
+            { # 2
+              nonprobMI_nn(
+                data = y_strap_nons,
+                query = y_strap_rand,
+                k = control_outcome$k,
+                treetype = control_outcome$treetype,
+                searchtype = control_outcome$searchtype
+              )
+            }
+          )
+          mean(y_strap[model_rand$nn.idx])
+        }
+      )
+    }
+  }
+  # mu_hat_boot <- mean(mu_hats)
+  boot_var <- 1 / (num_boot - 1) * sum((mu_hats - mu_hat)^2)
+  list(
+    var = boot_var,
+    # mu = mu_hat_boot,
+    stat = mu_hats
+  )
+}
diff --git a/R/bootstraps.R b/R/bootstraps.R
deleted file mode 100644
index ec1c680..0000000
--- a/R/bootstraps.R
+++ /dev/null
@@ -1,1493 +0,0 @@
-# These functions are only used internally, so there is no need for documenting them
-#' @importFrom survey as.svrepdesign
-#' @importFrom nleqslv nleqslv
-
-bootMI <- function(X_rand,
-                   X_nons,
-                   weights,
-                   y,
-                   family_outcome,
-                   start_outcome,
-                   num_boot,
-                   weights_rand,
-                   mu_hat,
-                   svydesign,
-                   rep_type,
-                   method,
-                   control_outcome,
-                   control_inference,
-                   pop_totals,
-                   verbose,
-                   ...) { # TODO add methods instead of conditions
-
-  mu_hats <- vector(mode = "numeric", length = num_boot)
-  n_nons <- nrow(X_nons)
-  k <- 1
-  family <- family_outcome
-  if (is.character(family)) {
-    family <- get(family, mode = "function", envir = parent.frame())
-  }
-  if (is.function(family)) {
-    family <- family()
-  }
-
-  if (is.character(family_outcome)) {
-    family_nonprobsvy <- paste(family_outcome, "_nonprobsvy", sep = "")
-    family_nonprobsvy <- get(family_nonprobsvy, mode = "function", envir = parent.frame())
-    family_nonprobsvy <- family_nonprobsvy()
-  }
-
-  if (is.null(pop_totals)) {
-    n_rand <- nrow(X_rand)
-    N <- sum(weights_rand)
-    rep_weights <- survey::as.svrepdesign(svydesign, type = rep_type, replicates = num_boot)$repweights$weights
-    if (method == "glm") {
-
-      while (k <= num_boot) {
-        strap <- sample.int(replace = TRUE, n = n_nons, prob = 1/weights)
-        weights_strap <- weights[strap]
-        X_nons_strap <- X_nons[strap, , drop = FALSE]
-        y_strap <- y[strap]
-
-        # using svy package
-        strap_rand_svy <- which(rep_weights[, k] != 0)
-        weights_rand_strap_svy <- rep_weights[, k] * weights_rand
-        N_strap <- sum(weights_rand_strap_svy)
-        # X_rand_strap <- X_rand[which(rep_weights[,k] != 0),]
-
-        model_strap <- stats::glm.fit(
-          x = X_nons_strap,
-          y = y_strap,
-          weights = weights_strap,
-          family = family,
-          start = start_outcome
-        )
-
-        beta <- model_strap$coefficients
-        eta <- X_rand %*% beta
-        y_strap_rand <- family_nonprobsvy$linkinv(eta)
-
-        # mu_hat_boot <- mu_hatMI(ystrap_rand, weights_rand_strap_svy, N_strap)
-        mu_hat_boot <- weighted.mean(x = y_strap_rand, w = weights_rand_strap_svy)
-        mu_hats[k] <- mu_hat_boot
-        if (verbose) {
-          info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "")
-          print(info)
-        }
-        k <- k + 1
-      }
-    } else if (method == "nn") {
-      while (k <= num_boot) {
-        strap <- sample.int(replace = TRUE, n = n_nons, prob = 1/weights)
-        weights_strap <- weights[strap]
-        X_nons_strap <- X_nons[strap, , drop = FALSE]
-        y_strap <- y[strap]
-
-        #
strap_rand <- sample.int(replace = TRUE, n = n_rand, prob = 1/weights_rand) - # weights_rand_strap <- weights_rand[strap_rand] - # X_rand_strap <- X_rand[strap_rand, , drop = FALSE] - # N_strap <- sum(weights_rand_strap) - - # using svy package - strap_rand_svy <- which(rep_weights[, k] != 0) - weights_rand_strap_svy <- rep_weights[, k] * weights_rand - N_strap <- sum(weights_rand_strap_svy) - X_rand_strap <- X_rand[which(rep_weights[,k] != 0),] - weights_rand_strap <- weights_rand_strap_svy[strap_rand_svy] - - model_rand <- nonprobMI_nn( - data = X_nons_strap, - query = X_rand_strap, - k = control_outcome$k, - treetype = control_outcome$treetype, - searchtype = control_outcome$searchtype - ) - - y_rand_strap <- apply(model_rand$nn.idx, 1, - FUN = \(x) mean(y_strap[x]) - # FUN=\(x) mean(sample_nonprob$short_[x]) - ) - - mu_hat_boot <- weighted.mean(x = y_rand_strap, w = weights_rand_strap) - mu_hats[k] <- mu_hat_boot - if (verbose) { - info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "") - print(info) - } - k <- k + 1 - } - } else if (method == "pmm") { - while (k <= num_boot) { - strap <- sample.int(replace = TRUE, n = n_nons, prob = 1/weights) - weights_strap <- weights[strap] - X_nons_strap <- X_nons[strap, , drop = FALSE] - y_strap <- y[strap] - - # strap_rand <- sample.int(replace = TRUE, n = n_rand, prob = 1/weights_rand) - # weights_rand_strap <- weights_rand[strap_rand] - # X_rand_strap <- X_rand[strap_rand, , drop = FALSE] - # N_strap <- sum(weights_rand_strap) - - # using svy package - strap_rand_svy <- which(rep_weights[, k] != 0) - weights_rand_strap_svy <- rep_weights[, k] * weights_rand - N_strap <- sum(weights_rand_strap_svy) - X_rand_strap <- X_rand[which(rep_weights[,k] != 0),] - weights_rand_strap <- weights_rand_strap_svy[strap_rand_svy] - - model_strap <- stats::glm.fit( - x = X_nons_strap, - y = y_strap, - weights = weights_strap, - family = family, - start = start_outcome - ) - - beta <- model_strap$coefficients - eta_rand <- X_rand_strap %*% beta - eta_nons <- X_nons_strap %*% beta - y_rand_strap <- family_nonprobsvy$linkinv(eta_rand) - y_nons_strap <- family_nonprobsvy$linkinv(eta_nons) - - - model_rand <- nonprobMI_nn( - data = y_nons_strap, - query = y_rand_strap, - k = control_outcome$k, - treetype = control_outcome$treetype, - searchtype = control_outcome$searchtype - ) - - y_rand_strap <- apply(model_rand$nn.idx, 1, - FUN = \(x) mean(y_strap[x]) - # FUN=\(x) mean(sample_nonprob$short_[x]) - ) - - mu_hat_boot <- weighted.mean(x = y_rand_strap, w = weights_rand_strap) - mu_hats[k] <- mu_hat_boot - if (verbose) { - info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "") - print(info) - } - k <- k + 1 - } - } - } else { - N <- pop_totals[1] - if (method == "glm") { - while (k <= num_boot) { - strap <- sample.int(replace = TRUE, n = n_nons, prob = 1/weights) - weights_strap <- weights[strap] - X_nons_strap <- X_nons[strap, , drop = FALSE] - y_strap <- y[strap] - - model_strap <- stats::glm.fit( - x = X_nons_strap, - y = y_strap, - weights = weights_strap, - family = family, - start = start_outcome - ) - - beta <- model_strap$coefficients - eta <- pop_totals %*% beta / N - y_strap_rand <- family_nonprobsvy$linkinv(eta) - - # mu_hat_boot <- mu_hatMI(ystrap_rand, weights_rand_strap_svy, N_strap) - mu_hat_boot <- as.vector(y_strap_rand) - mu_hats[k] <- mu_hat_boot - if (verbose) { - info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "") - print(info) - } - k 
<- k + 1 - } - } else if (method == "nn") { - while (k <= num_boot) { - strap <- sample.int(replace = TRUE, n = n_nons, prob = 1/weights) - weights_strap <- weights[strap] - X_nons_strap <- X_nons[strap, , drop = FALSE] - y_strap <- y[strap] - - model_rand <- nonprobMI_nn( - data = X_nons_strap, - query = t(pop_totals / N), - k = control_outcome$k, - treetype = control_outcome$treetype, - searchtype = control_outcome$searchtype - ) - mu_hat_boot <- mean(y_strap[model_rand$nn.idx]) - mu_hats[k] <- mu_hat_boot - if (verbose) { - info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "") - print(info) - } - k <- k + 1 - } - } else if (method == "pmm") { - while (k <= num_boot) { - strap <- sample.int(replace = TRUE, n = n_nons, prob = 1/weights) - weights_strap <- weights[strap] - X_nons_strap <- X_nons[strap, , drop = FALSE] - y_strap <- y[strap] - - model_strap <- stats::glm.fit( - x = X_nons_strap, - y = y_strap, - weights = weights_strap, - family = family, - start = start_outcome - ) - - beta <- model_strap$coefficients - eta_rand <- pop_totals %*% beta - eta_nons <- X_nons_strap %*% beta - y_strap_rand <- family_nonprobsvy$linkinv(eta_rand) - y_strap_nons <- family_nonprobsvy$linkinv(eta_nons) - - - model_rand <- nonprobMI_nn( - data = y_strap_nons, - query = y_strap_rand, - k = control_outcome$k, - treetype = control_outcome$treetype, - searchtype = control_outcome$searchtype - ) - mu_hat_boot <- mean(y_strap[model_rand$nn.idx]) - mu_hats[k] <- mu_hat_boot - if (verbose) { - info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "") - print(info) - } - k <- k + 1 - } - } - } - mu_hat_boot <- mean(mu_hats) - boot_var <- 1 / (num_boot - 1) * sum((mu_hats - mu_hat_boot)^2) - list( - var = boot_var, - mu = mu_hat_boot - ) -} - -bootIPW <- function(X_rand, - X_nons, - svydesign, - weights, - y, - R, - theta_hat, - num_boot, - weights_rand, - mu_hat, - method_selection, - start_selection, - n_nons, - n_rand, - optim_method, - est_method, - h, - rep_type, - maxit, - control_inference, - control_selection, - verbose, - pop_size, - pop_totals, - ...) 
{ - mu_hats <- vector(mode = "numeric", length = num_boot) - if (!is.null(weights_rand)) N <- sum(weights_rand) - estimation_method <- get_method(est_method) - method_selection_function <- paste(method_selection, "_model_nonprobsvy", sep = "") - method <- get_method(method_selection_function) - inv_link <- method$make_link_inv - k <- 1 - rep_type <- control_inference$rep_type - - if (is.null(pop_totals)) { - rep_weights <- survey::as.svrepdesign(svydesign, type = rep_type, replicates = num_boot)$repweights$weights - while (k <= num_boot) { - strap_nons <- sample.int(replace = TRUE, n = n_nons, prob = 1/weights) - - # using svy package - strap_rand_svy <- which(rep_weights[, k] != 0) - weights_rand_strap_svy <- rep_weights[, k] * weights_rand - N_strap <- sum(weights_rand_strap_svy) - X_rand_strap <- X_rand[strap_rand_svy, , drop = FALSE] - weights_strap_rand <- weights_rand_strap_svy[strap_rand_svy] - - # strap_rand <- sample.int(replace = TRUE, n = n_rand, prob = 1/weights_rand) - # X_rand_strap <- X_rand[strap_rand, , drop = FALSE] - - X_nons_strap <- X_nons[strap_nons, , drop = FALSE] - X <- rbind(X_rand_strap, X_nons_strap) - n_rand_strap <- nrow(X_rand_strap) - - R_nons <- rep(1, n_nons) - R_rand <- rep(0, n_rand_strap) - R <- c(R_rand, R_nons) - - model_sel <- internal_selection( - X = X, - X_nons = X_nons_strap, - X_rand = X_rand_strap, - weights = weights[strap_nons], - weights_rand = weights_strap_rand, - R = R, - method_selection = method_selection, - optim_method = optim_method, - h = h, - est_method = est_method, - maxit = maxit, - control_selection = control_selection, - start = start_selection - ) - - est_method_obj <- estimation_method$estimation_model( - model = model_sel, - method_selection = method_selection - ) - - ps_nons <- est_method_obj$ps_nons - weights_nons <- 1 / ps_nons - N_est_nons <- ifelse(is.null(pop_size), sum(weights[strap_nons] * weights_nons), pop_size) - - mu_hat_boot <- mu_hatIPW( - y = y[strap_nons], - weights = weights[strap_nons], - weights_nons = weights_nons, - N = N_est_nons - ) # IPW estimator - mu_hats[k] <- mu_hat_boot - if (verbose) { - info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "") - print(info) - } - k <- k + 1 - } - } else { - while (k <= num_boot) { - strap <- sample.int(replace = TRUE, n = n_nons, prob = 1/weights) - - X_strap <- X_nons[strap, , drop = FALSE] - R_strap <- R[strap] - weights_strap <- weights[strap] - - h_object_strap <- theta_h_estimation( - R = R_strap, - X = X_strap, - weights_rand = NULL, - weights = weights_strap, - h = h, - method_selection = method_selection, - maxit = maxit, - pop_totals = pop_totals, - start = start_selection - ) - theta_hat_strap <- h_object_strap$theta_h - ps_nons <- inv_link(theta_hat_strap %*% t(X_strap)) - - weights_nons <- 1 / ps_nons - N_est_nons <- ifelse(is.null(pop_size), sum(weights_strap * weights_nons), pop_size) - - mu_hat_boot <- mu_hatIPW( - y = y[strap], - weights = weights_strap, - weights_nons = weights_nons, - N = N_est_nons - ) # IPW estimator - mu_hats[k] <- mu_hat_boot - if (verbose) { - info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "") - print(info) - } - k <- k + 1 - } - } - mu_hat_boot <- mean(mu_hats) - boot_var <- 1 / (num_boot - 1) * sum((mu_hats - mu_hat_boot)^2) - list( - var = boot_var, - mu = mu_hat_boot - ) -} - -bootDR <- function(outcome, - data, - svydesign, - SelectionModel, - OutcomeModel, - family_outcome, - method_outcome, - start_outcome, - num_boot, - weights, - 
weights_rand, - R, - theta_hat, - mu_hat, - method_selection, - start_selection, - control_selection, - control_outcome, - control_inference, - n_nons, - n_rand, - optim_method, - est_method, - h, - maxit, - pop_size, - pop_totals, - pop_means, - bias_correction, - verbose, - ...) { - mu_hats <- vector(mode = "numeric", length = num_boot) - k <- 1 - rep_type <- control_inference$rep_type - if (is.character(family_outcome)) { - family_nonprobsvy <- paste(family_outcome, "_nonprobsvy", sep = "") - family_nonprobsvy <- get(family_nonprobsvy, mode = "function", envir = parent.frame()) - family_nonprobsvy <- family_nonprobsvy() - } - family <- family_outcome - if (is.character(family)) { - family <- get(family, mode = "function", envir = parent.frame()) - } - if (is.function(family)) { - family <- family() - } - method_outcome_nonprobsvy <- paste(method_outcome, "_nonprobsvy", sep = "") - MethodOutcome <- get(method_outcome_nonprobsvy, mode = "function", envir = parent.frame()) - - if (bias_correction == TRUE) { - X <- rbind(SelectionModel$X_rand, SelectionModel$X_nons) - p <- ncol(X) - y_rand <- vector(mode = "numeric", length = n_rand) - y <- c(y_rand, OutcomeModel$y_nons) # outcome variable for joint model - var_obj <- bootDR_sel( - X = X, - R = R, - y = y, - svydesign = svydesign, - rep_type = rep_type, - weights = weights, - weights_rand = weights_rand, - method_selection = method_selection, - family_nonprobsvy = family_nonprobsvy, - mu_hat = mu_hat, - n_nons = n_nons, - n_rand = n_rand, - num_boot = num_boot, - start_selection = start_selection, - start_outcome = start_outcome, - verbose = verbose - ) - boot_var <- var_obj$var - mu_hat_boot <- var_obj$mu - } else { - estimation_method <- get_method(est_method) - if (is.null(pop_totals)) { - rep_weights <- survey::as.svrepdesign(svydesign, type = rep_type, replicates = num_boot)$repweights$weights - N <- sum(weights_rand) - while (k <= num_boot) { - strap_nons <- sample.int(replace = TRUE, n = n_nons, prob = 1/weights) - # strap_rand <- sample.int(replace = TRUE, n = n_rand, prob = 1/weights_rand) - - # using svy package - strap_rand_svy <- which(rep_weights[, k] != 0) - weights_rand_strap_svy <- rep_weights[, k] * weights_rand - # N_strap <- sum(weights_rand_strap_svy) - # X_rand_strap <- X_rand[strap_rand_svy, , drop = FALSE] - weights_strap_rand <- weights_rand_strap_svy[strap_rand_svy] - - model_obj <- MethodOutcome( - outcome = outcome, - data = data[strap_nons, ], - weights = weights[strap_nons], - family_outcome = family_outcome, - start_outcome = start_outcome, - X_nons = OutcomeModel$X_nons[strap_nons, , drop = FALSE], - y_nons = OutcomeModel$y_nons[strap_nons], - X_rand = OutcomeModel$X_rand[strap_rand_svy, , drop = FALSE], - control = control_outcome, - n_nons = n_nons, - n_rand = n_rand, - model_frame = OutcomeModel$model_frame_rand[strap_rand_svy, ], - vars_selection = control_inference$vars_selection, - pop_totals = pop_totals - ) - - - y_rand_pred <- model_obj$y_rand_pred - y_nons_pred <- model_obj$y_nons_pred - - X_sel <- rbind( - SelectionModel$X_rand[strap_rand_svy, , drop = FALSE], - SelectionModel$X_nons[strap_nons, , drop = FALSE] - ) - n_rand_strap <- nrow(SelectionModel$X_rand[strap_rand_svy, , drop = FALSE]) - - R_nons <- rep(1, n_nons) - R_rand <- rep(0, n_rand_strap) - R <- c(R_rand, R_nons) - - model_sel <- internal_selection( - X = X_sel, - X_nons = SelectionModel$X_nons[strap_nons, , drop = FALSE], - X_rand = SelectionModel$X_rand[strap_rand_svy, , drop = FALSE], - weights = weights[strap_nons], - weights_rand 
= weights_strap_rand, - R = R, - method_selection = method_selection, - optim_method = optim_method, - h = h, - est_method = est_method, - maxit = maxit, - control_selection = control_selection, - start = start_selection - ) - - est_method_obj <- estimation_method$estimation_model( - model = model_sel, - method_selection = method_selection - ) - ps_nons <- est_method_obj$ps_nons - weights_nons <- 1 / ps_nons - N_est_nons <- sum(weights_nons) - N_est_rand <- sum(weights_strap_rand) - - mu_hat_boot <- mu_hatDR( - y = OutcomeModel$y_nons[strap_nons], - y_nons = y_nons_pred, - y_rand = y_rand_pred, - weights = weights[strap_nons], - weights_nons = weights_nons, - weights_rand = weights_strap_rand, - N_nons = N_est_nons, - N_rand = N_est_rand - ) - mu_hats[k] <- mu_hat_boot - if (verbose) { - info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "") - print(info) - } - k <- k + 1 - } - } else { - while (k <= num_boot) { - strap <- sample.int(replace = TRUE, n = n_nons, prob = 1/weights) - X_strap_nons <- SelectionModel$X_nons[strap, , drop = FALSE] - y_strap <- OutcomeModel$y_nons[strap] - R_strap <- rep(1, n_nons) - weights_strap <- weights[strap] - n_rand <- 0 - X_strap_rand <- NULL - - h_object_strap <- theta_h_estimation( - R = R_strap, - X = X_strap_nons, - weights = weights_strap, - h = h, - method_selection = method_selection, - maxit = maxit, - pop_totals = pop_totals, - weights_rand = NULL, - start = start_selection - ) - - theta_hat_strap <- h_object_strap$theta_h - method_selection_function <- paste(method_selection, "_model_nonprobsvy", sep = "") - method <- get_method(method_selection_function) - inv_link <- method$make_link_inv - ps_nons_strap <- inv_link(theta_hat_strap %*% t(X_strap_nons)) - weights_nons_strap <- 1 / ps_nons_strap - N_est <- sum(weights_strap * weights_nons_strap) - if (is.null(pop_size)) pop_size <- N_est - - model_obj <- MethodOutcome( - outcome = outcome, - data = data[strap, , drop = FALSE], - weights = weights_strap, - family_outcome = family_outcome, - start_outcome = start_outcome, - X_nons = X_strap_nons, - y_nons = y_strap, - X_rand = X_strap_rand, - control = control_outcome, - n_nons = n_nons, - n_rand = n_rand, - model_frame = OutcomeModel$model_frame_rand, - vars_selection = control_inference$vars_selection, - pop_totals = pop_totals - ) - - y_rand_pred <- model_obj$y_rand_pred - y_nons_pred <- model_obj$y_nons_pred - - mu_hat_boot <- 1 / N_est * sum(weights_nons_strap * (weights_strap * (y_strap - y_nons_pred))) + ifelse(method_outcome == "glm", 1 / pop_size * y_rand_pred, y_rand_pred) - mu_hats[k] <- mu_hat_boot - if (verbose) { - info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "") - print(info) - } - k <- k + 1 - } - } - mu_hat_boot <- mean(mu_hats) - boot_var <- 1 / (num_boot - 1) * sum((mu_hats - mu_hat_boot)^2) - } - list( - var = boot_var, - mu = mu_hat_boot - ) -} - -bootDR_sel <- function(X, - R, - y, - svydesign, - weights, - weights_rand, - method_selection, - family_nonprobsvy, - mu_hat, - n_nons, - n_rand, - num_boot, - rep_type, - start_selection, - start_outcome, - verbose) { # TODO function to test - mu_hats <- vector(mode = "numeric", length = num_boot) - k <- 1 - loc_nons <- which(R == 1) - loc_rand <- which(R == 0) - X_nons <- X[loc_nons, , drop = FALSE] - X_rand <- X[loc_rand, , drop = FALSE] - y_nons <- y[loc_nons] - y_rand <- y[loc_rand] - - rep_weights <- survey::as.svrepdesign(svydesign, type = rep_type, replicates = num_boot)$repweights$weights - while 
(k <= num_boot) { - strap_nons <- sample.int(replace = TRUE, n = n_nons, prob = 1/weights) - # strap_rand <- sample.int(replace = TRUE, n = n_rand, prob = 1/weights_rand) - - weights_nons <- weights[strap_nons] - # weights_rand_strap <- weights_rand[strap_rand] - - # using svy package - strap_rand_svy <- which(rep_weights[, k] != 0) - weights_rand_strap_svy <- rep_weights[, k] * weights_rand - N_strap <- sum(weights_rand_strap_svy) - # X_rand_strap <- X_rand[strap_rand_svy, , drop = FALSE] - weights_strap_rand <- weights_rand_strap_svy[strap_rand_svy] - - - X_strap <- rbind(X_rand[strap_rand_svy, , drop = FALSE], X_nons[strap_nons, , drop = FALSE]) - y_strap <- c(y_rand[strap_rand_svy], y_nons[strap_nons]) - n_rand_strap <- nrow(X_rand[strap_rand_svy, , drop = FALSE]) - - R_nons <- rep(1, n_nons) - R_rand <- rep(0, n_rand_strap) - R <- c(R_rand, R_nons) - - model_strap <- mm( - X = X_strap, - y = y_strap, - weights = weights_nons, - weights_rand = weights_strap_rand, - R = R, # c(R[loc_nons][strap_nons], R[loc_rand][strap_rand]), - n_nons = n_nons, - n_rand = n_rand_strap, - method_selection = method_selection, - family = family_nonprobsvy, - start_outcome = start_outcome, - start_selection = start_selection, - boot = TRUE - ) - - weights_nons_strap <- 1 / model_strap$selection$ps_nons - N_nons <- sum(weights_nons * weights_nons_strap) - N_rand <- sum(weights_strap_rand) - - mu_hat_boot <- mu_hatDR( - y = y_nons[strap_nons], - y_nons = model_strap$outcome$y_nons_pred, - y_rand = model_strap$outcome$y_rand_pred, - weights = weights_nons, - weights_nons = weights_nons_strap, - weights_rand = weights_strap_rand, - N_nons = N_nons, - N_rand = N_rand - ) # DR estimator - mu_hats[k] <- mu_hat_boot - if (verbose) { - info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "") - print(info) - } - k <- k + 1 - } - mu_hat_boot <- mean(mu_hats) - boot_var <- 1 / (num_boot - 1) * sum((mu_hats - mu_hat_boot)^2) - list( - var = boot_var, - mu = mu_hat_boot - ) -} - -# multicore -#' @importFrom foreach %dopar% -#' @importFrom foreach foreach -#' @importFrom parallel makeCluster -#' @importFrom parallel stopCluster -#' @importFrom doParallel registerDoParallel -bootMI_multicore <- function(X_rand, - X_nons, - weights, - y, - family_outcome, - start_outcome, - num_boot, - weights_rand, - mu_hat, - svydesign, - method, - control_outcome, - control_inference, - pop_totals, - cores, - verbose, - ...) 
{ - # mu_hats <- vector(mode = "numeric", length = num_boot) - n_nons <- nrow(X_nons) - family <- family_outcome - if (is.character(family)) { - family <- get(family, mode = "function", envir = parent.frame()) - } - if (is.function(family)) { - family <- family() - } - rep_type <- control_inference$rep_type - - if (is.character(family_outcome)) { - family_nonprobsvy <- paste(family_outcome, "_nonprobsvy", sep = "") - family_nonprobsvy <- get(family_nonprobsvy, mode = "function", envir = parent.frame()) - family_nonprobsvy <- family_nonprobsvy() - } - cl <- parallel::makeCluster(cores) - doParallel::registerDoParallel(cl) - on.exit(parallel::stopCluster(cl)) - parallel::clusterExport(cl = cl, varlist = c( - "internal_selection", "logit_model_nonprobsvy", "start_fit", "get_method", "controlSel", - "mle", "mu_hatIPW", "probit_model_nonprobsvy", "cloglog_model_nonprobsvy", "nonprobMI_nn" - )) - - if (is.null(pop_totals)) { - n_rand <- nrow(X_rand) - N <- sum(weights_rand) - rep_weights <- survey::as.svrepdesign(svydesign, type = rep_type, replicates = num_boot)$repweights$weights - if (method == "glm") { - - k <- 1:num_boot - mu_hats <- foreach::`%dopar%`( - obj = foreach::foreach(k = k, .combine = c), - ex = { - strap <- sample.int(replace = TRUE, n = n_nons, prob = 1/weights) - weights_strap <- weights[strap] - X_nons_strap <- X_nons[strap, , drop = FALSE] - y_strap <- y[strap] - - # using svy package - strap_rand_svy <- which(rep_weights[, k] != 0) - weights_rand_strap_svy <- rep_weights[, k] * weights_rand - N_strap <- sum(weights_rand_strap_svy) - # X_rand_strap <- X_rand[which(rep_weights[,k] != 0),] - - model_strap <- stats::glm.fit( - x = X_nons_strap, - y = y_strap, - weights = weights_strap, - family = family, - start = start_outcome - ) - - beta <- model_strap$coefficients - eta <- X_rand %*% beta - y_strap_rand <- family_nonprobsvy$linkinv(eta) - weighted.mean(x = y_strap_rand, w = weights_rand_strap_svy) - } - ) - } else if (method == "nn") { - k <- 1:num_boot - mu_hats <- foreach::`%dopar%`( - obj = foreach::foreach(k = k, .combine = c), - ex = { - strap <- sample.int(replace = TRUE, n = n_nons, prob = 1/weights) - weights_strap <- weights[strap] - X_nons_strap <- X_nons[strap, , drop = FALSE] - y_strap <- y[strap] - - # strap_rand <- sample.int(replace = TRUE, n = n_rand, prob = 1/weights_rand) - # weights_rand_strap <- weights_rand[strap_rand] - # X_rand_strap <- X_rand[strap_rand, , drop = FALSE] - # N_strap <- sum(weights_rand_strap) - - # using svy package - strap_rand_svy <- which(rep_weights[, k] != 0) - weights_rand_strap_svy <- rep_weights[, k] * weights_rand - N_strap <- sum(weights_rand_strap_svy) - X_rand_strap <- X_rand[strap_rand_svy, , drop = FALSE] - weights_strap_rand <- weights_rand_strap_svy[strap_rand_svy] - - model_rand <- nonprobMI_nn( - data = X_nons_strap, - query = X_rand_strap, - k = control_outcome$k, - treetype = control_outcome$treetype, - searchtype = control_outcome$searchtype - ) - y_rand_strap <- apply(model_rand$nn.idx, 1, - FUN = \(x) mean(y_strap[x]) - # FUN=\(x) mean(sample_nonprob$short_[x]) - ) - weighted.mean(x = y_rand_strap, w = weights_strap_rand) - } - ) - } else if (method == "pmm") { - k <- 1:num_boot - mu_hats <- foreach::`%dopar%`( - obj = foreach::foreach(k = k, .combine = c), - ex = { - strap <- sample.int(replace = TRUE, n = n_nons, prob = 1/weights) - weights_strap <- weights[strap] - X_nons_strap <- X_nons[strap, , drop = FALSE] - y_strap <- y[strap] - - # strap_rand <- sample.int(replace = TRUE, n = n_rand, prob = 
1/weights_rand) - # weights_rand_strap <- weights_rand[strap_rand] - # X_rand_strap <- X_rand[strap_rand, , drop = FALSE] - # N_strap <- sum(weights_rand_strap) - - # using svy package - strap_rand_svy <- which(rep_weights[, k] != 0) - weights_rand_strap_svy <- rep_weights[, k] * weights_rand - N_strap <- sum(weights_rand_strap_svy) - X_rand_strap <- X_rand[strap_rand_svy, , drop = FALSE] - weights_strap_rand <- weights_rand_strap_svy[weights_rand_strap_svy!=0] - - model_strap <- stats::glm.fit( - x = X_nons_strap, - y = y_strap, - weights = weights_strap, - family = family, - start = start_outcome - ) - - beta <- model_strap$coefficients - eta_rand <- X_rand_strap %*% beta - eta_nons <- X_nons_strap %*% beta - y_rand_strap <- family_nonprobsvy$linkinv(eta_rand) - y_nons_strap <- family_nonprobsvy$linkinv(eta_nons) - - - model_rand <- nonprobMI_nn( - data = y_nons_strap, - query = y_rand_strap, - k = control_outcome$k, - treetype = control_outcome$treetype, - searchtype = control_outcome$searchtype - ) - - y_rand_strap <- apply(model_rand$nn.idx, 1, - FUN = \(x) mean(y_strap[x]) - # FUN=\(x) mean(sample_nonprob$short_[x]) - ) - weighted.mean(x = y_rand_strap, w = weights_strap_rand) - } - ) - } - } else { - N <- pop_totals[1] - if (method == "glm") { - mu_hats <- foreach::`%dopar%`( - obj = foreach::foreach(k = 1:num_boot, .combine = c), - ex = { - strap <- sample.int(replace = TRUE, n = n_nons, prob = 1/weights) - weights_strap <- weights[strap] - X_nons_strap <- X_nons[strap, , drop = FALSE] - y_strap <- y[strap] - - model_strap <- stats::glm.fit( - x = X_nons_strap, - y = y_strap, - weights = weights_strap, - family = family, - start = start_outcome - ) - - beta <- model_strap$coefficients - eta <- pop_totals %*% beta / N - y_strap_rand <- family_nonprobsvy$linkinv(eta) - - # mu_hat_boot <- mu_hatMI(ystrap_rand, weights_rand_strap_svy, N_strap) - as.vector(y_strap_rand) - } - ) - } else if (method == "nn") { - mu_hats <- foreach::`%dopar%`( - obj = foreach::foreach(k = 1:num_boot, .combine = c), - ex = { - strap <- sample.int(replace = TRUE, n = n_nons, prob = 1/weights) - weights_strap <- weights[strap] - X_nons_strap <- X_nons[strap, , drop = FALSE] - y_strap <- y[strap] - - model_rand <- nonprobMI_nn( - data = X_nons_strap, - query = t(pop_totals / N), - k = control_outcome$k, - treetype = control_outcome$treetype, - searchtype = control_outcome$searchtype - ) - mean(y_strap[model_rand$nn.idx]) - } - ) - } else if (method == "pmm") { - mu_hats <- foreach::`%dopar%`( - obj = foreach::foreach(k = 1:num_boot, .combine = c), - ex = { - strap <- sample.int(replace = TRUE, n = n_nons, prob = 1/weights) - weights_strap <- weights[strap] - X_nons_strap <- X_nons[strap, , drop = FALSE] - y_strap <- y[strap] - - model_strap <- stats::glm.fit( - x = X_nons_strap, - y = y_strap, - weights = weights_strap, - family = family, - start = start_outcome - ) - - beta <- model_strap$coefficients - eta_rand <- pop_totals %*% beta - eta_nons <- X_nons_strap %*% beta - y_strap_rand <- family_nonprobsvy$linkinv(eta_rand) - y_strap_nons <- family_nonprobsvy$linkinv(eta_nons) - - - model_rand <- nonprobMI_nn( - data = y_strap_nons, - query = y_strap_rand, - k = control_outcome$k, - treetype = control_outcome$treetype, - searchtype = control_outcome$searchtype - ) - mean(y_strap[model_rand$nn.idx]) - } - ) - } - } - mu_hat_boot <- mean(mu_hats) - boot_var <- 1 / (num_boot - 1) * sum((mu_hats - mu_hat_boot)^2) - list( - var = boot_var, - mu = mu_hat_boot - ) -} - -#' @importFrom foreach %dopar% -#' @importFrom 
foreach foreach -#' @importFrom parallel makeCluster -#' @importFrom parallel stopCluster -#' @importFrom doParallel registerDoParallel -bootIPW_multicore <- function(X_rand, - X_nons, - svydesign, - weights, - y, - R, - theta_hat, - num_boot, - weights_rand, - mu_hat, - method_selection, - start_selection, - n_nons, - n_rand, - optim_method, - est_method, - h, - maxit, - control_selection, - control_inference, - cores, - verbose, - pop_size, - pop_totals, - ...) { - if (!is.null(weights_rand)) N <- sum(weights_rand) - estimation_method <- get_method(est_method) - method_selection_function <- paste(method_selection, "_model_nonprobsvy", sep = "") - method <- get_method(method_selection_function) - inv_link <- method$make_link_inv - rep_type <- control_inference$rep_type - - cl <- parallel::makeCluster(cores) - doParallel::registerDoParallel(cl) - on.exit(parallel::stopCluster(cl)) - parallel::clusterExport(cl = cl, varlist = c( - "internal_selection", "logit_model_nonprobsvy", "start_fit", "get_method", "controlSel", - "mle", "mu_hatIPW", "probit_model_nonprobsvy", "cloglog_model_nonprobsvy", "theta_h_estimation" - )) - - rep_weights <- survey::as.svrepdesign(svydesign, type = rep_type, replicates = num_boot)$repweights$weights - - k <- 1:num_boot - mu_hats <- foreach::`%dopar%`( - obj = foreach::foreach(k = k, .combine = c), - ex = { - if (is.null(pop_totals)) { - strap_nons <- sample.int(replace = TRUE, n = n_nons, prob = 1/weights) - - # using svy package - strap_rand_svy <- which(rep_weights[, k] != 0) - weights_rand_strap_svy <- rep_weights[, k] * weights_rand - N_strap <- sum(weights_rand_strap_svy) - X_rand_strap <- X_rand[strap_rand_svy, , drop = FALSE] - weights_strap_rand <- weights_rand_strap_svy[strap_rand_svy] - - # strap_rand <- sample.int(replace = TRUE, n = n_rand, prob = 1/weights_rand) - # X_rand_strap <- X_rand[strap_rand, , drop = FALSE] - - X_nons_strap <- X_nons[strap_nons, , drop = FALSE] - X <- rbind(X_rand_strap, X_nons_strap) - n_rand_strap <- nrow(X_rand_strap) - - R_nons <- rep(1, n_nons) - R_rand <- rep(0, n_rand_strap) - R <- c(R_rand, R_nons) - - model_sel <- internal_selection( - X = X, - X_nons = X_nons_strap, - X_rand = X_rand_strap, - weights = weights[strap_nons], - weights_rand = weights_strap_rand, - R = R, - method_selection = method_selection, - optim_method = optim_method, - h = h, - est_method = est_method, - maxit = maxit, - control_selection = control_selection, - start = start_selection - ) - - est_method_obj <- estimation_method$estimation_model( - model = model_sel, - method_selection = method_selection - ) - - ps_nons <- est_method_obj$ps_nons - weights_nons <- 1 / ps_nons - N_est_nons <- ifelse(is.null(pop_size), sum(weights[strap_nons] * weights_nons), pop_size) - - mu_hat_boot <- mu_hatIPW( - y = y[strap_nons], - weights = weights[strap_nons], - weights_nons = weights_nons, - N = N_est_nons - ) # IPW estimator - } else { - strap <- sample.int(replace = TRUE, n = n_nons, prob = 1/weights) - X_strap <- X_nons[strap, , drop = FALSE] - R_strap <- R[strap] - weights_strap <- weights[strap] - - h_object_strap <- theta_h_estimation( - R = R_strap, - X = X_strap, - weights_rand = NULL, - weights = weights_strap, - h = h, - method_selection = method_selection, - maxit = maxit, - pop_totals = pop_totals, - start = start_selection - ) - theta_hat_strap <- h_object_strap$theta_h - ps_nons <- inv_link(theta_hat_strap %*% t(X_strap)) - - weights_nons <- 1 / ps_nons - N_est_nons <- ifelse(is.null(pop_size), sum(weights_strap * weights_nons), pop_size) - 
- mu_hat_boot <- mu_hatIPW( - y = y[strap], - weights = weights_strap, - weights_nons = weights_nons, - N = N_est_nons - ) # IPW estimator - } - mu_hat_boot - } - ) - mu_hat_boot <- mean(mu_hats) - boot_var <- 1 / (num_boot - 1) * sum((mu_hats - mu_hat_boot)^2) - list( - var = boot_var, - mu = mu_hat_boot - ) -} - -#' @importFrom foreach %dopar% -#' @importFrom foreach foreach -#' @importFrom parallel makeCluster -#' @importFrom parallel stopCluster -#' @importFrom doParallel registerDoParallel -bootDR_multicore <- function(outcome, - data, - svydesign, - SelectionModel, - OutcomeModel, - family_outcome, - method_outcome, - start_outcome, - num_boot, - weights, - weights_rand, - R, - theta_hat, - mu_hat, - method_selection, - control_selection, - start_selection, - control_outcome, - control_inference, - n_nons, - n_rand, - optim_method, - est_method, - h, - maxit, - pop_size, - pop_totals, - pop_means, - bias_correction, - cores, - verbose, - ...) { - # mu_hats <- vector(mode = "numeric", length = num_boot) - # k <- 1 - if (is.character(family_outcome)) { - family_nonprobsvy <- paste(family_outcome, "_nonprobsvy", sep = "") - family_nonprobsvy <- get(family_nonprobsvy, mode = "function", envir = parent.frame()) - family_nonprobsvy <- family_nonprobsvy() - } - family <- family_outcome - if (is.character(family)) { - family <- get(family, mode = "function", envir = parent.frame()) - } - if (is.function(family)) { - family <- family() - } - rep_type <- control_inference$rep_type - method_outcome_nonprobsvy <- paste(method_outcome, "_nonprobsvy", sep = "") - MethodOutcome <- get(method_outcome_nonprobsvy, mode = "function", envir = parent.frame()) - - if (bias_correction == TRUE) { - X <- rbind(SelectionModel$X_rand, SelectionModel$X_nons) - p <- ncol(X) - y_rand <- vector(mode = "numeric", length = n_rand) - y <- c(y_rand, OutcomeModel$y_nons) # outcome variable for joint model - var_obj <- bootDR_sel_multicore( - X = X, - R = R, - y = y, - svydesign = svydesign, - rep_type = rep_type, - weights = weights, - weights_rand = weights_rand, - method_selection = method_selection, - family_nonprobsvy = family_nonprobsvy, - mu_hat = mu_hat, - n_nons = n_nons, - n_rand = n_rand, - num_boot = num_boot, - start_selection = start_selection, - start_outcome = start_outcome, - cores = cores - ) - boot_var <- var_obj$var - mu_hat_boot <- var_obj$mu - } else { - rep_weights <- survey::as.svrepdesign(svydesign, type = rep_type, replicates = num_boot)$repweights$weights - cl <- parallel::makeCluster(cores) - doParallel::registerDoParallel(cl) - on.exit(parallel::stopCluster(cl)) - parallel::clusterExport(cl = cl, varlist = c( - "internal_selection", "internal_outcome", "logit_model_nonprobsvy", "start_fit", "get_method", "controlSel", "theta_h_estimation", - "mle", "mu_hatDR", "probit_model_nonprobsvy", "cloglog_model_nonprobsvy", "glm_nonprobsvy", "nn_nonprobsvy", "pmm_nonprobsvy", - "gaussian_nonprobsvy", "poisson_nonprobsvy", "binomial_nonprobsvy", "nonprobMI_fit", "controlOut" - )) - if (is.null(pop_totals)) { - N <- sum(weights_rand) - - k <- 1:num_boot - mu_hats <- foreach::`%dopar%`( - obj = foreach::foreach(k = k, .combine = c), - ex = { - estimation_method <- get_method(est_method) - strap_nons <- sample.int(replace = TRUE, n = n_nons, prob = 1/weights) - # strap_rand <- sample.int(replace = TRUE, n = n_rand, prob = 1/weights_rand) - - # using svy package - strap_rand_svy <- which(rep_weights[, k] != 0) - weights_rand_strap_svy <- rep_weights[, k] * weights_rand - # N_strap <- 
sum(weights_rand_strap_svy) - # X_rand_strap <- X_rand[strap_rand_svy, , drop = FALSE] - weights_strap_rand <- weights_rand_strap_svy[strap_rand_svy] - - model_obj <- MethodOutcome( - outcome = outcome, - data = data[strap_nons, ], - weights = weights[strap_nons], - family_outcome = family_outcome, - start_outcome = start_outcome, - X_nons = OutcomeModel$X_nons[strap_nons, , drop = FALSE], - y_nons = OutcomeModel$y_nons[strap_nons], - X_rand = OutcomeModel$X_rand[strap_rand_svy, , drop = FALSE], - control = control_outcome, - n_nons = n_nons, - n_rand = n_rand, - model_frame = OutcomeModel$model_frame_rand[strap_rand_svy, ], - vars_selection = control_inference$vars_selection, - pop_totals = pop_totals - ) - - - y_rand_pred <- model_obj$y_rand_pred - y_nons_pred <- model_obj$y_nons_pred - - X_sel <- rbind( - SelectionModel$X_rand[strap_rand_svy, , drop = FALSE], - SelectionModel$X_nons[strap_nons, , drop = FALSE] - ) - n_rand_strap <- nrow(SelectionModel$X_rand[strap_rand_svy, , drop = FALSE]) - - R_nons <- rep(1, n_nons) - R_rand <- rep(0, n_rand_strap) - R <- c(R_rand, R_nons) - - model_sel <- internal_selection( - X = X_sel, - X_nons = SelectionModel$X_nons[strap_nons, , drop = FALSE], - X_rand = SelectionModel$X_rand[strap_rand_svy, , drop = FALSE], - weights = weights[strap_nons], - weights_rand = weights_strap_rand, - R = R, - method_selection = method_selection, - optim_method = optim_method, - h = h, - est_method = est_method, - maxit = maxit, - control_selection = control_selection, - start = start_selection - ) - - est_method_obj <- estimation_method$estimation_model( - model = model_sel, - method_selection = method_selection - ) - ps_nons <- est_method_obj$ps_nons - weights_nons <- 1 / ps_nons - N_est_nons <- sum(weights_nons) - N_est_rand <- sum(weights_strap_rand) - - mu_hatDR( - y = OutcomeModel$y_nons[strap_nons], - y_nons = y_nons_pred, - y_rand = y_rand_pred, - weights = weights[strap_nons], - weights_nons = weights_nons, - weights_rand = weights_strap_rand, - N_nons = N_est_nons, - N_rand = N_est_rand - ) - } - ) - } else { - k <- 1:num_boot - mu_hats <- foreach::`%dopar%`( - obj = foreach::foreach(k = k, .combine = c), - ex = { - strap <- sample.int(replace = TRUE, n = n_nons, prob = 1/weights) - X_nons_strap <- SelectionModel$X_nons[strap, , drop = FALSE] - y_strap <- OutcomeModel$y_nons[strap] - R_strap <- rep(1, n_nons) - weights_strap <- weights[strap] - X_rand_strap <- NULL - - h_object_strap <- theta_h_estimation( - R = R_strap, - X = X_nons_strap, - weights = weights_strap, - h = h, - method_selection = method_selection, - maxit = maxit, - pop_totals = pop_totals, - start = start_selection, - weights_rand = NULL - ) - - theta_hat_strap <- h_object_strap$theta_h - method_selection_function <- paste(method_selection, "_model_nonprobsvy", sep = "") - method <- get_method(method_selection_function) - inv_link <- method$make_link_inv - ps_nons_strap <- inv_link(theta_hat_strap %*% t(X_nons_strap)) - weights_nons_strap <- 1 / ps_nons_strap - N_est <- sum(weights_strap * weights_nons_strap) - if (is.null(pop_size)) pop_size <- N_est - - model_obj <- MethodOutcome( - outcome = outcome, - data = data[strap, , drop = FALSE], - weights = weights_strap, - family_outcome = family_outcome, - start_outcome = start_outcome, - X_nons = X_nons_strap, - y_nons = y_strap, - X_rand = X_rand_strap, - control = control_outcome, - n_nons = n_nons, - n_rand = n_rand, - model_frame = OutcomeModel$model_frame_rand, - vars_selection = control_inference$vars_selection, - pop_totals = 
pop_totals - ) - - y_rand_pred <- model_obj$y_rand_pred - y_nons_pred <- model_obj$y_nons_pred - - mu_hat_boot <- 1 / N_est * sum(weights_nons_strap * (weights_strap * (y_strap - y_nons_pred))) + ifelse(method_outcome == "glm", 1 / pop_size * y_rand_pred, y_rand_pred) - mu_hat_boot - } - ) - } - mu_hat_boot <- mean(mu_hats) - boot_var <- 1 / (num_boot - 1) * sum((mu_hats - mu_hat)^2) - } - list( - var = boot_var, - mu = mu_hat_boot - ) -} - -# multicore -#' @importFrom foreach %dopar% -#' @importFrom foreach foreach -#' @importFrom parallel makeCluster -#' @importFrom parallel stopCluster -#' @importFrom doParallel registerDoParallel -bootDR_sel_multicore <- function(X, - svydesign, - R, - y, - weights, - weights_rand, - method_selection, - family_nonprobsvy, - mu_hat, - n_nons, - n_rand, - num_boot, - rep_type, - start_selection, - start_outcome, - cores, - verbose) { # TODO function to test - mu_hats <- vector(mode = "numeric", length = num_boot) - loc_nons <- which(R == 1) - loc_rand <- which(R == 0) - X_nons <- X[loc_nons, , drop = FALSE] - X_rand <- X[loc_rand, , drop = FALSE] - y_nons <- y[loc_nons] - y_rand <- y[loc_rand] - - rep_weights <- survey::as.svrepdesign(svydesign, type = rep_type, replicates = num_boot)$repweights$weights - - cl <- parallel::makeCluster(cores) - doParallel::registerDoParallel(cl) - on.exit(parallel::stopCluster(cl)) - parallel::clusterExport(cl = cl, varlist = c( - "internal_selection", "logit_model_nonprobsvy", "start_fit", "get_method", "controlSel", "mle", - "probit_model_nonprobsvy", "cloglog_model_nonprobsvy", "mm", "u_theta_beta_dr", - "mu_hatDR" - )) - - k <- 1:num_boot - mu_hats <- foreach::`%dopar%`( - obj = foreach::foreach(k = k, .combine = c), - ex = { - strap_nons <- sample.int(replace = TRUE, n = n_nons, prob = 1/weights) - # strap_rand <- sample.int(replace = TRUE, n = n_rand, prob = 1/weights_rand) - - weights_strap <- weights[strap_nons] - # weights_rand_strap <- weights_rand[strap_rand] - - # using svy package - strap_rand_svy <- which(rep_weights[, k] != 0) - weights_rand_strap_svy <- rep_weights[, k] * weights_rand - # N_strap <- sum(weights_rand_strap_svy) - # X_rand_strap <- X_rand[strap_rand_svy, , drop = FALSE] - weights_strap_rand <- weights_rand_strap_svy[strap_rand_svy] - - X_strap <- rbind(X_rand[strap_rand_svy, , drop = FALSE], X_nons[strap_nons, , drop = FALSE]) - y_strap <- c(y_rand[strap_rand_svy], y_nons[strap_nons]) - n_rand_strap <- nrow(X_rand[strap_rand_svy, , drop = FALSE]) - - R_nons <- rep(1, n_nons) - R_rand <- rep(0, n_rand_strap) - R <- c(R_rand, R_nons) - - model_strap <- mm( - X = X_strap, - y = y_strap, - weights = weights_strap, - weights_rand = weights_strap_rand, - R = R, # c(R[loc_nons][strap_nons], R[loc_rand][strap_rand]), - n_nons = n_nons, - n_rand = n_rand_strap, - method_selection = method_selection, - family = family_nonprobsvy, - start_selection = start_selection, - start_outcome = start_outcome, - boot = TRUE - ) - - weights_nons_strap <- 1 / model_strap$selection$ps_nons - N_nons <- sum(weights_strap * weights_nons_strap) - N_rand <- sum(weights_strap_rand) - - mu_hatDR( - y = y_nons[strap_nons], - y_nons = model_strap$outcome$y_nons_pred, - y_rand = model_strap$outcome$y_rand_pred, - weights = weights_strap, - weights_nons = weights_nons_strap, - weights_rand = weights_strap_rand, - N_nons = N_nons, - N_rand = N_rand - ) # DR estimator - } - ) - mu_hat_boot <- mean(mu_hats) - boot_var <- 1 / (num_boot - 1) * sum((mu_hats - mu_hat_boot)^2) - list( - var = boot_var, - mu = mu_hat_boot - ) -} diff 
--git a/R/cloglogModel.R b/R/cloglogModel.R index 4be059a..200820b 100644 --- a/R/cloglogModel.R +++ b/R/cloglogModel.R @@ -267,9 +267,13 @@ cloglog_model_nonprobsvy <- function(...) { V2 } - b_vec_ipw <- function(y, mu, ps, psd = NULL, eta = NULL, X, hess, pop_size, weights) { # TODO to fix + b_vec_ipw <- function(y, mu, ps, X, hess, pop_size, weights, verbose, psd = NULL, eta = NULL) { - hess_inv_neg <- solve(-hess) + hess_inv_neg <- try(solve(-hess), silent = TRUE) + if(inherits(hess_inv_neg, "try-error")){ + if(verbose) message("solve() failed, using ginv() instead.") + hess_inv_neg <- MASS::ginv(-hess) + } # print(mean(-((1 - ps)/ps^2 * log(1 - ps) * weights * (y - mu)))) if (is.null(pop_size)) { b <- -((1 - ps) / ps^2 * exp(eta) * weights * (y - mu)) %*% X %*% hess_inv_neg # TODO opposite sign here (?) @@ -282,8 +286,12 @@ cloglog_model_nonprobsvy <- function(...) { list(b = b) } - b_vec_dr <- function(ps, psd, eta, y, y_pred, mu, h_n, X, hess, weights) { - hess_inv <- solve(hess) + b_vec_dr <- function(ps, psd, eta, y, y_pred, mu, h_n, X, hess, weights, verbose) { + hess_inv <- try(solve(hess), silent = TRUE) + if(inherits(hess_inv, "try-error")){ + if(verbose) message("solve() failed, using ginv() instead.") + hess_inv <- MASS::ginv(hess) + } (((1 - ps) / ps^2) * weights * (y - y_pred - h_n) * exp(eta)) %*% X %*% hess_inv } diff --git a/R/control.R b/R/control.R deleted file mode 100644 index 1d80021..0000000 --- a/R/control.R +++ /dev/null @@ -1,194 +0,0 @@ -#' @title Control parameters for selection model -#' @author Łukasz Chrostowski, Maciej Beręsewicz -#' @description \code{controlSel} constructs a list with all necessary control parameters -#' for selection model. -#' -#' \loadmathjax -#' -#' @param method estimation method. -#' @param epsilon Tolerance for fitting algorithms by default \code{1e-6}. -#' @param maxit Maximum number of iterations. -#' @param trace logical value. If `TRUE` trace steps of the fitting algorithms. Default is `FALSE` -#' @param optimizer - optimization function for maximum likelihood estimation. -#' @param optim_method maximisation method that will be passed to [stats::optim()] function. Default is `BFGS`. -#' @param maxLik_method maximisation method that will be passed to [maxLik::maxLik()] function. Default is `NR`. -#' @param dependence logical value - `TRUE` if samples are dependent. -#' @param key binary key variable -#' @param est_method_sel Method of estimation for propensity score model. -#' @param h Smooth function for the generalized estimating equations methods taking the following values -#' \itemize{ -#' \item if \code{1} then \mjseqn{\mathbf{h}\left(\mathbf{x}, \boldsymbol{\theta}\right) = -#' \frac{\pi(\mathbf{x}, \boldsymbol{\theta})}{\mathbf{x}}} -#' \item if \code{2} then \mjseqn{ \mathbf{h}\left(\mathbf{x}, \boldsymbol{\theta}\right) = \mathbf{x}} -#' } -#' @param penalty The penanlization function used during variables selection. -#' @param a_SCAD The tuning parameter of the SCAD penalty for selection model. Default is 3.7. -#' @param a_MCP The tuning parameter of the MCP penalty for selection model. Default is 3. -#' @param lambda A user-specified \mjseqn{\lambda} value during variable selection model fitting. -#' @param lambda_min The smallest value for lambda, as a fraction of `lambda.max`. Default is .001. -#' @param nlambda The number of `lambda` values. Default is 50. -#' @param nfolds The number of folds for cross validation. Default is 10. 
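# The cloglogModel.R hunk above guards the Hessian inversion: solve() on a
# numerically singular matrix now falls back to the Moore-Penrose
# pseudo-inverse. A minimal, self-contained sketch of the same pattern (the
# helper name `inv_or_ginv` is illustrative, not part of the package):
inv_or_ginv <- function(mat, verbose = FALSE) {
  inv <- try(solve(mat), silent = TRUE)
  if (inherits(inv, "try-error")) {
    if (verbose) message("solve() failed, using ginv() instead.")
    inv <- MASS::ginv(mat) # pseudo-inverse always exists
  }
  inv
}
# usage: an exactly singular matrix triggers the fallback
inv_or_ginv(matrix(1, nrow = 2, ncol = 2), verbose = TRUE)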
-#' @param print_level this argument determines the level of printing which is done during the optimization (for propensity score model) process. -#' @param start_type - Type of method for start points for model fitting taking the following values -#' \itemize{ -#' \item if \code{glm} then start taken from the glm function called on samples. -#' \item if \code{naive} then start consists of a vector which has the value of an estimated parameter for one-dimensional data (on intercept) and 0 for the rest. -#' } -#' -#' @return List with selected parameters. -#' -#' @seealso -#' -#' [nonprob()] -- for fitting procedure with non-probability samples. -#' -#' @export - -controlSel <- function(method = "glm.fit", # perhaps another control function for model with variables selection - epsilon = 1e-4, - maxit = 500, - trace = FALSE, - optimizer = c("maxLik", "optim"), - maxLik_method = "NR", - optim_method = "BFGS", - dependence = FALSE, - key = NULL, - est_method_sel = c("mle", "gee"), - h = c(1, 2), - penalty = c("SCAD", "lasso", "MCP"), - a_SCAD = 3.7, - a_MCP = 3, - lambda = -1, - lambda_min = .001, - nlambda = 50, - nfolds = 10, - print_level = 0, - start_type = c("glm", "naive")) { - list( - epsilon = epsilon, - maxit = maxit, - trace = trace, - optimizer = if (missing(optimizer)) "optim" else optimizer, - maxLik_method = maxLik_method, - optim_method = optim_method, - dependence = dependence, - key = key, - est_method_sel = if (missing(est_method_sel)) "mle" else est_method_sel, - h = if (missing(h)) 1 else h, - penalty = if (missing(penalty)) "SCAD" else penalty, - a_SCAD = a_SCAD, - a_MCP = a_MCP, - lambda_min = lambda_min, - nlambda = nlambda, - nfolds = nfolds, - lambda = lambda, - print_level = print_level, - start_type = if (missing(start_type)) "naive" else start_type - ) -} - -#' @title Control parameters for outcome model -#' @description \code{controlOut} constructs a list with all necessary control parameters -#' for outcome model. -#' @param epsilon Tolerance for fitting algorithms. Default is \code{1e-6}. -#' @param maxit Maximum number of iterations. -#' @param trace logical value. If `TRUE` trace steps of the fitting algorithms. Default is `FALSE`. -#' @param k The k parameter in the [RANN::nn2()] function. Default is 5. -#' @param penalty penalty algorithm for variable selection. Default is `SCAD` -#' @param a_SCAD The tuning parameter of the SCAD penalty for outcome model. Default is 3.7. -#' @param a_MCP The tuning parameter of the MCP penalty for outcome model. Default is 3. -#' @param lambda_min The smallest value for lambda, as a fraction of lambda.max. Default is .001. -#' @param nlambda The number of lambda values. Default is 100. -#' @param nfolds The number of folds during cross-validation for variables selection model. -#' @param treetype type of tree for nearest neighbour imputation passed to [RANN::nn2()] function. -#' @param searchtype type of search for nearest neighbour imputation passed to [RANN::nn2()] function. -#' -#' @return List with selected parameters. -#' -#' @seealso -#' -#' [nonprob()] -- for fitting procedure with non-probability samples. 
-#' -#' -#' @export - -controlOut <- function(epsilon = 1e-4, - maxit = 100, - trace = FALSE, - k = 1, - penalty = c("SCAD", "lasso", "MCP"), - a_SCAD = 3.7, - a_MCP = 3, - lambda_min = .001, - nlambda = 100, - nfolds = 10, - treetype = "kd", - searchtype = "standard") { - list( - epsilon = epsilon, - maxit = maxit, - trace = trace, - k = k, - penalty = if (missing(penalty)) "SCAD" else penalty, - a_SCAD = a_SCAD, - a_MCP = a_MCP, - lambda_min = lambda_min, - nlambda = nlambda, - nfolds = nfolds, - treetype = treetype, - searchtype = searchtype - ) -} - - -#' @title Control parameters for inference -#' @description \code{controlInf} constructs a list with all necessary control parameters -#' for statistical inference. -#' @param vars_selection If `TRUE`, then variables selection model is used. -#' @param var_method variance method. -#' @param rep_type replication type for weights in the bootstrap method for variance estimation passed to [survey::as.svrepdesign()]. -#' Default is `subbootstrap`. -#' @param bias_inf inference method in the bias minimization. -#' \itemize{ -#' \item if \code{union} then final model is fitting on union of selected variables for selection and outcome models -#' \item if \code{div} then final model is fitting separately on division of selected variables into relevant ones for -#' selection and outcome model. -#' } -#' @param bias_correction if `TRUE`, then bias minimization estimation used during fitting the model. -#' @param num_boot number of iteration for bootstrap algorithms. -#' @param alpha Significance level, Default is 0.05. -#' @param cores Number of cores in parallel computing. -#' -#' -#' @return List with selected parameters. -#' -#' @seealso -#' -#' [nonprob()] -- for fitting procedure with non-probability samples. -#' -#' @export - -controlInf <- function(vars_selection = FALSE, - var_method = c( - "analytic", - "bootstrap" - ), - rep_type = c( - "auto", "JK1", "JKn", "BRR", "bootstrap", - "subbootstrap", "mrbbootstrap", "Fay" - ), - bias_inf = c("union", "div"), - num_boot = 500, - bias_correction = FALSE, - alpha = 0.05, - cores = 1) { - list( - vars_selection = if (missing(vars_selection)) FALSE else vars_selection, - var_method = if (missing(var_method)) "analytic" else var_method, - rep_type = if (missing(rep_type)) "subbootstrap" else rep_type, - bias_inf = if (missing(bias_inf)) "union" else bias_inf, - bias_correction = bias_correction, - num_boot = num_boot, - alpha = alpha, - cores = cores - ) -} diff --git a/R/control_inference.R b/R/control_inference.R new file mode 100644 index 0000000..3da3f13 --- /dev/null +++ b/R/control_inference.R @@ -0,0 +1,78 @@ +#' @title Control parameters for inference +#' @description \code{controlInf} constructs a list with all necessary control parameters +#' for statistical inference. +#' @param vars_selection If `TRUE`, then variables selection model is used. +#' @param var_method variance method. +#' @param rep_type replication type for weights in the bootstrap method for variance estimation passed to [survey::as.svrepdesign()]. +#' Default is `subbootstrap`. +#' @param bias_inf inference method in the bias minimization. +#' \itemize{ +#' \item if \code{union} then final model is fitting on union of selected variables for selection and outcome models +#' \item if \code{div} then final model is fitting separately on division of selected variables into relevant ones for +#' selection and outcome model. 
+#' }
+#' @param bias_correction if `TRUE`, then bias minimization estimation is used during model fitting.
+#' @param num_boot number of iterations for bootstrap algorithms.
+#' @param alpha Significance level. Default is 0.05.
+#' @param cores Number of cores in parallel computing.
+#' @param keep_boot Logical indicating whether statistics from the bootstrap should be kept.
+#' By default set to \code{TRUE}.
+#' @param pmm_exact_se Logical value indicating whether to compute the exact
+#' standard error estimate for the \code{pmm} estimator. The variance estimator for
+#' estimation based on \code{pmm} can be decomposed into three parts, with the
+#' third computed using the covariance between imputed values for units in the
+#' probability sample using predictive matches from the non-probability sample.
+#' In most situations this term is negligible and very computationally
+#' expensive, so by default this is set to \code{FALSE}; it is recommended,
+#' however, to set this value to \code{TRUE} before submitting final results.
+#' @param pi_ij TODO, either matrix or \code{ppsmat} class object.
+#'
+#'
+#' @return List with selected parameters.
+#'
+#' @seealso
+#'
+#' [nonprob()] -- for fitting procedure with non-probability samples.
+#'
+#' @export
+
+controlInf <- function(vars_selection = FALSE,
+                       var_method = c(
+                         "analytic",
+                         "bootstrap"
+                       ),
+                       rep_type = c(
+                         "auto", "JK1", "JKn", "BRR", "bootstrap",
+                         "subbootstrap", "mrbbootstrap", "Fay"
+                       ),
+                       bias_inf = c("union", "div"),
+                       num_boot = 500,
+                       bias_correction = FALSE,
+                       alpha = 0.05,
+                       cores = 1,
+                       keep_boot,
+                       pmm_exact_se = FALSE,
+                       pi_ij) {
+  list(
+    vars_selection = if (missing(vars_selection)) FALSE else vars_selection,
+    var_method = if (missing(var_method)) "analytic" else var_method,
+    rep_type = if (missing(rep_type)) "subbootstrap" else rep_type,
+    bias_inf = if (missing(bias_inf)) "union" else bias_inf,
+    bias_correction = bias_correction,
+    num_boot = num_boot,
+    alpha = alpha,
+    cores = cores,
+    keep_boot = if (missing(keep_boot)) {
+      TRUE
+    } else {
+      if (!is.logical(keep_boot)) {
+        stop("keep_boot argument for controlInf must be logical")
+      } else {
+        keep_boot
+      }
+    },
+    # pmm_exact_se must be a logical scalar
+    pmm_exact_se = if (!is.logical(pmm_exact_se) || length(pmm_exact_se) != 1) {
+      stop("Argument pmm_exact_se must be a logical scalar")
+    } else {
+      pmm_exact_se
+    },
+    pi_ij = if (missing(pi_ij)) NULL else pi_ij
+  )
+}
diff --git a/R/control_outcome.R b/R/control_outcome.R
new file mode 100644
index 0000000..e37d010
--- /dev/null
+++ b/R/control_outcome.R
@@ -0,0 +1,89 @@
+#' @title Control parameters for outcome model
+#' @description \code{controlOut} constructs a list with all necessary control parameters
+#' for outcome model.
+#' @param epsilon Tolerance for fitting algorithms. Default is \code{1e-4}.
+#' @param maxit Maximum number of iterations.
+#' @param trace logical value. If `TRUE`, trace steps of the fitting algorithms. Default is `FALSE`.
+#' @param k The k parameter in the [RANN::nn2()] function. Default is 1.
+#' @param penalty penalty algorithm for variable selection. Default is `SCAD`.
+#' @param a_SCAD The tuning parameter of the SCAD penalty for outcome model. Default is 3.7.
+#' @param a_MCP The tuning parameter of the MCP penalty for outcome model. Default is 3.
+#' @param lambda_min The smallest value for lambda, as a fraction of lambda.max. Default is .001.
+#' @param nlambda The number of lambda values. Default is 100.
+#' @param nfolds The number of folds during cross-validation for variables selection model.
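# A hedged usage sketch for controlInf() as defined above (argument values are
# illustrative, not defaults): bootstrap variance on two cores, keeping the
# per-replicate statistics, with the exact PMM variance term switched on.
inf_ctrl <- controlInf(
  var_method = "bootstrap",
  num_boot = 500,
  cores = 2,
  keep_boot = TRUE,
  pmm_exact_se = TRUE # slow, but recommended before submitting final results
)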
+#' @param treetype Type of tree for nearest neighbour imputation passed to [RANN::nn2()] function.
+#' @param searchtype Type of search for nearest neighbour imputation passed to [RANN::nn2()] function.
+#' @param predictive_match (Only for predictive mean matching)
+#' Indicates how to select the 'closest' unit from the nonprobability sample for each
+#' unit in the probability sample. Either \code{1} (default) or \code{2}, where
+#' \code{1} is matching by minimizing the distance between \mjseqn{\hat{y}_{i}} for
+#' \mjseqn{i \in S_{A}} and \mjseqn{y_{j}} for \mjseqn{j \in S_{B}}, and \code{2}
+#' is matching by minimizing the distance between \mjseqn{\hat{y}_{i}} for
+#' \mjseqn{i \in S_{A}} and \mjseqn{\hat{y}_{j}} for \mjseqn{j \in S_{B}}.
+#' @param pmm_weights (Only for predictive mean matching)
+#' Indicates how to weight the \code{k} nearest neighbours in \mjseqn{S_{B}} to
+#' create the imputed value for units in \mjseqn{S_{A}}. The default value
+#' \code{"none"} indicates that the mean of the \code{k} nearest \mjseqn{y}'s from
+#' \mjseqn{S_{B}} should be used, whereas \code{"prop_dist"} results in a
+#' weighted mean of these \code{k} values, with weights inversely
+#' proportional to the distance between matched values.
+#' @param pmm_k_choice Character value indicating how the \code{k} hyper-parameter
+#' should be chosen. By default \code{"none"}, meaning the \code{k} provided in
+#' the \code{control_outcome} argument will be used. For now the only other
+#' option, \code{"min_var"}, means that \code{k} will be chosen by minimizing
+#' the estimated variance of the estimator for the mean, with the \code{k}
+#' provided in this control list used as the starting point.
+#' @param pmm_reg_engine TODO
+#'
+#' @return List with selected parameters.
+#'
+#' @seealso
+#'
+#' [nonprob()] -- for fitting procedure with non-probability samples.
+#'
+#'
+#' @export
+
+controlOut <- function(epsilon = 1e-4,
+                       maxit = 100,
+                       trace = FALSE,
+                       k = 1,
+                       penalty = c("SCAD", "lasso", "MCP"),
+                       a_SCAD = 3.7,
+                       a_MCP = 3,
+                       lambda_min = .001,
+                       nlambda = 100,
+                       nfolds = 10,
+                       treetype = "kd",
+                       searchtype = "standard",
+                       predictive_match = 1:2,
+                       pmm_weights = c("none", "prop_dist"),
+                       pmm_k_choice = c("none", "min_var"),
+                       pmm_reg_engine = c("glm", "loess")) {
+  if (missing(predictive_match)) {
+    predictive_match <- 1
+  }
+
+  if (missing(pmm_weights)) {
+    pmm_weights <- "none"
+  }
+
+  list(
+    epsilon = epsilon,
+    maxit = maxit,
+    trace = trace,
+    k = k,
+    penalty = if (missing(penalty)) "SCAD" else penalty,
+    a_SCAD = a_SCAD,
+    a_MCP = a_MCP,
+    lambda_min = lambda_min,
+    nlambda = nlambda,
+    nfolds = nfolds,
+    treetype = treetype,
+    searchtype = searchtype,
+    predictive_match = predictive_match,
+    pmm_weights = pmm_weights,
+    pmm_k_choice = if (missing(pmm_k_choice)) "none" else pmm_k_choice,
+    pmm_reg_engine = if (missing(pmm_reg_engine)) "glm" else pmm_reg_engine
+  )
+}
diff --git a/R/control_selection.R b/R/control_selection.R
new file mode 100644
index 0000000..0e68642
--- /dev/null
+++ b/R/control_selection.R
@@ -0,0 +1,95 @@
+#' @title Control parameters for selection model
+#' @author Łukasz Chrostowski, Maciej Beręsewicz
+#' \loadmathjax
+#'
+#' @description \code{controlSel} constructs a list with all necessary control parameters
+#' for selection model.
+#'
+#'
+#' @param method estimation method.
+#' @param epsilon Tolerance for fitting algorithms. Default is \code{1e-4}.
+#' @param maxit Maximum number of iterations.
+#' @param trace logical value. If `TRUE`, trace steps of the fitting algorithms. Default is `FALSE`.
+#' @param optimizer optimization function for maximum likelihood estimation.
+#' @param optim_method maximisation method that will be passed to [stats::optim()] function. Default is `BFGS`.
+#' @param maxLik_method maximisation method that will be passed to [maxLik::maxLik()] function. Default is `NR`.
+#' @param dependence logical value; `TRUE` if samples are dependent.
+#' @param key binary key variable.
+#' @param est_method_sel Method of estimation for propensity score model.
+#' @param h Smooth function for the generalized estimating equations method, taking the following values
+#' \itemize{
+#'   \item if \code{1} then \mjseqn{\mathbf{h}\left(\mathbf{x}, \boldsymbol{\theta}\right) =
+#'   \frac{\mathbf{x}}{\pi(\mathbf{x}, \boldsymbol{\theta})}}
+#'   \item if \code{2} then \mjseqn{\mathbf{h}\left(\mathbf{x}, \boldsymbol{\theta}\right) = \mathbf{x}}
+#' }
+#' @param penalty The penalization function used during variables selection.
+#' @param a_SCAD The tuning parameter of the SCAD penalty for selection model. Default is 3.7.
+#' @param a_MCP The tuning parameter of the MCP penalty for selection model. Default is 3.
+#' @param lambda A user-specified \mjseqn{\lambda} value during variable selection model fitting.
+#' @param lambda_min The smallest value for lambda, as a fraction of `lambda.max`. Default is .001.
+#' @param nlambda The number of `lambda` values. Default is 50.
+#' @param nfolds The number of folds for cross-validation. Default is 10.
+#' @param print_level this argument determines the level of printing during the optimization process for the propensity score model.
+#' @param start_type Type of method for start points for model fitting, taking the following values
+#' \itemize{
+#'   \item if \code{glm} then the start is taken from the glm function called on the samples.
+#'   \item if \code{naive} then the start is a vector with the parameter estimated on one-dimensional data (intercept only) and 0 for the rest.
+#'   \item if \code{zero} then the start is a vector of zeros.
+#' }
+#'
+#' @return List with selected parameters.
+#'
+#' @seealso
+#'
+#' [nonprob()] -- for fitting procedure with non-probability samples.
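# A hedged sketch of the new outcome/selection controls documented above
# (values are illustrative): predictive mean matching that matches predicted
# values in both samples and weights the k donors by inverse distance, plus a
# selection model started from all-zero coefficients.
out_ctrl <- controlOut(
  k = 5,
  predictive_match = 2, # match yhat in S_A to yhat in S_B
  pmm_weights = "prop_dist",
  pmm_k_choice = "none" # keep k fixed rather than variance-minimizing search
)
sel_ctrl <- controlSel(start_type = "zero")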
+#'
+#' @export
+
+controlSel <- function(method = "glm.fit", # perhaps another control function for model with variables selection
+                       epsilon = 1e-4,
+                       maxit = 500,
+                       trace = FALSE,
+                       optimizer = c("maxLik", "optim"),
+                       maxLik_method = "NR",
+                       optim_method = "BFGS",
+                       dependence = FALSE,
+                       key = NULL,
+                       est_method_sel = c("mle", "gee"),
+                       h = c(1, 2),
+                       penalty = c("SCAD", "lasso", "MCP"),
+                       a_SCAD = 3.7,
+                       a_MCP = 3,
+                       lambda = -1,
+                       lambda_min = .001,
+                       nlambda = 50,
+                       nfolds = 10,
+                       print_level = 0,
+                       start_type = c("glm", "naive", "zero")) {
+  list(
+    epsilon = epsilon,
+    maxit = maxit,
+    trace = trace,
+    optimizer = if (missing(optimizer)) "optim" else optimizer,
+    maxLik_method = maxLik_method,
+    optim_method = optim_method,
+    dependence = dependence,
+    key = key,
+    est_method_sel = if (missing(est_method_sel)) "mle" else est_method_sel,
+    h = if (missing(h)) 1 else h,
+    penalty = if (missing(penalty)) "SCAD" else penalty,
+    a_SCAD = a_SCAD,
+    a_MCP = a_MCP,
+    lambda_min = lambda_min,
+    nlambda = nlambda,
+    nfolds = nfolds,
+    lambda = lambda,
+    print_level = print_level,
+    start_type = if (missing(start_type)) "naive" else start_type
+  )
+}
+
+
+
diff --git a/R/data_manip.R b/R/data_manip.R
new file mode 100644
index 0000000..86714aa
--- /dev/null
+++ b/R/data_manip.R
@@ -0,0 +1,104 @@
+# create an object with model frames and matrices to preprocess
+model_frame <- function(formula, data, weights = NULL, svydesign = NULL, pop_totals = NULL, pop_size = NULL, flag = TRUE) {
+  if (!is.null(svydesign)) {
+    ##### Model frame for nonprobability sample #####
+    model_Frame <- model.frame(formula, data)
+    y_nons <- model.response(model_Frame)
+    outcome_name <- names(model_Frame)[1]
+    mt <- attr(model_Frame, "terms")
+    nons_names <- attr(mt, "term.labels") # colnames(get_all_vars(formula, data)) names of variables of nonprobability sample terms(formula, data = data)
+    ##### Model frame for probability sample #####
+    if (outcome_name %in% colnames(svydesign$variables)) {
+      # design_to_frame <- svydesign$variables
+      # design_to_frame[, outcome_name][is.na(design_to_frame[, outcome_name])] <- 0 # replace NA in dependent outcome with 0
+      # model_Frame_rand <- model.frame(formula, design_to_frame)
+      # mt_rand <- attr(model_Frame_rand, "terms")
+      # nons_names_rand <- attr(mt_rand, "term.labels")
+
+      # TODO to consider this version
+      design_to_frame <- svydesign$variables
+      design_to_frame[, outcome_name][is.na(design_to_frame[, outcome_name])] <- 0 # replace NA in dependent outcome with 0
+      names_rand <- all.vars(formula)
+      model_Frame_rand <- design_to_frame[, names_rand]
+
+      nons_names_rand <- attr(attr(model.frame(formula, design_to_frame), "terms"), "term.labels")
+    } else {
+      design_to_frame <- svydesign$variables
+      names_rand <- all.vars(formula[-2])
+      model_Frame_rand <- design_to_frame[, names_rand]
+
+      nons_names_rand <- attr(attr(model.frame(formula[-2], design_to_frame), "terms"), "term.labels")
+
+      # model_Frame_rand <- model.frame(formula[-2], svydesign$variables)
+      # mt_rand <- attr(model_Frame_rand, "terms")
+      # nons_names_rand <- attr(mt_rand, "term.labels")
+    }
+    # TODO colnames(model_Frame_rand) <- all.vars(formula)
+    # print(nons_names_rand)
+    # TODO think out this condition
+    if (all(nons_names %in% nons_names_rand)) { # colnames(svydesign$variables)
+      dot_check <- sapply(formula, FUN = function(x) {
+        x == "."
+      })
+      if (length(formula) == 2) nons_names <- nons_names[-1]
+      if (any(dot_check)) {
+        xx <- paste("~", paste(nons_names, collapse = "+"))
+        formula <- as.formula(paste(outcome_name, xx))
+        X_rand <- model.matrix(delete.response(terms(formula)), svydesign$variables[, nons_names])
+      } else {
+        X_rand <- model.matrix(delete.response(terms(formula)), svydesign$variables) # matrix of probability sample with intercept
+      }
+      frame_nons <- model.frame(formula, data)
+      X_nons <- model.matrix(frame_nons, data) # matrix for nonprobability sample with intercept
+      # if (outcome) {
+      #   xx <- paste("~", paste(nons_names[2:length(nons_names)], collapse = "+"))
+      #   formula <- as.formula(paste(formula[2], xx))
+      #   X_rand <- model.matrix(delete.response(terms(formula)), svydesign$variables[, nons_names])
+      # } else {
+      #   xx <- paste("~", paste(nons_names, collapse = "+"))
+      #   formula <- as.formula(xx)
+      #   X_rand <- model.matrix(formula, svydesign$variables[, nons_names]) # matrix of probability sample with intercept
+      # }
+    } else {
+      stop("Variable names in data and svydesign do not match")
+    }
+
+    list(
+      X_nons = X_nons,
+      X_rand = X_rand,
+      nons_names = nons_names,
+      y_nons = y_nons,
+      outcome_name = outcome_name,
+      model_frame_rand = model_Frame_rand
+    )
+  } else if (!is.null(pop_totals)) {
+    model_Frame <- model.frame(formula, data)
+    X_nons <- model.matrix(model_Frame, data)
+    # matrix for nonprobability sample with intercept
+    # X_nons <- model.matrix(XY_nons, data, contrasts.arg = list(klasa_pr = contrasts(as.factor(XY_nons[,dep_name]), contrasts = FALSE)))
+    # nons_names <- attr(terms(formula, data = data), "term.labels")
+    # nons_names <- colnames(X_nons)
+    # pop_totals <- pop_totals[which(attr(X_nons, "assign") == 1)]
+    mt <- attr(model_Frame, "terms")
+    # nons_names <- attr(mt, "term.labels")
+    total_names <- colnames(X_nons)
+    if (flag) {
+      if (all(total_names %in% names(pop_totals))) { # TODO verify whether this warning works well... pop_totals, pop_means defined as in the `calibrate` function
+        pop_totals <- pop_totals[total_names]
+      } else {
+        warning("Selection and population totals have different names.")
+      }
+    }
+    y_nons <- model.response(model_Frame)
+    outcome_name <- names(model_Frame)[1]
+
+    list(
+      X_nons = X_nons,
+      pop_totals = pop_totals,
+      total_names = total_names,
+      y_nons = y_nons,
+      outcome_name = outcome_name,
+      X_rand = NULL
+    )
+  }
+}
diff --git a/R/gee_ipw.R b/R/gee_ipw.R
new file mode 100644
index 0000000..6a9f2a8
--- /dev/null
+++ b/R/gee_ipw.R
@@ -0,0 +1,183 @@
+# Object with output parameters for estimation by Generalized Estimating Equations for propensity scores
+gee <- function(...) {
+  estimation_model <- function(model, method_selection) {
+    method <- model$method
+    theta_hat <- model$theta_hat
+    hess <- model$hess
+    grad <- model$grad
+    ps_nons <- model$ps_nons
+    est_ps_rand <- model$est_ps_rand
+    ps_nons_der <- model$ps_nons_der
+    est_ps_rand_der <- model$est_ps_rand_der
+    var_cov1 <- model$var_cov1
+    var_cov2 <- model$var_cov2
+    df_residual <- model$df_residual
+    variance_covariance <- model$variance_covariance # variance-covariance matrix of estimated parameters
+    eta <- c(model$eta_rand, model$eta_nons)
+    residuals <- model$residuals
+    variance <- as.vector(model$variance)
+
+    list(
+      theta_hat = theta_hat,
+      grad = grad,
+      hess = hess,
+      var_cov1 = var_cov1,
+      var_cov2 = var_cov2,
+      ps_nons = ps_nons,
+      est_ps_rand = est_ps_rand,
+      ps_nons_der = ps_nons_der,
+      est_ps_rand_der = est_ps_rand_der,
+      variance_covariance = variance_covariance,
+      df_residual = df_residual,
+      log_likelihood = NA,
+      eta = eta,
+      aic = NA,
+      variance = variance,
+      residuals = residuals,
+      method = method
+    )
+  }
+
+  make_t <- function(X, ps, psd, b, y_rand, y_nons, h, N, method_selection, weights) {
+    if (h == 1) {
+      t <- X %*% t(as.matrix(b)) + y_rand - 1 / N * sum(weights * y_nons)
+    } else if (h == 2) {
+      t <- as.vector(ps) * X %*% t(as.matrix(b)) + y_rand - 1 / N * sum(weights * y_nons)
+    }
+    t
+  }
+
+  make_var_nonprob <- function(ps, psd, y, y_pred, h_n, X, b, N, h, method_selection, weights, pop_totals) {
+    if (!is.null(pop_totals)) h <- 1 # perhaps to remove, just check if appropriate var is calculated
+    if (h == 2) {
+      var_nonprob <- 1 / N^2 * sum((1 - ps) * ((weights * (y - y_pred - h_n) / ps) - b %*% t(X))^2)
+    } else if (h == 1) {
+      var_nonprob <- 1 / N^2 * sum((1 - ps) * ((weights * (y - y_pred - h_n) - b %*% t(X)) / ps)^2)
+    }
+    as.numeric(var_nonprob)
+  }
+
+  model_selection <- function(X,
+                              X_nons,
+                              X_rand,
+                              weights,
+                              weights_rand,
+                              R,
+                              method_selection,
+                              optim_method,
+                              h = h,
+                              est_method,
+                              maxit,
+                              control_selection,
+                              start,
+                              verbose,
+                              varcov = FALSE,
+                              ...) {
+    method_selection_function <- paste(method_selection, "_model_nonprobsvy", sep = "")
+    method <- get_method(method = method_selection_function)
+    inv_link <- method$make_link_inv
+
+    if (is.null(start)) {
+      if (control_selection$start_type == "glm") {
+        # start <- start_fit(X = X, # <--- does not work with pop_totals
+        #                    R = R,
+        #                    weights = weights,
+        #                    weights_rand = weights_rand,
+        #                    method_selection = method_selection)
+
+        # TODO to test
+        start_to_gee <- start_fit(
+          X = X, # <--- does not work with pop_totals
+          R = R,
+          weights = weights,
+          weights_rand = weights_rand,
+          method_selection = method_selection
+        )
+        start <- method$make_max_lik(
+          X_nons = X_nons,
+          X_rand = X_rand,
+          weights = weights,
+          weights_rand = weights_rand,
+          start = start_to_gee,
+          control = control_selection
+        )$theta_hat
+        ####
+      } else if (control_selection$start_type == "naive") {
+        start_h <- suppressWarnings(theta_h_estimation(
+          R = R,
+          X = X[, 1, drop = FALSE],
+          weights_rand = weights_rand,
+          weights = weights,
+          h = h,
+          method_selection = method_selection,
+          maxit = maxit,
+          start = 0
+        )$theta_h)
+        start <- c(start_h, rep(0, ncol(X) - 1))
+      } else if (control_selection$start_type == "zero") {
+        start <- rep(0, ncol(X))
+      }
+    }
+
+    h_object <- theta_h_estimation(
+      R = R,
+      X = X,
+      weights_rand = weights_rand,
+      weights = weights,
+      h = h,
+      method_selection = method_selection,
+      maxit = maxit,
+      start = start
+    )
+    theta_hat <- h_object$theta_h
+    hess <- h_object$hess
+    grad <- h_object$grad
+    eta_nons <- theta_hat %*% t(as.matrix(X_nons))
+    eta_rand <- theta_hat %*% t(as.matrix(X_rand))
+    ps_nons <- inv_link(eta_nons)
+    est_ps_rand <- inv_link(eta_rand)
+    variance_covariance <- try(solve(-hess), silent = TRUE)
+    if (inherits(variance_covariance, "try-error")) {
+      if (verbose) message("solve() failed, using ginv() instead.")
+      variance_covariance <- MASS::ginv(-hess)
+    }
+    resids <- R - c(est_ps_rand, ps_nons)
+
+    df_reduced <- nrow(X) - length(theta_hat)
+    variance <- as.vector((t(resids) %*% resids) / df_reduced)
+
+    if (method_selection == "probit") { # for probit model, propensity score derivative is required
+      dinv_link <- method$make_link_inv_der
+      ps_nons_der <- dinv_link(theta_hat %*% t(as.matrix(X_nons)))
+      est_ps_rand_der <- dinv_link(theta_hat %*% t(as.matrix(X_rand)))
+    }
+
+    list(
+      theta_hat = theta_hat,
+      hess = hess,
+      grad = grad,
+      ps_nons = ps_nons,
+      est_ps_rand = est_ps_rand,
+      # if () rather than ifelse(): ifelse() truncates vectors to the length
+      # of the scalar condition and cannot return matrices or functions
+      ps_nons_der = if (method_selection == "probit") ps_nons_der else NA,
+      est_ps_rand_der = if (method_selection == "probit") est_ps_rand_der else NA,
+      variance_covariance = variance_covariance,
+      var_cov1 = if (varcov) method$variance_covariance1 else "No variance-covariance matrix",
+      var_cov2 = if (varcov) method$variance_covariance2 else "No variance-covariance matrix",
+      df_residual = df_reduced,
+      eta_nons = eta_nons,
+      eta_rand = eta_rand,
+      residuals = resids,
+      method = method
+    )
+  }
+
+  structure(
+    list(
+      estimation_model = estimation_model,
+      make_t = make_t,
+      make_var_nonprob = make_var_nonprob,
+      model_selection = model_selection
+    ),
+    class = "method"
+  )
+}
diff --git a/R/glm.R b/R/glm.R
new file mode 100644
index 0000000..9cb4c08
--- /dev/null
+++ b/R/glm.R
@@ -0,0 +1,85 @@
+# Internal functions for mass imputation models
+#' @importFrom stats predict.glm
+#' @importFrom stats glm.fit
+#' @importFrom stats summary.glm
+glm_nonprobsvy <- function(outcome,
+                           data,
+                           weights,
+                           family_outcome,
+                           start_outcome,
+                           X_nons,
+                           y_nons,
+                           X_rand,
+                           control,
+                           n_nons,
+                           n_rand,
+                           model_frame,
vars_selection, + pop_totals) { + if (is.character(family_outcome)) { + family_nonprobsvy <- paste(family_outcome, "_nonprobsvy", sep = "") + family_nonprobsvy <- get(family_nonprobsvy, mode = "function", envir = parent.frame()) + family_nonprobsvy <- family_nonprobsvy() + } + if (vars_selection == FALSE) { + # Estimation for outcome model + model_out <- internal_outcome( + outcome = outcome, + data = data, + weights = weights, + family_outcome = family_outcome, + start_outcome = start_outcome + ) + + model_nons_coefs <- model_out$glm$coefficients + parameters <- model_out$glm_summary$coefficients + + if (is.null(pop_totals)) { + # print(head(model_frame)) + # stop("123") + y_rand_pred <- stats::predict.glm(model_out$glm, newdata = model_frame, type = "response") + } else { + eta <- pop_totals %*% model_nons_coefs / pop_totals[1] + y_rand_pred <- family_nonprobsvy$linkinv(eta) + } + y_nons_pred <- model_out$glm$fitted.values + } else { + model <- stats::glm.fit( + x = X_nons, + y = y_nons, + weights = weights, + family = get_method(family_outcome), + start = start_outcome, + control = list( + control$epsilon, + control$maxit, + control$trace + ), + intercept = FALSE + ) + model_summ <- stats::summary.glm(model) + parameters <- model_summ$coefficients + model_nons_coefs <- model$coefficients + if (is.null(pop_totals)) { + eta <- X_rand %*% model_nons_coefs + } else { + eta <- pop_totals %*% model_nons_coefs / pop_totals[1] + } + y_rand_pred <- family_nonprobsvy$linkinv(eta) + y_nons_pred <- model$fitted.values + + model_out <- list( + glm = model, + glm_summary = model_summ + ) + } + model_out$glm$std_err <- parameters[, 2] + names(model_out$glm$std_err) <- names(model_out$glm$coefficients) + + list( + model = model_out$glm, + y_rand_pred = y_rand_pred, + y_nons_pred = y_nons_pred, + parameters = parameters + ) +} diff --git a/R/helper.R b/R/helper.R new file mode 100644 index 0000000..47fcf8a --- /dev/null +++ b/R/helper.R @@ -0,0 +1,3 @@ +#' @useDynLib nonprobsvy, .registration = TRUE +#' @importFrom Rcpp sourceCpp +NULL diff --git a/R/internals.R b/R/internals.R index 8ea6c80..148155a 100644 --- a/R/internals.R +++ b/R/internals.R @@ -8,6 +8,9 @@ #' @importFrom stats contrasts #' @importFrom nleqslv nleqslv #' @importFrom stats get_all_vars +#' @importFrom stats cov +#' @importFrom stats var +#' @importFrom stats predict # Selection model object internal_selection <- function(X, @@ -23,6 +26,7 @@ internal_selection <- function(X, maxit, control_selection, start, + verbose, bias_correction = FALSE, varcov = FALSE, ...) { @@ -43,9 +47,11 @@ internal_selection <- function(X, varcov = varcov, control_selection = control_selection, start = start, + verbose = verbose, ... 
) } + # Outcome model object internal_outcome <- function(outcome, data, @@ -67,109 +73,7 @@ internal_outcome <- function(outcome, glm_summary = model_nons_summary ) } -theta_h_estimation <- function(R, - X, - weights_rand, - weights, - h, - method_selection, - maxit, - start = NULL, - pop_totals = NULL, - pop_means = NULL) { # TODO with BERENZ recommendation - - p <- ncol(X) - # if (is.null(pop_totals) & is.null(pop_means)) { - # if (is.null(start)) { - # start0 <- start_fit(X = X, # <--- does not work with pop_totals - # R = R, - # weights = weights, - # weights_rand = weights_rand, - # method_selection = method_selection) - # } else { - # start0 <- start - # } - # } else { # TODO customize start point for fitting with population totals - # # start0 <- rep(.8, ncol(X)) - # # X_pop <- rbind(X, pop_totals) - # # weights_randd <- 1 - # if (is.null(start)) { - # start0 <- start_fit(X = X, # <--- does not work with pop_totals - # R = R, - # weights = weights, - # weights_rand = weights_rand, - # method_selection = method_selection) - # } else { - # start0 <- start - # } - # } - u_theta <- u_theta( - R = R, - X = X, - weights = c(weights_rand, weights), - h = h, - method_selection = method_selection, - pop_totals = pop_totals - ) - - u_theta_der <- u_theta_der( - R = R, - X = X, - weights = c(weights_rand, weights), - h = h, - method_selection = method_selection, - pop_totals = pop_totals - ) - if (method_selection == "cloglog") { - root <- nleqslv::nleqslv( - x = start, - fn = u_theta, - method = "Newton", # TODO consider the methods - global = "cline", # qline", - xscalm = "fixed", - jacobian = TRUE - ) - } else { - root <- nleqslv::nleqslv( - x = start, - fn = u_theta, - method = "Newton", # TODO consider the methods - global = "cline", # qline", - xscalm = "fixed", - jacobian = TRUE, - jac = u_theta_der - # control = list(sigma = 0.1, trace = 1) - ) - } - - - theta_root <- root$x - if (root$termcd %in% c(2:7, -10)) { - switch(as.character(root$termcd), - "2" = warning("Relatively convergent algorithm when fitting selection model by nleqslv, but user must check if function values are acceptably small."), - "3" = warning("Algorithm did not find suitable point - has stalled cannot find an acceptable new point when fitting selection model by nleqslv."), - "4" = warning("Iteration limit exceeded when fitting selection model by nleqslv."), - "5" = warning("ill-conditioned Jacobian when fitting selection model by nleqslv."), - "6" = warning("Jacobian is singular when fitting selection model by nleqslv."), - "7" = warning("Jacobian is unusable when fitting selection model by nleqslv."), - "-10" = warning("user specified Jacobian is incorrect when fitting selection model by nleqslv.") - ) - } - theta_h <- as.vector(theta_root) - grad <- u_theta(theta_h) - if (method_selection == "cloglog") { - hess <- root$jac - } else { - hess <- u_theta_der(theta_h) # TODO compare with root$jac - } - - list( - theta_h = theta_h, - hess = hess, - grad = grad - ) -} # code for the function comes from the ncvreg package setup_lambda <- function(X, y, @@ -214,457 +118,6 @@ setup_lambda <- function(X, lambda } -# score equation for theta, used in variable selection -u_theta <- function(R, - X, - weights, - method_selection, - h, - N = NULL, - pop_totals = NULL, - pop_size = NULL) { - method_selection <- paste(method_selection, "_model_nonprobsvy", sep = "") - method <- get_method(method_selection) - inv_link <- method$make_link_inv - function(par) { - theta <- as.matrix(par) - n <- length(R) - X0 <- as.matrix(X) - 
eta_pi <- X0 %*% theta - ps <- inv_link(eta_pi) - R_rand <- 1 - R - ps <- as.vector(ps) - N_nons <- sum(1 / ps) - weights_sum <- sum(weights) - - if (is.null(pop_totals)) { - eq <- switch(h, - "1" = c(apply(X0 * R / ps * weights - X0 * R_rand * weights, 2, sum)), # consider division by N_nons - "2" = c(apply(X0 * R * weights - X0 * R_rand * ps * weights, 2, sum)) - ) - } else { - eq <- c(apply(X0 * R / ps * weights, 2, sum)) - pop_totals - } - eq - } -} - -# derivative of score equation for theta, used in variable selection -u_theta_der <- function(R, - X, - weights, - method_selection, - h, - N = NULL, - pop_totals = NULL) { - method_selection <- paste(method_selection, "_model_nonprobsvy", sep = "") - method <- get_method(method_selection) - inv_link <- method$make_link_inv - dinv_link <- method$make_link_inv_der - inv_link_rev <- method$make_link_inv_rev - - function(par) { - theta <- as.matrix(par) - X0 <- as.matrix(X) - p <- ncol(X0) - eta <- X0 %*% theta - ps <- inv_link(eta) - ps <- as.vector(ps) - N_nons <- sum(1 / ps) - R_rand <- 1 - R - weights_sum <- sum(weights) - - if (!is.null(pop_totals)) { - mxDer <- t(R * as.data.frame(X0) * weights * inv_link_rev(eta)) %*% X0 - } else { - mxDer <- switch(h, - "1" = t(R * as.data.frame(X0) * weights * inv_link_rev(eta)) %*% X0, # TODO bug here when solve for some data - probably because of inv_link_rev - "2" = -t(R_rand * as.data.frame(X0) * weights * dinv_link(eta)) %*% X0 - ) - } - as.matrix(mxDer, nrow = p) # consider division by N_nons - } -} -# Variance for inverse probability weighted estimator -internal_varIPW <- function(svydesign, - X_nons, - X_rand, - y_nons, - weights, - ps_nons, - mu_hat, - hess, - ps_nons_der, - N, - est_ps_rand, - ps_rand, - est_ps_rand_der, - n_rand, - pop_size, - pop_totals, - method_selection, - est_method, - theta, - h, - var_cov1 = var_cov1, - var_cov2 = var_cov2) { - eta <- as.vector(X_nons %*% as.matrix(theta)) - method_selection <- paste(method_selection, "_model_nonprobsvy", sep = "") - method <- get_method(method_selection) - b_obj <- method$b_vec_ipw( - X = X_nons, - ps = ps_nons, - psd = ps_nons_der, - y = y_nons, - mu = mu_hat, - hess = hess, - eta = eta, - pop_size = pop_size, - weights = weights - ) - b <- b_obj$b - - # sparse matrix - b_vec <- cbind(-1, b) - H_mx <- cbind(0, N * solve(hess)) - sparse_mx <- Matrix::Matrix(rbind(b_vec, H_mx), sparse = TRUE) - - V1 <- var_cov1( - X = X_nons, - y = y_nons, - mu = mu_hat, - ps = ps_nons, - psd = ps_nons_der, - pop_size = pop_size, - est_method = est_method, - h = h, - weights = weights, - pop_totals = pop_totals - ) # fixed - V2 <- var_cov2( - X = X_rand, - svydesign = svydesign, - eps = est_ps_rand, - est_method = est_method, - h = h, - pop_totals = pop_totals, - psd = est_ps_rand_der - ) - - - # variance-covariance matrix for set of parameters (mu_hat and theta_hat) - V_mx_nonprob <- sparse_mx %*% V1 %*% t(as.matrix(sparse_mx)) # nonprobability component - V_mx_prob <- sparse_mx %*% V2 %*% t(as.matrix(sparse_mx)) # probability component - V_mx <- V_mx_nonprob + V_mx_prob - - var_nonprob <- as.vector(V_mx_nonprob[1, 1]) - var_prob <- as.vector(V_mx_prob[1, 1]) - var <- as.vector(V_mx[1, 1]) - # vector of variances for theta_hat - # theta_hat_var <- diag(as.matrix(V_mx[2:ncol(V_mx), 2:ncol(V_mx)])) - - list( - var_nonprob = var_nonprob, - var_prob = var_prob, - var = var - ) -} -# Variance for doubly robust estimator -# TODO add nn and pmm -internal_varDR <- function(OutcomeModel, - SelectionModel, - y_nons_pred, - weights, - weights_rand, - 
method_selection, - control_selection, - theta, - ps_nons, - hess, - ps_nons_der, - est_ps_rand, - y_rand_pred, - N_nons, - est_ps_rand_der, - svydesign, - est_method, - h, - pop_totals, - sigma, - bias_correction) { - ######### mm - if (bias_correction == TRUE) { - infl1 <- (weights * (OutcomeModel$y_nons - y_nons_pred))^2 / ps_nons^2 - infl2 <- (weights * (OutcomeModel$y_nons - y_nons_pred))^2 / ps_nons - - # Variance estimators #### - svydesign <- stats::update(svydesign, - y_rand = y_rand_pred - ) - svydesign_mean <- survey::svymean(~y_rand, svydesign) - - var_prob <- as.vector(attr(svydesign_mean, "var")) # based on survey package, probability component - var_nonprob <- (sum((infl1) - 2 * infl2) + sum(weights_rand * sigma)) / N_nons^2 # TODO potential bug here nonprobability component - } else { - eta <- as.vector(SelectionModel$X_nons %*% as.matrix(theta)) - h_n <- 1 / N_nons * sum(OutcomeModel$y_nons - y_nons_pred) # TODO add weights # errors mean - method_selection <- paste(method_selection, "_model_nonprobsvy", sep = "") - method <- get_method(method_selection) - est_method <- get_method(est_method) - # psd <- method$make_link_inv_der(eta) - - b <- method$b_vec_dr( - X = SelectionModel$X_nons, - ps = ps_nons, - psd = ps_nons_der, - y = OutcomeModel$y_nons, - hess = hess, - eta = eta, - h_n = h_n, - y_pred = y_nons_pred, - weights = weights - ) - - # asymptotic variance by each propensity score method (nonprobability component) - var_nonprob <- est_method$make_var_nonprob( - ps = ps_nons, - psd = ps_nons_der, - y = OutcomeModel$y_nons, - y_pred = y_nons_pred, - h_n = h_n, - X = SelectionModel$X_nons, - b = b, - N = N_nons, - h = h, - method_selection = method_selection, - weights = weights, - pop_totals = pop_totals - ) - - - if (is.null(pop_totals)) { - t <- est_method$make_t( - X = SelectionModel$X_rand, - ps = est_ps_rand, - psd = est_ps_rand_der, - b = b, - h = h, - y_rand = y_rand_pred, - y_nons = y_nons_pred, - N = N_nons, - method_selection = method_selection, - weights = weights - ) - # design based variance estimation based on approximations of the second-order inclusion probabilities - svydesign <- stats::update(svydesign, - t = t - ) - svydesign_mean <- survey::svymean(~t, svydesign) # perhaps using survey package to compute prob variance - var_prob <- as.vector(attr(svydesign_mean, "var")) - } else { - var_prob <- 0 - } - } - - list( - var_prob = var_prob, - var_nonprob = var_nonprob - ) -} -# Variance for mass imputation estimator -internal_varMI <- function(svydesign, - X_nons, - X_rand, - y, - y_pred, - y_hat, - weights_rand, - method, - n_rand, - n_nons, - N, - family, - parameters, - pop_totals) { - if (is.character(family)) { - family_nonprobsvy <- paste(family, "_nonprobsvy", sep = "") - family_nonprobsvy <- get(family_nonprobsvy, mode = "function", envir = parent.frame()) - family_nonprobsvy <- family_nonprobsvy() - } - - if (is.null(pop_totals)) { - svydesign_mean <- survey::svymean(~y_hat_MI, svydesign) - var_prob <- as.vector(attr(svydesign_mean, "var")) # probability component, should be bigger for nn - if (method == "nn") { - sigma_hat <- mean((y - y_pred)^2) # family_nonprobsvy$variance(mu = y_pred, y = y) - est_ps <- n_nons / N - var_nonprob <- n_rand / N^2 * (1 - est_ps) / est_ps * sigma_hat - } else if (method == "glm") { # TODO add variance for count binary outcome variable control_outcome$method - - beta <- parameters[, 1] - eta_nons <- X_nons %*% beta - eta_rand <- X_rand %*% beta - - mx <- 1 / N * colSums(as.data.frame(X_rand) * (weights_rand * 
family_nonprobsvy$mu.eta(eta_rand))) - c <- solve(1 / n_nons * t(as.data.frame(X_nons) * family_nonprobsvy$mu.eta(eta_nons)) %*% X_nons) %*% mx - residuals <- family_nonprobsvy$residuals(mu = y_pred, y = y) - - # nonprobability component - var_nonprob <- 1 / n_nons^2 * t(as.matrix(residuals^2)) %*% (X_nons %*% c)^2 - var_nonprob <- as.vector(var_nonprob) - } else if (method == "pmm") { - # beta <- parameters[,1] - # eta_nons <- X_nons %*% beta - # eta_rand <- X_rand %*% beta - # - # mx <- 1/N * colSums(as.data.frame(X_rand) * (weights_rand * family_nonprobsvy$mu_der(eta_rand))) - # c <- solve(1/n_nons * t(as.data.frame(X_nons) * family_nonprobsvy$mu_der(eta_nons)) %*% X_nons) %*% mx - # residuals <- family_nonprobsvy$residuals(mu = y_pred, y = y) - # - # # nonprobability component - # var_nonprob <- 1/n_nons^2 * t(as.matrix(residuals^2)) %*% (X_nons %*% c)^2 - # var_nonprob <- as.vector(var_nonprob) - - # nonprobability component - # var_nonprob <- 1/n_nons^2 * residuals^2 * X_nons %*% t(X_nons) - var_nonprob <- 0 - # var_nonprob <- as.vector(var_nonprob) - # TODO to consider - } - } else { - if (method == "nn") { - sigma_hat <- mean((y - y_pred)^2) # family_nonprobsvy$variance(mu = y_pred, y = y) - est_ps <- n_nons / N - var_nonprob <- n_nons / N^2 * (1 - est_ps) / est_ps * sigma_hat # what instead of n_rand here (?) now just n_nons - } else if (method == "glm") { - beta <- parameters[, 1] - eta_nons <- X_nons %*% beta - if (family %in% c("binomial", "poisson")) { # TODO consider this chunk of code - eta_rand <- pop_totals %*% beta / pop_totals[1] - } else { - eta_rand <- pop_totals %*% beta - } - mx <- 1 / N * pop_totals * as.vector(family_nonprobsvy$mu.eta(eta_rand)) - c <- solve(1 / n_nons * t(as.data.frame(X_nons) * family_nonprobsvy$mu.eta(eta_nons)) %*% X_nons) %*% mx - residuals <- family_nonprobsvy$residuals(mu = y_pred, y = y) - - # nonprobability component - var_nonprob <- 1 / n_nons^2 * t(as.matrix(residuals^2)) %*% (X_nons %*% c)^2 - var_nonprob <- as.vector(var_nonprob) - } else if (method == "pmm") { - # beta <- parameters[,1] - # eta_nons <- X_nons %*% beta - # - # if (family %in% c("binomial", "poisson")) { # TODO consider this chunk of code - # eta_rand <- pop_totals %*% beta / pop_totals[1] - # } else { - # eta_rand <- pop_totals %*% beta - # } - # - # residuals <- family_nonprobsvy$residuals(mu = y_pred, y = y) - - # nonprobability component - # var_nonprob <- 1/n_nons^2 * t(as.matrix(residuals^2)) %*% (family_nonprobsvy$mu_der(eta_nons) %*% t(X_nons))^2 - var_nonprob <- 0 - var_nonprob <- as.vector(var_nonprob) - } - var_prob <- 0 - } - - list( - var_prob = var_prob, - var_nonprob = var_nonprob - ) -} -# create an object with model frames and matrices to preprocess -model_frame <- function(formula, data, weights = NULL, svydesign = NULL, pop_totals = NULL, pop_size = NULL, flag = TRUE) { - if (!is.null(svydesign)) { - ##### Model frame for nonprobability sample ##### - model_Frame <- model.frame(formula, data) - y_nons <- model.response(model_Frame) - outcome_name <- names(model_Frame)[1] - mt <- attr(model_Frame, "terms") - nons_names <- attr(mt, "term.labels") # colnames(get_all_vars(formula, data)) names of variables of nonprobability sample terms(formula, data = data) - ##### Model frame for probability sample ##### - if (outcome_name %in% colnames(svydesign$variables)) { - design_to_frame <- svydesign$variables - design_to_frame[, outcome_name][is.na(design_to_frame[, outcome_name])] <- 0 - model_Frame_rand <- model.frame(formula, design_to_frame) - mt_rand <- 
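The "nn" branch of internal_varMI() above reduces to a few lines; restated standalone with hypothetical sample and population sizes:

n_nons <- 1000; n_rand <- 500; N <- 1e5   # hypothetical sizes
y      <- rnorm(n_nons)                   # outcomes observed in the non-probability sample
y_pred <- y + rnorm(n_nons, sd = 0.3)     # their nearest-neighbour predictions
sigma_hat <- mean((y - y_pred)^2)         # mean squared matching error
est_ps <- n_nons / N                      # crude propensity estimate n_nons / N
var_nonprob <- n_rand / N^2 * (1 - est_ps) / est_ps * sigma_hat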
attr(model_Frame_rand, "terms") - nons_names_rand <- attr(mt_rand, "term.labels") - } else { - model_Frame_rand <- model.frame(formula[-2], svydesign$variables) - mt_rand <- attr(model_Frame_rand, "terms") - nons_names_rand <- attr(mt_rand, "term.labels") - } - # print(nons_names_rand) - if (all(nons_names %in% nons_names_rand)) { # colnames(svydesign$variables) - dot_check <- sapply(formula, FUN = function(x) { - x == "." - }) - if (length(formula) == 2) nons_names <- nons_names[-1] - if (any(dot_check)) { - xx <- paste("~", paste(nons_names, collapse = "+")) - formula <- as.formula(paste(outcome_name, xx)) - X_rand <- model.matrix(delete.response(terms(formula)), svydesign$variables[, nons_names]) - } else { - X_rand <- model.matrix(delete.response(terms(formula)), svydesign$variables) # matrix of probability sample with intercept - } - frame_nons <- model.frame(formula, data) - X_nons <- model.matrix(frame_nons, data) # matrix for nonprobability sample with intercept - # if (outcome) { - # xx <- paste("~", paste(nons_names[2:length(nons_names)], collapse = "+")) - # formula <- as.formula(paste(formula[2], xx)) - # X_rand <- model.matrix(delete.response(terms(formula)), svydesign$variables[, nons_names]) - # } else { - # xx <- paste("~", paste(nons_names, collapse = "+")) - # formula <- as.formula(xx) - # X_rand <- model.matrix(formula, svydesign$variables[, nons_names])# matrix of probability sample with intercept - # } - } else { - stop("Variable names in data and svydesign do not match") - } - - list( - X_nons = X_nons, - X_rand = X_rand, - nons_names = nons_names, - y_nons = y_nons, - outcome_name = outcome_name, - model_frame_rand = model_Frame_rand - ) - } else if (!is.null(pop_totals)) { - model_Frame <- model.frame(formula, data) - X_nons <- model.matrix(model_Frame, data) - # matrix for nonprobability sample with intercept - # X_nons <- model.matrix(XY_nons, data, contrasts.arg = list(klasa_pr = contrasts(as.factor(XY_nons[,dep_name]), contrasts = FALSE))) - # nons_names <- attr(terms(formula, data = data), "term.labels") - # nons_names <- colnames(X_nons) - # pop_totals <- pop_totals[which(attr(X_nons, "assign") == 1)] - mt <- attr(model_Frame, "terms") - # nons_names <- attr(mt, "term.labels") - total_names <- colnames(X_nons) - if (flag) { - if (all(total_names %in% names(pop_totals))) { # TODO verify whether this warming works well.. pop_totals, pop_means defined such as in `calibrate` function - pop_totals <- pop_totals[total_names] - } else { - warning("Selection and population totals have different names.") - } - } - y_nons <- model.response(model_Frame) - outcome_name <- names(model_Frame)[1] - - list( - X_nons = X_nons, - pop_totals = pop_totals, - total_names = total_names, - y_nons = y_nons, - outcome_name = outcome_name, - X_rand = NULL - ) - } -} - start_fit <- function(X, R, weights, @@ -697,109 +150,6 @@ get_method <- function(method) { method } -# summary helper functions -# for now just a rough sketch -specific_summary_info <- function(object, ...) { - UseMethod("specific_summary_info") -} - -specific_summary_info.nonprobsvy_ipw <- function(object, - ...) { - coeffs_sel <- matrix(c(object$selection$coefficients, object$selection$std_err), - ncol = 2, - dimnames = list( - names(object$selection$coefficients), - c("Estimate", "Std. 
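In essence, model_frame() builds two design matrices from one formula, dropping the response for the probability sample; a base-R sketch with hypothetical data frames:

f <- y ~ x1 + x2
nonprob_df <- data.frame(y = rnorm(5), x1 = rnorm(5), x2 = rnorm(5))
prob_df    <- data.frame(x1 = rnorm(7), x2 = rnorm(7))       # no outcome observed here
X_nons <- model.matrix(f, nonprob_df)                        # matrix with intercept
X_rand <- model.matrix(delete.response(terms(f)), prob_df)   # response dropped first
y_nons <- model.response(model.frame(f, nonprob_df))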
Error") - ) - ) - res <- list( - coeffs_sel = coeffs_sel, - weights = object$weights, - df_residual = object$selection$df_residual - ) - - attr(res$coeffs_sel, "glm") <- TRUE - attr(res$weights, "glm") <- FALSE - attr(res$df_residual, "glm") <- FALSE # TODO - attr(res, "model") <- c("glm regression on selection variable") - res -} - -specific_summary_info.nonprobsvy_mi <- function(object, - ...) { - if (object$outcome[[1]]$method == "glm") { # TODO for pmm - coeffs_out <- matrix(c(object$outcome[[1]]$coefficients, object$outcome[[1]]$std_err), - ncol = 2, - dimnames = list( - names(object$outcome[[1]]$coefficients), - c("Estimate", "Std. Error") - ) - ) - } else { - coeffs_out <- "no coefficients" - } - - res <- list( - coeffs_out = coeffs_out - ) - if (object$outcome[[1]]$method == "glm") { - attr(res$coeffs_out, "glm") <- TRUE - attr(res, "model") <- "glm regression on outcome variable" - } else if (object$outcome[[1]]$method == "nn") { - attr(res$coeffs_out, "glm") <- FALSE - } else if (object$outcome[[1]]$method == "pmm") { # TODO - attr(res$coeffs_out, "glm") <- FALSE - # attr(res, "model") <- "glm regression on outcome variable" - } - res -} - -specific_summary_info.nonprobsvy_dr <- function(object, - ...) { - coeffs_sel <- matrix(c(object$selection$coefficients, object$selection$std_err), - ncol = 2, - dimnames = list( - names(object$selection$coefficients), - c("Estimate", "Std. Error") - ) - ) - - - if (object$outcome[[1]]$method == "glm") { - coeffs_out <- matrix(c(object$outcome[[1]]$coefficients, object$outcome[[1]]$std_err), - ncol = 2, - dimnames = list( - names(object$outcome[[1]]$coefficients), - c("Estimate", "Std. Error") - ) - ) - } else { - coeffs_out <- "no coefficients" - } - - res <- list( - coeffs_sel = coeffs_sel, - coeffs_out = coeffs_out, - weights = object$weights, - df_residual = object$selection$df_residual - ) - attr(res$coeffs_sel, "glm") <- TRUE - if (object$outcome[[1]]$method == "glm") { - attr(res$coeffs_out, "glm") <- TRUE - attr(res, "model") <- c( - "glm regression on selection variable", - "glm regression on outcome variable" - ) - } else if (object$outcome[[1]]$method == "nn") { - attr(res$coeffs_out, "glm") <- FALSE - attr(res, "model") <- c("glm regression on selection variable") - } - attr(res$weights, "glm") <- FALSE - attr(res$df_residual, "glm") <- FALSE - - res -} - ff <- function(formula) { fff <- as.character(formula) f <- strsplit(fff[2], "\\s*\\+\\s*")[[1]] @@ -819,3 +169,4 @@ ff <- function(formula) { l = l ) } + diff --git a/R/logitModel.R b/R/logitModel.R index 42c4d23..c3c25a2 100644 --- a/R/logitModel.R +++ b/R/logitModel.R @@ -253,8 +253,12 @@ logit_model_nonprobsvy <- function(...) { } - b_vec_ipw <- function(y, mu, ps, psd, eta, X, hess, pop_size, weights) { - hess_inv_neg <- solve(-hess) # MASS::ginv(hess) + b_vec_ipw <- function(y, mu, ps, psd, eta, X, hess, pop_size, weights, verbose) { + hess_inv_neg <- try(solve(-hess), silent = TRUE) + if(inherits(hess_inv_neg, "try-error")){ + if(verbose) message("solve() failed, using ginv() instead.") + hess_inv_neg <- MASS::ginv(-hess) + } if (is.null(pop_size)) { b <- -((1 - ps) / ps * weights * (y - mu)) %*% X %*% hess_inv_neg # TODO opposite sign here (?) } else { @@ -263,8 +267,12 @@ logit_model_nonprobsvy <- function(...) 
{
    list(b = b)
  }

-  b_vec_dr <- function(ps, psd, eta, y, y_pred, mu, h_n, X, hess, weights) {
-    hess_inv <- solve(hess)
+  b_vec_dr <- function(ps, psd, eta, y, y_pred, mu, h_n, X, hess, weights, verbose) {
+    hess_inv <- try(solve(hess), silent = TRUE)
+    if(inherits(hess_inv, "try-error")){
+      if(verbose) message("solve() failed, using ginv() instead.")
+      hess_inv <- MASS::ginv(hess)
+    }
     -(((1 - ps) / ps) * weights * (y - y_pred - h_n)) %*% X %*% hess_inv
   }
diff --git a/R/main_function_documentation.R b/R/main_function_documentation.R
index 40669bb..8bb6f6f 100644
--- a/R/main_function_documentation.R
+++ b/R/main_function_documentation.R
@@ -16,7 +16,7 @@ NULL
 #' The package uses `survey` package functionality when a probability sample is available.
 #'
 #'
 #'
-#' @param data `data.frame` with data from the nonprobability sample.
+#' @param data `data.frame` with data from the non-probability sample.
 #' @param selection `formula`, the selection (propensity) equation.
 #' @param outcome `formula`, the outcome equation.
 #' @param target `formula` with target variables.
@@ -207,7 +207,10 @@ NULL
 #' \item{\code{method} -- set on `glm`, since the regression method}
 #' }
 #' }
-#' In addition, if the variable selection model for the outcome variable is fitting, the list includes the \code{cve} -- the error for each value of `lambda`, averaged across the cross-validation folds.
+#' In addition, if the variable selection model for the outcome variable is being fitted, the list includes
+#' \itemize{
+#' \item{\code{cve} -- the error for each value of `lambda`, averaged across the cross-validation folds.}
+#' }
 #' \item{\code{selection} -- list containing information about fitting of propensity score model, such as
 #' \itemize{
 #' \item{\code{coefficients} -- a named vector of coefficients}
@@ -224,9 +227,12 @@ NULL
 #' \item{\code{df_residual} -- the residual degrees of freedom.}
 #' \item{\code{log_likelihood} -- value of log-likelihood function if `mle` method, in the other case `NA`.}
 #' \item{\code{cve} -- the error for each value of the `lambda`, averaged across the cross-validation folds for the variable selection model
-#' when the propensity score model is fitting.}
+#' when the propensity score model is being fitted. Returned only if variable selection for the model is used.}
 #' }
 #' }
+#' \item{\code{stat} -- matrix of the estimated population means in each bootstrap iteration.
+#' Returned only if a bootstrap method is used to estimate the variance and \code{keep_boot} in +#' [controlInf()] is set on `TRUE`.} #' } #' @seealso #' [stats::optim()] -- For more information on the \code{optim} function used in the @@ -253,7 +259,7 @@ NULL #' @examples #' \donttest{ -#' # generate data based on Doubly Robust Inference With Nonprobability Survey Samples (2021) +#' # generate data based on Doubly Robust Inference With Non-probability Survey Samples (2021) #' # Yilin Chen , Pengfei Li & Changbao Wu #' library(sampling) #' set.seed(123) @@ -283,7 +289,7 @@ NULL #' #' # population #' sim_data <- data.frame(y30, y50, y80, x1, x2, x3, x4) -#' ## propensity score model for nonprobability sample (sum to 1000) +#' ## propensity score model for non-probability sample (sum to 1000) #' eta <- -4.461 + 0.1 * x1 + 0.2 * x2 + 0.1 * x3 + 0.2 * x4 #' rho <- plogis(eta) #' @@ -294,7 +300,7 @@ NULL #' # data #' sim_data$flag_nonprob <- UPpoisson(rho) ## sampling nonprob #' sim_data$flag_prob <- UPpoisson(sim_data$p_prob) ## sampling prob -#' nonprob_df <- subset(sim_data, flag_nonprob == 1) ## nonprobability sample +#' nonprob_df <- subset(sim_data, flag_nonprob == 1) ## non-probability sample #' svyprob <- svydesign( #' ids = ~1, probs = ~p_prob, #' data = subset(sim_data, flag_prob == 1), diff --git a/R/mle_ipw.R b/R/mle_ipw.R new file mode 100644 index 0000000..da82859 --- /dev/null +++ b/R/mle_ipw.R @@ -0,0 +1,209 @@ +# Object with output parameters for Maximum likelihood Estimation for propensity scores +mle <- function(...) { + estimation_model <- function(model, method_selection, ...) { + method <- model$method + dinv_link <- method$make_link_inv_der + maxLik_nons_obj <- model$maxLik_nons_obj + log_likelihood <- maxLik_nons_obj$log_l # maximum of the loglikelihood function + theta_hat <- model$theta + + ps_nons <- model$ps + ps_nons_der <- model$ps_der + est_ps_rand <- model$ps_rand + est_ps_rand_der <- model$ps_rand_der + hess <- maxLik_nons_obj$hess + grad <- maxLik_nons_obj$grad + var_cov1 <- model$var_cov1 + var_cov2 <- model$var_cov2 + df_residual <- model$df_residual + variance_covariance <- solve(-hess) # MASS::ginv # variance-covariance matrix of estimated parameters + eta <- c(model$eta_rand, model$eta_nons) + aic <- 2 * (length(theta_hat) - log_likelihood) + residuals <- model$residuals + variance <- as.vector(model$variance) + deviance <- model$deviance + deviance_null <- model$deviance_null + + list( + theta_hat = theta_hat, + grad = grad, + hess = hess, + var_cov1 = var_cov1, + var_cov2 = var_cov2, + ps_nons = ps_nons, + est_ps_rand = est_ps_rand, + ps_nons_der = ps_nons_der, + est_ps_rand_der = est_ps_rand_der, + variance_covariance = variance_covariance, + log_likelihood = log_likelihood, + df_residual = df_residual, + eta = eta, + aic = aic, + variance = variance, + residuals = residuals, + method = method + ) + } + + make_t <- function(X, ps, psd, b, y_rand, y_nons, h, N, method_selection, weights, weights_sum) { + method <- get_method(method_selection) + t <- method$t_vec( + X = X, + ps = ps, + psd = psd, + b = b, + y_rand = y_rand, + y_nons = y_nons, + N = N, + weights = weights + ) + t + } + + make_var_nonprob <- function(ps, psd, y, y_pred, h_n, X, b, N, h, method_selection, weights = weights, weights_sum, pop_totals = NULL) { + method <- get_method(method_selection) + var_nonprob <- method$var_nonprob( + ps = ps, + psd = psd, + y = y, + y_pred = y_pred, + h_n = h_n, + X = X, + b = b, + N = N, + weights = weights + ) + as.numeric(var_nonprob) + } + + 
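A hedged usage sketch for the stat / boot_sample output documented above; the argument names follow the documentation (treat controlInf(var_method = ...) as an assumption), and nonprob_df / svyprob are the objects built in the example:

fit <- nonprob(
  selection = ~ x1 + x2 + x3 + x4,
  target = ~ y50,
  data = nonprob_df,
  svydesign = svyprob,
  control_inference = controlInf(var_method = "bootstrap", keep_boot = TRUE)
)
head(fit$boot_sample)   # one row per bootstrap replicate, one column per outcome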
model_selection <- function(X, + X_nons, + X_rand, + weights, + weights_rand, + R, + method_selection, + optim_method, + h = h, + est_method, + maxit, + control_selection, + start, + verbose = FALSE, + varcov = FALSE, + ...) { + method_selection_function <- paste(method_selection, "_model_nonprobsvy", sep = "") + method <- get_method(method = method_selection_function) + max_lik <- method$make_max_lik # function for propensity score estimation + loglike <- method$make_log_like + gradient <- method$make_gradient + hessian <- method$make_hessian + inv_link <- method$make_link_inv + dinv_link <- method$make_link_inv_der + + # initial values for propensity score estimation + if (is.null(start)) { + if (control_selection$start_type == "glm") { + start <- start_fit( + X = X, + R = R, + weights = weights, + weights_rand = weights_rand, + method_selection = method_selection + ) + } else if (control_selection$start_type == "naive") { + intercept_start <- suppressWarnings(max_lik( + X_nons = X_nons[, 1, drop = FALSE], + X_rand = X_rand[, 1, drop = FALSE], + weights = weights, + weights_rand = weights_rand, + start = 0, + control = control_selection + )$theta_hat) + start <- c(intercept_start, rep(0, ncol(X_nons) - 1)) + } else if (control_selection$start_type == "zero") { + start <- rep(0, ncol(X)) + } + } + + df_reduced <- nrow(X) - length(start) + + maxLik_nons_obj <- max_lik( + X_nons = X_nons, + X_rand = X_rand, + weights = weights, + weights_rand = weights_rand, + start = start, + control = control_selection + ) + + #### deviance + # # null model + # max_lik_null <- max_lik(X_nons = rep(1, nrow(X_nons)), + # X_rand = rep(1, nrow(X_rand)), + # weights = weights, + # weights_rand = weights_rand, + # start = 0, + # control = control_selection) + # log_lik_null <- max_lik_null$log_l + # + # # saturated model + # sats <- factor(1:length(R)) + # df_sat <- data.frame(R = R, sats = sats) + # mod_sat <- model.frame(R ~ sats, data = df_sat) + # X_sat <- model.matrix(mod_sat) + # max_lik_sat <- max_lik(X_nons = X_sat[which(R == 1), ,drop = FALSE], + # X_rand = X_sat[which(R == 0), ,drop = FALSE], + # weights = weights, + # weights_rand = weights_rand, + # start = rep(0, ncol(X_sat)), + # control = control_selection) + # log_lik_sat <- max_lik_sat$log_l + # + # # null deviance + # deviance_null <- log_lik_sat - log_lik_null + # + # # deviance + # deviance <- log_lik_sat - maxLik_nons_obj$log_l + + theta <- maxLik_nons_obj$theta_hat + eta_nons <- theta %*% t(X_nons) + eta_rand <- theta %*% t(X_rand) + + ps_nons <- inv_link(eta_nons) + est_ps_rand <- inv_link(eta_rand) + + ps_nons_der <- dinv_link(eta_nons) + est_ps_rand_der <- dinv_link(eta_rand) + + resids <- R - c(est_ps_rand, ps_nons) + + variance <- (t(resids) %*% resids) / df_reduced + + list( + maxLik_nons_obj = maxLik_nons_obj, + theta = theta, + ps = ps_nons, + ps_der = ps_nons_der, + ps_rand = est_ps_rand, + ps_rand_der = est_ps_rand_der, + var_cov1 = ifelse(varcov, method$variance_covariance1, "No variance-covariance matrix"), + var_cov2 = ifelse(varcov, method$variance_covariance2, "No variance-covariance matrix"), + df_residual = df_reduced, + eta_nons = eta_nons, + eta_rand = eta_rand, + residuals = resids, + variance = variance, + method = method + ) + } + structure( + list( + estimation_model = estimation_model, + make_t = make_t, + make_var_nonprob = make_var_nonprob, + model_selection = model_selection + ), + class = "method" + ) +} diff --git a/R/nn.R b/R/nn.R new file mode 100644 index 0000000..067a9bd --- /dev/null +++ b/R/nn.R @@ -0,0 
+1,93 @@ +nn_nonprobsvy <- function(outcome, + data, + weights, + family_outcome, + X_nons, + y_nons, + X_rand, + control, + n_nons, + n_rand, + vars_selection, + pop_totals, + model_frame = NULL, + start_outcome = NULL) { # TODO consider add data standardization before modelling + + model_nons <- nonprobMI_nn( + data = X_nons, + query = X_nons, + k = control$k, + treetype = control$treetype, + searchtype = control$searchtype + ) + if (is.null(pop_totals)) { + model_rand <- nonprobMI_nn( + data = X_nons, + query = X_rand, + k = control$k, + treetype = control$treetype, + searchtype = control$searchtype + ) + y_rand_pred <- vector(mode = "numeric", length = n_rand) + y_nons_pred <- vector(mode = "numeric", length = n_nons) + parameters <- "Non-parametric method for outcome model" + + y_rand_pred <- apply(model_rand$nn.idx, 1, + FUN = \(x) mean(y_nons[x]) + # FUN=\(x) mean(sample_nonprob$short_[x]) + ) + + y_nons_pred <- apply(model_nons$nn.idx, 1, + FUN = \(x) mean(y_nons[x]) + # FUN=\(x) mean(sample_nonprob$short_[x]) + ) + } else { + model_rand <- nonprobMI_nn( + data = X_nons, + query = t(as.matrix(pop_totals / pop_totals[1])), + k = control$k, + treetype = control$treetype, + searchtype = control$searchtype + ) + y_rand_pred <- vector(mode = "numeric", length = 1) + y_nons_pred <- vector(mode = "numeric", length = n_nons) + parameters <- "Non-parametric method for outcome model" + + y_rand_pred <- mean(y_nons[model_rand$nn.idx]) + y_nons_pred <- apply(model_nons$nn.idx, 1, + FUN = \(x) mean(y_nons[x]) + # FUN=\(x) mean(sample_nonprob$short_[x]) + ) + } + + model_out <- list( + model_nons = model_nons, + model_rand = model_rand + ) + list( + model = model_out, + y_rand_pred = y_rand_pred, + y_nons_pred = y_nons_pred, + parameters = parameters + ) +} + + +nonprobMI_nn <- function(data, + query, + k, + treetype, + searchtype, + radius = 0, + eps = 0) { + model_nn <- RANN::nn2( + data = data, + query = query, + k = k, + treetype = treetype, + searchtype = searchtype, + radius = radius, + eps = eps + ) + model_nn +} diff --git a/R/nonprob.R b/R/nonprob.R index 880db32..2ef3b72 100644 --- a/R/nonprob.R +++ b/R/nonprob.R @@ -58,7 +58,8 @@ nonprob <- function(data, if (is.null(selection) & is.null(outcome)) { stop("Please provide selection or outcome formula.") } - if (inherits(selection, "formula") && inherits(target, "formula") && (is.null(outcome) || inherits(outcome, "formula") == FALSE)) { + if (inherits(selection, "formula") && (is.null(outcome) || inherits(outcome, "formula") == FALSE)) { + if (inherits(target, "formula") == FALSE) stop("Please provide target variable") model_used <- "P" } diff --git a/R/nonprobDR.R b/R/nonprobDR.R index 27e1d4d..e50c4ac 100644 --- a/R/nonprobDR.R +++ b/R/nonprobDR.R @@ -68,6 +68,9 @@ nonprobDR <- function(selection, confidence_interval <- NULL SE_values <- NULL } + if (control_inference$var_method == "bootstrap") { + stat <- matrix(nrow = control_inference$num_boot, ncol = outcomes$l) + } # Selection models if (is.null(pop_totals) && !is.null(svydesign)) { @@ -121,6 +124,10 @@ nonprobDR <- function(selection, X <- rbind(SelectionModel$X_rand, SelectionModel$X_nons) # joint model matrix ###### WORKING VERSION if (var_selection == TRUE) { + # TODO add std seperately on X_nons and X_rand, after that do join to X matrix + # X_rand_stand <- ncvreg::std(weights_rand * X_rand) + # X_nons_stand <- ncvreg::std(X_nons) + # X_stand <- rbind(X_rand_stand, X_nons_stand) X_stand <- ncvreg::std(X) # penalizing without an intercept prior_weights <- c(weights_rand, 
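The nearest-neighbour machinery in nn.R above rests on a single RANN::nn2() call plus donor averaging; a toy sketch:

set.seed(2)
X_nons <- matrix(rnorm(200), ncol = 2)   # hypothetical non-probability covariates
y_nons <- rnorm(100)
X_rand <- matrix(rnorm(60), ncol = 2)    # hypothetical probability-sample covariates
nn <- RANN::nn2(data = X_nons, query = X_rand, k = 5,
                treetype = "kd", searchtype = "standard")
y_rand_pred <- apply(nn$nn.idx, 1, function(idx) mean(y_nons[idx]))  # k-donor means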
weights) @@ -181,6 +188,8 @@ nonprobDR <- function(selection, ############# WORKING VERSION if (var_selection == TRUE) { + # TODO "standardize pop_totals" - dividing by N (?) - means from X_nons / std from X_nons e.g. + # pop_totals <- colMeans(X_nons) / apply(X_nons, 2, sd) X_stand <- ncvreg::std(X) # penalizing without an intercept method_selection_function <- paste(method_selection, "_model_nonprobsvy", sep = "") @@ -413,7 +422,8 @@ nonprobDR <- function(selection, est_method = est_method, maxit = maxit, control_selection = control_selection, - start = start_selection + start = start_selection, + verbose = verbose ) estimation_method <- get_method(est_method) @@ -577,7 +587,11 @@ nonprobDR <- function(selection, ps_nons_der <- dinv_link(eta_nons) weights_nons <- 1 / ps_nons N_nons <- sum(weights * weights_nons) - variance_covariance <- solve(-hess) + variance_covariance <- try(solve(-hess), silent = TRUE) + if(inherits(variance_covariance, "try-error")){ + if(verbose) message("solve() failed, using ginv() instead.") + variance_covariance <- MASS::ginv(-hess) + } theta_standard_errors <- sqrt(diag(variance_covariance)) df_residual <- nrow(SelectionModel$X_nons) - length(theta_hat) # if(is.null(pop_size)) pop_size <- N_nons @@ -659,7 +673,8 @@ nonprobDR <- function(selection, h = h, pop_totals = pop_totals, sigma = sigma, - bias_correction = bias_corr + bias_correction = bias_corr, + verbose = verbose ) var_prob <- var_obj$var_prob @@ -740,7 +755,8 @@ nonprobDR <- function(selection, } SE_values[[k]] <- data.frame(t(data.frame("SE" = c(nonprob = NA, prob = NA)))) var <- boot_obj$var - mu_hat <- boot_obj$mu + stat[, k] <- boot_obj$stat + # mu_hat <- boot_obj$mu } else { stop("Invalid method for variance estimation.") } @@ -768,6 +784,11 @@ nonprobDR <- function(selection, # dimnames = list(names(theta_hat), # c("Estimate", "Std. 
Error"))) OutcomeList[[k]]$method <- method_outcome + if (control_inference$vars_selection == TRUE) { + OutcomeList[[k]]$cve <- cve_outcome + } else { + NULL + } } weights_summary <- summary(as.vector(weights_nons)) prop_scores <- c(ps_nons, est_ps_rand) @@ -780,6 +801,13 @@ nonprobDR <- function(selection, names(pop_size) <- "pop_size" names(ys) <- all.vars(outcome_init[[2]]) + boot_sample <- if (control_inference$var_method == "bootstrap" & control_inference$keep_boot) { + stat + } else { + NULL + } + if (!is.null(boot_sample) & is.matrix(boot_sample)) colnames(boot_sample) <- names(ys) + SelectionList <- list( coefficients = selection_model$theta_hat, std_err = theta_standard_errors, @@ -794,7 +822,12 @@ nonprobDR <- function(selection, prior.weights = weights, formula = selection, df_residual = selection_model$df_residual, - log_likelihood = selection_model$log_likelihood + log_likelihood = selection_model$log_likelihood, + cve = if (control_inference$vars_selection == TRUE) { + cve_selection + } else { + NULL + } ) # df.null = selection_model$df_null # converged) @@ -818,7 +851,8 @@ nonprobDR <- function(selection, prob_size = n_rand, pop_size = pop_size, outcome = OutcomeList, - selection = SelectionList + selection = SelectionList, + boot_sample = boot_sample ), class = c("nonprobsvy", "nonprobsvy_dr") ) diff --git a/R/nonprobIPW.R b/R/nonprobIPW.R index e2a9c01..205d80d 100644 --- a/R/nonprobIPW.R +++ b/R/nonprobIPW.R @@ -5,6 +5,7 @@ #' @importFrom stats qnorm #' @importFrom stats as.formula #' @importFrom stats terms +#' @importFrom MASS ginv #' @import Rcpp #' @importFrom Rcpp evalCpp @@ -97,6 +98,7 @@ nonprobIPW <- function(selection, if (var_selection == TRUE) { # X_stand <- cbind(1, ncvreg::std(X)) # standardization of variables before fitting + # TODO add std seperately on X_nons and X_rand, after that do join to X matrix X_stand <- ncvreg::std(X) # penalizing without an intercept prior_weights <- c(weights_rand, weights) @@ -152,8 +154,9 @@ nonprobIPW <- function(selection, est_method = est_method, maxit = maxit, start = start_selection, - varcov = TRUE, - control_selection = control_selection + control_selection = control_selection, + verbose = verbose, + varcov = TRUE ) estimation_method <- get_method(est_method) @@ -223,6 +226,10 @@ nonprobIPW <- function(selection, if (var_selection == TRUE) { X_stand <- ncvreg::std(X) # penalizing without an intercept + # pop_totals_varsel <- pop_totals[-1] - colMeans(model$X_nons[,-1]) / apply(model$X_nons[,-1], 2, sd) * pop_totals[1] + # print(pop_totals) + # print(pop_totals_varsel) + # stop("123") method_selection_function <- paste(method_selection, "_model_nonprobsvy", sep = "") method <- get_method(method_selection_function) @@ -233,7 +240,7 @@ nonprobIPW <- function(selection, # Cross-validation for variable selection cv <- cv_nonprobsvy_rcpp( - X = X_stand, # TODO TO FIX + X = X_stand, R = R, weights_X = weights, method_selection = method_selection, @@ -316,7 +323,11 @@ nonprobIPW <- function(selection, eta_nons <- theta_hat %*% t(X_nons) ps_nons <- inv_link(eta_nons) ps_nons_der <- dinv_link(eta_nons) - variance_covariance <- solve(-hess) + variance_covariance <- try(solve(-hess), silent = TRUE) + if(inherits(variance_covariance, "try-error")){ + if(verbose) message("solve() failed, using ginv() instead.") + variance_covariance <- MASS::ginv(-hess) + } theta_standard_errors <- sqrt(diag(variance_covariance)) var_cov1 <- method$variance_covariance1 var_cov2 <- method$variance_covariance2 @@ -344,7 +355,7 @@ nonprobIPW <- 
function(selection, } else { stop("Please, provide svydesign object or pop_totals/pop_means.") } - + mu_hats <- numeric(length = outcomes$l) for (k in 1:outcomes$l) { if (is.null(pop_totals)) { y_nons <- model_frame( @@ -366,23 +377,30 @@ nonprobIPW <- function(selection, )$y_nons } ys[[k]] <- as.numeric(y_nons) - mu_hat <- mu_hatIPW( + mu_hats[k] <- mu_hatIPW( y = y_nons, weights = weights, weights_nons = weights_nons, N = ifelse(is.null(pop_size), N, pop_size) ) # IPW estimator # consider using weighted.mean function # mu_hat <- weighted.mean(y_nons, w = weights * weights_nons) - if (se) { - if (var_method == "analytic") { + } + if (se) { + if (var_method == "analytic") { + var_nonprob <- numeric(length = outcomes$l) + var_prob <- numeric(length = outcomes$l) + var <- numeric(length = outcomes$l) + se_nonprob <- numeric(length = outcomes$l) + se_prob <- numeric(length = outcomes$l) + for (k in 1:outcomes$l) { var_obj <- internal_varIPW( svydesign = svydesign, X_nons = X_nons, X_rand = X_rand, - y_nons = y_nons, + y_nons = ys[[k]], weights = weights, ps_nons = ps_nons, - mu_hat = mu_hat, + mu_hat = mu_hats[k], hess = hess, ps_nons_der = ps_nons_der, N = N, @@ -397,85 +415,92 @@ nonprobIPW <- function(selection, theta = theta_hat, h = h, var_cov1 = var_cov1, - var_cov2 = var_cov2 + var_cov2 = var_cov2, + verbose = verbose ) - var_nonprob <- var_obj$var_nonprob - var_prob <- var_obj$var_prob - var <- var_obj$var - se_nonprob <- sqrt(var_nonprob) - se_prob <- sqrt(var_prob) - SE_values[[k]] <- data.frame(t(data.frame("SE" = c(prob = se_prob, nonprob = se_nonprob)))) - } else if (var_method == "bootstrap") { - if (control_inference$cores > 1) { - boot_obj <- bootIPW_multicore( - X_rand = X_rand, - X_nons = X_nons, - svydesign = svydesign, - y = y_nons, - num_boot = num_boot, - weights = weights, - weights_rand = weights_rand, - R = R, - theta_hat = theta_hat, - mu_hat = mu_hat, - method_selection = method_selection, - start_selection = start_selection, - n_nons = n_nons, - n_rand = n_rand, - optim_method = optim_method, - est_method = est_method, - h = h, - maxit = maxit, - pop_size = pop_size, - pop_totals = pop_totals, - control_selection = control_selection, - control_inference = control_inference, - cores = control_inference$cores, - verbose = verbose - ) - } else { - boot_obj <- bootIPW( - X_rand = X_rand, - X_nons = X_nons, - svydesign = svydesign, - y = y_nons, - num_boot = num_boot, - weights = weights, - weights_rand = weights_rand, - R = R, - theta_hat = theta_hat, - mu_hat = mu_hat, - method_selection = method_selection, - start_selection = start_selection, - n_nons = n_nons, - n_rand = n_rand, - optim_method = optim_method, - est_method = est_method, - h = h, - maxit = maxit, - pop_size = pop_size, - pop_totals = pop_totals, - control_selection = control_selection, - control_inference = control_inference, - verbose = verbose - ) - } - var <- boot_obj$var - mu_hat <- boot_obj$mu - SE_values[[k]] <- data.frame(t(data.frame("SE" = c(nonprob = NA, prob = NA)))) + var_nonprob[k] <- var_obj$var_nonprob + var_prob[k] <- var_obj$var_prob + var[k] <- var_obj$var + se_nonprob[k] <- sqrt(var_nonprob[k]) + se_prob[k] <- sqrt(var_prob[k]) + SE_values[[k]] <- data.frame(t(data.frame("SE" = c(prob = se_prob[k], nonprob = se_nonprob[k])))) + } + } else if (var_method == "bootstrap") { # TODO add ys, mu_hats instead of y_nons, + if (control_inference$cores > 1) { + boot_obj <- bootIPW_multicore( + X_rand = X_rand, + X_nons = X_nons, + svydesign = svydesign, + ys = ys, # + num_boot = num_boot, + 
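The mu_hatIPW() call above is a plain inverse-probability-weighted mean, equivalent to the weighted.mean() form mentioned in the comment; with toy numbers:

y  <- c(2.1, 3.4, 1.9)          # hypothetical non-probability outcomes
ps <- c(0.010, 0.020, 0.015)    # estimated propensity scores
w  <- rep(1, 3)                 # prior unit weights
weights_nons <- 1 / ps
N_hat  <- sum(w * weights_nons)
mu_hat <- sum(w * weights_nons * y) / N_hat   # == weighted.mean(y, w * weights_nons)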
weights = weights, + weights_rand = weights_rand, + R = R, + theta_hat = theta_hat, + mu_hats = mu_hats, # + method_selection = method_selection, + start_selection = start_selection, + n_nons = n_nons, + n_rand = n_rand, + optim_method = optim_method, + est_method = est_method, + h = h, + maxit = maxit, + pop_size = pop_size, + pop_totals = pop_totals, + control_selection = control_selection, + control_inference = control_inference, + cores = control_inference$cores, + verbose = verbose + ) } else { - stop("Invalid method for variance estimation.") + boot_obj <- bootIPW( + X_rand = X_rand, + X_nons = X_nons, + svydesign = svydesign, + ys = ys, # + num_boot = num_boot, + weights = weights, + weights_rand = weights_rand, + R = R, + theta_hat = theta_hat, + mu_hats = mu_hats, # + method_selection = method_selection, + start_selection = start_selection, + n_nons = n_nons, + n_rand = n_rand, + optim_method = optim_method, + est_method = est_method, + h = h, + maxit = maxit, + pop_size = pop_size, + pop_totals = pop_totals, + control_selection = control_selection, + control_inference = control_inference, + verbose = verbose + ) + } + var <- boot_obj$var + # mu_hat <- boot_obj$mu + for (k in 1:outcomes$l) { + SE_values[[k]] <- data.frame(t(data.frame("SE" = c(nonprob = NA, prob = NA)))) } - SE <- sqrt(var) - alpha <- control_inference$alpha - z <- stats::qnorm(1 - alpha / 2) - # confidence interval based on the normal approximation + } else { + stop("Invalid method for variance estimation.") + } + SE <- sqrt(var) + alpha <- control_inference$alpha + z <- stats::qnorm(1 - alpha / 2) + # confidence interval based on the normal approximation + for (k in 1:outcomes$l) { confidence_interval[[k]] <- data.frame(t(data.frame("normal" = c( - lower_bound = mu_hat - z * SE, - upper_bound = mu_hat + z * SE + lower_bound = mu_hats[k] - z * SE[k], + upper_bound = mu_hats[k] + z * SE[k] )))) - } else { + } + } else { + for (k in 1:outcomes$l) { SE <- NA confidence_interval[[k]] <- data.frame(t(data.frame("normal" = c( lower_bound = NA, @@ -483,11 +508,11 @@ nonprobIPW <- function(selection, )))) SE_values[[k]] <- data.frame(t(data.frame("SE" = c(nonprob = NA, prob = NA)))) } - - output[[k]] <- data.frame(t(data.frame(result = c(mean = mu_hat, SE = SE)))) } - - X <- rbind(X_nons, X_rand) # joint model matrix + for (k in 1:outcomes$l) { + output[[k]] <- data.frame(t(data.frame(result = c(mean = mu_hats[k], SE = SE[k])))) + } + X <- rbind(X_rand, X_nons) # joint model matrix parameters <- matrix(c(theta_hat, theta_standard_errors), ncol = 2, dimnames = list( @@ -505,6 +530,14 @@ nonprobIPW <- function(selection, names(pop_size) <- "pop_size" names(ys) <- all.vars(outcome_init[[2]]) + boot_sample <- if (control_inference$var_method == "bootstrap" & control_inference$keep_boot) { + boot_obj$stat + } else { + NULL + } + if (!is.null(boot_sample) & is.matrix(boot_sample)) colnames(boot_sample) <- names(ys) + + SelectionList <- list( coefficients = selection_model$theta_hat, std_err = theta_standard_errors, @@ -518,10 +551,14 @@ nonprobIPW <- function(selection, prior.weights = weights, formula = selection, df_residual = selection_model$df_residual, - log_likelihood = selection_model$log_likelihood + log_likelihood = selection_model$log_likelihood, + cve = if (control_inference$vars_selection == TRUE) { + cve_selection + } else { + NULL + } ) - structure( list( X = if (isTRUE(x)) X else NULL, @@ -538,7 +575,8 @@ nonprobIPW <- function(selection, nonprob_size = n_nons, prob_size = n_rand, pop_size = pop_size, - 
selection = SelectionList + selection = SelectionList, + boot_sample = boot_sample ), class = c("nonprobsvy", "nonprobsvy_ipw") ) diff --git a/R/nonprobMI.R b/R/nonprobMI.R index 4926b48..3e74cd0 100644 --- a/R/nonprobMI.R +++ b/R/nonprobMI.R @@ -45,11 +45,114 @@ nonprobMI <- function(outcome, SE_values <- NULL } num_boot <- control_inference$num_boot - if (method_outcome == "pmm") { + if (method_outcome == "pmm" & (!is.null(pop_totals) | !is.null(pop_means)) ) { control_inference$var_method <- "bootstrap" message("Bootstrap variance only, analytical version during implementation.") } + if (control_inference$var_method == "bootstrap") { + stat <- matrix(nrow = control_inference$num_boot, ncol = outcomes$l) + } + for (k in 1:outcomes$l) { + + if (control_outcome$pmm_k_choice == "min_var" & method_outcome == "pmm") { + # This can be programmed a lot better possibly with custom method outcome that would + # store previous k-pmm model and omit the last estimation + + ## TODO:: right now this only goes forward not backwards + var_prev <- Inf + cond <- TRUE + kk <- control_outcome$k - 1 + while (cond) { + OutcomeModel <- model_frame(formula = outcome, data = data, svydesign = svydesign) + X_nons <- OutcomeModel$X_nons + X_rand <- OutcomeModel$X_rand + nons_names <- OutcomeModel$nons_names + y_nons <- OutcomeModel$y_nons + + R_nons <- rep(1, nrow(X_nons)) + R_rand <- rep(0, nrow(X_rand)) + R <- c(R_nons, R_rand) + + loc_nons <- which(R == 1) + loc_rand <- which(R == 0) + + n_nons <- nrow(X_nons) + n_rand <- nrow(X_rand) + X <- rbind(X_nons, X_rand) + + ps_rand <- svydesign$prob + weights_rand <- 1 / ps_rand + N_est_rand <- sum(weights_rand) + + kk <- kk + 1 + method_outcome_nonprobsvy <- paste(method_outcome, "_nonprobsvy", sep = "") + ## estimation + + MethodOutcome <- get(method_outcome_nonprobsvy, mode = "function", envir = parent.frame()) + model_obj <- MethodOutcome( + outcome = outcome, + data = data, + weights = weights, + family_outcome = family_outcome, + start_outcome = start_outcome, + X_nons = X_nons, + y_nons = y_nons, + X_rand = X_rand, + control = control_outcome, + n_nons = n_nons, + n_rand = n_rand, + model_frame = OutcomeModel$model_frame_rand, + vars_selection = control_inference$vars_selection, + pop_totals = pop_totals + ) + y_rand_pred <- model_obj$y_rand_pred + y_nons_pred <- model_obj$y_nons_pred + # parameters <- model_obj$parameters + OutcomeList[[k]] <- model_obj$model + + # updating probability sample by adding y_hat variable + svydesign1 <- stats::update(svydesign, + y_hat_MI = y_rand_pred + ) + mu_hat <- weighted.mean(y_rand_pred, w = weights_rand) + + var_obj <- internal_varMI( + svydesign = svydesign1, + X_nons = X_nons, + X_rand = X_rand, + y = y_nons, + y_pred = y_nons_pred, + weights_rand = weights_rand, + method = method_outcome, + n_rand = n_rand, + n_nons = n_nons, + N = N_est_rand, + family = family_outcome, + model_obj = model_obj, + pop_totals = pop_totals, + k = control_outcome$k, + predictive_match = control_outcome$predictive_match, + pmm_exact_se = control_inference$pmm_exact_se, + pmm_reg_engine = control_outcome$pmm_reg_engine, + pi_ij = control_inference$pi_ij + ) + + var_nonprob <- var_obj$var_nonprob + var_prob <- var_obj$var_prob + + se_nonprob <- sqrt(var_nonprob) + se_prob <- sqrt(var_prob) + SE_values[[k]] <- data.frame(t(data.frame("SE" = c(prob = se_prob, nonprob = se_nonprob)))) + # variance + var_now <- var_nonprob + var_prob + cond <- var_prev > var_now + var_prev <- var_now + } + control_outcome$k <- kk - 1 + svydesign1 <- NULL # freeing 
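The pmm_k_choice == "min_var" loop above is a forward search over the number of donors; its control flow stripped of the model fitting, with variance_for_k() as a hypothetical stand-in for the PMM refit plus internal_varMI():

variance_for_k <- function(k) 1 / k + 0.001 * k^2   # hypothetical variance profile
var_prev <- Inf
cond <- TRUE
kk <- 5 - 1                                         # control_outcome$k - 1
while (cond) {
  kk <- kk + 1
  var_now <- variance_for_k(kk)                     # refit with k = kk donors
  cond <- var_prev > var_now                        # continue while the variance falls
  var_prev <- var_now
}
kk - 1                                              # selected k: last value that improved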
up memmory + } + if (is.null(pop_totals) && !is.null(svydesign)) { pop_totals_sel <- pop_totals outcome <- outcomes$outcome[[k]] @@ -79,6 +182,7 @@ nonprobMI <- function(outcome, ########### WORKING VERSION if (var_selection == TRUE) { + # TODO add variables randomization nlambda <- control_outcome$nlambda beta <- ncvreg::cv.ncvreg( X = X_nons[, -1, drop = FALSE], @@ -88,7 +192,10 @@ nonprobMI <- function(outcome, trace = verbose, nfolds = control_outcome$nfolds, nlambda = nlambda, - gamma = switch(control_outcome$penalty, SCAD = control_outcome$a_SCAD, control_outcome$a_MCP), + gamma = switch(control_outcome$penalty, + SCAD = control_outcome$a_SCAD, + control_outcome$a_MCP + ), lambda_min = control_outcome$lambda_min, eps = control_outcome$epsilon ) @@ -102,13 +209,14 @@ nonprobMI <- function(outcome, X_design <- as.matrix(X[, beta_selected + 1, drop = FALSE]) # colnames(X_design) <- c("(Intercept)", colnames(Xsel)) - X_rand <- X_design[loc_rand, ] - X_nons <- X_design[loc_nons, ] + X_rand <- X_design[loc_rand, , drop = FALSE] + X_nons <- X_design[loc_nons, , drop = FALSE] } ################ method_outcome_nonprobsvy <- paste(method_outcome, "_nonprobsvy", sep = "") ## estimation + MethodOutcome <- get(method_outcome_nonprobsvy, mode = "function", envir = parent.frame()) model_obj <- MethodOutcome( outcome = outcome, @@ -166,7 +274,10 @@ nonprobMI <- function(outcome, trace = verbose, nfolds = control_outcome$nfolds, nlambda = nlambda, - gamma = switch(control_outcome$penalty, SCAD = control_outcome$a_SCAD, control_outcome$a_MCP), + gamma = switch(control_outcome$penalty, + SCAD = control_outcome$a_SCAD, + control_outcome$a_MCP + ), lambda_min = control_outcome$lambda_min, eps = control_outcome$epsilon ) @@ -211,6 +322,12 @@ nonprobMI <- function(outcome, } else { stop("Please, provide svydesign object or pop_totals/pop_means.") } + + if (isTRUE(attr(model_obj$model, "method") == "pmm") & !(control_inference$pmm_exact_se)) { + # if not pmm_exact_se then this can be dropped + model_obj$model$glm_obj <- NULL + } + ys[[k]] <- as.numeric(y_nons) if (se) { # design based variance estimation based on approximations of the second-order inclusion probabilities @@ -227,8 +344,14 @@ nonprobMI <- function(outcome, n_nons = n_nons, N = N_est_rand, family = family_outcome, - parameters = model_obj$parameters, - pop_totals = pop_totals + model_obj = model_obj, + pop_totals = pop_totals, + # we should probably just pass full control list + k = control_outcome$k, + predictive_match = control_outcome$predictive_match, + pmm_exact_se = control_inference$pmm_exact_se, + pmm_reg_engine = control_outcome$pmm_reg_engine, + pi_ij = control_inference$pi_ij ) var_nonprob <- var_obj$var_nonprob @@ -242,16 +365,18 @@ nonprobMI <- function(outcome, } else if (control_inference$var_method == "bootstrap") { # TODO for pop_totals # bootstrap variance if (control_inference$cores > 1) { - boot_obj <- bootMI_multicore(X_rand, - X_nons, - weights, - y_nons, - family_outcome, + boot_obj <- bootMI_multicore( + X_rand = X_rand, + X_nons = X_nons, + weights = weights, + y = y_nons, + family_outcome = family_outcome, start_outcome = start_outcome, num_boot = num_boot, weights_rand, mu_hat, svydesign, + model_obj = model_obj, rep_type = control_inference$rep_type, method = method_outcome, control_outcome = control_outcome, @@ -261,16 +386,18 @@ nonprobMI <- function(outcome, verbose = verbose ) } else { - boot_obj <- bootMI(X_rand, - X_nons, - weights, - y_nons, - family_outcome, + boot_obj <- bootMI( + X_rand = X_rand, + X_nons 
= X_nons, + weights = weights, + y = y_nons, + family_outcome = family_outcome, start_outcome = start_outcome, num_boot = num_boot, weights_rand, mu_hat, svydesign, + model_obj = model_obj, rep_type = control_inference$rep_type, method = method_outcome, control_outcome = control_outcome, @@ -280,7 +407,9 @@ nonprobMI <- function(outcome, ) } var <- boot_obj$var - mu_hat <- boot_obj$mu + stat[, k] <- boot_obj$stat + comp3_stat <- boot_obj$comp3_stat + # mu_hat <- boot_obj$mu SE_values[[k]] <- data.frame(t(data.frame("SE" = c( nonprob = NA, prob = NA @@ -305,12 +434,17 @@ nonprobMI <- function(outcome, SE_values[[k]] <- data.frame(t(data.frame("SE" = c(nonprob = NA, prob = NA)))) } - X <- rbind(X_nons, X_rand) # joint model matrix + X <- rbind(X_rand, X_nons) # joint model matrix # if (is.null(pop_size)) pop_size <- N_est_rand # estimated pop_size output[[k]] <- data.frame(t(data.frame(result = c(mean = mu_hat, SE = SE)))) OutcomeList[[k]]$method <- method_outcome + if (control_inference$vars_selection == TRUE) { + OutcomeList[[k]]$cve <- cve_outcome + } else { + NULL + } } output <- do.call(rbind, output) confidence_interval <- do.call(rbind, confidence_interval) @@ -320,6 +454,13 @@ nonprobMI <- function(outcome, names(pop_size) <- "pop_size" names(ys) <- all.vars(outcome_init[[2]]) + boot_sample <- if (control_inference$var_method == "bootstrap" & control_inference$keep_boot) { + list(stat = stat, comp2 = boot_obj$comp2) + } else { + NULL + } + if (!is.null(boot_sample) & is.matrix(boot_sample)) colnames(boot_sample) <- names(ys) + structure( list( X = if (isTRUE(x)) X else NULL, @@ -334,8 +475,47 @@ nonprobMI <- function(outcome, nonprob_size = n_nons, prob_size = n_rand, pop_size = pop_size, - outcome = OutcomeList + outcome = OutcomeList, + boot_sample = boot_sample ), class = c("nonprobsvy", "nonprobsvy_mi") ) } + + + +nonprobMI_fit <- function(outcome, + data, + weights, + svydesign, + family_outcome, + start, + control_outcome = controlOut(), + verbose, + model, + x, + y) { + family <- family_outcome + + if (is.character(family)) { + family <- get(family, mode = "function", envir = parent.frame()) + } + if (is.function(family)) { + family <- family() + } + data$weights <- weights # TODO just for now, find more efficient way + model_nons <- stats::glm( + formula = outcome, + data = data, + weights = weights, + family = family, + start = start, + control = list( + control_outcome$epsilon, + control_outcome$maxit, + control_outcome$trace + ) + ) + + model_nons +} diff --git a/R/pmm.R b/R/pmm.R new file mode 100644 index 0000000..8f93e3b --- /dev/null +++ b/R/pmm.R @@ -0,0 +1,259 @@ +#' @importFrom stats loess +#' @importFrom stats predict +#' @importFrom stats loess.control +pmm_nonprobsvy <- function(outcome, + data, + weights, + family_outcome, + start_outcome, + X_nons, + y_nons, + X_rand, + control, + n_nons, + n_rand, + vars_selection, + pop_totals, + model_frame) { + glm_object <- switch (control$pmm_reg_engine, + "glm" = glm_nonprobsvy( + outcome, + data, + weights, + family_outcome, + start_outcome = start_outcome, + X_nons, + y_nons, + X_rand, + control, + n_nons, + n_rand, + model_frame, + vars_selection, + pop_totals + ), + "loess" = { + # doesn't accept weights + mm <- stats::loess( + outcome, + data, + span = .2, + control = stats::loess.control(surface = "direct") + ) + mm$data <- data + mm$formula <- outcome + + list( + model = mm, + y_rand_pred = predict(mm, newdata = model_frame), + y_nons_pred = predict(mm), + parameters = NULL + ) + } + ) + + # This is commented now 
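The "loess" engine above wraps stats::loess() with a direct surface so predictions extrapolate beyond the training range; standalone on mtcars:

mm <- stats::loess(mpg ~ wt, data = mtcars, span = .2,
                   control = stats::loess.control(surface = "direct"))
y_nons_pred <- predict(mm)                                        # fitted training values
y_rand_pred <- predict(mm, newdata = data.frame(wt = c(1.5, 6)))  # out of range is fine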
because it is not needed + # model_nons <- nonprobMI_nn(data = glm_object$y_nons_pred, + # query = glm_object$y_nons_pred, + # k = control$k, + # treetype = control$treetype, + # searchtype = control$searchtype) + # + # y_nons_pred <- apply(model_nons$nn.idx, 1, + # FUN=\(x) mean(y_nons[x]) + # #FUN=\(x) mean(sample_nonprob$short_[x]) + # ) + + # add protection for very low values in weighting + switch(control$predictive_match, + { # 1 + if (is.null(pop_totals)) { + model_rand <- nonprobMI_nn( + data = y_nons, + query = glm_object$y_rand_pred, + k = control$k, + treetype = control$treetype, + searchtype = control$searchtype + ) + + switch(control$pmm_weights, + "none" = { + y_rand_pred <- apply(model_rand$nn.idx, 1, + FUN = \(x) mean(y_nons[x]) + # FUN=\(x) mean(sample_nonprob$short_[x]) + ) + }, + "prop_dist" = { + # TODO:: these weights will need to be saved for variance estimation + y_rand_pred <- sapply(1:NROW(model_rand$nn.idx), + FUN = \(x) weighted.mean(y_nons[model_rand$nn.idx[x, ]], + w = 1 / model_rand$nn.dist[x, ] + ) + # FUN=\(x) mean(sample_nonprob$short_[x]) + ) + } + ) + } else { + # I'm not touching this + model_rand <- nonprobMI_nn( + data = y_nons, + query = glm_object$y_rand_pred, + k = control$k, + treetype = control$treetype, + searchtype = control$searchtype + ) + y_rand_pred <- mean(y_nons[model_rand$nn.idx]) + } + }, + { # 2 + if (is.null(pop_totals)) { + model_rand <- nonprobMI_nn( + data = glm_object$y_nons_pred, + query = glm_object$y_rand_pred, + k = control$k, + treetype = control$treetype, + searchtype = control$searchtype + ) + + y_rand_pred <- apply(model_rand$nn.idx, 1, + FUN = \(x) mean(y_nons[x]) + # FUN=\(x) mean(sample_nonprob$short_[x]) + ) + + switch(control$pmm_weights, + "none" = { + y_rand_pred <- apply(model_rand$nn.idx, 1, + FUN = \(x) mean(y_nons[x]) + # FUN=\(x) mean(sample_nonprob$short_[x]) + ) + }, + "prop_dist" = { + # TODO:: these weights will need to be saved for variance estimation + y_rand_pred <- sapply(1:NROW(model_rand$nn.idx), + FUN = \(x) weighted.mean(y_nons[model_rand$nn.idx[x, ]], + w = 1 / model_rand$nn.dist[x, ] + ) + # FUN=\(x) mean(sample_nonprob$short_[x]) + ) + } + ) + } else { + # I'm not touching this + model_rand <- nonprobMI_nn( + data = glm_object$y_nons_pred, + query = glm_object$y_rand_pred, + k = control$k, + treetype = control$treetype, + searchtype = control$searchtype + ) + y_rand_pred <- mean(y_nons[model_rand$nn.idx]) + } + } + ) + + model_out <- list( + #model_nons = model_nons, + model_rand = model_rand, + glm_object = glm_object$model + ) + attr(model_out, "method") <- "pmm" + list( + model = model_out, + y_rand_pred = y_rand_pred, + # y_nons_pred = y_nons_pred, + parameters = glm_object$parameters + ) +} + + + +pmm_exact <- function(pi_ij, + weights_rand, + n_nons, + y, + pmm_reg_engine, + stats, + glm, + model_obj, + svydesign, + predictive_match, + k, + N) { + # if (isTRUE("ppsmat" %in% class(pi_ij))) { + # pi_ij <- pi_ij$pij + # } + # # if (!is.null(svydesign$dcheck[[1]]$dcheck)) { + # # pi_ij <- svydesign$dcheck[[1]]$dcheck + # # } + # if (is.null(pi_ij)) { + # pi_ij <- outer(1 / weights_rand, 1 / weights_rand) * ( + # 1 - outer(1 - 1 / weights_rand, 1 - 1 / weights_rand) / + # sum(1 - 1 / weights_rand)) + # } + # # if (!is.matrix(pi_ij)) { + # # + # # } + # add variable for loop size to control + loop_size <- 50 + + dd <- vector(mode = "numeric", length = loop_size) + for (jj in 1:loop_size) { + reg_object_boot <- NULL + while (is.null(reg_object_boot)) { + boot_samp <- sample(1:n_nons, size = n_nons, 
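The "prop_dist" option above averages the k donors with inverse-distance weights rather than equally; in isolation, for a toy k = 2 neighbour output (a zero distance would need the protection flagged in the comment above):

nn_idx  <- rbind(c(1, 3), c(2, 3))          # donor indices for two recipients
nn_dist <- rbind(c(0.1, 0.4), c(0.2, 0.2))  # matching distances
y_nons  <- c(10, 20, 30)
y_rand_pred <- sapply(1:NROW(nn_idx), function(i)
  weighted.mean(y_nons[nn_idx[i, ]], w = 1 / nn_dist[i, ]))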
replace = TRUE) + # boot_samp <- sample(1:n_rand, size = n_rand, replace = TRUE) + y_nons_b <- y[boot_samp] + + reg_object_boot <- switch (pmm_reg_engine, + "glm" = stats::glm( + formula = model_obj$model$glm_object$formula, + data = model_obj$model$glm_object$data[boot_samp, , drop = FALSE], + #weights = weights, + family = model_obj$model$glm_object$family, + start = model_obj$model$glm_object$coefficients + ), + "loess" = stats::loess( + formula = model_obj$model$glm_object$formula, + data = model_obj$model$glm_object$data[boot_samp, , drop = FALSE], + span = .2, + control = stats::loess.control(surface = "direct") + ) + ) + XX <- predict( + reg_object_boot, + newdata = svydesign$variables, + type = "response" + ) + #XX <- reg_object_boot$family$mu.eta(X_rand %*% reg_object_boot$coefficients) + + if (any(!is.finite(XX))) { + reg_object_boot <- NULL + } + } + + YY <- switch (predictive_match, + {nonprobMI_nn( + data = y_nons_b, + query = XX, + k = k, + searchtype = "standard", + treetype = "kd" + )}, + {nonprobMI_nn( + data = predict( + reg_object_boot, + newdata = model_obj$model$glm_object$data[boot_samp, , drop = FALSE], + type = "response" + ), + query = XX, + k = k, + searchtype = "standard", + treetype = "kd" + )} + ) + + dd[jj] <- weighted.mean(apply(YY$nn.idx, 1, FUN=\(x) mean(y_nons_b[x])), + weights_rand) + } + var(dd) +} diff --git a/R/prints.R b/R/prints.R new file mode 100644 index 0000000..3fdbba5 --- /dev/null +++ b/R/prints.R @@ -0,0 +1,136 @@ +# no print doccumentation + +#' @method print nonprobsvy +#' @exportS3Method +print.nonprobsvy <- function(x, digits = 8, ...) { + if (!is.null(x$call)) { + cat("\nCall:\n", paste(deparse(x$call), sep = "\n", collapse = "\n"), + "\n\n", + sep = "" + ) + } + cat( + "Estimated population mean with overall std.err and confidence interval:\n\n" + ) + print(cbind(mean = x$output$mean, SE = x$output$SE, x$confidence_interval)) + invisible(x) +} +#' @method print summary_nonprobsvy +#' @importFrom stats printCoefmat +#' @exportS3Method +print.summary_nonprobsvy <- function(x, + signif.stars = getOption("show.signif.stars"), + digits = max(3L, getOption("digits") - 3L), + ...) 
{ # TODO regression diagnostics divided into outcome and selection models
+  if (!is.null(x$call)) {
+    cat("\nCall:\n", paste(deparse(x$call), sep = "\n", collapse = "\n"),
+      "\n\n",
+      sep = ""
+    )
+  }
+  # TODO add printing the Info only for DR and MI models
+  if (length(x$pop_total$mean) > 1) {
+    cat("Info:\n", "The summary contains information mainly on the first outcome variable.\n",
+      "More details on the estimation of this variable and others can be found in the outcome list of the nonprob object.",
+      "\n\n",
+      sep = ""
+    )
+  }
+
+  # cat("Residuals:\n")
+  # print(summary(c(x$residuals[, 1])))
+
+  cat("-------------------------\n")
+
+  # cat(
+  #   sep = "",
+  #   "Estimated population mean: ", format(x$pop_total$mean, digits = digits),
+  #   " with overall std.err of: ", format(x$pop_total$se[1], digits = digits),
+  #   "\nAnd std.err for nonprobability and probability samples being respectively:\n",
+  #   ifelse(!is.null(x$pop_total$cnf_int), format(x$pop_total$se[3], digits = digits), ""), " and ", ifelse(!is.null(x$pop_total$cnf_int), format(x$pop_total$se[2], digits = digits), ""), # TODO for se = FALSE
+  #   "\n\nBased on: ", x$model, " method",
+  #   "\n\n",(1 - x$control$control_inference$alpha)*100, "% Confidence interval for population mean:\n"
+  # )
+  # print(x$pop_total$cnf_int)
+
+  cat(
+    sep = "",
+    "Estimated population mean: ", format(x$pop_total$mean[1], digits = digits)
+  )
+  if (!is.null(x$pop_total$cnf_int)) {
+    cat(
+      sep = "",
+      " with overall std.err of: ", format(x$pop_total$se[1], digits = digits),
+      "\nAnd std.err for non-probability and probability samples respectively:\n",
+      format(x$pop_total$se[3], digits = digits), " and ", format(x$pop_total$se[2], digits = digits), # TODO for se = FALSE
+      "\n\n", (1 - x$control$control_inference$alpha) * 100, "% Confidence interval for population mean:\n"
+    )
+    print(x$pop_total$cnf_int)
+  }
+
+  cat(sep = "", "\n\nBased on: ", x$model, " method")
+
+
+  cat(
+    sep = "",
+    "\nFor a population of estimated size: ", x$population_size,
+    "\nObtained on a non-probability sample of size: ", x$sample_size[2],
+    "\nWith an auxiliary probability sample of size: ", x$sample_size[1], # TODO
+    "\n"
+  )
+
+  cat("-------------------------\n\n")
+
+  k <- length(x$names)
+  if (k > 0) cat("Regression coefficients:")
+  while (k > 0) {
+    cat("\n-----------------------\nFor ", x$names[k], ":\n", sep = "")
+    printCoefmat(
+      matrix( # TODO:: add conf intervals
+        data = c(x$coef[[k]], x$std_err[[k]], x$w_val[[k]], x$p_values[[k]]), # TODO named coefs for selection model
+        ncol = 4,
+        dimnames = list(
+          names(x$coef[[k]]),
+          switch(x$test,
+            "t" = c("Estimate", "Std. Error", "t value", "P(>|t|)"),
+            "z" = c("Estimate", "Std. Error", "z value", "P(>|z|)")
+          )
+        )
+      ),
+      digits = digits,
+      signif.stars = signif.stars,
+      signif.legend = if (k == length(x$names)) signif.stars else FALSE,
+      P.values = TRUE, has.Pvalue = TRUE,
+      na.print = "NA",
+      ...
+ ) + k <- k - 1 + } + + if (length(x$names) > 0) cat("-------------------------\n\n") + + if (x$model %in% c("Doubly-Robust", "Inverse probability weighted")) { + cat("Weights:\n") + print(x$weights) + + cat("-------------------------\n\n") + + cat("Residuals:\n") + print(summary(x$residuals$selection)) + + # cat("\nAIC:") + # print(x$aic) + # cat("BIC:") + # print(x$bic) + cat("\nAIC: ", x$aic[[1]], "\nBIC: ", x$bic[[1]], sep = "") + + cat("\nLog-Likelihood:", x$likelihood, "on", x$df_residual, "Degrees of freedom\n") + + # cat("-------------------------\n\n") + } + + + # cat("\nRegression diagnostics:") #TODO + + invisible(x) +} diff --git a/R/probitModel.R b/R/probitModel.R index 4be1148..67eb722 100644 --- a/R/probitModel.R +++ b/R/probitModel.R @@ -277,8 +277,12 @@ probit_model_nonprobsvy <- function(...) { } - b_vec_ipw <- function(y, mu, ps, psd, eta, X, hess, pop_size, weights) { - hess_inv_neg <- solve(-hess) + b_vec_ipw <- function(y, mu, ps, psd, eta, X, hess, pop_size, weights, verbose) { + hess_inv_neg <- try(solve(-hess), silent = TRUE) + if(inherits(hess_inv_neg, "try-error")){ + if(verbose) message("solve() failed, using ginv() instead.") + hess_inv_neg <- MASS::ginv(-hess) + } if (is.null(pop_size)) { b <- -(psd / ps^2 * weights * (y - mu)) %*% X %*% hess_inv_neg # TODO opposite sign here (?) } else { @@ -287,8 +291,12 @@ probit_model_nonprobsvy <- function(...) { list(b = b) } - b_vec_dr <- function(ps, psd, eta, y, y_pred, mu, h_n, X, hess, weights) { - hess_inv <- solve(hess) + b_vec_dr <- function(ps, psd, eta, y, y_pred, mu, h_n, X, hess, weights, verbose) { + hess_inv <- try(solve(hess), silent = TRUE) + if(inherits(hess_inv, "try-error")){ + if(verbose) message("solve() failed, using ginv() instead.") + hess_inv <- MASS::ginv(hess) + } -(psd / ps^2 * weights * (y - y_pred - h_n)) %*% X %*% hess_inv } diff --git a/R/methods.R b/R/simple_methods.R similarity index 60% rename from R/methods.R rename to R/simple_methods.R index 70aa44f..18af7cd 100644 --- a/R/methods.R +++ b/R/simple_methods.R @@ -1,256 +1,5 @@ -#' @title Summary statistics for model of nonprobsvy class. -#' -#' @param object object of nonprobsvy class -#' @param test Type of test for significance of parameters \code{"t"} for t-test -#' and \code{"z"} for normal approximation of students t distribution, by -#' default \code{"z"} is used if there are more than 30 degrees of freedom -#' and \code{"t"} is used in other cases. -#' @param correlation correlation Logical value indicating whether correlation matrix should -#' be computed from covariance matrix by default \code{FALSE}. -#' @param cov Covariance matrix corresponding to regression parameters -#' @param ... Additional optional arguments -#' -#' -#' @method summary nonprobsvy -#' @importFrom stats pt -#' @importFrom stats coef -#' @importFrom stats sd -#' @exportS3Method -summary.nonprobsvy <- function(object, - test = c("t", "z"), - correlation = FALSE, - # regression_confint = FALSE, confint Logical value indicating whether confidence intervals for - # regression parameters should be constructed TODO - cov = NULL, # in case of adding sandwich methods - ...) { - model_specific_info <- specific_summary_info( - object, - correlation = correlation, - ... 
- ) - df_residual <- model_specific_info$df_residual - if (!is.null(df_residual)) { - if (missing(test)) { - if (df_residual > 30) test <- "z" else test <- "t" - } - } else { - test <- "z" # TODO, for now just z-test in case of mi estimation - } - - cf <- list() - se <- list() - wald_test_stat <- list() - p_values <- list() - crr <- list() - confidence_interval_coef <- list() - - for (k in model_specific_info) { - if (attr(k, "glm")) { - number <- length(se) + 1 - cf[[number]] <- k[, 1] - se[[number]] <- k[, 2] - wald_test_stat[[number]] <- k[, 1] / k[, 2] - - p_values[[number]] <- switch(test, - "t" = 2 * stats::pt(q = -abs(k[, 1] / k[, 2]), df = df_residual), - "z" = 2 * stats::pnorm(q = abs(k[, 1] / k[, 2]), lower.tail = FALSE) - ) - - temp_correlation <- if (isFALSE(correlation)) { - NULL - } else { - cov / outer(k[, 2], k[, 2]) - } - if (isTRUE(correlation)) { - rownames(temp_correlation) <- colnames(temp_correlation) <- names(rownames(k)) - } - - crr[[number]] <- temp_correlation - - # confidence_interval_coef <- append(confidence_interval_coef, - # if(isTRUE(confint)) {confint(object, ...)} else {NULL}) - } else { - # TODO - } - } - if (!is.null(object$SE)) { - se_mean <- c(object$output[, 2], object$SE$prob, object$SE$nonprob) - } else { - se_mean <- NULL - } - res <- structure( - list( - call = object$call, - pop_total = list( - mean = object$output$mean, - se = se_mean, - cnf_int = object$confidence_interval - ), - sample_size = nobs(object, ...), - population_size = pop.size(object, ...), - test = test, - control = object$control, - model = switch(class(object)[2], - "nonprobsvy_dr" = "Doubly-Robust", - "nonprobsvy_ipw" = "Inverse probability weighted", - "nonprobsvy_mi" = "Mass Imputation" - ), - aic = ifelse(class(object)[2] %in% c("nonprobsvy_dr", "nonprobsvy_ipw"), AIC(object), "no value for the selected method"), - bic = ifelse(class(object)[2] %in% c("nonprobsvy_dr", "nonprobsvy_ipw"), BIC(object), "no value for the selected method"), - residuals = residuals.nonprobsvy(object, type = "response"), - likelihood = ifelse(class(object)[2] %in% c("nonprobsvy_dr", "nonprobsvy_ipw"), object$selection$log_likelihood, "no value for the selected method"), - df_residual = ifelse(class(object)[2] %in% c("nonprobsvy_dr", "nonprobsvy_ipw"), object$selection$df_residual, "no value for the selected method"), - weights = summary(object$weights), - coef = cf, - std_err = se, - w_val = wald_test_stat, - p_values = p_values, - crr = crr, - confidence_interval_coef = confidence_interval_coef, - names = attr(model_specific_info, "model") - ), - class = c("summary_nonprobsvy") - ) - res -} - # no need for documenting simple functions -#' @method print nonprobsvy -#' @exportS3Method -print.nonprobsvy <- function(x, digits = 8, ...) { - if (!is.null(x$call)) { - cat("\nCall:\n", paste(deparse(x$call), sep = "\n", collapse = "\n"), - "\n\n", - sep = "" - ) - } - cat( - "Estimated population mean with overall std.err and confidence interval:\n\n" - ) - print(cbind(mean = x$output$mean, SE = x$output$SE, x$confidence_interval)) - invisible(x) -} -#' @method print summary_nonprobsvy -#' @importFrom stats printCoefmat -#' @exportS3Method -print.summary_nonprobsvy <- function(x, - signif.stars = getOption("show.signif.stars"), - digits = max(3L, getOption("digits") - 3L), - ...) 
{ # TODO regression diagnostics divided into outcome and selection models - if (!is.null(x$call)) { - cat("\nCall:\n", paste(deparse(x$call), sep = "\n", collapse = "\n"), - "\n\n", - sep = "" - ) - } - # TODO add printing the Info only for DR and MI models - if (length(x$pop_total$mean) > 1) { - cat("Info:\n", "The summary contains information mainly on the first outcome variable.\n", - "More details on the estimation of this variable and others can be found in the outcome list of nonprob object.", - "\n\n", - sep = "" - ) - } - - # cat("Residuals:\n") - # print(summary(c(x$residuals[, 1]))) - - cat("-------------------------\n") - - # cat( - # sep = "", - # "Estimated population mean: ", format(x$pop_total$mean, digits = digits), - # " with overall std.err of: ", format(x$pop_total$se[1], digits = digits), - # "\nAnd std.err for nonprobability and probability samples being respectively:\n", - # ifelse(!is.null(x$pop_total$cnf_int), format(x$pop_total$se[3], digits = digits), ""), " and ", ifelse(!is.null(x$pop_total$cnf_int), format(x$pop_total$se[2], digits = digits), ""), # TODO for se = FALSE - # "\n\nBased on: ", x$model, " method", - # "\n\n",(1 - x$control$control_inference$alpha)*100, "% Confidence inverval for popualtion mean:\n" - # ) - # print(x$pop_total$cnf_int) - - cat( - sep = "", - "Estimated population mean: ", format(x$pop_total$mean[1], digits = digits) - ) - if (!is.null(x$pop_total$cnf_int)) { - cat( - sep = "", - " with overall std.err of: ", format(x$pop_total$se[1], digits = digits), - "\nAnd std.err for nonprobability and probability samples being respectively:\n", - format(x$pop_total$se[3], digits = digits), " and ", format(x$pop_total$se[2], digits = digits), # TODO for se = FALSE - "\n\n", (1 - x$control$control_inference$alpha) * 100, "% Confidence inverval for popualtion mean:\n" - ) - print(x$pop_total$cnf_int) - } - - cat(sep = "", "\n\nBased on: ", x$model, " method") - - - cat( - sep = "", - "\nFor a population of estimate size: ", x$population_size, - "\nObtained on a nonprobability sample of size: ", x$sample_size[2], - "\nWith an auxiliary probability sample of size: ", x$sample_size[1], # TODO - "\n" - ) - - cat("-------------------------\n\n") - - k <- length(x$names) - if (k > 0) cat("Regression coefficients:") - while (k > 0) { - cat("\n-----------------------\nFor ", x$names[k], ":\n", sep = "") - printCoefmat( - matrix( # TODO:: add conf intervals - data = c(x$coef[[k]], x$std_err[[k]], x$w_val[[k]], x$p_values[[k]]), # TODO named coefs for selection model - ncol = 4, - dimnames = list( - names(x$coef[[k]]), - switch(x$test, - "t" = c("Estimate", "Std. Error", "t value", "P(>|t|)"), - "z" = c("Estimate", "Std. Error", "z value", "P(>|z|)") - ) - ) - ), - digits = digits, - signif.stars = signif.stars, - signif.legend = if (k == length(x$names)) signif.stars else FALSE, - P.values = TRUE, has.Pvalue = TRUE, - na.print = "NA", - ... 
- ) - k <- k - 1 - } - - if (length(x$names) > 0) cat("-------------------------\n\n") - - if (x$model %in% c("Doubly-Robust", "Inverse probability weighted")) { - cat("Weights:\n") - print(x$weights) - - cat("-------------------------\n\n") - - cat("Residuals:\n") - print(summary(x$residuals$selection)) - - # cat("\nAIC:") - # print(x$aic) - # cat("BIC:") - # print(x$bic) - cat("\nAIC: ", x$aic[[1]], "\nBIC: ", x$bic[[1]], sep = "") - - cat("\nLog-Likelihood:", x$likelihood, "on", x$df_residual, "Degrees of freedom\n") - - # cat("-------------------------\n\n") - } - - - # cat("\nRegression diagnostics:") #TODO - - invisible(x) -} - #' @method nobs nonprobsvy #' @importFrom stats nobs #' @exportS3Method @@ -320,6 +69,7 @@ residuals.nonprobsvy <- function(object, s <- rep(1, object$nonprob_size) } r <- object$selection$residuals + res_sel <- switch(type, "response" = r, "working" = r / propensity_scores * (1 - propensity_scores), diff --git a/R/summary.R b/R/summary.R new file mode 100644 index 0000000..e5f6753 --- /dev/null +++ b/R/summary.R @@ -0,0 +1,219 @@ +#' @title Summary statistics for model of nonprobsvy class. +#' +#' @param object object of nonprobsvy class +#' @param test Type of test for significance of parameters \code{"t"} for t-test +#' and \code{"z"} for normal approximation of Student's t distribution, by +#' default \code{"z"} is used if there are more than 30 degrees of freedom +#' and \code{"t"} is used in other cases. +#' @param correlation Logical value indicating whether the correlation matrix should +#' be computed from the covariance matrix, by default \code{FALSE}. +#' @param cov Covariance matrix corresponding to regression parameters. +#' @param ... Additional optional arguments +#' +#' +#' @method summary nonprobsvy +#' @importFrom stats pt +#' @importFrom stats coef +#' @importFrom stats sd +#' @exportS3Method +summary.nonprobsvy <- function(object, + test = c("t", "z"), + correlation = FALSE, + # regression_confint = FALSE, confint Logical value indicating whether confidence intervals for + # regression parameters should be constructed TODO + cov = NULL, # in case of adding sandwich methods + ...) { + model_specific_info <- specific_summary_info( + object, + correlation = correlation, + ...
+ ) + df_residual <- model_specific_info$df_residual + if (!is.null(df_residual)) { + if (missing(test)) { + if (df_residual > 30) test <- "z" else test <- "t" + } + } else { + test <- "z" # TODO, for now just z-test in case of mi estimation + } + + cf <- list() + se <- list() + wald_test_stat <- list() + p_values <- list() + crr <- list() + confidence_interval_coef <- list() + + for (k in model_specific_info) { + if (attr(k, "glm")) { + number <- length(se) + 1 + cf[[number]] <- k[, 1] + se[[number]] <- k[, 2] + wald_test_stat[[number]] <- k[, 1] / k[, 2] + + p_values[[number]] <- switch(test, + "t" = 2 * stats::pt(q = -abs(k[, 1] / k[, 2]), df = df_residual), + "z" = 2 * stats::pnorm(q = abs(k[, 1] / k[, 2]), lower.tail = FALSE) + ) + + temp_correlation <- if (isFALSE(correlation)) { + NULL + } else { + cov / outer(k[, 2], k[, 2]) + } + if (isTRUE(correlation)) { + rownames(temp_correlation) <- colnames(temp_correlation) <- names(rownames(k)) + } + + crr[[number]] <- temp_correlation + + # confidence_interval_coef <- append(confidence_interval_coef, + # if(isTRUE(confint)) {confint(object, ...)} else {NULL}) + } else { + # TODO + } + } + if (!is.null(object$SE)) { + se_mean <- c(object$output[, 2], object$SE$prob, object$SE$nonprob) + } else { + se_mean <- NULL + } + res <- structure( + list( + call = object$call, + pop_total = list( + mean = object$output$mean, + se = se_mean, + cnf_int = object$confidence_interval + ), + sample_size = nobs(object, ...), + population_size = pop.size(object, ...), + test = test, + control = object$control, + model = switch(class(object)[2], + "nonprobsvy_dr" = "Doubly-Robust", + "nonprobsvy_ipw" = "Inverse probability weighted", + "nonprobsvy_mi" = "Mass Imputation" + ), + aic = ifelse(class(object)[2] %in% c("nonprobsvy_dr", "nonprobsvy_ipw"), AIC(object), "no value for the selected method"), + bic = ifelse(class(object)[2] %in% c("nonprobsvy_dr", "nonprobsvy_ipw"), BIC(object), "no value for the selected method"), + residuals = residuals.nonprobsvy(object, type = "response"), + likelihood = ifelse(class(object)[2] %in% c("nonprobsvy_dr", "nonprobsvy_ipw"), object$selection$log_likelihood, "no value for the selected method"), + df_residual = ifelse(class(object)[2] %in% c("nonprobsvy_dr", "nonprobsvy_ipw"), object$selection$df_residual, "no value for the selected method"), + weights = summary(object$weights), + coef = cf, + std_err = se, + w_val = wald_test_stat, + p_values = p_values, + crr = crr, + confidence_interval_coef = confidence_interval_coef, + names = attr(model_specific_info, "model") + ), + class = c("summary_nonprobsvy") + ) + res +} + + +# summary helper functions +# for now just a rough sketch +specific_summary_info <- function(object, ...) { + UseMethod("specific_summary_info") +} + +specific_summary_info.nonprobsvy_ipw <- function(object, + ...) { + coeffs_sel <- matrix(c(object$selection$coefficients, object$selection$std_err), + ncol = 2, + dimnames = list( + names(object$selection$coefficients), + c("Estimate", "Std. Error") + ) + ) + res <- list( + coeffs_sel = coeffs_sel, + weights = object$weights, + df_residual = object$selection$df_residual + ) + + attr(res$coeffs_sel, "glm") <- TRUE + attr(res$weights, "glm") <- FALSE + attr(res$df_residual, "glm") <- FALSE # TODO + attr(res, "model") <- c("glm regression on selection variable") + res +} + +specific_summary_info.nonprobsvy_mi <- function(object, + ...) 
{ + if (object$outcome[[1]]$method == "glm") { # TODO for pmm + coeffs_out <- matrix(c(object$outcome[[1]]$coefficients, object$outcome[[1]]$std_err), + ncol = 2, + dimnames = list( + names(object$outcome[[1]]$coefficients), + c("Estimate", "Std. Error") + ) + ) + } else { + coeffs_out <- "no coefficients" + } + + res <- list( + coeffs_out = coeffs_out + ) + if (object$outcome[[1]]$method == "glm") { + attr(res$coeffs_out, "glm") <- TRUE + attr(res, "model") <- "glm regression on outcome variable" + } else if (object$outcome[[1]]$method == "nn") { + attr(res$coeffs_out, "glm") <- FALSE + } else if (object$outcome[[1]]$method == "pmm") { # TODO + attr(res$coeffs_out, "glm") <- FALSE + # attr(res, "model") <- "glm regression on outcome variable" + } + res +} + +specific_summary_info.nonprobsvy_dr <- function(object, + ...) { + coeffs_sel <- matrix(c(object$selection$coefficients, object$selection$std_err), + ncol = 2, + dimnames = list( + names(object$selection$coefficients), + c("Estimate", "Std. Error") + ) + ) + + + if (object$outcome[[1]]$method == "glm") { + coeffs_out <- matrix(c(object$outcome[[1]]$coefficients, object$outcome[[1]]$std_err), + ncol = 2, + dimnames = list( + names(object$outcome[[1]]$coefficients), + c("Estimate", "Std. Error") + ) + ) + } else { + coeffs_out <- "no coefficients" + } + + res <- list( + coeffs_sel = coeffs_sel, + coeffs_out = coeffs_out, + weights = object$weights, + df_residual = object$selection$df_residual + ) + attr(res$coeffs_sel, "glm") <- TRUE + if (object$outcome[[1]]$method == "glm") { + attr(res$coeffs_out, "glm") <- TRUE + attr(res, "model") <- c( + "glm regression on selection variable", + "glm regression on outcome variable" + ) + } else if (object$outcome[[1]]$method == "nn") { + attr(res$coeffs_out, "glm") <- FALSE + attr(res, "model") <- c("glm regression on selection variable") + } + attr(res$weights, "glm") <- FALSE + attr(res$df_residual, "glm") <- FALSE + + res +} diff --git a/R/theta_funcs.R b/R/theta_funcs.R new file mode 100644 index 0000000..83d7a69 --- /dev/null +++ b/R/theta_funcs.R @@ -0,0 +1,312 @@ +# score equation for theta, used in variable selection +u_theta <- function(R, + X, + weights, + method_selection, + h, + N = NULL, + pop_totals = NULL, + pop_size = NULL) { + method_selection <- paste(method_selection, "_model_nonprobsvy", sep = "") + method <- get_method(method_selection) + inv_link <- method$make_link_inv + function(par) { + #loc_nons = which(R == 1) + #loc_rand = which(R == 0) + theta <- as.matrix(par) + n <- length(R) + X0 <- as.matrix(X) + eta_pi <- X0 %*% theta + ps <- inv_link(eta_pi) + R_rand <- 1 - R + ps <- as.vector(ps) + N_nons <- sum(1 / ps) + weights_sum <- sum(weights) + + # "1" = t(X0[loc_nons,]) %*% (1/ps[loc_nons]) - t(X0[loc_rand,]) %*% weights[loc_rand], + # "2" = c(apply(X0 * R * weights - X0 * R_rand * ps * weights, 2, sum)) + if (is.null(pop_totals)) { + eq <- switch(h, + "1" = c(apply(X0 * R / ps * weights - X0 * R_rand * weights, 2, sum)), # consider division by N_nons + "2" = c(apply(X0 * R * weights - X0 * R_rand * ps * weights, 2, sum)) + ) + } else { + eq <- c(apply(X0 * R / ps * weights, 2, sum)) - pop_totals + } + eq + } +} + + +# derivative of score equation for theta, used in variable selection +u_theta_der <- function(R, + X, + weights, + method_selection, + h, + N = NULL, + pop_totals = NULL) { + method_selection <- paste(method_selection, "_model_nonprobsvy", sep = "") + method <- get_method(method_selection) + inv_link <- method$make_link_inv + dinv_link <- 
method$make_link_inv_der + inv_link_rev <- method$make_link_inv_rev + + function(par) { + #loc_nons = which(R == 1) + #loc_rand = which(R == 0) + theta <- as.matrix(par) + X0 <- as.matrix(X) + p <- ncol(X0) + eta <- as.numeric(X0 %*% theta) + ps <- inv_link(eta) + ps <- as.vector(ps) + N_nons <- sum(1 / ps) + R_rand <- 1 - R + weights_sum <- sum(weights) + + # "1" = t(X0[loc_nons, ]) %*% weights[loc_nons] %*% t(inv_link_rev(eta)[loc_nons]) %*% X0[loc_nons, ], + # "2" = + if (!is.null(pop_totals)) { + mxDer <- t(R * X0 * weights * inv_link_rev(eta)) %*% X0 + } else { + mxDer <- switch(h, + "1" = t(R * X0 * weights * inv_link_rev(eta)) %*% X0, # TODO: bug here when solving for some data - probably because of inv_link_rev + "2" = - t(R_rand * X0 * weights * dinv_link(eta)) %*% X0 + ) + } + as.matrix(mxDer, nrow = p) # consider division by N_nons + } +} + + +theta_h_estimation <- function(R, + X, + weights_rand, + weights, + h, + method_selection, + maxit, + start = NULL, + pop_totals = NULL, + pop_means = NULL) { # TODO with BERENZ recommendation + + p <- ncol(X) + # if (is.null(pop_totals) & is.null(pop_means)) { + # if (is.null(start)) { + # start0 <- start_fit(X = X, # <--- does not work with pop_totals + # R = R, + # weights = weights, + # weights_rand = weights_rand, + # method_selection = method_selection) + # } else { + # start0 <- start + # } + # } else { # TODO customize start point for fitting with population totals + # # start0 <- rep(.8, ncol(X)) + # # X_pop <- rbind(X, pop_totals) + # # weights_randd <- 1 + # if (is.null(start)) { + # start0 <- start_fit(X = X, # <--- does not work with pop_totals + # R = R, + # weights = weights, + # weights_rand = weights_rand, + # method_selection = method_selection) + # } else { + # start0 <- start + # } + # } + u_theta <- u_theta( + R = R, + X = X, + weights = c(weights_rand, weights), + h = h, + method_selection = method_selection, + pop_totals = pop_totals + ) + + u_theta_der <- u_theta_der( + R = R, + X = X, + weights = c(weights_rand, weights), + h = h, + method_selection = method_selection, + pop_totals = pop_totals + ) + #print(start) + # ######### BB + # if (method_selection == "cloglog") { + # root <- BB::dfsane( + # par = start, + # fn = u_theta, + # ) + # } else { + # root <- BB::dfsane( + # par = start, + # fn = u_theta + # # control = list(sigma = 0.1, trace = 1) + # ) + # } + # theta_root <- root$par + # print(theta_root) ######### NLESQLV + if (method_selection == "cloglog") { + root <- nleqslv::nleqslv( + x = start, + fn = u_theta, + method = "Newton", # TODO consider the methods + global = "dbldog", # or "qline" + xscalm = "auto", + jacobian = TRUE + ) + } else { + root <- nleqslv::nleqslv( + x = start, + fn = u_theta, + method = "Newton", # TODO consider the methods + global = "dbldog", # or "qline" + xscalm = "auto", + jacobian = TRUE, + jac = u_theta_der + # control = list(sigma = 0.1, trace = 1) + ) + } + theta_root <- root$x + # stop("123") + if (root$termcd %in% c(2:7, -10)) { + switch(as.character(root$termcd), + "2" = warning("Relatively convergent algorithm when fitting selection model by nleqslv, but user must check if function values are acceptably small."), + "3" = warning("Algorithm did not find a suitable point - it has stalled and cannot find an acceptable new point when fitting selection model by nleqslv."), + "4" = warning("Iteration limit exceeded when fitting selection model by nleqslv."), + "5" = warning("Ill-conditioned Jacobian when fitting selection model by nleqslv."), + "6" = warning("Jacobian is singular when 
fitting selection model by nleqslv."), + "7" = warning("Jacobian is unusable when fitting selection model by nleqslv."), + "-10" = warning("user specified Jacobian is incorrect when fitting selection model by nleqslv.") + ) + } + theta_h <- as.vector(theta_root) + grad <- u_theta(theta_h) + if (method_selection == "cloglog") { + hess <- root$jac + } else { + hess <- u_theta_der(theta_h) # TODO compare with root$jac + } + + list( + theta_h = theta_h, + hess = hess, + grad = grad + ) +} + + + +# joint score equation for theta and beta, used in estimation when variable selections +u_theta_beta_dr <- function(par, + R, + X, + y, + weights, + method_selection, + family_nonprobsvy) { + method_selection <- paste(method_selection, "_model_nonprobsvy", sep = "") + method <- get_method(method_selection) + + inv_link <- method$make_link_inv + inv_link_rev <- method$make_link_inv_rev + + p <- ncol(X) + theta <- par[1:(p)] + beta <- par[(p + 1):(2 * p)] + eta_pi <- X %*% theta + ps <- inv_link(eta_pi) + y[which(is.na(y))] <- 0 + ps <- as.vector(ps) + + eta <- X %*% beta + mu <- family_nonprobsvy$linkinv(eta) + mu_der <- as.vector(family_nonprobsvy$mu.eta(eta)) + res <- family_nonprobsvy$residuals(mu = mu, y = y) + mu_der <- 1 + + n <- length(R) + R_rand <- 1 - R + + utb <- c( + apply(X * R / ps * mu_der * weights - X * R_rand * weights * mu_der, 2, sum), + apply(X * R * weights * as.vector(-inv_link_rev(eta_pi)) * res, 2, sum) + ) / n + + utb +} + + +u_theta_ipw <- function(par, + R, + X, + y, + weights, + method_selection) { # TODO + + method_selection <- paste(method_selection, "_model_nonprobsvy", sep = "") + method <- get_method(method_selection) + inv_link_rev <- method$make_link_inv_rev + inv_link <- method$make_link_inv + + p <- ncol(X) + theta <- par + eta_pi <- X %*% theta + y[which(is.na(y))] <- 0 + + R_rand <- 1 - R + loc_nons <- which(R == 1) + loc_rand <- which(R == 0) + n <- length(R) + y_mean <- mean(y[loc_nons]) + + # UTB <- apply(X0 * (R * as.vector(inv_link(eta_pi)) - y), 2, sum)/n # TODO + UTB <- apply(X * (R / as.vector(inv_link(eta_pi)) * y - mean(y)) * as.vector(inv_link_rev(eta_pi)), 2, sum) # TODO + + UTB +} + +# TODO Jacobian of the estimating equations for dr method +u_theta_beta_dr_jacob <- function(par, + R, + X, + y, + weights, + method_selection, + family_nonprobsvy) { + method_selection <- paste(method_selection, "_model_nonprobsvy", sep = "") + method <- get_method(method_selection) + + inv_link <- method$make_link_inv + inv_link_rev <- method$make_link_inv_rev + dinv_link_rev <- method$make_link_inv_rev_de + + p <- ncol(X) + theta <- par[1:(p + 1)] + beta <- par[(p + 2):(2 * p + 2)] + X0 <- cbind(1, X) + eta_pi <- X0 %*% theta + ps <- inv_link(eta_pi) + y[which(is.na(y))] <- 0 + ps <- as.vector(ps) + + eta <- X0 %*% beta + mu <- family_nonprobsvy$mu(eta) + mu_der <- family_nonprobsvy$mu_der(mu) + mu_der2 <- family_nonprobsvy$mu_der2(mu) + res <- family_nonprobsvy$residuals(mu = mu, y = y) + n <- length(R) + R_rand <- 1 - R + + jac <- c( + apply(-X0 * R * weights * as.vector(inv_link_rev(eta_pi)) * mu_der, 2, sum), + apply(X0 * R / ps * mu_der2 * weights - X0 * R_rand * weights * mu_der2, 2, sum), + apply(X0 * R * weights * as.vector(dinv_link_rev(eta_pi)) * res * X0, 2, sum), + apply(X0 * R * weights * as.vector(inv_link_rev(eta_pi)) * mu_der, 2, sum) + ) / n + jac +} diff --git a/R/varianceDR.R b/R/varianceDR.R new file mode 100644 index 0000000..4c3c43b --- /dev/null +++ b/R/varianceDR.R @@ -0,0 +1,104 @@ +# Variance for doubly robust estimator +# TODO add nn and pmm 
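# [editor's sketch, not part of the patch] internal_varDR() below estimates the
# analytic doubly robust variance as the sum of two components,
#   var(mu_DR) ~ var_nonprob + var_prob,
# assuming the argument names used in this file. The probability component on
# its own can be illustrated with the same survey calls the function uses:
#   svydesign <- stats::update(svydesign, y_rand = y_rand_pred)
#   var_prob <- as.vector(attr(survey::svymean(~y_rand, svydesign), "var"))
# The non-probability component is assembled from the link-specific b_vec_dr()
# and make_var_nonprob() helpers applied to the estimated propensity scores.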
+internal_varDR <- function(OutcomeModel, + SelectionModel, + y_nons_pred, + weights, + weights_rand, + method_selection, + control_selection, + theta, + ps_nons, + hess, + ps_nons_der, + est_ps_rand, + y_rand_pred, + N_nons, + est_ps_rand_der, + svydesign, + est_method, + h, + pop_totals, + sigma, + bias_correction, + verbose) { + ######### mm + if (bias_correction == TRUE) { + infl1 <- (weights * (OutcomeModel$y_nons - y_nons_pred))^2 / ps_nons^2 + infl2 <- (weights * (OutcomeModel$y_nons - y_nons_pred))^2 / ps_nons + + # Variance estimators #### + svydesign <- stats::update(svydesign, + y_rand = y_rand_pred + ) + svydesign_mean <- survey::svymean(~y_rand, svydesign) + + var_prob <- as.vector(attr(svydesign_mean, "var")) # based on survey package, probability component + var_nonprob <- (sum((infl1) - 2 * infl2) + sum(weights_rand * sigma)) / N_nons^2 # TODO potential bug here nonprobability component + } else { + eta <- as.vector(SelectionModel$X_nons %*% as.matrix(theta)) + h_n <- 1 / N_nons * sum(OutcomeModel$y_nons - y_nons_pred) # TODO add weights # errors mean + method_selection <- paste(method_selection, "_model_nonprobsvy", sep = "") + method <- get_method(method_selection) + est_method <- get_method(est_method) + # psd <- method$make_link_inv_der(eta) + + b <- method$b_vec_dr( + X = SelectionModel$X_nons, + ps = ps_nons, + psd = ps_nons_der, + y = OutcomeModel$y_nons, + hess = hess, + eta = eta, + h_n = h_n, + y_pred = y_nons_pred, + weights = weights, + verbose = verbose + ) + + # asymptotic variance by each propensity score method (nonprobability component) + var_nonprob <- est_method$make_var_nonprob( + ps = ps_nons, + psd = ps_nons_der, + y = OutcomeModel$y_nons, + y_pred = y_nons_pred, + h_n = h_n, + X = SelectionModel$X_nons, + b = b, + N = N_nons, + h = h, + method_selection = method_selection, + weights = weights, + pop_totals = pop_totals + ) + + + if (is.null(pop_totals)) { + t <- est_method$make_t( + X = SelectionModel$X_rand, + ps = est_ps_rand, + psd = est_ps_rand_der, + b = b, + h = h, + y_rand = y_rand_pred, + y_nons = y_nons_pred, + N = N_nons, + method_selection = method_selection, + weights = weights + ) + # design based variance estimation based on approximations of the second-order inclusion probabilities + svydesign <- stats::update(svydesign, + t = t + ) + svydesign_mean <- survey::svymean(~t, svydesign) # perhaps using survey package to compute prob variance + var_prob <- as.vector(attr(svydesign_mean, "var")) + } else { + var_prob <- 0 + } + } + + list( + var_prob = var_prob, + var_nonprob = var_nonprob + ) +} diff --git a/R/varianceIPW.R b/R/varianceIPW.R new file mode 100644 index 0000000..47058ca --- /dev/null +++ b/R/varianceIPW.R @@ -0,0 +1,90 @@ +# Variance for inverse probability weighted estimator +internal_varIPW <- function(svydesign, + X_nons, + X_rand, + y_nons, + weights, + ps_nons, + mu_hat, + hess, + ps_nons_der, + N, + est_ps_rand, + ps_rand, + est_ps_rand_der, + n_rand, + pop_size, + pop_totals, + method_selection, + est_method, + theta, + h, + verbose, + var_cov1 = var_cov1, + var_cov2 = var_cov2) { + eta <- as.vector(X_nons %*% as.matrix(theta)) + method_selection <- paste(method_selection, "_model_nonprobsvy", sep = "") + method <- get_method(method_selection) + b_obj <- method$b_vec_ipw( + X = X_nons, + ps = ps_nons, + psd = ps_nons_der, + y = y_nons, + mu = mu_hat, + hess = hess, + eta = eta, + pop_size = pop_size, + weights = weights, + verbose = verbose + ) + b <- b_obj$b + + # sparse matrix + b_vec <- cbind(-1, b) + H_mx <- 
try(cbind(0, N * solve(hess)), silent = TRUE) + if(inherits(H_mx, "try-error")){ + if(verbose) message("solve() failed, using ginv() instead.") + H_mx <- cbind(0, N * MASS::ginv(hess)) + } + sparse_mx <- Matrix::Matrix(rbind(b_vec, H_mx), sparse = TRUE) + + V1 <- var_cov1( + X = X_nons, + y = y_nons, + mu = mu_hat, + ps = ps_nons, + psd = ps_nons_der, + pop_size = pop_size, + est_method = est_method, + h = h, + weights = weights, + pop_totals = pop_totals + ) # fixed + V2 <- var_cov2( + X = X_rand, + svydesign = svydesign, + eps = est_ps_rand, + est_method = est_method, + h = h, + pop_totals = pop_totals, + psd = est_ps_rand_der + ) + + + # variance-covariance matrix for set of parameters (mu_hat and theta_hat) + V_mx_nonprob <- sparse_mx %*% V1 %*% t(as.matrix(sparse_mx)) # nonprobability component + V_mx_prob <- sparse_mx %*% V2 %*% t(as.matrix(sparse_mx)) # probability component + V_mx <- V_mx_nonprob + V_mx_prob + + var_nonprob <- as.vector(V_mx_nonprob[1, 1]) + var_prob <- as.vector(V_mx_prob[1, 1]) + var <- as.vector(V_mx[1, 1]) + # vector of variances for theta_hat + # theta_hat_var <- diag(as.matrix(V_mx[2:ncol(V_mx), 2:ncol(V_mx)])) + + list( + var_nonprob = var_nonprob, + var_prob = var_prob, + var = var + ) +} diff --git a/R/varianceMI.R b/R/varianceMI.R new file mode 100644 index 0000000..6c77838 --- /dev/null +++ b/R/varianceMI.R @@ -0,0 +1,123 @@ +# Variance for mass imputation estimator +#' @importFrom stats loess +#' @importFrom stats predict +#' @importFrom stats loess.control +#' @importFrom survey svymean +internal_varMI <- function(svydesign, + X_nons, + X_rand, + y, + y_pred, + y_hat, + weights_rand, + method, + n_rand, + n_nons, + N, + family, + model_obj, + pop_totals, + k, + predictive_match, + pmm_exact_se, + pmm_reg_engine, + pi_ij + ) { + parameters <- model_obj$parameters + + if (is.character(family)) { + family_nonprobsvy <- paste(family, "_nonprobsvy", sep = "") + family_nonprobsvy <- get(family_nonprobsvy, mode = "function", envir = parent.frame()) + family_nonprobsvy <- family_nonprobsvy() + } + + if (is.null(pop_totals)) { + svydesign_mean <- survey::svymean(~y_hat_MI, svydesign) + var_prob <- as.vector(attr(svydesign_mean, "var")) # probability component, should be bigger for nn + if (method == "nn") { + sigma_hat <- mean((y - y_pred)^2) # family_nonprobsvy$variance(mu = y_pred, y = y) + est_ps <- n_nons / N + var_nonprob <- n_rand / N^2 * (1 - est_ps) / est_ps * sigma_hat + } else if (method == "glm") { # TODO add variance for count binary outcome variable control_outcome$method + + beta <- parameters[, 1] + eta_nons <- X_nons %*% beta + eta_rand <- X_rand %*% beta + + mx <- 1 / N * colSums(as.data.frame(X_rand) * (weights_rand * family_nonprobsvy$mu.eta(eta_rand))) + c <- solve(1 / n_nons * t(as.data.frame(X_nons) * family_nonprobsvy$mu.eta(eta_nons)) %*% X_nons) %*% mx + residuals <- family_nonprobsvy$residuals(mu = y_pred, y = y) + + # nonprobability component + var_nonprob <- 1 / n_nons^2 * t(as.matrix(residuals^2)) %*% (X_nons %*% c)^2 + var_nonprob <- as.vector(var_nonprob) + } else if (method == "pmm") { + var_prob <- as.numeric(attr(svymean(~y_hat_MI, svydesign), "var")) + + # This in general cannot be computed from the sample itself, we need to run + a bootstrap. 
Sometimes this term is negligible, hence by default it's + not computed, but it should be computed in serious publications + var_nonprob <- 0 + + # An option in controlInf controls this + # Maybe add a warning/message if this computation is omitted + if (pmm_exact_se) { + var_nonprob <- pmm_exact( + pi_ij = pi_ij, + weights_rand = weights_rand, + n_nons = n_nons, + y = y, + pmm_reg_engine = pmm_reg_engine, + model_obj = model_obj, + svydesign = svydesign, + predictive_match = predictive_match, + k = k, + N = N + ) + } + } + } else { + if (method == "nn") { + sigma_hat <- mean((y - y_pred)^2) # family_nonprobsvy$variance(mu = y_pred, y = y) + est_ps <- n_nons / N + var_nonprob <- n_nons / N^2 * (1 - est_ps) / est_ps * sigma_hat # what instead of n_rand here (?) now just n_nons + } else if (method == "glm") { + beta <- parameters[, 1] + eta_nons <- X_nons %*% beta + if (family %in% c("binomial", "poisson")) { # TODO consider this chunk of code + eta_rand <- pop_totals %*% beta / pop_totals[1] + } else { + eta_rand <- pop_totals %*% beta + } + mx <- 1 / N * pop_totals * as.vector(family_nonprobsvy$mu.eta(eta_rand)) + c <- solve(1 / n_nons * t(as.data.frame(X_nons) * family_nonprobsvy$mu.eta(eta_nons)) %*% X_nons) %*% mx + residuals <- family_nonprobsvy$residuals(mu = y_pred, y = y) + + # nonprobability component + var_nonprob <- 1 / n_nons^2 * t(as.matrix(residuals^2)) %*% (X_nons %*% c)^2 + var_nonprob <- as.vector(var_nonprob) + } else if (method == "pmm") { + # beta <- parameters[,1] + # eta_nons <- X_nons %*% beta + # + # if (family %in% c("binomial", "poisson")) { # TODO consider this chunk of code + # eta_rand <- pop_totals %*% beta / pop_totals[1] + # } else { + # eta_rand <- pop_totals %*% beta + # } + # + # residuals <- family_nonprobsvy$residuals(mu = y_pred, y = y) + + # nonprobability component + # var_nonprob <- 1/n_nons^2 * t(as.matrix(residuals^2)) %*% (family_nonprobsvy$mu_der(eta_nons) %*% t(X_nons))^2 + var_nonprob <- 0 + var_nonprob <- as.vector(var_nonprob) + } + var_prob <- 0 + } + + list( + var_prob = var_prob, + var_nonprob = var_nonprob + ) +} diff --git a/inst/tinytest/test-2-ipw-totals.R b/inst/tinytest/test-2-ipw-totals.R index 8382e50..bd4e25a 100644 --- a/inst/tinytest/test-2-ipw-totals.R +++ b/inst/tinytest/test-2-ipw-totals.R @@ -192,7 +192,7 @@ expect_identical(y_all_corr_all$confidence_interval, # These tests are only supposed to be run on developer's machine and # package GitHub page not on CRAN (they take too long) -# if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { +if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { #### variable selection ------------------------------------------------------------------ ##### one target variable ---------------------------------------------------------------- @@ -314,7 +314,7 @@ expect_identical(y_all_corr_all$confidence_interval, # verbose = T) # ) -# } +} ## non-linear case ------------------------------------------------------------------------ @@ -503,7 +503,7 @@ expect_identical(y_all_corr_all$confidence_interval, #### variable selection ------------------------------------------------------------------ ##### one target variable ---------------------------------------------------------------- -# if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { +if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { ## y_11 expect_silent( y11_corr_scad <- nonprob(selection = ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10, target = ~ Y_11, @@ -539,7 +539,7 @@ 
expect_identical(y_all_corr_all$confidence_interval, y12_corr_scad$confidence_interval$upper_bound > mean(Y_12)) ## conf int expect_true(NROW(y12_corr_scad$selection$coefficients) == 2) - ## y_21 + # y_21 expect_silent( y21_corr_scad <- nonprob(selection = ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10, target = ~ Y_21, @@ -556,7 +556,7 @@ expect_identical(y_all_corr_all$confidence_interval, y21_corr_scad$confidence_interval$upper_bound > mean(Y_21)) ## conf int expect_true(NROW(y21_corr_scad$selection$coefficients) == 2) - ## y_22 + # # y_22 expect_silent( y22_corr_scad <- nonprob(selection = ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10, target = ~ Y_22, @@ -623,7 +623,7 @@ expect_identical(y_all_corr_all$confidence_interval, # verbose = T) # ) -# } +} # check probit ---------------------------------------------------------------------------- @@ -809,7 +809,7 @@ expect_identical(y_all_corr_all$confidence_interval, row.names = c("Y_11", "Y_12", "Y_21", "Y_22"))) -# if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { +if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { #### variable selection ------------------------------------------------------------------ ##### one target variable ---------------------------------------------------------------- @@ -931,7 +931,7 @@ expect_identical(y_all_corr_all$confidence_interval, # verbose = T) # ) -# } +} ## non-linear case ------------------------------------------------------------------------ #### correctly specified variables -------------------------------------------------------- @@ -1117,11 +1117,11 @@ expect_identical(y_all_corr_all$confidence_interval, row.names = c("Y_11", "Y_12", "Y_21", "Y_22"))) -# if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { +if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { #### variable selection ------------------------------------------------------------------ ##### one target variable ---------------------------------------------------------------- - ## y_11 + # # y_11 expect_silent( y11_corr_scad <- nonprob(selection = ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10, target = ~ Y_11, @@ -1155,7 +1155,7 @@ expect_identical(y_all_corr_all$confidence_interval, y12_corr_scad$confidence_interval$upper_bound > mean(Y_12)) ## conf int expect_true(NROW(y12_corr_scad$selection$coefficients) == 2) - ## y_21 + # # y_21 expect_silent( y21_corr_scad <- nonprob(selection = ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10, target = ~ Y_21, @@ -1172,7 +1172,7 @@ expect_identical(y_all_corr_all$confidence_interval, y21_corr_scad$confidence_interval$upper_bound > mean(Y_21)) ## conf int expect_true(NROW(y21_corr_scad$selection$coefficients) == 2) - ## y_22 + # # y_22 expect_silent( y22_corr_scad <- nonprob(selection = ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10, target = ~ Y_22, @@ -1239,7 +1239,7 @@ expect_identical(y_all_corr_all$confidence_interval, # verbose = T) # ) -# } +} # check cloglog ----------------------------------------------------- ## linear case ---------------------------------------------------------------------------- @@ -1422,7 +1422,7 @@ expect_silent( # row.names = c("Y_11", "Y_12", "Y_21", "Y_22"))) # -# if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { +if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { #### variable selection 
------------------------------------------------------------------ ##### one target variable ---------------------------------------------------------------- @@ -1558,7 +1558,7 @@ expect_silent( # y22_corr_one$confidence_interval$upper_bound > mean(Y_22)) ## conf int # -# } +} # ##### all target variables --------------------------------------------------------------- # expect_silent( @@ -1679,7 +1679,7 @@ expect_silent( # y22_corr_all$confidence_interval$upper_bound), # row.names = c("Y_11", "Y_12", "Y_21", "Y_22"))) -# if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { +if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) { #### variable selection ------------------------------------------------------------------ ##### one target variable ---------------------------------------------------------------- @@ -1753,4 +1753,4 @@ expect_silent( # # y22_corr_scad$confidence_interval$upper_bound > mean(Y_22)) ## conf int # expect_true(NROW(y22_corr_scad$selection$coefficients) == 2) -# } +} diff --git a/inst/tinytest/test_nonprobsvy.R b/inst/tinytest/test_nonprobsvy.R index ef204a8..700901b 100644 --- a/inst/tinytest/test_nonprobsvy.R +++ b/inst/tinytest/test_nonprobsvy.R @@ -226,7 +226,27 @@ expect_true( (test3ann$confidence_interval[1] < 5.072862) & (5.072862 < test3ann$confidence_interval[2]) ) +# MI - pmm #### +# test3ann <- nonprob(outcome = y1 ~ x, +# data = source_nonprob_p, +# svydesign = svy_a, +# method_outcome = "nn") +expect_silent( + test3apmm <- nonprob(outcome = y1 ~ x, + data = source_nonprob_p, + svydesign = svy_a, + method_outcome = "pmm") +) +expect_equivalent( + test3apmm$output$mean, + 5.086964, + tolerance = .01 +) +expect_true( + (test3apmm$confidence_interval[1] < 5.086964) & + (5.086964 < test3apmm$confidence_interval[2]) +) ## bootstrap # These tests are only supposed to be run on developer's machine and # package GitHub page not on CRAN (they take too long) diff --git a/man/confint.nonprobsvy.Rd b/man/confint.nonprobsvy.Rd index 3c4c530..a68a430 100644 --- a/man/confint.nonprobsvy.Rd +++ b/man/confint.nonprobsvy.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods.R +% Please edit documentation in R/simple_methods.R \name{confint.nonprobsvy} \alias{confint.nonprobsvy} \title{Confidence Intervals for Model Parameters} diff --git a/man/controlInf.Rd b/man/controlInf.Rd index 3616d58..98c72a0 100644 --- a/man/controlInf.Rd +++ b/man/controlInf.Rd @@ -13,7 +13,10 @@ controlInf( num_boot = 500, bias_correction = FALSE, alpha = 0.05, - cores = 1 + cores = 1, + keep_boot, + pmm_exact_se = FALSE, + pi_ij ) } \arguments{ @@ -38,6 +41,20 @@ selection and outcome model. \item{alpha}{Significance level, Default is 0.05.} \item{cores}{Number of cores in parallel computing.} + +\item{keep_boot}{Logical indicating whether statistics from bootstrap should be kept. +By default set to \code{TRUE}.} + +\item{pmm_exact_se}{Logical value indicating whether to compute the exact +standard error estimate for the \code{pmm} estimator. The variance estimator for +estimation based on \code{pmm} can be decomposed into three parts, with the +third being computed using the covariance between imputed values for units in +the probability sample using predictive matches from the non-probability sample. 
+In most situations this term is negligible but very computationally +expensive, so by default this is set to \code{FALSE}; it is recommended to +set this value to \code{TRUE} before submitting final results.} + +\item{pi_ij}{TODO, either a matrix or a \code{ppsmat} class object.} } \value{ List with selected parameters. } diff --git a/man/controlOut.Rd b/man/controlOut.Rd index b4b4973..5d609df 100644 --- a/man/controlOut.Rd +++ b/man/controlOut.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/control.R +% Please edit documentation in R/control_outcome.R \name{controlOut} \alias{controlOut} \title{Control parameters for outcome model} @@ -16,7 +16,11 @@ controlOut( nlambda = 100, nfolds = 10, treetype = "kd", - searchtype = "standard" + searchtype = "standard", + predictive_match = 1:2, + pmm_weights = c("none", "prop_dist"), + pmm_k_choice = c("none", "min_var"), + pmm_reg_engine = c("glm", "loess") ) } \arguments{ @@ -40,9 +44,34 @@ controlOut( \item{nfolds}{The number of folds during cross-validation for variables selection model.} -\item{treetype}{type of tree for nearest neighbour imputation passed to \code{\link[RANN:nn2]{RANN::nn2()}} function.} +\item{treetype}{Type of tree for nearest neighbour imputation passed to \code{\link[RANN:nn2]{RANN::nn2()}} function.} -\item{searchtype}{type of search for nearest neighbour imputation passed to \code{\link[RANN:nn2]{RANN::nn2()}} function.} +\item{searchtype}{Type of search for nearest neighbour imputation passed to \code{\link[RANN:nn2]{RANN::nn2()}} function.} + +\item{predictive_match}{(Only for predictive mean matching) +Indicates how to select the 'closest' unit from the nonprobability sample for each +unit in the probability sample. Either \code{1} (default) or \code{2}, where +\code{1} is matching by minimizing the distance between \mjseqn{\hat{y}_{i}} for +\mjseqn{i \in S_{A}} and \mjseqn{y_{j}} for \mjseqn{j \in S_{B}} and \code{2} +is matching by minimizing the distance between \mjseqn{\hat{y}_{i}} for +\mjseqn{i \in S_{A}} and \mjseqn{\hat{y}_{j}} for \mjseqn{j \in S_{B}}.} + +\item{pmm_weights}{(Only for predictive mean matching) +Indicates how to weight the \code{k} nearest neighbours in \mjseqn{S_{B}} to +create the imputed value for units in \mjseqn{S_{A}}. The default value +\code{"none"} indicates that the mean of the \code{k} nearest \mjseqn{y}'s from +\mjseqn{S_{B}} should be used, whereas \code{"prop_dist"} results in a +weighted mean of these \code{k} values where weights are inversely +proportional to the distance between matched values.} + +\item{pmm_k_choice}{Character value indicating how the \code{k} hyper-parameter +should be chosen, by default \code{"none"} meaning \code{k} provided in the +\code{control_outcome} argument will be used. For now the only other +option \code{"min_var"} means that \code{k} will be chosen by minimizing the +estimated variance of the estimator for the mean. Parameter \code{k} provided in +this control list will be used as the starting point.} + +\item{pmm_reg_engine}{TODO} } \value{ List with selected parameters.
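[editor's note] To make the new predictive mean matching controls above concrete, a minimal usage sketch; it reuses the source_nonprob_p / svy_a fixtures from inst/tinytest/test_nonprobsvy.R and assumes the nonprob() argument names control_outcome and control_inference for passing these control lists (controlOut()'s k argument follows the pmm_k_choice documentation):

fit_pmm <- nonprob(
  outcome = y1 ~ x,
  data = source_nonprob_p,
  svydesign = svy_a,
  method_outcome = "pmm",
  control_outcome = controlOut(
    k = 5,                     # number of nearest neighbours used in matching
    predictive_match = 1,      # match predicted y (prob. sample) to observed y (nonprob. sample)
    pmm_weights = "prop_dist", # weight the k matches in inverse proportion to their distance
    pmm_k_choice = "none"      # keep k fixed instead of minimizing the estimated variance
  ),
  control_inference = controlInf(
    pmm_exact_se = TRUE        # also compute the exact (covariance) term of the PMM variance
  )
)
summary(fit_pmm)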
diff --git a/man/controlSel.Rd b/man/controlSel.Rd index d275c7b..966d21f 100644 --- a/man/controlSel.Rd +++ b/man/controlSel.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/control.R +% Please edit documentation in R/control_selection.R \name{controlSel} \alias{controlSel} \title{Control parameters for selection model} @@ -24,7 +24,7 @@ controlSel( nlambda = 50, nfolds = 10, print_level = 0, - start_type = c("glm", "naive") + start_type = c("glm", "naive", "zero") ) } \arguments{ @@ -78,6 +78,7 @@ controlSel( \itemize{ \item if \code{glm} then start taken from the glm function called on samples. \item if \code{naive} then start consists of a vector which has the value of an estimated parameter for one-dimensional data (on intercept) and 0 for the rest. +\item if \code{zero} then start is a vector of zeros. } }} } @@ -87,12 +88,11 @@ List with selected parameters. \description{ \code{controlSel} constructs a list with all necessary control parameters for selection model. - -\loadmathjax } \seealso{ \code{\link[=nonprob]{nonprob()}} -- for fitting procedure with non-probability samples. } \author{ Łukasz Chrostowski, Maciej Beręsewicz +\loadmathjax } diff --git a/man/nonprob.Rd b/man/nonprob.Rd index b774df7..8d0fa7e 100644 --- a/man/nonprob.Rd +++ b/man/nonprob.Rd @@ -33,7 +33,7 @@ nonprob( ) } \arguments{ -\item{data}{\code{data.frame} with data from the nonprobability sample.} +\item{data}{\code{data.frame} with data from the non-probability sample.} \item{selection}{\code{formula}, the selection (propensity) equation.} @@ -116,7 +116,10 @@ the joint estimating equations for the \code{selection} and \code{outcome} model \item{\code{method} -- set on \code{glm}, since the regression method} } } -In addition, if the variable selection model for the outcome variable is fitting, the list includes the \code{cve} -- the error for each value of \code{lambda}, averaged across the cross-validation folds. +In addition, if the variable selection model for the outcome variable is fitted, the list includes: +\itemize{ +\item{\code{cve} -- the error for each value of \code{lambda}, averaged across the cross-validation folds.} +} \item{\code{selection} -- list containing information about fitting of propensity score model, such as \itemize{ \item{\code{coefficients} -- a named vector of coefficients} @@ -133,9 +136,12 @@ In addition, if the variable selection model for the outcome variable is fitting \item{\code{df_residual} -- the residual degrees of freedom.} \item{\code{log_likelihood} -- value of log-likelihood function if \code{mle} method, in the other case \code{NA}.} \item{\code{cve} -- the error for each value of the \code{lambda}, averaged across the cross-validation folds for the variable selection model -when the propensity score model is fitting.} +when the propensity score model is fitted. Returned only if variable selection for the model is used.} } } +\item{\code{stat} -- matrix of the estimated population means in each bootstrap iteration. +Returned only if a bootstrap method is used to estimate the variance and \code{keep_boot} in +\code{\link[=controlInf]{controlInf()}} is set to \code{TRUE}.} } } \description{ ... @@ -269,7 +275,7 @@ in the documentation of the control functions for \code{nonprob}. 
} \examples{ \donttest{ -# generate data based on Doubly Robust Inference With Nonprobability Survey Samples (2021) +# generate data based on Doubly Robust Inference With Non-probability Survey Samples (2021) # Yilin Chen , Pengfei Li & Changbao Wu library(sampling) set.seed(123) @@ -299,7 +305,7 @@ y80 <- 2 + x1 + x2 + x3 + x4 + sigma_80 * epsilon # population sim_data <- data.frame(y30, y50, y80, x1, x2, x3, x4) -## propensity score model for nonprobability sample (sum to 1000) +## propensity score model for non-probability sample (sum to 1000) eta <- -4.461 + 0.1 * x1 + 0.2 * x2 + 0.1 * x3 + 0.2 * x4 rho <- plogis(eta) @@ -310,7 +316,7 @@ sim_data$p_prob <- inclusionprobabilities(z_prob, n = n_b) # data sim_data$flag_nonprob <- UPpoisson(rho) ## sampling nonprob sim_data$flag_prob <- UPpoisson(sim_data$p_prob) ## sampling prob -nonprob_df <- subset(sim_data, flag_nonprob == 1) ## nonprobability sample +nonprob_df <- subset(sim_data, flag_nonprob == 1) ## non-probability sample svyprob <- svydesign( ids = ~1, probs = ~p_prob, data = subset(sim_data, flag_prob == 1), diff --git a/man/pop.size.Rd b/man/pop.size.Rd index 5ee280e..e0ab2ba 100644 --- a/man/pop.size.Rd +++ b/man/pop.size.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods.R +% Please edit documentation in R/simple_methods.R \name{pop.size} \alias{pop.size} \title{Estimate size of population} diff --git a/man/summary.nonprobsvy.Rd b/man/summary.nonprobsvy.Rd index 28a7b9c..bbc672f 100644 --- a/man/summary.nonprobsvy.Rd +++ b/man/summary.nonprobsvy.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods.R +% Please edit documentation in R/summary.R \name{summary.nonprobsvy} \alias{summary.nonprobsvy} \title{Summary statistics for model of nonprobsvy class.} diff --git a/man/vcov.nonprobsvy.Rd b/man/vcov.nonprobsvy.Rd index 2637404..cb8ebe6 100644 --- a/man/vcov.nonprobsvy.Rd +++ b/man/vcov.nonprobsvy.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods.R +% Please edit documentation in R/simple_methods.R \name{vcov.nonprobsvy} \alias{vcov.nonprobsvy} \title{Obtain Covariance Matrix estimation.} diff --git a/src/Makevars b/src/Makevars index a4ba0f6..22c7566 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,2 +1,19 @@ -CXX_STD = CXX15 -#PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) + +## With R 3.1.0 or later, you can uncomment the following line to tell R to +## enable compilation with C++11 (where available) +## +## Also, OpenMP support in Armadillo prefers C++11 support. However, for wider +## availability of the package we do not yet enforce this here. It is however +## recommended for client packages to set it. +## +## And with R 3.4.0, and RcppArmadillo 0.7.960.*, we turn C++11 on as OpenMP +## support within Armadillo prefers / requires it +## +## R 4.0.0 made C++11 the default, R 4.1.0 switched to C++14, R 4.3.0 to C++17 +## _In general_ we should no longer need to set a standard as any recent R +## installation will do the right thing. Should you need it, uncomment it and +## set the appropriate value, possibly CXX17. 
+#CXX_STD = CXX11 + +PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) +PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) diff --git a/src/Makevars.win b/src/Makevars.win index a76ad1a..b3d0229 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -1,3 +1,24 @@ -CXX_STD = CXX15 -PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) +## This assumes that we can call Rscript to ask Rcpp about its locations +## Use the R_HOME indirection to support installations of multiple R versions +#PKG_LIBS = $(shell $(R_HOME)/bin/Rscript.exe -e "Rcpp:::LdFlags()") $(LAPACK_LIBS) $(BLAS_LIBS) + + +## With R 3.1.0 or later, you can uncomment the following line to tell R to +## enable compilation with C++11 (where available) +## +## Also, OpenMP support in Armadillo prefers C++11 support. However, for wider +## availability of the package we do not yet enforce this here. It is however +## recommended for client packages to set it. +## +## And with R 3.4.0, and RcppArmadillo 0.7.960.*, we turn C++11 on as OpenMP +## support within Armadillo prefers / requires it +## +## R 4.0.0 made C++11 the default, R 4.1.0 switched to C++14, R 4.3.0 to C++17 +## _In general_ we should no longer need to set a standard as any recent R +## installation will do the right thing. Should you need it, uncomment it and +## set the appropriate value, possibly CXX17. +#CXX_STD = CXX11 + +PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) +PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) diff --git a/src/nonprobCV_cpp.cpp b/src/nonprobCV_cpp.cpp index 7a368d5..c3de0fe 100644 --- a/src/nonprobCV_cpp.cpp +++ b/src/nonprobCV_cpp.cpp @@ -473,7 +473,7 @@ Rcpp::List cv_nonprobsvy_rcpp(const arma::mat& X, Rcpp::Function probit = nonprobsvy_env["probit_model_nonprobsvy"]; arma::vec weights; - arma::vec loss_theta_av(nlambda, arma::fill::zeros); + arma::vec loss_theta_av(nlambda); const arma::vec& R_ = R; const arma::mat& X_ = X; @@ -509,6 +509,13 @@ Rcpp::List cv_nonprobsvy_rcpp(const arma::mat& X, const arma::mat& X_rand_train = X_rand.rows(idx_rand); const arma::mat& X_rand_test = X_rand.rows(find(folds_rand == sample_rand(j))); + // Randomize the columns (features) in the training data + // arma::uvec col_indices = arma::shuffle(arma::regspace(0, X_nons_train.n_cols - 1)); + // arma::mat X_nons_train_randomized = X_nons_train.cols(col_indices); + // arma::mat X_rand_train_randomized = X_rand_train.cols(col_indices); + // arma::mat X_nons_test_randomized = X_nons_test.cols(col_indices); + // arma::mat X_rand_test_randomized = X_rand_test.cols(col_indices); + const arma::mat& X_train = arma::join_cols(X_rand_train, X_nons_train); const arma::mat& X_test = arma::join_cols(X_rand_test, X_nons_test); int ncols = X_test.n_cols; @@ -549,8 +556,8 @@ Rcpp::List cv_nonprobsvy_rcpp(const arma::mat& X, //loss_theta_av(i) = mean(loss_theta_vec); } - arma::vec loss_theta_av(nlambda); arma::vec loss_theta_vec(nfolds); + // Vector to store means, one for each fold for (int i = 0; i < nlambda; i++) { // arma::vec loss_theta_vec(nfolds); for (int j = 0; j < nfolds; j++) {