Skip to content

Commit

Permalink
Initial cleaning before going on CRAN
Browse files Browse the repository at this point in the history
  • Loading branch information
LukaszChrostowski committed Oct 27, 2024
1 parent f33fa1a commit 5101e05
Show file tree
Hide file tree
Showing 11 changed files with 19 additions and 307 deletions.
3 changes: 1 addition & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,7 @@ Imports:
nleqslv,
doParallel,
foreach,
parallel,
formula.tools
parallel
LinkingTo:
Rcpp,
RcppArmadillo
Expand Down
3 changes: 0 additions & 3 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -34,16 +34,13 @@ importFrom(Rcpp,sourceCpp)
importFrom(doParallel,registerDoParallel)
importFrom(foreach,"%dopar%")
importFrom(foreach,foreach)
importFrom(formula.tools,lhs.vars)
importFrom(formula.tools,rhs.vars)
importFrom(maxLik,maxLik)
importFrom(ncvreg,cv.ncvreg)
importFrom(nleqslv,nleqslv)
importFrom(parallel,makeCluster)
importFrom(parallel,stopCluster)
importFrom(stats,AIC)
importFrom(stats,BIC)
importFrom(stats,aggregate)
importFrom(stats,as.formula)
importFrom(stats,binomial)
importFrom(stats,coef)
Expand Down
1 change: 0 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
- add information to `summary` about quality of estimation basing on difference between estimated and known total values of auxiliary variables
- add estimation of exact standard error for k-nearest neighbor estimator.
- add breaking change to `controlOut` function by switching values for `predictive_match` argument. From now on, the `predictive_match = 1` means $\hat{y}-\hat{y}$ in predictive mean matching imputation and `predictive_match = 2` corresponds to $\hat{y}-y$ matching.
- add methods for estimation of means and totals in subsets/groups with its variances.
- implement `div` option when variable selection (more in documentation) for doubly robust estimation.

## nonprobsvy 0.1.0
Expand Down
233 changes: 0 additions & 233 deletions R/simple_methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -321,236 +321,3 @@ deviance.nonprobsvy <- function(object,
if (class(object)[2] == "nonprobsvy_dr") res <- c("selection" = res_sel, "outcome" = res_out)
res
}
#' @title Total values of covariates in subgroups
#'
#' @param x - formula
#' @param nonprob - object of nonprobsvy class
#'
#' @method total nonprobsvy
#' @return A vector of estimated totals of the given covariates in subgroups
#' @importFrom formula.tools lhs.vars
#' @importFrom formula.tools rhs.vars
#' @importFrom stats aggregate
total.nonprobsvy <- function(x, nonprob) {
variables <- lhs.vars(x)
groups <- rhs.vars(x)
# TODO variance implementation
if (is.null(variables)) {
data <- nonprob$data[which(nonprob$R == 1), groups, drop = FALSE]
total <- tapply(nonprob$weights[which(nonprob$R == 1)], data, sum)
} else {
data <- nonprob$data[which(nonprob$R == 1), c(variables, groups)]
weights <- nonprob$weights[which(nonprob$R == 1)]

total <- aggregate(data[, variables] * weights, by = data[, groups, drop = FALSE], sum)
}
total
}
#' @title Mean values of covariates in subgroups
#'
#' @param x - formula
#' @param nonprob - object of nonprobsvy class
#'
#' @method mean nonprobsvy
#' @return A vector of estimated means of the given covariates in subgroups
#' @importFrom formula.tools lhs.vars
#' @importFrom formula.tools rhs.vars
#' @importFrom stats aggregate
mean.nonprobsvy <- function(x, nonprob) {
# TODO variance for MI
# TODO all implementation for DR
variables <- lhs.vars(x)
groups <- rhs.vars(x)
class_nonprob <- class(nonprob)[2]
if (!is.null(variables) & class_nonprob != 'nonprobsvy_ipw')
stop("DR or MI estimators only for y variable in subgroups. Recommended formula definition
is something like ~ x + y where x and y are grouping variables")
if (is.null(variables)) {
# data <- nonprob$data[, groups, drop = FALSE]
data <- model.matrix(as.formula(paste(x, "- 1")), data = nonprob$data)
if (class_nonprob == "nonprobsvy_ipw") {
mean_value <- sapply(as.data.frame(data), function(col) weighted.mean(col, nonprob$weights))
ys <- data
} else {
# MI TODO for more than one y
if (class_nonprob == "nonprobsvy_mi") {
data_nonprob <- split(cbind(nonprob$data, nonprob$outcome[[1]]$fitted.values), x)
data_prob <- split(cbind(nonprob$svydesign$variables, nonprob$svydesign$prob), x)

# X_nons <- model.matrix(delete.response(terms(nonprob$outcome[[1]]$formula)), data_nonprob)
# X_rand <- model.matrix(delete.response(terms(nonprob$outcome[[1]]$formula)), data_prob)

mean_value <- aggregate(formula(paste("y_hat_MI", x)),
data = nonprob$svydesign$variables,
FUN = function(y, w) weighted.mean(y, w = w[1:length(y)]),
w = 1 / nonprob$svydesign$prob)

} else if (class_nonprob == "nonprobsvy_dr") {
mean_value <- list()
for (i in 1:p) {
mean_value[[i]] <- mu_hatDR(
y = ,
y_nons = ,
y_rand = ,
weights = , # TODO
weights_nons = ,
weights_rand = ,
N_nons = ,
N_rand = )
}
}
}
} else {
data <- nonprob$data[, c(variables, groups)]
weights <- nonprob$weights
if (class_nonprob == "nonprobsvy_ipw") {
mean_value <- aggregate(data[, variables], by = data[, groups, drop = FALSE],
FUN = function(y, w) weighted.mean(y, w = w[1:length(y)]),
w = weights)
}
}
p <- length(mean_value)
variances <- numeric(length = p)

for (i in 1:p) {
if (class_nonprob == "nonprobsvy_ipw") {
yy <- ys[,i]
mu <- mean_value[i]
var <- internal_varIPW(svydesign = nonprob$svydesign,
X_nons = nonprob$X[nonprob$R == 1, ],
X_rand = nonprob$X[nonprob$R == 0, ],
y_nons = yy,
weights = nonprob$selection$prior.weights,
ps_nons = nonprob$prob[nonprob$R == 1],
mu_hat = mu,
hess = nonprob$selection$hessian,
ps_nons_der = as.vector(nonprob$selection$prob_der),
N = nonprob$pop_size,
est_ps_rand = as.vector(nonprob$selection$prob_rand_est), # TODO
ps_rand = nonprob$svydesign$prob_rand,
est_ps_rand_der = as.vector(nonprob$selection$prob_rand_est_der), # TODO
n_rand = nonprob$prob_size,
pop_size = nonprob$pop_size,
pop_totals = nonprob$selection$pop_totals,
method_selection = nonprob$selection$method_selection,
est_method = nonprob$selection$method,
theta = nonprob$selection$coefficients,
h = nonprob$selection$h_function,
verbose = FALSE,
var_cov1 = nonprob$selection$link$variance_covariance1,
var_cov2 = nonprob$selection$link$variance_covariance2
)
} else if (nonprob_class == "nonprobsvy_mi") {

var <- internal_varMI(svydesign = nonprob$svydesign,
X_nons = nonprob$X[nonprob$R == 1, ], # subset by x formula
X_rand = nonprob$X[nonprob$R == 0, ], # subset by x formula
y = ys_nons[[i]]$single_shift, # TODO y_nons
y_pred = ys_pred_nons[[i]]$single_shift, # TODO
weights_rand = 1 / nonprob$svydesig$prob, # subset by x formula
method = method_outcome,
n_rand = nonprob$prob_size,
n_nons = nonprob$nonprob_size,
N = nonprob$pop_size,
family = nonprob$outcome$family,
model_obj = model_objects, # TODO
pop_totals = nonprob$pop_totals,
# we should probably just pass full control list
k = nonprob$control$control_outcome$k,
predictive_match = nonprob$control$control_outcome$predictive_match,
nn_exact_se = nonprob$control$control_inference$nn_exact_se,
pmm_reg_engine = nonprob$control$control_outcome$pmm_reg_engine,
pi_ij = nonprob$control$control_inference$pi_ij
)
} else if (nonprob_class == "nonprobsvy_dr") {
var <- internal_varDR(OutcomeModel = OutcomeModel,
SelectionModel = SelectionModel,
y_nons_pred = ys_nons_pred,
weights = 1,
weights_rand = 1 / nonprob$svydesign$prob,
method_selection = method_selection,
control_selection = nonprob$control$control_selection,
ps_nons = nonprob$prob,
theta = nonprob$selection$coefficients,
hess = nonprob$selection$hessian,
ps_nons_der = nonprob$selection$ps_nons_der,
est_ps_rand = nonprob$selection$prob_rand_est,
y_rand_pred = ys_pred,
N_nons = nonprob$pop_size,
est_ps_rand_der = nonprob$selection$prob_rand_est_der,
svydesign = nonprob$svydesign,
est_method = nonprob$selection$method,
h = nonprob$selection$h_function,
pop_totals = nonprob$pop_totals,
sigma = nonprob$outcome$sigma, # TODO
bias_correction = nonprob$control$control_inference$bias_corr,
verbose = FALSE
)
}
variances[i] <- var$var
}
res <- data.frame(mean = mean_value,
se = sqrt(variances)) # perhaps convert to data.frame
res
}
# @title Median values of covariates in subgroups
#
# @param x - formula
# @param nonprob - object of nonprobsvy class
#
# @method median nonprobsvy
# @return A vector of estimated medians of the given covariates in subgroups
# @importFrom formula.tools lhs.vars
# @importFrom formula.tools rhs.vars
# @importFrom spatstat weighted.median
# @importFrom stats aggregate
# @importFrom stats median
# median.nonprobsvy <- function(x, nonprob) {
# variables <- lhs.vars(x)
# groups <- rhs.vars(x)
#
# if (is.null(variables)) {
# data <- nonprob$data[which(nonprob$R == 1), groups, drop = FALSE]
# median_value <- tapply(nonprob$weights[which(nonprob$R == 1)], data, median)
# } else {
#
# data <- nonprob$data[which(nonprob$R == 1), c(variables, groups)]
# weights <- nonprob$weights[which(nonprob$R == 1)]
#
# median_value <- aggregate(data[, variables], by = data[, groups, drop = FALSE],
# FUN = function(y, w) weighted.median(y, w = w[1:length(y)]),
# w = weights)
# }
# median_value
# }
# @title Quantile values of covariates in subgroups
#
# @param x - formula
# @param nonprob - object of nonprobsvy class
# @param probs - probability
#
# @method quantile nonprobsvy
# @return A vector of estimated quantiles if the given covariates in subgroups
# @importFrom formula.tools lhs.vars
# @importFrom formula.tools rhs.vars
# @importFrom spatstat weighted.quantile
# @importFrom stats aggregate
# @importFrom stats quantile
# quantile.nonprobsvy <- function(x, nonprob, probs = 0.5) {
# variables <- lhs.vars(x)
# groups <- rhs.vars(x)
#
# if (is.null(variables)) {
# data <- nonprob$data[which(nonprob$R == 1), groups, drop = FALSE]
# percentile_value <- tapply(nonprob$weights[which(nonprob$R == 1)], data, function(y) quantile(y, probs = probs))
# } else {
#
# data <- nonprob$data[which(nonprob$R == 1), c(variables, groups)]
# weights <- nonprob$weights[which(nonprob$R == 1)]
#
# percentile_value <- aggregate(data[, variables], by = data[, groups, drop = FALSE],
# FUN = function(y, w) weighted.quantile(y, w = w[1:length(y)], probs = probs),
# w = weights)
# }
# percentile_value
# }
15 changes: 6 additions & 9 deletions inst/tinytest/test-2-ipw-totals.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) {

library(sampling)
library(survey)

Expand Down Expand Up @@ -505,7 +503,7 @@
#### variable selection ------------------------------------------------------------------
##### one target variable ----------------------------------------------------------------

# if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) {
if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) {

## y_11
expect_silent(
Expand Down Expand Up @@ -625,7 +623,7 @@
# verbose = T)
# )

# }
}


# check probit ----------------------------------------------------------------------------
Expand Down Expand Up @@ -811,7 +809,7 @@
row.names = c("Y_11", "Y_12", "Y_21", "Y_22")))


# if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) {
if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) {
#### variable selection ------------------------------------------------------------------
##### one target variable ----------------------------------------------------------------

Expand Down Expand Up @@ -933,7 +931,7 @@
# verbose = T)
# )

# }
}

## non-linear case ------------------------------------------------------------------------
#### correctly specified variables --------------------------------------------------------
Expand Down Expand Up @@ -1119,7 +1117,7 @@
row.names = c("Y_11", "Y_12", "Y_21", "Y_22")))


# if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) {
if (isTRUE(tolower(Sys.getenv("TEST_NONPROBSVY_MULTICORE_DEVELOPER")) == "true")) {
#### variable selection ------------------------------------------------------------------
##### one target variable ----------------------------------------------------------------

Expand Down Expand Up @@ -1241,7 +1239,7 @@
# verbose = T)
# )

# }
}

# check cloglog -----------------------------------------------------
## linear case ----------------------------------------------------------------------------
Expand Down Expand Up @@ -1752,4 +1750,3 @@
# # expect_true(y22_corr_scad$confidence_interval$lower_bound < mean(Y_22) &
# # y22_corr_scad$confidence_interval$upper_bound > mean(Y_22)) ## conf int
# expect_true(NROW(y22_corr_scad$selection$coefficients) == 2)
# }
9 changes: 4 additions & 5 deletions inst/tinytest/test-3-mi-totals.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ expect_silent(

expect_equal(y12_mi_corr_one$output$mean, 7.465879, tolerance = 0.0001)
expect_equal(y12_mi_corr_one$output$SE, 0.2523682, tolerance = 0.0001)
# TODO
# expect_true(y12_mi_corr_one$confidence_interval$lower_bound < mean(Y_12) &
# y12_mi_corr_one$confidence_interval$upper_bound > mean(Y_12))

Expand Down Expand Up @@ -88,7 +87,7 @@ expect_true(y22_mi_corr_one$confidence_interval$lower_bound < mean(Y_22) &
# For Y_11 with all X variables
expect_silent(
y11_mi_corr_all <- nonprob(
outcome = as.formula(paste('Y_11', X_formula)),
outcome = as.formula(paste('Y_11 ~ ', as.character(X_formula)[2])),
data = sample_B1, # Include all X variables
pop_totals = X_totals,
method_outcome = "glm",
Expand All @@ -104,7 +103,7 @@ expect_true(y11_mi_corr_all$confidence_interval$lower_bound < mean(Y_11) &
# For Y_12 with all X variables
expect_silent(
y12_mi_corr_all <- nonprob(
outcome = as.formula(paste('Y_12', X_formula)),
outcome = as.formula(paste('Y_12 ~ ', as.character(X_formula)[2])),
data = sample_B1,
pop_totals = X_totals,
method_outcome = "glm",
Expand All @@ -121,7 +120,7 @@ expect_equal(y12_mi_corr_all$output$SE, 0.2454465, tolerance = 0.0001)
# For Y_21 with all X variables
expect_silent(
y21_mi_corr_all <- nonprob(
outcome = as.formula(paste('Y_21', X_formula)),
outcome = as.formula(paste('Y_21 ~ ', as.character(X_formula)[2])),
data = sample_B1,
pop_totals = X_totals,
method_outcome = "glm",
Expand All @@ -138,7 +137,7 @@ expect_equal(y21_mi_corr_all$output$SE, 0.01992489, tolerance = 0.0001)
# For Y_22 with all X variables
expect_silent(
y22_mi_corr_all <- nonprob(
outcome = as.formula(paste('Y_22', X_formula)),
outcome = as.formula(paste('Y_22 ~ ', as.character(X_formula)[2])),
data = sample_B1,
pop_totals = X_totals,
method_outcome = "glm",
Expand Down
Loading

0 comments on commit 5101e05

Please sign in to comment.