Merge pull request #58 from ncn-foreigners/dev
merge from dev
LukaszChrostowski authored Nov 12, 2024
2 parents a5b20d2 + 915c5f3 commit 690342f
Showing 38 changed files with 1,374 additions and 190 deletions.
Empty file added .Rapp.history
4 changes: 2 additions & 2 deletions .github/workflows/R-CMD-check.yaml
@@ -2,9 +2,9 @@
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
branches: [main, master, package-development]
branches: [main, master, dev]
pull_request:
branches: [main, master, package-development]
branches: [main, master, dev]

name: R-CMD-check

4 changes: 2 additions & 2 deletions .github/workflows/pkgdown.yaml
@@ -2,9 +2,9 @@
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
branches: [main, master]
branches: [main, master, dev]
pull_request:
branches: [main, master]
branches: [main, master, dev]
release:
types: [published]
workflow_dispatch:
5 changes: 3 additions & 2 deletions .github/workflows/test-coverage.yaml
@@ -2,9 +2,9 @@
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
branches: [main, master]
branches: [main, master, dev]
pull_request:
branches: [main, master]
branches: [main, master, dev]

name: test-coverage

@@ -13,6 +13,7 @@ jobs:
runs-on: ubuntu-latest
env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
TEST_NONPROBSVY_MULTICORE_DEVELOPER: "true"

steps:
- uses: actions/checkout@v3
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -71,6 +71,7 @@ importFrom(stats,pt)
importFrom(stats,qnorm)
importFrom(stats,rbinom)
importFrom(stats,rchisq)
importFrom(stats,reformulate)
importFrom(stats,residuals)
importFrom(stats,rexp)
importFrom(stats,rnorm)
@@ -86,6 +87,7 @@ importFrom(stats,weighted.mean)
importFrom(survey,as.svrepdesign)
importFrom(survey,svymean)
importFrom(survey,svyrecvar)
importFrom(survey,svytotal)
importFrom(utils,setTxtProgressBar)
importFrom(utils,txtProgressBar)
useDynLib(nonprobsvy)
17 changes: 16 additions & 1 deletion NEWS.md
@@ -1,4 +1,19 @@
## version 0.1.0
# nonprobsvy 0.1.1

- bug fixes
  - fixed a bug occurring when estimation was based on a single auxiliary variable, which caused the data frame to be collapsed to a vector
  - fixed a bug where the `maxit` argument of the `controlSel` function was not passed to the internally used `nleqslv` function
  - fixed a bug where a `vector` was stored in `model_frame` when predicting `y_hat` in the mass imputation `glm` model with X based on a single auxiliary variable only; the fix converts it to a `data.frame` object
- features
  - added information to `summary` on the quality of estimation, based on the difference between the estimated and known totals of the auxiliary variables
  - added estimation of the exact standard error for the k-nearest-neighbour estimator
  - breaking change in the `controlOut` function: the values of the `predictive_match` argument have been switched; from now on, `predictive_match = 1` means $\hat{y}-\hat{y}$ matching in predictive mean matching imputation and `predictive_match = 2` corresponds to $\hat{y}-y$ matching
  - implemented the `div` option for variable selection (more in the documentation) in doubly robust estimation
  - added more insight to the `nonprob` output, such as the gradient, Hessian and Jacobian derived from IPW estimation for the `mle` and `gee` methods, when an `IPW` or `DR` model is executed
  - added the estimated inclusion probabilities and their derivatives for the probability and non-probability samples to the `nonprob` output when an `IPW` or `DR` model is executed
  - added the `model_frame` matrix data from the probability sample used for mass imputation to the `nonprob` output when an `MI` or `DR` model is executed (a combined usage sketch follows this file's diff)

## nonprobsvy 0.1.0

------------------------------------------------------------------------

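Taken together, the changes listed in the NEWS entry above might be exercised roughly as follows. This is a minimal sketch, not package documentation: the `nonprob()` argument names (`selection`, `outcome`, `data`, `svydesign`, `control_selection`, `control_outcome`, `control_inference`) are assumed from the package's documented interface, and `nonprob_sample` / `prob_design` are hypothetical objects.

```r
library(nonprobsvy)

# Hypothetical inputs: `nonprob_sample` is a data.frame with the non-probability
# sample, `prob_design` is a survey::svydesign object for the probability sample.
est <- nonprob(
  selection = ~ x1 + x2,        # propensity model for IPW / DR
  outcome   = y ~ x1 + x2,      # outcome model for MI / DR
  data      = nonprob_sample,
  svydesign = prob_design,
  control_selection = controlSel(
    nleqslv_method = "Newton",  # new: forwarded to nleqslv::nleqslv()
    nleqslv_global = "dbldog",
    nleqslv_xscalm = "auto",
    maxit          = 500        # now passed through to nleqslv as well
  ),
  control_outcome   = controlOut(predictive_match = 1),  # switched meaning: y-hat to y-hat matching
  control_inference = controlInf(nn_exact_se = TRUE)     # renamed from pmm_exact_se
)
summary(est)  # now also compares estimated and known totals of the auxiliary variables
```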
30 changes: 24 additions & 6 deletions R/bias_correction_ipw.R
@@ -1,5 +1,20 @@
# bias correction
mm <- function(X, y, weights, weights_rand, R, n_nons, n_rand, method_selection, family, start_selection, start_outcome, boot = FALSE) {
mm <- function(X,
y,
weights,
weights_rand,
R,
n_nons,
n_rand,
method_selection,
family,
start_selection,
start_outcome,
maxit,
nleqslv_method,
nleqslv_global,
nleqslv_xscalm,
boot = FALSE) {
method_selection_function <- paste(method_selection, "_model_nonprobsvy", sep = "")
method <- get_method(method_selection_function)
inv_link <- method$make_link_inv
@@ -63,13 +78,16 @@ mm <- function(X, y, weights, weights_rand, R, n_nons, n_rand, method_selection,
# par_sel <- multiroot$par
######### NLESQLV
multiroot <- nleqslv::nleqslv(
x = par0, # TODO add user-specified parameters to control functions
x = par0,
fn = u_theta_beta_dr,
method = "Newton", # TODO consider the method Broyden
global = "dbldog", # c("dbldog", "pwldog", cline", "qline", "gline", "hook", "none")
xscalm = "auto", # c("fixed","auto")
method = nleqslv_method,
global = nleqslv_global,
xscalm = nleqslv_xscalm,
jacobian = TRUE,
control = list(scalex = rep(1, length(par0))), # TODO algorithm did not converge in maxit iterations for cloglog
control = list(
scalex = rep(1, length(par0)),
maxit = maxit
), # TODO algorithm did not converge in maxit iterations for cloglog
R = R,
X = X,
y = y,
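The `mm()` change above routes the user's `nleqslv_method`, `nleqslv_global`, `nleqslv_xscalm` and `maxit` choices into `nleqslv::nleqslv()`. The snippet below is a toy illustration of that call pattern only: a made-up two-equation system stands in for `u_theta_beta_dr()`, and the option values mirror the previously hard-coded defaults.

```r
library(nleqslv)

# Toy estimating equations (placeholder for u_theta_beta_dr); one solution is c(1, 2).
u_toy <- function(par) {
  c(par[1]^2 + par[2] - 3,
    par[1] + par[2]^2 - 5)
}

par0 <- c(0.5, 0.5)  # starting values, as `par0` in mm()
multiroot <- nleqslv(
  x        = par0,
  fn       = u_toy,
  method   = "Newton",   # controlSel(nleqslv_method = ...)
  global   = "dbldog",   # controlSel(nleqslv_global = ...)
  xscalm   = "auto",     # controlSel(nleqslv_xscalm = ...)
  jacobian = TRUE,
  control  = list(scalex = rep(1, length(par0)), maxit = 500)  # maxit now user-controlled
)
multiroot$x       # root found by the solver
multiroot$termcd  # termination code (1 = convergence)
```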
5 changes: 2 additions & 3 deletions R/boot_mi.R
@@ -46,10 +46,9 @@ bootMI <- function(X_rand,


predictive_match <- control_outcome$predictive_match
pmm_exact_se <- control_inference$pmm_exact_se
nn_exact_se <- control_inference$nn_exact_se
pmm_reg_engine <- control_outcome$pmm_reg_engine
pi_ij <- control_inference$pi_ij
pmm_exact_se <- control_inference$pmm_exact_se
comp2_stat <- numeric(length = num_boot)

if (is.null(pop_totals)) {
@@ -399,7 +398,7 @@ bootMI <- function(X_rand,
}
# mu_hat_boot <- mean(mu_hats)
if (method == "pmm") {
if (pmm_exact_se) {
if (nn_exact_se) {
comp2 <- pmm_exact(pi_ij,
weights_rand,
n_nons = n_nons,
14 changes: 7 additions & 7 deletions R/control_inference.R
@@ -17,9 +17,9 @@
#' @param cores Number of cores in parallel computing.
#' @param keep_boot Logical indicating whether statistics from bootstrap should be kept.
#' By default set to \code{TRUE}
#' @param pmm_exact_se Logical value indicating whether to compute the exact
#' standard error estimate for \code{pmm} estimator. The variance estimator for
#' estimation based on \code{pmm} can be decomposed into three parts, with the
#' @param nn_exact_se Logical value indicating whether to compute the exact
#' standard error estimate for \code{nn} or \code{pmm} estimator. The variance estimator for
#' estimation based on \code{nn} or \code{pmm} can be decomposed into three parts, with the
#' third being computed using covariance between imputed values for units in
#' probability sample using predictive matches from non-probability sample.
#' In most situations this term is negligible and is very computationally
@@ -51,7 +51,7 @@ controlInf <- function(vars_selection = FALSE,
alpha = 0.05,
cores = 1,
keep_boot,
pmm_exact_se = FALSE,
nn_exact_se = FALSE,
pi_ij) {
list(
vars_selection = if (missing(vars_selection)) FALSE else vars_selection,
@@ -71,10 +71,10 @@
keep_boot
}
},
pmm_exact_se = if (!is.logical(pmm_exact_se) & length(pmm_exact_se) == 1) {
stop("Argument pmm_exact_se must be a logical scalar")
nn_exact_se = if (!is.logical(nn_exact_se) & length(nn_exact_se) == 1) {
stop("Argument nn_exact_se must be a logical scalar")
} else {
pmm_exact_se
nn_exact_se
},
pi_ij = if (missing(pi_ij)) NULL else pi_ij
)
4 changes: 2 additions & 2 deletions R/control_outcome.R
@@ -16,8 +16,8 @@
#' @param predictive_match (Only for predictive mean matching)
#' Indicates how to select 'closest' unit from nonprobability sample for each
#' unit in probability sample. Either \code{1} (default) or \code{2} where
#' \code{1} is matching by minimizing distance between \mjseqn{\hat{y}_{i}} for
#' \mjseqn{i \in S_{A}} and \mjseqn{y_{j}} for \mjseqn{j \in S_{B}} and \code{2}
#' \code{2} is matching by minimizing distance between \mjseqn{\hat{y}_{i}} for
#' \mjseqn{i \in S_{A}} and \mjseqn{y_{j}} for \mjseqn{j \in S_{B}} and \code{1}
#' is matching by minimizing distance between \mjseqn{\hat{y}_{i}} for
#' \mjseqn{i \in S_{A}} and \mjseqn{\hat{y}_{j}} for \mjseqn{j \in S_{B}}.
#' @param pmm_weights (Only for predictive mean matching)
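The documentation change above only swaps which label maps to which matching rule. A small base-R sketch of the two rules themselves, on made-up fitted values, makes the distinction concrete: `y_hat_A` are predictions for the probability sample S_A, while `y_hat_B` and `y_B` are predictions and observed outcomes in the non-probability sample S_B.

```r
# Hypothetical values for illustration only
y_hat_A <- c(1.9, 3.2)        # predicted y for units in S_A
y_hat_B <- c(1.0, 2.0, 3.0)   # predicted y for units in S_B
y_B     <- c(2.1, 0.5, 3.6)   # observed y for units in S_B

# predictive_match = 1 (new meaning): match y-hat in S_A to y-hat in S_B
match_yhat_yhat <- sapply(y_hat_A, function(v) which.min(abs(v - y_hat_B)))

# predictive_match = 2 (new meaning): match y-hat in S_A to observed y in S_B
match_yhat_y <- sapply(y_hat_A, function(v) which.min(abs(v - y_B)))

match_yhat_yhat  # donor indices in S_B under y-hat / y-hat matching: 2 3
match_yhat_y     # donor indices in S_B under y-hat / y matching:     1 3
```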
16 changes: 14 additions & 2 deletions R/control_selection.R
@@ -36,6 +36,9 @@
#' \item if \code{naive} then start consists of a vector which has the value of an estimated parameter for one-dimensional data (on intercept) and 0 for the rest.
#' \item if \code{zero} then start is a vector of zeros.
#' }
#' @param nleqslv_method The method that will be passed to [nleqslv::nleqslv()] function.
#' @param nleqslv_global The global strategy that will be passed to [nleqslv::nleqslv()] function.
#' @param nleqslv_xscalm The type of x scaling that will be passed to [nleqslv::nleqslv()] function.
#'
#' @return List with selected parameters.
#'
@@ -64,7 +67,13 @@ controlSel <- function(method = "glm.fit", # perhaps another control function fo
nlambda = 50,
nfolds = 10,
print_level = 0,
start_type = c("glm", "naive", "zero")) {
start_type = c("glm", "naive", "zero"),
nleqslv_method = c("Broyden", "Newton"),
nleqslv_global = c(
"dbldog", "pwldog",
"cline", "qline", "gline", "hook", "none"
),
nleqslv_xscalm = c("fixed", "auto")) {
list(
epsilon = epsilon,
maxit = maxit,
@@ -84,6 +93,9 @@ controlSel <- function(method = "glm.fit", # perhaps another control function fo
nfolds = nfolds,
lambda = lambda,
print_level = print_level,
start_type = if (missing(start_type)) "naive" else start_type
start_type = if (missing(start_type)) "naive" else start_type,
nleqslv_method = if (missing(nleqslv_method)) "Newton" else nleqslv_method,
nleqslv_global = if (missing(nleqslv_global)) "dbldog" else nleqslv_global,
nleqslv_xscalm = if (missing(nleqslv_xscalm)) "auto" else nleqslv_xscalm
)
}
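One subtlety in the new `controlSel()` arguments shown above: the formals list candidate values (`"Broyden"` and `"fixed"` come first in their vectors), but the `missing()` branches in the returned list resolve to `"Newton"`, `"dbldog"` and `"auto"` when nothing is supplied. A short check, as a sketch assuming the package is attached:

```r
library(nonprobsvy)

# Defaults come from the missing() branches, not from the first candidate value
ctrl <- controlSel()
ctrl$nleqslv_method  # "Newton"
ctrl$nleqslv_global  # "dbldog"
ctrl$nleqslv_xscalm  # "auto"

# Supplying the arguments by name overrides the resolved defaults
ctrl2 <- controlSel(
  nleqslv_method = "Broyden",
  nleqslv_global = "pwldog",
  nleqslv_xscalm = "fixed"
)
```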
25 changes: 18 additions & 7 deletions R/data_manip.R
@@ -5,8 +5,10 @@ model_frame <- function(formula, data, weights = NULL, svydesign = NULL, pop_tot
model_Frame <- model.frame(formula, data)
y_nons <- model.response(model_Frame)
outcome_name <- names(model_Frame)[1]
mt <- attr(model_Frame, "terms")
nons_names <- attr(mt, "term.labels") # colnames(get_all_vars(formula, data)) names of variables of nonprobability sample terms(formula, data = data)

mt <- terms(formula) # attr(model_Frame, "terms")
nons_names <- all.vars(as.formula(paste("~", paste(attr(mt, "term.labels"), collapse = " + "))))
# nons_names <- attr(mt, "term.labels") # colnames(get_all_vars(formula, data)) names of variables of nonprobability sample terms(formula, data = data)
##### Model frame for probability sample #####
if (outcome_name %in% colnames(svydesign$variables)) {
# design_to_frame <- svydesign$variables
@@ -19,16 +21,25 @@
design_to_frame <- svydesign$variables
design_to_frame[, outcome_name][is.na(design_to_frame[, outcome_name])] <- 0 # replace NA in dependent outcome with 0
names_rand <- all.vars(formula)
model_Frame_rand <- design_to_frame[, names_rand]
model_Frame_rand <- design_to_frame[, names_rand, drop = FALSE]

nons_names_rand <- attr(attr(model.frame(formula, design_to_frame), "terms"), "term.labels")
nons_names_rand <- intersect(names_rand, colnames(design_to_frame))
} else {
design_to_frame <- svydesign$variables
names_rand <- all.vars(formula[-2])
model_Frame_rand <- design_to_frame[, names_rand]
##
terms_object <- terms(formula)
# names_rand <- all.vars(terms_object)
names_rand <- all.vars(as.formula(paste("~", paste(attr(terms_object, "term.labels"), collapse = " + "))))
##
# names_rand <- all.vars(formula[-2]) # old
##

model_Frame_rand <- design_to_frame[, names_rand, drop = FALSE]
nons_names_rand <- intersect(names_rand, colnames(design_to_frame))

nons_names_rand <- attr(attr(model.frame(formula[-2], design_to_frame), "terms"), "term.labels")
# nons_names_rand <- attr(attr(model.frame(formula[-2], design_to_frame), "terms"), "term.labels") # old

# old
# model_Frame_rand <- model.frame(formula[-2], svydesign$variables)
# mt_rand <- attr(model_Frame_rand, "terms")
# nons_names_rand <- attr(mt_rand, "term.labels")
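The `drop = FALSE` additions above are what fix the single-auxiliary-variable bug mentioned in the NEWS entry: base-R single-bracket subsetting silently drops a one-column selection to a vector. A minimal base-R illustration with made-up data:

```r
design_to_frame <- data.frame(y = c(1, 2, 3), x1 = c(10, 20, 30))
names_rand <- "x1"   # formula with a single auxiliary variable

design_to_frame[, names_rand]                 # numeric vector: rectangular structure lost
design_to_frame[, names_rand, drop = FALSE]   # one-column data.frame: structure kept

# Downstream code in model_frame() expects a rectangular object, so the
# `drop = FALSE` guard keeps the data.frame shape even with one variable.
```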
6 changes: 6 additions & 0 deletions R/gee_ipw.R
@@ -111,6 +111,9 @@ gee <- function(...) {
h = h,
method_selection = method_selection,
maxit = maxit,
nleqslv_method = control_selection$nleqslv_method,
nleqslv_global = control_selection$nleqslv_global,
nleqslv_xscalm = control_selection$nleqslv_xscalm,
start = 0
)$theta_h)
start <- c(start_h, rep(0, ncol(X) - 1))
@@ -127,6 +130,9 @@ gee <- function(...) {
h = h,
method_selection = method_selection,
maxit = maxit,
nleqslv_method = control_selection$nleqslv_method,
nleqslv_global = control_selection$nleqslv_global,
nleqslv_xscalm = control_selection$nleqslv_xscalm,
start = start
)
theta_hat <- h_object$theta_h
9 changes: 9 additions & 0 deletions R/glm.R
@@ -68,6 +68,15 @@ glm_nonprobsvy <- function(outcome,
y_rand_pred <- family_nonprobsvy$linkinv(eta)
y_nons_pred <- model$fitted.values

# artificial formula and data created for pmm_exact_se
predictors <- colnames(X_nons)[-1]
outcome_name <- names(model_frame)[1]
formula_str <- paste(outcome_name, "~", paste(predictors, collapse = " + "))
formula <- as.formula(formula_str)
model$formula <- formula
model$data <- as.data.frame(cbind(y_nons, X_nons[, -1, drop = FALSE]))
colnames(model$data) <- c(outcome_name, predictors)

model_out <- list(
glm = model,
glm_summary = model_summ
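The block added to `glm_nonprobsvy()` above rebuilds a formula and a matching `data.frame` so that the exact-SE routines can later refit or match on the mass-imputation model. The same construction is shown standalone below on hypothetical `X_nons` / `y_nons` / `model_frame` objects.

```r
set.seed(1)
X_nons      <- cbind("(Intercept)" = 1, x1 = rnorm(5), x2 = rnorm(5))  # design matrix with intercept
y_nons      <- rnorm(5)
model_frame <- data.frame(y = y_nons, x1 = X_nons[, "x1"], x2 = X_nons[, "x2"])

predictors   <- colnames(X_nons)[-1]     # drop the intercept column
outcome_name <- names(model_frame)[1]
formula_str  <- paste(outcome_name, "~", paste(predictors, collapse = " + "))
formula      <- as.formula(formula_str)  # y ~ x1 + x2

dat <- as.data.frame(cbind(y_nons, X_nons[, -1, drop = FALSE]))
colnames(dat) <- c(outcome_name, predictors)

formula
str(dat)  # a plain data.frame with columns y, x1, x2
```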
31 changes: 20 additions & 11 deletions R/internals.R
@@ -151,20 +151,29 @@ get_method <- function(method) {
}

ff <- function(formula) {
fff <- as.character(formula)
f <- strsplit(fff[2], "\\s*\\+\\s*")[[1]]
outcome_formulas <- list()
if (any(duplicated(f))) {
warning("No unique names of the outcome variables in formula. The error has been corrected")
f <- unique(f)
}
l <- length(f)
for (i in 1:l) {
outcome_formulas[[i]] <- as.formula(paste(f[i], fff[3], sep = " ~ "))
formula_string <- paste(deparse(formula), collapse = " ")
formula_parts <- strsplit(formula_string, "~")[[1]]
if (length(formula_parts) != 2) {
stop("The formula must contain exactly one '~' operator.")
}

lhs <- trimws(formula_parts[1])
rhs <- trimws(formula_parts[2])

dependent_vars <- strsplit(lhs, "\\s*\\+\\s*")[[1]]
independent_vars <- strsplit(rhs, "\\s*\\+\\s*")[[1]]

if (any(duplicated(dependent_vars))) {
warning("Duplicate dependent variable names detected. They have been made unique.")
dependent_vars <- unique(dependent_vars)
}
outcome_formulas <- vector("list", length(dependent_vars))
l <- length(dependent_vars)
for (i in seq_along(dependent_vars)) {
outcome_formulas[[i]] <- reformulate(termlabels = independent_vars, response = dependent_vars[i])
}
list(
f = f,
f = dependent_vars,
outcomes = outcome_formulas,
l = l
)
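The rewritten `ff()` above parses a multi-outcome formula by deparsing it, splitting on `~`, and building one formula per outcome with `reformulate()`. A toy run of the same steps, on a hypothetical two-outcome formula:

```r
f <- y1 + y2 ~ x1 + x2   # two outcomes, shared right-hand side

formula_string <- paste(deparse(f), collapse = " ")
formula_parts  <- strsplit(formula_string, "~")[[1]]

dependent_vars   <- strsplit(trimws(formula_parts[1]), "\\s*\\+\\s*")[[1]]  # "y1" "y2"
independent_vars <- strsplit(trimws(formula_parts[2]), "\\s*\\+\\s*")[[1]]  # "x1" "x2"

outcome_formulas <- lapply(
  dependent_vars,
  function(v) reformulate(termlabels = independent_vars, response = v)
)
outcome_formulas
# [[1]] y1 ~ x1 + x2
# [[2]] y2 ~ x1 + x2
```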