Skip to content

Commit

Permalink
Merge pull request #22 from ncn-foreigners/dev
Browse files Browse the repository at this point in the history
add another start method, readme update
  • Loading branch information
LukaszChrostowski authored Nov 5, 2023
2 parents 3d91412 + 40d5c06 commit 0a03ca8
Show file tree
Hide file tree
Showing 9 changed files with 187 additions and 93 deletions.
43 changes: 38 additions & 5 deletions R/EstimationMethods.R
Original file line number Diff line number Diff line change
Expand Up @@ -97,11 +97,21 @@ mle <- function(...) {

# initial values for propensity score estimation
if (is.null(start)) {
start <- start_fit(X = X,
R = R,
weights = weights,
weights_rand = weights_rand,
method_selection = method_selection)
if (control_selection$start_type == "glm") {
start <- start_fit(X = X,
R = R,
weights = weights,
weights_rand = weights_rand,
method_selection = method_selection)
} else if (control_selection$start_type == "naive") {
intercept_start <- max_lik(X_nons = X_nons[, 1, drop = FALSE],
X_rand = X_rand[, 1, drop = FALSE],
weights = weights,
weights_rand = weights_rand,
start = 0,
control = control_selection)$theta_hat
start <- c(intercept_start, rep(0, ncol(X_nons) - 1))
}
}

df_reduced <- nrow(X) - length(start)
Expand Down Expand Up @@ -216,13 +226,36 @@ gee <- function(...) {
h = h,
est_method,
maxit,
control_selection,
start,
varcov = FALSE,
...){

method_selection_function <- paste(method_selection, "_model_nonprobsvy", sep = "")
method <- get_method(method = method_selection_function)
inv_link <- method$make_link_inv

if (is.null(start)) {
if (control_selection$start_type == "glm") {
start0 <- start_fit(X = X, # <--- does not work with pop_totals
R = R,
weights = weights,
weights_rand = weights_rand,
method_selection = method_selection)
} else if (control_selection$start_type == "naive") {
start_h <- theta_h_estimation(R = R,
X = X[, 1, drop = FALSE],
weights_rand = weights_rand,
weights = weights,
h = h,
method_selection = method_selection,
maxit = maxit,
start = 0)
start <- c(start_h, rep(0, ncol(X) - 1))
}
}


h_object <- theta_h_estimation(R = R,
X = X,
weights_rand = weights_rand,
Expand Down
11 changes: 9 additions & 2 deletions R/control.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@
#' @param nlambda The number of `lambda` values. Default is 50.
#' @param nfolds The number of folds for cross validation. Default is 10.
#' @param print_level this argument determines the level of printing which is done during the optimization (for propensity score model) process.
#' @param start_type - Type of method for start points for model fitting taking the following values
#' \itemize{
#' \item if \code{glm} then start taken from the glm function called on samples.
#' \item if \code{naive} then start consists of a vector which has the value of an estimated parameter for one-dimensional data (on intercept) and 0 for the rest.
#' }
#'
#' @return List with selected parameters.
#'
Expand All @@ -52,7 +57,8 @@ controlSel <- function(method = "glm.fit", # perhaps another control function fo
lambda_min = .001,
nlambda = 50,
nfolds = 10,
print_level = 0
print_level = 0,
start_type = c("glm", "naive")
) {

list(epsilon = epsilon,
Expand All @@ -72,7 +78,8 @@ controlSel <- function(method = "glm.fit", # perhaps another control function fo
nlambda = nlambda,
nfolds = nfolds,
lambda = lambda,
print_level = print_level
print_level = print_level,
start_type = if(missing(start_type)) "naive" else start_type
)

}
Expand Down
54 changes: 27 additions & 27 deletions R/internals.R
Original file line number Diff line number Diff line change
Expand Up @@ -74,30 +74,30 @@ theta_h_estimation <- function(R,
pop_means = NULL){ # TODO with BERENZ recommendation

p <- ncol(X)
if (is.null(pop_totals) & is.null(pop_means)) {
if (is.null(start)) {
start0 <- start_fit(X = X, # <--- does not work with pop_totals
R = R,
weights = weights,
weights_rand = weights_rand,
method_selection = method_selection)
} else {
start0 <- start
}
} else { # TODO customize start point for fitting with population totals
# start0 <- rep(.8, ncol(X))
# X_pop <- rbind(X, pop_totals)
# weights_randd <- 1
if (is.null(start)) {
start0 <- start_fit(X = X, # <--- does not work with pop_totals
R = R,
weights = weights,
weights_rand = weights_rand,
method_selection = method_selection)
} else {
start0 <- start
}
}
# if (is.null(pop_totals) & is.null(pop_means)) {
# if (is.null(start)) {
# start0 <- start_fit(X = X, # <--- does not work with pop_totals
# R = R,
# weights = weights,
# weights_rand = weights_rand,
# method_selection = method_selection)
# } else {
# start0 <- start
# }
# } else { # TODO customize start point for fitting with population totals
# # start0 <- rep(.8, ncol(X))
# # X_pop <- rbind(X, pop_totals)
# # weights_randd <- 1
# if (is.null(start)) {
# start0 <- start_fit(X = X, # <--- does not work with pop_totals
# R = R,
# weights = weights,
# weights_rand = weights_rand,
# method_selection = method_selection)
# } else {
# start0 <- start
# }
# }
u_theta <- u_theta(R = R,
X = X,
weights = c(weights_rand, weights),
Expand All @@ -112,7 +112,7 @@ theta_h_estimation <- function(R,
method_selection = method_selection,
pop_totals = pop_totals)

root <- nleqslv::nleqslv(x = start0,
root <- nleqslv::nleqslv(x = start,
fn = u_theta,
method = "Newton", # TODO consider the methods
global = "cline", #qline",
Expand All @@ -123,7 +123,7 @@ theta_h_estimation <- function(R,
)


start <- root$x
theta_root <- root$x
if (root$termcd %in% c(2:7, -10)) {
switch(as.character(root$termcd),
"2" = warning("Relatively convergent algorithm when fitting selection model by nleqslv, but user must check if function values are acceptably small."),
Expand All @@ -134,7 +134,7 @@ theta_h_estimation <- function(R,
"7" = warning("Jacobian is unusable when fitting selection model by nleqslv."),
"-10" = warning("user specified Jacobian is incorrect when fitting selection model by nleqslv."))
}
theta_h <- as.vector(start)
theta_h <- as.vector(theta_root)
grad <- u_theta(theta_h)
hess <- u_theta_der(theta_h) # TODO compare with root$jac
#hess <- root$jac
Expand Down
36 changes: 18 additions & 18 deletions R/methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -272,14 +272,14 @@ residuals.nonprobsvy <- function(object,

if (any(c("nonprobsvy_dr", "nonprobsvy_mi") %in% class(object))) {
if (object$control$control_inference$vars_selection == FALSE) {
res_out <- residuals(object$outcome) # TODO for variable selection
res_out <- residuals(object$outcome[[1]]) # TODO for variable selection
} else { # TODO for variable selection
r <- object$outcome$family$residuals
res_out <- switch(type,
"response" = r,
"working" = r/object$outcome$family$mu,
"working" = r/object$outcome[[1]]$family$mu,
# TODO "deviance" =
"pearson" = r/sqrt(object$outcome$family$variance)
"pearson" = r/sqrt(object$outcome[[1]]$family$variance)
)
}
}
Expand Down Expand Up @@ -311,7 +311,7 @@ cooks.distance.nonprobsvy <- function(model, # TODO for variable selection
resids <- residuals(model, type = "pearsonSTD")^2
hats <- hatvalues(model)
res_sel <- (resids * (hats / (length(coef(model))))) # TODO
res_out <- cooks.distance(model$outcome$glm)
res_out <- cooks.distance(model$outcome[[1]])

}
#' @method hatvalues nonprobsvy
Expand All @@ -321,7 +321,7 @@ cooks.distance.nonprobsvy <- function(model, # TODO for variable selection
hatvalues.nonprobsvy <- function(model,
...) { # TODO reduce execution time and glm.fit object and customise to variable selection
if (any(c("nonprobsvy_dr", "nonprobsvy_ipw") %in% class(model))) {
propensity_scores <- model$prop_scores
propensity_scores <- model$prob
W <- Matrix::Diagonal(x = propensity_scores * (1 - propensity_scores))
XWX_inv <- solve(t(model$X) %*% W %*% model$X)
hat_values_sel <- vector(mode = "numeric", length = length(propensity_scores))
Expand All @@ -348,10 +348,10 @@ logLik.nonprobsvy <- function(object, ...) {
if (any(c("nonprobsvy_dr", "nonprobsvy_ipw") %in% class(object))) {
val_sel <- object$selection$log_likelihood
attr(val_sel, "nobs") <- dim(residuals(object, type = "pearson"))[1]
attr(val_sel, "df") <- nrow(object$parameters) #length(object$coefficients)
attr(val_sel, "df") <- length(object$selection$coefficients)
class(val_sel) <- "logLik"
}
val_out <- ifelse(any(c("nonprobsvy_dr", "nonprobsvy_mi") %in% class(object)), logLik(object$outcome), 0) # TODO for gee
val_out <- ifelse(any(c("nonprobsvy_dr", "nonprobsvy_mi") %in% class(object)), logLik(object$outcome[[1]]), 0) # TODO for gee
if (class(object)[2] == "nonprobsvy_mi") val <- c("outcome" = val_out)
if (class(object)[2] == "nonprobsvy_ipw") val <- c("selection" = val_sel)
if (class(object)[2] == "nonprobsvy_dr") val <- c("selection" = val_sel, "outcome" = val_out)
Expand All @@ -363,8 +363,8 @@ logLik.nonprobsvy <- function(object, ...) {
AIC.nonprobsvy <- function(object,
...) {
if (!is.character(object$selection$log_likelihood)) {
if (any(c("nonprobsvy_dr", "nonprobsvy_ipw") %in% class(object))) res_sel <- 2 * (length(object$parameters) - object$selection$log_likelihood)
if (any(c("nonprobsvy_dr", "nonprobsvy_mi") %in% class(object))) res_out <- object$outcome$aic
if (any(c("nonprobsvy_dr", "nonprobsvy_ipw") %in% class(object))) res_sel <- 2 * (length(object$selection$coefficients) - object$selection$log_likelihood)
if (any(c("nonprobsvy_dr", "nonprobsvy_mi") %in% class(object))) res_out <- object$outcome[[1]]$aic
if (class(object)[2] == "nonprobsvy_mi") res <- c("outcome" = res_out)
if (class(object)[2] == "nonprobsvy_ipw") res <- c("selection" = res_sel)
if (class(object)[2] == "nonprobsvy_dr") res <- c("selection" = res_sel, "outcome" = res_out)
Expand All @@ -380,15 +380,15 @@ AIC.nonprobsvy <- function(object,
BIC.nonprobsvy <- function(object,
...) {
if (!is.character(object$selection$log_likelihood)) {
if (any(c("nonprobsvy_dr", "nonprobsvy_ipw") %in% class(object))) res_sel <- length(object$parameters) * log(object$nonprob_size + object$prob_size) - 2 * object$selection$log_likelihood
if (any(c("nonprobsvy_dr", "nonprobsvy_ipw") %in% class(object))) res_sel <- length(object$selection$coefficients) * log(object$nonprob_size + object$prob_size) - 2 * object$selection$log_likelihood
if (any(c("nonprobsvy_dr", "nonprobsvy_mi") %in% class(object))) {

if (!is.null(object$outcome$coefficients)) {
if (!is.null(object$outcome[[1]]$coefficients)) {
if (object$control$control_inference$vars_selection == TRUE) {
options(AIC="BIC")
res_out <- HelpersMG::ExtractAIC.glm(object$outcome)[2]
} else {
res_out <- BIC(object$outcome)
res_out <- BIC(object$outcome[[1]])
}
} else {
res_out <- "not available for this methos"
Expand Down Expand Up @@ -425,17 +425,17 @@ confint.nonprobsvy <- function(object,
if (any(c("nonprobsvy_dr", "nonprobsvy_ipw") %in% class(object))) {
std <- sqrt(diag(vcov(object)[[1]]))
sc <- qnorm(p = 1 - (1 - level) / 2)
res_sel <- data.frame(object$parameters[,1] - sc * std, object$parameters[,1] + sc * std)
res_sel <- data.frame(object$selection$coefficients - sc * std, object$selection$coefficients + sc * std)
colnames(res_sel) <- c(paste0(100 * (1 - level) / 2, "%"),
paste0(100 * (1 - (1 - level) / 2), "%"))
}
if (any(c("nonprobsvy_dr", "nonprobsvy_mi") %in% class(object))) {
if (object$control$control_inference$vars_selection == FALSE) {
res_out <- confint(object$outcome)
res_out <- confint(object$outcome[[1]])
} else {
std <- sqrt(diag(vcov(object)[[1]]))
sc <- qnorm(p = 1 - (1 - level) / 2)
res_out <- data.frame(object$beta[,1] - sc * std, object$beta[,1] + sc * std)
res_out <- data.frame(object$outcome[[1]]$coefficients - sc * std, object$outcome[[1]]$coefficients + sc * std)
colnames(res_out) <- c(paste0(100 * (1 - level) / 2, "%"),
paste0(100 * (1 - (1 - level) / 2), "%"))
}
Expand Down Expand Up @@ -463,14 +463,14 @@ confint.nonprobsvy <- function(object,
vcov.nonprobsvy <- function(object,
...) { # TODO consider different vcov methods for selection and outcome models
if (object$control$control_inference$vars_selection == TRUE) {
if (any(c("nonprobsvy_dr", "nonprobsvy_mi") %in% class(object))) res_out <- object$outcome$variance_covariance
if (any(c("nonprobsvy_dr", "nonprobsvy_mi") %in% class(object))) res_out <- object$outcome[[1]]$variance_covariance
if (any(c("nonprobsvy_dr", "nonprobsvy_ipw") %in% class(object))) res_sel <- object$selection$variance_covariance
if (class(object)[2] == "nonprobsvy_mi") res <- list(outcome = res_out)
if (class(object)[2] == "nonprobsvy_ipw") res <- list(selection = res_sel)
if (class(object)[2] == "nonprobsvy_dr") res <- list(selection = res_sel, outcome = res_out)

} else {
if (any(c("nonprobsvy_dr", "nonprobsvy_mi") %in% class(object))) res_out <- vcov(object$outcome)
if (any(c("nonprobsvy_dr", "nonprobsvy_mi") %in% class(object))) res_out <- vcov(object$outcome[[1]])
if (any(c("nonprobsvy_dr", "nonprobsvy_ipw") %in% class(object))) res_sel <- object$selection$variance_covariance
if (class(object)[2] == "nonprobsvy_mi") res <- list(outcome = res_out)
if (class(object)[2] == "nonprobsvy_ipw") res <- list(selection = res_sel)
Expand All @@ -483,6 +483,6 @@ vcov.nonprobsvy <- function(object,
#' @exportS3Method
deviance.nonprobsvy <- function(object,
...) {
res_out <- object$outcome$deviance
res_out <- object$outcome[[1]]$deviance
# TODO for selection model - use selection object
}
25 changes: 24 additions & 1 deletion R/nonprobDR.R
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,7 @@ nonprobDR <- function(selection,
ps_nons_der <- selection_model$ps_nons_der
est_ps_rand_der <- selection_model$est_ps_rand_der
theta_standard_errors <- sqrt(diag(selection_model$variance_covariance))
names(theta_hat) <- names(selection_model$coefficients) <- colnames(X)

y_rand_pred <- outcome_model$y_rand_pred
y_nons_pred <- outcome_model$y_nons_pred
Expand Down Expand Up @@ -389,7 +390,7 @@ nonprobDR <- function(selection,
#log_likelihood <- est_method_obj$log_likelihood
#df_residual <- est_method_obj$df_residual

names(theta_hat) <- colnames(X)
names(selection_model$theta_hat) <- names(theta_hat) <- colnames(X)
weights_nons <- 1/ps_nons
N_nons <- sum(weights * weights_nons)
N_rand <- sum(weights_rand)
Expand Down Expand Up @@ -475,6 +476,28 @@ nonprobDR <- function(selection,
X_nons <- OutcomeModel$X_nons <- SelectionModel$X_nons <- X[loc_nons,]
SelectionModel$pop_totals <- c(SelectionModel$pop_totals[1], SelectionModel$pop_totals[idx+1])
}

if (is.null(start)) {
if (control_selection$start_type == "glm") {
start <- start_fit(X = SelectionModel$X_nons, # <--- does not work with pop_totals
R = R,
weights = weights,
weights_rand = weights_rand,
method_selection = method_selection)
} else if (control_selection$start_type == "naive") {
start_h <- theta_h_estimation(R = R,
X = SelectionModel$X_nons[,1,drop=FALSE],
weights_rand = weights_rand,
weights = weights,
h = h,
method_selection = method_selection,
start = 0,
maxit = maxit,
pop_totals = SelectionModel$pop_totals[1])$theta_h
start <- c(start_h, rep(0, ncol(X) - 1))
}
}

h_object <- theta_h_estimation(R = R,
X = SelectionModel$X_nons,
weights_rand = NULL,
Expand Down
24 changes: 23 additions & 1 deletion R/nonprobIPW.R
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ nonprobIPW <- function(selection,
est_ps_rand_der <- selection_model$est_ps_rand_der
theta_standard_errors <- sqrt(diag(selection_model$variance_covariance))

names(selection_model$theta_hat) <- colnames(X)
names(theta_hat) <- names(selection_model$theta_hat) <- colnames(X)
weights_nons <- 1/ps_nons
N <- sum(weights * weights_nons)

Expand Down Expand Up @@ -252,6 +252,28 @@ nonprobIPW <- function(selection,
################ ESTIMATION
pop_totals <- model$pop_totals[idx]
}

if (is.null(start)) {
if (control_selection$start_type == "glm") {
start <- start_fit(X = X, # <--- does not work with pop_totals
R = R,
weights = weights,
weights_rand = weights_rand,
method_selection = method_selection)
} else if (control_selection$start_type == "naive") {
start_h <- theta_h_estimation(R = R,
X = X[,1,drop=FALSE],
weights_rand = weights_rand,
weights = weights,
h = h,
method_selection = method_selection,
start = 0,
maxit = maxit,
pop_totals = pop_totals[1])$theta_h
start <- c(start_h, rep(0, ncol(X) - 1))
}
}

h_object <- theta_h_estimation(R = R,
X = X,
weights_rand = weights_rand,
Expand Down
2 changes: 1 addition & 1 deletion README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ You can install the development version of `nonprobsvy` package from [Github](ht
remotes::install_github("ncn-foreigners/nonprobsvy")
```

## Example
## Examples

Simulate example data from the following paper: Kim, Jae Kwang, and Zhonglei Wang. "Sampling techniques for big data analysis." International Statistical Review 87 (2019): S177-S191 [section 5.2]

Expand Down
Loading

0 comments on commit 0a03ca8

Please sign in to comment.