diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
index a3ac618..a1e9e7c 100644
--- a/.github/workflows/R-CMD-check.yaml
+++ b/.github/workflows/R-CMD-check.yaml
@@ -2,9 +2,9 @@
 # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 on:
   push:
-    branches: [main, master]
+    branches: [main, master, package-development]
   pull_request:
-    branches: [main, master]
+    branches: [main, master, package-development]
 
 name: R-CMD-check

diff --git a/R/control.R b/R/control.R
index a35e173..bf5dba7 100644
--- a/R/control.R
+++ b/R/control.R
@@ -20,7 +20,7 @@ controlSel <- function(method = "glm.fit", #perhaps another control function for
                        optim_method = "NR",
                        overlap = FALSE,
                        dependence = FALSE,
-                       h_x = c(1, 2)
+                       h_x = c("1", "2")
                        ) {
 
   list(epsilon = epsilon,
@@ -29,7 +29,7 @@ controlSel <- function(method = "glm.fit", #perhaps another control function for
        optim_method = optim_method,
        overlap = overlap,
        dependence = dependence,
-       h_x = if (missing(h_x)) "1" else h_x
        )
 }
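Side note on the `h_x` change: the estimating-equation variant is consumed downstream by `switch(h, "1" = ..., "2" = ...)` in `u_theta()`, `u_theta_der()` and `loss_theta()`, and `switch()` matches on character names, hence the move from numeric `1`/`2` to `"1"`/`"2"`. A minimal standalone sketch of the dispatch pattern (the function name and variant bodies below are illustrative placeholders mirroring the integrands in `u_theta()`, not package code):

    h_variant <- function(h = c("1", "2")) {
      h <- match.arg(h)
      switch(h,
             "1" = function(x, ps) x / ps,  # h(x) = x / pi(x)
             "2" = function(x, ps) x)       # h(x) = x
    }
    h_variant("2")(c(1, 2), ps = 0.5)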
diff --git a/R/nonprobCV.R b/R/nonprobCV.R
index a17c355..a8be3fa 100644
--- a/R/nonprobCV.R
+++ b/R/nonprobCV.R
@@ -1,118 +1,143 @@
-cv_nonprobsvy <- function(X_rand, X_nons, R, weights_X, method_selection, K = 10) {
+cv_nonprobsvy <- function(X, R, weights_X, method_selection, h, nfolds = 10) {
 
   loc_nons <- which(R == 1)
   loc_rand <- which(R == 0)
-
-  R_nons <- R[loc_nons]
-  R_rand <- R[loc_rand]
-
-  weights_nons <- weights_X[loc_nons]
-  weights_rand <- weights_X[loc_rand]
-
+  # carry the weights and the membership indicator along as the last two columns
+  X_nons <- cbind(X[loc_nons, ], weights_X[loc_nons], R[loc_nons])
+  X_rand <- cbind(X[loc_rand, ], weights_X[loc_rand], R[loc_rand])
   k <- 1
   loss_theta_av <- vector(mode = "numeric", length = 50)
-  lambdas <- seq(from = 0.02, to = 4, by = 0.08)
+  lambdas <- setup_lambda(X = X,
+                          y = R,
+                          weights = weights_X,
+                          method_selection = method_selection,
+                          lambda_min = 0,
+                          nlambda = 50)
+
   X_nons <- X_nons[sample(nrow(X_nons)), ]
   X_rand <- X_rand[sample(nrow(X_rand)), ]
 
-  folds_nons <- cut(seq(1,nrow(X_nons)), breaks=K, labels=FALSE) #split nonprobabability sample into K parts
-  folds_rand <- cut(seq(1,nrow(X_rand)), breaks=K, labels=FALSE) #split probabability sample into K parts
+  folds_nons <- cut(seq(1, nrow(X_nons)), breaks = nfolds, labels = FALSE) # split nonprobability sample into nfolds parts
+  folds_rand <- cut(seq(1, nrow(X_rand)), breaks = nfolds, labels = FALSE) # split probability sample into nfolds parts
 
-  # pair K subsets randomly
-  sample_nons <- sample(1:K, K, replace = FALSE)
-  sample_rand <- sample(1:K, K, replace = FALSE)
+  # pair the nfolds subsets randomly
+  sample_nons <- sample(1:nfolds, nfolds, replace = FALSE)
+  sample_rand <- sample(1:nfolds, nfolds, replace = FALSE)
 
   for (lambda in lambdas) {
 
-    loss_theta_vec <- vector(mode = "numeric", length = K)
+    loss_theta_vec <- vector(mode = "numeric", length = nfolds)
 
-    for(i in 1:K){
+    for (i in 1:nfolds) {
       # train data for X_nons
       idx_nons <- which(folds_nons==sample_nons[i], arr.ind=TRUE)
       X_nons_train <- X_nons[-idx_nons, ]
-      R_nons_train <- R_nons[-idx_nons]
-      weights_nons_train <- weights_nons[-idx_nons]
+
       # test data for X_nons
       X_nons_test <- X_nons[idx_nons, ]
-      R_nons_test <- R_nons[idx_nons]
-      weights_nons_test <- weights_nons[idx_nons]
+
      # train data for X_rand
       idx_rand <- which(folds_rand==sample_rand[i], arr.ind=TRUE)
       X_rand_train <- X_rand[-idx_rand, ]
-      R_rand_train <- R_rand[-idx_rand]
-      weights_rand_train <- weights_rand[-idx_rand]
       # test data for X_rand
       X_rand_test <- X_rand[idx_rand, ]
-      R_rand_test <- R_rand[idx_rand]
-      weights_rand_test <- weights_rand[idx_rand]
 
-      X_train <- rbind(X_rand_train[, -1], X_nons_train[, -1])
-      X_test <- rbind(X_rand_test[, -1], X_nons_test[, -1])
+      X_train <- rbind(X_rand_train, X_nons_train)
+      X_test <- rbind(X_rand_test, X_nons_test)
 
-      R_train <- c(R_rand_train, R_nons_train)
-      R_test <- c(R_rand_test, R_nons_test)
-      weights_X_train <- c(weights_rand_train, weights_nons_train)
-      weights_X_test <- c(weights_rand_test, weights_nons_test)
-
-      p <- ncol(X_test)
-
-      # initial values for set of parameters
-      init_theta <- rep(0, p+1)
-      # variables selection using score equation for theta
-
-      par0 <- c(init_theta)
-      LAMBDA <- Matrix::Matrix(matrix(0, p+1, p+1), sparse = TRUE)
-      it <- 0
-
-      for(jj in 1:100) { # to fix
-        it <- it + 1
-
-        u_theta0 <- u_theta(par = par0, R = R_train, X = X_train,
-                            weights = weights_X_train,
-                            method_selection = method_selection)
-
-        u_theta0_der <- u_theta_der(par = par0, R = R_train, X = X_train,
-                                    weights = weights_X_train,
-                                    method_selection = method_selection)
-
-        diag(LAMBDA) <- abs(q_lambda(par0, lambda))/(1e-6 + abs(par0))
-        par <- par0 + solve(u_theta0_der + LAMBDA, sparse = TRUE) %*% (u_theta0 - LAMBDA %*% par0) # perhaps 'solve' function instead of 'ginv'
-        # equation (13) in the article
-        if (sum(abs(par - par0)) < 1e-6) break;
-        if (sum(abs(par - par0)) > 1000) break;
-
-        par0 <- par
-      }
-
-      par <- as.vector(par)
-      theta_est <- par
-      loss_theta_vec[i] <- loss_theta(par = theta_est, R = R_test, X = X_test,
-                                      weights = weights_X_test, method_selection = method_selection)
+      # the last two columns hold the weights and the R indicator
+      p <- ncol(X_train) - 2
+      theta_est <- fit(X = X_train[, 1:p], R = X_train[, p + 2],
+                       weights = X_train[, p + 1], method_selection = method_selection,
+                       h = h, lambda = lambda)
+
+      loss_theta_vec[i] <- loss_theta(par = theta_est, R = X_test[, p + 2], X = X_test[, 1:p],
+                                      weights = X_test[, p + 1], h = h,
+                                      method_selection = method_selection)
 
     }
 
     loss_theta_av[k] <- mean(loss_theta_vec)
 
     k <- k + 1
 
   }
 
   i <- which.min(loss_theta_av)
 
   print(lambdas[i])
 
   lambdas[i]
 
 }
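The fold pairing above draws two independent permutations of `1:nfolds`, so the i-th CV step tests on fold `sample_nons[i]` of the nonprobability sample joined with fold `sample_rand[i]` of the probability sample. A toy illustration of the pairing scheme (sizes made up, not package code):

    set.seed(42)
    nfolds <- 5
    folds_nons <- cut(seq_len(23), breaks = nfolds, labels = FALSE)
    folds_rand <- cut(seq_len(31), breaks = nfolds, labels = FALSE)

    # random pairing: step i tests on fold sample_nons[i] of A and fold sample_rand[i] of B
    sample_nons <- sample(1:nfolds, nfolds, replace = FALSE)
    sample_rand <- sample(1:nfolds, nfolds, replace = FALSE)
    cbind(step = 1:nfolds, fold_nons = sample_nons, fold_rand = sample_rand)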
+setup_lambda <- function(X, y, weights, method_selection, alpha = 1,
+                         lambda_min, log_lambda = FALSE, nlambda, ...) { # consider a penalty factor here
+
+  fit <- glm.fit(x = X, y = y, weights = weights, family = binomial(link = method_selection))
+  #fit <- glm(y~1, weights = weights, family = binomial(link = method_selection))
+
+  n <- length(y)
+  p <- ncol(X)
+  w <- fit$weights
+  r <- residuals(fit, "working") * w
+  zmax <- max(abs(crossprod(X, r)))/n # max_j |<x_j, r>|/n, a pure-R version of ncvreg's internal 'maxprod'
+  lambda_max <- zmax/alpha
+
+  if (log_lambda) { # lambda sequence on log-scale
+    if (lambda_min == 0) {
+      lambda <- c(exp(seq(log(lambda_max), log(.001*lambda_max), length = nlambda-1)), 0)
+    } else {
+      lambda <- exp(seq(log(lambda_max), log(lambda_min*lambda_max), length = nlambda))
+    }
+  } else { # lambda sequence on linear-scale
+    if (lambda_min == 0) {
+      lambda <- c(seq(lambda_max, 0.001*lambda_max, length = nlambda-1), 0)
+    } else {
+      lambda <- seq(lambda_max, lambda_min*lambda_max, length = nlambda)
+    }
+  }
+  lambda
+}
+
+fit <- function(X, R, weights, method_selection, h, lambda, maxit = 100, eps = 1e-6) {
+
+  p <- ncol(X)
+  init_theta <- rep(0, p)
+  # variable selection using the score equation for theta
+  par0 <- init_theta
+  LAMBDA <- Matrix::Matrix(matrix(0, p, p), sparse = TRUE)
+  it <- 0
+  for (jj in 1:maxit) {
+    it <- it + 1
+    if (it == maxit) break
+
+    u_theta0 <- u_theta(par = par0, R = R, X = X,
+                        weights = weights, h = h,
+                        method_selection = method_selection)
+
+    u_theta0_der <- u_theta_der(par = par0, R = R, X = X,
+                                weights = weights, h = h,
+                                method_selection = method_selection)
+
+    diag(LAMBDA) <- abs(q_lambda(par0, lambda))/(eps + abs(par0))
+    par <- par0 + MASS::ginv(as.matrix(u_theta0_der + LAMBDA)) %*% (u_theta0 - LAMBDA %*% par0) # perhaps 'solve' instead of 'ginv'
+    # equation (13) in the article
+    if (sum(abs(par - par0)) < eps) break
+    if (sum(abs(par - par0)) > 1000) break
+
+    par0 <- par
+
+  }
+
+  par <- as.vector(par)
+  theta_est <- par
+  theta_est
+}
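`setup_lambda()` follows the usual glmnet/ncvreg recipe: `lambda_max` approximates the smallest penalty that zeroes every coefficient, computed as max_j |x_j'r|/n from the weighted null-model working residuals, and the grid decays from `lambda_max` down to `lambda_min * lambda_max` (with an exact 0 appended when `lambda_min = 0`). A quick illustration of the two grid shapes, with made-up values:

    lambda_max <- 0.8
    nlambda <- 10

    # linear scale, lambda_min = 0: last value is exactly 0
    c(seq(lambda_max, 0.001 * lambda_max, length.out = nlambda - 1), 0)

    # log scale, lambda_min = 0.05: geometric decay toward lambda_min * lambda_max
    exp(seq(log(lambda_max), log(0.05 * lambda_max), length.out = nlambda))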
data and svydesign do not match") - } y_nons <- XY_nons[,1] @@ -99,7 +96,7 @@ nonprobSel <- function(selection, n_nons <- nrow(X_nons) n_rand <- nrow(X_rand) - X <- rbind(X_rand[, -1], X_nons[, -1]) # joint matrix + X <- rbind(X_rand, X_nons) # joint matrix #X_stand <- scale(X) weights_X <- c(weights_rand, weights) @@ -119,12 +116,13 @@ nonprobSel <- function(selection, p <- ncol(X) N_rand <- sum(weights_rand) - FITT <- ncvreg::cv.ncvreg(X = X, y = R, + FITT <- ncvreg::cv.ncvreg(X = X[, -1], y = R, weights = weights_X, penalty = penalty, family = "binomial") # for 50 variables this function returns first four variables as the most "important", while my function returns almost all, perhaps because of form of the loss function #print(FITT$fit$beta[,FITT$min]) loss_theta <- vector(mode = "numeric", length = length(FITT$lambda)) theta <- list(vector(mode = "numeric", length = length(FITT$lambda))) + indices <- list(vector(mode = "numeric", length = length(FITT$lambda))) k <- 1 @@ -132,26 +130,28 @@ nonprobSel <- function(selection, # initial values for set of parameters #init_theta <- vector(mode = "numeric", length = p+1) - init_theta <- FITT$fit$beta[, k] - + idxx <- which(FITT$fit$beta[, k] != 0) + init_theta <- rep(0, length(FITT$fit$beta[, k][idxx])) # variables selection using score equation for theta par0 <- init_theta - LAMBDA <- Matrix::Matrix(matrix(0, p+1, p+1), sparse = TRUE) + p <- length(par0) + LAMBDA <- Matrix::Matrix(matrix(0, p, p), sparse = TRUE) it <- 0 for(jj in 1:maxit) { it <- it + 1 + if (it == maxit) break - u_theta0 <- u_theta(par = par0, R = R, X = X, - weights = weights_X, + u_theta0 <- u_theta(par = par0, R = R, X = X[, idxx], + weights = weights_X, h = h, method_selection = method_selection) - u_theta0_der <- u_theta_der(par = par0, R = R, X = X, - weights = weights_X, + u_theta0_der <- u_theta_der(par = par0, R = R, X = X[, idxx], + weights = weights_X, h = h, method_selection = method_selection) diag(LAMBDA) <- abs(q_lambda(par0, lambda))/(eps + abs(par0)) par <- par0 + MASS::ginv(as.matrix(u_theta0_der + LAMBDA)) %*% (u_theta0 - LAMBDA %*% par0) # perhaps 'solve' function instead of 'ginv' - # equation (13) in article + # equation (13) in article if (sum(abs(par - par0)) < eps) break; if (sum(abs(par - par0)) > 1000) break; @@ -160,48 +160,57 @@ nonprobSel <- function(selection, } par <- as.vector(par) - par[which(abs(par) < 1e-3)] <- 0 - iddx <- which(abs(par) != 0) - theta_estt <- par[iddx] - theta[[k]] <- par - Xloss <- cbind(1, X) - - loss_theta[k] <- loss_theta(par = theta_estt, - R = R, - X = as.matrix(Xloss[, iddx]), - weights = weights_X, - method_selection = method_selection) + if (any(par==0)) { + i <- which(par==0) + idxx <- idxx[-i] + } - k <- k + 1 + indices[[k]] <- idxx + theta_estt <- theta[[k]] <- par[par!=0] + if (length(theta[[k]]) == 1) { + loss_theta[k] <- NA + } else { + loss_theta[k] <- loss_theta(par = theta_estt, + R = R, + X = as.matrix(X[, idxx]), + weights = weights_X, + h = h, + method_selection = method_selection) # compute this term without intercept } + k <- k + 1 + } #A <- t(matrix(unlist(theta), nrow = length(theta[[1]]))) #L <- as.matrix(FITT$lambda, ncol = 1) #print(as.data.frame(cbind(L, A))) + min_idx <- which.min(loss_theta) + theta_est <- theta[[min_idx]] + theta_selected <- indices[[min_idx]] - 1 + names(theta_est) <- c("(Intercept)", nons_names[theta_selected[-1]]) - theta_est <- theta[[which.min(loss_theta)]] - theta_selected <- which(abs(theta_est) != 0) - 1 # variables selection for beta using ncvreg package, perhaps 
@@ -160,48 +160,57 @@ nonprobSel <- function(selection,
     }
 
     par <- as.vector(par)
-    par[which(abs(par) < 1e-3)] <- 0
-    iddx <- which(abs(par) != 0)
-    theta_estt <- par[iddx]
-    theta[[k]] <- par
-    Xloss <- cbind(1, X)
-
-    loss_theta[k] <- loss_theta(par = theta_estt,
-                                R = R,
-                                X = as.matrix(Xloss[, iddx]),
-                                weights = weights_X,
-                                method_selection = method_selection)
+    if (any(par == 0)) {
+      i <- which(par == 0)
+      idxx <- idxx[-i]
+    }
 
-    k <- k + 1
+    indices[[k]] <- idxx
+    theta_estt <- theta[[k]] <- par[par != 0]
+    if (length(theta[[k]]) == 1) {
+      loss_theta[k] <- NA
+    } else {
+      loss_theta[k] <- loss_theta(par = theta_estt,
+                                  R = R,
+                                  X = as.matrix(X[, idxx]),
+                                  weights = weights_X,
+                                  h = h,
+                                  method_selection = method_selection) # compute this term without a separate intercept column
     }
+    k <- k + 1
+
   }
 
   #A <- t(matrix(unlist(theta), nrow = length(theta[[1]])))
   #L <- as.matrix(FITT$lambda, ncol = 1)
   #print(as.data.frame(cbind(L, A)))
+  min_idx <- which.min(loss_theta)
+  theta_est <- theta[[min_idx]]
+  theta_selected <- indices[[min_idx]] - 1
+  names(theta_est) <- c("(Intercept)", nons_names[theta_selected[-1]])
 
-  theta_est <- theta[[which.min(loss_theta)]]
-  theta_selected <- which(abs(theta_est) != 0) - 1
 
   # variables selection for beta using ncvreg package, perhaps cv.glmnet for theta like
   #theta <- glmnet::cv.glmnet(X, R, family = "binomial")
   #print(coef(theta))
-  beta <- ncvreg::cv.ncvreg(X = X[loc_nons, ], y = y_nons,
+  beta <- ncvreg::cv.ncvreg(X = X[loc_nons, -1], y = y_nons,
                             penalty = penalty, family = family_outcome)
+
   beta_est <- beta$fit$beta[,beta$min]
   beta_selected <- which(abs(beta_est) != 0) - 1
+  beta_est <- beta_est[beta$fit$beta[,beta$min] != 0]
 
   # Estimating theta, beta parameters using selected variables
   idx <- unique(c(beta_selected[-1], theta_selected[-1])) #excluding intercepts
   psel <- length(idx)
-  Xsel <- as.matrix(X[, idx])
+  Xsel <- as.matrix(X[, idx + 1])
 
-  par0 <- rep(0, 2*(psel+1))
+  par0 <- rep(0, 2*(psel + 1))
 
   # root for joint score equation
   par_sel <- rootSolve::multiroot(u_theta_beta,
@@ -216,6 +225,7 @@ nonprobSel <- function(selection,
 
   theta_sel <- par_sel[1:(psel+1)]
   beta_sel <- par_sel[(psel+2):(2*psel+2)]
+  names(theta_sel) <- names(beta_sel) <- c("(Intercept)", nons_names[idx])
 
   ps_nons_est <- inv_link(as.vector(as.matrix(cbind(1, Xsel)) %*% as.matrix(theta_sel)))
   weights_nons <- 1/ps_nons_est[loc_nons]
 
@@ -246,7 +256,7 @@ nonprobSel <- function(selection,
   svydesign_mean <- survey::svymean(~y_rand, svydesign)
 
   var_prob <- as.vector(attr(svydesign_mean, "var")) # based on survey package, probability component
-  var_nonprob <- (sum((infl1) - 2*infl2) + sum(weights_rand * sigmasqhat))/(N_nons^2) # nonprobability component
+  var_nonprob <- (sum(infl1 - 2*infl2) + sum(weights_rand * sigmasqhat))/N_nons^2 # nonprobability component
 
   se_nonprob <- sqrt(var_nonprob)
   se_prob <- sqrt(var_prob)
 
@@ -351,9 +361,7 @@ nonprobSel <- function(selection,
                  theta_sel = theta_est,
                  beta_sel = beta_est,
                  theta = theta_sel,
-                 beta = beta_sel,
-                 theta_selected_vars = nons_names[theta_selected[-1]],
-                 beta_selected_vars = nons_names[beta_selected[-1]]
+                 beta = beta_sel
                  ),
             class = "Doubly-robust")
 
@@ -369,6 +377,7 @@ u_theta <- function(par,
                     X,
                     weights,
                     method_selection,
+                    h,
                     N = NULL) {
 
@@ -382,17 +391,18 @@ u_theta <- function(par,
 
   inv_link <- method$make_link_inv
 
-  theta <- par
+  theta <- as.matrix(par)
   n <- length(R)
-  X0 <- cbind(1, X)
-  lpiB <- X0 %*% theta
-  ps <- inv_link(lpiB)
+  X0 <- as.matrix(X)
+  eta_pi <- X0 %*% theta
+  ps <- inv_link(eta_pi)
   R_rand <- 1 - R
   ps <- as.vector(ps)
   N_nons <- sum(1/ps)
 
-  eq <- c(apply(X0 * R/ps - X0 * R_rand * weights, 2, sum))/N_nons
-
+  eq <- switch(h,
+               "1" = c(apply(X0 * R/ps - X0 * R_rand * weights, 2, sum))/N_nons,
+               "2" = c(apply(X0 * R - X0 * R_rand * weights * ps, 2, sum))/N_nons)
   eq
 }
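Spelled out, the two `h` variants of the score equation implemented above are as follows (notation read off the code: R_i is the nonprobability-sample indicator, d_i the design weights, pi_i = pi(x_i' theta), and N-hat = sum_i 1/pi_i over the stacked sample; this is a transcription of the `switch()`, not a quotation from the referenced article):

    U_1(\boldsymbol\theta) = \frac{1}{\hat{N}} \sum_i
      \left( \frac{R_i}{\pi_i} - (1 - R_i)\, d_i \right) \mathbf{x}_i
    \qquad
    U_2(\boldsymbol\theta) = \frac{1}{\hat{N}} \sum_i
      \left( R_i - (1 - R_i)\, d_i\, \pi_i \right) \mathbf{x}_i

Variant "2" is variant "1" multiplied through by pi_i, which avoids dividing by small estimated propensities; `loss_theta()` uses the squared norm of the same expressions.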
@@ -405,6 +415,7 @@ u_theta_der <- function(par,
                         X,
                         weights,
                         method_selection,
+                        h,
                         N = NULL) {
 
   method <- method_selection
 
@@ -417,23 +428,45 @@ u_theta_der <- function(par,
 
   inv_link <- method$make_link_inv
 
-  theta <- par
-  p <- ncol(X)
+  theta <- as.matrix(par)
+  X0 <- as.matrix(X)
+  p <- ncol(X0)
   n <- length(R)
-  X0 <- cbind(1, X)
-  lpiB <- X0 %*% theta
-  ps <- inv_link(lpiB)
+  eta_pi <- X0 %*% theta
+  ps <- inv_link(eta_pi)
   ps <- as.vector(ps)
+  R_rand <- 1 - R
+
+  if (method_selection == "probit") {
+    dinv_link <- method$make_link_inv_der
+    psd <- dinv_link(eta_pi)
+    psd <- as.vector(psd)
+  }
+
   N_nons <- sum(1/ps)
 
-  mxDer <- matrix(0,(p+1),(p+1))
+  mxDer <- matrix(0, p, p)
 
-  for (ii in 1:(p+1)) {
-    for (jj in ii:(p+1)) {
-      mxDer[ii,jj] <- mxDer[jj,ii] <- sum(R * (1-ps)/ps * X0[,ii] * X0[,jj])
+  if (h == "1") {
+    for (ii in 1:p) {
+      for (jj in ii:p) {
+        mxDer[ii,jj] <- mxDer[jj,ii] <- switch(method_selection,
+                                               "logit" = sum(R * (1-ps)/ps * X0[,ii] * X0[,jj]),
+                                               "cloglog" = sum(R * (1-ps)/ps^2 * exp(eta_pi) * X0[,ii] * X0[,jj]),
+                                               "probit" = sum(R * psd/ps^2 * X0[,ii] * X0[,jj]))
+      }
+    }
+  } else if (h == "2") {
+    for (ii in 1:p) {
+      for (jj in ii:p) {
+        mxDer[ii,jj] <- mxDer[jj,ii] <- switch(method_selection,
+                                               "logit" = sum(R_rand * weights * ps/(exp(eta_pi)+1) * X0[,ii] * X0[,jj]),
+                                               "cloglog" = sum(R_rand * weights * (1-ps) * exp(eta_pi) * X0[,ii] * X0[,jj]),
+                                               "probit" = sum(R_rand * weights * psd * X0[,ii] * X0[,jj]))
+      }
     }
-  }
+  }
 
   mxDer/N_nons
 }
 
@@ -475,8 +508,8 @@ u_theta_beta <- function(par,
   theta <- par[1:(p+1)]
   beta <- par[(p+2):(2*p+2)]
   X0 <- cbind(1, X)
-  lpiB <- X0 %*% theta
-  ps <- inv_link(lpiB)
+  eta_pi <- X0 %*% theta
+  ps <- inv_link(eta_pi)
   R_rand <- 1 - R
   y[which(is.na(y))] <- 0
   ps <- as.vector(ps)
 
@@ -521,6 +554,7 @@ loss_theta <- function(par,
                        R,
                        X,
                        weights,
+                       h,
                        method_selection) {
 
   method <- method_selection
 
@@ -537,8 +571,8 @@ loss_theta <- function(par,
   theta <- par
   nAB <- length(R)
   X0 <- X
-  lpiB <- X0 %*% theta
-  ps <- inv_link(lpiB)
+  eta_pi <- X0 %*% theta
+  ps <- inv_link(eta_pi)
 
   loc_nons <- which(R == 1)
   loc_rand <- which(R == 0)
 
@@ -548,8 +582,11 @@ loss_theta <- function(par,
   N_est_rand <- sum(weights[loc_rand])
   N_est_nons <- sum(1/ps)
 
-  loss <- sum(apply((X0*R/ps - X0*R_rand*weights), 2, sum)^2)
+  loss <- switch(h,
+                 "1" = sum(apply((X0*R/ps - X0*R_rand*weights), 2, sum)^2),
+                 "2" = sum(apply((X0*R - X0*R_rand*weights*ps), 2, sum)^2))
 
   loss
 }
+
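A cheap smoke test for the analytic derivative is to compare it with a numerical Jacobian of `u_theta()`. A hypothetical check, not part of the package, assuming `u_theta()`, `u_theta_der()` and the logit link helpers are loaded; note that `u_theta_der()` treats the normalising constant `N_nons` as fixed, so the match with the full numerical Jacobian is only approximate:

    library(numDeriv)

    set.seed(1)
    n <- 100
    X0 <- cbind(1, rnorm(n))   # toy design with intercept column
    R <- rbinom(n, 1, 0.5)     # membership indicator
    w <- runif(n, 1, 3)        # toy design weights
    theta0 <- c(0, 0)

    num_jac <- numDeriv::jacobian(function(th)
      u_theta(par = th, R = R, X = X0, weights = w,
              h = "1", method_selection = "logit"),
      theta0)
    ana_der <- u_theta_der(par = theta0, R = R, X = X0, weights = w,
                           h = "1", method_selection = "logit")
    max(abs(-num_jac - ana_der))  # small, up to the dropped derivative of N_nons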