Skip to content

Commit

Permalink
add probit, cloglog and another h function to variables selection
Browse files Browse the repository at this point in the history
  • Loading branch information
LukaszChrostowski committed Mar 7, 2023
1 parent e77f2fd commit 389f0ea
Show file tree
Hide file tree
Showing 6 changed files with 185 additions and 121 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
branches: [main, master]
branches: [main, master, package-development]
pull_request:
branches: [main, master]
branches: [main, master, package-development]

name: R-CMD-check

Expand Down
4 changes: 2 additions & 2 deletions R/control.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ controlSel <- function(method = "glm.fit", #perhaps another control function for
optim_method = "NR",
overlap = FALSE,
dependence = FALSE,
h_x = c(1, 2)
h_x = c("1", "2")
) {

list(epsilon = epsilon,
Expand All @@ -29,7 +29,7 @@ controlSel <- function(method = "glm.fit", #perhaps another control function for
optim_method = optim_method,
overlap = overlap,
dependence = dependence,
h_x = if(missing(h_x)) 1 else h_x
h_x = if(missing(h_x)) "1" else h_x
)

}
Expand Down
147 changes: 86 additions & 61 deletions R/nonprobCV.R
Original file line number Diff line number Diff line change
@@ -1,118 +1,143 @@
cv_nonprobsvy <- function(X, R, weights_X, method_selection, h, nfolds = 10) {
  # K-fold cross-validation for the penalty parameter lambda used in the
  # variable-selection step of the propensity-score model.
  #
  # X                - covariate data for both samples stacked row-wise
  # R                - membership indicator: 1 = non-probability sample,
  #                    0 = probability (reference) sample
  # weights_X        - unit-level weights, aligned with the rows of X
  # method_selection - link for the propensity model (logit/probit/cloglog)
  # h                - choice of the h() function in the score equation
  # nfolds           - number of cross-validation folds
  #
  # Returns the lambda with the smallest average test loss.
  # NOTE(review): this span was a mangled diff interleave of the old and new
  # implementations; reconstructed here as the new (post-commit) version.

  loc_nons <- which(R == 1)
  loc_rand <- which(R == 0)

  # Carry the weights and the R indicator along as named columns so they stay
  # aligned with the covariates through shuffling and fold splitting.
  # Assumes X is a data.frame so $weights_X / $R access works -- TODO confirm.
  X_nons <- cbind(X[loc_nons, ], weights_X = weights_X[loc_nons], R = R[loc_nons])
  # fixed: original indexed R with loc_nons for the probability sample
  X_rand <- cbind(X[loc_rand, ], weights_X = weights_X[loc_rand], R = R[loc_rand])

  lambdas <- setup_lambda(X = X,
                          y = R,
                          weights = weights_X,
                          method_selection = method_selection,
                          lambda_min = 0,
                          nlambda = 50)
  # fixed: length was hard-coded to 50 independently of nlambda
  loss_theta_av <- vector(mode = "numeric", length = length(lambdas))

  # shuffle rows before cutting into folds
  X_nons <- X_nons[sample(nrow(X_nons)), ]
  X_rand <- X_rand[sample(nrow(X_rand)), ]

  folds_nons <- cut(seq(1, nrow(X_nons)), breaks = nfolds, labels = FALSE) # split nonprobability sample into nfolds parts
  folds_rand <- cut(seq(1, nrow(X_rand)), breaks = nfolds, labels = FALSE) # split probability sample into nfolds parts

  # pair the fold subsets of the two samples at random
  sample_nons <- sample(1:nfolds, nfolds, replace = FALSE)
  sample_rand <- sample(1:nfolds, nfolds, replace = FALSE)

  k <- 1
  for (lambda in lambdas) {

    loss_theta_vec <- vector(mode = "numeric", length = nfolds)

    for (i in 1:nfolds) {

      # train/test split for the non-probability sample
      idx_nons <- which(folds_nons == sample_nons[i], arr.ind = TRUE)
      X_nons_train <- X_nons[-idx_nons, ]
      X_nons_test <- X_nons[idx_nons, ]

      # train/test split for the probability sample
      idx_rand <- which(folds_rand == sample_rand[i], arr.ind = TRUE)
      X_rand_train <- X_rand[-idx_rand, ]
      X_rand_test <- X_rand[idx_rand, ]

      X_train <- rbind(X_rand_train, X_nons_train)
      X_test <- rbind(X_rand_test, X_nons_test)

      # fixed: original wrote do.call("fit", X_train, X_train$R, ...), which is
      # not the do.call calling convention (args must be a list), and passed
      # the TEST weights to the training fit.
      # NOTE(review): fit() does not currently receive lambda -- the penalty
      # level being cross-validated is not forwarded; confirm intended wiring.
      theta_est <- fit(X = X_train,
                       R = X_train$R,
                       weights = X_train$weights_X,
                       method_selection = method_selection,
                       h = h)

      loss_theta_vec[i] <- loss_theta(par = theta_est, R = X_test$R, X = X_test,
                                      weights = X_test$weights_X, h = h,
                                      method_selection = method_selection)

    }

    loss_theta_av[k] <- mean(loss_theta_vec)
    k <- k + 1

  }

  i <- which.min(loss_theta_av)

  print(lambdas[i]) # NOTE(review): debug output -- consider message()/verbose flag

  lambdas[i]

}

setup_lambda <- function(X, y, weights, method_selection, alpha = 1,
                         lambda_min, log.lambda = FALSE, nlambda, ...) { # consider penalty factor here
  # Builds a decreasing sequence of `nlambda` penalty values for variable
  # selection, anchored at the smallest lambda that shrinks every coefficient
  # to zero (same construction as the ncvreg package uses).
  #
  # X, y, weights    - design matrix, 0/1 response, unit weights
  # method_selection - link name for the binomial working model
  # alpha            - elastic-net mixing parameter (1 = pure lasso scaling)
  # lambda_min       - smallest lambda as a fraction of lambda.max (0 adds an
  #                    exact-zero endpoint)
  # log.lambda       - spacing of the sequence: log-scale vs linear-scale
  #                    (fixed: signature was garbled as `log,lambda = FALSE`
  #                    and the body mixed `lambda_min`/`lambda.min` -- unified
  #                    on the argument names declared here)

  # weighted working-model fit; residuals drive the lambda.max bound
  fit <- glm.fit(x = X, y = y, weights = weights,
                 family = binomial(link = method_selection))
  # fit <- glm(y~1, weights = weights, family = binomial(link = method_selection))

  n <- length(y)
  p <- ncol(X)
  w <- fit$weights
  r <- residuals(fit, "working") * w
  # largest absolute predictor/residual inner product
  # NOTE(review): "maxprod" is a compiled routine (ncvreg ships one) --
  # confirm it is registered in this package's shared library
  zmax <- .Call("maxprod", X, r) / n
  lambda.max <- zmax / alpha

  if (log.lambda) { # lambda sequence on log-scale
    if (lambda_min == 0) {
      lambda <- c(exp(seq(log(lambda.max), log(0.001 * lambda.max),
                          length = nlambda - 1)), 0)
    } else {
      lambda <- exp(seq(log(lambda.max), log(lambda_min * lambda.max),
                        length = nlambda))
    }
  } else { # lambda sequence on linear-scale
    if (lambda_min == 0) {
      lambda <- c(seq(lambda.max, 0.001 * lambda.max, length = nlambda - 1), 0)
    } else {
      lambda <- seq(lambda.max, lambda_min * lambda.max, length = nlambda)
    }
  }
  lambda
}

fit <- function(X, R, weights, method_selection, h,
                lambda = 0, maxit = 100, eps = 1e-6) {
  # Estimates the propensity-model parameters theta by solving the penalised
  # score equation with a Newton-type update and a local quadratic
  # approximation of the penalty (equation (13) in the reference article).
  #
  # X, R, weights    - design matrix, 0/1 membership indicator, unit weights
  # method_selection - link for the propensity model (logit/probit/cloglog)
  # h                - choice of the h() function in the score equation
  # lambda           - penalty level (fixed: was read as an undefined global;
  #                    default 0 keeps the unpenalised behaviour)
  # maxit, eps       - iteration cap and convergence tolerance (fixed: both
  #                    were undefined free variables in the original body)
  #
  # Returns the estimated theta as a plain numeric vector.

  p <- ncol(X)
  init_theta <- rep(0, p) # starting values for theta
  # variables selection using score equation for theta
  par0 <- init_theta
  par <- par0 # fixed: 'par' was undefined if the loop exited on iteration 1
  LAMBDA <- Matrix::Matrix(matrix(0, p, p), sparse = TRUE)
  it <- 0
  for (jj in 1:maxit) {
    it <- it + 1
    if (it == maxit) break

    # score and score derivative at the current theta
    # fixed: weights argument was the undefined name 'weights_X'
    u_theta0 <- u_theta(par = par0, R = R, X = X,
                        weights = weights, h = h,
                        method_selection = method_selection)

    u_theta0_der <- u_theta_der(par = par0, R = R, X = X,
                                weights = weights, h = h,
                                method_selection = method_selection)

    # local quadratic approximation of the penalty on the diagonal
    diag(LAMBDA) <- abs(q_lambda(par0, lambda)) / (eps + abs(par0))
    # equation (13) in article; ginv guards against a singular system
    # (perhaps 'solve' instead of 'ginv' once invertibility is ensured)
    par <- par0 + MASS::ginv(as.matrix(u_theta0_der + LAMBDA)) %*%
      (u_theta0 - LAMBDA %*% par0)

    if (sum(abs(par - par0)) < eps) break   # converged
    if (sum(abs(par - par0)) > 1000) break  # diverged -- bail out

    par0 <- par

  }

  as.vector(par)

}
1 change: 1 addition & 0 deletions R/nonprobDR.R
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ nonprobDR <- function(selection,
est_ps_rand <- maxLik_rand_obj$ps
hess <- maxLik_nons_obj$hess
theta_hat <- maxLik_nons_obj$theta_hat
names(theta_hat) <- c("(Intercept)", nons_names)
log_likelihood <- log_like(theta_hat) # maximum of the loglikelihood function


Expand Down
1 change: 1 addition & 0 deletions R/nonprobIPW.R
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ nonprobIPW <- function(selection,
est_ps_rand <- maxLik_rand_obj$ps
hess <- maxLik_nons_obj$hess
theta_hat <- maxLik_nons_obj$theta_hat
names(theta_hat) <- c("(Intercept)", nons_names)
log_likelihood <- log_like(theta_hat) # maximum of the loglikelihood function

# to complete
Expand Down
Loading

0 comments on commit 389f0ea

Please sign in to comment.