Merge pull request #22 from ncn-foreigners/dev

add another start method, readme update
ncn-foreigners · Nov 5, 2023 · 0a03ca8 · 0a03ca8
2 parents 3d91412 + 40d5c06
commit 0a03ca8
Show file tree

Hide file tree

Showing 9 changed files with 187 additions and 93 deletions.
diff --git a/R/EstimationMethods.R b/R/EstimationMethods.R
@@ -97,11 +97,21 @@ mle <- function(...) {
 
     # initial values for propensity score estimation
     if (is.null(start)) {
-      start <- start_fit(X = X,
-                         R = R,
-                         weights = weights,
-                         weights_rand = weights_rand,
-                         method_selection = method_selection)
+      if (control_selection$start_type == "glm") {
+        start <- start_fit(X = X,
+                           R = R,
+                           weights = weights,
+                           weights_rand = weights_rand,
+                           method_selection = method_selection)
+      } else if (control_selection$start_type == "naive") {
+        intercept_start <- max_lik(X_nons = X_nons[, 1, drop = FALSE],
+                                   X_rand = X_rand[, 1, drop = FALSE],
+                                   weights = weights,
+                                   weights_rand = weights_rand,
+                                   start = 0,
+                                   control = control_selection)$theta_hat
+        start <- c(intercept_start, rep(0, ncol(X_nons) - 1))
+      }
     }
 
     df_reduced <- nrow(X) - length(start)
@@ -216,13 +226,36 @@ gee <- function(...) {
                               h = h,
                               est_method,
                               maxit,
+                              control_selection,
                               start,
                               varcov = FALSE,
                               ...){
 
     method_selection_function <- paste(method_selection, "_model_nonprobsvy", sep = "")
     method <- get_method(method = method_selection_function)
     inv_link <- method$make_link_inv
+
+    if (is.null(start)) {
+      if (control_selection$start_type == "glm") {
+        start0 <- start_fit(X = X, # <--- does not work with pop_totals
+                            R = R,
+                            weights = weights,
+                            weights_rand = weights_rand,
+                            method_selection = method_selection)
+      } else if (control_selection$start_type == "naive") {
+        start_h <- theta_h_estimation(R = R,
+                                      X = X[, 1, drop = FALSE],
+                                      weights_rand = weights_rand,
+                                      weights = weights,
+                                      h = h,
+                                      method_selection = method_selection,
+                                      maxit = maxit,
+                                      start = 0)
+        start <- c(start_h, rep(0, ncol(X) - 1))
+      }
+  }
+
+
     h_object <- theta_h_estimation(R = R,
                                    X = X,
                                    weights_rand = weights_rand,

diff --git a/R/control.R b/R/control.R
@@ -29,6 +29,11 @@
 #' @param nlambda The number of `lambda` values. Default is 50.
 #' @param nfolds The number of folds for cross validation. Default is 10.
 #' @param print_level this argument determines the level of printing which is done during the optimization (for propensity score model) process.
+#' @param start_type - Type of method for start points for model fitting taking the following values
+#' \itemize{
+#' \item if \code{glm} then start taken from the glm function called on samples.
+#' \item if \code{naive} then start consists of a vector which has the value of an estimated parameter for one-dimensional data (on intercept) and 0 for the rest.
+#' }
 #'
 #' @return List with selected parameters.
 #'
@@ -52,7 +57,8 @@ controlSel <- function(method = "glm.fit", # perhaps another control function fo
                        lambda_min = .001,
                        nlambda = 50,
                        nfolds = 10,
-                       print_level = 0
+                       print_level = 0,
+                       start_type = c("glm", "naive")
                        ) {
 
   list(epsilon = epsilon,
@@ -72,7 +78,8 @@ controlSel <- function(method = "glm.fit", # perhaps another control function fo
        nlambda = nlambda,
        nfolds = nfolds,
        lambda = lambda,
-       print_level = print_level
+       print_level = print_level,
+       start_type = if(missing(start_type)) "naive" else start_type
       )
 
 }

diff --git a/R/internals.R b/R/internals.R
@@ -74,30 +74,30 @@ theta_h_estimation <- function(R,
                                pop_means = NULL){ # TODO with BERENZ recommendation
 
   p <- ncol(X)
-  if (is.null(pop_totals) & is.null(pop_means)) {
-    if (is.null(start)) {
-      start0 <- start_fit(X = X, # <--- does not work with pop_totals
-                          R = R,
-                          weights = weights,
-                          weights_rand = weights_rand,
-                          method_selection = method_selection)
-    } else {
-      start0 <- start
-    }
-  } else { # TODO customize start point for fitting with population totals
-    # start0 <- rep(.8, ncol(X))
-    # X_pop <- rbind(X, pop_totals)
-    # weights_randd <- 1
-    if (is.null(start)) {
-      start0 <- start_fit(X = X, # <--- does not work with pop_totals
-                          R = R,
-                          weights = weights,
-                          weights_rand = weights_rand,
-                          method_selection = method_selection)
-    } else {
-      start0 <- start
-    }
-  }
+  # if (is.null(pop_totals) & is.null(pop_means)) {
+  #   if (is.null(start)) {
+  #     start0 <- start_fit(X = X, # <--- does not work with pop_totals
+  #                         R = R,
+  #                         weights = weights,
+  #                         weights_rand = weights_rand,
+  #                         method_selection = method_selection)
+  #   } else {
+  #     start0 <- start
+  #   }
+  # } else { # TODO customize start point for fitting with population totals
+  #   # start0 <- rep(.8, ncol(X))
+  #   # X_pop <- rbind(X, pop_totals)
+  #   # weights_randd <- 1
+  #   if (is.null(start)) {
+  #     start0 <- start_fit(X = X, # <--- does not work with pop_totals
+  #                         R = R,
+  #                         weights = weights,
+  #                         weights_rand = weights_rand,
+  #                         method_selection = method_selection)
+  #   } else {
+  #     start0 <- start
+  #   }
+  # }
   u_theta <- u_theta(R = R,
                      X = X,
                      weights = c(weights_rand, weights),
@@ -112,7 +112,7 @@ theta_h_estimation <- function(R,
                              method_selection = method_selection,
                              pop_totals = pop_totals)
 
-  root <- nleqslv::nleqslv(x = start0,
+  root <- nleqslv::nleqslv(x = start,
                            fn = u_theta,
                            method = "Newton", # TODO consider the methods
                            global = "cline", #qline",
@@ -123,7 +123,7 @@ theta_h_estimation <- function(R,
                            )
 
 
-  start <- root$x
+  theta_root <- root$x
   if (root$termcd %in% c(2:7, -10)) {
     switch(as.character(root$termcd),
            "2" = warning("Relatively convergent algorithm when fitting selection model by nleqslv, but user must check if function values are acceptably small."),
@@ -134,7 +134,7 @@ theta_h_estimation <- function(R,
            "7" = warning("Jacobian is unusable when fitting selection model by nleqslv."),
            "-10" = warning("user specified Jacobian is incorrect when fitting selection model by nleqslv."))
   }
-  theta_h <- as.vector(start)
+  theta_h <- as.vector(theta_root)
   grad <- u_theta(theta_h)
   hess <- u_theta_der(theta_h) # TODO compare with root$jac
   #hess <- root$jac

diff --git a/R/methods.R b/R/methods.R
@@ -272,14 +272,14 @@ residuals.nonprobsvy <- function(object,
 
   if (any(c("nonprobsvy_dr", "nonprobsvy_mi") %in% class(object))) {
     if (object$control$control_inference$vars_selection == FALSE) {
-      res_out <- residuals(object$outcome) # TODO for variable selection
+      res_out <- residuals(object$outcome[[1]]) # TODO for variable selection
     } else { # TODO for variable selection
       r <- object$outcome$family$residuals
       res_out <- switch(type,
                         "response" = r,
-                        "working" = r/object$outcome$family$mu,
+                        "working" = r/object$outcome[[1]]$family$mu,
                         # TODO "deviance" =
-                        "pearson" = r/sqrt(object$outcome$family$variance)
+                        "pearson" = r/sqrt(object$outcome[[1]]$family$variance)
                         )
     }
   }
@@ -311,7 +311,7 @@ cooks.distance.nonprobsvy <- function(model, # TODO for variable selection
   resids <- residuals(model, type = "pearsonSTD")^2
   hats <- hatvalues(model)
   res_sel <- (resids * (hats / (length(coef(model))))) # TODO
-  res_out <- cooks.distance(model$outcome$glm)
+  res_out <- cooks.distance(model$outcome[[1]])
 
 }
 #' @method hatvalues nonprobsvy
@@ -321,7 +321,7 @@ cooks.distance.nonprobsvy <- function(model, # TODO for variable selection
 hatvalues.nonprobsvy <- function(model,
                                  ...) { # TODO reduce execution time and glm.fit object and customise to variable selection
   if (any(c("nonprobsvy_dr", "nonprobsvy_ipw") %in% class(model))) {
-    propensity_scores <- model$prop_scores
+    propensity_scores <- model$prob
     W <- Matrix::Diagonal(x = propensity_scores * (1 - propensity_scores))
     XWX_inv <-  solve(t(model$X) %*% W %*% model$X)
     hat_values_sel <- vector(mode = "numeric", length = length(propensity_scores))
@@ -348,10 +348,10 @@ logLik.nonprobsvy <- function(object, ...) {
   if (any(c("nonprobsvy_dr", "nonprobsvy_ipw") %in% class(object))) {
     val_sel <- object$selection$log_likelihood
     attr(val_sel, "nobs") <- dim(residuals(object, type = "pearson"))[1]
-    attr(val_sel, "df") <- nrow(object$parameters) #length(object$coefficients)
+    attr(val_sel, "df") <- length(object$selection$coefficients)
     class(val_sel) <- "logLik"
   }
-  val_out <- ifelse(any(c("nonprobsvy_dr", "nonprobsvy_mi") %in% class(object)), logLik(object$outcome), 0) # TODO for gee
+  val_out <- ifelse(any(c("nonprobsvy_dr", "nonprobsvy_mi") %in% class(object)), logLik(object$outcome[[1]]), 0) # TODO for gee
   if (class(object)[2] == "nonprobsvy_mi") val <- c("outcome" = val_out)
   if (class(object)[2] == "nonprobsvy_ipw") val <- c("selection" = val_sel)
   if (class(object)[2] == "nonprobsvy_dr") val <- c("selection" = val_sel, "outcome" = val_out)
@@ -363,8 +363,8 @@ logLik.nonprobsvy <- function(object, ...) {
 AIC.nonprobsvy <- function(object,
                            ...) {
   if (!is.character(object$selection$log_likelihood)) {
-    if (any(c("nonprobsvy_dr", "nonprobsvy_ipw") %in% class(object))) res_sel <- 2 * (length(object$parameters) - object$selection$log_likelihood)
-    if (any(c("nonprobsvy_dr", "nonprobsvy_mi") %in% class(object))) res_out <- object$outcome$aic
+    if (any(c("nonprobsvy_dr", "nonprobsvy_ipw") %in% class(object))) res_sel <- 2 * (length(object$selection$coefficients) - object$selection$log_likelihood)
+    if (any(c("nonprobsvy_dr", "nonprobsvy_mi") %in% class(object))) res_out <- object$outcome[[1]]$aic
     if (class(object)[2] == "nonprobsvy_mi") res <- c("outcome" = res_out)
     if (class(object)[2] == "nonprobsvy_ipw") res <- c("selection" = res_sel)
     if (class(object)[2] == "nonprobsvy_dr") res <- c("selection" = res_sel, "outcome" = res_out)
@@ -380,15 +380,15 @@ AIC.nonprobsvy <- function(object,
 BIC.nonprobsvy <- function(object,
                            ...) {
   if (!is.character(object$selection$log_likelihood)) {
-    if (any(c("nonprobsvy_dr", "nonprobsvy_ipw") %in% class(object))) res_sel <- length(object$parameters) * log(object$nonprob_size + object$prob_size) - 2 * object$selection$log_likelihood
+    if (any(c("nonprobsvy_dr", "nonprobsvy_ipw") %in% class(object))) res_sel <- length(object$selection$coefficients) * log(object$nonprob_size + object$prob_size) - 2 * object$selection$log_likelihood
     if (any(c("nonprobsvy_dr", "nonprobsvy_mi") %in% class(object))) {
 
-      if (!is.null(object$outcome$coefficients)) {
+      if (!is.null(object$outcome[[1]]$coefficients)) {
         if (object$control$control_inference$vars_selection == TRUE) {
           options(AIC="BIC")
           res_out <- HelpersMG::ExtractAIC.glm(object$outcome)[2]
         } else {
-          res_out <- BIC(object$outcome)
+          res_out <- BIC(object$outcome[[1]])
         }
       } else {
         res_out <- "not available for this methos"
@@ -425,17 +425,17 @@ confint.nonprobsvy <- function(object,
   if (any(c("nonprobsvy_dr", "nonprobsvy_ipw") %in% class(object))) {
     std <- sqrt(diag(vcov(object)[[1]]))
     sc <- qnorm(p = 1 - (1 - level) / 2)
-    res_sel <- data.frame(object$parameters[,1] - sc * std, object$parameters[,1] + sc * std)
+    res_sel <- data.frame(object$selection$coefficients - sc * std, object$selection$coefficients + sc * std)
     colnames(res_sel) <- c(paste0(100 * (1 - level) / 2, "%"),
                            paste0(100 * (1 - (1 - level) / 2), "%"))
   }
   if (any(c("nonprobsvy_dr", "nonprobsvy_mi") %in% class(object))) {
     if (object$control$control_inference$vars_selection == FALSE) {
-    res_out <- confint(object$outcome)
+    res_out <- confint(object$outcome[[1]])
     } else {
       std <- sqrt(diag(vcov(object)[[1]]))
       sc <- qnorm(p = 1 - (1 - level) / 2)
-      res_out <- data.frame(object$beta[,1] - sc * std, object$beta[,1] + sc * std)
+      res_out <- data.frame(object$outcome[[1]]$coefficients - sc * std, object$outcome[[1]]$coefficients + sc * std)
       colnames(res_out) <- c(paste0(100 * (1 - level) / 2, "%"),
                              paste0(100 * (1 - (1 - level) / 2), "%"))
     }
@@ -463,14 +463,14 @@ confint.nonprobsvy <- function(object,
 vcov.nonprobsvy <- function(object,
                             ...) { # TODO consider different vcov methods for selection and outcome models
   if (object$control$control_inference$vars_selection == TRUE) {
-    if (any(c("nonprobsvy_dr", "nonprobsvy_mi") %in% class(object))) res_out <- object$outcome$variance_covariance
+    if (any(c("nonprobsvy_dr", "nonprobsvy_mi") %in% class(object))) res_out <- object$outcome[[1]]$variance_covariance
     if (any(c("nonprobsvy_dr", "nonprobsvy_ipw") %in% class(object)))  res_sel <- object$selection$variance_covariance
     if (class(object)[2] == "nonprobsvy_mi") res <- list(outcome = res_out)
     if (class(object)[2] == "nonprobsvy_ipw") res <- list(selection = res_sel)
     if (class(object)[2] == "nonprobsvy_dr") res <- list(selection = res_sel, outcome = res_out)
 
   } else {
-    if (any(c("nonprobsvy_dr", "nonprobsvy_mi") %in% class(object))) res_out <- vcov(object$outcome)
+    if (any(c("nonprobsvy_dr", "nonprobsvy_mi") %in% class(object))) res_out <- vcov(object$outcome[[1]])
     if (any(c("nonprobsvy_dr", "nonprobsvy_ipw") %in% class(object)))  res_sel <- object$selection$variance_covariance
     if (class(object)[2] == "nonprobsvy_mi") res <- list(outcome = res_out)
     if (class(object)[2] == "nonprobsvy_ipw") res <- list(selection = res_sel)
@@ -483,6 +483,6 @@ vcov.nonprobsvy <- function(object,
 #' @exportS3Method
 deviance.nonprobsvy <- function(object,
                                 ...) {
-  res_out <- object$outcome$deviance
+  res_out <- object$outcome[[1]]$deviance
   # TODO for selection model - use selection object
 }
diff --git a/R/nonprobDR.R b/R/nonprobDR.R
@@ -290,6 +290,7 @@ nonprobDR <- function(selection,
         ps_nons_der <- selection_model$ps_nons_der
         est_ps_rand_der <- selection_model$est_ps_rand_der
         theta_standard_errors <- sqrt(diag(selection_model$variance_covariance))
+        names(theta_hat) <- names(selection_model$coefficients) <- colnames(X)
 
         y_rand_pred <- outcome_model$y_rand_pred
         y_nons_pred <- outcome_model$y_nons_pred
@@ -389,7 +390,7 @@ nonprobDR <- function(selection,
         #log_likelihood <- est_method_obj$log_likelihood
         #df_residual <- est_method_obj$df_residual
 
-        names(theta_hat) <- colnames(X)
+        names(selection_model$theta_hat) <- names(theta_hat) <- colnames(X)
         weights_nons <- 1/ps_nons
         N_nons <- sum(weights * weights_nons)
         N_rand <- sum(weights_rand)
@@ -475,6 +476,28 @@ nonprobDR <- function(selection,
         X_nons <- OutcomeModel$X_nons <- SelectionModel$X_nons <- X[loc_nons,]
         SelectionModel$pop_totals <- c(SelectionModel$pop_totals[1], SelectionModel$pop_totals[idx+1])
       }
+
+      if (is.null(start)) {
+        if (control_selection$start_type == "glm") {
+          start <- start_fit(X = SelectionModel$X_nons, # <--- does not work with pop_totals
+                             R = R,
+                             weights = weights,
+                             weights_rand = weights_rand,
+                             method_selection = method_selection)
+        } else if (control_selection$start_type == "naive") {
+          start_h <- theta_h_estimation(R = R,
+                                        X = SelectionModel$X_nons[,1,drop=FALSE],
+                                        weights_rand = weights_rand,
+                                        weights = weights,
+                                        h = h,
+                                        method_selection = method_selection,
+                                        start = 0,
+                                        maxit = maxit,
+                                        pop_totals = SelectionModel$pop_totals[1])$theta_h
+          start <- c(start_h, rep(0, ncol(X) - 1))
+        }
+      }
+
       h_object <- theta_h_estimation(R = R,
                                      X = SelectionModel$X_nons,
                                      weights_rand = NULL,

diff --git a/R/nonprobIPW.R b/R/nonprobIPW.R
@@ -163,7 +163,7 @@ nonprobIPW <- function(selection,
         est_ps_rand_der <- selection_model$est_ps_rand_der
         theta_standard_errors <- sqrt(diag(selection_model$variance_covariance))
 
-        names(selection_model$theta_hat) <- colnames(X)
+        names(theta_hat) <- names(selection_model$theta_hat) <- colnames(X)
         weights_nons <- 1/ps_nons
         N <- sum(weights * weights_nons)
 
@@ -252,6 +252,28 @@ nonprobIPW <- function(selection,
           ################ ESTIMATION
           pop_totals <- model$pop_totals[idx]
         }
+
+        if (is.null(start)) {
+          if (control_selection$start_type == "glm") {
+            start <- start_fit(X = X, # <--- does not work with pop_totals
+                               R = R,
+                               weights = weights,
+                               weights_rand = weights_rand,
+                               method_selection = method_selection)
+          } else if (control_selection$start_type == "naive") {
+            start_h <- theta_h_estimation(R = R,
+                                          X = X[,1,drop=FALSE],
+                                          weights_rand = weights_rand,
+                                          weights = weights,
+                                          h = h,
+                                          method_selection = method_selection,
+                                          start = 0,
+                                          maxit = maxit,
+                                          pop_totals = pop_totals[1])$theta_h
+            start <- c(start_h, rep(0, ncol(X) - 1))
+          }
+        }
+
         h_object <- theta_h_estimation(R = R,
                                        X = X,
                                        weights_rand = weights_rand,

diff --git a/README.Rmd b/README.Rmd
@@ -36,7 +36,7 @@ You can install the development version of `nonprobsvy` package from [Github](ht
 remotes::install_github("ncn-foreigners/nonprobsvy")
 ```
 
-## Example
+## Examples
 
 Simulate example data from the following paper: Kim, Jae Kwang, and Zhonglei Wang. "Sampling techniques for big data analysis." International Statistical Review 87 (2019): S177-S191 [section 5.2]