Iss71 #72

Open · wants to merge 3 commits into base: main
8 changes: 5 additions & 3 deletions R/add_discriminator_auc.R
@@ -4,25 +4,27 @@
#' added using add_propensities())
#' @param split A logical for if the metric should be calculated separately for
#' the training/testing split. Defaults to TRUE.
#'
#' @param group A list of variable names to group by
#'
#' @return A discrimination object with propensities (likely added using
#' add_propensities()) with discriminator AUC
#'
#' @export
#'
add_discriminator_auc <- function(discrimination, split = TRUE) {
add_discriminator_auc <- function(discrimination, group = c(),split = TRUE) {
Contributor:
Do you have an example of using c() like this in a function? Typically, we would write group = NULL.

Author:
I don't have an example, group = NULL is better!
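
A minimal sketch (not part of the PR) of why group = NULL is the conventional default: NULL drops out of c(), so the same grouping call works with or without user-supplied variables. The propensities data frame and the race column below are hypothetical stand-ins.

library(dplyr)

group_demo <- function(data, group = NULL) {
  data %>%
    # c(".sample", NULL) collapses to ".sample", so no special case is needed
    group_by(across(all_of(c(".sample", group)))) %>%
    summarise(n = n(), .groups = "drop")
}

# group_demo(propensities)                  # grouped by .sample only
# group_demo(propensities, group = "race")  # grouped by .sample and the hypothetical race column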


if (split) {

discriminator_auc <- discrimination$propensities %>%
dplyr::group_by(.data$.sample) %>%
dplyr::group_by(across(all_of(c(".sample", group))))%>%
yardstick::roc_auc(".source_label", ".pred_synthetic") %>%
dplyr::mutate(.sample = factor(.data$.sample, levels = c("training", "testing"))) %>%
dplyr::arrange(.data$.sample)

} else {

discriminator_auc <- discrimination$propensities %>%
dplyr::group_by(across(all_of(group)))%>%
yardstick::roc_auc(".source_label", ".pred_synthetic") %>%
dplyr::mutate(.sample = factor("overall", levels = "overall"))

71 changes: 51 additions & 20 deletions R/add_pmse.R
@@ -1,9 +1,11 @@
#' Add pMSE to discrimination object
#' Add pMSE to discrimination object, with option to assess separately on groups
#' indicated by a grouping variable
#'
#' @param discrimination A discrimination object with propensities (likely
#' added using add_propensities())
#' @param split A logical for if the metric should be calculated separately for
#' the training/testing split. Defaults to TRUE.
#' @param group A set of variables to group the pmse by
#'
#' @return A discrimination object with propensities (likely added using
#' add_propensities()) with a pMSE
@@ -12,27 +14,30 @@
#'
#' @export
#'
add_pmse <- function(discrimination, split = TRUE) {

add_pmse <- function(discrimination, group = c(), split = TRUE) {

calc_pmse <- function(propensities) {

# calculate the expected propensity
prop_synthetic <- propensities %>%
dplyr::summarize(
n_synthetic = sum(.data$.source_label == "synthetic"),
n_total = dplyr::n()
) %>%
dplyr::mutate(prop_synthetic = .data$n_synthetic / .data$n_total) %>%
dplyr::pull("prop_synthetic")
dplyr::group_by(across(all_of(group))) %>%
dplyr::summarize("prop_synthetic" = list(c(sum(.data$.source_label == "synthetic")/dplyr::n()))) %>%
dplyr::pull(prop_synthetic)
Comment on lines +24 to +26
Contributor:
This new code is less clear than the original code imo. Can you just add group_by() before the old code?
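
A sketch of what the reviewer's suggestion could look like (not the author's code): keep the original summarize()/mutate()/pull() chain and prepend the grouping, yielding one expected propensity per group. It assumes the package's existing imports (%>% and the .data pronoun).

prop_synthetic <- propensities %>%
  dplyr::group_by(dplyr::across(dplyr::all_of(group))) %>%
  dplyr::summarize(
    n_synthetic = sum(.data$.source_label == "synthetic"),
    n_total = dplyr::n(),
    .groups = "drop"
  ) %>%
  dplyr::mutate(prop_synthetic = .data$n_synthetic / .data$n_total) %>%
  dplyr::pull("prop_synthetic")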


propensities_vec <- propensities %>%
dplyr::pull(".pred_synthetic")
dplyr::group_by(across(all_of(group))) %>%
dplyr::summarise(".pred_synthetic" = list(c(.pred_synthetic))) %>%
dplyr::pull(.pred_synthetic)
Comment on lines +29 to +31
Contributor:
I love nested data structures! This section needs some comments. This is probably also easier with the nest() function.
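
A hedged sketch of the nest() route the reviewer mentions (not part of the PR), assuming tidyr and purrr are available: nesting by the grouping variables gives one list-column row per group, and the per-group propensity vectors can be pulled from it.

propensities_by_group <- propensities %>%
  dplyr::group_by(dplyr::across(dplyr::all_of(group))) %>%
  tidyr::nest()   # non-grouping columns collapse into a `data` list-column, one row per group

propensities_vec <- purrr::map(propensities_by_group$data, dplyr::pull, ".pred_synthetic")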


# function for pmse
pmse_func <- function(propensities_vec, prop_synthetic){
mean((propensities_vec - prop_synthetic)^2)
}
# calculate the observed pMSE
pmse <- mean((propensities_vec - prop_synthetic) ^ 2)
pmse <- mapply(pmse_func, propensities_vec, prop_synthetic)
Contributor:
Add a blank line before # calculate the observed pMSE

Contributor:
We prefer library(purrr) to the *apply() functions and we typically include argument names.

pmse <- purrr::map2_dbl(
  .x = propensities_vec,
  .y = prop_synthetic,
  .f = pmse_func
)


return(pmse)

}

if (split) {
@@ -45,21 +50,47 @@ add_pmse <- function(discrimination, split = TRUE) {
dplyr::filter(.data$.sample == "testing") %>%
calc_pmse()

pmse <- tibble::tibble(
.source = factor(c("training", "testing"), levels = c("training", "testing")),
.pmse = c(pmse_training, pmse_testing)
)
if (length(group)==0){ # original case
pmse <- tibble::tibble(
.source = factor(c("training", "testing"), levels = c("training", "testing")),
.pmse = c(pmse_training, pmse_testing)
)
}
else{ # we have passed a list of grouping variables
groups <- discrimination$propensities %>%
dplyr::group_by(across(all_of(group))) %>%
group_keys()
groups <- rbind(groups, groups) # make 2 copies for train/test

pmse <- tibble::tibble(
groups,
.source = factor(c(rep("training", length(pmse_training)), rep("testing", length(pmse_testing))), levels = c("training", "testing")),
.pmse = c(pmse_training, pmse_testing)
)
}
Comment on lines +53 to +70
Contributor:
This code fails and it's unclear. Please rewrite using the tidyverse style guide, clearer spacing, and more comments.
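
One possible tidyverse-style shape for this branch, offered only as a sketch: it assumes calc_pmse() returns the per-group pMSEs in group_keys() order, and builds the labelled tibble with bind_rows() instead of rbind() and rep().

# one row of group keys per group, in the same order calc_pmse() iterates
group_keys_df <- discrimination$propensities %>%
  dplyr::group_by(dplyr::across(dplyr::all_of(group))) %>%
  dplyr::group_keys()

pmse <- dplyr::bind_rows(
  training = dplyr::mutate(group_keys_df, .pmse = pmse_training),
  testing = dplyr::mutate(group_keys_df, .pmse = pmse_testing),
  .id = ".source"
) %>%
  dplyr::mutate(.source = factor(.data$.source, levels = c("training", "testing")))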


} else {

pmse_overall <- discrimination$propensities %>%
calc_pmse()

pmse <- tibble::tibble(
.source = factor("overall", levels = "overall"),
.pmse = pmse_overall
)

if (length(group)==0){ # original case
Contributor:
You will need to switch to NULL
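
With a NULL default (as suggested above), the branch test can check for NULL directly. A sketch of the signature and branching only, with the bodies elided:

add_pmse <- function(discrimination, group = NULL, split = TRUE) {
  if (is.null(group)) {
    # original, ungrouped case
  } else {
    # grouped case: one pMSE per combination of the grouping variables
  }
}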

pmse <- tibble::tibble(
.source = factor("overall", levels = "overall"),
.pmse = pmse_overall
)
}
else{ # we have passed a list of grouping variables
groups <- discrimination$propensities %>%
dplyr::group_by(across(all_of(group))) %>%
group_keys()

pmse <- tibble::tibble(
groups,
.source = factor(c(rep("overall", length(pmse_overall))), levels = c("overall")),
.pmse = pmse_overall
)
}
Comment on lines +78 to +93
Contributor:
Same comment as above.

}

discrimination$pmse <- pmse
121 changes: 84 additions & 37 deletions R/add_pmse_ratio.R
@@ -6,14 +6,15 @@
#' @param prop The proportion of data to be retained for modeling/analysis in
#' the training/testing split. The sampling is stratified by the original and
#' synthetic data.
#' @param group A set of variables to group the pmse and null pmse by
#' @param times The number of bootstrap samples.
#'
#' @return A discrimination with pMSE
#'
#' @family Utility metrics
#'
#' @export
add_pmse_ratio <- function(discrimination, split = TRUE, prop = 3 / 4, times) {
add_pmse_ratio <- function(discrimination, split = TRUE, prop = 3 / 4, group = c(), times) {

if (is.null(discrimination$pmse)) {

@@ -25,48 +26,86 @@ add_pmse_ratio <- function(discrimination, split = TRUE, prop = 3 / 4, times) {

# calculate the expected propensity
prop_synthetic <- propensities %>%
dplyr::summarize(
n_synthetic = sum(.data$.source_label == "synthetic"),
n_total = dplyr::n()
) %>%
dplyr::mutate(prop_synthetic = .data$n_synthetic / .data$n_total) %>%
dplyr::pull("prop_synthetic")
dplyr::group_by(across(all_of(group))) %>%
dplyr::summarize("prop_synthetic" = list(c(sum(.data$.source_label == "synthetic")/dplyr::n()))) %>%
dplyr::pull(prop_synthetic)

propensities_vec <- propensities %>%
dplyr::pull(".pred_synthetic")
dplyr::group_by(across(all_of(group))) %>%
dplyr::summarise(".pred_synthetic" = list(c(.pred_synthetic))) %>%
dplyr::pull(.pred_synthetic)

# function for pmse
pmse_func <- function(propensities_vec, prop_synthetic){
mean((propensities_vec - prop_synthetic)^2)
}
# calculate the observed pMSE
pmse <- mean((propensities_vec - prop_synthetic) ^ 2)
pmse <- mapply(pmse_func, propensities_vec, prop_synthetic)

return(pmse)

}

pmse_null_overall <- vector(mode = "numeric", length = times)
pmse_null_training <- vector(mode = "numeric", length = times)
pmse_null_testing <- vector(mode = "numeric", length = times)
# function to sample 2x size of grouped data, by group,
# for the dataset with both confidential and synthetic data
# group_resample <- function(data){
# # split by grouping variables
# split_data = dplyr::group_split(data %>% dplyr::ungroup() %>% dplyr::group_by(across(all_of(group))))
# bootstrap_sample = list()
# for (elem in split_data){ # iterate through grouped dataset
# # in each group, sample twice as much data
# bootstrap_sample <- append(bootstrap_sample, list(dplyr::bind_cols(
# elem %>%
# dplyr::filter(.data$.source_label == "original") %>%
# dplyr::group_by(across(all_of(group))) %>%
# dplyr::slice_sample(n = nrow(elem), replace = TRUE) %>%
# dplyr::select(-".source_label"),
# elem %>%
# dplyr::select(".source_label"))
# ))
# }
# bootstrap_sample = dplyr::bind_rows(bootstrap_sample)
# }

for (a in seq_along(pmse_null_overall)) {

group_resample <- function(data){
# split by grouping variables
split_data = dplyr::group_split(data %>% dplyr::ungroup() %>% dplyr::group_by(across(all_of(group))))
bootstrap_sample = vector("list", length = length(split_data))
for (i in 1:length(split_data)){ # iterate through grouped dataset
# in each group, sample twice as much data
bootstrap_sample[[i]] <- list(dplyr::bind_cols(
split_data[[i]] %>%
dplyr::filter(.data$.source_label == "original") %>%
dplyr::group_by(across(all_of(group))) %>%
dplyr::slice_sample(n = nrow(split_data[[i]]), replace = TRUE) %>%
dplyr::select(-".source_label"),
split_data[[i]] %>%
dplyr::select(".source_label"))
)
}
bootstrap_sample = dplyr::bind_rows(bootstrap_sample)
}

# matrix instead of vector, where each entry is a simulation, containing a vector with groups
#pmse_null_overall <- c()
#pmse_null_training <- c()
#pmse_null_testing <- c()

pmse_null_overall <- rep(NA, times)
pmse_null_training <- rep(NA, times)
pmse_null_testing <- rep(NA, times)

for (a in 1:times) {
# bootstrap sample original observations to equal the size of the combined
# data
# data, with a vector of one set of observations per grouping variable (?)
# append the original labels so the proportions match
bootstrap_sample <- dplyr::bind_cols(
discrimination$combined_data %>%
dplyr::filter(.data$.source_label == "original") %>%
dplyr::slice_sample(n = nrow(discrimination$combined_data), replace = TRUE) %>%
dplyr::select(-".source_label"),
discrimination$combined_data %>%
dplyr::select(".source_label")
)

bootstrap_sample <- group_resample(discrimination$combined_data)

if (split) {

# make training/testing split
# make training/testing split
data_split <- rsample::initial_split(
data = bootstrap_sample,
prop = prop,
strata = ".source_label"
strata = ".source_label" # NOTE: Should this also be stratified by group?
)

# fit the model from the pMSE on the bootstrap sample
@@ -89,14 +128,20 @@ add_pmse_ratio <- function(discrimination, split = TRUE, prop = 3 / 4, times) {
)

# calculate the pmse for each bootstrap
# pmse_null_overall <- append(pmse_null_overall, calc_pmse(propensities_df))
# pmse_null_training <- append(pmse_null_training, propensities_df %>%
# dplyr::filter(.data$.sample == "training") %>%
# calc_pmse())
# pmse_null_testing <- append(pmse_null_testing, propensities_df %>%
# dplyr::filter(.data$.sample == "testing") %>%
# calc_pmse())
pmse_null_overall[a] <- calc_pmse(propensities_df)
pmse_null_training[a] <- propensities_df %>%
dplyr::filter(.data$.sample == "training") %>%
calc_pmse()
dplyr::filter(.data$.sample == "training") %>%
calc_pmse()
pmse_null_testing[a] <- propensities_df %>%
dplyr::filter(.data$.sample == "testing") %>%
calc_pmse()

dplyr::filter(.data$.sample == "testing") %>%
calc_pmse()
} else {

# fit the model from the pMSE on the bootstrap sample
@@ -113,15 +158,17 @@ add_pmse_ratio <- function(discrimination, split = TRUE, prop = 3 / 4, times) {

# calculate the pmse for each bootstrap
pmse_null_overall[a] <- calc_pmse(propensities_df)

}

}

# find the mean of the bootstrapped pMSEs
mean_null_pmse_overall <- mean(pmse_null_overall)
mean_null_pmse_training <- mean(pmse_null_training)
mean_null_pmse_testing <- mean(pmse_null_testing)
mean_null_pmse_overall <- colMeans(t(matrix(pmse_null_overall, ncol = times))) # each row is a new sample
if (split){
mean_null_pmse_training <- colMeans(t(matrix(pmse_null_training, ncol = times)))
mean_null_pmse_testing <- colMeans(t(matrix(pmse_null_testing, ncol= times)))
}

# calculate the ratio for the training/testing split or overall data
if (all(c("training", "testing") %in% discrimination$pmse$.source)) {