Iss71 #72

Open · wants to merge 3 commits into base: main
8 changes: 5 additions & 3 deletions R/add_discriminator_auc.R
@@ -4,25 +4,27 @@
#' added using add_propensities())
#' @param split A logical for if the metric should be calculated separately for
#' the training/testing split. Defaults to TRUE.
#'
#' @param group A list of variable names to group by
#'
#' @return A discrimination object with propensities (likely added using
#' add_propensities()) with discriminator AUC
#'
#' @export
#'
add_discriminator_auc <- function(discrimination, split = TRUE) {
add_discriminator_auc <- function(discrimination, group = c(),split = TRUE) {
Contributor:
Do you have an example of using c() like this in a function? Typically, we would write group = NULL.

Author:
I don't have an example, group = NULL is better!
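
A minimal sketch (not part of the PR) of why group = NULL is the conventional default: NULL drops out of c(), so the same grouping call works with or without user-supplied variables. The propensities data frame and the race column below are hypothetical stand-ins.

library(dplyr)

group_demo <- function(data, group = NULL) {
  data %>%
    # c(".sample", NULL) collapses to ".sample", so no special case is needed
    group_by(across(all_of(c(".sample", group)))) %>%
    summarise(n = n(), .groups = "drop")
}

# group_demo(propensities)                  # grouped by .sample only
# group_demo(propensities, group = "race")  # grouped by .sample and the hypothetical race column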


if (split) {

discriminator_auc <- discrimination$propensities %>%
dplyr::group_by(.data$.sample) %>%
dplyr::group_by(across(all_of(c(".sample", group))))%>%
yardstick::roc_auc(".source_label", ".pred_synthetic") %>%
dplyr::mutate(.sample = factor(.data$.sample, levels = c("training", "testing"))) %>%
dplyr::arrange(.data$.sample)

} else {

discriminator_auc <- discrimination$propensities %>%
dplyr::group_by(across(all_of(group)))%>%
yardstick::roc_auc(".source_label", ".pred_synthetic") %>%
dplyr::mutate(.sample = factor("overall", levels = "overall"))

71 changes: 51 additions & 20 deletions R/add_pmse.R
@@ -1,9 +1,11 @@
#' Add pMSE to discrimination object
#' Add pMSE to discrimination object, with option to assess separately on groups
#' indicated by a grouping variable
#'
#' @param discrimination A discrimination object with propensities (likely
#' added using add_propensities())
#' @param split A logical for if the metric should be calculated separately for
#' the training/testing split. Defaults to TRUE.
#' @param group A set of variables to group the pmse by
#'
#' @return A discrimination object with propensities (likely added using
#' add_propensities()) with a pMSE
@@ -12,27 +14,30 @@
#'
#' @export
#'
add_pmse <- function(discrimination, split = TRUE) {

add_pmse <- function(discrimination, group = c(), split = TRUE) {

calc_pmse <- function(propensities) {

# calculate the expected propensity
prop_synthetic <- propensities %>%
dplyr::summarize(
n_synthetic = sum(.data$.source_label == "synthetic"),
n_total = dplyr::n()
) %>%
dplyr::mutate(prop_synthetic = .data$n_synthetic / .data$n_total) %>%
dplyr::pull("prop_synthetic")
dplyr::group_by(across(all_of(group))) %>%
dplyr::summarize("prop_synthetic" = list(c(sum(.data$.source_label == "synthetic")/dplyr::n()))) %>%
dplyr::pull(prop_synthetic)
Comment on lines +24 to +26
Contributor:
This new code is less clear than the original code imo. Can you just add group_by() before the old code?
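
A sketch of what the reviewer's suggestion could look like (not the author's code): keep the original summarize()/mutate()/pull() chain and prepend the grouping, yielding one expected propensity per group. It assumes the package's existing imports (%>% and the .data pronoun).

prop_synthetic <- propensities %>%
  dplyr::group_by(dplyr::across(dplyr::all_of(group))) %>%
  dplyr::summarize(
    n_synthetic = sum(.data$.source_label == "synthetic"),
    n_total = dplyr::n(),
    .groups = "drop"
  ) %>%
  dplyr::mutate(prop_synthetic = .data$n_synthetic / .data$n_total) %>%
  dplyr::pull("prop_synthetic")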


propensities_vec <- propensities %>%
dplyr::pull(".pred_synthetic")
dplyr::group_by(across(all_of(group))) %>%
dplyr::summarise(".pred_synthetic" = list(c(.pred_synthetic))) %>%
dplyr::pull(.pred_synthetic)
Comment on lines +29 to +31
Contributor:
I love nested data structures! This section needs some comments. This is probably also easier with the nest() function.
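
A hedged sketch of the nest() route the reviewer mentions (not part of the PR), assuming tidyr and purrr are available: nesting by the grouping variables gives one list-column row per group, and the per-group propensity vectors can be pulled from it.

propensities_by_group <- propensities %>%
  dplyr::group_by(dplyr::across(dplyr::all_of(group))) %>%
  tidyr::nest()   # non-grouping columns collapse into a `data` list-column, one row per group

propensities_vec <- purrr::map(propensities_by_group$data, dplyr::pull, ".pred_synthetic")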


# function for pmse
pmse_func <- function(propensities_vec, prop_synthetic){
mean((propensities_vec - prop_synthetic)^2)
}
# calculate the observed pMSE
pmse <- mean((propensities_vec - prop_synthetic) ^ 2)
pmse <- mapply(pmse_func, propensities_vec, prop_synthetic)
Contributor:
Add a blank line before # calculate the observed pMSE

Contributor:
We prefer library(purrr) to the *apply() functions and we typically include argument names.

pmse <- purrr::map2_dbl(
  .x = propensities_vec,
  .y = prop_synthetic,
  .f = pmse_func
)


return(pmse)

}

if (split) {
@@ -45,21 +50,47 @@ add_pmse <- function(discrimination, split = TRUE) {
dplyr::filter(.data$.sample == "testing") %>%
calc_pmse()

pmse <- tibble::tibble(
.source = factor(c("training", "testing"), levels = c("training", "testing")),
.pmse = c(pmse_training, pmse_testing)
)
if (length(group)==0){ # original case
pmse <- tibble::tibble(
.source = factor(c("training", "testing"), levels = c("training", "testing")),
.pmse = c(pmse_training, pmse_testing)
)
}
else{ # we have passed a list of grouping variables
groups <- discrimination$propensities %>%
dplyr::group_by(across(all_of(group))) %>%
group_keys()
groups <- rbind(groups, groups) # make 2 copies for train/test

pmse <- tibble::tibble(
groups,
.source = factor(c(rep("training", length(pmse_training)), rep("testing", length(pmse_testing))), levels = c("training", "testing")),
.pmse = c(pmse_training, pmse_testing)
)
}
Comment on lines +53 to +70
Contributor:
This code fails and it's unclear. Please rewrite using the tidyverse style guide, clearer spacing, and more comments.
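
One possible tidyverse-style shape for this branch, offered only as a sketch: it assumes calc_pmse() returns the per-group pMSEs in group_keys() order, and builds the labelled tibble with bind_rows() instead of rbind() and rep().

# one row of group keys per group, in the same order calc_pmse() iterates
group_keys_df <- discrimination$propensities %>%
  dplyr::group_by(dplyr::across(dplyr::all_of(group))) %>%
  dplyr::group_keys()

pmse <- dplyr::bind_rows(
  training = dplyr::mutate(group_keys_df, .pmse = pmse_training),
  testing = dplyr::mutate(group_keys_df, .pmse = pmse_testing),
  .id = ".source"
) %>%
  dplyr::mutate(.source = factor(.data$.source, levels = c("training", "testing")))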


} else {

pmse_overall <- discrimination$propensities %>%
calc_pmse()

pmse <- tibble::tibble(
.source = factor("overall", levels = "overall"),
.pmse = pmse_overall
)

if (length(group)==0){ # original case
Contributor:
You will need to switch to NULL
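
With a NULL default (as suggested above), the branch test can check for NULL directly. A sketch of the signature and branching only, with the bodies elided:

add_pmse <- function(discrimination, group = NULL, split = TRUE) {
  if (is.null(group)) {
    # original, ungrouped case
  } else {
    # grouped case: one pMSE per combination of the grouping variables
  }
}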

pmse <- tibble::tibble(
.source = factor("overall", levels = "overall"),
.pmse = pmse_overall
)
}
else{ # we have passed a list of grouping variables
groups <- discrimination$propensities %>%
dplyr::group_by(across(all_of(group))) %>%
group_keys()

pmse <- tibble::tibble(
groups,
.source = factor(c(rep("overall", length(pmse_overall))), levels = c("overall")),
.pmse = pmse_overall
)
}
Comment on lines +78 to +93
Contributor:
Same comment as above.

}

discrimination$pmse <- pmse
121 changes: 84 additions & 37 deletions R/add_pmse_ratio.R
@@ -6,14 +6,15 @@
#' @param prop The proportion of data to be retained for modeling/analysis in
#' the training/testing split. The sampling is stratified by the original and
#' synthetic data.
#' @param group A set of variables to group the pmse and null pmse by
#' @param times The number of bootstrap samples.
#'
#' @return A discrimination with pMSE
#'
#' @family Utility metrics
#'
#' @export
add_pmse_ratio <- function(discrimination, split = TRUE, prop = 3 / 4, times) {
add_pmse_ratio <- function(discrimination, split = TRUE, prop = 3 / 4, group = c(), times) {

if (is.null(discrimination$pmse)) {

@@ -25,48 +26,86 @@ add_pmse_ratio <- function(discrimination, split = TRUE, prop = 3 / 4, times) {

# calculate the expected propensity
prop_synthetic <- propensities %>%
dplyr::summarize(
n_synthetic = sum(.data$.source_label == "synthetic"),
n_total = dplyr::n()
) %>%
dplyr::mutate(prop_synthetic = .data$n_synthetic / .data$n_total) %>%
dplyr::pull("prop_synthetic")
dplyr::group_by(across(all_of(group))) %>%
dplyr::summarize("prop_synthetic" = list(c(sum(.data$.source_label == "synthetic")/dplyr::n()))) %>%
dplyr::pull(prop_synthetic)

propensities_vec <- propensities %>%
dplyr::pull(".pred_synthetic")
dplyr::group_by(across(all_of(group))) %>%
dplyr::summarise(".pred_synthetic" = list(c(.pred_synthetic))) %>%
dplyr::pull(.pred_synthetic)

# function for pmse
pmse_func <- function(propensities_vec, prop_synthetic){
mean((propensities_vec - prop_synthetic)^2)
}
# calculate the observed pMSE
pmse <- mean((propensities_vec - prop_synthetic) ^ 2)
pmse <- mapply(pmse_func, propensities_vec, prop_synthetic)

return(pmse)

}

pmse_null_overall <- vector(mode = "numeric", length = times)
pmse_null_training <- vector(mode = "numeric", length = times)
pmse_null_testing <- vector(mode = "numeric", length = times)
# function to sample 2x size of grouped data, by group,
# for the dataset with both confidential and synthetic data
# group_resample <- function(data){
# # split by grouping variables
# split_data = dplyr::group_split(data %>% dplyr::ungroup() %>% dplyr::group_by(across(all_of(group))))
# bootstrap_sample = list()
# for (elem in split_data){ # iterate through grouped dataset
# # in each group, sample twice as much data
# bootstrap_sample <- append(bootstrap_sample, list(dplyr::bind_cols(
# elem %>%
# dplyr::filter(.data$.source_label == "original") %>%
# dplyr::group_by(across(all_of(group))) %>%
# dplyr::slice_sample(n = nrow(elem), replace = TRUE) %>%
# dplyr::select(-".source_label"),
# elem %>%
# dplyr::select(".source_label"))
# ))
# }
# bootstrap_sample = dplyr::bind_rows(bootstrap_sample)
# }

for (a in seq_along(pmse_null_overall)) {

group_resample <- function(data){
# split by grouping variables
split_data = dplyr::group_split(data %>% dplyr::ungroup() %>% dplyr::group_by(across(all_of(group))))
bootstrap_sample = vector("list", length = length(split_data))
for (i in 1:length(split_data)){ # iterate through grouped dataset
# in each group, sample twice as much data
bootstrap_sample[[i]] <- list(dplyr::bind_cols(
split_data[[i]] %>%
dplyr::filter(.data$.source_label == "original") %>%
dplyr::group_by(across(all_of(group))) %>%
dplyr::slice_sample(n = nrow(split_data[[i]]), replace = TRUE) %>%
dplyr::select(-".source_label"),
split_data[[i]] %>%
dplyr::select(".source_label"))
)
}
bootstrap_sample = dplyr::bind_rows(bootstrap_sample)
}

# matrix instead of vector, where each entry is a simulation, containing a vector with groups
#pmse_null_overall <- c()
#pmse_null_training <- c()
#pmse_null_testing <- c()

pmse_null_overall <- rep(NA, times)
pmse_null_training <- rep(NA, times)
pmse_null_testing <- rep(NA, times)

for (a in 1:times) {
# bootstrap sample original observations to equal the size of the combined
# data
# data, with a vector of one set of observations per grouping variable (?)
# append the original labels so the proportions match
bootstrap_sample <- dplyr::bind_cols(
discrimination$combined_data %>%
dplyr::filter(.data$.source_label == "original") %>%
dplyr::slice_sample(n = nrow(discrimination$combined_data), replace = TRUE) %>%
dplyr::select(-".source_label"),
discrimination$combined_data %>%
dplyr::select(".source_label")
)

bootstrap_sample <- group_resample(discrimination$combined_data)

if (split) {

# make training/testing split
# make training/testing split
data_split <- rsample::initial_split(
data = bootstrap_sample,
prop = prop,
strata = ".source_label"
strata = ".source_label" # NOTE: Should this also be stratified by group?
)

# fit the model from the pMSE on the bootstrap sample
@@ -89,14 +128,20 @@ add_pmse_ratio <- function(discrimination, split = TRUE, prop = 3 / 4, times) {
)

# calculate the pmse for each bootstrap
# pmse_null_overall <- append(pmse_null_overall, calc_pmse(propensities_df))
# pmse_null_training <- append(pmse_null_training, propensities_df %>%
# dplyr::filter(.data$.sample == "training") %>%
# calc_pmse())
# pmse_null_testing <- append(pmse_null_testing, propensities_df %>%
# dplyr::filter(.data$.sample == "testing") %>%
# calc_pmse())
pmse_null_overall[a] <- calc_pmse(propensities_df)
pmse_null_training[a] <- propensities_df %>%
dplyr::filter(.data$.sample == "training") %>%
calc_pmse()
dplyr::filter(.data$.sample == "training") %>%
calc_pmse()
pmse_null_testing[a] <- propensities_df %>%
dplyr::filter(.data$.sample == "testing") %>%
calc_pmse()

dplyr::filter(.data$.sample == "testing") %>%
calc_pmse()
} else {

# fit the model from the pMSE on the bootstrap sample
@@ -113,15 +158,17 @@ add_pmse_ratio <- function(discrimination, split = TRUE, prop = 3 / 4, times) {

# calculate the pmse for each bootstrap
pmse_null_overall[a] <- calc_pmse(propensities_df)

}

}

# find the mean of the bootstrapped pMSEs
mean_null_pmse_overall <- mean(pmse_null_overall)
mean_null_pmse_training <- mean(pmse_null_training)
mean_null_pmse_testing <- mean(pmse_null_testing)
mean_null_pmse_overall <- colMeans(t(matrix(pmse_null_overall, ncol = times))) # each row is a new sample
if (split){
mean_null_pmse_training <- colMeans(t(matrix(pmse_null_training, ncol = times)))
mean_null_pmse_testing <- colMeans(t(matrix(pmse_null_testing, ncol= times)))
}

# calculate the ratio for the training/testing split or overall data
if (all(c("training", "testing") %in% discrimination$pmse$.source)) {