diff --git a/DESCRIPTION b/DESCRIPTION index 70f5191..cdcc8e0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -25,6 +25,7 @@ Imports: checkmate, cli, curl, + distfromq, fs, gh, glue, @@ -40,9 +41,14 @@ Imports: Suggests: arrow (>= 17.0.0), dplyr, + hubData, knitr, rmarkdown, testthat (>= 3.2.0) +Remotes: + hubverse-org/hubData, + reichlab/distfromq +Additional_repositories: https://hubverse-org.r-universe.dev/ Config/Needs/website: hubverse-org/hubStyle Config/testthat/edition: 3 Encoding: UTF-8 diff --git a/NAMESPACE b/NAMESPACE index e71fd27..72e3547 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,6 +5,7 @@ S3method(read_config,default) export("%>%") export(as_model_out_tbl) export(check_deprecated_schema) +export(convert_output_type) export(extract_schema_version) export(get_config_tid) export(get_round_ids) @@ -15,6 +16,7 @@ export(get_schema) export(get_schema_url) export(get_schema_valid_versions) export(get_schema_version_latest) +export(get_task_id_cols) export(get_task_id_names) export(is_v3_config) export(is_v3_config_file) @@ -30,3 +32,4 @@ importFrom(gh,gh) importFrom(jsonlite,fromJSON) importFrom(jsonlite,read_json) importFrom(magrittr,"%>%") +importFrom(rlang,.data) diff --git a/R/convert_output_types.R b/R/convert_output_types.R new file mode 100644 index 0000000..6c95055 --- /dev/null +++ b/R/convert_output_types.R @@ -0,0 +1,308 @@ +#' Transform between output types +#' +#' Transform between output types for each unique combination of task IDs for +#' each model. Conversion must be from a single starting output type to one or more +#' new output types, and the resulting output will only contain the new output types. +#' See details for supported conversions. +#' +#' @param model_out_tbl an object of class `model_out_tbl` containing predictions +#' with a single, unique value in the `output_type` column. +#' @param new_output_type character vector of the desired output type(s) after +#' transformation. May contain any of the following output types: +#' `"mean"`, `"median"`, `"quantile"`, `"cdf"`. +#' @param new_output_type_id A named list indicating the desired output type IDs +#' for each new output type, in which each element is a vector of output type IDs. +#' If only one new output type is requested, then it may be a single numeric vector +#' (for `"quantile"` or `"cdf"`) or not required (`"mean"` or `"median"`). See +#' the examples for an illustration of both cases. +#' @param n_samples `numeric` that specifies the number of samples to use when +#' calculating output_types from an estimated quantile function. Defaults to `1e4`. +#' @param ... parameters that are passed to `distfromq::make_q_fn`, specifying +#' details of how to estimate a quantile function from provided quantile levels +#' and quantile values for `"quantile"` or `"cdf"` output types. +#' +#' @details +#' The following transformations are supported: (i) `"sample"` can be +#' transformed to `"mean"`, `"median"`, `"quantile"`, or `"cdf"`; (ii) +#' `"quantile"` can be transformed to `"mean"`, `"median"`, or `"cdf"`; and +#' (iii) `"cdf"` can be transformed to `"mean"`, `"median"`, or `"quantile"`. +#' +#' For `"quantile"` and `"cdf"` starting output types, we use the following approach: +#' 1. Interpolate and extrapolate from the provided quantiles or probabilities +#' for each component model to obtain an estimate of the cdf of that distribution. +#' 2. Draw samples from the distribution for each component model. To reduce +#' Monte Carlo variability, we use quasi-random samples corresponding to +#' quantiles of the estimated distribution. +#' 3. Calculate the desired quantity (e.g., mean). +#' If the median quantile is provided in the `model_out_tbl` object (i.e., the +#' original output_type is `"median"` and 0.5 is contained in the original +#' output_type_id), the median value is extracted and returned directly. +#' +#' If both `"quantile"` and `"cdf"` output_types are desired, `new_output_type_id` +#' should be a named list, where each element specifies the corresponding +#' `new_output_type_id`. See examples for an illustration. +#' +#' @examples +#' # We illustrate the conversion between output types using normal distributions +#' ex_quantiles <- c(0.25, 0.5, 0.75) +#' model_out_tbl <- expand.grid( +#' stringsAsFactors = FALSE, +#' group1 = c(1,2), +#' model_id = "A", +#' output_type = "quantile", +#' output_type_id = ex_quantiles +#' ) %>% +#' dplyr::mutate(value = qnorm(p = output_type_id, mean = group1)) +#' +#' convert_output_type(model_out_tbl, new_output_type = "median", new_output_type_id = NA) +#' +#' # Next, we illustrate conversion from samples to quantile and cdf +#' ex_cdf_values <- seq(-2,2,1) +#' ex_quantiles <- c(0.25, 0.5, 0.75) +#' model_out_tbl <- expand.grid( +#' stringsAsFactors = FALSE, +#' group1 = c(1,2), +#' model_id = "A", +#' output_type = "sample", +#' output_type_id = 1:100 +#' ) %>% +#' dplyr::mutate(value = rnorm(200, mean = group1)) +#' +#' convert_output_type(model_out_tbl, new_output_type = c("quantile", "cdf"), +#' new_output_type_id = list("quantile" = ex_quantiles, "cdf" = ex_cdf_values)) +#' +#' @return object of class `model_out_tbl` containing (only) the new output_type(s) +#' for each unique combination of task IDs for each model +#' @export +#' @importFrom rlang .data +convert_output_type <- function(model_out_tbl, new_output_type, + new_output_type_id = NA, n_samples = 1e4, ...) { + # validations + starting_output_type <- unique(model_out_tbl$output_type) + starting_output_type_ids <- unique(model_out_tbl$output_type_id) + task_id_cols <- get_task_id_cols(model_out_tbl) + validate_new_output_type( + starting_output_type, new_output_type, + new_output_type_id + ) + # for cdf and quantile functions, get samples + if (starting_output_type == "cdf") { + # estimate from samples + model_out_tbl <- get_samples_from_cdf(model_out_tbl, task_id_cols, n_samples) + } else if (starting_output_type == "quantile") { + # if median output desired, and Q50 provided return exact value, otherwise + # estimate from samples + if (any(new_output_type != "median") || !(0.5 %in% starting_output_type_ids)) { + model_out_tbl <- model_out_tbl %>% + get_samples_from_quantiles(task_id_cols, n_samples) %>% + rbind(model_out_tbl) + } + } + # transform based on new_output_type + model_out_tbl_transform <- vector("list", length = length(new_output_type)) + for (i in seq_along(new_output_type)) { + # first find new_output_type_id + new_output_type_id_tmp <- new_output_type_id + if (new_output_type[i] %in% c("mean", "median")) { + new_output_type_id_tmp <- NA + } else if (is.list(new_output_type_id)) { + new_output_type_id_tmp <- new_output_type_id[[new_output_type[i]]] + } + # if median output desired, and Q50 provided return exact value + if (new_output_type[i] == "median" && 0.5 %in% starting_output_type_ids) { + model_out_tbl_transform[[i]] <- model_out_tbl %>% + dplyr::filter( + .data[["output_type"]] != "sample", + .data[["output_type_id"]] == 0.5 + ) %>% + dplyr::mutate( + output_type = new_output_type[i], + output_type_id = NA + ) %>% + as_model_out_tbl() + } else { # otherwise calculate new values + grouped_model_out_tbl <- model_out_tbl %>% + dplyr::filter(.data[["output_type"]] == "sample") %>% + dplyr::group_by(dplyr::across(dplyr::all_of(c("model_id", task_id_cols)))) + model_out_tbl_transform[[i]] <- grouped_model_out_tbl %>% + convert_from_sample(new_output_type[i], new_output_type_id_tmp) %>% + dplyr::ungroup() + } + } + return(dplyr::bind_rows(model_out_tbl_transform)) +} + +#' @noRd +validate_new_output_type <- function(starting_output_type, new_output_type, + new_output_type_id) { + # check only one starting_output_type is provided + if (length(starting_output_type) != 1) { + cli::cli_abort(c("Only one {.var starting_output_type} may be provided")) + } + valid_conversions <- list( + "sample" = c("mean", "median", "quantile", "cdf"), + "quantile" = c("mean", "median", "cdf"), + "cdf" = c("mean", "median", "quantile") + ) + # check starting_output_type is supported + valid_starting_output_type <- starting_output_type %in% names(valid_conversions) + if (!valid_starting_output_type) { + cli::cli_abort(c( + "{.var output_type} {.val {starting_output_type}} provided cannot be transformed", + i = "must be of type {.val sample}, {.val quantile} or {.val cdf}." + )) + } + # check new_output_type is supported + invalid_new_output_type <- which(!(new_output_type %in% valid_conversions[[starting_output_type]])) + if (length(invalid_new_output_type) > 0) { + cli::cli_abort(c( + "Output type {.val {starting_output_type}} cannot be converted to the specified + {.val {new_output_type[invalid_new_output_type]}}", + i = "{.var new_output_type} values must be one of {.val {valid_conversions[[starting_output_type]]}}" + )) + } + # check new_output_type_id + if (length(new_output_type) == 1) { + validate_new_output_type_id(new_output_type, new_output_type_id) + } else if (length(new_output_type > 1)) { + purrr::map(.x = new_output_type, + ~ validate_new_output_type_id(new_output_type = .x, + new_output_type_id = new_output_type_id[[.x]])) + } +} + +#' @noRd +validate_new_output_type_id <- function(new_output_type, new_output_type_id) { + if (new_output_type %in% c("mean", "median") && !all(is.na(new_output_type_id))) { + cli::cli_abort(c( + "{.var new_output_type_id} is incompatible with {.var new_output_type}", + i = "{.var new_output_type_id} should be {.var NA}" + )) + } else if (new_output_type == "quantile") { + new_output_type_id_quantile <- new_output_type_id + if (!is.numeric(new_output_type_id_quantile)) { + cli::cli_abort(c( + "elements of {.var new_output_type_id} should be numeric", + i = "elements of {.var new_output_type_id} represent quantiles + of the predictive distribution" + )) + } + if (any(new_output_type_id_quantile < 0) || any(new_output_type_id_quantile > 1)) { + cli::cli_abort(c( + "elements of {.var new_output_type_id} should be between 0 and 1", + i = "elements of {.var new_output_type_id} represent quantiles + of the predictive distribution" + )) + } + } else if (new_output_type == "cdf") { + new_output_type_id_cdf <- new_output_type_id + if (!is.numeric(new_output_type_id_cdf)) { + cli::cli_abort(c( + "elements of {.var new_output_type_id} should be numeric", + i = "elements of {.var new_output_type_id} represent possible + values of the target variable" + )) + } + } +} + +#' @noRd +get_samples_from_quantiles <- function(model_out_tbl, task_id_cols, n_samples, ...) { + if (!requireNamespace("distfromq")) { + cli::cli_abort( + c("x" = "{.pkg distfromq} must be installed to convert {.val cdf} + or {.val quantile} output types.", + "i" = "Use {.code remotes::install_github('reichlab/distfromq')} to install." + ) + ) + } + + samples <- model_out_tbl %>% + dplyr::group_by(dplyr::across(dplyr::all_of(c("model_id", task_id_cols)))) %>% + dplyr::reframe( + value = distfromq::make_q_fn( + ps = as.numeric(.data[["output_type_id"]]), + qs = .data[["value"]], ... + )(stats::runif(n_samples, 0, 1)) + ) %>% + dplyr::ungroup() + split_samples <- split(samples, f = samples[[task_id_cols]]) + formatted_samples <- split_samples %>% + purrr::map(.f = function(split_samples) { + dplyr::mutate(split_samples, + output_type = "sample", + output_type_id = as.numeric(dplyr::row_number()), + .before = "value") + }) %>% + purrr::list_rbind() %>% + as_model_out_tbl() + return(formatted_samples) +} + +#' @noRd +get_samples_from_cdf <- function(model_out_tbl, task_id_cols, n_samples, ...) { + if (!requireNamespace("distfromq")) { + cli::cli_abort( + c("x" = "{.pkg distfromq} must be installed to convert {.val cdf} + or {.val quantile} output types.", + "i" = "Use {.code remotes::install_github('reichlab/distfromq')} to install." + ) + ) + } + + samples <- model_out_tbl %>% + dplyr::group_by(dplyr::across(dplyr::all_of(c("model_id", task_id_cols)))) %>% + dplyr::reframe( + value = distfromq::make_q_fn( + ps = .data[["value"]], + qs = as.numeric(.data[["output_type_id"]]), ... + )(stats::runif(n_samples, 0, 1)) + ) %>% + dplyr::ungroup() + split_samples <- split(samples, f = samples[[task_id_cols]]) + formatted_samples <- split_samples %>% + purrr::map(.f = function(split_samples) { + dplyr::mutate(split_samples, + output_type = "sample", + output_type_id = as.numeric(dplyr::row_number()), + .before = "value") + }) %>% + purrr::list_rbind() %>% + as_model_out_tbl() + return(formatted_samples) +} + +#' @noRd +convert_from_sample <- function(grouped_model_out_tbl, new_output_type, + new_output_type_id) { + if (new_output_type == "mean") { + model_out_tbl_transform <- grouped_model_out_tbl %>% + dplyr::reframe( + value = mean(.data[["value"]]), + output_type_id = new_output_type_id + ) + } else if (new_output_type == "median") { + model_out_tbl_transform <- grouped_model_out_tbl %>% + dplyr::reframe( + value = mean(.data[["value"]]), + output_type_id = new_output_type_id + ) + } else if (new_output_type == "quantile") { + model_out_tbl_transform <- grouped_model_out_tbl %>% + dplyr::reframe( + value = stats::quantile(.data[["value"]], as.numeric(new_output_type_id), names = FALSE), + output_type_id = new_output_type_id + ) + } else if (new_output_type == "cdf") { + model_out_tbl_transform <- grouped_model_out_tbl %>% + dplyr::reframe( + value = stats::ecdf(.data[["value"]])(as.numeric(new_output_type_id)), + output_type_id = new_output_type_id + ) + } + # update output_type and output_type_id columns + model_out_tbl_transform <- model_out_tbl_transform %>% + dplyr::mutate(output_type = new_output_type) %>% + as_model_out_tbl() + return(model_out_tbl_transform) +} diff --git a/R/get_task_id_cols.R b/R/get_task_id_cols.R new file mode 100644 index 0000000..782e4eb --- /dev/null +++ b/R/get_task_id_cols.R @@ -0,0 +1,11 @@ +#' Get task ID column names from `model_out_tbl` object +#' +#' @param model_out_tbl an object of class `model_out_tbl` +#' +#' @return a character vector of task ID column names +#' @export +get_task_id_cols <- function(model_out_tbl) { + model_out_cols <- colnames(model_out_tbl) + task_id_cols <- model_out_cols[!model_out_cols %in% std_colnames] + return(task_id_cols) +} diff --git a/man/convert_output_type.Rd b/man/convert_output_type.Rd new file mode 100644 index 0000000..4123d79 --- /dev/null +++ b/man/convert_output_type.Rd @@ -0,0 +1,98 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/convert_output_types.R +\name{convert_output_type} +\alias{convert_output_type} +\title{Transform between output types} +\usage{ +convert_output_type( + model_out_tbl, + new_output_type, + new_output_type_id = NA, + n_samples = 10000, + ... +) +} +\arguments{ +\item{model_out_tbl}{an object of class \code{model_out_tbl} containing predictions +with a single, unique value in the \code{output_type} column.} + +\item{new_output_type}{character vector of the desired output type(s) after +transformation. May contain any of the following output types: +\code{"mean"}, \code{"median"}, \code{"quantile"}, \code{"cdf"}.} + +\item{new_output_type_id}{A named list indicating the desired output type IDs +for each new output type, in which each element is a vector of output type IDs. +If only one new output type is requested, then it may be a single numeric vector +(for \code{"quantile"} or \code{"cdf"}) or not required (\code{"mean"} or \code{"median"}). See +the examples for an illustration of both cases.} + +\item{n_samples}{\code{numeric} that specifies the number of samples to use when +calculating output_types from an estimated quantile function. Defaults to \code{1e4}.} + +\item{...}{parameters that are passed to \code{distfromq::make_q_fn}, specifying +details of how to estimate a quantile function from provided quantile levels +and quantile values for \code{"quantile"} or \code{"cdf"} output types.} +} +\value{ +object of class \code{model_out_tbl} containing (only) the new output_type(s) +for each unique combination of task IDs for each model +} +\description{ +Transform between output types for each unique combination of task IDs for +each model. Conversion must be from a single starting output type to one or more +new output types, and the resulting output will only contain the new output types. +See details for supported conversions. +} +\details{ +The following transformations are supported: (i) \code{"sample"} can be +transformed to \code{"mean"}, \code{"median"}, \code{"quantile"}, or \code{"cdf"}; (ii) +\code{"quantile"} can be transformed to \code{"mean"}, \code{"median"}, or \code{"cdf"}; and +(iii) \code{"cdf"} can be transformed to \code{"mean"}, \code{"median"}, or \code{"quantile"}. + +For \code{"quantile"} and \code{"cdf"} starting output types, we use the following approach: +\enumerate{ +\item Interpolate and extrapolate from the provided quantiles or probabilities +for each component model to obtain an estimate of the cdf of that distribution. +\item Draw samples from the distribution for each component model. To reduce +Monte Carlo variability, we use quasi-random samples corresponding to +quantiles of the estimated distribution. +\item Calculate the desired quantity (e.g., mean). +If the median quantile is provided in the \code{model_out_tbl} object (i.e., the +original output_type is \code{"median"} and 0.5 is contained in the original +output_type_id), the median value is extracted and returned directly. +} + +If both \code{"quantile"} and \code{"cdf"} output_types are desired, \code{new_output_type_id} +should be a named list, where each element specifies the corresponding +\code{new_output_type_id}. See examples for an illustration. +} +\examples{ +# We illustrate the conversion between output types using normal distributions +ex_quantiles <- c(0.25, 0.5, 0.75) +model_out_tbl <- expand.grid( + stringsAsFactors = FALSE, + group1 = c(1,2), + model_id = "A", + output_type = "quantile", + output_type_id = ex_quantiles +) \%>\% +dplyr::mutate(value = qnorm(p = output_type_id, mean = group1)) + +convert_output_type(model_out_tbl, new_output_type = "median", new_output_type_id = NA) + +# Next, we illustrate conversion from samples to quantile and cdf +ex_cdf_values <- seq(-2,2,1) +ex_quantiles <- c(0.25, 0.5, 0.75) +model_out_tbl <- expand.grid( + stringsAsFactors = FALSE, + group1 = c(1,2), + model_id = "A", + output_type = "sample", + output_type_id = 1:100 +) \%>\% +dplyr::mutate(value = rnorm(200, mean = group1)) + +convert_output_type(model_out_tbl, new_output_type = c("quantile", "cdf"), + new_output_type_id = list("quantile" = ex_quantiles, "cdf" = ex_cdf_values)) + +} diff --git a/man/get_task_id_cols.Rd b/man/get_task_id_cols.Rd new file mode 100644 index 0000000..7077e3e --- /dev/null +++ b/man/get_task_id_cols.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_task_id_cols.R +\name{get_task_id_cols} +\alias{get_task_id_cols} +\title{Get task ID column names from \code{model_out_tbl} object} +\usage{ +get_task_id_cols(model_out_tbl) +} +\arguments{ +\item{model_out_tbl}{an object of class \code{model_out_tbl}} +} +\value{ +a character vector of task ID column names +} +\description{ +Get task ID column names from \code{model_out_tbl} object +} diff --git a/tests/testthat/test-convert_output_types.R b/tests/testthat/test-convert_output_types.R new file mode 100644 index 0000000..5cfa565 --- /dev/null +++ b/tests/testthat/test-convert_output_types.R @@ -0,0 +1,371 @@ +# set up toy data for tests +ex_qs <- seq(0, 1, length.out = 500)[2:499] +quantile_outputs <- expand.grid( + grp1 = 1:2, + model_id = LETTERS[1:2], + output_type = "quantile", + output_type_id = ex_qs, + stringsAsFactors = FALSE +) %>% + dplyr::mutate(mean = grp1 * ifelse(model_id == "A", 1, 3), + value = qnorm(ex_qs, mean)) %>% + dplyr::select(-mean) + +ex_ps <- seq(-2, 10, length.out = 500)[2:499] +cdf_outputs <- expand.grid( + grp1 = 1:2, + model_id = LETTERS[1:2], + output_type = "cdf", + output_type_id = ex_ps, + stringsAsFactors = FALSE +) %>% + dplyr::mutate(mean = grp1 * ifelse(model_id == "A", 1, 3), + value = pnorm(output_type_id, mean)) %>% + dplyr::select(-mean) + +### test convert_output_type() +test_that("convert_output_type works (quantile >> mean)", { + new_output_type <- "mean" + new_output_type_id <- NA + expected <- tibble::tibble( + grp1 = rep(1:2, 2), model_id = sort(rep(LETTERS[1:2], 2)) + ) %>% + dplyr::mutate(value = grp1 * ifelse(model_id == "A", 1, 3)) %>% + dplyr::mutate(output_type = new_output_type, + output_type_id = new_output_type_id) %>% + as_model_out_tbl() + test <- convert_output_type(quantile_outputs, new_output_type, new_output_type_id) + expect_equal(test, expected, tolerance = 1e-2) +}) + +test_that("convert_output_type works (quantile >> median)", { + new_output_type <- "median" + new_output_type_id <- NA + expected <- tibble::tibble( + grp1 = rep(1:2, 2), model_id = sort(rep(LETTERS[1:2], 2)) + ) %>% + dplyr:: mutate(value = grp1 * ifelse(model_id == "A", 1, 3)) %>% + dplyr::mutate(output_type = new_output_type, + output_type_id = new_output_type_id) %>% + as_model_out_tbl() + test <- convert_output_type(quantile_outputs, new_output_type, new_output_type_id) + expect_equal(test, expected, tolerance = 1e-2) +}) + +test_that("convert_output_type works (quantile >> cdf)", { + new_output_type <- "cdf" + new_output_type_id <- seq(-2, 2, 0.5) + expected <- tibble::as_tibble(expand.grid( + grp1 = 1:2, + model_id = LETTERS[1:2], + output_type = new_output_type, + output_type_id = new_output_type_id, + KEEP.OUT.ATTRS = FALSE, + stringsAsFactors = FALSE + )) %>% + dplyr:: mutate(value = pnorm(output_type_id, grp1 * ifelse(model_id == "A", 1, 3))) %>% + dplyr::arrange(model_id, grp1) %>% + as_model_out_tbl() + set.seed(101) + test <- convert_output_type(quantile_outputs, new_output_type, new_output_type_id) + expect_equal(test, expected, tolerance = 1e-2) +}) + +test_that("convert_output_type works (quantile >> cdf, median)", { + new_output_type <- c("cdf", "median") + new_output_type_id <- list(cdf = seq(-2, 2, 0.5), median = NA) + expected_median <- tibble::tibble( + grp1 = rep(1:2, 2), model_id = sort(rep(LETTERS[1:2], 2)) + ) %>% + dplyr::mutate(value = grp1 * ifelse(model_id == "A", 1, 3)) %>% + dplyr::mutate(output_type = new_output_type[2], + output_type_id = new_output_type_id[[2]]) %>% + as_model_out_tbl() + expected_cdf <- tibble::as_tibble(expand.grid( + grp1 = 1:2, + model_id = LETTERS[1:2], + output_type = new_output_type[1], + output_type_id = new_output_type_id[[1]], + KEEP.OUT.ATTRS = FALSE, + stringsAsFactors = FALSE + )) %>% + dplyr::mutate(value = pnorm(output_type_id, grp1 * ifelse(model_id == "A", 1, 3))) %>% + dplyr::arrange(model_id, grp1) %>% + as_model_out_tbl() + expected <- rbind(expected_cdf, expected_median) + set.seed(101) + test <- convert_output_type(quantile_outputs, new_output_type, new_output_type_id) + expect_equal(test, expected, tolerance = 1e-2) +}) + +test_that("convert_output_type works (cdf >> mean)", { + new_output_type <- "mean" + new_output_type_id <- NA + expected <- tibble::tibble( + grp1 = rep(1:2, 2), model_id = sort(rep(LETTERS[1:2], 2)) + ) %>% + dplyr:: mutate(value = grp1 * ifelse(model_id == "A", 1, 3)) %>% + dplyr::mutate(output_type = new_output_type, + output_type_id = new_output_type_id) %>% + as_model_out_tbl() + test <- convert_output_type(cdf_outputs, new_output_type, new_output_type_id) + expect_equal(test, expected, tolerance = 1e-2) +}) + +test_that("convert_output_type works (cdf >> median)", { + new_output_type <- "median" + new_output_type_id <- NA + expected <- tibble::tibble( + grp1 = rep(1:2, 2), model_id = sort(rep(LETTERS[1:2], 2)) + ) %>% + dplyr:: mutate(value = grp1 * ifelse(model_id == "A", 1, 3)) %>% + dplyr::mutate(output_type = new_output_type, + output_type_id = new_output_type_id) %>% + as_model_out_tbl() + test <- convert_output_type(cdf_outputs, new_output_type, new_output_type_id) + expect_equal(test, expected, tolerance = 1e-2) +}) + +test_that("convert_output_type works (cdf >> quantile)", { + new_output_type <- "quantile" + new_output_type_id <- c(0.25, 0.5, 0.75) + expected <- tibble::as_tibble(expand.grid( + grp1 = 1:2, + model_id = LETTERS[1:2], + output_type = new_output_type, + output_type_id = new_output_type_id, + KEEP.OUT.ATTRS = FALSE, + stringsAsFactors = FALSE + )) %>% + dplyr:: mutate(value = qnorm(output_type_id, grp1 * ifelse(model_id == "A", 1, 3))) %>% + dplyr::arrange(model_id, grp1) %>% + as_model_out_tbl() + test <- convert_output_type(cdf_outputs, new_output_type, new_output_type_id) + expect_equal(test, expected, tolerance = 1e-2) +}) + +test_that("convert_output_type works (sample >> quantile, cdf, mean)", { + ex_bins <- seq(-2, 2, 1) + ex_quantiles <- c(0.25, 0.5, 0.75) + model_out_tbl <- expand.grid( + grp1 = 1:2, + model_id = LETTERS[1:2], + output_type = "sample", + output_type_id = 1:1e5, + stringsAsFactors = FALSE + ) %>% + dplyr::mutate(mean = grp1 * ifelse(model_id == "A", 1, 3), + value = rnorm(dplyr::n(), mean)) %>% + dplyr::select(-mean) + new_output_type <- c("mean", "quantile", "cdf") + new_output_type_id <- list("mean" = NA, "quantile" = ex_quantiles, "cdf" = ex_bins) + expected_quantile <- tibble::as_tibble(expand.grid( + grp1 = 1:2, + model_id = LETTERS[1:2], + output_type = "quantile", + output_type_id = ex_quantiles, + KEEP.OUT.ATTRS = FALSE, + stringsAsFactors = FALSE + )) %>% + dplyr:: mutate(value = qnorm(output_type_id, grp1 * ifelse(model_id == "A", 1, 3))) %>% + dplyr::arrange(model_id, grp1) %>% + as_model_out_tbl() + expected_cdf <- tibble::as_tibble(expand.grid( + grp1 = 1:2, + model_id = LETTERS[1:2], + output_type = "cdf", + output_type_id = ex_bins, + KEEP.OUT.ATTRS = FALSE, + stringsAsFactors = FALSE + )) %>% + dplyr:: mutate(value = pnorm(output_type_id, grp1 * ifelse(model_id == "A", 1, 3))) %>% + dplyr::arrange(model_id, grp1) %>% + as_model_out_tbl() + expected_mean <- tibble::as_tibble(expand.grid( + grp1 = 1:2, + model_id = LETTERS[1:2], + output_type = "mean", + output_type_id = NA, + KEEP.OUT.ATTRS = FALSE, + stringsAsFactors = FALSE + )) %>% + dplyr:: mutate(value = grp1 * ifelse(model_id == "A", 1, 3)) %>% + dplyr::arrange(model_id, grp1) %>% + as_model_out_tbl() + expected <- dplyr::bind_rows(expected_mean, expected_quantile, expected_cdf) + test <- convert_output_type(model_out_tbl, new_output_type, new_output_type_id) + expect_equal(test, expected, tolerance = 1e-2) +}) + +test_that("convert_output_type fails correctly: multiple starting output types provided", { + model_out_tbl <- expand.grid( + grp1 = 1, + model_id = "A", + output_type = c("cdf", "quantile"), + output_type_id = 1:2, + stringsAsFactors = FALSE + ) + model_out_tbl$output_type_id <- c(5, 0.25, 10, 0.75) + new_output_type <- "mean" + expect_error(convert_output_type(model_out_tbl, new_output_type)) +}) + +test_that("convert_output_type fails correctly: wrong starting output_type", { + model_out_tbl <- expand.grid( + grp1 = 1:2, + model_id = LETTERS[1:2], + output_type = "pmf", + output_type_id = c("bin1", "bin2"), + stringsAsFactors = FALSE + ) + new_output_type <- "mean" + expect_error(convert_output_type(model_out_tbl, new_output_type)) +}) + +test_that("convert_output_type fails correctly: wrong new_output_type (quantile >> pmf)", { + model_out_tbl <- expand.grid( + grp1 = 1:2, + model_id = LETTERS[1:2], + output_type = "quantile", + output_type_id = c(0.25, 0.5, 0.75), + stringsAsFactors = FALSE + ) + new_output_type <- "pmf" + expect_error(convert_output_type(model_out_tbl, new_output_type)) +}) + +test_that("convert_output_type fails correctly: wrong new_output_type (cdf >> sample)", { + model_out_tbl <- expand.grid( + grp1 = 1:2, + model_id = LETTERS[1:2], + output_type = "cdf", + output_type_id <- -1:1, + stringsAsFactors = FALSE + ) + new_output_type <- "sample" + expect_error(convert_output_type(model_out_tbl, new_output_type)) +}) + +test_that("convert_output_type fails correctly: wrong new_output_type_id (mean)", { + model_out_tbl <- expand.grid( + grp1 = 1:2, + model_id = LETTERS[1:2], + output_type = "cdf", + output_type_id <- -1:1, + stringsAsFactors = FALSE + ) + new_output_type <- "mean" + new_output_type_id <- c("A", "B") + expect_error(convert_output_type(model_out_tbl, new_output_type, new_output_type_id)) +}) + +test_that("convert_output_type fails correctly: wrong new_output_type_id (quantile)", { + model_out_tbl <- expand.grid( + grp1 = 1:2, + model_id = LETTERS[1:2], + output_type = "cdf", + output_type_id <- -1:1, + stringsAsFactors = FALSE + ) + new_output_type <- "quantile" + new_output_type_id <- c(-1, 0, 1) + expect_error( + convert_output_type(model_out_tbl, new_output_type, new_output_type_id) + ) +}) + +test_that("convert_output_type fails correctly: wrong new_output_type_id (cdf)", { + model_out_tbl <- expand.grid( + grp1 = 1:2, + model_id = LETTERS[1:2], + output_type = "quantile", + output_type_id <- seq(0, 1, 0.5), + stringsAsFactors = FALSE + ) + new_output_type <- "cdf" + new_output_type_id <- c("A", "B") + expect_error( + convert_output_type(model_out_tbl, new_output_type, new_output_type_id) + ) +}) + +### test convert_from_sample() +test_that("convert_from_sample works (return mean)", { + grouped_model_out_tbl <- expand.grid( + grp1 = 1:2, + model_id = LETTERS[1:2], + output_type_id <- 1:5 + ) %>% + dplyr::mutate(value = grp1 * ifelse(model_id == "A", 1, 3) * output_type_id) %>% + dplyr::group_by(grp1, model_id) + new_output_type <- "mean" + new_output_type_id <- NA + expected <- grouped_model_out_tbl %>% + dplyr::reframe(value = mean(value)) %>% + dplyr::mutate(output_type = new_output_type, + output_type_id = new_output_type_id) %>% + as_model_out_tbl() + test <- convert_from_sample(grouped_model_out_tbl, new_output_type, + new_output_type_id) + expect_equal(test, expected) +}) + +test_that("convert_from_sample works (return median)", { + grouped_model_out_tbl <- expand.grid( + grp1 = 1:2, + model_id = LETTERS[1:2], + output_type_id <- 1:5 + ) %>% + dplyr::mutate(value = grp1 * ifelse(model_id == "A", 1, 3) * output_type_id) %>% + dplyr::group_by(grp1, model_id) + new_output_type <- "median" + new_output_type_id <- NA + expected <- grouped_model_out_tbl %>% + dplyr::reframe(value = median(value)) %>% + dplyr::mutate(output_type = new_output_type, + output_type_id = new_output_type_id) %>% + as_model_out_tbl() + test <- convert_from_sample(grouped_model_out_tbl, new_output_type, + new_output_type_id) + expect_equal(test, expected) +}) + +test_that("convert_from_sample works (return quantile)", { + grouped_model_out_tbl <- expand.grid( + grp1 = 1:2, + model_id = LETTERS[1:2], + output_type_id <- 1:5 + ) %>% + dplyr::mutate(value = grp1 * ifelse(model_id == "A", 1, 3) * output_type_id) %>% + dplyr::group_by(grp1, model_id) + new_output_type <- "quantile" + new_output_type_id <- c(0.25, 0.75) + expected <- grouped_model_out_tbl %>% + dplyr::reframe(value = quantile(value, new_output_type_id, names = FALSE), + output_type_id = new_output_type_id) %>% + dplyr::mutate(output_type = new_output_type) %>% + as_model_out_tbl() + test <- convert_from_sample(grouped_model_out_tbl, new_output_type, + new_output_type_id) + expect_equal(test, expected) +}) + +test_that("convert_from_sample works (return cdf)", { + grouped_model_out_tbl <- expand.grid( + grp1 = 1:2, + model_id = LETTERS[1:2], + output_type_id <- 1:5 + ) %>% + dplyr::mutate(value = grp1 * ifelse(model_id == "A", 1, 3) * output_type_id) %>% + dplyr::group_by(grp1, model_id) + new_output_type <- "cdf" + new_output_type_id <- seq(0, 30, 5) + expected <- grouped_model_out_tbl %>% + dplyr::reframe(value = ecdf(value)(new_output_type_id), + output_type_id = new_output_type_id) %>% + dplyr::mutate(output_type = new_output_type) %>% + as_model_out_tbl() + test <- convert_from_sample(grouped_model_out_tbl, new_output_type, + new_output_type_id) + expect_equal(test, expected) +}) diff --git a/tests/testthat/test-get_task_id_cols.R b/tests/testthat/test-get_task_id_cols.R new file mode 100644 index 0000000..14a6280 --- /dev/null +++ b/tests/testthat/test-get_task_id_cols.R @@ -0,0 +1,15 @@ +test_that("get_task_id_cols works", { + ex_qs <- seq(0, 1, length.out = 5) + model_outputs <- expand.grid( + grp1 = 1:2, + grp2 = 1:3, + model_id = LETTERS[1:2], + output_type = "quantile", + output_type_id = ex_qs, + stringsAsFactors = FALSE + ) + model_outputs$value <- runif(nrow(model_outputs)) + expected <- c("grp1", "grp2") + test <- get_task_id_cols(model_outputs) + expect_equal(expected, test) +})