Skip to content

Commit

Permalink
Merge pull request #82 from hubverse-org/feature/handle-samples
Browse files Browse the repository at this point in the history
Feature/ Handle V3 sample specification
  • Loading branch information
annakrystalli authored Jun 19, 2024
2 parents f6b9eb2 + 86f3a2c commit bfab4d3
Show file tree
Hide file tree
Showing 83 changed files with 10,130 additions and 141 deletions.
2 changes: 1 addition & 1 deletion .github/CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ Information required includes:

### Pull request process

* Fork the package and clone onto your computer. If you haven't done this before, we recommend using `usethis::create_from_github("Infectious-Disease-Modeling-Hubs/hubValidations", fork = TRUE)`.
* Fork the package and clone onto your computer. If you haven't done this before, we recommend using `usethis::create_from_github("hubverse-org/hubValidations", fork = TRUE)`.

* Install all development dependencies with `devtools::install_dev_deps()`, and then make sure the package passes R CMD check by running `devtools::check()`.
If R CMD check doesn't pass cleanly, it's a good idea to ask for help before continuing.
Expand Down
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/issue_template.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name: Bug report or feature request
about: Describe a bug you've seen or make a case for a new feature
---

Please briefly describe your problem and what output you expect. If you have a question, consider creating a post on our [Discussion Board](https://github.com/Infectious-Disease-Modeling-Hubs/hubValidations/discussions).
Please briefly describe your problem and what output you expect. If you have a question, consider creating a post on our [Discussion Board](https://github.com/hubverse-org/hubValidations/discussions).

Please include a minimal reproducible example (AKA a reprex). If you've never heard of a [reprex](http://reprex.tidyverse.org/) before, start by reading <https://www.tidyverse.org/help/#reprex>.

Expand Down
3 changes: 2 additions & 1 deletion .lintr
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
linters: linters_with_defaults(
line_length_linter = line_length_linter(120L),
commented_code_linter = NULL,
object_length_linter = object_length_linter(length = 50L)
object_length_linter = object_length_linter(length = 50L),
cyclocomp_linter = cyclocomp_linter(complexity_limit = 20L)
)
24 changes: 12 additions & 12 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: hubValidations
Title: Testing framework for hubverse hub validations
Version: 0.0.1
Version: 0.1.0
Authors@R: c(
person(
given = "Anna",
Expand Down Expand Up @@ -34,12 +34,12 @@ Imports:
checkmate,
cli,
config,
dplyr,
dplyr (>= 1.1.0),
fs,
gh,
hubAdmin (>= 0.1.0),
hubData (>= 0.1.0),
hubUtils (>= 0.0.1),
hubAdmin (>= 1.0.0),
hubData (>= 0.2.0),
hubUtils (>= 0.1.0),
jsonlite,
jsonvalidate,
lubridate,
Expand All @@ -60,18 +60,18 @@ Suggests:
testthis,
withr
Remotes:
Infectious-Disease-Modeling-Hubs/hubUtils,
Infectious-Disease-Modeling-Hubs/hubData,
Infectious-Disease-Modeling-Hubs/hubAdmin,
hubverse-org/hubUtils,
hubverse-org/hubData,
hubverse-org/hubAdmin,
assignUser/octolog,
apache/arrow/[email protected]
Config/testthat/edition: 3
Config/Needs/website: pkgdown, Infectious-Disease-Modeling-Hubs/hubStyle
Config/Needs/website: pkgdown, hubverse-org/hubStyle
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.1
URL: https://github.com/Infectious-Disease-Modeling-Hubs/hubValidations,
https://infectious-disease-modeling-hubs.github.io/hubValidations/
BugReports: https://github.com/Infectious-Disease-Modeling-Hubs/hubValidations/issues
URL: https://github.com/hubverse-org/hubValidations,
https://hubverse-org.github.io/hubValidations/
BugReports: https://github.com/hubverse-org/hubValidations/issues
Depends:
R (>= 3.5.0)
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ export(check_tbl_col_types)
export(check_tbl_colnames)
export(check_tbl_match_round_id)
export(check_tbl_rows_unique)
export(check_tbl_spl_compound_tid)
export(check_tbl_spl_n)
export(check_tbl_spl_non_compound_tid)
export(check_tbl_unique_round_id)
export(check_tbl_value_col)
export(check_tbl_value_col_ascending)
Expand Down
81 changes: 81 additions & 0 deletions R/check_tbl_spl_compound_tid.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#' Check that each sample in a model output data tbl holds a single, unique
#' value for every compound task ID
#' @param tbl a tibble/data.frame of the contents of the file being validated. Column types must **all be character**.
#' @inherit check_tbl_colnames params
#' @inherit check_tbl_colnames return
#' @details When the check fails, the output includes an `errors` element, a
#' list with one item per sample failing validation. Each item has the
#' following structure:
#' - `mt_id`: Index identifying the config modeling task the sample is associated with.
#' - `output_type_id`: The output type ID of the sample that does not contain a
#' single, unique value for each compound task ID.
#' - `values`: The unique values of each compound task ID.
#'
#' The check is skipped (the result of `skip_v3_spl_check()` is returned) when
#' `has_spls_tbl()` or `hubUtils::is_v3_config()` returns `FALSE`.
#' See [hubverse documentation on samples](https://hubverse.io/en/latest/user-guide/sample-output-type.html)
#' for more details.
#' @export
check_tbl_spl_compound_tid <- function(tbl, round_id, file_path, hub_path) {
  config_tasks <- hubUtils::read_config(hub_path, "tasks")

  # Nothing to validate unless both helpers report something other than FALSE.
  if (isFALSE(has_spls_tbl(tbl)) || isFALSE(hubUtils::is_v3_config(config_tasks))) {
    return(skip_v3_spl_check(file_path))
  }

  # TODO: Currently, samples must strictly match the compound task ID set expectations
  # and cannot handle coarser-grained compound task ID sets.
  spl_hashes <- spl_hash_tbl(tbl, round_id, config_tasks)
  # Rows with `n_compound_idx` above 1 are the samples that fail this check.
  failing <- spl_hashes[spl_hashes$n_compound_idx > 1L, ]
  check <- nrow(failing) == 0L

  # Both stay NULL when the check passes.
  details <- NULL
  errors <- NULL

  if (!check) {
    errors <- comptid_mismatch(failing, tbl, config_tasks, round_id)

    # `output_type_ids` is referenced by name inside the cli template below,
    # so it must keep this exact name.
    failing_ids <- purrr::flatten_chr(purrr::map(errors, ~ .x$output_type_id))
    output_type_ids <- sort(unique(failing_ids)) # nolint: object_usage_linter

    details <- cli::format_inline(
      "Sample{?s} {.val {output_type_ids}} d{?oes/o} not contain ",
      "unique compound task ID combinations. ",
      "See {.var errors} attribute for details."
    )
  }

  capture_check_cnd(
    check = check,
    file_path = file_path,
    msg_subject = "Each sample compound task ID",
    msg_attribute = "single, unique value.",
    msg_verbs = c("contains", "does not contain"),
    details = details,
    errors = errors,
    error = TRUE
  )
}

# Build the `errors` payload for check_tbl_spl_compound_tid().
#
# Arguments:
# - n_tbl: rows of the sample hash table that failed the check; the columns
#   `output_type_id` and `mt_id` are read here.
# - tbl: the model output table being validated (tibble or data.frame).
# - config_tasks, round_id: passed to get_round_compound_task_ids() to look up
#   the compound task IDs of each modeling task in the round.
#
# Returns a list with one element per row of `n_tbl`, each a list of `mt_id`,
# `output_type_id` and `values` (a named list holding, for every compound task
# ID column with more than one distinct value in that sample, its unique
# values).
comptid_mismatch <- function(n_tbl, tbl, config_tasks, round_id) {
  tbl <- tbl[tbl$output_type == "sample", , drop = FALSE]
  # Loop-invariant lookup: resolve the round's compound task IDs once rather
  # than once per failing sample.
  round_compound_taskids <- get_round_compound_task_ids(config_tasks, round_id)

  purrr::map(
    seq_along(n_tbl$output_type_id),
    ~ {
      x <- n_tbl[.x, ]
      compound_taskids <- round_compound_taskids[[x$mt_id]]
      # `drop = FALSE` keeps a data frame even when a single compound task ID
      # column is selected from a base data.frame; without it the subset
      # collapses to a bare vector and the column-wise logic below breaks.
      spl <- tbl[
        tbl$output_type_id == x$output_type_id,
        compound_taskids,
        drop = FALSE
      ] %>%
        unique()

      # Columns taking more than one distinct value within this sample.
      multi_valued <- purrr::map_lgl(spl, ~ length(unique(.x)) > 1L)

      values <- spl[, multi_valued, drop = FALSE] %>%
        as.list() %>%
        purrr::map(unique)

      list(
        mt_id = x$mt_id,
        output_type_id = x$output_type_id,
        values = values
      )
    }
  )
}
140 changes: 140 additions & 0 deletions R/check_tbl_spl_n.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
#' Check that a model output data tbl contains an appropriate number of samples
#' for each compound idx.
#' @param tbl a tibble/data.frame of the contents of the file being validated. Column types must **all be character**.
#' @inherit check_tbl_colnames params
#' @inherit check_tbl_col_types return
#' @details When the check fails, the output includes an `errors` element, a
#' list with one item per compound_idx failing validation. Each item has the
#' following structure:
#' - `compound_idx`: the compound idx that failed validation of number of samples.
#' - `n`: the number of samples counted for the compound idx.
#' - `min_samples_per_task`: the minimum number of samples required for the compound idx.
#' - `max_samples_per_task`: the maximum number of samples allowed for the compound idx.
#' - `compound_idx_tbl`: a tibble of the expected structure for samples belonging to
#' the compound idx.
#'
#' The check is skipped (the result of `skip_v3_spl_check()` is returned) when
#' `has_spls_tbl()` or `hubUtils::is_v3_config()` returns `FALSE`.
#' See [hubverse documentation on samples](https://hubverse.io/en/latest/user-guide/sample-output-type.html)
#' for more details.
#' @export
check_tbl_spl_n <- function(tbl, round_id, file_path, hub_path) {
  config_tasks <- hubUtils::read_config(hub_path, "tasks")

  # Nothing to validate unless both helpers report something other than FALSE.
  if (isFALSE(has_spls_tbl(tbl)) || isFALSE(hubUtils::is_v3_config(config_tasks))) {
    return(skip_v3_spl_check(file_path))
  }

  hash_tbl <- spl_hash_tbl(tbl, round_id, config_tasks)
  n_ranges <- get_round_spl_n_ranges(config_tasks, round_id)

  # Step 1: count distinct samples (output type IDs) per compound idx.
  spl_counts <- dplyr::summarise(
    dplyr::group_by(hash_tbl, .data$compound_idx),
    n = dplyr::n_distinct(.data$output_type_id),
    mt_id = unique(.data$mt_id)
  )

  # Step 2: attach each modeling task's configured bounds and flag counts
  # falling outside them.
  flagged <- dplyr::mutate(
    dplyr::left_join(spl_counts, n_ranges, by = "mt_id"),
    less = .data$n < .data$n_min,
    more = .data$n > .data$n_max,
    out_range = .data$less | .data$more
  )

  # Step 3: keep only the offending compound idxs.
  n_tbl <- dplyr::filter(flagged, .data$out_range)

  check <- nrow(n_tbl) == 0L

  # Both stay NULL when the check passes.
  details <- NULL
  errors <- NULL

  if (!check) {
    errors <- n_mismatch_errors(n_tbl, hash_tbl, tbl)
    details <- n_mismatch_details(n_tbl)
  }

  capture_check_cnd(
    check = check,
    file_path = file_path,
    msg_subject = "Required samples per compound idx task",
    msg_attribute = NULL,
    msg_verbs = c("present.", "not present."),
    details = details,
    errors = errors
  )
}

# Collect the configured sample-count bounds for each modeling task of a round.
#
# Returns the row-bound result of one tibble per modeling task that defines
# `output_type$sample$output_type_id_params`, with columns:
# - mt_id: the modeling task's index/name as supplied by purrr::imap().
# - n_min: `min_samples_per_task` from the config.
# - n_max: `max_samples_per_task` from the config.
# Modeling tasks without sample `output_type_id_params` contribute no row.
get_round_spl_n_ranges <- function(config_tasks, round_id) {
  model_tasks <- hubUtils::get_round_model_tasks(config_tasks, round_id)

  # One-row tibble of bounds for a single modeling task, or NULL when the task
  # has no sample `output_type_id_params`.
  spl_n_range <- function(model_task, mt_id) {
    params <- purrr::pluck(
      model_task,
      "output_type",
      "sample",
      "output_type_id_params"
    )

    if (is.null(params)) {
      return(NULL)
    }

    tibble::tibble(
      mt_id = mt_id,
      n_min = params$min_samples_per_task,
      n_max = params$max_samples_per_task
    )
  }

  purrr::list_rbind(purrr::imap(model_tasks, spl_n_range))
}

# Build the `errors` payload for check_tbl_spl_n().
#
# Arguments:
# - n_tbl: one row per compound idx whose sample count is out of range; the
#   columns `compound_idx`, `n`, `n_min` and `n_max` are read here.
# - hash_tbl: the sample hash table; used to pick one output type ID belonging
#   to each failing compound idx.
# - tbl: the model output table being validated (tibble or data.frame).
#
# Returns a list named by compound idx. Each element is a list of
# `compound_idx`, `n`, `min_samples_per_task`, `max_samples_per_task` and
# `compound_idx_tbl` (the rows of the first sample found for that compound idx,
# without the `output_type`, `output_type_id` and `value` columns).
n_mismatch_errors <- function(n_tbl, hash_tbl, tbl) {
  # `drop = FALSE` keeps a data frame regardless of how many columns remain
  # when `tbl` is a base data.frame.
  tbl <- tbl[tbl$output_type == "sample", names(tbl) != "value", drop = FALSE]
  # Loop-invariant column selection; `value` has already been removed above.
  keep_cols <- setdiff(names(tbl), c("output_type_id", "output_type"))

  purrr::map(
    purrr::set_names(n_tbl$compound_idx),
    ~ {
      # Use the first sample of this compound idx as the representative one.
      spl_d <- hash_tbl$output_type_id[hash_tbl$compound_idx == .x][1L]
      compound_idx_tbl <- tbl[
        tbl$output_type_id == spl_d,
        keep_cols,
        drop = FALSE
      ]
      # as.list() makes the intent explicit (a named list of the row's
      # columns) instead of relying on how as.vector() treats data frames.
      row <- as.list(n_tbl[n_tbl$compound_idx == .x, ])
      list(
        compound_idx = .x,
        n = row$n,
        min_samples_per_task = row$n_min,
        max_samples_per_task = row$n_max,
        compound_idx_tbl = compound_idx_tbl
      )
    }
  )
}


# Compose the `details` message for check_tbl_spl_n().
#
# `n_tbl` must carry the logical columns `less` and `more` plus `compound_idx`.
# One sentence is produced for each of "less" and "more" that has at least one
# flagged compound idx; the sentences and a trailing pointer to the `errors`
# attribute are joined with ". ".
n_mismatch_details <- function(n_tbl) {
  # cli templates keyed by mismatch type. They reference `compound_idx`, which
  # is resolved from the calling frame of cli::format_inline().
  templates <- c(
    less = paste0(
      "File contains less than the minimum required number of samples per task ",
      "for compound idx{?s} {.val {compound_idx}}"
    ),
    more = paste0(
      "File contains more than the maximum required number of samples per task ",
      "for compound idx{?s} {.val {compound_idx}}"
    )
  )

  # The parameter must stay named `compound_idx`: the templates look it up by
  # name in this function's frame.
  format_msg <- function(compound_idx, type) { # nolint: object_usage_linter
    cli::format_inline(templates[[type]])
  }

  # Message for one mismatch type, or NULL when no compound idx is flagged.
  type_msg <- function(type) {
    flagged_idx <- n_tbl[n_tbl[[type]], "compound_idx", drop = TRUE]
    if (length(flagged_idx) == 0L) {
      return(NULL)
    }
    format_msg(flagged_idx, type)
  }

  msgs <- purrr::compact(purrr::map(c("less", "more"), type_msg))

  paste(
    c(msgs, "See {.var errors} attribute for details."),
    collapse = ". "
  )
}
Loading

0 comments on commit bfab4d3

Please sign in to comment.