From 9aa46d7cab0e6b3ebcb332a8fb7b27856204352f Mon Sep 17 00:00:00 2001 From: Anna Krystalli Date: Mon, 25 Sep 2023 12:37:44 +0300 Subject: [PATCH] Create function opt_check_tbl_horizon_timediff. Resolves #31 --- NAMESPACE | 1 + R/opt_check_tbl_horizon_timediff.R | 79 +++++++++++++++++ .../flusight/hub-config/validations.yml | 7 +- man/opt_check_tbl_horizon_timediff.Rd | 54 ++++++++++++ .../testthat/_snaps/execute_custom_checks.md | 4 +- .../_snaps/opt_check_tbl_horizon_timediff.md | 72 +++++++++++++++ tests/testthat/_snaps/validate_model_data.md | 8 +- .../test-opt_check_tbl_horizon_timediff.R | 88 +++++++++++++++++++ tests/testthat/test-validate_model_data.R | 4 +- .../testdata/config/validations-error.yml | 11 ++- .../testthat/testdata/config/validations.yml | 5 +- vignettes/articles/custom-functions.Rmd | 22 ++++- 12 files changed, 331 insertions(+), 24 deletions(-) create mode 100644 R/opt_check_tbl_horizon_timediff.R create mode 100644 man/opt_check_tbl_horizon_timediff.Rd create mode 100644 tests/testthat/_snaps/opt_check_tbl_horizon_timediff.md create mode 100644 tests/testthat/test-opt_check_tbl_horizon_timediff.R diff --git a/NAMESPACE b/NAMESPACE index 96da6584..cc98c95f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -41,6 +41,7 @@ export(is_info) export(is_success) export(not_pass) export(opt_check_tbl_col_timediff) +export(opt_check_tbl_horizon_timediff) export(read_model_out_file) export(try_check) export(validate_model_data) diff --git a/R/opt_check_tbl_horizon_timediff.R b/R/opt_check_tbl_horizon_timediff.R new file mode 100644 index 00000000..8b75254b --- /dev/null +++ b/R/opt_check_tbl_horizon_timediff.R @@ -0,0 +1,79 @@ +#' Check time difference between values in two date columns equal a defined period. +#' +#' @param t0_colname Character string. The name of the time zero date column. +#' @param t1_colname Character string. The name of the time zero + 1 time step date column. +#' @param horizon_colname Character string. The name of the horizon column. 
+#' Defaults to `"horizon"`. +#' @param timediff an object of class `lubridate` [`Period-class`] and length 1. +#' The period of a single horizon. Default to 1 week. +#' @inherit check_tbl_colnames params +#' @inherit check_tbl_col_types return +#' @export +opt_check_tbl_horizon_timediff <- function(tbl, file_path, hub_path, t0_colname, + t1_colname, horizon_colname = "horizon", + timediff = lubridate::weeks()) { + checkmate::assert_class(timediff, "Period") + checkmate::assert_scalar(timediff) + checkmate::assert_character(t0_colname, len = 1L) + checkmate::assert_character(t1_colname, len = 1L) + checkmate::assert_character(horizon_colname, len = 1L) + checkmate::assert_choice(t0_colname, choices = names(tbl)) + checkmate::assert_choice(t1_colname, choices = names(tbl)) + checkmate::assert_choice(horizon_colname, choices = names(tbl)) + + config_tasks <- hubUtils::read_config(hub_path, "tasks") + schema <- hubUtils::create_hub_schema(config_tasks, + partitions = NULL, + r_schema = TRUE + ) + assert_column_date(t0_colname, schema) + assert_column_date(t1_colname, schema) + assert_column_integer(horizon_colname, schema) + + if (!lubridate::is.Date(tbl[[t0_colname]])) { + tbl[, t0_colname] <- as.Date(tbl[[t0_colname]]) + } + if (!lubridate::is.Date(tbl[[t1_colname]])) { + tbl[, t1_colname] <- as.Date(tbl[[t1_colname]]) + } + if (!is.integer(tbl[[horizon_colname]])) { + tbl[, horizon_colname] <- as.integer(tbl[[horizon_colname]]) + } + + compare <- tbl[[t0_colname]] + (timediff * tbl[[horizon_colname]]) == tbl[[t1_colname]] + check <- all(compare) + if (check) { + details <- NULL + } else { + invalid_vals <- paste0( + tbl[[t1_colname]][!compare], + " (horizon = ", tbl[[horizon_colname]][!compare], ")" + ) %>% unique() + + details <- cli::format_inline( + "t1 var value{?s} {.val {invalid_vals}} are invalid." 
+ ) + } + + capture_check_cnd( + check = check, + file_path = file_path, + msg_subject = cli::format_inline( + "Time differences between t0 var {.var {t0_colname}} and t1 var + {.var {t1_colname}}" + ), + msg_verbs = c("all match", "do not all match"), + msg_attribute = cli::format_inline("expected period of {.val {timediff}} * {.var {horizon_colname}}."), + details = details + ) +} + +assert_column_integer <- function(colname, schema) { + if (schema[colname] != "integer") { + cli::cli_abort( + "Column {.arg {colname}} must be configured as {.cls integer} not + {.cls {schema[colname]}}.", + call = rlang::caller_call() + ) + } +} diff --git a/inst/testhubs/flusight/hub-config/validations.yml b/inst/testhubs/flusight/hub-config/validations.yml index e36059e9..7727f434 100644 --- a/inst/testhubs/flusight/hub-config/validations.yml +++ b/inst/testhubs/flusight/hub-config/validations.yml @@ -1,9 +1,10 @@ default: validate_model_data: - col_timediff: - fn: "opt_check_tbl_col_timediff" + horizon_timediff: + fn: "opt_check_tbl_horizon_timediff" pkg: "hubValidations" args: t0_colname: "forecast_date" t1_colname: "target_end_date" - timediff: !expr lubridate::weeks(2) + horizon_colname: "horizon" + timediff: !expr lubridate::weeks() diff --git a/man/opt_check_tbl_horizon_timediff.Rd b/man/opt_check_tbl_horizon_timediff.Rd new file mode 100644 index 00000000..6a26fed0 --- /dev/null +++ b/man/opt_check_tbl_horizon_timediff.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/opt_check_tbl_horizon_timediff.R +\name{opt_check_tbl_horizon_timediff} +\alias{opt_check_tbl_horizon_timediff} +\title{Check time difference between values in two date columns equal a defined period.} +\usage{ +opt_check_tbl_horizon_timediff( + tbl, + file_path, + hub_path, + t0_colname, + t1_colname, + horizon_colname = "horizon", + timediff = lubridate::weeks() +) +} +\arguments{ +\item{tbl}{a tibble/data.frame of the contents of the file being validated.} +
+\item{file_path}{character string. Path to the file being validated relative to +the hub's model-output directory.} + +\item{hub_path}{Either a character string path to a local Modeling Hub directory +or an object of class \verb{<SubTreeFileSystem>} created using functions \code{\link[hubUtils:s3_bucket]{s3_bucket()}} +or \code{\link[hubUtils:gs_bucket]{gs_bucket()}} by providing a string S3 or GCS bucket name or path to a +Modeling Hub directory stored in the cloud. +For more details consult the +\href{https://arrow.apache.org/docs/r/articles/fs.html}{Using cloud storage (S3, GCS)} +in the \code{arrow} package. +The hub must be fully configured with valid \code{admin.json} and \code{tasks.json} +files within the \code{hub-config} directory.} + +\item{t0_colname}{Character string. The name of the time zero date column.} + +\item{t1_colname}{Character string. The name of the time zero + 1 time step date column.} + +\item{horizon_colname}{Character string. The name of the horizon column. +Defaults to \code{"horizon"}.} + +\item{timediff}{an object of class \code{lubridate} \code{\linkS4class{Period}} and length 1. +The period of a single horizon. Default to 1 week.} +} +\value{ +Depending on whether validation has succeeded, one of: +\itemize{ +\item \verb{<message/check_success>} condition class object. +\item \verb{<error/check_failure>} condition class object. +} + +Returned object also inherits from subclass \verb{<hub_check>}. +} +\description{ +Check time difference between values in two date columns equal a defined period.
+} diff --git a/tests/testthat/_snaps/execute_custom_checks.md b/tests/testthat/_snaps/execute_custom_checks.md index 1eae02b7..d360a75c 100644 --- a/tests/testthat/_snaps/execute_custom_checks.md +++ b/tests/testthat/_snaps/execute_custom_checks.md @@ -5,7 +5,7 @@ "testdata", "config", "validations.yml"))) Output List of 1 - $ col_timediff:List of 4 + $ horizon_timediff:List of 4 ..$ message : chr "Time differences between t0 var `forecast_date` and t1 var\n `target_end_date` all match expected period"| __truncated__ ..$ where : chr "hub-ensemble/2023-05-08-hub-ensemble.parquet" ..$ call : NULL @@ -20,7 +20,7 @@ "testdata", "config", "validations-error.yml"))) Output List of 1 - $ col_timediff:List of 4 + $ horizon_timediff:List of 4 ..$ message : chr "Time differences between t0 var `forecast_date` and t1 var\n `target_end_date` do not all match expected"| __truncated__ ..$ where : chr "hub-ensemble/2023-05-08-hub-ensemble.parquet" ..$ call : NULL diff --git a/tests/testthat/_snaps/opt_check_tbl_horizon_timediff.md b/tests/testthat/_snaps/opt_check_tbl_horizon_timediff.md new file mode 100644 index 00000000..129e0bf3 --- /dev/null +++ b/tests/testthat/_snaps/opt_check_tbl_horizon_timediff.md @@ -0,0 +1,72 @@ +# opt_check_tbl_horizon_timediff works + + Code + opt_check_tbl_horizon_timediff(tbl, file_path, hub_path, t0_colname = "forecast_date", + t1_colname = "target_end_date") + Output + + Message: + Time differences between t0 var `forecast_date` and t1 var `target_end_date` all match expected period of 7d 0H 0M 0S * `horizon`. + +--- + + Code + opt_check_tbl_horizon_timediff(tbl_chr, file_path, hub_path, t0_colname = "forecast_date", + t1_colname = "target_end_date") + Output + + Message: + Time differences between t0 var `forecast_date` and t1 var `target_end_date` all match expected period of 7d 0H 0M 0S * `horizon`. 
+ +--- + + Code + opt_check_tbl_horizon_timediff(tbl, file_path, hub_path, t0_colname = "forecast_date", + t1_colname = "target_end_date") + Output + + Warning: + Time differences between t0 var `forecast_date` and t1 var `target_end_date` do not all match expected period of 7d 0H 0M 0S * `horizon`. t1 var value "2023-05-22 (horizon = 1)" are invalid. + +--- + + Code + opt_check_tbl_horizon_timediff(tbl, file_path, hub_path, t0_colname = "forecast_date", + t1_colname = "target_end_date", timediff = lubridate::weeks(2)) + Output + + Warning: + Time differences between t0 var `forecast_date` and t1 var `target_end_date` do not all match expected period of 14d 0H 0M 0S * `horizon`. t1 var values "2023-05-15 (horizon = 1)" and "2023-05-22 (horizon = 2)" are invalid. + +# opt_check_tbl_horizon_timediff fails correctly + + Code + opt_check_tbl_horizon_timediff(tbl, file_path, hub_path, t0_colname = "forecast_date", + t1_colname = "target_end_dates") + Error + Assertion on 't1_colname' failed: Must be element of set {'forecast_date','target_end_date','horizon','target','location','output_type','output_type_id','value'}, but is 'target_end_dates'. + +--- + + Code + opt_check_tbl_horizon_timediff(tbl, file_path, hub_path, t0_colname = "forecast_date", + t1_colname = c("target_end_date", "forecast_date")) + Error + Assertion on 't1_colname' failed: Must have length 1, but has length 2. + +--- + + Code + opt_check_tbl_horizon_timediff(tbl, file_path, hub_path, t0_colname = "forecast_date", + t1_colname = "target_end_date", timediff = 7L) + Error + Assertion on 'timediff' failed: Must inherit from class 'Period', but has class 'integer'. + +--- + + Code + opt_check_tbl_horizon_timediff(tbl, file_path, hub_path, t0_colname = "forecast_date", + t1_colname = "target_end_date") + Error + Column `colname` must be configured as not . 
+ diff --git a/tests/testthat/_snaps/validate_model_data.md b/tests/testthat/_snaps/validate_model_data.md index 23b09737..e0678872 100644 --- a/tests/testthat/_snaps/validate_model_data.md +++ b/tests/testthat/_snaps/validate_model_data.md @@ -218,22 +218,22 @@ # validate_model_data with config function works Code - validate_model_data(hub_path, file_path)[["col_timediff"]] + validate_model_data(hub_path, file_path)[["horizon_timediff"]] Output Message: - Time differences between t0 var `forecast_date` and t1 var `target_end_date` all match expected period of 14d 0H 0M 0S. + Time differences between t0 var `forecast_date` and t1 var `target_end_date` all match expected period of 7d 0H 0M 0S * `horizon`. --- Code validate_model_data(hub_path, file_path, validations_cfg_path = system.file( "testhubs/flusight/hub-config/validations.yml", package = "hubValidations"))[[ - "col_timediff"]] + "horizon_timediff"]] Output Message: - Time differences between t0 var `forecast_date` and t1 var `target_end_date` all match expected period of 14d 0H 0M 0S. + Time differences between t0 var `forecast_date` and t1 var `target_end_date` all match expected period of 7d 0H 0M 0S * `horizon`. 
# validate_model_data print method work [plain] diff --git a/tests/testthat/test-opt_check_tbl_horizon_timediff.R b/tests/testthat/test-opt_check_tbl_horizon_timediff.R new file mode 100644 index 00000000..2568caad --- /dev/null +++ b/tests/testthat/test-opt_check_tbl_horizon_timediff.R @@ -0,0 +1,88 @@ +test_that("opt_check_tbl_horizon_timediff works", { + hub_path <- system.file("testhubs/flusight", package = "hubValidations") + file_path <- "hub-ensemble/2023-05-08-hub-ensemble.parquet" + tbl <- hubValidations::read_model_out_file(file_path, hub_path) + + + expect_snapshot( + opt_check_tbl_horizon_timediff(tbl, file_path, hub_path, + t0_colname = "forecast_date", + t1_colname = "target_end_date" + ) + ) + + tbl_chr <- hubUtils::coerce_to_character(tbl) + expect_snapshot( + opt_check_tbl_horizon_timediff(tbl_chr, file_path, hub_path, + t0_colname = "forecast_date", + t1_colname = "target_end_date" + ) + ) + + tbl$target_end_date[1] <- tbl$forecast_date[1] + lubridate::weeks(2) + expect_snapshot( + opt_check_tbl_horizon_timediff(tbl, file_path, hub_path, + t0_colname = "forecast_date", + t1_colname = "target_end_date" + ) + ) + + expect_snapshot( + opt_check_tbl_horizon_timediff(tbl, file_path, hub_path, + t0_colname = "forecast_date", + t1_colname = "target_end_date", + timediff = lubridate::weeks(2) + ) + ) +}) + + +test_that("opt_check_tbl_horizon_timediff fails correctly", { + hub_path <- system.file("testhubs/flusight", package = "hubValidations") + file_path <- "hub-ensemble/2023-05-08-hub-ensemble.parquet" + tbl <- hubValidations::read_model_out_file(file_path, hub_path) + + expect_snapshot( + opt_check_tbl_horizon_timediff(tbl, file_path, hub_path, + t0_colname = "forecast_date", + t1_colname = "target_end_dates" + ), + error = TRUE + ) + + expect_snapshot( + opt_check_tbl_horizon_timediff(tbl, file_path, hub_path, + t0_colname = "forecast_date", + t1_colname = c("target_end_date", "forecast_date") + ), + error = TRUE + ) + + expect_snapshot( + 
opt_check_tbl_horizon_timediff(tbl, file_path, hub_path, + t0_colname = "forecast_date", + t1_colname = "target_end_date", + timediff = 7L + ), + error = TRUE + ) + + schema <- c( + forecast_date = "Date", target = "character", horizon = "integer", + location = "character", output_type = "character", output_type_id = "character", + value = "double", target_end_date = "character" + ) + mockery::stub( + opt_check_tbl_horizon_timediff, + "hubUtils::create_hub_schema", + schema, + 2 + ) + expect_snapshot( + opt_check_tbl_horizon_timediff(tbl, file_path, hub_path, + t0_colname = "forecast_date", + t1_colname = "target_end_date" + ), + error = TRUE + ) +}) diff --git a/tests/testthat/test-validate_model_data.R b/tests/testthat/test-validate_model_data.R index 2c177f00..2ec528c8 100644 --- a/tests/testthat/test-validate_model_data.R +++ b/tests/testthat/test-validate_model_data.R @@ -39,7 +39,7 @@ test_that("validate_model_data with config function works", { hub_path <- system.file("testhubs/flusight", package = "hubValidations") file_path <- "hub-ensemble/2023-05-08-hub-ensemble.parquet" expect_snapshot( - validate_model_data(hub_path, file_path)[["col_timediff"]] + validate_model_data(hub_path, file_path)[["horizon_timediff"]] ) expect_snapshot( validate_model_data( @@ -48,7 +48,7 @@ test_that("validate_model_data with config function works", { "testhubs/flusight/hub-config/validations.yml", package = "hubValidations" ) - )[["col_timediff"]] + )[["horizon_timediff"]] ) }) diff --git a/tests/testthat/testdata/config/validations-error.yml b/tests/testthat/testdata/config/validations-error.yml index 11f7caec..7303d5f4 100644 --- a/tests/testthat/testdata/config/validations-error.yml +++ b/tests/testthat/testdata/config/validations-error.yml @@ -1,19 +1,18 @@ default: test_custom_checks_caller: - col_timediff: - fn: "opt_check_tbl_col_timediff" + horizon_timediff: + fn: "opt_check_tbl_horizon_timediff" pkg: "hubValidations" args: t0_colname: "forecast_date" t1_colname: 
"target_end_date" - timediff: !expr lubridate::weeks(2) 2023-05-08: test_custom_checks_caller: - col_timediff: - fn: "opt_check_tbl_col_timediff" + horizon_timediff: + fn: "opt_check_tbl_horizon_timediff" pkg: "hubValidations" args: t0_colname: "forecast_date" t1_colname: "target_end_date" - timediff: !expr lubridate::weeks(1) + timediff: !expr lubridate::weeks(2) diff --git a/tests/testthat/testdata/config/validations.yml b/tests/testthat/testdata/config/validations.yml index 0f5652b4..f512456c 100644 --- a/tests/testthat/testdata/config/validations.yml +++ b/tests/testthat/testdata/config/validations.yml @@ -1,9 +1,8 @@ default: test_custom_checks_caller: - col_timediff: - fn: "opt_check_tbl_col_timediff" + horizon_timediff: + fn: "opt_check_tbl_horizon_timediff" pkg: "hubValidations" args: t0_colname: "forecast_date" t1_colname: "target_end_date" - timediff: !expr lubridate::weeks(2) diff --git a/vignettes/articles/custom-functions.Rmd b/vignettes/articles/custom-functions.Rmd index 3c4c7796..77152010 100644 --- a/vignettes/articles/custom-functions.Rmd +++ b/vignettes/articles/custom-functions.Rmd @@ -56,20 +56,35 @@ Note that each of the `validate_*()` functions contain a standard objects in the - `round_id`: character string of `round_id` - `file_meta`: named list containing `round_id`, `team_abbr`, `model_abbr` and `model_id` details. -The `args` configuration can be used to override objects from the caller environment. +The `args` configuration can be used to override objects from the caller environment as well as defaults. -Here's an example configuration for a single check (`opt_check_tbl_col_timediff()`) to be run as part of the `validate_model_data()` validation function which checks the content of the model data submission files. +Here's an example configuration for a single check (`opt_check_tbl_horizon_timediff()`) to be run as part of the `validate_model_data()` validation function which checks the content of the model data submission files. 
```{r, eval=FALSE, code=readLines(system.file('testhubs/flusight/hub-config/validations.yml', package = 'hubValidations'))} ``` +The above configuration file explicitly sets the arguments `horizon_colname` and `timediff` to their default values (`"horizon"` and `lubridate::weeks()` respectively), so both entries could be omitted. We can use the `validations.yml` `args` list to override the default values. Here's an example that overrides both and includes **executable R code** as the value of an argument. + +```yml +default: + validate_model_data: + horizon_timediff: + fn: "opt_check_tbl_horizon_timediff" + pkg: "hubValidations" + args: + t0_colname: "forecast_date" + t1_colname: "target_end_date" + horizon_colname: "horizons" + timediff: !expr lubridate::weeks(2) +``` + ### Round specific configuration Additional round specific configurations can be included in `validation.yml` that can add to or override default configurations. -For example, in the following `validation.yml`, if the file being validated is being submitted to a round with round ID `"2023-08-15"`, default `col_timediff` check configuration will be overiden by the `2023-08-15` configuration. +For example, in the following `validations.yml`, which deploys the `opt_check_tbl_col_timediff()` optional check, if the file being validated is being submitted to a round with round ID `"2023-08-15"`, the default `col_timediff` check configuration will be overridden by the `2023-08-15` configuration. ```yml default: @@ -80,7 +95,6 @@ default: args: t0_colname: "forecast_date" t1_colname: "target_end_date" - timediff: !expr lubridate::weeks(2) 2023-08-15: validate_model_data: