Skip to content

Commit

Permalink
Merge pull request #146 from hubverse-org/ak/timediffs-ignore-NAs/140
Browse files Browse the repository at this point in the history
Allow for NA values in optional horizon time difference checks (#140)
  • Loading branch information
annakrystalli authored Oct 31, 2024
2 parents f0b1406 + a835ba0 commit a75e337
Show file tree
Hide file tree
Showing 7 changed files with 141 additions and 0 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

* Downgrade result of missing model metadata file check from `check_error` to `check_failure` and suppress early return in case of check failure in `validate_model_file()` (#138).
* Add `check_file_n()` function to validate that the number of files submitted per round does not exceed the allowed number of submissions per team (#139).
* Ignore `NA`s in relevant `tbl` columns in `opt_check_tbl_col_timediff()` and `opt_check_tbl_horizon_timediff()` checks to ensure rows that may not be targeting relevant to modeling task do not cause false check failure. (#140).

# hubValidations 0.7.1

Expand Down
19 changes: 19 additions & 0 deletions R/opt_check_tbl_col_timediff.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,20 @@ opt_check_tbl_col_timediff <- function(tbl, file_path, hub_path,
assert_column_date(t0_colname, schema)
assert_column_date(t1_colname, schema)

# Subset tbl to only relevant columns for check and complete cases to perform
# checks. This ensures non-relevant model task rows are ignored.
tbl <- subset_check_tbl(tbl, c(t0_colname, t1_colname))
# If no rows returned by sub-setting for complete cases of relevant columns,
# skip check by returning capture_check_info object early.
if (nrow(tbl) == 0) {
return(
capture_check_info(
file_path = file_path,
msg = "No relevant data to check. Check skipped."
)
)
}

if (!lubridate::is.Date(tbl[[t0_colname]])) {
tbl[, t0_colname] <- as.Date(tbl[[t0_colname]])
}
Expand Down Expand Up @@ -73,3 +87,8 @@ assert_column_date <- function(colname, schema) {
)
}
}

subset_check_tbl <- function(tbl, check_cols) {
tbl <- tbl[, check_cols]
tbl[stats::complete.cases(tbl), ]
}
14 changes: 14 additions & 0 deletions R/opt_check_tbl_horizon_timediff.R
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,20 @@ opt_check_tbl_horizon_timediff <- function(tbl, file_path, hub_path, t0_colname,
assert_column_date(t1_colname, schema)
assert_column_integer(horizon_colname, schema)

# Subset tbl to only relevant columns for check and complete cases to perform
# checks. This ensures non-relevant model task rows are ignored.
tbl <- subset_check_tbl(tbl, c(t0_colname, t1_colname, horizon_colname))
# If no rows returned by sub-setting for complete cases of relevant columns,
# skip check by returning capture_check_info object early.
if (nrow(tbl) == 0) {
return(
capture_check_info(
file_path = file_path,
msg = "No relevant data to check. Check skipped."
)
)
}

if (!lubridate::is.Date(tbl[[t0_colname]])) {
tbl[, t0_colname] <- as.Date(tbl[[t0_colname]])
}
Expand Down
10 changes: 10 additions & 0 deletions tests/testthat/_snaps/opt_check_tbl_col_timediff.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,13 @@
Error in `opt_check_tbl_col_timediff()`:
! Column `colname` must be configured as <Date> not <character>.

# handling of NAs in opt_check_tbl_col_timediff works

Code
opt_check_tbl_col_timediff(tbl, file_path, hub_path, t0_colname = "forecast_date",
t1_colname = "target_end_date")
Output
<message/check_info>
Message:
No relevant data to check. Check skipped.

10 changes: 10 additions & 0 deletions tests/testthat/_snaps/opt_check_tbl_horizon_timediff.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,13 @@
Error in `opt_check_tbl_horizon_timediff()`:
! Column `colname` must be configured as <Date> not <character>.

# handling of NAs in opt_check_tbl_horizon_timediff works

Code
opt_check_tbl_horizon_timediff(tbl, file_path, hub_path, t0_colname = "forecast_date",
t1_colname = "target_end_date")
Output
<message/check_info>
Message:
No relevant data to check. Check skipped.

40 changes: 40 additions & 0 deletions tests/testthat/test-opt_check_tbl_col_timediff.R
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,43 @@ test_that("opt_check_tbl_col_timediff fails correctly", {
error = TRUE
)
})


test_that("handling of NAs in opt_check_tbl_col_timediff works", {
hub_path <- system.file("testhubs/flusight", package = "hubValidations")
file_path <- "hub-ensemble/2023-05-08-hub-ensemble.parquet"
tbl <- hubValidations::read_model_out_file(file_path, hub_path)
tbl$target_end_date <- tbl$forecast_date + lubridate::weeks(2)

# This should pass
tbl$forecast_date[1] <- NA
expect_s3_class(
opt_check_tbl_col_timediff(tbl, file_path, hub_path,
t0_colname = "forecast_date",
t1_colname = "target_end_date",
timediff = lubridate::weeks(2)
),
c("check_success", "hub_check", "rlang_message", "message", "condition"),
exact = TRUE
)

# This should pass
tbl$target_end_date[1:3] <- NA
expect_s3_class(
opt_check_tbl_col_timediff(tbl, file_path, hub_path,
t0_colname = "forecast_date",
t1_colname = "target_end_date"
),
c("check_success", "hub_check", "rlang_message", "message", "condition"),
exact = TRUE
)

# This should be skipped
tbl$target_end_date <- NA
expect_snapshot(
opt_check_tbl_col_timediff(tbl, file_path, hub_path,
t0_colname = "forecast_date",
t1_colname = "target_end_date"
)
)
})
47 changes: 47 additions & 0 deletions tests/testthat/test-opt_check_tbl_horizon_timediff.R
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,50 @@ test_that("opt_check_tbl_horizon_timediff fails correctly", {
error = TRUE
)
})

test_that("handling of NAs in opt_check_tbl_horizon_timediff works", {
hub_path <- system.file("testhubs/flusight", package = "hubValidations")
file_path <- "hub-ensemble/2023-05-08-hub-ensemble.parquet"
tbl <- hubValidations::read_model_out_file(file_path, hub_path)

# This should pass
tbl$forecast_date[1] <- NA
expect_s3_class(
opt_check_tbl_horizon_timediff(tbl, file_path, hub_path,
t0_colname = "forecast_date",
t1_colname = "target_end_date"
),
c("check_success", "hub_check", "rlang_message", "message", "condition"),
exact = TRUE
)

# This should pass
tbl$target_end_date[1:3] <- NA
expect_s3_class(
opt_check_tbl_horizon_timediff(tbl, file_path, hub_path,
t0_colname = "forecast_date",
t1_colname = "target_end_date"
),
c("check_success", "hub_check", "rlang_message", "message", "condition"),
exact = TRUE
)

tbl$horizon[8:15] <- NA
expect_s3_class(
opt_check_tbl_horizon_timediff(tbl, file_path, hub_path,
t0_colname = "forecast_date",
t1_colname = "target_end_date"
),
c("check_success", "hub_check", "rlang_message", "message", "condition"),
exact = TRUE
)

# This should be skipped
tbl$target_end_date <- NA
expect_snapshot(
opt_check_tbl_horizon_timediff(tbl, file_path, hub_path,
t0_colname = "forecast_date",
t1_colname = "target_end_date"
)
)
})

0 comments on commit a75e337

Please sign in to comment.