Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix file name parse error (#132) #133

Merged
merged 5 commits into from
Oct 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
* Added:
- new vignette on how to create custom validation checks for hub validations (#121)
- new section on how to manage additional dependencies required by custom validation functions (#22).
* Bolstered parsing of file names in `parse_file_name()` to ensure they match the specific pattern required rather than just checking patterns of interest are extracted successfully and added more fine-grained check error messages (#132).

# hubValidations 0.7.0

Expand Down
127 changes: 99 additions & 28 deletions R/parse_file_name.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,51 +29,122 @@ parse_file_name <- function(file_path, file_type = c("model_output", "model_meta
file_type <- rlang::arg_match(file_type)
checkmate::assert_string(file_path)
file_name <- tools::file_path_sans_ext(basename(file_path))
# Detect, validate and remove compression extension before validating and splitting
# file name
compression_ext <- validate_compression_ext(
compression_ext = fs::path_ext(file_name)
)
file_name <- tools::file_path_sans_ext(file_name)
validate_filename_contents(file_name)
validate_filename_pattern(file_name, file_type)

split_res <- split_filename(file_name, file_type)

list(
round_id = split_res[1],
team_abbr = split_res[2],
model_abbr = split_res[3],
model_id = paste(split_res[2], split_res[3], sep = "-"),
ext = fs::path_ext(file_path),
compression_ext = compression_ext
) %>% purrr::compact()
}

# Split file name into round_id, team_abbr and model_abbr
split_filename <- function(file_name, file_type) {
split_pattern <- stringr::regex(
"([[:digit:]]{4}-[[:digit:]]{2}-[[:digit:]]{2})|[a-z_0-9]+",
TRUE
)
compress_regex <- paste0(
"\\.(",
paste0(compress_codec, collapse = "|"),
")$"
)
split_res <- unlist(
stringr::str_extract_all(
# remove compression extension before splitting
stringr::str_remove(file_name, compress_regex),
split_pattern
)
)
split_res <- stringr::str_extract_all(file_name, split_pattern) |> unlist()

exp_n <- switch(file_type,
model_output = 3L,
model_metadata = 2L
)

if (length(split_res) != exp_n) {
cli::cli_abort(
"Could not parse file name {.path {file_name}} for submission metadata.
Please consult documentation for file name requirements for correct
metadata parsing."
Please consult
{.href [documentation on file name requirements
](https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure)} for correct metadata parsing."
)
}
if (file_type == "model_metadata") {
split_res <- c(NA, split_res)
}
# extract compression extension if present
compression_ext <- stringr::str_extract(file_name, compress_regex)
if (is.na(compression_ext)) {
compression_ext <- NULL
} else {
compression_ext <- stringr::str_remove(compression_ext, "^\\.")
split_res
}

# Function that validates a hubverse file name does not contain invalid characters
validate_filename_contents <- function(file_name, call = rlang::caller_env()) {
pattern <- "^[A-Za-z0-9_-]+$"
invalid_contents <- isFALSE(grepl(pattern, file_name))

if (invalid_contents) {

invalid_char <- stringr::str_remove_all(file_name, "[A-Za-z0-9_-]+") |> # nolint: object_usage_linter
strsplit("") |>
unlist() |>
unique()

cli::cli_abort(
c("x" = "File name {.file {file_name}} contains character{?s}
{.val {invalid_char}} that {?is/are} not allowed"),
call = call
)
}
}

# Function that validates a hubverse file name matches expected pattern:
# [round_id]-[team_abbr]-[model_abbr]
validate_filename_pattern <- function(file_name, file_type,
call = rlang::caller_env()) {
pattern <- switch(file_type,
model_output = "^((\\d{4}-\\d{2}-\\d{2})|[A-Za-z0-9_]+)-([A-Za-z0-9_]+)-([A-Za-z0-9_]+)$",
model_metadata = "^([A-Za-z0-9_]+)-([A-Za-z0-9_]+)$"
)


expected_pattern <- switch(file_type, # nolint: object_usage_linter
model_output = "[round_id]-[team_abbr]-[model_abbr]",
model_metadata = "[team_abbr]-[model_abbr]"
)


info_url <- switch(file_type, # nolint: object_usage_linter
model_output = "https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure",
model_metadata = "https://hubverse.io/en/latest/user-guide/model-metadata.html#directory-structure"
)

if (!grepl(pattern, file_name)) {
cli::cli_abort(
c("x" = "File name {.file {file_name}} does not match expected pattern of
{.field {expected_pattern}}. Please consult
{.href [documentation on file name requirements
]({info_url})}
for details."),
call = call
)
}
}

list(
round_id = split_res[1],
team_abbr = split_res[2],
model_abbr = split_res[3],
model_id = paste(split_res[2], split_res[3], sep = "-"),
ext = fs::path_ext(file_path),
compression_ext = compression_ext
) %>% purrr::compact()
# Function that validates a hubverse file name compression extension.
# Returns NULL if empty string or compression_ext if valid.
validate_compression_ext <- function(compression_ext, call = rlang::caller_env()) {
if (compression_ext == "") {
return(NULL)
}
if (!compression_ext %in% compress_codec) {
cli::cli_abort(
c("x" = "Compression extension {.val {compression_ext}} is not valid.
Must be one of {.val {compress_codec}}.
Please consult {.href [documentation on file name requirements
](https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure)}
for details."),
call = call
)
}
compression_ext
}
22 changes: 19 additions & 3 deletions tests/testthat/_snaps/parse_file_name.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,15 +88,31 @@
parse_file_name("hubBaseline.yml", file_type = "model_metadata")
Condition
Error in `parse_file_name()`:
! Could not parse file name 'hubBaseline' for submission metadata. Please consult documentation for file name requirements for correct metadata parsing.
x File name 'hubBaseline' does not match expected pattern of [team_abbr]-[model_abbr]. Please consult documentation on file name requirements (<https://hubverse.io/en/latest/user-guide/model-metadata.html#directory-structure>) for details.

# parse_file_name fails correctly

Code
parse_file_name("model-output/team1-goodmodel/2022-10-08-team1_goodmodel.csv")
Condition
Error in `parse_file_name()`:
! Could not parse file name '2022-10-08-team1_goodmodel' for submission metadata. Please consult documentation for file name requirements for correct metadata parsing.
x File name '2022-10-08-team1_goodmodel' does not match expected pattern of [round_id]-[team_abbr]-[model_abbr]. Please consult documentation on file name requirements (<https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure>) for details.

---

Code
parse_file_name("model-output/team1-goodmodel/2022-10-08-team1_goodmodel .csv")
Condition
Error in `parse_file_name()`:
x File name '2022-10-08-team1_goodmodel ' contains character " " that is not allowed

---

Code
parse_file_name("model-output/team1-goodmodel/2022-10-08-team1_goodmodel*.csv")
Condition
Error in `parse_file_name()`:
x File name '2022-10-08-team1_goodmodel*' contains character "*" that is not allowed

# parse_file_name ignores compression extensions

Expand Down Expand Up @@ -180,5 +196,5 @@
"model-output/team1-goodmodel/2022-10-08-team1-goodmodel.gzipr.parquet")
Condition
Error in `parse_file_name()`:
! Could not parse file name '2022-10-08-team1-goodmodel.gzipr' for submission metadata. Please consult documentation for file name requirements for correct metadata parsing.
x Compression extension "gzipr" is not valid. Must be one of "snappy", "gzip", "gz", "brotli", "zstd", "lz4", "lzo", and "bz2". Please consult documentation on file name requirements (<https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure>) for details.

2 changes: 1 addition & 1 deletion tests/testthat/_snaps/utils.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
get_file_round_id(file_path = "team1-goodmodel/2022-10-08-team-1-goodmodel.csv")
Condition
Error in `parse_file_name()`:
! Could not parse file name '2022-10-08-team-1-goodmodel' for submission metadata. Please consult documentation for file name requirements for correct metadata parsing.
x File name '2022-10-08-team-1-goodmodel' does not match expected pattern of [round_id]-[team_abbr]-[model_abbr]. Please consult documentation on file name requirements (<https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure>) for details.

# get_file_* utils work

Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/_snaps/validate_model_data.md
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@
validate_model_data(hub_path, file_path = "random-path.csv")
Condition
Error in `parse_file_name()`:
! Could not parse file name 'random-path' for submission metadata. Please consult documentation for file name requirements for correct metadata parsing.
x File name 'random-path' does not match expected pattern of [round_id]-[team_abbr]-[model_abbr]. Please consult documentation on file name requirements (<https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure>) for details.

# validate_model_data with v3 sample data works

Expand Down
14 changes: 14 additions & 0 deletions tests/testthat/test-parse_file_name.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,20 @@ test_that("parse_file_name fails correctly", {
),
error = TRUE
)
# Correctly detects invalid characters
expect_snapshot(
parse_file_name(
"model-output/team1-goodmodel/2022-10-08-team1_goodmodel .csv"
),
error = TRUE
)
expect_snapshot(
parse_file_name(
"model-output/team1-goodmodel/2022-10-08-team1_goodmodel*.csv"
),
error = TRUE
)

expect_error(
parse_file_name(
"model-output/team1-goodmodel/round_1-team1-good-model.parquet"
Expand Down
12 changes: 7 additions & 5 deletions tests/testthat/test-validate_pr.R
Original file line number Diff line number Diff line change
Expand Up @@ -190,11 +190,13 @@ test_that("validate_pr works on valid PR using v2.0.0 schema and old orgname", {
branch = "pr-valid"
)

checks <- validate_pr(
hub_path = temp_hub,
gh_repo = "hubverse-org/ci-testhub-simple-old-orgname",
pr_number = 1,
skip_submit_window_check = TRUE
checks <- suppressMessages(
validate_pr(
hub_path = temp_hub,
gh_repo = "hubverse-org/ci-testhub-simple-old-orgname",
pr_number = 1,
skip_submit_window_check = TRUE
)
)

expect_snapshot(str(checks))
Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/testdata/hub-now/hub-config/admin.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"schema_version": "https://raw.githubusercontent.com/Infectious-Disease-Modeling-Hubs/schemas/main/v3.0.1/admin-schema.json",
"schema_version": "https://raw.githubusercontent.com/hubverse-org/schemas/main/v3.0.1/admin-schema.json",
"name": "SARS-CoV-2 Variant Nowcast Hub",
"maintainer": "jbloggs @ UMass-Amherst",
"contact": {
Expand Down
Loading