hubverse-org · annakrystalli · Oct 24, 2024 · Oct 24, 2024 · Oct 24, 2024 · Oct 24, 2024
diff --git a/NEWS.md b/NEWS.md
@@ -3,6 +3,7 @@
 * Added:
   - new vignette on how to create custom validation checks for hub validations (#121) 
   - new section on how to manage additional dependencies required by custom validation functions (#22).
+* Bolstered parsing of file names in `parse_file_name()` to ensure they match the specific pattern required rather than just checking patterns of interest are extracted successfully and added more fine-grained check error messages  (#132).
 
 # hubValidations 0.7.0
 

diff --git a/R/parse_file_name.R b/R/parse_file_name.R
@@ -29,51 +29,122 @@ parse_file_name <- function(file_path, file_type = c("model_output", "model_meta
   file_type <- rlang::arg_match(file_type)
   checkmate::assert_string(file_path)
   file_name <- tools::file_path_sans_ext(basename(file_path))
+  # Detect, validate and remove compression extension before validating and splitting
+  # file name
+  compression_ext <- validate_compression_ext(
+    compression_ext = fs::path_ext(file_name)
+  )
+  file_name <- tools::file_path_sans_ext(file_name)
+  validate_filename_contents(file_name)
+  validate_filename_pattern(file_name, file_type)
+
+  split_res <- split_filename(file_name, file_type)
 
+  list(
+    round_id = split_res[1],
+    team_abbr = split_res[2],
+    model_abbr = split_res[3],
+    model_id = paste(split_res[2], split_res[3], sep = "-"),
+    ext = fs::path_ext(file_path),
+    compression_ext = compression_ext
+  ) %>% purrr::compact()
+}
+
+# Split file name into round_id, team_abbr and model_abbr
+split_filename <- function(file_name, file_type) {
   split_pattern <- stringr::regex(
     "([[:digit:]]{4}-[[:digit:]]{2}-[[:digit:]]{2})|[a-z_0-9]+",
     TRUE
   )
-  compress_regex <- paste0(
-    "\\.(",
-    paste0(compress_codec, collapse = "|"),
-    ")$"
-  )
-  split_res <- unlist(
-    stringr::str_extract_all(
-      # remove compression extension before splitting
-      stringr::str_remove(file_name, compress_regex),
-      split_pattern
-    )
-  )
+  split_res <- stringr::str_extract_all(file_name, split_pattern) |> unlist()
+
   exp_n <- switch(file_type,
     model_output = 3L,
     model_metadata = 2L
   )
+
   if (length(split_res) != exp_n) {
     cli::cli_abort(
       "Could not parse file name {.path {file_name}} for submission metadata.
-      Please consult documentation for file name requirements for correct
-      metadata parsing."
+      Please consult
+      {.href [documentation on file name requirements
+      ](https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure)} for correct metadata parsing."
     )
   }
   if (file_type == "model_metadata") {
     split_res <- c(NA, split_res)
   }
-  # extract compression extension if present
-  compression_ext <- stringr::str_extract(file_name, compress_regex)
-  if (is.na(compression_ext)) {
-    compression_ext <- NULL
-  } else {
-    compression_ext <- stringr::str_remove(compression_ext, "^\\.")
+  split_res
+}
+
+# Function that validates a hubverse file name does not contain invalid characters
+validate_filename_contents <- function(file_name, call = rlang::caller_env()) {
+  pattern <- "^[A-Za-z0-9_-]+$"
+  invalid_contents <- isFALSE(grepl(pattern, file_name))
+
+  if (invalid_contents) {
+
+    invalid_char <- stringr::str_remove_all(file_name, "[A-Za-z0-9_-]+") |> # nolint: object_usage_linter
+      strsplit("") |>
+      unlist() |>
+      unique()
+
+    cli::cli_abort(
+      c("x" = "File name {.file {file_name}} contains character{?s}
+        {.val {invalid_char}} that {?is/are} not allowed"),
+      call = call
+    )
+  }
+}
+
+# Function that validates a hubverse file name matches expected pattern:
+# [round_id]-[team_abbr]-[model_abbr]
+validate_filename_pattern <- function(file_name, file_type,
+                                      call = rlang::caller_env()) {
+  pattern <- switch(file_type,
+    model_output = "^((\\d{4}-\\d{2}-\\d{2})|[A-Za-z0-9_]+)-([A-Za-z0-9_]+)-([A-Za-z0-9_]+)$",
+    model_metadata = "^([A-Za-z0-9_]+)-([A-Za-z0-9_]+)$"
+  )
+
+
+  expected_pattern <- switch(file_type, # nolint: object_usage_linter
+    model_output = "[round_id]-[team_abbr]-[model_abbr]",
+    model_metadata = "[team_abbr]-[model_abbr]"
+  )
+
+
+  info_url <- switch(file_type, # nolint: object_usage_linter
+    model_output = "https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure",
+    model_metadata = "https://hubverse.io/en/latest/user-guide/model-metadata.html#directory-structure"
+  )
+
+  if (!grepl(pattern, file_name)) {
+    cli::cli_abort(
+      c("x" = "File name {.file {file_name}} does not match expected pattern of
+      {.field {expected_pattern}}. Please consult
+      {.href [documentation on file name requirements
+      ]({info_url})}
+      for details."),
+      call = call
+    )
   }
+}
 
-  list(
-    round_id = split_res[1],
-    team_abbr = split_res[2],
-    model_abbr = split_res[3],
-    model_id = paste(split_res[2], split_res[3], sep = "-"),
-    ext = fs::path_ext(file_path),
-    compression_ext = compression_ext
-  ) %>% purrr::compact()
+# Function that validates a hubverse file name compression extension.
+# Returns NULL if empty string or compression_ext if valid.
+validate_compression_ext <- function(compression_ext, call = rlang::caller_env()) {
+  if (compression_ext == "") {
+    return(NULL)
+  }
+  if (!compression_ext %in% compress_codec) {
+    cli::cli_abort(
+      c("x" = "Compression extension {.val {compression_ext}} is not valid.
+      Must be one of {.val {compress_codec}}.
+      Please consult {.href [documentation on file name requirements
+      ](https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure)}
+      for details."),
+      call = call
+    )
+  }
+  compression_ext
 }
diff --git a/tests/testthat/_snaps/parse_file_name.md b/tests/testthat/_snaps/parse_file_name.md
@@ -88,15 +88,31 @@
       parse_file_name("hubBaseline.yml", file_type = "model_metadata")
     Condition
       Error in `parse_file_name()`:
-      ! Could not parse file name 'hubBaseline' for submission metadata. Please consult documentation for file name requirements for correct metadata parsing.
+      x File name 'hubBaseline' does not match expected pattern of [team_abbr]-[model_abbr]. Please consult documentation on file name requirements (<https://hubverse.io/en/latest/user-guide/model-metadata.html#directory-structure>) for details.
 
 # parse_file_name fails correctly
 
     Code
       parse_file_name("model-output/team1-goodmodel/2022-10-08-team1_goodmodel.csv")
     Condition
       Error in `parse_file_name()`:
-      ! Could not parse file name '2022-10-08-team1_goodmodel' for submission metadata. Please consult documentation for file name requirements for correct metadata parsing.
+      x File name '2022-10-08-team1_goodmodel' does not match expected pattern of [round_id]-[team_abbr]-[model_abbr]. Please consult documentation on file name requirements (<https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure>) for details.
+
+---
+
+    Code
+      parse_file_name("model-output/team1-goodmodel/2022-10-08-team1_goodmodel .csv")
+    Condition
+      Error in `parse_file_name()`:
+      x File name '2022-10-08-team1_goodmodel ' contains character " " that is not allowed
+
+---
+
+    Code
+      parse_file_name("model-output/team1-goodmodel/2022-10-08-team1_goodmodel*.csv")
+    Condition
+      Error in `parse_file_name()`:
+      x File name '2022-10-08-team1_goodmodel*' contains character "*" that is not allowed
 
 # parse_file_name ignores compression extensions
 
@@ -180,5 +196,5 @@
         "model-output/team1-goodmodel/2022-10-08-team1-goodmodel.gzipr.parquet")
     Condition
       Error in `parse_file_name()`:
-      ! Could not parse file name '2022-10-08-team1-goodmodel.gzipr' for submission metadata. Please consult documentation for file name requirements for correct metadata parsing.
+      x Compression extension "gzipr" is not valid. Must be one of "snappy", "gzip", "gz", "brotli", "zstd", "lz4", "lzo", and "bz2". Please consult documentation on file name requirements (<https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure>) for details.
 
diff --git a/tests/testthat/_snaps/utils.md b/tests/testthat/_snaps/utils.md
@@ -41,7 +41,7 @@
       get_file_round_id(file_path = "team1-goodmodel/2022-10-08-team-1-goodmodel.csv")
     Condition
       Error in `parse_file_name()`:
-      ! Could not parse file name '2022-10-08-team-1-goodmodel' for submission metadata. Please consult documentation for file name requirements for correct metadata parsing.
+      x File name '2022-10-08-team-1-goodmodel' does not match expected pattern of [round_id]-[team_abbr]-[model_abbr]. Please consult documentation on file name requirements (<https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure>) for details.
 
 # get_file_* utils work
 

diff --git a/tests/testthat/_snaps/validate_model_data.md b/tests/testthat/_snaps/validate_model_data.md
@@ -334,7 +334,7 @@
       validate_model_data(hub_path, file_path = "random-path.csv")
     Condition
       Error in `parse_file_name()`:
-      ! Could not parse file name 'random-path' for submission metadata. Please consult documentation for file name requirements for correct metadata parsing.
+      x File name 'random-path' does not match expected pattern of [round_id]-[team_abbr]-[model_abbr]. Please consult documentation on file name requirements (<https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure>) for details.
 
 # validate_model_data with v3 sample data works
 

diff --git a/tests/testthat/test-parse_file_name.R b/tests/testthat/test-parse_file_name.R
@@ -56,6 +56,20 @@ test_that("parse_file_name fails correctly", {
     ),
     error = TRUE
   )
+  # Correctly detects invalid characters
+  expect_snapshot(
+    parse_file_name(
+      "model-output/team1-goodmodel/2022-10-08-team1_goodmodel .csv"
+    ),
+    error = TRUE
+  )
+  expect_snapshot(
+    parse_file_name(
+      "model-output/team1-goodmodel/2022-10-08-team1_goodmodel*.csv"
+    ),
+    error = TRUE
+  )
+
   expect_error(
     parse_file_name(
       "model-output/team1-goodmodel/round_1-team1-good-model.parquet"

diff --git a/tests/testthat/test-validate_pr.R b/tests/testthat/test-validate_pr.R
@@ -190,11 +190,13 @@ test_that("validate_pr works on valid PR using v2.0.0 schema and old orgname", {
     branch = "pr-valid"
   )
 
-  checks <- validate_pr(
-    hub_path = temp_hub,
-    gh_repo = "hubverse-org/ci-testhub-simple-old-orgname",
-    pr_number = 1,
-    skip_submit_window_check = TRUE
+  checks <- suppressMessages(
+    validate_pr(
+      hub_path = temp_hub,
+      gh_repo = "hubverse-org/ci-testhub-simple-old-orgname",
+      pr_number = 1,
+      skip_submit_window_check = TRUE
+    )
   )
 
   expect_snapshot(str(checks))

diff --git a/tests/testthat/testdata/hub-now/hub-config/admin.json b/tests/testthat/testdata/hub-now/hub-config/admin.json
@@ -1,5 +1,5 @@
 {
-    "schema_version": "https://raw.githubusercontent.com/Infectious-Disease-Modeling-Hubs/schemas/main/v3.0.1/admin-schema.json",
+    "schema_version": "https://raw.githubusercontent.com/hubverse-org/schemas/main/v3.0.1/admin-schema.json",
     "name": "SARS-CoV-2 Variant Nowcast Hub",
     "maintainer": "jbloggs @ UMass-Amherst",
     "contact": {