Merge branch 'master' into issue10
ross-hull committed Jun 14, 2024
2 parents eef0c76 + 6f57c1d commit 2f3c2df
Showing 32 changed files with 491 additions and 204 deletions.
29 changes: 16 additions & 13 deletions DESCRIPTION
@@ -1,32 +1,35 @@
 Package: phsopendata
 Title: Extract Open Data from opendata.nhs.scot
 Version: 0.1.0
-Authors@R:
-    c(person(given = "Csilla", family = "Scharle", role = c("aut", "cre"), email = "[email protected]"),
-    person("James", "McMahon", email = "[email protected]", role = "aut"),
-    person("David", "Aikman", email = "[email protected]", role = "aut"))
-Description: Functions to extract and interact with data from the Scottish Health and Social Care Open Data platform.
+Authors@R: c(
+    person("Csilla", "Scharle", , "[email protected]", role = c("aut", "cre")),
+    person("James", "McMahon", , "[email protected]", role = "aut"),
+    person("David", "Aikman", , "[email protected]", role = "aut")
+  )
+Description: Functions to extract and interact with data from the Scottish
+    Health and Social Care Open Data platform.
 License: GPL (>= 2)
+URL: https://github.com/Public-Health-Scotland/phsopendata,
+    https://public-health-scotland.github.io/phsopendata/
+BugReports: https://github.com/Public-Health-Scotland/phsopendata/issues
 Imports:
+    cli,
     dplyr (>= 1.0.0),
-    httr (>= 1.0.0),
     glue (>= 1.0.0),
-    purrr,
-    tibble (>= 3.0.0),
+    httr (>= 1.0.0),
     jsonlite (>= 1.0.0),
     magrittr (>= 1.0.0),
+    purrr,
     readr (>= 1.0.0),
     stringdist,
-    cli,
+    tibble (>= 3.0.0),
     xml2
 Suggests:
     covr,
     testthat (>= 3.0.0)
+Config/testthat/edition: 3
+Config/testthat/parallel: true
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.3.1
-Config/testthat/edition: 3
-URL: https://github.com/Public-Health-Scotland/phsopendata,
-    https://public-health-scotland.github.io/phsopendata/
-BugReports: https://github.com/Public-Health-Scotland/phsopendata/issues
1 change: 1 addition & 0 deletions NAMESPACE
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
+export("%>%")
 export(get_dataset)
 export(get_resource)
 export(get_resource_sql)
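Note: the new export("%>%") entry is what roxygen2 generates when the magrittr pipe is re-exported from a dedicated helper file (the pattern usethis::use_pipe() creates), which squares with the #' @importFrom magrittr %>% tags dropped from individual files elsewhere in this diff. A sketch of that helper, assuming the standard template rather than anything shown in this commit:

    #' Pipe operator
    #'
    #' See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details.
    #'
    #' @name %>%
    #' @rdname pipe
    #' @keywords internal
    #' @export
    #' @importFrom magrittr %>%
    #' @usage lhs \%>\% rhs
    NULL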
28 changes: 28 additions & 0 deletions R/add_context.R
@@ -0,0 +1,28 @@
+add_context <- function(data, id, name, created_date, modified_date) {
+  # Catch if the resource has never been modified
+  if (is.null(modified_date)) {
+    modified_date <- NA_character_
+  }
+
+  # Parse the date values
+  created_date <- as.POSIXct(created_date, format = "%FT%X", tz = "UTC")
+  modified_date <- as.POSIXct(modified_date, format = "%FT%X", tz = "UTC")
+
+  # The platform can record the modified date as being before the created date
+  # by a few microseconds; catch that rounding so that created_date is always
+  # <= modified_date
+  if (modified_date < created_date) {
+    modified_date <- created_date
+  }
+
+  data_with_context <- dplyr::mutate(
+    data,
+    "ResID" = id,
+    "ResName" = name,
+    "ResCreatedDate" = created_date,
+    "ResModifiedDate" = modified_date,
+    .before = dplyr::everything()
+  )
+
+  return(data_with_context)
+}
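For reference, a minimal sketch of what this helper produces; the tibble, ID and dates below are invented for the example (the "%FT%X" format parses ISO 8601 timestamps such as "2024-01-01T09:00:00"):

    x <- tibble::tibble(Count = 1:2)

    add_context(
      x,
      id = "hypothetical-id",
      name = "hypothetical-resource",
      created_date = "2024-01-01T09:00:00",
      modified_date = "2024-03-01T12:30:00"
    )
    # Returns `x` with ResID, ResName, ResCreatedDate and ResModifiedDate
    # prepended as the first four columns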
55 changes: 38 additions & 17 deletions R/get_dataset.R
@@ -7,18 +7,21 @@
 #' it will return the n latest resources
 #' @param rows (optional) specify the max number of rows
 #' to return for each resource.
+#' @inheritParams get_resource
 #'
 #' @seealso [get_resource()] for downloading a single resource
 #' from a dataset.
 #'
-#' @importFrom magrittr %>%
 #' @return a [tibble][tibble::tibble-package] with the data
 #' @export
 #'
 #' @examples get_dataset("gp-practice-populations",
 #'   max_resources = 2, rows = 10
 #' )
-get_dataset <- function(dataset_name, max_resources = NULL, rows = NULL) {
+get_dataset <- function(dataset_name,
+                        max_resources = NULL,
+                        rows = NULL,
+                        include_context = FALSE) {
   # throw error if name type/format is invalid
   check_dataset_name(dataset_name)

@@ -37,23 +40,26 @@ get_dataset <- function(dataset_name, max_resources = NULL, rows = NULL) {
 
   # define list of resource IDs to get
   all_ids <- purrr::map_chr(content$result$resources, ~ .x$id)
+
   n_res <- length(all_ids)
   res_index <- 1:min(n_res, max_resources)
-  ids_selection <- all_ids[res_index]
+
+  selection_ids <- all_ids[res_index]
 
   # get all resources
   all_data <- purrr::map(
-    ids_selection,
+    selection_ids,
     get_resource,
     rows = rows
   )
 
   # resolve class issues
   types <- purrr::map(
     all_data,
-    ~ unlist(lapply(.x, class))
+    ~ purrr::map_chr(.x, class)
   )
 
+
   # for each df, check if next df class matches
   inconsistencies <- vector(length = length(types) - 1, mode = "list")
   for (i in seq_along(types)) {
@@ -73,28 +79,43 @@ get_dataset <- function(dataset_name, max_resources = NULL, rows = NULL) {
   }
 
   # define which columns to coerce and warn
-  conflicts <- unlist(inconsistencies)
-  to_coerce <- unique(names(conflicts))
+  to_coerce <- unique(names(unlist(inconsistencies)))
 
   if (length(to_coerce) > 0) {
     cli::cli_warn(c(
       "Due to conflicts between column types across resources,
       the following {cli::qty(to_coerce)} column{?s} ha{?s/ve} been coerced to type character:",
       "{.val {to_coerce}}"
     ))
-  }
 
-  # combine
-  combined <- purrr::map_df(
-    all_data,
-    ~ dplyr::mutate(
-      .x,
-      dplyr::across(
-        dplyr::any_of(to_coerce),
-        as.character
+    all_data <- purrr::map(
+      all_data,
+      ~ dplyr::mutate(
+        .x,
+        dplyr::across(
+          dplyr::any_of(to_coerce),
+          as.character
+        )
       )
     )
-  )
+  }
+
+  if (include_context) {
+    # Add the 'resource context' as columns to the data
+    all_data <- purrr::pmap(
+      list(
+        "data" = all_data,
+        "id" = selection_ids,
+        "name" = purrr::map_chr(content$result$resources[res_index], ~ .x$name),
+        "created_date" = purrr::map_chr(content$result$resources[res_index], ~ .x$created),
+        "modified_date" = purrr::map_chr(content$result$resources[res_index], ~ .x$last_modified)
+      ),
+      add_context
+    )
+  }
+
+  # Combine the list of resources into a single tibble
+  combined <- purrr::list_rbind(all_data)
 
   return(combined)
 }
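A usage sketch for the new argument, reusing the dataset name from the roxygen example above; with include_context = TRUE each resource is stamped with its ID, name and dates before the resources are row-bound:

    gp_pops <- get_dataset(
      "gp-practice-populations",
      max_resources = 2,
      rows = 10,
      include_context = TRUE
    )
    # gp_pops starts with ResID, ResName, ResCreatedDate and ResModifiedDate,
    # so rows from different resources stay distinguishable after binding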
117 changes: 76 additions & 41 deletions R/get_resource.R
@@ -3,25 +3,40 @@
 #' @param res_id The resource ID as found on
 #' \href{https://www.opendata.nhs.scot/}{NHS Open Data platform}
 #' @param rows (optional) specify the max number of rows to return.
-#' @param row_filters (optional) a named list or vector that specifies values of columns/fields to keep.
+#' @param row_filters (optional) a named list or vector that specifies values of
+#' columns/fields to keep.
 #' e.g. list(Date = 20220216, Sex = "Female").
-#' @param col_select (optional) a character vector containing the names of desired columns/fields.
+#' @param col_select (optional) a character vector containing the names of
+#' desired columns/fields.
 #' e.g. c("Date", "Sex").
+#' @param include_context (optional) If `TRUE` additional information about the
+#' resource will be added as columns to the data, including the resource ID, the
+#' resource name, the creation date and the last modified/updated date.
 #'
 #' @seealso [get_dataset()] for downloading all resources
 #' from a given dataset.
 #'
-#' @importFrom magrittr %>%
 #' @return a [tibble][tibble::tibble-package] with the data
 #' @export
 #'
 #' @examples
 #' res_id <- "ca3f8e44-9a84-43d6-819c-a880b23bd278"
 #'
 #' data <- get_resource(res_id)
 #'
 #' filters <- list("HB" = "S08000030", "Month" = "202109")
 #' wanted_cols <- c("HB", "Month", "TotalPatientsSeen")
 #'
-#' df <- get_resource(res_id = res_id, row_filters = filters, col_select = wanted_cols)
-get_resource <- function(res_id, rows = NULL, row_filters = NULL, col_select = NULL) {
+#' filtered_data <- get_resource(
+#'   res_id = res_id,
+#'   row_filters = filters,
+#'   col_select = wanted_cols
+#' )
+get_resource <- function(res_id,
+                         rows = NULL,
+                         row_filters = NULL,
+                         col_select = NULL,
+                         include_context = FALSE) {
   # check res_id
   check_res_id(res_id)

@@ -35,51 +50,71 @@ get_resource <- function(res_id, rows = NULL, row_filters = NULL, col_select = NULL) {
 
   # if dump should be used, use it
   if (use_dump_check(query, rows)) {
-    return(dump_download(res_id))
-  }
+    data <- dump_download(res_id)
+  } else {
+    # if there is no row limit set
+    # set limit to CKAN max
+    if (is.null(query$limit)) query$limit <- 99999
 
-  # if there is no row limit set
-  # set limit to CKAN max
-  if (is.null(query$limit)) query$limit <- 99999
+    # remove null values from query
+    null_q_field <- sapply(query, is.null)
+    query[null_q_field] <- NULL
 
-  # remove null values from query
-  null_q_field <- sapply(query, is.null)
-  query[null_q_field] <- NULL
+    # fetch the data
+    res_content <- phs_GET("datastore_search", query)
 
-  # fetch the data
-  res_content <- phs_GET("datastore_search", query)
-
-  # if the total number of rows is greater than the
-  # number of rows fetched
-  # AND the user was not aware of this limit (`rows` defaulted to NULL)
-  # warn the user about this limit.
-  total_rows <- res_content$result$total
-  if (is.null(rows) && query$limit < total_rows) {
-    cli::cli_warn(c(
-      "Returning the first {query$limit}
+    # if the total number of rows is greater than the
+    # number of rows fetched
+    # AND the user was not aware of this limit (`rows` defaulted to NULL)
+    # warn the user about this limit.
+    total_rows <- res_content$result$total
+    if (is.null(rows) && query$limit < total_rows) {
+      cli::cli_warn(c(
+        "Returning the first {query$limit}
       results (rows) of your query.
       {total_rows} rows match your query in total.",
-      i = "To get ALL matching rows you will need to download
+        i = "To get ALL matching rows you will need to download
       the whole resource and apply filters/selections locally."
-    ))
-  }
+      ))
+    }
 
-  # if more rows were requested than received
-  # let the user know
-  if (!is.null(rows) && query$limit > total_rows) {
-    cli::cli_alert_warning(c(
-      "You set {.var rows} to {query$limit} but
-      only {total_rows} rows matched your query."
-    ))
-  }
+    # if more rows were requested than received
+    # let the user know
+    if (!is.null(rows) && query$limit > total_rows) {
+      cli::cli_warn(
+        "You set {.var rows} to {.val {query$limit}} but only {.val {total_rows}} rows matched your query."
+      )
+    }
 
-  # extract data from response content
-  data <- purrr::map_dfr(
-    res_content$result$records, ~.x
-  ) %>% dplyr::select(
-    -dplyr::starts_with("rank "),
-    -dplyr::matches("_id")
-  )
+    # extract data from response content
+    data <- purrr::map_dfr(
+      res_content$result$records, ~.x
+    ) %>% dplyr::select(
+      -dplyr::starts_with("rank "),
+      -dplyr::matches("_id")
+    )
+  }
 
+  if (include_context) {
+    # Get resource context if required
+    context_content <- phs_GET(
+      action = "resource_show",
+      query = paste0("id=", res_id)
+    )
+
+    res_id <- context_content$result$id
+    res_name <- context_content$result$name
+    res_created_date <- context_content$result$created
+    res_modified_date <- context_content$result$last_modified
+
+    data <- data %>%
+      add_context(
+        id = res_id,
+        name = res_name,
+        created_date = res_created_date,
+        modified_date = res_modified_date
+      )
+  }
 
   return(data)
 }
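Usage sketch, reusing the res_id from the roxygen example above; note the flag costs one extra resource_show API call per invocation:

    borders_data <- get_resource(
      res_id = "ca3f8e44-9a84-43d6-819c-a880b23bd278",
      rows = 5,
      include_context = TRUE
    )
    # ResID, ResName, ResCreatedDate and ResModifiedDate are prepended
    # by add_context(), exactly as in get_dataset(include_context = TRUE)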
1 change: 0 additions & 1 deletion R/get_resource_sql.R
@@ -27,7 +27,6 @@
 #'
 #' @seealso [get_resource()] for downloading a resource without using a SQL query.
 #'
-#' @importFrom magrittr %>%
 #' @return a [tibble][tibble::tibble-package] with the query results.
 #' Only 32,000 rows can be returned from a single SQL query.
 #' @export
11 changes: 9 additions & 2 deletions R/request_url.R
@@ -6,11 +6,18 @@
 #'
 request_url <- function(action, query) {
   # check action is valid
-  valid_actions <- c("datastore_search", "datastore_search_sql", "dump", "package_show", "package_list")
+  valid_actions <- c(
+    "datastore_search",
+    "datastore_search_sql",
+    "dump",
+    "package_show",
+    "package_list",
+    "resource_show"
+  )
   if (!(action %in% valid_actions)) {
     cli::cli_abort(c(
       "API call failed.",
-      x = "Invalid {.var action} argument in request."
+      x = "{.val {action}} is an invalid {.arg action} argument."
     ))
   }

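resource_show is a standard CKAN API action, so the context lookup added in get_resource() should resolve to a request of roughly this shape (URL pattern assumed from CKAN conventions, not shown in this diff):

    # GET https://www.opendata.nhs.scot/api/3/action/resource_show?id=<res_id>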
6 changes: 3 additions & 3 deletions R/suggest_dataset_name.R
@@ -17,16 +17,16 @@ suggest_dataset_name <- function(dataset_name) {
       "Can't find the dataset name
       {.var {dataset_name}}, or a close match.",
       i = "Find a dataset's name in the URL
-      of it's page on {.url www.opendata.nhs.scot.}"
+      of its page on {.url www.opendata.nhs.scot.}"
     ))
   }
 
   # find closest match
-  closest_match <- dataset_names[which.min(string_distances)]
+  closest_match <- dataset_names[which(string_distances == min(string_distances))]
 
   # throw error with suggestion
   cli::cli_abort(c(
     "Can't find the dataset name {.var {dataset_name}}.",
-    "i" = "Did you mean '{closest_match}'?"
+    "i" = "Did you mean {?any of }{.val {closest_match}}?"
   ))
 }
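The switch from which.min() to which(... == min(...)) matters when several dataset names are equally close: which.min() returns only the first index, whereas the equality comparison returns every tied index, which is what the new {?any of } pluralisation caters for. A standalone sketch with invented distances:

    string_distances <- c(3, 1, 4, 1)
    which.min(string_distances)                       # 2 (first tie only)
    which(string_distances == min(string_distances))  # 2 4 (all ties)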