Split get_latest_resource to its own branch

Public-Health-Scotland · Jul 8, 2024 · 05bd504 · 05bd504
1 parent 223527d
commit 05bd504
Show file tree

Hide file tree

Showing 6 changed files with 267 additions and 0 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -2,6 +2,7 @@
 
 export("%>%")
 export(get_dataset)
+export(get_latest_resource)
 export(get_resource)
 export(get_resource_sql)
 importFrom(magrittr,"%>%")
diff --git a/R/get_latest_resource.R b/R/get_latest_resource.R
@@ -0,0 +1,64 @@
+#' Get the latest resource from a data set
+#'
+#' `get_latest_resource()` returns the most
+#' recently created resource in a dataset
+#'
+#' There are some datasets on the open data platform that
+#' keep historic resources instead of updating existing ones.
+#' For these it is useful to be able to retrieve the latest
+#' resource. As of 5 July 2024 these data sets include:
+#' * gp-practice-populations
+#' * gp-practice-contact-details-and-list-sizes
+#' * nhsscotland-payments-to-general-practice
+#' * dental-practices-and-patient-registrations
+#' * general-practitioner-contact-details
+#' * prescribed-dispensed
+#' * prescriptions-in-the-community
+#' * community-pharmacy-contractor-activity
+#'
+#' @param dataset_name name of the dataset as found on
+#' \href{https://www.opendata.nhs.scot/}{NHS Open Data platform}
+#' @param rows (optional) specify the max number of rows to return.
+#' @param row_filters (optional) a named list or vector that specifies values of
+#'  columns/fields to keep.
+#' e.g. list(Date = 20220216, Sex = "Female").
+#' @param col_select (optional) a character vector containing the names of
+#' desired columns/fields.
+#' e.g. c("Date", "Sex").
+#' @param include_context (optional) If `TRUE` additional information about the
+#' resource will be added as columns to the data, including the resource ID, the
+#' resource name, the creation date and the last modified/updated date.
+#'
+#' @return a [tibble][tibble::tibble-package] with the data
+#' @export
+#'
+#' @examples
+#' dataset_name <- "gp-practice-contact-details-and-list-sizes"
+#'
+#' data <- get_latest_resource(dataset_name)
+#'
+#' filters <- list("Postcode" = "DD11 1ES")
+#' wanted_cols <- c("PracticeCode", "Postcode", "Dispensing")
+#'
+#' filtered_data <- get_latest_resource(
+#'   dataset_name = dataset_name,
+#'   row_filters = filters,
+#'   col_select = wanted_cols
+#' )
+#'
+get_latest_resource <- function(dataset_name,
+                                rows = NULL,
+                                row_filters = NULL,
+                                col_select = NULL,
+                                include_context = FALSE) {
+  # Look up the newest resource, then fetch it. get_resource() is the final
+  # expression (not an assignment) so the data is returned visibly.
+  id <- get_latest_resource_id(dataset_name)
+  get_resource(
+    id,
+    rows,
+    row_filters,
+    col_select,
+    include_context
+  )
+}
diff --git a/R/get_latest_resource_id.R b/R/get_latest_resource_id.R
@@ -0,0 +1,96 @@
+#' get_latest_resource_id
+#'
+#' To be confident that the resource returned is the one intended,
+#' two conditions have to be met. It has to appear at the top
+#' of the resource list as shown on the open data platform
+#' (the API returns resources in the same order as they
+#' appear on the open data platform). It also
+#' has to have the most recent creation date.
+#'
+#' This functionality is only relevant to some datasets.
+#' These are listed in `applicable_datasets` within the
+#' function body and are the datasets that keep historic
+#' resources instead of overwriting them.
+#'
+#' @inheritParams get_dataset
+#'
+#' @return a string with the resource id
+get_latest_resource_id <- function(dataset_name) {
+  # Returns the id (a string) of the resource listed first for `dataset_name`,
+  # provided it also has the most recent creation date; otherwise aborts.
+  # Aborts too when the dataset is not applicable, cannot be fetched, or has
+  # no resources.
+  # datasets that keep historic resources instead of overwriting them
+  applicable_datasets <- c(
+    "gp-practice-populations", "gp-practice-contact-details-and-list-sizes",
+    "nhsscotland-payments-to-general-practice", "dental-practices-and-patient-registrations",
+    "general-practitioner-contact-details", "prescribed-dispensed",
+    "prescriptions-in-the-community", "community-pharmacy-contractor-activity"
+  )
+
+  # throw error if name type/format is invalid
+  check_dataset_name(dataset_name)
+
+  # check if data set is within applicable datasets before calling the API,
+  # so an unsupported name fails fast without a network request
+  if (!dataset_name %in% applicable_datasets) {
+    cli::cli_abort(c(
+      "The dataset name supplied {.var {dataset_name}} is not within the applicable datasets.
+      These are:\n
+      {.var {applicable_datasets}}",
+      "x" = "Please see get_latest_resource documentation.",
+      "i" = "You can find dataset names in the URL
+      of a dataset's page on {.url www.opendata.nhs.scot}."
+    ))
+  }
+
+  # send the API request once; try() keeps a failed request as a
+  # "try-error" object so it can be reported helpfully below
+  query <- list("id" = dataset_name)
+  content <- try(
+    phs_GET("package_show", query),
+    silent = TRUE
+  )
+
+  # if content contains a 'Not Found Error'
+  # throw error with suggested dataset name
+  if (grepl("Not Found Error", content[1])) {
+    suggest_dataset_name(dataset_name)
+  }
+
+  # any other failed request is reported here with its original message,
+  # rather than surfacing later as an obscure `$` error
+  if (inherits(content, "try-error")) {
+    failure <- conditionMessage(attr(content, "condition"))
+    cli::cli_abort(c(
+      "The API request for dataset {.val {dataset_name}} failed.",
+      "x" = "{failure}"
+    ))
+  }
+
+  # resources arrive in the same order as they appear on the open data
+  # platform, so the first one is the candidate for "latest"
+  resources <- content$result$resources
+  if (length(resources) == 0) {
+    cli::cli_abort("The dataset {.val {dataset_name}} has no resources.")
+  }
+
+  # parse each resource's creation timestamp; a missing or unparseable value
+  # becomes NA instead of silently shortening the vector. On input this
+  # format is equivalent to "%FT%X".
+  created_date <- as.POSIXct(
+    purrr::map_chr(resources, "created", .default = NA_character_),
+    format = "%Y-%m-%dT%H:%M:%S",
+    tz = "UTC"
+  )
+
+  # only trust the resource at the top of the list when it also has the most
+  # recent creation date. isTRUE() treats an NA comparison as "not confirmed".
+  # This is an error rather than a warning because the function must return
+  # a resource id, and a warning's message is not one.
+  if (!isTRUE(created_date[1] == max(created_date))) {
+    cli::cli_abort("most recent id could not be identified")
+  }
+
+  resources[[1]]$id
+}
diff --git a/man/get_latest_resource.Rd b/man/get_latest_resource.Rd
diff --git a/man/get_latest_resource_id.Rd b/man/get_latest_resource_id.Rd
diff --git a/tests/testthat/test-get_latest_resource_id.R b/tests/testthat/test-get_latest_resource_id.R
@@ -0,0 +1,7 @@
+test_that("returns a resource id for a dataset that is listed", {
+  expect_type(get_latest_resource_id("gp-practice-populations"), "character")
+})
+
+test_that("returns error for a dataset that is not listed", {
+  expect_error(get_latest_resource_id("hospital-codes"), "applicable")
+})