From 05bd5048a764bbfb764c9b660359c4c3f6ee5540 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Mon, 8 Jul 2024 09:52:40 +0100
Subject: [PATCH] Split `get_latest_resource` to its own branch

---
 NAMESPACE                                    |  1 +
 R/get_latest_resource.R                      | 64 +++++++++++++
 R/get_latest_resource_id.R                   | 96 ++++++++++++++++++++
 man/get_latest_resource.Rd                   | 70 ++++++++++++++
 man/get_latest_resource_id.Rd                | 29 ++++++
 tests/testthat/test-get_latest_resource_id.R |  7 ++
 6 files changed, 267 insertions(+)
 create mode 100644 R/get_latest_resource.R
 create mode 100644 R/get_latest_resource_id.R
 create mode 100644 man/get_latest_resource.Rd
 create mode 100644 man/get_latest_resource_id.Rd
 create mode 100644 tests/testthat/test-get_latest_resource_id.R

diff --git a/NAMESPACE b/NAMESPACE
index 219181b..b42714e 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -2,6 +2,7 @@
 
 export("%>%")
 export(get_dataset)
+export(get_latest_resource)
 export(get_resource)
 export(get_resource_sql)
 importFrom(magrittr,"%>%")
diff --git a/R/get_latest_resource.R b/R/get_latest_resource.R
new file mode 100644
index 0000000..defbf8b
--- /dev/null
+++ b/R/get_latest_resource.R
@@ -0,0 +1,64 @@
+#' Get the latest resource from a data set
+#'
+#' `get_latest_resource()` returns the most
+#' recently uploaded resource to a dataset
+#'
+#' There are some datasets on the open data platform that
+#' keep historic resources instead of updating existing ones.
+#' For these it is useful to be able to retrieve the latest
+#' resource. As of 5.7.2024 these data sets include:
+#' * gp-practice-populations
+#' * gp-practice-contact-details-and-list-sizes
+#' * nhsscotland-payments-to-general-practice
+#' * dental-practices-and-patient-registrations
+#' * general-practitioner-contact-details
+#' * prescribed-dispensed
+#' * prescriptions-in-the-community
+#' * community-pharmacy-contractor-activity
+#'
+#' @param dataset_name name of the dataset as found on
+#' \href{https://www.opendata.nhs.scot/}{NHS Open Data platform}
+#' @param rows (optional) specify the max number of rows to return.
+#' @param row_filters (optional) a named list or vector that specifies values of
+#'  columns/fields to keep.
+#' e.g. list(Date = 20220216, Sex = "Female").
+#' @param col_select (optional) a character vector containing the names of
+#' desired columns/fields.
+#' e.g. c("Date", "Sex").
+#' @param include_context (optional) If `TRUE` additional information about the
+#' resource will be added as columns to the data, including the resource ID, the
+#' resource name, the creation date and the last modified/updated date.
+#'
+#' @return a [tibble][tibble::tibble-package] with the data
+#' @export
+#'
+#' @examples
+#' dataset_name <- "gp-practice-contact-details-and-list-sizes"
+#'
+#' data <- get_latest_resource(dataset_name)
+#'
+#' filters <- list("Postcode" = "DD11 1ES")
+#' wanted_cols <- c("PracticeCode", "Postcode", "Dispensing")
+#'
+#' filtered_data <- get_latest_resource(
+#'   dataset_name = dataset_name,
+#'   row_filters = filters,
+#'   col_select = wanted_cols
+#' )
+#'
+get_latest_resource <- function(dataset_name,
+                                rows = NULL,
+                                row_filters = NULL,
+                                col_select = NULL,
+                                include_context = FALSE) {
+  # Look up the ID of the most recently created resource in the dataset;
+  # this aborts if the dataset is not one that keeps historic resources.
+  id <- get_latest_resource_id(dataset_name)
+
+  # Fetch that resource with the caller's options. The call is the final
+  # expression, not an assignment, so the result is returned visibly
+  # (an assignment as the last statement returns its value invisibly).
+  get_resource(
+    id, rows, row_filters, col_select, include_context
+  )
+}
diff --git a/R/get_latest_resource_id.R b/R/get_latest_resource_id.R
new file mode 100644
index 0000000..f693810
--- /dev/null
+++ b/R/get_latest_resource_id.R
@@ -0,0 +1,96 @@
+#' get_latest_resource_id
+#'
+#' To be confident that the resource returned is the one intended,
+#' two conditions have to be met. It has to appear at the top
+#' of the resource list as shown on the open data platform.
+#' The order they are returned via the API is the same
+#' as they appear on the open data platform. It also
+#' has to have the most recent date created.
+#'
+#' There are only some datasets that this functionality
+#' is relevant to. These are listed within `applicable_datasets`
+#' and are the datasets that keep historic
+#' resources instead of overwriting them.
+#'
+#' @inheritParams get_dataset
+#'
+#' @return a string with the resource id
+get_latest_resource_id <- function(dataset_name) {
+  # Datasets that publish a new resource for each release instead of
+  # overwriting an existing one; the lookup is only supported for these.
+  applicable_datasets <- c(
+    "gp-practice-populations", "gp-practice-contact-details-and-list-sizes",
+    "nhsscotland-payments-to-general-practice", "dental-practices-and-patient-registrations",
+    "general-practitioner-contact-details", "prescribed-dispensed",
+    "prescriptions-in-the-community", "community-pharmacy-contractor-activity"
+  )
+
+  # throw error if name type/format is invalid
+  check_dataset_name(dataset_name)
+
+  # Reject datasets outside the supported list before touching the network,
+  # so an unsupported name fails fast without an API round trip.
+  if (!dataset_name %in% applicable_datasets) {
+    cli::cli_abort(c(
+      "The dataset name supplied {.var {dataset_name}} is not within the applicable datasets.
+      These are:\n
+      {.var {applicable_datasets}}",
+      "x" = "Please see get_latest_resource documentation.",
+      "i" = "You can find dataset names in the URL
+      of a dataset's page on {.url www.opendata.nhs.scot}."
+    ))
+  }
+
+  # Request the dataset's metadata once. try() captures a failure so that a
+  # "Not Found" response can be turned into a name suggestion below.
+  query <- list("id" = dataset_name)
+  content <- try(
+    phs_GET("package_show", query),
+    silent = TRUE
+  )
+
+  # if content contains a 'Not Found Error'
+  # throw error with suggested dataset name
+  if (grepl("Not Found Error", content[1])) {
+    suggest_dataset_name(dataset_name)
+  }
+
+  # Any other captured failure is re-raised as the original condition rather
+  # than surfacing later as an unrelated subsetting error.
+  if (inherits(content, "try-error")) {
+    stop(attr(content, "condition"))
+  }
+
+  # The resource list must be non-empty for "first resource" to mean anything.
+  resources <- content$result$resources
+  if (length(resources) == 0) {
+    cli::cli_abort(
+      "The dataset {.var {dataset_name}} has no resources."
+    )
+  }
+
+  # Parse each resource's creation timestamp. Only `created` is needed:
+  # `last_modified` plays no part in the decision, so it is not read.
+  # assumes `created` looks like "2024-07-05T10:36:27.839611" (anything after
+  # the whole seconds is ignored by the format) -- TODO confirm against the API.
+  created_date <- as.POSIXct(
+    purrr::map_chr(resources, ~ .x$created),
+    format = "%Y-%m-%dT%H:%M:%S",
+    tz = "UTC"
+  )
+
+  # The API returns resources in the order shown on the open data platform,
+  # so the first one is the resource displayed at the top of the page.
+  # It is only trusted as "latest" when it also has the newest creation date.
+  # isTRUE() treats an unparseable (NA) date as "not confirmed".
+  first_is_newest <- isTRUE(created_date[1] == max(created_date))
+
+  if (!first_is_newest) {
+    # Signal an error rather than a warning: a warning would let the function
+    # fall through and hand callers a value that is not a resource ID.
+    cli::cli_abort("most recent id could not be identified")
+  }
+
+  # The first-listed resource is confirmed as the newest, so return its ID.
+  resources[[1]]$id
+}
diff --git a/man/get_latest_resource.Rd b/man/get_latest_resource.Rd
new file mode 100644
index 0000000..26b2202
--- /dev/null
+++ b/man/get_latest_resource.Rd
@@ -0,0 +1,70 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_latest_resource.R
+\name{get_latest_resource}
+\alias{get_latest_resource}
+\title{Get the latest resource from a data set}
+\usage{
+get_latest_resource(
+  dataset_name,
+  rows = NULL,
+  row_filters = NULL,
+  col_select = NULL,
+  include_context = FALSE
+)
+}
+\arguments{
+\item{dataset_name}{name of the dataset as found on
+\href{https://www.opendata.nhs.scot/}{NHS Open Data platform}}
+
+\item{rows}{(optional) specify the max number of rows to return.}
+
+\item{row_filters}{(optional) a named list or vector that specifies values of
+columns/fields to keep.
+e.g. list(Date = 20220216, Sex = "Female").}
+
+\item{col_select}{(optional) a character vector containing the names of
+desired columns/fields.
+e.g. c("Date", "Sex").}
+
+\item{include_context}{(optional) If \code{TRUE} additional information about the
+resource will be added as columns to the data, including the resource ID, the
+resource name, the creation date and the last modified/updated date.}
+}
+\value{
+a \link[tibble:tibble-package]{tibble} with the data
+}
+\description{
+\code{get_latest_resource()} returns the most
+recently uploaded resource to a dataset
+}
+\details{
+There are some datasets on the open data platform that
+keep historic resources instead of updating existing ones.
+For these it is useful to be able to retrieve the latest
+resource. As of 5.7.2024 these data sets include:
+\itemize{
+\item gp-practice-populations
+\item gp-practice-contact-details-and-list-sizes
+\item nhsscotland-payments-to-general-practice
+\item dental-practices-and-patient-registrations
+\item general-practitioner-contact-details
+\item prescribed-dispensed
+\item prescriptions-in-the-community
+\item community-pharmacy-contractor-activity
+}
+}
+\examples{
+dataset_name <- "gp-practice-contact-details-and-list-sizes"
+
+data <- get_latest_resource(dataset_name)
+
+filters <- list("Postcode" = "DD11 1ES")
+wanted_cols <- c("PracticeCode", "Postcode", "Dispensing")
+
+filtered_data <- get_latest_resource(
+  dataset_name = dataset_name,
+  row_filters = filters,
+  col_select = wanted_cols
+)
+
+}
diff --git a/man/get_latest_resource_id.Rd b/man/get_latest_resource_id.Rd
new file mode 100644
index 0000000..d79e6ae
--- /dev/null
+++ b/man/get_latest_resource_id.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_latest_resource_id.R
+\name{get_latest_resource_id}
+\alias{get_latest_resource_id}
+\title{get_latest_resource_id}
+\usage{
+get_latest_resource_id(dataset_name)
+}
+\arguments{
+\item{dataset_name}{name of the dataset as found on
+\href{https://www.opendata.nhs.scot/}{NHS Open Data platform}}
+}
+\value{
+a string with the resource id
+}
+\description{
+To be confident that the resource returned is the one intended,
+two conditions have to be met. It has to appear at the top
+of the resource list as shown on the open data platform.
+The order they are returned via the API is the same
+as they appear on the open data platform. It also
+has to have the most recent date created.
+}
+\details{
+There are only some datasets that this functionality
+is relevant to. These are listed within \code{applicable_datasets}
+and are the datasets that keep historic
+resources instead of overwriting them.
+}
diff --git a/tests/testthat/test-get_latest_resource_id.R b/tests/testthat/test-get_latest_resource_id.R
new file mode 100644
index 0000000..c2e7569
--- /dev/null
+++ b/tests/testthat/test-get_latest_resource_id.R
@@ -0,0 +1,7 @@
+test_that("returns data for a dataset that is listed", {
+  expect_type(get_latest_resource_id("gp-practice-populations"), "character")
+})
+
+test_that("returns error for a dataset that is not listed", {
+  expect_error(get_latest_resource_id("hospital-codes"), "applicable")
+})