From 05bd5048a764bbfb764c9b660359c4c3f6ee5540 Mon Sep 17 00:00:00 2001 From: James McMahon Date: Mon, 8 Jul 2024 09:52:40 +0100 Subject: [PATCH] Split `get_latest_resource` to its own branch --- NAMESPACE | 1 + R/get_latest_resource.R | 64 +++++++++++++ R/get_latest_resource_id.R | 96 ++++++++++++++++++++ man/get_latest_resource.Rd | 70 ++++++++++++++ man/get_latest_resource_id.Rd | 29 ++++++ tests/testthat/test-get_latest_resource_id.R | 7 ++ 6 files changed, 267 insertions(+) create mode 100644 R/get_latest_resource.R create mode 100644 R/get_latest_resource_id.R create mode 100644 man/get_latest_resource.Rd create mode 100644 man/get_latest_resource_id.Rd create mode 100644 tests/testthat/test-get_latest_resource_id.R diff --git a/NAMESPACE b/NAMESPACE index 219181b..b42714e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ export("%>%") export(get_dataset) +export(get_latest_resource) export(get_resource) export(get_resource_sql) importFrom(magrittr,"%>%") diff --git a/R/get_latest_resource.R b/R/get_latest_resource.R new file mode 100644 index 0000000..defbf8b --- /dev/null +++ b/R/get_latest_resource.R @@ -0,0 +1,64 @@ +#' Get the latest resource from a data set +#' +#' `get_latest_resource()` returns the most +#' recently uploaded resource to a dataset +#' +#' There are some datasets on the open data platform that +#' keep historic resources instead of updating existing ones. +#' For these it is useful to be able to retrieve the latest +#' resource.
As of 5.7.2024 these data sets include: +#' * gp-practice-populations +#' * gp-practice-contact-details-and-list-sizes +#' * nhsscotland-payments-to-general-practice +#' * dental-practices-and-patient-registrations +#' * general-practitioner-contact-details +#' * prescribed-dispensed +#' * prescriptions-in-the-community +#' * community-pharmacy-contractor-activity +#' +#' @param dataset_name name of the dataset as found on +#' \href{https://www.opendata.nhs.scot/}{NHS Open Data platform} +#' @param rows (optional) specify the max number of rows to return. +#' @param row_filters (optional) a named list or vector that specifies values of +#' columns/fields to keep. +#' e.g. list(Date = 20220216, Sex = "Female"). +#' @param col_select (optional) a character vector containing the names of +#' desired columns/fields. +#' e.g. c("Date", "Sex"). +#' @param include_context (optional) If `TRUE` additional information about the +#' resource will be added as columns to the data, including the resource ID, the +#' resource name, the creation date and the last modified/updated date. 
+#'
+#' @return a [tibble][tibble::tibble-package] with the data
+#' @export
+#'
+#' @examples
+#' dataset_name <- "gp-practice-contact-details-and-list-sizes"
+#'
+#' data <- get_latest_resource(dataset_name)
+#'
+#' filters <- list("Postcode" = "DD11 1ES")
+#' wanted_cols <- c("PracticeCode", "Postcode", "Dispensing")
+#'
+#' filtered_data <- get_latest_resource(
+#'   dataset_name = dataset_name,
+#'   row_filters = filters,
+#'   col_select = wanted_cols
+#' )
+#'
+get_latest_resource <- function(dataset_name,
+                                rows = NULL,
+                                row_filters = NULL,
+                                col_select = NULL,
+                                include_context = FALSE) {
+  # Look up the newest resource id, then fetch it. The get_resource() call is
+  # the final expression (not an assignment) so the data is returned visibly.
+  id <- get_latest_resource_id(dataset_name)
+  get_resource(
+    id,
+    rows,
+    row_filters,
+    col_select,
+    include_context
+  )
+}
diff --git a/R/get_latest_resource_id.R b/R/get_latest_resource_id.R
new file mode 100644
index 0000000..f693810
--- /dev/null
+++ b/R/get_latest_resource_id.R
@@ -0,0 +1,96 @@
+#' get_latest_resource_id
+#'
+#' to be confident that the resource returned is the one intended
+#' two conditions have to be met. It has to appear at the top of
+#' the resource list as shown on the open data platform.
+#' The order they are returned via the api is the same
+#' as they appear on the open data platform. It also
+#' has to have the most recent date created
+#'
+#' There are only some datasets that this functionality
+#' is relevant to, these are listed within applicable
+#' datasets and are the datasets that keep historic
+#' resources instead of over writing them.
+#'
+#' @inheritParams get_dataset
+#'
+#' @return a string with the resource id
+get_latest_resource_id <- function(dataset_name) {
+  # datasets that keep historic resources instead of overwriting them
+  applicable_datasets <- c(
+    "gp-practice-populations",
+    "gp-practice-contact-details-and-list-sizes",
+    "nhsscotland-payments-to-general-practice",
+    "dental-practices-and-patient-registrations",
+    "general-practitioner-contact-details",
+    "prescribed-dispensed",
+    "prescriptions-in-the-community",
+    "community-pharmacy-contractor-activity"
+  )
+
+  # throw error if name type/format is invalid
+  check_dataset_name(dataset_name)
+
+  # check the data set is within applicable datasets before any API call,
+  # so that an unsupported name fails fast without a wasted request
+  if (!dataset_name %in% applicable_datasets) {
+    cli::cli_abort(c(
+      "The dataset name supplied {.var {dataset_name}} is not within the applicable datasets.
+      These are:\n
+      {.var {applicable_datasets}}",
+      "x" = "Please see get_latest_resource documentation.",
+      "i" = "You can find dataset names in the URL
+      of a dataset's page on {.url www.opendata.nhs.scot}."
+    ))
+  }
+
+  # request the dataset metadata once; try() keeps a failed request as a
+  # "try-error" object so that it can be inspected below
+  query <- list("id" = dataset_name)
+  content <- try(
+    phs_GET("package_show", query),
+    silent = TRUE
+  )
+
+  # if content contains a 'Not Found Error'
+  # throw error with suggested dataset name
+  if (grepl("Not Found Error", content[1])) {
+    suggest_dataset_name(dataset_name)
+  }
+
+  # any other failed request: re-signal the original error instead of
+  # failing later when `content` is subset with `$`
+  if (inherits(content, "try-error")) {
+    stop(attr(content, "condition"))
+  }
+
+  # the api returns resources in the same order as the open data platform
+  resources <- content$result$resources
+  if (length(resources) == 0) {
+    cli::cli_abort("No resources were found for {.var {dataset_name}}.")
+  }
+
+  # only the id and created date are needed to pick the latest resource;
+  # last_modified is not read, so a resource without one cannot break this
+  ids <- purrr::map_chr(resources, ~ .x$id)
+  created <- as.POSIXct(
+    strptime(
+      purrr::map_chr(resources, ~ .x$created),
+      format = "%FT%X", tz = "UTC"
+    )
+  )
+
+  # the resource at the top is only trusted when it also has the most
+  # recent date created; isTRUE() sends an NA comparison to the error below
+  if (isTRUE(created[1] == max(created))) {
+    return(ids[1])
+  }
+
+  # abort rather than warn, so that a non-id value can never be returned
+  # and passed on to get_resource()
+  cli::cli_abort(c(
+    "most recent id could not be identified",
+    "x" = "The first resource listed for {.var {dataset_name}} could not be
+    confirmed as the most recently created."
+  ))
+}
diff --git a/man/get_latest_resource.Rd b/man/get_latest_resource.Rd
new file mode 100644
index 0000000..26b2202
--- /dev/null
+++ b/man/get_latest_resource.Rd
@@ -0,0 +1,70 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_latest_resource.R
+\name{get_latest_resource}
+\alias{get_latest_resource}
+\title{Get the latest resource from a data set}
+\usage{
+get_latest_resource(
+  dataset_name,
+  rows = NULL,
+  row_filters = NULL,
+  col_select = NULL,
+  include_context = FALSE
+)
+}
+\arguments{
+\item{dataset_name}{name of the dataset as found on
+\href{https://www.opendata.nhs.scot/}{NHS Open Data platform}}
+
+\item{rows}{(optional) specify the max number of rows to return.}
+
+\item{row_filters}{(optional) a named list or vector that specifies values of
+columns/fields to keep.
+e.g. list(Date = 20220216, Sex = "Female").}
+
+\item{col_select}{(optional) a character vector containing the names of
+desired columns/fields.
+e.g. c("Date", "Sex").}
+
+\item{include_context}{(optional) If \code{TRUE} additional information about the
+resource will be added as columns to the data, including the resource ID, the
+resource name, the creation date and the last modified/updated date.}
+}
+\value{
+a \link[tibble:tibble-package]{tibble} with the data
+}
+\description{
+\code{get_latest_resource()} returns the most
+recently uploaded resource to a dataset
+}
+\details{
+There are some datasets on the open data platform that
+keep historic resources instead of updating existing ones.
+For these it is useful to be able to retrieve the latest
+resource.
As of 5.7.2024 these data sets include: +\itemize{ +\item gp-practice-populations +\item gp-practice-contact-details-and-list-sizes +\item nhsscotland-payments-to-general-practice +\item dental-practices-and-patient-registrations +\item general-practitioner-contact-details +\item prescribed-dispensed +\item prescriptions-in-the-community +\item community-pharmacy-contractor-activity +} +} +\examples{ +dataset_name <- "gp-practice-contact-details-and-list-sizes" + +data <- get_latest_resource(dataset_name) + +filters <- list("Postcode" = "DD11 1ES") +wanted_cols <- c("PracticeCode", "Postcode", "Dispensing") + +filtered_data <- get_latest_resource( + dataset_name = dataset_name, + row_filters = filters, + col_select = wanted_cols +) + +} diff --git a/man/get_latest_resource_id.Rd b/man/get_latest_resource_id.Rd new file mode 100644 index 0000000..d79e6ae --- /dev/null +++ b/man/get_latest_resource_id.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_latest_resource_id.R +\name{get_latest_resource_id} +\alias{get_latest_resource_id} +\title{get_latest_resource_id} +\usage{ +get_latest_resource_id(dataset_name) +} +\arguments{ +\item{dataset_name}{name of the dataset as found on +\href{https://www.opendata.nhs.scot/}{NHS Open Data platform}} +} +\value{ +a string with the resource id +} +\description{ +to be confident that the resource returned is the one intended +two conditions have to be met. It has to appear at the top of +the resource list as shown on the open data platform. +The order they are returned via the api is the same +as they appear on the open data platform. It also +has to have the most recent date created +} +\details{ +There are only some datasets that this functionality +is relevant to, these are listed within applicable +datasets and are the datasets that keep historic +resources instead of over writing them.
+}
diff --git a/tests/testthat/test-get_latest_resource_id.R b/tests/testthat/test-get_latest_resource_id.R
new file mode 100644
index 0000000..c2e7569
--- /dev/null
+++ b/tests/testthat/test-get_latest_resource_id.R
@@ -0,0 +1,11 @@
+test_that("returns data for a dataset that is listed", {
+  # the documented return value is a single string, so check the value
+  # itself; an error raised by the call still fails this test
+  id <- get_latest_resource_id("gp-practice-populations")
+  expect_type(id, "character")
+  expect_length(id, 1)
+})
+
+test_that("returns error for a dataset that is not listed", {
+  expect_error(get_latest_resource_id("hospital-codes"))
+})