diff --git a/DESCRIPTION b/DESCRIPTION
index e28368d..6ba1ee5 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -20,6 +20,7 @@ Imports:
     magrittr (>= 1.0.0),
     purrr,
     readr (>= 1.0.0),
+    rlang (>= 1.0.0),
     stringdist,
     tibble (>= 3.0.0),
     xml2
@@ -31,4 +32,4 @@ Config/testthat/parallel: true
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
diff --git a/NAMESPACE b/NAMESPACE
index 219181b..b42714e 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -2,6 +2,7 @@
 
 export("%>%")
 export(get_dataset)
+export(get_latest_resource)
 export(get_resource)
 export(get_resource_sql)
 importFrom(magrittr,"%>%")
diff --git a/R/get_latest_resource.R b/R/get_latest_resource.R
new file mode 100644
index 0000000..b85d277
--- /dev/null
+++ b/R/get_latest_resource.R
@@ -0,0 +1,82 @@
+#' Get the latest resource from a data set
+#'
+#' Returns the latest resource available in a dataset.
+#'
+#' There are some datasets on the open data platform that
+#' keep historic resources instead of updating existing ones.
+#' For these it is useful to be able to retrieve the latest
+#' resource. As of 1.8.2024 these data sets include:
+#' * gp-practice-populations
+#' * gp-practice-contact-details-and-list-sizes
+#' * nhsscotland-payments-to-general-practice
+#' * dental-practices-and-patient-registrations
+#' * general-practitioner-contact-details
+#' * prescribed-dispensed
+#' * dispenser-location-contact-details
+#' * community-pharmacy-contractor-activity
+#'
+#' @inheritParams get_dataset
+#' @inheritParams get_resource
+#'
+#' @return a [tibble][tibble::tibble-package] with the data
+#' @export
+#'
+#' @examples
+#' dataset_name <- "gp-practice-contact-details-and-list-sizes"
+#'
+#' data <- get_latest_resource(dataset_name)
+#'
+#' filters <- list("Postcode" = "DD11 1ES")
+#' wanted_cols <- c("PracticeCode", "Postcode", "Dispensing")
+#'
+#' filtered_data <- get_latest_resource(
+#'   dataset_name = dataset_name,
+#'   row_filters = filters,
+#'   col_select = wanted_cols
+#' )
+#'
+get_latest_resource <- function(dataset_name,
+                                rows = NULL,
+                                row_filters = NULL,
+                                col_select = NULL,
+                                include_context = TRUE) {
+  applicable_datasets <- c(
+    "community-pharmacy-contractor-activity",
+    "dental-practices-and-patient-registrations",
+    "dispenser-location-contact-details",
+    "general-practitioner-contact-details",
+    "gp-practice-contact-details-and-list-sizes",
+    "gp-practice-populations",
+    "nhsscotland-payments-to-general-practice",
+    "prescribed-dispensed"
+  )
+
+  # check if data set is within applicable datasets
+  # throw error if not
+  if (!dataset_name %in% applicable_datasets) {
+    cli::cli_abort(
+      c(
+        "The dataset name supplied {.val {dataset_name}} is not within the applicable datasets.
+      These are: {.val {applicable_datasets}}",
+        "x" = "Please see {.fun get_latest_resource} documentation.",
+        "i" = "You can find dataset names in the URL
+      of a dataset's page on {.url www.opendata.nhs.scot}."
+      ),
+      call = rlang::caller_env()
+    )
+  }
+
+
+  # get the latest resource id
+  id <- get_latest_resource_id(dataset_name)
+
+  data <- get_resource(
+    res_id = id,
+    rows = rows,
+    row_filters = row_filters,
+    col_select = col_select,
+    include_context = include_context
+  )
+
+  return(data)
+}
diff --git a/R/get_latest_resource_id.R b/R/get_latest_resource_id.R
new file mode 100644
index 0000000..4bdabbe
--- /dev/null
+++ b/R/get_latest_resource_id.R
@@ -0,0 +1,58 @@
+#' Get the ID of the latest resource in a dataset
+#'
+#' To be confident that the resource returned is the one intended,
+#' two conditions have to be met. It has to appear at the top
+#' of the resource list as shown on the open data platform.
+#' The order they are returned via the api is the same
+#' as they appear on the open data platform. It also
+#' has to have the most recent date created.
+#'
+#' There are only some datasets that this functionality
+#' is relevant to, these are listed within applicable
+#' datasets and are the datasets that keep historic
+#' resources instead of overwriting them.
+#'
+#' An error is thrown if the dataset has no resources, or if the
+#' resource at the top of the list cannot be confirmed as the most
+#' recently created one.
+#'
+#' @inheritParams get_dataset
+#'
+#' @return a string with the resource id
+get_latest_resource_id <- function(dataset_name) {
+  # send the api request
+  query <- list("id" = dataset_name)
+  content <- phs_GET("package_show", query)
+
+  resources <- content$result$resources
+
+  # without any resources there is nothing to choose from
+  if (length(resources) == 0) {
+    cli::cli_abort(
+      "No resources were found for the dataset {.val {dataset_name}}."
+    )
+  }
+
+  # parse the created date of every resource, in the order returned by the
+  # api. A missing date becomes NA rather than being dropped, so the dates
+  # always line up with the resources
+  created_dates <- purrr::map_chr(
+    resources,
+    ~ purrr::pluck(.x, "created", .default = NA_character_)
+  )
+  created_dates <- as.POSIXct(
+    strptime(created_dates, format = "%FT%X", tz = "UTC")
+  )
+
+  # the first resource is the one that appears at the top
+  # on the open data platform
+  first_resource <- resources[[1]]
+
+  # If the resource at the top as appearing on the open data platform also
+  # has the most recent date created, return it. Otherwise, error.
+  # isTRUE() sends an unparseable or missing date (NA) to the error below
+  if (isTRUE(created_dates[[1]] == max(created_dates))) {
+    return(first_resource$id)
+  }
+  cli::cli_abort("The most recent id could not be identified")
+}
diff --git a/man/get_latest_resource.Rd b/man/get_latest_resource.Rd
new file mode 100644
index 0000000..9a10e87
--- /dev/null
+++ b/man/get_latest_resource.Rd
@@ -0,0 +1,70 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_latest_resource.R
+\name{get_latest_resource}
+\alias{get_latest_resource}
+\title{Get the latest resource from a data set}
+\usage{
+get_latest_resource(
+  dataset_name,
+  rows = NULL,
+  row_filters = NULL,
+  col_select = NULL,
+  include_context = TRUE
+)
+}
+\arguments{
+\item{dataset_name}{name of the dataset as found on
+\href{https://www.opendata.nhs.scot/}{NHS Open Data platform}}
+
+\item{rows}{(optional) specify the max number of rows
+to return for each resource.}
+
+\item{row_filters}{(optional) a named list or vector that specifies values of
+columns/fields to keep.
+e.g. list(Date = 20220216, Sex = "Female").}
+
+\item{col_select}{(optional) a character vector containing the names of
+desired columns/fields.
+e.g. c("Date", "Sex").}
+
+\item{include_context}{(optional) If \code{TRUE} additional information about the
+resource will be added as columns to the data, including the resource ID, the
+resource name, the creation date and the last modified/updated date.}
+}
+\value{
+a \link[tibble:tibble-package]{tibble} with the data
+}
+\description{
+Returns the latest resource available in a dataset.
+}
+\details{
+There are some datasets on the open data platform that
+keep historic resources instead of updating existing ones.
+For these it is useful to be able to retrieve the latest
+resource. As of 1.8.2024 these data sets include:
+\itemize{
+\item gp-practice-populations
+\item gp-practice-contact-details-and-list-sizes
+\item nhsscotland-payments-to-general-practice
+\item dental-practices-and-patient-registrations
+\item general-practitioner-contact-details
+\item prescribed-dispensed
+\item dispenser-location-contact-details
+\item community-pharmacy-contractor-activity
+}
+}
+\examples{
+dataset_name <- "gp-practice-contact-details-and-list-sizes"
+
+data <- get_latest_resource(dataset_name)
+
+filters <- list("Postcode" = "DD11 1ES")
+wanted_cols <- c("PracticeCode", "Postcode", "Dispensing")
+
+filtered_data <- get_latest_resource(
+  dataset_name = dataset_name,
+  row_filters = filters,
+  col_select = wanted_cols
+)
+
+}
diff --git a/man/get_latest_resource_id.Rd b/man/get_latest_resource_id.Rd
new file mode 100644
index 0000000..d79e6ae
--- /dev/null
+++ b/man/get_latest_resource_id.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_latest_resource_id.R
+\name{get_latest_resource_id}
+\alias{get_latest_resource_id}
+\title{Get the ID of the latest resource in a dataset}
+\usage{
+get_latest_resource_id(dataset_name)
+}
+\arguments{
+\item{dataset_name}{name of the dataset as found on
+\href{https://www.opendata.nhs.scot/}{NHS Open Data platform}}
+}
+\value{
+a string with the resource id
+}
+\description{
+To be confident that the resource returned is the one intended,
+two conditions have to be met. It has to appear at the top
+of the resource list as shown on the open data platform.
+The order they are returned via the api is the same
+as they appear on the open data platform. It also
+has to have the most recent date created.
+}
+\details{
+There are only some datasets that this functionality
+is relevant to, these are listed within applicable
+datasets and are the datasets that keep historic
+resources instead of overwriting them.
+
+An error is thrown if the dataset has no resources, or if the
+resource at the top of the list cannot be confirmed as the most
+recently created one.
+}
diff --git a/tests/testthat/test-get_latest_resource.R b/tests/testthat/test-get_latest_resource.R
new file mode 100644
index 0000000..6defe90
--- /dev/null
+++ b/tests/testthat/test-get_latest_resource.R
@@ -0,0 +1,7 @@
+test_that("returns data for a dataset that is listed", {
+  expect_no_error(get_latest_resource("gp-practice-populations"))
+})
+
+test_that("returns error for a dataset that is not listed", {
+  expect_error(get_latest_resource("hospital-codes"))
+})
diff --git a/tests/testthat/test-get_latest_resource_id.R b/tests/testthat/test-get_latest_resource_id.R
new file mode 100644
index 0000000..6defe90
--- /dev/null
+++ b/tests/testthat/test-get_latest_resource_id.R
@@ -0,0 +1,6 @@
+test_that("returns a single resource id for a dataset that is listed", {
+  latest_id <- get_latest_resource_id("gp-practice-populations")
+
+  expect_type(latest_id, "character")
+  expect_length(latest_id, 1)
+})