Merge branch 'master' into issue10
ross-hull committed Jun 14, 2024
2 parents eef0c76 + 6f57c1d commit 2f3c2df
Showing 32 changed files with 491 additions and 204 deletions.
29 changes: 16 additions & 13 deletions DESCRIPTION
@@ -1,32 +1,35 @@
 Package: phsopendata
 Title: Extract Open Data from opendata.nhs.scot
 Version: 0.1.0
-Authors@R:
-    c(person(given = "Csilla", family = "Scharle", role = c("aut", "cre"), email = "[email protected]"),
-    person("James", "McMahon", email = "[email protected]", role = "aut"),
-    person("David", "Aikman", email = "[email protected]", role = "aut"))
-Description: Functions to extract and interact with data from the Scottish Health and Social Care Open Data platform.
+Authors@R: c(
+    person("Csilla", "Scharle", , "[email protected]", role = c("aut", "cre")),
+    person("James", "McMahon", , "[email protected]", role = "aut"),
+    person("David", "Aikman", , "[email protected]", role = "aut")
+  )
+Description: Functions to extract and interact with data from the Scottish
+    Health and Social Care Open Data platform.
 License: GPL (>= 2)
+URL: https://github.com/Public-Health-Scotland/phsopendata,
+    https://public-health-scotland.github.io/phsopendata/
+BugReports: https://github.com/Public-Health-Scotland/phsopendata/issues
 Imports:
+    cli,
     dplyr (>= 1.0.0),
-    httr (>= 1.0.0),
     glue (>= 1.0.0),
-    purrr,
-    tibble (>= 3.0.0),
+    httr (>= 1.0.0),
     jsonlite (>= 1.0.0),
     magrittr (>= 1.0.0),
+    purrr,
     readr (>= 1.0.0),
     stringdist,
-    cli,
+    tibble (>= 3.0.0),
     xml2
 Suggests:
     covr,
     testthat (>= 3.0.0)
+Config/testthat/edition: 3
+Config/testthat/parallel: true
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.3.1
-Config/testthat/edition: 3
-URL: https://github.com/Public-Health-Scotland/phsopendata,
-    https://public-health-scotland.github.io/phsopendata/
-BugReports: https://github.com/Public-Health-Scotland/phsopendata/issues
1 change: 1 addition & 0 deletions NAMESPACE
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
+export("%>%")
 export(get_dataset)
 export(get_resource)
 export(get_resource_sql)
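Note: the new export("%>%") entry is what roxygen2 generates when the magrittr pipe is re-exported from a dedicated helper file (the pattern usethis::use_pipe() creates), which squares with the #' @importFrom magrittr %>% tags dropped from individual files elsewhere in this diff. A sketch of that helper, assuming the standard template rather than anything shown in this commit:

    #' Pipe operator
    #'
    #' See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details.
    #'
    #' @name %>%
    #' @rdname pipe
    #' @keywords internal
    #' @export
    #' @importFrom magrittr %>%
    #' @usage lhs \%>\% rhs
    NULL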
28 changes: 28 additions & 0 deletions R/add_context.R
@@ -0,0 +1,28 @@
+add_context <- function(data, id, name, created_date, modified_date) {
+  # Catch if the resource has never been modified
+  if (is.null(modified_date)) {
+    modified_date <- NA_character_
+  }
+
+  # Parse the date values
+  created_date <- as.POSIXct(created_date, format = "%FT%X", tz = "UTC")
+  modified_date <- as.POSIXct(modified_date, format = "%FT%X", tz = "UTC")
+
+  # The platform can record the modified date as being before the created date
+  # by a few microseconds; catch that rounding so that created_date is always
+  # <= modified_date
+  if (modified_date < created_date) {
+    modified_date <- created_date
+  }
+
+  data_with_context <- dplyr::mutate(
+    data,
+    "ResID" = id,
+    "ResName" = name,
+    "ResCreatedDate" = created_date,
+    "ResModifiedDate" = modified_date,
+    .before = dplyr::everything()
+  )
+
+  return(data_with_context)
+}
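For reference, a minimal sketch of what this helper produces; the tibble, ID and dates below are invented for the example (the "%FT%X" format parses ISO 8601 timestamps such as "2024-01-01T09:00:00"):

    x <- tibble::tibble(Count = 1:2)

    add_context(
      x,
      id = "hypothetical-id",
      name = "hypothetical-resource",
      created_date = "2024-01-01T09:00:00",
      modified_date = "2024-03-01T12:30:00"
    )
    # Returns `x` with ResID, ResName, ResCreatedDate and ResModifiedDate
    # prepended as the first four columns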
55 changes: 38 additions & 17 deletions R/get_dataset.R
@@ -7,18 +7,21 @@
 #' it will return the n latest resources
 #' @param rows (optional) specify the max number of rows
 #' to return for each resource.
+#' @inheritParams get_resource
 #'
 #' @seealso [get_resource()] for downloading a single resource
 #' from a dataset.
 #'
-#' @importFrom magrittr %>%
 #' @return a [tibble][tibble::tibble-package] with the data
 #' @export
 #'
 #' @examples get_dataset("gp-practice-populations",
 #'   max_resources = 2, rows = 10
 #' )
-get_dataset <- function(dataset_name, max_resources = NULL, rows = NULL) {
+get_dataset <- function(dataset_name,
+                        max_resources = NULL,
+                        rows = NULL,
+                        include_context = FALSE) {
   # throw error if name type/format is invalid
   check_dataset_name(dataset_name)

@@ -37,23 +40,26 @@ get_dataset <- function(dataset_name, max_resources = NULL, rows = NULL) {
 
   # define list of resource IDs to get
   all_ids <- purrr::map_chr(content$result$resources, ~ .x$id)
+
   n_res <- length(all_ids)
   res_index <- 1:min(n_res, max_resources)
-  ids_selection <- all_ids[res_index]
+
+  selection_ids <- all_ids[res_index]
 
   # get all resources
   all_data <- purrr::map(
-    ids_selection,
+    selection_ids,
     get_resource,
     rows = rows
   )
 
   # resolve class issues
   types <- purrr::map(
     all_data,
-    ~ unlist(lapply(.x, class))
+    ~ purrr::map_chr(.x, class)
   )
 
+
   # for each df, check if next df class matches
   inconsistencies <- vector(length = length(types) - 1, mode = "list")
   for (i in seq_along(types)) {
@@ -73,28 +79,43 @@ get_dataset <- function(dataset_name, max_resources = NULL, rows = NULL) {
   }
 
   # define which columns to coerce and warn
-  conflicts <- unlist(inconsistencies)
-  to_coerce <- unique(names(conflicts))
+  to_coerce <- unique(names(unlist(inconsistencies)))
 
   if (length(to_coerce) > 0) {
     cli::cli_warn(c(
       "Due to conflicts between column types across resources,
       the following {cli::qty(to_coerce)} column{?s} ha{?s/ve} been coerced to type character:",
       "{.val {to_coerce}}"
     ))
-  }
 
-  # combine
-  combined <- purrr::map_df(
-    all_data,
-    ~ dplyr::mutate(
-      .x,
-      dplyr::across(
-        dplyr::any_of(to_coerce),
-        as.character
+    all_data <- purrr::map(
+      all_data,
+      ~ dplyr::mutate(
+        .x,
+        dplyr::across(
+          dplyr::any_of(to_coerce),
+          as.character
+        )
       )
     )
-  )
+  }
+
+  if (include_context) {
+    # Add the 'resource context' as columns to the data
+    all_data <- purrr::pmap(
+      list(
+        "data" = all_data,
+        "id" = selection_ids,
+        "name" = purrr::map_chr(content$result$resources[res_index], ~ .x$name),
+        "created_date" = purrr::map_chr(content$result$resources[res_index], ~ .x$created),
+        "modified_date" = purrr::map_chr(content$result$resources[res_index], ~ .x$last_modified)
+      ),
+      add_context
+    )
+  }
+
+  # Combine the list of resources into a single tibble
+  combined <- purrr::list_rbind(all_data)
 
   return(combined)
 }
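A usage sketch for the new argument, reusing the dataset name from the roxygen example above; with include_context = TRUE each resource is stamped with its ID, name and dates before the resources are row-bound:

    gp_pops <- get_dataset(
      "gp-practice-populations",
      max_resources = 2,
      rows = 10,
      include_context = TRUE
    )
    # gp_pops starts with ResID, ResName, ResCreatedDate and ResModifiedDate,
    # so rows from different resources stay distinguishable after binding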
117 changes: 76 additions & 41 deletions R/get_resource.R
@@ -3,25 +3,40 @@
 #' @param res_id The resource ID as found on
 #' \href{https://www.opendata.nhs.scot/}{NHS Open Data platform}
 #' @param rows (optional) specify the max number of rows to return.
-#' @param row_filters (optional) a named list or vector that specifies values of columns/fields to keep.
+#' @param row_filters (optional) a named list or vector that specifies values of
+#' columns/fields to keep.
 #' e.g. list(Date = 20220216, Sex = "Female").
-#' @param col_select (optional) a character vector containing the names of desired columns/fields.
+#' @param col_select (optional) a character vector containing the names of
+#' desired columns/fields.
 #' e.g. c("Date", "Sex").
+#' @param include_context (optional) If `TRUE` additional information about the
+#' resource will be added as columns to the data, including the resource ID, the
+#' resource name, the creation date and the last modified/updated date.
 #'
 #' @seealso [get_dataset()] for downloading all resources
 #' from a given dataset.
 #'
-#' @importFrom magrittr %>%
 #' @return a [tibble][tibble::tibble-package] with the data
 #' @export
 #'
 #' @examples
 #' res_id <- "ca3f8e44-9a84-43d6-819c-a880b23bd278"
 #'
 #' data <- get_resource(res_id)
 #'
 #' filters <- list("HB" = "S08000030", "Month" = "202109")
 #' wanted_cols <- c("HB", "Month", "TotalPatientsSeen")
 #'
-#' df <- get_resource(res_id = res_id, row_filters = filters, col_select = wanted_cols)
-get_resource <- function(res_id, rows = NULL, row_filters = NULL, col_select = NULL) {
+#' filtered_data <- get_resource(
+#'   res_id = res_id,
+#'   row_filters = filters,
+#'   col_select = wanted_cols
+#' )
+get_resource <- function(res_id,
+                         rows = NULL,
+                         row_filters = NULL,
+                         col_select = NULL,
+                         include_context = FALSE) {
   # check res_id
   check_res_id(res_id)

@@ -35,51 +50,71 @@ get_resource <- function(res_id, rows = NULL, row_filters = NULL, col_select = NULL) {
 
   # if dump should be used, use it
   if (use_dump_check(query, rows)) {
-    return(dump_download(res_id))
-  }
+    data <- dump_download(res_id)
+  } else {
+    # if there is no row limit set
+    # set limit to CKAN max
+    if (is.null(query$limit)) query$limit <- 99999
 
-  # if there is no row limit set
-  # set limit to CKAN max
-  if (is.null(query$limit)) query$limit <- 99999
+    # remove null values from query
+    null_q_field <- sapply(query, is.null)
+    query[null_q_field] <- NULL
 
-  # remove null values from query
-  null_q_field <- sapply(query, is.null)
-  query[null_q_field] <- NULL
+    # fetch the data
+    res_content <- phs_GET("datastore_search", query)
 
-  # fetch the data
-  res_content <- phs_GET("datastore_search", query)
-
-  # if the total number of rows is greater than the
-  # number of rows fetched
-  # AND the user was not aware of this limit (`rows` defaulted to NULL)
-  # warn the user about this limit.
-  total_rows <- res_content$result$total
-  if (is.null(rows) && query$limit < total_rows) {
-    cli::cli_warn(c(
-      "Returning the first {query$limit}
+    # if the total number of rows is greater than the
+    # number of rows fetched
+    # AND the user was not aware of this limit (`rows` defaulted to NULL)
+    # warn the user about this limit.
+    total_rows <- res_content$result$total
+    if (is.null(rows) && query$limit < total_rows) {
+      cli::cli_warn(c(
+        "Returning the first {query$limit}
       results (rows) of your query.
       {total_rows} rows match your query in total.",
-      i = "To get ALL matching rows you will need to download
+        i = "To get ALL matching rows you will need to download
       the whole resource and apply filters/selections locally."
-    ))
-  }
+      ))
+    }
 
-  # if more rows were requested than received
-  # let the user know
-  if (!is.null(rows) && query$limit > total_rows) {
-    cli::cli_alert_warning(c(
-      "You set {.var rows} to {query$limit} but
-      only {total_rows} rows matched your query."
-    ))
-  }
+    # if more rows were requested than received
+    # let the user know
+    if (!is.null(rows) && query$limit > total_rows) {
+      cli::cli_warn(
+        "You set {.var rows} to {.val {query$limit}} but only {.val {total_rows}} rows matched your query."
+      )
+    }
 
-  # extract data from response content
-  data <- purrr::map_dfr(
-    res_content$result$records, ~.x
-  ) %>% dplyr::select(
-    -dplyr::starts_with("rank "),
-    -dplyr::matches("_id")
-  )
+    # extract data from response content
+    data <- purrr::map_dfr(
+      res_content$result$records, ~.x
+    ) %>% dplyr::select(
+      -dplyr::starts_with("rank "),
+      -dplyr::matches("_id")
+    )
+  }
 
+  if (include_context) {
+    # Get resource context if required
+    context_content <- phs_GET(
+      action = "resource_show",
+      query = paste0("id=", res_id)
+    )
+
+    res_id <- context_content$result$id
+    res_name <- context_content$result$name
+    res_created_date <- context_content$result$created
+    res_modified_date <- context_content$result$last_modified
+
+    data <- data %>%
+      add_context(
+        id = res_id,
+        name = res_name,
+        created_date = res_created_date,
+        modified_date = res_modified_date
+      )
+  }
 
   return(data)
 }
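Usage sketch, reusing the res_id from the roxygen example above; note the flag costs one extra resource_show API call per invocation:

    borders_data <- get_resource(
      res_id = "ca3f8e44-9a84-43d6-819c-a880b23bd278",
      rows = 5,
      include_context = TRUE
    )
    # ResID, ResName, ResCreatedDate and ResModifiedDate are prepended
    # by add_context(), exactly as in get_dataset(include_context = TRUE)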
1 change: 0 additions & 1 deletion R/get_resource_sql.R
@@ -27,7 +27,6 @@
 #'
 #' @seealso [get_resource()] for downloading a resource without using a SQL query.
 #'
-#' @importFrom magrittr %>%
 #' @return a [tibble][tibble::tibble-package] with the query results.
 #' Only 32,000 rows can be returned from a single SQL query.
 #' @export
11 changes: 9 additions & 2 deletions R/request_url.R
@@ -6,11 +6,18 @@
 #'
 request_url <- function(action, query) {
   # check action is valid
-  valid_actions <- c("datastore_search", "datastore_search_sql", "dump", "package_show", "package_list")
+  valid_actions <- c(
+    "datastore_search",
+    "datastore_search_sql",
+    "dump",
+    "package_show",
+    "package_list",
+    "resource_show"
+  )
   if (!(action %in% valid_actions)) {
     cli::cli_abort(c(
       "API call failed.",
-      x = "Invalid {.var action} argument in request."
+      x = "{.val {action}} is an invalid {.arg action} argument."
     ))
   }

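resource_show is a standard CKAN API action, so the context lookup added in get_resource() should resolve to a request of roughly this shape (URL pattern assumed from CKAN conventions, not shown in this diff):

    # GET https://www.opendata.nhs.scot/api/3/action/resource_show?id=<res_id>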
6 changes: 3 additions & 3 deletions R/suggest_dataset_name.R
@@ -17,16 +17,16 @@ suggest_dataset_name <- function(dataset_name) {
       "Can't find the dataset name
       {.var {dataset_name}}, or a close match.",
       i = "Find a dataset's name in the URL
-      of it's page on {.url www.opendata.nhs.scot.}"
+      of its page on {.url www.opendata.nhs.scot.}"
     ))
   }
 
   # find closest match
-  closest_match <- dataset_names[which.min(string_distances)]
+  closest_match <- dataset_names[which(string_distances == min(string_distances))]
 
   # throw error with suggestion
   cli::cli_abort(c(
     "Can't find the dataset name {.var {dataset_name}}.",
-    "i" = "Did you mean '{closest_match}'?"
+    "i" = "Did you mean {?any of }{.val {closest_match}}?"
   ))
 }
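The switch from which.min() to which(... == min(...)) matters when several dataset names are equally close: which.min() returns only the first index, whereas the equality comparison returns every tied index, which is what the new {?any of } pluralisation caters for. A standalone sketch with invented distances:

    string_distances <- c(3, 1, 4, 1)
    which.min(string_distances)                       # 2 (first tie only)
    which(string_distances == min(string_distances))  # 2 4 (all ties)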