Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add context (res id and name) when downloading data #24

Merged
merged 19 commits into from
Jun 13, 2024
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 16 additions & 13 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,32 +1,35 @@
Package: phsopendata
Title: Extract Open Data from opendata.nhs.scot
Version: 0.1.0
Authors@R:
c(person(given = "Csilla", family = "Scharle", role = c("aut", "cre"), email = "[email protected]"),
person("James", "McMahon", email = "[email protected]", role = "aut"),
person("David", "Aikman", email = "[email protected]", role = "aut"))
Description: Functions to extract and interact with data from the Scottish Health and Social Care Open Data platform.
Authors@R: c(
person("Csilla", "Scharle", , "[email protected]", role = c("aut", "cre")),
person("James", "McMahon", , "[email protected]", role = "aut"),
person("David", "Aikman", , "[email protected]", role = "aut")
)
Description: Functions to extract and interact with data from the Scottish
Health and Social Care Open Data platform.
License: GPL (>= 2)
URL: https://github.com/Public-Health-Scotland/phsopendata,
https://public-health-scotland.github.io/phsopendata/
BugReports: https://github.com/Public-Health-Scotland/phsopendata/issues
Imports:
cli,
dplyr (>= 1.0.0),
httr (>= 1.0.0),
glue (>= 1.0.0),
purrr,
tibble (>= 3.0.0),
httr (>= 1.0.0),
jsonlite (>= 1.0.0),
magrittr (>= 1.0.0),
purrr,
readr (>= 1.0.0),
stringdist,
cli,
tibble (>= 3.0.0),
xml2
Suggests:
covr,
testthat (>= 3.0.0)
Config/testthat/edition: 3
Config/testthat/parallel: true
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.1
Config/testthat/edition: 3
URL: https://github.com/Public-Health-Scotland/phsopendata,
https://public-health-scotland.github.io/phsopendata/
BugReports: https://github.com/Public-Health-Scotland/phsopendata/issues
28 changes: 28 additions & 0 deletions R/add_context.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
add_context <- function(data, id, name, created_date, modified_date) {
# Catch if the resource has never been modified
if (is.null(modified_date)) {
modified_date <- NA_character_
}

# Parse the date values
created_date <- as.POSIXct(created_date, format = "%FT%X", tz = "UTC")
modified_date <- as.POSIXct(modified_date, format = "%FT%X", tz = "UTC")

# The platform can record the modified date as being before the created date
# by a few microseconds, this will catch any rounding which ensure
# created_date is always <= modified_date
if (modified_date < created_date) {
modified_date <- created_date
}

data_with_context <- dplyr::mutate(
data,
"res_id" = id,
"res_name" = name,
"res_created_date" = created_date,
"res_modified_date" = modified_date,
.before = dplyr::everything()
)

return(data_with_context)
}
55 changes: 38 additions & 17 deletions R/get_dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,21 @@
#' it will return the n latest resources
#' @param rows (optional) specify the max number of rows
#' to return for each resource.
#' @inheritParams get_resource
#'
#' @seealso [get_resource()] for downloading a single resource
#' from a dataset.
#'
#' @importFrom magrittr %>%
#' @return a [tibble][tibble::tibble-package] with the data
#' @export
#'
#' @examples get_dataset("gp-practice-populations",
#' max_resources = 2, rows = 10
#' )
get_dataset <- function(dataset_name, max_resources = NULL, rows = NULL) {
get_dataset <- function(dataset_name,
max_resources = NULL,
rows = NULL,
include_context = FALSE) {
# throw error if name type/format is invalid
check_dataset_name(dataset_name)

Expand All @@ -37,23 +40,26 @@

# define list of resource IDs to get
all_ids <- purrr::map_chr(content$result$resources, ~ .x$id)

n_res <- length(all_ids)
res_index <- 1:min(n_res, max_resources)
ids_selection <- all_ids[res_index]

selection_ids <- all_ids[res_index]

# get all resources
all_data <- purrr::map(
ids_selection,
selection_ids,
get_resource,
rows = rows
)

# resolve class issues
types <- purrr::map(
all_data,
~ unlist(lapply(.x, class))
~ purrr::map_chr(.x, class)
)


# for each df, check if next df class matches
inconsistencies <- vector(length = length(types) - 1, mode = "list")
for (i in seq_along(types)) {
Expand All @@ -68,33 +74,48 @@
)

# of matching name cols, find if types match too
inconsistent_index <- this_types[matching_names] != next_types[matching_names]

Check warning on line 77 in R/get_dataset.R

View workflow job for this annotation

GitHub Actions / lint

file=R/get_dataset.R,line=77,col=81,[line_length_linter] Lines should not be more than 80 characters. This line is 82 characters.
inconsistencies[[i]] <- this_types[matching_names][inconsistent_index]
}

# define which columns to coerce and warn
conflicts <- unlist(inconsistencies)
to_coerce <- unique(names(conflicts))
to_coerce <- unique(names(unlist(inconsistencies)))

if (length(to_coerce) > 0) {
cli::cli_warn(c(
"Due to conflicts between column types across resources,
the following {cli::qty(to_coerce)} column{?s} ha{?s/ve} been coerced to type character:",

Check warning on line 87 in R/get_dataset.R

View workflow job for this annotation

GitHub Actions / lint

file=R/get_dataset.R,line=87,col=81,[line_length_linter] Lines should not be more than 80 characters. This line is 96 characters.
"{.val {to_coerce}}"
))
}

# combine
combined <- purrr::map_df(
all_data,
~ dplyr::mutate(
.x,
dplyr::across(
dplyr::any_of(to_coerce),
as.character
all_data <- purrr::map(
all_data,
~ dplyr::mutate(
.x,
dplyr::across(
dplyr::any_of(to_coerce),
as.character
)
)
)
)
}

if (include_context) {
# Add the 'resource context' as columns to the data
all_data <- purrr::pmap(
list(
"data" = all_data,
"id" = selection_ids,
"name" = purrr::map_chr(content$result$resources[res_index], ~ .x$name),
"created_date" = purrr::map_chr(content$result$resources[res_index], ~ .x$created),

Check warning on line 110 in R/get_dataset.R

View workflow job for this annotation

GitHub Actions / lint

file=R/get_dataset.R,line=110,col=81,[line_length_linter] Lines should not be more than 80 characters. This line is 91 characters.
"modified_date" = purrr::map_chr(content$result$resources[res_index], ~ .x$last_modified)

Check warning on line 111 in R/get_dataset.R

View workflow job for this annotation

GitHub Actions / lint

file=R/get_dataset.R,line=111,col=81,[line_length_linter] Lines should not be more than 80 characters. This line is 97 characters.
),
add_context
)
}

# Combine the list of resources into a single tibble
combined <- purrr::list_rbind(all_data)

return(combined)
}
116 changes: 76 additions & 40 deletions R/get_resource.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,15 @@
#' @param res_id The resource ID as found on
#' \href{https://www.opendata.nhs.scot/}{NHS Open Data platform}
#' @param rows (optional) specify the max number of rows to return.
#' @param row_filters (optional) a named list or vector that specifies values of columns/fields to keep.
#' @param row_filters (optional) a named list or vector that specifies values of
#' columns/fields to keep.
#' e.g. list(Date = 20220216, Sex = "Female").
#' @param col_select (optional) a character vector containing the names of desired columns/fields.
#' @param col_select (optional) a character vector containing the names of
#' desired columns/fields.
#' e.g. c("Date", "Sex").
#' @param include_context (optional) If `TRUE` additional information about the
#' resource will be added as columns to the data, including the resource ID, the
#' resource name, the creation date and the last modified/updated date.
#'
#' @seealso [get_dataset()] for downloading all resources
#' from a given dataset.
Expand All @@ -17,11 +22,22 @@
#'
#' @examples
#' res_id <- "ca3f8e44-9a84-43d6-819c-a880b23bd278"
#'
#' data <- get_resource(res_id)
#'
#' filters <- list("HB" = "S08000030", "Month" = "202109")
#' wanted_cols <- c("HB", "Month", "TotalPatientsSeen")
#'
#' df <- get_resource(res_id = res_id, row_filters = filters, col_select = wanted_cols)
get_resource <- function(res_id, rows = NULL, row_filters = NULL, col_select = NULL) {
#' filtered_data <- get_resource(
#' res_id = res_id,
#' row_filters = filters,
#' col_select = wanted_cols
#' )
get_resource <- function(res_id,
rows = NULL,
row_filters = NULL,
col_select = NULL,
include_context = FALSE) {
# check res_id
check_res_id(res_id)

Expand All @@ -35,51 +51,71 @@

# if dump should be used, use it
if (use_dump_check(query, rows)) {
return(dump_download(res_id))
}
data <- dump_download(res_id)
} else {
# if there is no row limit set
# set limit to CKAN max
if (is.null(query$limit)) query$limit <- 99999

# if there is no row limit set
# set limit to CKAN max
if (is.null(query$limit)) query$limit <- 99999
# remove null values from query
null_q_field <- sapply(query, is.null)
query[null_q_field] <- NULL

# remove null values from query
null_q_field <- sapply(query, is.null)
query[null_q_field] <- NULL
# fetch the data
res_content <- phs_GET("datastore_search", query)

# fetch the data
res_content <- phs_GET("datastore_search", query)

# if the total number of rows is greater than the
# number of rows fetched
# AND the user was not aware of this limit (`rows` defaulted to NULL)
# warn the user about this limit.
total_rows <- res_content$result$total
if (is.null(rows) && query$limit < total_rows) {
cli::cli_warn(c(
"Returning the first {query$limit}
# if the total number of rows is greater than the
# number of rows fetched
# AND the user was not aware of this limit (`rows` defaulted to NULL)
# warn the user about this limit.
total_rows <- res_content$result$total
if (is.null(rows) && query$limit < total_rows) {
cli::cli_warn(c(
"Returning the first {query$limit}
results (rows) of your query.
{total_rows} rows match your query in total.",
i = "To get ALL matching rows you will need to download
i = "To get ALL matching rows you will need to download
the whole resource and apply filters/selections locally."
))
}
))
}

# if more rows were requested than received
# let the user know
if (!is.null(rows) && query$limit > total_rows) {
cli::cli_alert_warning(c(
"You set {.var rows} to {query$limit} but
only {total_rows} rows matched your query."
))
# if more rows were requested than received
# let the user know
if (!is.null(rows) && query$limit > total_rows) {
cli::cli_warn(
"You set {.var rows} to {.val {query$limit}} but only {.val {total_rows}} rows matched your query."

Check warning on line 86 in R/get_resource.R

View workflow job for this annotation

GitHub Actions / lint

file=R/get_resource.R,line=86,col=81,[line_length_linter] Lines should not be more than 80 characters. This line is 107 characters.
)
}

# extract data from response content
data <- purrr::map_dfr(
res_content$result$records, ~.x
) %>% dplyr::select(
-dplyr::starts_with("rank "),
-dplyr::matches("_id")
)
}

# extract data from response content
data <- purrr::map_dfr(
res_content$result$records, ~.x
) %>% dplyr::select(
-dplyr::starts_with("rank "),
-dplyr::matches("_id")
)
if (include_context) {
# Get resource context if required
context_content <- phs_GET(
action = "resource_show",
query = paste0("id=", res_id)
)

res_id <- context_content$result$id
res_name <- context_content$result$name
res_created_date <- context_content$result$created
res_modified_date <- context_content$result$last_modified

data <- data %>%
add_context(
id = res_id,
name = res_name,
created_date = res_created_date,
modified_date = res_modified_date
)
}

return(data)
}
11 changes: 9 additions & 2 deletions R/request_url.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,18 @@
#'
request_url <- function(action, query) {
# check action is valid
valid_actions <- c("datastore_search", "datastore_search_sql", "dump", "package_show", "package_list")
valid_actions <- c(
"datastore_search",
"datastore_search_sql",
"dump",
"package_show",
"package_list",
"resource_show"
)
if (!(action %in% valid_actions)) {
cli::cli_abort(c(
"API call failed.",
x = "Invalid {.var action} argument in request."
x = "{.val {action}} is an invalid {.arg action} argument."
))
}

Expand Down
11 changes: 10 additions & 1 deletion man/get_dataset.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading