Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update episode file functions to pass data through #754

Merged
merged 12 commits into from
Sep 26, 2023
13 changes: 8 additions & 5 deletions R/add_nsu_cohort.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,17 @@
#'
#' @param data The input data frame
#' @param year The year being processed
#' @param nsu_cohort The NSU data for the year
#'
#' @return A data frame containing the Non-Service Users as additional rows
#' @export
#'
#' @family episode file
#' @seealso [get_nsu_path()]
add_nsu_cohort <- function(data, year) {
add_nsu_cohort <- function(
data,
year,
nsu_cohort = read_file(get_nsu_path(year))) {
year_param <- year

if (!check_year_valid(year, "NSU")) {
Expand All @@ -29,9 +33,9 @@ add_nsu_cohort <- function(data, year) {
)
)

matched <- dplyr::full_join(data,
# NSU cohort file
read_file(get_nsu_path(year)) %>%
matched <- dplyr::full_join(
data,
nsu_cohort %>%
dplyr::mutate(
dob = as.Date(.data[["dob"]]),
gpprac = convert_eng_gpprac_to_dummy(.data[["gpprac"]])
Expand Down Expand Up @@ -110,7 +114,6 @@ add_nsu_cohort <- function(data, year) {
.data[["chi"]]
)
) %>%
# Remove the additional columns
dplyr::select(-dplyr::contains("_nsu"), -"has_chi")

return(return_df)
Expand Down
61 changes: 44 additions & 17 deletions R/create_episode_file.R
Original file line number Diff line number Diff line change
@@ -1,17 +1,32 @@
#' Create the Source Episode file
#' Produce the Source Episode file
#'
#' @param processed_data_list containing data from processed extracts.
#' @param year The year to process, in FY format.
#' @param write_to_disk (optional) Should the data be written to disk default is
#' `TRUE` i.e. write the data to disk.
#' @inheritParams add_nsu_cohort
#' @inheritParams fill_geographies
#' @inheritParams join_cohort_lookups
#' @inheritParams join_deaths_data
#' @inheritParams match_on_ltcs
#' @inheritParams link_delayed_discharge_eps
#' @param anon_chi_out (Default:TRUE) Should `anon_chi` be used in the output
#' (instead of chi).
#' (instead of chi)
#'
#' @return the Source Episode file as a [tibble][tibble::tibble-package].
#' @return a [tibble][tibble::tibble-package] containing the episode file
#' @export
create_episode_file <- function(
processed_data_list,
year,
dd_data = read_file(get_source_extract_path(year, "DD")),
nsu_cohort = read_file(get_nsu_path(year)),
ltc_data = read_file(get_ltcs_path(year)),
slf_pc_lookup = read_file(get_slf_postcode_path()),
slf_gpprac_lookup = read_file(
get_slf_gpprac_path(),
col_select = c("gpprac", "cluster", "hbpraccode")
),
slf_deaths_lookup = read_file(get_slf_deaths_lookup_path(year)),
write_to_disk = TRUE,
anon_chi_out = TRUE) {
episode_file <- dplyr::bind_rows(processed_data_list) %>%
Expand Down Expand Up @@ -99,15 +114,21 @@ create_episode_file <- function(
correct_cij_vars() %>%
fill_missing_cij_markers() %>%
add_ppa_flag() %>%
link_delayed_discharge_eps(year) %>%
add_nsu_cohort(year) %>%
match_on_ltcs(year) %>%
link_delayed_discharge_eps(year, dd_data) %>%
add_nsu_cohort(year, nsu_cohort) %>%
match_on_ltcs(year, ltc_data) %>%
correct_demographics(year) %>%
create_cohort_lookups(year) %>%
join_cohort_lookups(year) %>%
join_sparra_hhg(year) %>%
fill_geographies() %>%
join_deaths_data(year) %>%
fill_geographies(
slf_pc_lookup,
slf_gpprac_lookup
) %>%
join_deaths_data(
year,
slf_deaths_lookup
) %>%
load_ep_file_vars(year)

if (anon_chi_out) {
Expand Down Expand Up @@ -354,22 +375,28 @@ create_cohort_lookups <- function(data, year, update = latest_update()) {
#'
#' @inheritParams store_ep_file_vars
#' @inheritParams get_demographic_cohorts_path
#' @param demographic_cohort,service_use_cohort The cohort data
#'
#' @return The data including the Demographic and Service Use lookups.
join_cohort_lookups <- function(data, year, update = latest_update()) {
join_cohort_lookups <- function(
data,
year,
update = latest_update(),
demographic_cohort = read_file(
get_demographic_cohorts_path(year, update),
col_select = c("chi", "demographic_cohort")
),
service_use_cohort = read_file(
get_service_use_cohorts_path(year, update),
col_select = c("chi", "service_use_cohort")
)) {
join_cohort_lookups <- data %>%
dplyr::left_join(
read_file(
get_demographic_cohorts_path(year, update),
col_select = c("chi", "demographic_cohort")
),
demographic_cohort,
by = "chi"
) %>%
dplyr::left_join(
read_file(
get_service_use_cohorts_path(year, update),
col_select = c("chi", "service_use_cohort")
),
service_use_cohort,
by = "chi"
)

Expand Down
51 changes: 37 additions & 14 deletions R/fill_geographies.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,18 @@
#' then use the lookups to match on additional variables.
#'
#' @param data the SLF
#' @param slf_pc_lookup The SLF Postcode lookup
#' @param slf_gpprac_lookup The SLF GP Practice lookup
#'
#' @return a [tibble][tibble::tibble-package] of the SLF with improved
#' Postcode and GP Practice details.
fill_geographies <- function(data) {
fill_geographies <- function(
data,
slf_pc_lookup = read_file(get_slf_postcode_path()),
slf_gpprac_lookup = read_file(
get_slf_gpprac_path(),
col_select = c("gpprac", "cluster", "hbpraccode")
)) {
check_variables_exist(data, c(
"chi",
"postcode",
Expand All @@ -21,8 +29,15 @@ fill_geographies <- function(data) {
))

data %>%
fill_postcode_geogs() %>%
fill_gpprac_geographies()
fill_postcode_geogs(
slf_pc_lookup = read_file(get_slf_postcode_path())
) %>%
fill_gpprac_geographies(
slf_gpprac_lookup = read_file(
get_slf_gpprac_path(),
col_select = c("gpprac", "cluster", "hbpraccode")
)
)
}

#' Make a postcode lookup for filling to most recent postcodes based on CHI
Expand Down Expand Up @@ -86,9 +101,9 @@ make_gpprac_lookup <- function(data) {
return(gpprac_lookup)
}

fill_postcode_geogs <- function(data) {
slf_pc_lookup <- read_file(get_slf_postcode_path())

fill_postcode_geogs <- function(
data,
slf_pc_lookup) {
filled_postcodes <- dplyr::left_join(
data,
make_postcode_lookup(data),
Expand Down Expand Up @@ -123,17 +138,20 @@ fill_postcode_geogs <- function(data) {
lca = dplyr::coalesce(.data$lca, .data$lca_old),
datazone2011 = dplyr::coalesce(.data$datazone2011, .data$datazone2011_old)
) %>%
dplyr::select(!c("hb2018", "hscp", "lca_old", "datazone2011_old", "most_recent_postcode"))
dplyr::select(!c(
"hb2018",
"hscp",
"lca_old",
"datazone2011_old",
"most_recent_postcode"
))

return(filled_postcodes)
}

fill_gpprac_geographies <- function(data) {
gpprac_ref <- read_file(
get_slf_gpprac_path(),
col_select = c("gpprac", "cluster", "hbpraccode")
)

fill_gpprac_geographies <- function(
data,
slf_gpprac_lookup) {
filled_gpprac <- dplyr::left_join(
data,
make_gpprac_lookup(data),
Expand All @@ -147,7 +165,12 @@ fill_gpprac_geographies <- function(data) {
.data$gpprac
)
) %>%
dplyr::left_join(gpprac_ref, by = "gpprac", suffix = c("_old", "")) %>%
dplyr::left_join(
slf_gpprac_lookup %>%
dplyr::select("gpprac", "cluster", "hbpraccode"),
by = "gpprac",
suffix = c("_old", "")
) %>%
dplyr::mutate(
hbpraccode = dplyr::coalesce(.data$hbpraccode, .data$hbpraccode_old)
) %>%
Expand Down
26 changes: 13 additions & 13 deletions R/get_source_extract_path.R
Original file line number Diff line number Diff line change
Expand Up @@ -41,34 +41,34 @@ get_source_extract_path <- function(
type <- match.arg(type)

if (!check_year_valid(year, type)) {
return(NA)
return(get_dummy_boxi_extract_path())
}

file_name <- dplyr::case_match(
type,
"Acute" ~ "acute_for_source",
"AE" ~ "a&e_for_source",
"AT" ~ "Alarms-Telecare-for-source",
"AE" ~ "a_and_e_for_source",
"AT" ~ "alarms-telecare-for-source",
"CH" ~ "care_home_for_source",
"CMH" ~ "CMH_for_source",
"CMH" ~ "cmh_for_source",
"Client" ~ "client_for_source",
"DD" ~ "DD_for_source",
"DD" ~ "delayed_discharge_for_source",
"Deaths" ~ "deaths_for_source",
"DN" ~ "DN_for_source",
"GPOoH" ~ "GP_OOH_for_source",
"HC" ~ "Home_Care_for_source",
"DN" ~ "district_nursing_for_source",
"GPOoH" ~ "gp_ooh_for_source",
"HC" ~ "home_care_for_source",
"Homelessness" ~ "homelessness_for_source",
"Maternity" ~ "maternity_for_source",
"MH" ~ "mental_health_for_source",
"DD" ~ "DD_for_source",
"Outpatients" ~ "outpatients_for_source",
"PIS" ~ "prescribing_file_for_source",
"SDS" ~ "SDS-for-source"
)
"PIS" ~ "prescribing_for_source",
"SDS" ~ "sds_for_source"
) %>%
stringr::str_glue("-{year}.parquet")

source_extract_path <- get_file_path(
directory = get_year_dir(year),
file_name = stringr::str_glue("{file_name}-20{year}.parquet"),
file_name = file_name,
...
)

Expand Down
8 changes: 8 additions & 0 deletions R/get_sparra_hhg_paths.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@
#' @family extract file paths
#' @seealso [get_file_path()] for the generic function.
get_hhg_path <- function(year, ...) {
if (!check_year_valid(year, "HHG")) {
return(get_dummy_boxi_extract_path())
}

hhg_file_path <- get_file_path(
directory = fs::path(get_slf_dir(), "HHG"),
file_name = stringr::str_glue("HHG-20{year}.parquet"),
Expand All @@ -31,6 +35,10 @@ get_hhg_path <- function(year, ...) {
#' @family extract file paths
#' @seealso [get_file_path()] for the generic function.
get_sparra_path <- function(year, ...) {
if (!check_year_valid(year, "SPARRA")) {
return(get_dummy_boxi_extract_path())
}

sparra_file_path <- get_file_path(
directory = fs::path(get_slf_dir(), "SPARRA"),
file_name = stringr::str_glue("SPARRA-20{year}.parquet"),
Expand Down
6 changes: 2 additions & 4 deletions R/join_deaths_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,14 @@
#'
#' @param data Episode file data
#' @param year financial year, e.g. '1920'
#' @param slf_deaths_lookup_path Path to slf deaths lookup.
#' @param slf_deaths_lookup The SLF deaths lookup.
#'
#' @return The data including the deaths lookup matched
#' on to the episode file.
join_deaths_data <- function(
data,
year,
slf_deaths_lookup_path = get_slf_deaths_lookup_path(year)) {
slf_deaths_lookup <- read_file(slf_deaths_lookup_path)

slf_deaths_lookup = read_file(get_slf_deaths_lookup_path(year))) {
return(
data %>%
dplyr::left_join(
Expand Down
8 changes: 6 additions & 2 deletions R/match_on_ltcs.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,17 @@
#'
#' @param data episode files
#' @param year financial year, e.g. '1920'
#' @param ltc_data The LTC data for the year
#'
#' @return data matched with long term conditions
match_on_ltcs <- function(data, year) {
match_on_ltcs <- function(
data,
year,
ltc_data = read_file(get_ltcs_path(year))) {
# Match on LTC lookup
matched <- dplyr::left_join(
data,
read_file(get_ltcs_path(year)),
ltc_data,
by = "chi",
suffix = c("", "_ltc")
) %>%
Expand Down
5 changes: 5 additions & 0 deletions R/read_file.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ read_file <- function(path, col_select = NULL, as_data_frame = TRUE, ...) {
"parquet"
)

# Return an empty tibble if trying to read the dummy path
if (path == get_dummy_boxi_extract_path()) {
return(tibble::tibble())
}

ext <- fs::path_ext(path)

if (ext == "gz") {
Expand Down
7 changes: 7 additions & 0 deletions _targets.R
Original file line number Diff line number Diff line change
Expand Up @@ -543,11 +543,18 @@ list(
source_sc_alarms_tele
)
),
tar_file_read(nsu_cohort, get_nsu_path(year), read_file(!!.x)),
tar_target(
episode_file,
create_episode_file(
processed_data_list,
year,
dd_data = source_dd_extract,
nsu_cohort = nsu_cohort,
ltc_data = source_ltc_lookup,
slf_pc_lookup = source_pc_lookup,
slf_gpprac_lookup = source_gp_lookup,
slf_deaths_lookup = slf_deaths_lookup,
write_to_disk
)
),
Expand Down
4 changes: 3 additions & 1 deletion man/add_nsu_cohort.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading