From a5bc79e3b55de2476d1d460ef58defa461019004 Mon Sep 17 00:00:00 2001 From: James McMahon Date: Tue, 26 Sep 2023 11:39:58 +0100 Subject: [PATCH] Update episode file functions to pass data through (#754) * Update `read_file` to return an empty tibble if passed the dummy path This is needed for some other bits, notably NSUs * Update SPARRA and HHG paths to return dummy if the year is invalid * Extract all data as a parameter * Style code * Update documentation * Style code * Update documentation * rename `run` to `create_episode_file` * Update documentation --------- Co-authored-by: Moohan Co-authored-by: Jennifer Thom Co-authored-by: Jennit07 --- R/add_nsu_cohort.R | 13 +++++--- R/create_episode_file.R | 61 +++++++++++++++++++++++++---------- R/fill_geographies.R | 51 +++++++++++++++++++++-------- R/get_source_extract_path.R | 26 +++++++-------- R/get_sparra_hhg_paths.R | 8 +++++ R/join_deaths_data.R | 6 ++-- R/match_on_ltcs.R | 8 +++-- R/read_file.R | 5 +++ _targets.R | 7 ++++ man/add_nsu_cohort.Rd | 4 ++- man/create_episode_file.Rd | 27 +++++++++++++--- man/create_individual_file.Rd | 2 +- man/fill_geographies.Rd | 11 ++++++- man/join_cohort_lookups.Rd | 12 ++++++- man/join_deaths_data.Rd | 4 +-- man/match_on_ltcs.Rd | 4 ++- 16 files changed, 183 insertions(+), 66 deletions(-) diff --git a/R/add_nsu_cohort.R b/R/add_nsu_cohort.R index c5a26da12..00260bb8e 100644 --- a/R/add_nsu_cohort.R +++ b/R/add_nsu_cohort.R @@ -2,13 +2,17 @@ #' #' @param data The input data frame #' @param year The year being processed +#' @param nsu_cohort The NSU data for the year #' #' @return A data frame containing the Non-Service Users as additional rows #' @export #' #' @family episode file #' @seealso [get_nsu_path()] -add_nsu_cohort <- function(data, year) { +add_nsu_cohort <- function( + data, + year, + nsu_cohort = read_file(get_nsu_path(year))) { year_param <- year if (!check_year_valid(year, "NSU")) { @@ -29,9 +33,9 @@ add_nsu_cohort <- function(data, year) { ) ) - matched <- dplyr::full_join(data, - # NSU cohort file - read_file(get_nsu_path(year)) %>% + matched <- dplyr::full_join( + data, + nsu_cohort %>% dplyr::mutate( dob = as.Date(.data[["dob"]]), gpprac = convert_eng_gpprac_to_dummy(.data[["gpprac"]]) @@ -110,7 +114,6 @@ add_nsu_cohort <- function(data, year) { .data[["chi"]] ) ) %>% - # Remove the additional columns dplyr::select(-dplyr::contains("_nsu"), -"has_chi") return(return_df) diff --git a/R/create_episode_file.R b/R/create_episode_file.R index bad42be5e..3dc33e193 100644 --- a/R/create_episode_file.R +++ b/R/create_episode_file.R @@ -1,17 +1,32 @@ -#' Create the Source Episode file +#' Produce the Source Episode file #' #' @param processed_data_list containing data from processed extracts. #' @param year The year to process, in FY format. #' @param write_to_disk (optional) Should the data be written to disk default is #' `TRUE` i.e. write the data to disk. +#' @inheritParams add_nsu_cohort +#' @inheritParams fill_geographies +#' @inheritParams join_cohort_lookups +#' @inheritParams join_deaths_data +#' @inheritParams match_on_ltcs +#' @inheritParams link_delayed_discharge_eps #' @param anon_chi_out (Default:TRUE) Should `anon_chi` be used in the output -#' (instead of chi). +#' (instead of chi) #' -#' @return the Source Episode file as a [tibble][tibble::tibble-package]. +#' @return a [tibble][tibble::tibble-package] containing the episode file #' @export create_episode_file <- function( processed_data_list, year, + dd_data = read_file(get_source_extract_path(year, "DD")), + nsu_cohort = read_file(get_nsu_path(year)), + ltc_data = read_file(get_ltcs_path(year)), + slf_pc_lookup = read_file(get_slf_postcode_path()), + slf_gpprac_lookup = read_file( + get_slf_gpprac_path(), + col_select = c("gpprac", "cluster", "hbpraccode") + ), + slf_deaths_lookup = read_file(get_slf_deaths_lookup_path(year)), write_to_disk = TRUE, anon_chi_out = TRUE) { episode_file <- dplyr::bind_rows(processed_data_list) %>% @@ -99,15 +114,21 @@ create_episode_file <- function( correct_cij_vars() %>% fill_missing_cij_markers() %>% add_ppa_flag() %>% - link_delayed_discharge_eps(year) %>% - add_nsu_cohort(year) %>% - match_on_ltcs(year) %>% + link_delayed_discharge_eps(year, dd_data) %>% + add_nsu_cohort(year, nsu_cohort) %>% + match_on_ltcs(year, ltc_data) %>% correct_demographics(year) %>% create_cohort_lookups(year) %>% join_cohort_lookups(year) %>% join_sparra_hhg(year) %>% - fill_geographies() %>% - join_deaths_data(year) %>% + fill_geographies( + slf_pc_lookup, + slf_gpprac_lookup + ) %>% + join_deaths_data( + year, + slf_deaths_lookup + ) %>% load_ep_file_vars(year) if (anon_chi_out) { @@ -354,22 +375,28 @@ create_cohort_lookups <- function(data, year, update = latest_update()) { #' #' @inheritParams store_ep_file_vars #' @inheritParams get_demographic_cohorts_path +#' @param demographic_cohort,service_use_cohort The cohort data #' #' @return The data including the Demographic and Service Use lookups. -join_cohort_lookups <- function(data, year, update = latest_update()) { +join_cohort_lookups <- function( + data, + year, + update = latest_update(), + demographic_cohort = read_file( + get_demographic_cohorts_path(year, update), + col_select = c("chi", "demographic_cohort") + ), + service_use_cohort = read_file( + get_service_use_cohorts_path(year, update), + col_select = c("chi", "service_use_cohort") + )) { join_cohort_lookups <- data %>% dplyr::left_join( - read_file( - get_demographic_cohorts_path(year, update), - col_select = c("chi", "demographic_cohort") - ), + demographic_cohort, by = "chi" ) %>% dplyr::left_join( - read_file( - get_service_use_cohorts_path(year, update), - col_select = c("chi", "service_use_cohort") - ), + service_use_cohort, by = "chi" ) diff --git a/R/fill_geographies.R b/R/fill_geographies.R index 8f4a470e8..c9aee6355 100644 --- a/R/fill_geographies.R +++ b/R/fill_geographies.R @@ -4,10 +4,18 @@ #' then use the lookups to match on additional variables. #' #' @param data the SLF +#' @param slf_pc_lookup The SLF Postcode lookup +#' @param slf_gpprac_lookup The SLF GP Practice lookup #' #' @return a [tibble][tibble::tibble-package] of the SLF with improved #' Postcode and GP Practice details. -fill_geographies <- function(data) { +fill_geographies <- function( + data, + slf_pc_lookup = read_file(get_slf_postcode_path()), + slf_gpprac_lookup = read_file( + get_slf_gpprac_path(), + col_select = c("gpprac", "cluster", "hbpraccode") + )) { check_variables_exist(data, c( "chi", "postcode", @@ -21,8 +29,15 @@ fill_geographies <- function(data) { )) data %>% - fill_postcode_geogs() %>% - fill_gpprac_geographies() + fill_postcode_geogs( + slf_pc_lookup = read_file(get_slf_postcode_path()) + ) %>% + fill_gpprac_geographies( + slf_gpprac_lookup = read_file( + get_slf_gpprac_path(), + col_select = c("gpprac", "cluster", "hbpraccode") + ) + ) } #' Make a postcode lookup for filling to most recent postcodes based on CHI @@ -86,9 +101,9 @@ make_gpprac_lookup <- function(data) { return(gpprac_lookup) } -fill_postcode_geogs <- function(data) { - slf_pc_lookup <- read_file(get_slf_postcode_path()) - +fill_postcode_geogs <- function( + data, + slf_pc_lookup) { filled_postcodes <- dplyr::left_join( data, make_postcode_lookup(data), @@ -123,17 +138,20 @@ fill_postcode_geogs <- function(data) { lca = dplyr::coalesce(.data$lca, .data$lca_old), datazone2011 = dplyr::coalesce(.data$datazone2011, .data$datazone2011_old) ) %>% - dplyr::select(!c("hb2018", "hscp", "lca_old", "datazone2011_old", "most_recent_postcode")) + dplyr::select(!c( + "hb2018", + "hscp", + "lca_old", + "datazone2011_old", + "most_recent_postcode" + )) return(filled_postcodes) } -fill_gpprac_geographies <- function(data) { - gpprac_ref <- read_file( - get_slf_gpprac_path(), - col_select = c("gpprac", "cluster", "hbpraccode") - ) - +fill_gpprac_geographies <- function( + data, + slf_gpprac_lookup) { filled_gpprac <- dplyr::left_join( data, make_gpprac_lookup(data), @@ -147,7 +165,12 @@ fill_gpprac_geographies <- function(data) { .data$gpprac ) ) %>% - dplyr::left_join(gpprac_ref, by = "gpprac", suffix = c("_old", "")) %>% + dplyr::left_join( + slf_gpprac_lookup %>% + dplyr::select("gpprac", "cluster", "hbpraccode"), + by = "gpprac", + suffix = c("_old", "") + ) %>% dplyr::mutate( hbpraccode = dplyr::coalesce(.data$hbpraccode, .data$hbpraccode_old) ) %>% diff --git a/R/get_source_extract_path.R b/R/get_source_extract_path.R index 4cb5eef44..37ed545cf 100644 --- a/R/get_source_extract_path.R +++ b/R/get_source_extract_path.R @@ -41,34 +41,34 @@ get_source_extract_path <- function( type <- match.arg(type) if (!check_year_valid(year, type)) { - return(NA) + return(get_dummy_boxi_extract_path()) } file_name <- dplyr::case_match( type, "Acute" ~ "acute_for_source", - "AE" ~ "a&e_for_source", - "AT" ~ "Alarms-Telecare-for-source", + "AE" ~ "a_and_e_for_source", + "AT" ~ "alarms-telecare-for-source", "CH" ~ "care_home_for_source", - "CMH" ~ "CMH_for_source", + "CMH" ~ "cmh_for_source", "Client" ~ "client_for_source", - "DD" ~ "DD_for_source", + "DD" ~ "delayed_discharge_for_source", "Deaths" ~ "deaths_for_source", - "DN" ~ "DN_for_source", - "GPOoH" ~ "GP_OOH_for_source", - "HC" ~ "Home_Care_for_source", + "DN" ~ "district_nursing_for_source", + "GPOoH" ~ "gp_ooh_for_source", + "HC" ~ "home_care_for_source", "Homelessness" ~ "homelessness_for_source", "Maternity" ~ "maternity_for_source", "MH" ~ "mental_health_for_source", - "DD" ~ "DD_for_source", "Outpatients" ~ "outpatients_for_source", - "PIS" ~ "prescribing_file_for_source", - "SDS" ~ "SDS-for-source" - ) + "PIS" ~ "prescribing_for_source", + "SDS" ~ "sds_for_source" + ) %>% + stringr::str_glue("-{year}.parquet") source_extract_path <- get_file_path( directory = get_year_dir(year), - file_name = stringr::str_glue("{file_name}-20{year}.parquet"), + file_name = file_name, ... ) diff --git a/R/get_sparra_hhg_paths.R b/R/get_sparra_hhg_paths.R index 2fd1a69f9..157160ed4 100644 --- a/R/get_sparra_hhg_paths.R +++ b/R/get_sparra_hhg_paths.R @@ -10,6 +10,10 @@ #' @family extract file paths #' @seealso [get_file_path()] for the generic function. get_hhg_path <- function(year, ...) { + if (!check_year_valid(year, "HHG")) { + return(get_dummy_boxi_extract_path()) + } + hhg_file_path <- get_file_path( directory = fs::path(get_slf_dir(), "HHG"), file_name = stringr::str_glue("HHG-20{year}.parquet"), @@ -31,6 +35,10 @@ get_hhg_path <- function(year, ...) { #' @family extract file paths #' @seealso [get_file_path()] for the generic function. get_sparra_path <- function(year, ...) { + if (!check_year_valid(year, "SPARRA")) { + return(get_dummy_boxi_extract_path()) + } + sparra_file_path <- get_file_path( directory = fs::path(get_slf_dir(), "SPARRA"), file_name = stringr::str_glue("SPARRA-20{year}.parquet"), diff --git a/R/join_deaths_data.R b/R/join_deaths_data.R index 694d2e2b9..89bcbbe13 100644 --- a/R/join_deaths_data.R +++ b/R/join_deaths_data.R @@ -2,16 +2,14 @@ #' #' @param data Episode file data #' @param year financial year, e.g. '1920' -#' @param slf_deaths_lookup_path Path to slf deaths lookup. +#' @param slf_deaths_lookup The SLF deaths lookup. #' #' @return The data including the deaths lookup matched #' on to the episode file. join_deaths_data <- function( data, year, - slf_deaths_lookup_path = get_slf_deaths_lookup_path(year)) { - slf_deaths_lookup <- read_file(slf_deaths_lookup_path) - + slf_deaths_lookup = read_file(get_slf_deaths_lookup_path(year))) { return( data %>% dplyr::left_join( diff --git a/R/match_on_ltcs.R b/R/match_on_ltcs.R index 42345655a..f83f31325 100644 --- a/R/match_on_ltcs.R +++ b/R/match_on_ltcs.R @@ -5,13 +5,17 @@ #' #' @param data episode files #' @param year financial year, e.g. '1920' +#' @param ltc_data The LTC data for the year #' #' @return data matched with long term conditions -match_on_ltcs <- function(data, year) { +match_on_ltcs <- function( + data, + year, + ltc_data = read_file(get_ltcs_path(year))) { # Match on LTC lookup matched <- dplyr::left_join( data, - read_file(get_ltcs_path(year)), + ltc_data, by = "chi", suffix = c("", "_ltc") ) %>% diff --git a/R/read_file.R b/R/read_file.R index 2941b62ed..be0a6fc65 100644 --- a/R/read_file.R +++ b/R/read_file.R @@ -27,6 +27,11 @@ read_file <- function(path, col_select = NULL, as_data_frame = TRUE, ...) { "parquet" ) + # Return an empty tibble if trying to read the dummy path + if (path == get_dummy_boxi_extract_path()) { + return(tibble::tibble()) + } + ext <- fs::path_ext(path) if (ext == "gz") { diff --git a/_targets.R b/_targets.R index db26477ef..a9fa80d7a 100644 --- a/_targets.R +++ b/_targets.R @@ -543,11 +543,18 @@ list( source_sc_alarms_tele ) ), + tar_file_read(nsu_cohort, get_nsu_path(year), read_file(!!.x)), tar_target( episode_file, create_episode_file( processed_data_list, year, + dd_data = source_dd_extract, + nsu_cohort = nsu_cohort, + ltc_data = source_ltc_lookup, + slf_pc_lookup = source_pc_lookup, + slf_gpprac_lookup = source_gp_lookup, + slf_deaths_lookup = slf_deaths_lookup, write_to_disk ) ), diff --git a/man/add_nsu_cohort.Rd b/man/add_nsu_cohort.Rd index 723c105e1..4ea9324e0 100644 --- a/man/add_nsu_cohort.Rd +++ b/man/add_nsu_cohort.Rd @@ -4,12 +4,14 @@ \alias{add_nsu_cohort} \title{Add NSU cohort to working file} \usage{ -add_nsu_cohort(data, year) +add_nsu_cohort(data, year, nsu_cohort = read_file(get_nsu_path(year))) } \arguments{ \item{data}{The input data frame} \item{year}{The year being processed} + +\item{nsu_cohort}{The NSU data for the year} } \value{ A data frame containing the Non-Service Users as additional rows diff --git a/man/create_episode_file.Rd b/man/create_episode_file.Rd index e1bda32b9..c1ce0e063 100644 --- a/man/create_episode_file.Rd +++ b/man/create_episode_file.Rd @@ -2,11 +2,18 @@ % Please edit documentation in R/create_episode_file.R \name{create_episode_file} \alias{create_episode_file} -\title{Create the Source Episode file} +\title{Produce the Source Episode file} \usage{ create_episode_file( processed_data_list, year, + dd_data = read_file(get_source_extract_path(year, "DD")), + nsu_cohort = read_file(get_nsu_path(year)), + ltc_data = read_file(get_ltcs_path(year)), + slf_pc_lookup = read_file(get_slf_postcode_path()), + slf_gpprac_lookup = read_file(get_slf_gpprac_path(), col_select = c("gpprac", + "cluster", "hbpraccode")), + slf_deaths_lookup = read_file(get_slf_deaths_lookup_path(year)), write_to_disk = TRUE, anon_chi_out = TRUE ) @@ -16,15 +23,27 @@ create_episode_file( \item{year}{The year to process, in FY format.} +\item{dd_data}{The processed DD extract} + +\item{nsu_cohort}{The NSU data for the year} + +\item{ltc_data}{The LTC data for the year} + +\item{slf_pc_lookup}{The SLF Postcode lookup} + +\item{slf_gpprac_lookup}{The SLF GP Practice lookup} + +\item{slf_deaths_lookup}{The SLF deaths lookup.} + \item{write_to_disk}{(optional) Should the data be written to disk default is \code{TRUE} i.e. write the data to disk.} \item{anon_chi_out}{(Default:TRUE) Should \code{anon_chi} be used in the output -(instead of chi).} +(instead of chi)} } \value{ -the Source Episode file as a \link[tibble:tibble-package]{tibble}. +a \link[tibble:tibble-package]{tibble} containing the episode file } \description{ -Create the Source Episode file +Produce the Source Episode file } diff --git a/man/create_individual_file.Rd b/man/create_individual_file.Rd index 4fd9a4a53..c4502e5ae 100644 --- a/man/create_individual_file.Rd +++ b/man/create_individual_file.Rd @@ -24,7 +24,7 @@ create_individual_file( (instead of chi).} \item{anon_chi_out}{(Default:TRUE) Should \code{anon_chi} be used in the output -(instead of chi).} +(instead of chi)} } \value{ The processed individual file diff --git a/man/fill_geographies.Rd b/man/fill_geographies.Rd index 5308fd8d0..bb619405b 100644 --- a/man/fill_geographies.Rd +++ b/man/fill_geographies.Rd @@ -4,10 +4,19 @@ \alias{fill_geographies} \title{Fill postcode and GP practice geographies} \usage{ -fill_geographies(data) +fill_geographies( + data, + slf_pc_lookup = read_file(get_slf_postcode_path()), + slf_gpprac_lookup = read_file(get_slf_gpprac_path(), col_select = c("gpprac", + "cluster", "hbpraccode")) +) } \arguments{ \item{data}{the SLF} + +\item{slf_pc_lookup}{The SLF Postcode lookup} + +\item{slf_gpprac_lookup}{The SLF GP Practice lookup} } \value{ a \link[tibble:tibble-package]{tibble} of the SLF with improved diff --git a/man/join_cohort_lookups.Rd b/man/join_cohort_lookups.Rd index 15a860a36..3ef549cc3 100644 --- a/man/join_cohort_lookups.Rd +++ b/man/join_cohort_lookups.Rd @@ -4,7 +4,15 @@ \alias{join_cohort_lookups} \title{Join cohort lookups} \usage{ -join_cohort_lookups(data, year, update = latest_update()) +join_cohort_lookups( + data, + year, + update = latest_update(), + demographic_cohort = read_file(get_demographic_cohorts_path(year, update), col_select = + c("chi", "demographic_cohort")), + service_use_cohort = read_file(get_service_use_cohorts_path(year, update), col_select = + c("chi", "service_use_cohort")) +) } \arguments{ \item{data}{The in-progress episode file data.} @@ -12,6 +20,8 @@ join_cohort_lookups(data, year, update = latest_update()) \item{year}{The year to process, in FY format.} \item{update}{The update to use} + +\item{demographic_cohort, service_use_cohort}{The cohort data} } \value{ The data including the Demographic and Service Use lookups. diff --git a/man/join_deaths_data.Rd b/man/join_deaths_data.Rd index 6508d7893..f3b68fe1a 100644 --- a/man/join_deaths_data.Rd +++ b/man/join_deaths_data.Rd @@ -7,7 +7,7 @@ join_deaths_data( data, year, - slf_deaths_lookup_path = get_slf_deaths_lookup_path(year) + slf_deaths_lookup = read_file(get_slf_deaths_lookup_path(year)) ) } \arguments{ @@ -15,7 +15,7 @@ join_deaths_data( \item{year}{financial year, e.g. '1920'} -\item{slf_deaths_lookup_path}{Path to slf deaths lookup.} +\item{slf_deaths_lookup}{The SLF deaths lookup.} } \value{ The data including the deaths lookup matched diff --git a/man/match_on_ltcs.Rd b/man/match_on_ltcs.Rd index 0c7e7fb53..e0def00cc 100644 --- a/man/match_on_ltcs.Rd +++ b/man/match_on_ltcs.Rd @@ -4,12 +4,14 @@ \alias{match_on_ltcs} \title{Match on LTC DoB and dates of LTC incidence} \usage{ -match_on_ltcs(data, year) +match_on_ltcs(data, year, ltc_data = read_file(get_ltcs_path(year))) } \arguments{ \item{data}{episode files} \item{year}{financial year, e.g. '1920'} + +\item{ltc_data}{The LTC data for the year} } \value{ data matched with long term conditions