From ddea828a5f1f72d05f50d744a79f9da7480581e7 Mon Sep 17 00:00:00 2001 From: Zihao Li Date: Mon, 7 Oct 2024 10:52:28 +0100 Subject: [PATCH 1/7] distinct death date, keep the earliest one and remove na --- R/process_refined_death.R | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/R/process_refined_death.R b/R/process_refined_death.R index dc7663221..dcbef7135 100644 --- a/R/process_refined_death.R +++ b/R/process_refined_death.R @@ -49,8 +49,13 @@ process_refined_death <- function( dplyr::mutate( fy = phsmethods::extract_fin_year(death_date), fy = as.character(paste0(substr(fy, 3, 4), substr(fy, 6, 7))) - ) - # TODO: check distinct death data by chi while keeping chi==NA records + ) %>% + # no need to keep NA + dplyr::filter(!is.na(anon_chi)) %>% + dplyr::group_by(anon_chi) %>% + dplyr::arrange(death_date) %>% + dplyr::distinct(anon_chi, .keep_all = TRUE) %>% + dplyr::ungroup() if (write_to_disk) { write_file( From 7190e313c09965c7b1609c7bf8b4efd1881fac08 Mon Sep 17 00:00:00 2001 From: Zihao Li Date: Mon, 7 Oct 2024 10:53:15 +0100 Subject: [PATCH 2/7] add activity after death 100% accurate joining --- R/add_activity_after_death_flag.R | 98 +++++++++++++++++-------------- 1 file changed, 53 insertions(+), 45 deletions(-) diff --git a/R/add_activity_after_death_flag.R b/R/add_activity_after_death_flag.R index 5e800c80b..4ed1d8230 100644 --- a/R/add_activity_after_death_flag.R +++ b/R/add_activity_after_death_flag.R @@ -18,47 +18,44 @@ add_activity_after_death_flag <- function( # to skip warnings no visible binding for global variable ‘.’ . 
<- NULL + data = data %>% + dplyr::mutate(ep_row_id_death = dplyr::row_number()) + death_joined <- data %>% - dplyr::select(.data$year, .data$chi, .data$record_keydate1, .data$record_keydate2, .data$death_date, .data$deceased) %>% - dplyr::filter(!is.na(.data$chi) | .data$chi != "") %>% - dplyr::left_join( - deaths_data, - by = "chi", - suffix = c("", "_boxi") + dplyr::select( + "year", + "chi", + "recid", + "record_keydate1", + "record_keydate2", + "death_date", + "deceased", + "ep_row_id_death" ) %>% + dplyr::filter(!is.na(.data$chi) & .data$chi != "") %>% + dplyr::left_join(deaths_data, + by = "chi", + suffix = c("", "_refined")) %>% dplyr::filter(.data$deceased == TRUE) %>% dplyr::distinct() - - # Check and print error message for records which already have a death_date in the episode file, but this doesn't match the BOXI death date - check_death_date_match <- death_joined %>% - dplyr::filter(.data$death_date != .data$death_date_boxi) - - if (nrow(check_death_date_match) != 0) { - warning("There were records in the episode file which already have a death_date, but does not match the BOXI NRS death date.") - } - - - # Check and print error message for records which have a record_keydate1 after their BOXI death date - check_keydate1_death_date <- death_joined %>% - dplyr::filter(.data$record_keydate1 > .data$death_date_boxi) - - if (nrow(check_death_date_match) != 0) { - warning("There were records in the episode file which have a record_keydate1 after the BOXI NRS death date.") - } - - flag_data <- death_joined %>% dplyr::mutate( - flag_keydate1 = dplyr::if_else(.data$record_keydate1 > .data$death_date_boxi, 1, 0), - flag_keydate2 = dplyr::if_else(.data$record_keydate2 > .data$death_date_boxi, 1, 0), + flag_keydate1 = dplyr::if_else(.data$record_keydate1 > .data$death_date_refined, 1, 0), + flag_keydate2 = dplyr::if_else(.data$record_keydate2 > .data$death_date_refined, 1, 0), # Next flag records with 'ongoing' activity after date of death (available from BOXI) 
if keydate2 is missing and the death date occurs in # in the current or a previous financial year. - flag_keydate2_missing = dplyr::if_else(((is.na(.data$record_keydate2) | .data$record_keydate2 == "") & (.data$death_date_boxi <= paste0("20", substr(.data$year, 3, 4), "-03-31"))), 1, 0), + flag_keydate2_missing = dplyr::if_else(((is.na(.data$record_keydate2) | + .data$record_keydate2 == "") & + (.data$death_date_refined <= paste0("20", substr(.data$year, 3, 4), "-03-31")) + ), 1, 0), # Also flag records without a death_date in the episode file, but the BOXI death date occurs in the current or a previous financial year. - flag_deathdate_missing = dplyr::if_else(((is.na(.data$death_date) | .data$death_date == "") & (.data$death_date_boxi <= paste0("20", substr(.data$year, 3, 4), "-03-31"))), 1, 0) + flag_deathdate_missing = dplyr::if_else(((is.na(.data$death_date) | + .data$death_date == "") & + (.data$death_date_refined <= paste0("20", substr(.data$year, 3, 4), "-03-31")) + ), 1, 0) ) %>% # These should be flagged by one of the two lines of code above, but in these cases, we will also fill in the blank death date if appropriate @@ -67,35 +64,46 @@ add_activity_after_death_flag <- function( dplyr::mutate(activity_after_death = purrr::pmap_dbl( dplyr::select(., tidyselect::contains("flag_")), ~ any(grepl("^1$", c(...)), - na.rm = TRUE - ) * 1 - )) - - - # Fill in date of death if missing in the episode file but available in BOXI lookup, due to historic dates of death not being carried - # over from previous financial years - flag_data <- flag_data %>% + na.rm = TRUE) * 1 + )) %>% + # Fill in date of death if missing in the episode file but available in BOXI lookup, due to historic dates of death not being carried + # over from previous financial years dplyr::filter(.data$activity_after_death == 1) %>% # Remove temporary flag variables used to create activity after death flag and fill in missing death_date - dplyr::select(.data$year, .data$chi, .data$record_keydate1, 
.data$record_keydate2, .data$activity_after_death, .data$death_date_boxi) %>% + dplyr::select( + year, + chi, + recid, + record_keydate1, + record_keydate2, + activity_after_death, + death_date_refined, + ep_row_id_death + ) %>% dplyr::distinct() # Match activity after death flag back to episode file final_data <- data %>% dplyr::left_join( flag_data, - # TODO: this join_by is not 100% accurate. Consider use ep_file_row_id to join - by = c("year", "chi", "record_keydate1", "record_keydate2"), + # this join_by is now 100% accurate. + by = c( + "year", + "chi", + "recid", + "record_keydate1", + "record_keydate2", + "ep_row_id_death" + ), na_matches = "never" ) %>% - dplyr::mutate(death_date = lubridate::as_date(ifelse(is.na(death_date) & !(is.na(death_date_boxi)), - death_date_boxi, death_date + dplyr::mutate(death_date = lubridate::as_date(ifelse( + is.na(death_date) & !(is.na(death_date_refined)), + death_date_refined, death_date ))) %>% - dplyr::select(-death_date_boxi) %>% + dplyr::select(-death_date_refined, -ep_row_id_death) %>% dplyr::distinct() - - return(final_data) } From f4bfd85e954e644f37422bcfca61a1ba6e864992 Mon Sep 17 00:00:00 2001 From: lizihao-anu Date: Mon, 7 Oct 2024 09:59:22 +0000 Subject: [PATCH 3/7] Style code --- R/add_activity_after_death_flag.R | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/R/add_activity_after_death_flag.R b/R/add_activity_after_death_flag.R index 4ed1d8230..c9de1f879 100644 --- a/R/add_activity_after_death_flag.R +++ b/R/add_activity_after_death_flag.R @@ -18,7 +18,7 @@ add_activity_after_death_flag <- function( # to skip warnings no visible binding for global variable ‘.’ . 
<- NULL - data = data %>% + data <- data %>% dplyr::mutate(ep_row_id_death = dplyr::row_number()) death_joined <- data %>% @@ -34,8 +34,9 @@ add_activity_after_death_flag <- function( ) %>% dplyr::filter(!is.na(.data$chi) & .data$chi != "") %>% dplyr::left_join(deaths_data, - by = "chi", - suffix = c("", "_refined")) %>% + by = "chi", + suffix = c("", "_refined") + ) %>% dplyr::filter(.data$deceased == TRUE) %>% dplyr::distinct() @@ -47,14 +48,14 @@ add_activity_after_death_flag <- function( # Next flag records with 'ongoing' activity after date of death (available from BOXI) if keydate2 is missing and the death date occurs in # in the current or a previous financial year. flag_keydate2_missing = dplyr::if_else(((is.na(.data$record_keydate2) | - .data$record_keydate2 == "") & - (.data$death_date_refined <= paste0("20", substr(.data$year, 3, 4), "-03-31")) + .data$record_keydate2 == "") & + (.data$death_date_refined <= paste0("20", substr(.data$year, 3, 4), "-03-31")) ), 1, 0), # Also flag records without a death_date in the episode file, but the BOXI death date occurs in the current or a previous financial year. 
flag_deathdate_missing = dplyr::if_else(((is.na(.data$death_date) | - .data$death_date == "") & - (.data$death_date_refined <= paste0("20", substr(.data$year, 3, 4), "-03-31")) + .data$death_date == "") & + (.data$death_date_refined <= paste0("20", substr(.data$year, 3, 4), "-03-31")) ), 1, 0) ) %>% # These should be flagged by one of the two lines of code above, but in these cases, we will also fill in the blank death date if appropriate @@ -64,7 +65,8 @@ add_activity_after_death_flag <- function( dplyr::mutate(activity_after_death = purrr::pmap_dbl( dplyr::select(., tidyselect::contains("flag_")), ~ any(grepl("^1$", c(...)), - na.rm = TRUE) * 1 + na.rm = TRUE + ) * 1 )) %>% # Fill in date of death if missing in the episode file but available in BOXI lookup, due to historic dates of death not being carried # over from previous financial years From 883f762e6f93c7fc05b2cec3bd592368c80ce49e Mon Sep 17 00:00:00 2001 From: Zihao Li Date: Tue, 8 Oct 2024 10:44:27 +0100 Subject: [PATCH 4/7] remove redundant combine death function --- R/add_activity_after_death_flag.R | 84 ------------------------------- 1 file changed, 84 deletions(-) diff --git a/R/add_activity_after_death_flag.R b/R/add_activity_after_death_flag.R index c9de1f879..338588a68 100644 --- a/R/add_activity_after_death_flag.R +++ b/R/add_activity_after_death_flag.R @@ -108,87 +108,3 @@ add_activity_after_death_flag <- function( return(final_data) } - - -#' Create and read SLF Deaths lookup from processed BOXI NRS deaths extracts -#' -#' @description The BOXI NRS deaths extract lookup should be created after the extract files for all years have been processed, -# but before an episode file has been produced. Therefore, all BOXI NRS years should be run before running episode files. -#' -#' @param ... 
additional arguments passed to [get_slf_deaths_lookup_path()] -#' @param update the update month (defaults to use [latest_update()]) -#' -#' @param write_to_disk (optional) Should the data be written to disk default is -#' `TRUE` i.e. write the data to disk. -#' -#' @return the final data as a [tibble][tibble::tibble-package]. -#' @export -#' -#' -#' -# Read data------------------------------------------------ - -process_combined_deaths_lookup <- function(update = latest_update(), - write_to_disk = TRUE, ...) { - dir_folder <- "/conf/hscdiip/SLF_Extracts/Deaths" - file_names <- list.files(dir_folder, - pattern = "^anon-slf_deaths_lookup_.*parquet", - full.names = TRUE - ) - - # read all year specific deaths lookups and bind them together - all_boxi_deaths <- lapply(file_names, arrow::read_parquet) %>% - data.table::rbindlist() %>% - # convert to chi for processing - slfhelper::get_chi() %>% - # Remove rows with missing or blank CHI number - could also use na.omit? - # na.omit(all_boxi_deaths) - dplyr::filter(!is.na(.data$chi) | .data$chi != "") - - # Check all CHI numbers are valid - chi_check <- all_boxi_deaths %>% - dplyr::pull(.data$chi) %>% - phsmethods::chi_check() - - if (!all(chi_check %in% c("Valid CHI", "Missing (Blank)", "Missing (NA)"))) { - # There are some Missing (NA) values in the extracts, but I have excluded them above as they cannot be matched to episode file - stop("There were bad CHI numbers in the BOXI NRS file") - } - - # Check and print error message for chi numbers with more than one death date - duplicates <- all_boxi_deaths %>% - janitor::get_dupes(.data$chi) - - if (nrow(duplicates) != 0) { - # There are some Missing (NA) values in the extracts, but I have excluded them above as they cannot be matched to episode file - warning("There were duplicate death dates in the BOXI NRS file.") - } - - - # We decided to include duplicates as unable to determine which is correct date (unless IT can tell us, however, they don't seem to know - # the 
process well enough), and overall impact will be negligible - # Get anon_chi and use this to match onto episode file later - all_boxi_deaths <- all_boxi_deaths %>% - slfhelper::get_anon_chi() - - # Save out duplicates for further investigation if needed (as anon_chi) - if (!missing(duplicates)) { - write_file( - duplicates, - fs::path(get_slf_dir(), "Deaths", - file_name = stringr::str_glue("slf_deaths_duplicates_{update}.parquet") - ) - ) - } - - # Maybe save as its own function - # Write the all BOXI NRS deaths lookup file to disk, so this can be used to populate activity after death flag in each episode file - if (write_to_disk) { - write_file( - all_boxi_deaths, - get_combined_slf_deaths_lookup_path() - ) - } - - return(all_boxi_deaths) -} From 8123e5f49932f330cd60811a5f38b8973949ac95 Mon Sep 17 00:00:00 2001 From: lizihao-anu Date: Tue, 8 Oct 2024 09:46:06 +0000 Subject: [PATCH 5/7] Update documentation --- NAMESPACE | 1 - man/process_combined_deaths_lookup.Rd | 26 -------------------------- 2 files changed, 27 deletions(-) delete mode 100644 man/process_combined_deaths_lookup.Rd diff --git a/NAMESPACE b/NAMESPACE index 6f1c88841..72ab76d43 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -89,7 +89,6 @@ export(midpoint_fy) export(next_fy) export(phs_db_connection) export(previous_update) -export(process_combined_deaths_lookup) export(process_costs_ch_rmd) export(process_costs_dn_rmd) export(process_costs_gp_ooh_rmd) diff --git a/man/process_combined_deaths_lookup.Rd b/man/process_combined_deaths_lookup.Rd deleted file mode 100644 index 7d0a75fc7..000000000 --- a/man/process_combined_deaths_lookup.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/add_activity_after_death_flag.R -\name{process_combined_deaths_lookup} -\alias{process_combined_deaths_lookup} -\title{Create and read SLF Deaths lookup from processed BOXI NRS deaths extracts} -\usage{ -process_combined_deaths_lookup( - update = latest_update(), 
- write_to_disk = TRUE, - ... -) -} -\arguments{ -\item{update}{the update month (defaults to use \code{\link[=latest_update]{latest_update()}})} - -\item{write_to_disk}{(optional) Should the data be written to disk default is -\code{TRUE} i.e. write the data to disk.} - -\item{...}{additional arguments passed to \code{\link[=get_slf_deaths_lookup_path]{get_slf_deaths_lookup_path()}}} -} -\value{ -the final data as a \link[tibble:tibble-package]{tibble}. -} -\description{ -The BOXI NRS deaths extract lookup should be created after the extract files for all years have been processed, -} From e699f6678634ccbd84e1c6a1cd1e12a8e781e4cc Mon Sep 17 00:00:00 2001 From: Zihao Li Date: Wed, 16 Oct 2024 13:05:00 +0100 Subject: [PATCH 6/7] fix NA in activity_after_death --- R/add_activity_after_death_flag.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/R/add_activity_after_death_flag.R b/R/add_activity_after_death_flag.R index ade69f457..144a08632 100644 --- a/R/add_activity_after_death_flag.R +++ b/R/add_activity_after_death_flag.R @@ -102,7 +102,10 @@ add_activity_after_death_flag <- function( death_date_refined, death_date ))) %>% dplyr::select(-death_date_refined, -ep_row_id_death) %>% - dplyr::distinct() + dplyr::distinct() %>% + dplyr::mutate(activity_after_death = dplyr::if_else(is.na(activity_after_death), + 0, + activity_after_death)) cli::cli_alert_info("Add activity after death flag function finished at {Sys.time()}") From 67b7d8876560bec44953ff402f228c0860b2a8de Mon Sep 17 00:00:00 2001 From: lizihao-anu Date: Wed, 16 Oct 2024 12:06:35 +0000 Subject: [PATCH 7/7] Style code --- R/add_activity_after_death_flag.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/add_activity_after_death_flag.R b/R/add_activity_after_death_flag.R index 144a08632..1e664de98 100644 --- a/R/add_activity_after_death_flag.R +++ b/R/add_activity_after_death_flag.R @@ -104,8 +104,9 @@ add_activity_after_death_flag <- function( dplyr::select(-death_date_refined, 
-ep_row_id_death) %>% dplyr::distinct() %>% dplyr::mutate(activity_after_death = dplyr::if_else(is.na(activity_after_death), - 0, - activity_after_death)) + 0, + activity_after_death + )) cli::cli_alert_info("Add activity after death flag function finished at {Sys.time()}")