diff --git a/R/add_activity_after_death_flag.R b/R/add_activity_after_death_flag.R index fdede9001..a45e4296a 100644 --- a/R/add_activity_after_death_flag.R +++ b/R/add_activity_after_death_flag.R @@ -125,6 +125,7 @@ add_activity_after_death_flag <- function( #' #' # Read data------------------------------------------------ + process_combined_deaths_lookup <- function(update = latest_update(), write_to_disk = TRUE, ...) { dir_folder <- "/conf/hscdiip/SLF_Extracts/Deaths" diff --git a/R/fill_ch_names.R b/R/fill_ch_names.R index cd8d18677..b6aa85bb7 100644 --- a/R/fill_ch_names.R +++ b/R/fill_ch_names.R @@ -213,7 +213,7 @@ fill_ch_names <- function(ch_data, "match_mean2", # "open_interval", "ch_admission_date", - "qtr_start", + "period_start_date", "ch_date_registered", "latest_close_date", "ch_active", @@ -305,7 +305,6 @@ fill_ch_names <- function(ch_data, "unique_identifier", "matching_quality_indicator", "sending_location", - "latest_sc_id", "chi", "ch_name", "ch_postcode", @@ -320,9 +319,6 @@ fill_ch_names <- function(ch_data, "ch_admission_date", "ch_discharge_date", "age", - "record_date", - "qtr_start", - "latest_flag", "gender", "dob", "postcode", @@ -763,7 +759,6 @@ fill_ch_names <- function(ch_data, ## produce output ---- col_output <- c( "sending_location", - "latest_sc_id", "chi", "ch_name", "ch_postcode", @@ -778,9 +773,6 @@ fill_ch_names <- function(ch_data, "ch_admission_date", "ch_discharge_date", "age", - "record_date", - "qtr_start", - "latest_flag", "gender", "dob", "postcode", diff --git a/R/process_extract_care_home.R b/R/process_extract_care_home.R index 54789eac9..db7997061 100644 --- a/R/process_extract_care_home.R +++ b/R/process_extract_care_home.R @@ -39,11 +39,11 @@ process_extract_care_home <- function( is_date_in_fyyear(year, .data$record_keydate1, .data$record_keydate2) ) %>% # remove any episodes where the latest submission was before the current year + # this is what stops cases being in future files dplyr::filter( substr(.data$sc_latest_submission, 1L, 4L) >= convert_fyyear_to_year(year) ) - # Data Cleaning --------------------------------------- source_ch_clean <- ch_data %>% # create variables diff --git a/R/process_lookup_sc_demographics.R b/R/process_lookup_sc_demographics.R index d6e24c87f..1b29c414c 100644 --- a/R/process_lookup_sc_demographics.R +++ b/R/process_lookup_sc_demographics.R @@ -143,8 +143,8 @@ process_lookup_sc_demographics <- function( dplyr::ungroup() # check to make sure all cases of chi are still there - dplyr::n_distinct(sc_demog_lookup$chi) # 524810 - dplyr::n_distinct(sc_demog_lookup$social_care_id) # 636404 + dplyr::n_distinct(sc_demog_lookup$chi) # 525,834 + dplyr::n_distinct(sc_demog_lookup$social_care_id) # 637,422 sc_demog_lookup <- sc_demog_lookup %>% slfhelper::get_anon_chi() diff --git a/R/process_sc_all_care_home.R b/R/process_sc_all_care_home.R index 7b87d68f0..8492268f7 100644 --- a/R/process_sc_all_care_home.R +++ b/R/process_sc_all_care_home.R @@ -19,38 +19,38 @@ #' @family process extracts #' #' @export -#' process_sc_all_care_home <- function( data, sc_demog_lookup = read_file(get_sc_demog_lookup_path()) %>% slfhelper::get_chi(), refined_death = read_file(get_combined_slf_deaths_lookup_path()) %>% slfhelper::get_chi(), - ch_name_lookup_path = read_file(get_slf_ch_name_lookup_path()), - spd_path = read_file(get_spd_path()), + ch_name_lookup_path = get_slf_ch_name_lookup_path(), + spd_path = get_spd_path(), write_to_disk = TRUE) { ## Data Cleaning----------------------------------------------------- + ch_clean <- data %>% dplyr::mutate( - record_date = end_fy_quarter(.data[["period"]]), - qtr_start = start_fy_quarter(.data[["period"]]), - # Set missing admission date to start of the submitted quarter + # Set missing admission date to start of the submitted quarter (n = 2) ch_admission_date = dplyr::if_else( is.na(.data[["ch_admission_date"]]), - .data[["qtr_start"]], + .data[["period_start_date"]], .data[["ch_admission_date"]] ), - # TODO check if we should set the dis date to the end of the period? - # If the dis date is before admission, remove the dis date + # If the dis date is before admission, remove the dis date (n = 5) ch_discharge_date = dplyr::if_else( .data[["ch_admission_date"]] > .data[["ch_discharge_date"]], lubridate::NA_Date_, .data[["ch_discharge_date"]] ) ) %>% - dplyr::left_join(sc_demog_lookup, + dplyr::full_join(sc_demog_lookup, # this is the correct join. by = c("sending_location", "social_care_id") ) %>% - replace_sc_id_with_latest() + replace_sc_id_with_latest() %>% + dplyr::select(-latest_flag, -latest_sc_id) + + # cleaning and matching care home names name_postcode_clean <- fill_ch_names( ch_data = ch_clean, ch_name_lookup_path = ch_name_lookup_path, @@ -58,23 +58,27 @@ process_sc_all_care_home <- function( ) fixed_ch_provider <- name_postcode_clean %>% + dplyr::select(-ch_name_validated, -open_interval, -latest_close_date, -ch_name_old, -ch_postcode_old) %>% dplyr::mutate( - ch_provider = dplyr::if_else(is.na(.data[["ch_provider"]]), 6L, .data[["ch_provider"]]) + ch_provider = dplyr::if_else(is.na(.data[["ch_provider"]]), 6L, .data[["ch_provider"]]) # (n = 2) ) %>% # sort data dplyr::arrange( - "sending_location", - "social_care_id", - "ch_admission_date", - "period" + .data[["sending_location"]], + .data[["social_care_id"]], + .data[["period"]], + .data[["ch_admission_date"]] ) %>% dplyr::group_by( .data[["sending_location"]], .data[["social_care_id"]] ) %>% + # work out the min and max ch provider in an episode dplyr::mutate( min_ch_provider = min(.data[["ch_provider"]]), max_ch_provider = max(.data[["ch_provider"]]), + # if care home provider is different across cases, set to "6". + # tidy up ch_provider using 6 when disagreeing values ch_provider = dplyr::if_else( .data[["min_ch_provider"]] != .data[["max_ch_provider"]], 6L, @@ -85,12 +89,9 @@ process_sc_all_care_home <- function( -"min_ch_provider", -"max_ch_provider" ) %>% - # tidy up ch_provider using 6 when disagreeing values - tidyr::fill(.data[["ch_provider"]], .direction = "downup") %>% dplyr::ungroup() - fixed_nursing_provision <- fixed_ch_provider %>% dplyr::group_by( .data[["sending_location"]], @@ -98,7 +99,7 @@ process_sc_all_care_home <- function( .data[["ch_admission_date"]] ) %>% # fill in nursing care provision when missing - # but present in the following entry + # but present in the following entry (n = 0) dplyr::mutate( nursing_care_provision = dplyr::na_if(.data[["nursing_care_provision"]], 9L) ) %>% @@ -106,26 +107,44 @@ process_sc_all_care_home <- function( ready_to_merge <- fixed_nursing_provision %>% - # remove any duplicate records before merging for speed and simplicity - dplyr::distinct() %>% + # remove any duplicate records before merging + dplyr::distinct() %>% # (n = 3) + # sort data + dplyr::arrange( + .data[["sending_location"]], + .data[["social_care_id"]], + .data[["ch_admission_date"]], + .data[["period"]] + ) %>% + dplyr::group_by( + .data[["sending_location"]], + .data[["social_care_id"]], + .data[["ch_admission_date"]] + ) %>% # counter for split episodes - dplyr::mutate( - split_episode = tidyr::replace_na( - .data[["nursing_care_provision"]] != dplyr::lag( - .data[["nursing_care_provision"]] - ), - TRUE - ), - split_episode_counter = cumsum(.data[["split_episode"]]) + # a split episode is an episode where the admission date is the same but the nursing provider has changed. + # We want to keep the nursing provision changes when we merge cases that have the same admission date + dplyr::mutate(previous_nursing_care_provision = dplyr::lag(.data[["nursing_care_provision"]])) %>% + # create a T/F flag for if nursing provision was the same as previous record with same admission date + dplyr::mutate(split_episode = tidyr::replace_na(.data[["previous_nursing_care_provision"]] != nursing_care_provision, TRUE)) %>% + dplyr::group_by( + .data[["social_care_id"]], + .data[["sending_location"]], + .data[["split_episode"]] ) %>% - dplyr::ungroup() + # create a count of each time the nursing provision changes between records with the same admission date + dplyr::mutate(split_episode_counter = ifelse(split_episode == TRUE, dplyr::row_number(), NA)) %>% + dplyr::group_by( + .data[["social_care_id"]], + .data[["sending_location"]] + ) %>% + # fill split episode counter. This will create a new id number for each different nursing provision within an episode + tidyr::fill(split_episode_counter, .direction = c("down")) %>% + dplyr::select(-previous_nursing_care_provision, -split_episode) + - # Merge records to a single row per episode - # where admission is the same + # Merge records to a single row per episode where admission is the same ch_episode <- ready_to_merge %>% - # when nursing_care_provision is different on - # records within the episode, split the episode - # at this point. dplyr::group_by( .data[["chi"]], .data[["sending_location"]], @@ -138,8 +157,8 @@ process_sc_all_care_home <- function( dplyr::desc(.data[["period"]]), dplyr::desc(.data[["ch_discharge_date"]]), dplyr::desc(.data[["ch_provider"]]), - dplyr::desc(.data[["record_date"]]), - dplyr::desc(.data[["qtr_start"]]), + dplyr::desc(.data[["period_end_date"]]), + dplyr::desc(.data[["period_start_date"]]), dplyr::desc(.data[["ch_name"]]), dplyr::desc(.data[["ch_postcode"]]), dplyr::desc(.data[["reason_for_admission"]]), @@ -150,62 +169,39 @@ process_sc_all_care_home <- function( ) %>% dplyr::summarise( sc_latest_submission = dplyr::first(.data[["period"]]), - dplyr::across( - c( - "ch_discharge_date", - "ch_provider", - "record_date", - "qtr_start", - "ch_name", - "ch_postcode", - "reason_for_admission", - "type_of_admission" - ), - dplyr::first - ), + dplyr::across(c( + "ch_discharge_date", + "ch_provider", + "period_end_date", + "period_start_date", + "ch_name", + "ch_postcode", + "reason_for_admission", + "type_of_admission" + ), dplyr::first), dplyr::across(c("gender", "dob", "postcode"), dplyr::first) ) %>% - dplyr::ungroup() %>% - # Amend dates for split episodes - # Change the start and end date as appropriate when an episode is split, - # using the start / end date of the submission quarter - dplyr::group_by( - .data[["chi"]], - .data[["sending_location"]], - .data[["social_care_id"]], - .data[["ch_admission_date"]] - ) %>% - # counter for latest submission - # TODO check if this is the same as split_episode_counter? - dplyr::mutate( - latest_submission_counter = tidyr::replace_na( - .data[["sc_latest_submission"]] != dplyr::lag( - .data[["sc_latest_submission"]] - ), - TRUE - ), - sum_latest_submission = cumsum(.data[["latest_submission_counter"]]) - ) %>% + # If the admission date is missing use the period start date + # otherwise use the start of the quarter dplyr::mutate( - # If it's the first episode(s) then keep the admission date(s), - # otherwise use the start of the quarter - ch_admission_date = dplyr::if_else( - .data[["sum_latest_submission"]] == min(.data[["sum_latest_submission"]]), - .data[["ch_admission_date"]], - .data[["qtr_start"]] + ch_admission_date = dplyr::if_else(is.na(.data[["ch_admission_date"]]), + .data[["period_start_date"]], + .data[["ch_admission_date"]] ), # If it's the last episode(s) then keep the discharge date(s), otherwise # use the end of the quarter - ch_discharge_date = dplyr::if_else( - .data[["sum_latest_submission"]] == max(.data[["sum_latest_submission"]]), - .data[["ch_discharge_date"]], - .data[["record_date"]] + ch_discharge_date = dplyr::if_else(is.na(.data[["ch_discharge_date"]]), + .data[["period_end_date"]], + .data[["ch_discharge_date"]] ) ) %>% - dplyr::ungroup() + dplyr::ungroup() %>% + dplyr::select(-period_start_date, -split_episode_counter) + # Compare to Deaths Data # match ch_episode data with deaths data + # TO DO should this be boxi nrs death dates instead of IT extract deaths? matched_deaths_data <- ch_episode %>% dplyr::left_join(refined_death, by = "chi" @@ -228,7 +224,7 @@ process_sc_all_care_home <- function( dplyr::ungroup() %>% # remove any episodes where discharge is now before admission, # i.e. death was before admission - dplyr::filter( + dplyr::filter( # (n = 67) !tidyr::replace_na( .data[["ch_discharge_date"]] < .data[["ch_admission_date"]], FALSE @@ -237,39 +233,65 @@ process_sc_all_care_home <- function( # Continuous Care Home Stays # Stay will be continuous as long as the admission date is the next day or - # earlier than the previous discharge date - - ch_markers <- matched_deaths_data %>% - # ch_chi_cis + # earlier than the previous discharge date. + # creates a CIS flag for CHI across all of scotland + # and a CIS for social care ID and sending location for just that LA + ch_chi_markers <- matched_deaths_data %>% + # uses the chi to flag continuous stays. Will flag cases even if in another LA dplyr::group_by(.data[["chi"]]) %>% - dplyr::mutate( - continuous_stay_chi = tidyr::replace_na( - .data[["ch_admission_date"]] <= dplyr::lag( - .data[["ch_discharge_date"]] - ) + lubridate::days(1L), - TRUE - ), - ch_chi_cis = cumsum(.data[["continuous_stay_chi"]]) + # create variable for previous discharge date + 1 day + dplyr::mutate(previous_discharge_date_chi = dplyr::lag(.data[["ch_discharge_date"]]) + lubridate::days(1L)) %>% + # TRUE/FALSE flag for if admission date is before or equal to previous discharge date + 1 day + dplyr::mutate(continuous_stay_flag_chi = tidyr::replace_na(.data[["ch_admission_date"]] <= previous_discharge_date_chi, FALSE)) %>% + # different to code in above sections. + # we want to uniquely identify all cases where the flag is FALSE. and only the first case where the flag is TRUE + # to do this create a variable of the flag in the previous row + dplyr::mutate(previous_continuous_stay_flag_chi = tidyr::replace_na(dplyr::lag(.data[["continuous_stay_flag_chi"]]), FALSE)) %>% + dplyr::mutate(continuous_stay_chi = ifelse(continuous_stay_flag_chi == FALSE | + (continuous_stay_flag_chi == TRUE & previous_continuous_stay_flag_chi == FALSE), FALSE, TRUE)) %>% + dplyr::group_by( + .data[["chi"]], + .data[["continuous_stay_chi"]] ) %>% - dplyr::ungroup() %>% - # ch_sc_id_cis - # uses the social care id and sending location so can be used for - # episodes that are not attached to a CHI number - # This will restrict continuous stays to each Local Authority - dplyr::group_by(.data[["social_care_id"]], .data[["sending_location"]]) %>% - dplyr::mutate( - continuous_stay_sc = tidyr::replace_na( - .data[["ch_admission_date"]] <= dplyr::lag( - .data[["ch_discharge_date"]] - ) + lubridate::days(1L), - TRUE - ), - ch_sc_id_cis = cumsum(.data[["continuous_stay_sc"]]) + # gives cases their unique CIS identifier + dplyr::mutate(ch_chi_cis = ifelse(continuous_stay_chi == FALSE, dplyr::row_number(), NA)) %>% + dplyr::group_by( + .data[["social_care_id"]], + .data[["sending_location"]] ) %>% - dplyr::ungroup() + # fills in CIS identifier for all cases + tidyr::fill(ch_chi_cis, .direction = c("down")) + + + # This is the same but uses the social care id and sending location so can be used for + # episodes that are not attached to a CHI number + # This will restrict continuous stays to each Local Authority + sc_ch_id_markers <- ch_chi_markers %>% + dplyr::group_by(.data[["social_care_id"]], .data[["sending_location"]]) %>% + # create variable for previous discharge date + 1 day + dplyr::mutate(previous_discharge_date_sc = dplyr::lag(.data[["ch_discharge_date"]]) + lubridate::days(1L)) %>% + # TRUE/FALSE flag for if admission date is before or equal to previous discharge date + 1 day + dplyr::mutate(continuous_stay_flag_sc = tidyr::replace_na(.data[["ch_admission_date"]] <= previous_discharge_date_sc, FALSE)) %>% + # we want to uniquely identify all cases where the flag is FALSE. and only the first case where the flag is TRUE + # to do this create a variable of the flag in the previous row + dplyr::mutate(previous_continuous_stay_flag_sc = tidyr::replace_na(dplyr::lag(.data[["continuous_stay_flag_sc"]]), FALSE)) %>% + dplyr::mutate(continuous_stay_sc = ifelse(continuous_stay_flag_sc == FALSE | + (continuous_stay_flag_sc == TRUE & previous_continuous_stay_flag_sc == FALSE), FALSE, TRUE)) %>% + dplyr::group_by(.data[["social_care_id"]], .data[["sending_location"]], .data[["continuous_stay_sc"]]) %>% + # gives cases their unique CIS identifier + dplyr::mutate(ch_sc_id_cis = ifelse(continuous_stay_sc == FALSE, dplyr::row_number(), NA)) %>% + dplyr::group_by(.data[["social_care_id"]], .data[["sending_location"]]) %>% + # fills in CIS identifier for all cases + tidyr::fill(ch_sc_id_cis, .direction = c("down")) %>% + dplyr::select( + -previous_discharge_date_chi, -continuous_stay_flag_chi, -previous_continuous_stay_flag_chi, -continuous_stay_chi, + -previous_discharge_date_sc, -continuous_stay_flag_sc, -previous_continuous_stay_flag_sc, -continuous_stay_sc, + -dis_after_death + ) + - # Do a recode on the old reason for admission - adm_reason_recoded <- ch_markers %>% + # Do a recode on the old reason for admission for respite stays. + adm_reason_recoded <- sc_ch_id_markers %>% dplyr::group_by( .data[["social_care_id"]], .data[["sending_location"]], @@ -277,32 +299,32 @@ process_sc_all_care_home <- function( ) %>% dplyr::mutate( ch_ep_start = min(.data[["ch_admission_date"]]), + # Creates a vector for the earliest date out of the end of period and discharge date. + # And will then select what ever is the latest date out of those ch_ep_end = max( pmin( - .data[["record_date"]], + .data[["period_end_date"]], .data[["ch_discharge_date"]], na.rm = TRUE ) ) ) %>% dplyr::ungroup() %>% + # Flag respite stays. dplyr::mutate( - stay_los = lubridate::time_length( - lubridate::interval(.data[["ch_ep_start"]], .data[["ch_ep_end"]]), - "weeks" - ), + stay_los = lubridate::time_length(lubridate::interval(.data[["ch_ep_start"]], .data[["ch_ep_end"]]), "weeks"), stay_respite = .data[["stay_los"]] < 6.0, - type_of_admission = dplyr::if_else( - is.na(.data[["type_of_admission"]]), - dplyr::case_when( - .data[["reason_for_admission"]] == 1L ~ 1L, + type_of_admission = dplyr::if_else(is.na(.data[["type_of_admission"]]), + dplyr::case_when(.data[["reason_for_admission"]] == 1L ~ 1L, .data[["reason_for_admission"]] == 2L ~ 2L, - stay_respite ~ 1L, + stay_respite ~ 1L, # (n = 40573) .default = 3L ), .data[["type_of_admission"]] ) - ) + ) %>% + dplyr::select(-ch_ep_start, -ch_ep_end, -stay_los, -stay_respite) + ch_data_final <- adm_reason_recoded %>% create_person_id() %>% @@ -312,6 +334,15 @@ process_sc_all_care_home <- function( ch_adm_reason = "type_of_admission", ch_nursing = "nursing_care_provision" ) %>% + # recode the care home provider description + dplyr::mutate(ch_provider_description = dplyr::case_when( # from social care syntax + ch_provider == 1 ~ "LOCAL AUTHORITY/HSCP/NHS BOARD", + ch_provider == 2 ~ "PRIVATE", + ch_provider == 3 ~ "OTHER LOCAL AUTHORITY", + ch_provider == 4 ~ "THIRD SECTOR", + ch_provider == 5 ~ "NHS BOARD", + ch_provider == 6 ~ "OTHER" + )) %>% dplyr::select( "chi", "person_id", @@ -327,6 +358,7 @@ process_sc_all_care_home <- function( "ch_chi_cis", "ch_sc_id_cis", "ch_provider", + "ch_provider_description", "ch_nursing", "ch_adm_reason", "sc_latest_submission" diff --git a/R/read_sc_all_care_home.R b/R/read_sc_all_care_home.R index 89ef7951b..b11879487 100644 --- a/R/read_sc_all_care_home.R +++ b/R/read_sc_all_care_home.R @@ -42,7 +42,7 @@ read_sc_all_care_home <- function(sc_dvprod_connection = phs_db_connection(dsn = } ch_data <- ch_data %>% - # Correct FY 2017 + # Correct FY 2017 as data collection only started in 2017 Q4 dplyr::mutate(period = dplyr::if_else( .data$period == "2017", "2017Q4", diff --git a/man/process_sc_all_care_home.Rd b/man/process_sc_all_care_home.Rd index 7d4234b25..f689c19f4 100644 --- a/man/process_sc_all_care_home.Rd +++ b/man/process_sc_all_care_home.Rd @@ -9,8 +9,8 @@ process_sc_all_care_home( sc_demog_lookup = read_file(get_sc_demog_lookup_path()) \%>\% slfhelper::get_chi(), refined_death = read_file(get_combined_slf_deaths_lookup_path()) \%>\% slfhelper::get_chi(), - ch_name_lookup_path = read_file(get_slf_ch_name_lookup_path()), - spd_path = read_file(get_spd_path()), + ch_name_lookup_path = get_slf_ch_name_lookup_path(), + spd_path = get_spd_path(), write_to_disk = TRUE ) }