diff --git a/R/fix_sc_dates.R b/R/fix_sc_dates.R index c636980a6..117acbaab 100644 --- a/R/fix_sc_dates.R +++ b/R/fix_sc_dates.R @@ -9,7 +9,7 @@ #' @return A date vector with replaced end dates fix_sc_start_dates <- function(start_date, period_start) { # Fix sds_start_date is missing by setting start_date to be the start of - # financial year + # financial period start_date <- dplyr::if_else( is.na(start_date), period_start, @@ -30,12 +30,12 @@ fix_sc_start_dates <- function(start_date, period_start) { #' @param period Social care latest submission period. #' #' @return A date vector with replaced end dates -fix_sc_end_dates <- function(start_date, end_date, period) { +fix_sc_end_dates <- function(start_date, end_date, period_end_date) { # Fix sds_end_date is earlier than sds_start_date by setting end_date to be # the end of financial year end_date <- dplyr::if_else( start_date > end_date, - end_fy(year = stringr::str_sub(period, 1L, 4L), "alternate"), + period_end_date, end_date ) @@ -57,7 +57,7 @@ fix_sc_end_dates <- function(start_date, end_date, period) { #' @return A date vector with replaced end dates fix_sc_missing_end_dates <- function(end_date, period_end) { # Fix sds_end_date is earlier than sds_start_date by setting end_date to be - # the end of financial year + # the end of financial period end_date <- dplyr::if_else( is.na(end_date), period_end, diff --git a/R/process_lookup_sc_demographics.R b/R/process_lookup_sc_demographics.R index 8c363f547..96adc985e 100644 --- a/R/process_lookup_sc_demographics.R +++ b/R/process_lookup_sc_demographics.R @@ -28,30 +28,46 @@ process_lookup_sc_demographics <- function( dplyr::pull(.data$pc7) - # Data Cleaning --------------------------------------- - + # Fill in missing data and flag latest cases to keep --------------------------------------- sc_demog <- data %>% - dplyr::mutate( - # use chi if upi is NA - upi = dplyr::coalesce(.data$upi, .data$chi_upi), - # check gender code - replace code 99 with 9 - submitted_gender = replace(.data$submitted_gender, .data$submitted_gender == 99L, 9L) + dplyr::rename( + chi = chi_upi, + gender = chi_gender_code, + dob = chi_date_of_birth ) %>% + # fill in missing demographic details + dplyr::arrange(period, social_care_id) %>% + dplyr::group_by(social_care_id, sending_location) %>% + tidyr::fill(chi, .direction = ("updown")) %>% + tidyr::fill(dob, .direction = ("updown")) %>% + tidyr::fill(date_of_death, .direction = ("updown")) %>% + tidyr::fill(gender, .direction = ("updown")) %>% + tidyr::fill(chi_postcode, .direction = ("updown")) %>% + tidyr::fill(submitted_postcode, .direction = ("updown")) %>% + dplyr::ungroup() %>% + # format postcodes using `phsmethods` + dplyr::mutate(dplyr::across(tidyselect::contains("postcode"), ~ phsmethods::format_postcode(.x, format = "pc7"))) # are sc postcodes even used anywhere? + + + # flag unique cases of chi and sc_id, and flag the latest record (sc_demographics latest flag is not accurate) + sc_demog <- sc_demog %>% + dplyr::group_by(chi, sending_location) %>% + dplyr::mutate(latest = dplyr::last(period)) %>% # flag latest period for chi + dplyr::group_by(chi, social_care_id, sending_location) %>% + dplyr::mutate(latest_sc_id = dplyr::last(period)) %>% # flag latest period for social care + dplyr::group_by(chi, sending_location) %>% + dplyr::mutate(last_sc_id = dplyr::last(social_care_id)) %>% dplyr::mutate( - # use CHI sex if available - gender = dplyr::if_else( - is.na(.data$chi_gender_code) | .data$chi_gender_code == 9L, - .data$submitted_gender, - .data$chi_gender_code - ), - # Use CHI DoB if available - dob = dplyr::coalesce(.data$chi_date_of_birth, .data$submitted_date_of_birth) + latest_flag = ifelse((latest == period & last_sc_id == social_care_id) | is.na(chi), 1, 0), + keep = ifelse(latest_sc_id == period, 1, 0) ) %>% - # format postcodes using `phsmethods` - dplyr::mutate(dplyr::across( - tidyselect::contains("postcode"), - ~ phsmethods::format_postcode(.x, format = "pc7") - )) + dplyr::ungroup() + + sc_demog <- sc_demog %>% + dplyr::select(-period, -latest_record_flag, -latest, -last_sc_id, -latest_sc_id) %>% + dplyr::distinct() + + # postcodes --------------------------------------------------------------- # count number of na postcodes na_postcodes <- sc_demog %>% @@ -69,29 +85,32 @@ process_lookup_sc_demographics <- function( ~ dplyr::if_else(stringr::str_detect(.x, uk_pc_regexp), .x, NA) )) %>% dplyr::select( - "latest_record_flag", - "extract_date", "sending_location", "social_care_id", - "upi", + "chi", "gender", "dob", + "date_of_death", "submitted_postcode", - "chi_postcode" + "chi_postcode", + "keep", + "latest_flag" ) %>% # check if submitted_postcode matches with postcode lookup dplyr::mutate( - valid_pc = .data$submitted_postcode %in% valid_spd_postcodes + valid_pc_submitted = .data$submitted_postcode %in% valid_spd_postcodes, + valid_pc_chi = .data$chi_postcode %in% valid_spd_postcodes ) %>% # use submitted_postcode if valid, otherwise use chi_postcode dplyr::mutate(postcode = dplyr::case_when( - (!is.na(.data$submitted_postcode) & .data$valid_pc) ~ .data$submitted_postcode, - (is.na(.data$submitted_postcode) & !.data$valid_pc) ~ .data$chi_postcode + (!is.na(.data$chi_postcode) & .data$valid_pc_chi) ~ .data$chi_postcode, + ((is.na(.data$chi_postcode) | !(.data$valid_pc_chi)) & !(is.na(.data$submitted_postcode)) & .data$valid_pc_submitted) ~ .data$submitted_postcode, + (is.na(.data$submitted_postcode) & !.data$valid_pc_submitted) ~ .data$chi_postcode )) %>% dplyr::mutate(postcode_type = dplyr::case_when( - (!is.na(.data$submitted_postcode) & .data$valid_pc) ~ "submitted", - (is.na(.data$submitted_postcode) & !.data$valid_pc) ~ "chi", - (is.na(.data$submitted_postcode) & is.na(.data$chi_postcode)) ~ "missing" + (postcode == chi_postcode) ~ "chi", + (postcode == submitted_postcode) ~ "submitted", + (is.na(.data$submitted_postcode) & is.na(.data$chi_postcode) | is.na(.data$postcode)) ~ "missing" )) # Check where the postcodes are coming from @@ -102,26 +121,32 @@ process_lookup_sc_demographics <- function( na_replaced_postcodes <- sc_demog %>% dplyr::count(dplyr::across(tidyselect::ends_with("_postcode"), ~ is.na(.x))) - sc_demog_lookup <- sc_demog %>% + dplyr::filter(keep == 1) %>% # filter to only keep latest record for sc id and chi + dplyr::select(-postcode_type, -valid_pc_submitted, -valid_pc_chi, -submitted_postcode, -chi_postcode) %>% + dplyr::distinct() %>% # group by sending location and ID - dplyr::group_by(.data$sending_location, .data$social_care_id) %>% + dplyr::group_by(.data$sending_location, .data$chi, .data$social_care_id, .data$latest_flag) %>% # arrange so latest submissions are last dplyr::arrange( .data$sending_location, .data$social_care_id, - .data$latest_record_flag, - .data$extract_date + .data$latest_flag ) %>% # summarise to select the last (non NA) submission dplyr::summarise( - chi = dplyr::last(.data$upi), gender = dplyr::last(.data$gender), dob = dplyr::last(.data$dob), - postcode = dplyr::last(.data$postcode) + postcode = dplyr::last(.data$postcode), + date_of_death = dplyr::last(.data$date_of_death) ) %>% dplyr::ungroup() + # check to make sure all cases of chi are still there + dplyr::n_distinct(sc_demog_lookup$chi) # 524810 + dplyr::n_distinct(sc_demog_lookup$social_care_id) # 636404 + + if (write_to_disk) { write_file( sc_demog_lookup, diff --git a/R/process_sc_all_sds.R b/R/process_sc_all_sds.R index f9ca52f24..453db3e40 100644 --- a/R/process_sc_all_sds.R +++ b/R/process_sc_all_sds.R @@ -15,14 +15,17 @@ process_sc_all_sds <- function( sc_demog_lookup, write_to_disk = TRUE) { # Match on demographics data (chi, gender, dob and postcode) - matched_sds_data <- data %>% - dplyr::left_join( + matched_sds_data <- data %>% # + dplyr::filter(.data$sds_start_date_after_period_end_date != 1) %>% + dplyr::right_join( sc_demog_lookup, by = c("sending_location", "social_care_id") ) %>% # when multiple social_care_id from sending_location for single CHI # replace social_care_id with latest - replace_sc_id_with_latest() + replace_sc_id_with_latest() %>% + dplyr::select(-latest_sc_id, -latest_flag, -sds_start_date_after_period_end_date) %>% + dplyr::distinct() # Data Cleaning --------------------------------------- sds_full_clean <- matched_sds_data %>% @@ -50,7 +53,7 @@ process_sc_all_sds <- function( .data$sds_start_date, .data$sds_period_start_date ), - # If SDS end date is missing, assign end of FY + # If SDS end date is missing, assign end of financial period sds_end_date = fix_sc_missing_end_dates( .data$sds_end_date, .data$sds_period_end_date @@ -59,14 +62,19 @@ process_sc_all_sds <- function( sds_end_date = fix_sc_end_dates( .data$sds_start_date, .data$sds_end_date, - .data$period + .data$sds_period_end_date ) ) %>% + dplyr::select( + -sds_period_start_date, -sds_period_end_date, + -sds_start_date_after_end_date + ) %>% # rename for matching source variables dplyr::rename( record_keydate1 = .data$sds_start_date, record_keydate2 = .data$sds_end_date ) %>% + dplyr::distinct() %>% # Pivot longer on sds option variables tidyr::pivot_longer( cols = tidyselect::contains("sds_option_"), @@ -103,6 +111,7 @@ process_sc_all_sds <- function( ) %>% dplyr::arrange(.data$period, .data$record_keydate1, + .data$record_keydate2, .by_group = TRUE ) %>% # Create a flag for episodes that are going to be merged diff --git a/R/read_lookup_sc_demographics.R b/R/read_lookup_sc_demographics.R index fcdde5417..fe9a5e71f 100644 --- a/R/read_lookup_sc_demographics.R +++ b/R/read_lookup_sc_demographics.R @@ -12,16 +12,15 @@ read_lookup_sc_demographics <- function(sc_connection = phs_db_connection(dsn = ) %>% dplyr::select( "latest_record_flag", - "extract_date", + "period", "sending_location", + "sending_location_name", "social_care_id", - "upi", "chi_upi", - "submitted_postcode", - "chi_postcode", - "submitted_date_of_birth", "chi_date_of_birth", - "submitted_gender", + "date_of_death", + "chi_postcode", + "submitted_postcode", "chi_gender_code" ) %>% dplyr::collect() %>% @@ -29,10 +28,10 @@ read_lookup_sc_demographics <- function(sc_connection = phs_db_connection(dsn = dplyr::across(c( "latest_record_flag", "sending_location", - "submitted_gender", "chi_gender_code" ), as.integer) - ) + ) %>% + dplyr::distinct() return(sc_demog) } diff --git a/R/read_sc_all_sds.R b/R/read_sc_all_sds.R index 18c5b52ec..ab9bb20e1 100644 --- a/R/read_sc_all_sds.R +++ b/R/read_sc_all_sds.R @@ -22,9 +22,8 @@ read_sc_all_sds <- function(sc_dvprod_connection = phs_db_connection(dsn = "DVPR "sds_option_1", "sds_option_2", "sds_option_3", - "sds_start_date_after_end_date", - "sds_start_date_after_period_end_date", - "sds_end_date_not_within_period" + "sds_start_date_after_end_date", # get fixed + "sds_start_date_after_period_end_date" # get removed ) %>% dplyr::collect() %>% dplyr::distinct() %>% @@ -33,8 +32,7 @@ read_sc_all_sds <- function(sc_dvprod_connection = phs_db_connection(dsn = "DVPR "sds_option_1", "sds_option_2", "sds_option_3" - ), as.integer)) %>% - dplyr::filter(.data$sds_start_date_after_period_end_date != 1) + ), as.integer)) return(sds_full_data) } diff --git a/R/replace_sc_id_with_latest.R b/R/replace_sc_id_with_latest.R index 73c1a3706..2c32bbb93 100644 --- a/R/replace_sc_id_with_latest.R +++ b/R/replace_sc_id_with_latest.R @@ -7,33 +7,23 @@ replace_sc_id_with_latest <- function(data) { # Check for required variables check_variables_exist( data, - c("sending_location", "social_care_id", "chi", "period") + c("sending_location", "social_care_id", "chi", "latest_flag") ) # select variables we need filter_data <- data %>% dplyr::select( - "sending_location", "social_care_id", "chi", "period" + "sending_location", "social_care_id", "chi", "latest_flag" ) %>% - dplyr::filter(!(is.na(.data$chi))) + dplyr::filter(!(is.na(.data$chi))) %>% + dplyr::distinct() change_sc_id <- filter_data %>% - # Sort (by sending_location, chi and period) for unique chi/sending location - dplyr::arrange( - .data$sending_location, - .data$chi, - dplyr::desc(.data$period) - ) %>% - # Find the latest sc_id for each chi/sending location by keeping latest period - dplyr::distinct( - .data$sending_location, - .data$chi, - .keep_all = TRUE - ) %>% + dplyr::filter(latest_flag == 1) %>% # Rename for latest sc id dplyr::rename(latest_sc_id = "social_care_id") %>% - # drop period for matching - dplyr::select(-"period") + # drop latest_flag for matching + dplyr::select(-"latest_flag") return_data <- change_sc_id %>% # Match back onto data @@ -41,6 +31,7 @@ replace_sc_id_with_latest <- function(data) { by = c("sending_location", "chi"), multiple = "all" ) %>% + dplyr::filter(!(is.na(period))) %>% # Overwrite sc id with the latest dplyr::mutate( social_care_id = dplyr::if_else( diff --git a/man/fix_sc_end_dates.Rd b/man/fix_sc_end_dates.Rd index 1bf808bea..041751319 100644 --- a/man/fix_sc_end_dates.Rd +++ b/man/fix_sc_end_dates.Rd @@ -4,7 +4,7 @@ \alias{fix_sc_end_dates} \title{Fix sc end dates} \usage{ -fix_sc_end_dates(start_date, end_date, period) +fix_sc_end_dates(start_date, end_date, period_end_date) } \arguments{ \item{start_date}{A vector containing dates.}