Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SC Demographics and SDS #900

Merged
merged 21 commits into from
Feb 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions R/fix_sc_dates.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#' @return A date vector with replaced end dates
fix_sc_start_dates <- function(start_date, period_start) {
# Fix sds_start_date is missing by setting start_date to be the start of
# financial year
# financial period
start_date <- dplyr::if_else(
is.na(start_date),
period_start,
Expand All @@ -30,12 +30,12 @@ fix_sc_start_dates <- function(start_date, period_start) {
#' @param period Social care latest submission period.
#'
#' @return A date vector with replaced end dates
fix_sc_end_dates <- function(start_date, end_date, period) {
fix_sc_end_dates <- function(start_date, end_date, period_end_date) {
# Fix sds_end_date is earlier than sds_start_date by setting end_date to be
# the end of financial year
end_date <- dplyr::if_else(
start_date > end_date,
end_fy(year = stringr::str_sub(period, 1L, 4L), "alternate"),
period_end_date,
end_date
)

Expand All @@ -57,7 +57,7 @@ fix_sc_end_dates <- function(start_date, end_date, period) {
#' @return A date vector with replaced end dates
fix_sc_missing_end_dates <- function(end_date, period_end) {
# Fix sds_end_date is earlier than sds_start_date by setting end_date to be
# the end of financial year
# the end of financial period
end_date <- dplyr::if_else(
is.na(end_date),
period_end,
Expand Down
97 changes: 61 additions & 36 deletions R/process_lookup_sc_demographics.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,30 +28,46 @@ process_lookup_sc_demographics <- function(
dplyr::pull(.data$pc7)


# Data Cleaning ---------------------------------------

# Fill in missing data and flag latest cases to keep ---------------------------------------
sc_demog <- data %>%
dplyr::mutate(
# use chi if upi is NA
upi = dplyr::coalesce(.data$upi, .data$chi_upi),
# check gender code - replace code 99 with 9
submitted_gender = replace(.data$submitted_gender, .data$submitted_gender == 99L, 9L)
dplyr::rename(
chi = chi_upi,
gender = chi_gender_code,
dob = chi_date_of_birth
) %>%
# fill in missing demographic details
dplyr::arrange(period, social_care_id) %>%
dplyr::group_by(social_care_id, sending_location) %>%
tidyr::fill(chi, .direction = ("updown")) %>%
tidyr::fill(dob, .direction = ("updown")) %>%
tidyr::fill(date_of_death, .direction = ("updown")) %>%
tidyr::fill(gender, .direction = ("updown")) %>%
tidyr::fill(chi_postcode, .direction = ("updown")) %>%
tidyr::fill(submitted_postcode, .direction = ("updown")) %>%
dplyr::ungroup() %>%
# format postcodes using `phsmethods`
dplyr::mutate(dplyr::across(tidyselect::contains("postcode"), ~ phsmethods::format_postcode(.x, format = "pc7"))) # are sc postcodes even used anywhere?


# flag unique cases of chi and sc_id, and flag the latest record (sc_demographics latest flag is not accurate)
sc_demog <- sc_demog %>%
dplyr::group_by(chi, sending_location) %>%
dplyr::mutate(latest = dplyr::last(period)) %>% # flag latest period for chi
dplyr::group_by(chi, social_care_id, sending_location) %>%
dplyr::mutate(latest_sc_id = dplyr::last(period)) %>% # flag latest period for social care
dplyr::group_by(chi, sending_location) %>%
dplyr::mutate(last_sc_id = dplyr::last(social_care_id)) %>%
dplyr::mutate(
# use CHI sex if available
gender = dplyr::if_else(
is.na(.data$chi_gender_code) | .data$chi_gender_code == 9L,
.data$submitted_gender,
.data$chi_gender_code
),
# Use CHI DoB if available
dob = dplyr::coalesce(.data$chi_date_of_birth, .data$submitted_date_of_birth)
latest_flag = ifelse((latest == period & last_sc_id == social_care_id) | is.na(chi), 1, 0),
keep = ifelse(latest_sc_id == period, 1, 0)
SwiftySalmon marked this conversation as resolved.
Show resolved Hide resolved
) %>%
# format postcodes using `phsmethods`
dplyr::mutate(dplyr::across(
tidyselect::contains("postcode"),
~ phsmethods::format_postcode(.x, format = "pc7")
))
dplyr::ungroup()

sc_demog <- sc_demog %>%
dplyr::select(-period, -latest_record_flag, -latest, -last_sc_id, -latest_sc_id) %>%
dplyr::distinct()

# postcodes ---------------------------------------------------------------

# count number of na postcodes
na_postcodes <- sc_demog %>%
Expand All @@ -69,29 +85,32 @@ process_lookup_sc_demographics <- function(
~ dplyr::if_else(stringr::str_detect(.x, uk_pc_regexp), .x, NA)
)) %>%
dplyr::select(
"latest_record_flag",
"extract_date",
"sending_location",
"social_care_id",
"upi",
"chi",
"gender",
"dob",
"date_of_death",
"submitted_postcode",
"chi_postcode"
"chi_postcode",
"keep",
"latest_flag"
) %>%
# check if submitted_postcode matches with postcode lookup
dplyr::mutate(
valid_pc = .data$submitted_postcode %in% valid_spd_postcodes
valid_pc_submitted = .data$submitted_postcode %in% valid_spd_postcodes,
valid_pc_chi = .data$chi_postcode %in% valid_spd_postcodes
) %>%
# use submitted_postcode if valid, otherwise use chi_postcode
dplyr::mutate(postcode = dplyr::case_when(
(!is.na(.data$submitted_postcode) & .data$valid_pc) ~ .data$submitted_postcode,
(is.na(.data$submitted_postcode) & !.data$valid_pc) ~ .data$chi_postcode
(!is.na(.data$chi_postcode) & .data$valid_pc_chi) ~ .data$chi_postcode,
((is.na(.data$chi_postcode) | !(.data$valid_pc_chi)) & !(is.na(.data$submitted_postcode)) & .data$valid_pc_submitted) ~ .data$submitted_postcode,
(is.na(.data$submitted_postcode) & !.data$valid_pc_submitted) ~ .data$chi_postcode
)) %>%
dplyr::mutate(postcode_type = dplyr::case_when(
(!is.na(.data$submitted_postcode) & .data$valid_pc) ~ "submitted",
(is.na(.data$submitted_postcode) & !.data$valid_pc) ~ "chi",
(is.na(.data$submitted_postcode) & is.na(.data$chi_postcode)) ~ "missing"
(postcode == chi_postcode) ~ "chi",
(postcode == submitted_postcode) ~ "submitted",
(is.na(.data$submitted_postcode) & is.na(.data$chi_postcode) | is.na(.data$postcode)) ~ "missing"
))

# Check where the postcodes are coming from
Expand All @@ -102,26 +121,32 @@ process_lookup_sc_demographics <- function(
na_replaced_postcodes <- sc_demog %>%
dplyr::count(dplyr::across(tidyselect::ends_with("_postcode"), ~ is.na(.x)))


sc_demog_lookup <- sc_demog %>%
dplyr::filter(keep == 1) %>% # filter to only keep latest record for sc id and chi
dplyr::select(-postcode_type, -valid_pc_submitted, -valid_pc_chi, -submitted_postcode, -chi_postcode) %>%
dplyr::distinct() %>%
# group by sending location and ID
dplyr::group_by(.data$sending_location, .data$social_care_id) %>%
dplyr::group_by(.data$sending_location, .data$chi, .data$social_care_id, .data$latest_flag) %>%
# arrange so latest submissions are last
dplyr::arrange(
.data$sending_location,
.data$social_care_id,
.data$latest_record_flag,
.data$extract_date
.data$latest_flag
) %>%
# summarise to select the last (non NA) submission
dplyr::summarise(
chi = dplyr::last(.data$upi),
gender = dplyr::last(.data$gender),
dob = dplyr::last(.data$dob),
postcode = dplyr::last(.data$postcode)
postcode = dplyr::last(.data$postcode),
date_of_death = dplyr::last(.data$date_of_death)
Jennit07 marked this conversation as resolved.
Show resolved Hide resolved
) %>%
dplyr::ungroup()

# check to make sure all cases of chi are still there
dplyr::n_distinct(sc_demog_lookup$chi) # 524810
dplyr::n_distinct(sc_demog_lookup$social_care_id) # 636404


if (write_to_disk) {
write_file(
sc_demog_lookup,
Expand Down
19 changes: 14 additions & 5 deletions R/process_sc_all_sds.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,17 @@ process_sc_all_sds <- function(
sc_demog_lookup,
write_to_disk = TRUE) {
# Match on demographics data (chi, gender, dob and postcode)
matched_sds_data <- data %>%
dplyr::left_join(
matched_sds_data <- data %>% #
dplyr::filter(.data$sds_start_date_after_period_end_date != 1) %>%
SwiftySalmon marked this conversation as resolved.
Show resolved Hide resolved
dplyr::right_join(
sc_demog_lookup,
by = c("sending_location", "social_care_id")
) %>%
# when multiple social_care_id from sending_location for single CHI
# replace social_care_id with latest
replace_sc_id_with_latest()
replace_sc_id_with_latest() %>%
dplyr::select(-latest_sc_id, -latest_flag, -sds_start_date_after_period_end_date) %>%
dplyr::distinct()
Jennit07 marked this conversation as resolved.
Show resolved Hide resolved

# Data Cleaning ---------------------------------------
sds_full_clean <- matched_sds_data %>%
Expand Down Expand Up @@ -50,7 +53,7 @@ process_sc_all_sds <- function(
.data$sds_start_date,
.data$sds_period_start_date
),
# If SDS end date is missing, assign end of FY
# If SDS end date is missing, assign end of financial period
sds_end_date = fix_sc_missing_end_dates(
.data$sds_end_date,
.data$sds_period_end_date
Expand All @@ -59,14 +62,19 @@ process_sc_all_sds <- function(
sds_end_date = fix_sc_end_dates(
.data$sds_start_date,
.data$sds_end_date,
.data$period
.data$sds_period_end_date
SwiftySalmon marked this conversation as resolved.
Show resolved Hide resolved
)
) %>%
dplyr::select(
-sds_period_start_date, -sds_period_end_date,
-sds_start_date_after_end_date
) %>%
# rename for matching source variables
dplyr::rename(
record_keydate1 = .data$sds_start_date,
record_keydate2 = .data$sds_end_date
) %>%
dplyr::distinct() %>%
Jennit07 marked this conversation as resolved.
Show resolved Hide resolved
# Pivot longer on sds option variables
tidyr::pivot_longer(
cols = tidyselect::contains("sds_option_"),
Expand Down Expand Up @@ -103,6 +111,7 @@ process_sc_all_sds <- function(
) %>%
dplyr::arrange(.data$period,
.data$record_keydate1,
.data$record_keydate2,
.by_group = TRUE
) %>%
# Create a flag for episodes that are going to be merged
Expand Down
15 changes: 7 additions & 8 deletions R/read_lookup_sc_demographics.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,27 +12,26 @@ read_lookup_sc_demographics <- function(sc_connection = phs_db_connection(dsn =
) %>%
dplyr::select(
"latest_record_flag",
"extract_date",
"period",
"sending_location",
"sending_location_name",
"social_care_id",
"upi",
"chi_upi",
"submitted_postcode",
"chi_postcode",
"submitted_date_of_birth",
"chi_date_of_birth",
"submitted_gender",
"date_of_death",
"chi_postcode",
"submitted_postcode",
"chi_gender_code"
) %>%
dplyr::collect() %>%
dplyr::mutate(
dplyr::across(c(
"latest_record_flag",
"sending_location",
"submitted_gender",
"chi_gender_code"
), as.integer)
)
) %>%
dplyr::distinct()

return(sc_demog)
}
8 changes: 3 additions & 5 deletions R/read_sc_all_sds.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,8 @@ read_sc_all_sds <- function(sc_dvprod_connection = phs_db_connection(dsn = "DVPR
"sds_option_1",
"sds_option_2",
"sds_option_3",
"sds_start_date_after_end_date",
"sds_start_date_after_period_end_date",
"sds_end_date_not_within_period"
"sds_start_date_after_end_date", # get fixed
"sds_start_date_after_period_end_date" # get removed
) %>%
dplyr::collect() %>%
dplyr::distinct() %>%
Expand All @@ -33,8 +32,7 @@ read_sc_all_sds <- function(sc_dvprod_connection = phs_db_connection(dsn = "DVPR
"sds_option_1",
"sds_option_2",
"sds_option_3"
), as.integer)) %>%
dplyr::filter(.data$sds_start_date_after_period_end_date != 1)
), as.integer))

return(sds_full_data)
}
25 changes: 8 additions & 17 deletions R/replace_sc_id_with_latest.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,40 +7,31 @@ replace_sc_id_with_latest <- function(data) {
# Check for required variables
check_variables_exist(
data,
c("sending_location", "social_care_id", "chi", "period")
c("sending_location", "social_care_id", "chi", "latest_flag")
)

# select variables we need
filter_data <- data %>%
dplyr::select(
"sending_location", "social_care_id", "chi", "period"
"sending_location", "social_care_id", "chi", "latest_flag"
) %>%
dplyr::filter(!(is.na(.data$chi)))
dplyr::filter(!(is.na(.data$chi))) %>%
dplyr::distinct()

change_sc_id <- filter_data %>%
# Sort (by sending_location, chi and period) for unique chi/sending location
dplyr::arrange(
.data$sending_location,
.data$chi,
dplyr::desc(.data$period)
) %>%
# Find the latest sc_id for each chi/sending location by keeping latest period
dplyr::distinct(
.data$sending_location,
.data$chi,
.keep_all = TRUE
) %>%
dplyr::filter(latest_flag == 1) %>%
# Rename for latest sc id
dplyr::rename(latest_sc_id = "social_care_id") %>%
# drop period for matching
dplyr::select(-"period")
# drop latest_flag for matching
dplyr::select(-"latest_flag")

return_data <- change_sc_id %>%
# Match back onto data
dplyr::right_join(data,
by = c("sending_location", "chi"),
multiple = "all"
) %>%
dplyr::filter(!(is.na(period))) %>%
# Overwrite sc id with the latest
dplyr::mutate(
social_care_id = dplyr::if_else(
Expand Down
2 changes: 1 addition & 1 deletion man/fix_sc_end_dates.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.