From 6dab611f055f1ceaf3492103a3bd60039ccd0711 Mon Sep 17 00:00:00 2001 From: Zihao Li Date: Wed, 28 Feb 2024 12:46:17 +0000 Subject: [PATCH] 758 investigate extracts to identify areas of code which can be cut down for processing times (#899) * re-writing process_sc_all sds and alarm_telecare with data.table to improve the speed * Update documentation * Style code * changes in line with new process_sc_all_sds dplyr version * Style code * remove duplicate columns * remove duplicated columns --------- Co-authored-by: lizihao-anu Co-authored-by: Megan McNicol <43570769+SwiftySalmon@users.noreply.github.com> --- R/process_sc_all_alarms_telecare.R | 4 - R/process_sc_all_sds.R | 263 ++++++++++++++++------------- 2 files changed, 149 insertions(+), 118 deletions(-) diff --git a/R/process_sc_all_alarms_telecare.R b/R/process_sc_all_alarms_telecare.R index 08cb9faa7..0c24892d0 100644 --- a/R/process_sc_all_alarms_telecare.R +++ b/R/process_sc_all_alarms_telecare.R @@ -110,12 +110,8 @@ process_sc_all_alarms_telecare <- function( # Summarize to merge episodes qtr_merge <- data[, .( - sending_location = data.table::last(sending_location), - social_care_id = data.table::last(social_care_id), sc_latest_submission = data.table::last(period), record_keydate2 = data.table::last(record_keydate2), - smrtype = data.table::last(smrtype), - pkg_count = data.table::last(pkg_count), chi = data.table::last(chi), gender = data.table::last(gender), dob = data.table::last(dob), diff --git a/R/process_sc_all_sds.R b/R/process_sc_all_sds.R index 453db3e40..a1a1db24a 100644 --- a/R/process_sc_all_sds.R +++ b/R/process_sc_all_sds.R @@ -15,7 +15,7 @@ process_sc_all_sds <- function( sc_demog_lookup, write_to_disk = TRUE) { # Match on demographics data (chi, gender, dob and postcode) - matched_sds_data <- data %>% # + matched_sds_data <- data %>% dplyr::filter(.data$sds_start_date_after_period_end_date != 1) %>% dplyr::right_join( sc_demog_lookup, @@ -24,123 +24,158 @@ process_sc_all_sds <- function( # when multiple social_care_id from sending_location for single CHI # replace social_care_id with latest replace_sc_id_with_latest() %>% - dplyr::select(-latest_sc_id, -latest_flag, -sds_start_date_after_period_end_date) %>% - dplyr::distinct() - - # Data Cleaning --------------------------------------- - sds_full_clean <- matched_sds_data %>% - # Deal with SDS option 4 - # First turn the option flags into a logical T/F - dplyr::mutate(dplyr::across( - tidyselect::starts_with("sds_option_"), - ~ dplyr::case_when( - .x == 1L ~ TRUE, - .x == 0L ~ FALSE, - is.na(.x) ~ FALSE - ) - )) %>% - # SDS option 4 is derived when a person receives more than one option. - # e.g. if a person has options 1 and 2 then option 4 will be derived - dplyr::mutate( - sds_option_4 = rowSums( - dplyr::pick(tidyselect::starts_with("sds_option_")) - ) > 1L, - .after = .data$sds_option_3 - ) %>% - # If SDS start date is missing, assign start of FY - dplyr::mutate( - sds_start_date = fix_sc_start_dates( - .data$sds_start_date, - .data$sds_period_start_date - ), - # If SDS end date is missing, assign end of financial period - sds_end_date = fix_sc_missing_end_dates( - .data$sds_end_date, - .data$sds_period_end_date - ), - # Fix sds_end_date is earlier than sds_start_date by setting end_date to be the end of fyear - sds_end_date = fix_sc_end_dates( - .data$sds_start_date, - .data$sds_end_date, - .data$sds_period_end_date - ) - ) %>% - dplyr::select( - -sds_period_start_date, -sds_period_end_date, - -sds_start_date_after_end_date - ) %>% - # rename for matching source variables - dplyr::rename( - record_keydate1 = .data$sds_start_date, - record_keydate2 = .data$sds_end_date - ) %>% + dplyr::select(-sds_start_date_after_period_end_date) %>% dplyr::distinct() %>% - # Pivot longer on sds option variables - tidyr::pivot_longer( - cols = tidyselect::contains("sds_option_"), - names_to = "sds_option", - names_prefix = "sds_option_", - names_transform = list(sds_option = ~ paste0("SDS-", .x)), - values_to = "received" - ) %>% - # Only keep rows where they received a package and remove duplicates - dplyr::filter(.data$received) %>% - dplyr::distinct() %>% - # Include source variables + # sds_options may contain only a few NA, replace NA by 0 dplyr::mutate( - smrtype = dplyr::case_when( - sds_option == "SDS-1" ~ "SDS-1", - sds_option == "SDS-2" ~ "SDS-2", - sds_option == "SDS-3" ~ "SDS-3", - sds_option == "SDS-4" ~ "SDS-4" - ), - recid = "SDS", - # Create person id variable - person_id = stringr::str_glue("{sending_location}-{social_care_id}"), - # Use function for creating sc send lca variables - sc_send_lca = convert_sc_sending_location_to_lca(.data$sending_location) + sds_option_1 = tidyr::replace_na(sds_option_1, 0), + sds_option_2 = tidyr::replace_na(sds_option_2, 0), + sds_option_3 = tidyr::replace_na(sds_option_3, 0) ) - final_data <- sds_full_clean %>% - # use as.data.table to change the data format to data.table to accelerate - data.table::as.data.table() %>% - dplyr::group_by( - .data$sending_location, - .data$social_care_id, - .data$smrtype - ) %>% - dplyr::arrange(.data$period, - .data$record_keydate1, - .data$record_keydate2, - .by_group = TRUE - ) %>% - # Create a flag for episodes that are going to be merged - # Create an episode counter - dplyr::mutate( - distinct_episode = (.data$record_keydate1 > dplyr::lag(.data$record_keydate2)) %>% - tidyr::replace_na(TRUE), - episode_counter = cumsum(.data$distinct_episode) - ) %>% - # Group by episode counter and merge episodes - dplyr::group_by(.data$episode_counter, .add = TRUE) %>% - dplyr::summarise( - sc_latest_submission = dplyr::last(.data$period), - record_keydate1 = min(.data$record_keydate1), - record_keydate2 = max(.data$record_keydate2), - sending_location = dplyr::last(.data$sending_location), - social_care_id = dplyr::last(.data$social_care_id), - chi = dplyr::last(.data$chi), - gender = dplyr::last(.data$gender), - dob = dplyr::last(.data$dob), - postcode = dplyr::last(.data$postcode), - recid = dplyr::last(.data$recid), - person_id = dplyr::last(.data$person_id), - sc_send_lca = dplyr::last(.data$sc_send_lca) - ) %>% - dplyr::ungroup() %>% - dplyr::select(-.data$episode_counter) %>% - # change the data format from data.table to data.frame - tibble::as_tibble() + # Data Cleaning --------------------------------------- + # Convert matched_sds_data to data.table + sds_full_clean <- data.table::as.data.table(matched_sds_data) + rm(matched_sds_data) + + # Deal with SDS option 4 + # Convert option flags into logical T/F + cols_sds_option <- grep( + "^sds_option_", + names(sds_full_clean), + value = TRUE + ) + sds_full_clean[, (cols_sds_option) := lapply(.SD, function(x) { + data.table::fifelse(x == 1L, TRUE, FALSE) + }), + .SDcols = cols_sds_option + ] + + # Derived SDS option 4 when a person receives more than one option + sds_full_clean[, + sds_option_4 := rowSums(.SD) > 1L, + .SDcols = cols_sds_option + ] + + # If SDS start date or end date is missing, assign start/end of FY + sds_full_clean[ + , + sds_start_date := fix_sc_start_dates(sds_start_date, sds_period_start_date) + ] + sds_full_clean[ + , + sds_end_date := fix_sc_missing_end_dates(sds_end_date, sds_period_end_date) + ] + sds_full_clean[ + , + sds_end_date := fix_sc_end_dates(sds_start_date, sds_end_date, sds_period_end_date) + ] + + sds_full_clean[, c( + "sds_period_start_date", + "sds_period_end_date", + "sds_start_date_after_end_date" + ) := NULL] + + # Rename for matching source variables + data.table::setnames( + sds_full_clean, + c("sds_start_date", "sds_end_date"), + c("record_keydate1", "record_keydate2") + ) + + sds_full_clean <- unique(sds_full_clean) + + cols_sds_option <- grep( + "^sds_option_", + names(sds_full_clean), + value = TRUE + ) + # Pivot longer on sds option variables + sds_full_clean_long <- data.table::melt( + sds_full_clean, + id.vars = setdiff(names(sds_full_clean), cols_sds_option), + measure.vars = cols_sds_option, + variable.name = "sds_option", + value.name = "received" + ) + rm(sds_full_clean) + sds_full_clean_long <- sds_full_clean_long[received == TRUE, ] + sds_full_clean_long[ + , + sds_option := paste0("SDS-", sub("sds_option_", "", sds_option)) + ] + + # Filter rows where they received a package and remove duplicates + sds_full_clean_long <- unique(sds_full_clean_long) + + # Include source variables + sds_full_clean_long[, c( + "smrtype", + "recid", + "sc_send_lca" + ) := + list( + sds_option, + "SDS", + convert_sc_sending_location_to_lca(sending_location) + )] + sds_full_clean_long$person_id <- paste0( + sds_full_clean_long$sending_location, + "-", + sds_full_clean_long$social_care_id + ) + + # Group, arrange and create flags for episodes + sds_full_clean_long[, + c( + "period_rank", + "record_keydate1_rank", + "record_keydate2_rank" + ) := list( + rank(period), + rank(record_keydate1), + rank(record_keydate2) + ), + by = .(sending_location, social_care_id, smrtype) + ] + data.table::setorder( + sds_full_clean_long, + period_rank, + record_keydate1_rank, + record_keydate2_rank + ) + + sds_full_clean_long[, + distinct_episode := + (data.table::shift(record_keydate2, type = "lag") < record_keydate1) %>% + tidyr::replace_na(TRUE), + by = .(sending_location, social_care_id, smrtype) + ] + + sds_full_clean_long[, + episode_counter := cumsum(distinct_episode), + by = .(sending_location, social_care_id, smrtype) + ] + + # Merge episodes by episode counter + final_data <- sds_full_clean_long[, .( + sc_latest_submission = data.table::last(period), + record_keydate1 = min(record_keydate1), + record_keydate2 = max(record_keydate2), + chi = data.table::last(chi), + gender = data.table::last(gender), + dob = data.table::last(dob), + postcode = data.table::last(postcode), + recid = data.table::last(recid), + person_id = data.table::last(person_id), + sc_send_lca = data.table::last(sc_send_lca) + ), by = .(sending_location, social_care_id, smrtype, episode_counter)] + rm(sds_full_clean_long) + + # Drop episode_counter and convert back to data.frame if needed + final_data <- as.data.frame(final_data[, -"episode_counter"]) + # final_data now holds the processed data in the format of a data.frame if (write_to_disk) { write_file(