Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

social care demographics - decide what to do with cases where last 4 digits of c... #864

Closed
github-actions bot opened this issue Nov 22, 2023 · 2 comments
Assignees
Labels
requires investigation Improvement or bug fix that requires time to investigate Stale

Comments

@github-actions
Copy link

# TODO social care demographics - decide what to do with cases where last 4 digits of chi are different

  # Data Cleaning ---------------------------------------
# TODO social care demographics - tidy up code and make more efficient.
  # Builds `sc_demog` from `data`: back-fills the two CHI columns and the two
  # DOB columns from each other, derives `gender`, formats postcodes,
  # de-duplicates, derives single `chi` / `dob` columns and finally flags rows
  # whose CHI prefix agrees with the DOB (`chi_validation`).
  sc_demog <- data %>%
    dplyr::mutate(
      # if one chi is missing then use the other
      # (after these two lines `chi_upi` and `upi` are either both NA or both
      # populated)
      # TODO social care demographics - decide what to do with cases where last 4 digits of chi are different
      chi_upi = ifelse(is.na(chi_upi), upi, chi_upi),
      upi = ifelse(is.na(upi), chi_upi, upi),
      # same back-fill for the two dates of birth; ifelse() strips attributes
      # such as the Date class, hence the as_date() calls that follow
      # (assumes both DOB columns arrive as dates - TODO confirm)
      submitted_date_of_birth = ifelse(is.na(submitted_date_of_birth), chi_date_of_birth, submitted_date_of_birth),
      chi_date_of_birth = ifelse(is.na(chi_date_of_birth), submitted_date_of_birth, chi_date_of_birth),
      chi_date_of_birth = lubridate::as_date(chi_date_of_birth),
      submitted_date_of_birth = lubridate::as_date(submitted_date_of_birth),
      # check gender code - replace code 99 with 9

      # use CHI sex if available, otherwise (CHI gender NA or 9) fall back to
      # the recoded submitted gender
       # TODO social care demographics - check gender matches chi for extra validation check
      submitted_gender = replace(.data$submitted_gender, .data$submitted_gender == 99L, 9L),
      gender = dplyr::if_else(
        is.na(.data$chi_gender_code) | .data$chi_gender_code == 9L,
        .data$submitted_gender,
        .data$chi_gender_code
      )
    ) %>%
    # format every column whose name contains "postcode" using `phsmethods`
    dplyr::mutate(dplyr::across(
      tidyselect::contains("postcode"),
      ~ phsmethods::format_postcode(.x, format = "pc7"))) %>%
    # drop rows that are exact duplicates across all columns
    dplyr::distinct() %>%
    # if only one option is available for chi then choose that;
    # where the two CHI values disagree `chi` is left as NA
    dplyr::mutate(chi = ifelse(chi_upi == upi | is.na(upi), chi_upi,
                               ifelse(is.na(chi_upi), upi, NA)
                               )) %>%
    dplyr::mutate(
      # if only one option is available for DOB then choose that;
      # where the two dates disagree `dob` is left as NA
      dob = ifelse(chi_date_of_birth == submitted_date_of_birth | is.na(submitted_date_of_birth), chi_date_of_birth,
        ifelse(is.na(chi_date_of_birth), submitted_date_of_birth, NA)),
      # ifelse() returned plain numbers, so convert back to Date
      dob = lubridate::as_date(dob)
    ) %>%
    # arrange() sorts NA last, so within each social_care_id / sending_location
    # group a missing chi or dob is filled from the populated rows above it
    dplyr::arrange(chi, dob) %>%
    dplyr::group_by(social_care_id, sending_location) %>%
    tidyr::fill(chi, .direction = c("down")) %>%
    tidyr::fill(dob, .direction = c("down")) %>%
    dplyr::ungroup() %>%
  # create string for DOB from CHI and the DOB to see if they match.
  # `dob_from_chiupi` is the first six characters of `chi`; `dob_from_dob` is
  # the day, month and two-digit year of `dob` pasted together.
  # paste0() renders a missing value as the text "NA", so neither string is
  # ever NA and the comparison below always yields 0 or 1 (never NA).
  dplyr::mutate(dob_from_chiupi = paste0(stringr::str_sub(chi, 1, 6))) %>%
    dplyr::mutate(dob_from_dob = paste0(
      stringr::str_sub(as.character(dob), 9, 10),
      stringr::str_sub(as.character(dob), 6, 7),
      stringr::str_sub(as.character(dob), 3, 4)
    )) %>%
    # validation flag. if dob goes with chi then flag as 1
    dplyr::mutate(chi_validation = ifelse(dob_from_chiupi == dob_from_dob, 1, 0))

# cases where all chi and dob are missing, nothing we can do about these and no validation
  missing <- sc_demog %>%
    # keep only the rows in which every CHI field and every DOB field is NA
    # (filter() combines comma-separated conditions with AND)
    dplyr::filter(
      is.na(chi_upi),
      is.na(upi),
      is.na(submitted_date_of_birth),
      is.na(chi_date_of_birth),
      is.na(chi),
      is.na(dob)
    ) %>%
    # drop the helper strings and the source CHI / DOB columns
    dplyr::select(-c(
      dob_from_chiupi, dob_from_dob,
      chi_upi, upi,
      chi_date_of_birth, submitted_date_of_birth
    ))

  # First pass: rows already flagged as validated (CHI prefix agrees with DOB).
  validated <- sc_demog %>%
    dplyr::filter(chi_validation == 1) %>%
    # drop the helper strings and the source CHI / DOB columns
    dplyr::select(-c(
      dob_from_chiupi, dob_from_dob,
      chi_upi, upi,
      chi_date_of_birth, submitted_date_of_birth
    ))

  # Second pass - match on either dob to chi.
  # Works on the rows that are neither fully missing nor already validated:
  # if the DOB implied by `chi` agrees with the CHI date of birth or with the
  # submitted date of birth, adopt that date as `dob` and re-check the flag.
  sc_demog <- sc_demog %>%
    # `missing` holds a subset of sc_demog's columns, so naming them explicitly
    # gives the same join key as the implicit natural join, without the
    # "Joining with `by = ...`" message
    dplyr::anti_join(missing, by = names(missing)) %>%
    dplyr::filter(chi_validation != 1) %>%
    dplyr::mutate(
      # ddmmyy strings built from each candidate date of birth.
      # paste0() renders a missing piece as the text "NA", so these strings
      # are never NA and can never equal a real six-digit CHI prefix.
      dob_from_chidob = paste0(
        stringr::str_sub(as.character(chi_date_of_birth), 9, 10),
        stringr::str_sub(as.character(chi_date_of_birth), 6, 7),
        stringr::str_sub(as.character(chi_date_of_birth), 3, 4)
      ),
      dob_from_submitteddob = paste0(
        # explicit as.character(), consistent with the other five calls,
        # instead of relying on implicit coercion of the date
        stringr::str_sub(as.character(submitted_date_of_birth), 9, 10),
        stringr::str_sub(as.character(submitted_date_of_birth), 6, 7),
        stringr::str_sub(as.character(submitted_date_of_birth), 3, 4)
      ),
      # if either dob matches with chi then use that dob; the submitted DOB is
      # checked last, so it wins when both candidates match
      dob = ifelse(dob_from_chiupi == dob_from_chidob, chi_date_of_birth, dob),
      dob = ifelse(dob_from_chiupi == dob_from_submitteddob, submitted_date_of_birth, dob),
      # ifelse() returned plain numbers, so convert back to Date
      dob = lubridate::as_date(dob),
      # rebuild the ddmmyy string from the (possibly updated) dob
      dob_from_dob = paste0(
        stringr::str_sub(as.character(dob), 9, 10),
        stringr::str_sub(as.character(dob), 6, 7),
        stringr::str_sub(as.character(dob), 3, 4)
      ),
      # if dob and chi match then flag as validated
      chi_validation = ifelse(dob_from_chiupi == dob_from_dob, 1, 0)
    ) %>%
    dplyr::select(-dob_from_chidob, -dob_from_submitteddob)

# add the validated cases to validated df
  validated <- rbind(
    validated,
    # rows that became validated in the step above, reduced to the same
    # columns as `validated`
    sc_demog %>%
      dplyr::filter(chi_validation == 1) %>%
      dplyr::select(-c(
        dob_from_chiupi, dob_from_dob,
        chi_upi, upi,
        chi_date_of_birth, submitted_date_of_birth
      ))
  )

  # Third pass - match on dob to either chi.
  # For the rows still not validated, test each of the two CHI numbers against
  # the DOB string and adopt whichever one agrees.
  sc_demog <- sc_demog %>%
    dplyr::filter(chi_validation != 1) %>%
    dplyr::mutate(
      # six-character DOB prefix of each CHI number; paste0() renders a
      # missing value as the text "NA", so the comparisons are never NA
      dob_from_upi = paste0(stringr::str_sub(upi, 1, 6)),
      dob_from_chi_upi = paste0(stringr::str_sub(chi_upi, 1, 6)),
      # use whichever one matches; `upi` is checked last, so it wins when
      # both match
      chi = ifelse(dob_from_chi_upi == dob_from_dob, chi_upi, chi),
      chi = ifelse(dob_from_upi == dob_from_dob, upi, chi),
      # re-derive the prefix from the (possibly updated) chi and re-flag
      dob_from_chi = paste0(stringr::str_sub(chi, 1, 6)),
      chi_validation = ifelse(dob_from_chi == dob_from_dob, 1, 0)
    )

  # All validated cases: append the rows validated in the step above, reduced
  # to the same columns as `validated`.
  validated <- rbind(
    validated,
    sc_demog %>%
      dplyr::filter(chi_validation == 1) %>%
      dplyr::select(-c(
        dob_from_chiupi, dob_from_upi, dob_from_chi_upi, dob_from_dob,
        chi_upi, upi,
        chi_date_of_birth, submitted_date_of_birth,
        dob_from_chi
      ))
  )


  # TODO social care demographics - decide what to do with non-validated chi and cases where dob does not match chi
  # A decision is still needed for social care cases where the chi and the dob
  # do not match. That is why the validated and non-validated data frames have
  # been kept separate up to this point, so this can be revisited later.
  sc_demog <- rbind(
    # all unvalidated cases. most of these are due to missing chi or dob so
    # there is no way to validate.
    sc_demog %>%
      dplyr::filter(chi_validation != 1) %>%
      dplyr::select(-c(
        dob_from_chiupi, dob_from_upi, dob_from_chi_upi, dob_from_dob,
        chi_upi, upi,
        chi_date_of_birth, submitted_date_of_birth,
        dob_from_chi
      )),
    # followed by every validated row, then the rows with no chi / dob at all
    validated,
    missing
  )


  # count number of na postcodes
  na_postcodes <- sc_demog %>%
@github-actions github-actions bot added the todo label Nov 22, 2023
@Jennit07 Jennit07 added requires investigation Improvement or bug fix that requires time to investigate social-care labels Jan 8, 2024
Copy link
Author

This issue is stale because it has been open approximately 5 months with no activity.

@github-actions github-actions bot added the Stale label Jun 10, 2024
@SwiftySalmon
Copy link
Collaborator

Closing this, as it will be addressed as part of the social care work to improve their processes and data quality.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
requires investigation Improvement or bug fix that requires time to investigate Stale
Projects
None yet
Development

No branches or pull requests

2 participants