From 439e430a2fff33eeb3be85193124a5fc61ae6aad Mon Sep 17 00:00:00 2001
From: Jennit07 <67372904+Jennit07@users.noreply.github.com>
Date: Fri, 16 Aug 2024 08:20:42 +0100
Subject: [PATCH] Add new nsu (#991)

* Add NSU code to github

Includes extracting the service user cohort to send to the CHILI team, and
then the NSU extraction.

* Style code

* Add compression and package library

* Style code

* pick up latest geography file, and save out with compression (#983)

* pick up latest geography file, and save out with compression

* Style code

* use `get_spd_path`

Co-authored-by: James McMahon

---------

Co-authored-by: marjom02
Co-authored-by: SwiftySalmon
Co-authored-by: Jennit07 <67372904+Jennit07@users.noreply.github.com>
Co-authored-by: James McMahon

---------

Co-authored-by: Jennit07
Co-authored-by: Megan McNicol <43570769+SwiftySalmon@users.noreply.github.com>
Co-authored-by: marjom02
Co-authored-by: SwiftySalmon
Co-authored-by: James McMahon
---
 .../filter_nsu_duplicates.R  | 192 ++++++++++++++++++
 .../get_service_use_cohort.R |  56 +++++
 2 files changed, 248 insertions(+)
 create mode 100644 extract_new_nsu_cohort/filter_nsu_duplicates.R
 create mode 100644 extract_new_nsu_cohort/get_service_use_cohort.R

diff --git a/extract_new_nsu_cohort/filter_nsu_duplicates.R b/extract_new_nsu_cohort/filter_nsu_duplicates.R
new file mode 100644
index 000000000..bbe8265e8
--- /dev/null
+++ b/extract_new_nsu_cohort/filter_nsu_duplicates.R
@@ -0,0 +1,192 @@
+################################################################################
+# Name of file - filter_nsu_duplicates.R
+# Original Authors - James McMahon, Jennifer Thom
+# Original Date - August 2021
+# Update - June 2024
+#
+# Written/run on - RStudio Server
+# Version of R - 3.6.1
+#
+# Description - Use this script to filter NSU duplicates when taking a new
+#               extract from the CHILI team.
+#
+# Steps for requesting a new NSU extract for SLFs:
+#   1. Send an email to [phs.chi-recordlinkage@phs.scot] to request a new NSU
+#      extract after the JUNE update.
+#   2. Prepare a service use extract. Run the script `get_service_use_cohort.R`
+#      to extract a list of CHIs from the most recent 'full' file.
+#   3. Once the CHILI team come back to us, send the service use extract to
+#      the analyst directly. Do not send the list of CHIs to the mailbox, for
+#      Information Governance purposes.
+#   4. The CHILI team will then process the new NSU extract based on who is
+#      not in the service use extract.
+#   5. Run the script `filter_nsu_duplicates.R` to collect the new NSU
+#      extract from the analyst's SMRA space - update the `analyst` and
+#      `schema` variables below accordingly. Save the extract in:
+#      "/conf/hscdiip/SLF_Extracts/NSU"
+################################################################################
+
+library(dplyr)
+library(purrr)
+library(stringr)
+library(PostcodesioR)
+library(janitor)
+library(fs)
+library(glue)
+
+
+## Setup ------------------------------------------------------------------------
+
+## Update ##
+# The year of the new NSU extract
+year <- "2324"
+
+## Update ##
+# Analyst's username and schema to collect the data.
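+# If unsure of the exact table name, the tables available in the analyst's
+# schema can be listed once the SMRA connection below has been created.
+# A hedged sketch, using the odbc package the connection already relies on:
+# odbc::dbListTables(db_connection, schema_name = analyst)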
+analyst <- "ROBERM18"
+schema <- "FINAL_2"
+
+# Set up directory
+nsu_dir <- path("/conf/hscdiip/SLF_Extracts/NSU")
+
+# Latest geography file
+spd_path <- get_spd_path()
+
+# Set up connection to SMRA -----------------------------------------------------
+db_connection <- odbc::dbConnect(
+  odbc::odbc(),
+  dsn = "SMRA",
+  uid = Sys.getenv("USER"),
+  pwd = rstudioapi::askForPassword("password")
+)
+
+
+# Read data ---------------------------------------------------------------------
+
+# Read NSU data with duplicates from the analyst's SMRA space.
+nsu_data <-
+  tbl(db_connection, dbplyr::in_schema(analyst, schema)) %>%
+  collect() %>%
+  clean_names()
+
+
+# Data cleaning -------------------------------------------------------------------
+
+# Find the records with duplicates
+nsu_pc_duplicates <- nsu_data %>%
+  group_by(upi_number) %>%
+  mutate(postcode_count = n_distinct(postcode)) %>%
+  ungroup() %>%
+  filter(postcode_count > 1)
+
+# Get the latest SPD
+spd <- read_file(spd_path) %>%
+  select(pc7, date_of_introduction, date_of_deletion)
+
+# Regex to check whether a postcode is valid
+pc_regex <-
+  "([Gg][Ii][Rr] 0[Aa]{2})|((([A-Za-z][0-9]{1,2})|(([A-Za-z][A-Ha-hJ-Yj-y][0-9]{1,2})|(([A-Za-z][0-9][A-Za-z])|([A-Za-z][A-Ha-hJ-Yj-y][0-9][A-Za-z]?))))\\s?[0-9][A-Za-z]{2})"
+
+# Main code to check postcodes in various ways
+nsu_pc_duplicates_checked <- nsu_pc_duplicates %>%
+  select(
+    upi_number,
+    start_date,
+    postcode,
+    date_address_changed,
+    gp_prac_no,
+    date_gp_acceptance
+  ) %>%
+  # First, check against the regex
+  mutate(invalid_pc = str_detect(postcode, pc_regex, negate = TRUE)) %>%
+  # Next, check against the SPD
+  left_join(spd, by = c("postcode" = "pc7")) %>%
+  # Finally, check against postcodes.io
+  left_join(
+    # Filter to only the postcodes that need checking
+    group_by(., upi_number) %>%
+      # UPIs where no postcode matched the SPD
+      filter(
+        all(is.na(
+          date_of_introduction
+        ))
+      ) %>%
+      ungroup() %>%
+      # No need to check invalid postcodes
+      filter(!invalid_pc) %>%
+      # Pass the unique list of postcodes to postcodes.io
+      pull(postcode) %>%
+      unique() %>%
+      list(postcodes = .) %>%
+      # Note: this will fail if given more than 100 postcodes
+      # (see the batching sketch after this patch)
+      PostcodesioR::bulk_postcode_lookup() %>%
+      # Parse the result; we only want the country
+      map_dfr(~ tibble(
+        postcode = .x$query,
+        # Create an order to make sorting nice later
+        country = ordered(.x$result$country, c("Scotland", "Wales", "England"))
+      ))
+  ) %>%
+  # Sort so that the 'best' postcode is at the top of the list
+  mutate(priority = case_when(
+    # Matched the SPD and is still live (no date of deletion)
+    !is.na(date_of_introduction) & is.na(date_of_deletion) ~ 0,
+    # Matched the SPD but has since been deleted
+    !is.na(date_of_introduction) ~ 1,
+    # Matched the postcodes.io API request
+    !is.na(country) ~ 2,
+    # Invalid postcodes come last
+    invalid_pc ~ Inf,
+    TRUE ~ 99
+  )) %>%
+  arrange(
+    upi_number,
+    priority,
+    # Newest introduced come first
+    desc(date_of_introduction),
+    # Latest deleted come first
+    desc(date_of_deletion),
+    # Scotland will be preferred, etc.
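+    # (country is an ordered factor, Scotland < Wales < England, so the
+    # default ascending sort puts Scottish postcodes first)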
+    country
+  ) %>%
+  # Flag each row with the assigned priority
+  group_by(upi_number) %>%
+  mutate(keep_priority = row_number()) %>%
+  ungroup()
+
+# Check
+nsu_pc_duplicates_checked %>%
+  count(priority, keep_priority)
+
+final_data <- nsu_data %>%
+  # Filter the main dataset to remove
+  # the duplicate postcodes we decided not to keep
+  anti_join(nsu_pc_duplicates_checked %>%
+    filter(keep_priority > 1)) %>%
+  # Filter any remaining duplicates (none on this test)
+  distinct(upi_number, .keep_all = TRUE) %>%
+  select(
+    chi = upi_number,
+    dob = date_of_birth,
+    postcode,
+    gpprac = gp_prac_no,
+    gender = sex
+  ) %>%
+  mutate(
+    year = year, .before = everything(),
+    dob = as.Date(dob),
+    across(c(gender, gpprac), as.integer)
+  ) %>%
+  arrange(chi) %>%
+  # Save as anon CHI on disk
+  slfhelper::get_anon_chi()
+
+# Save the data out to be used
+final_data %>%
+  arrow::write_parquet(path(nsu_dir, glue::glue("anon-All_CHIs_20{year}.parquet")),
+    compression = "zstd"
+  )
+
+
+## End of Script ##

diff --git a/extract_new_nsu_cohort/get_service_use_cohort.R b/extract_new_nsu_cohort/get_service_use_cohort.R
new file mode 100644
index 000000000..c29063a32
--- /dev/null
+++ b/extract_new_nsu_cohort/get_service_use_cohort.R
@@ -0,0 +1,56 @@
+################################################################################
+# Name of file - get_service_use_cohort.R
+# Original Authors - Jennifer Thom
+# Original Date - August 2021
+# Update - June 2024
+#
+# Written/run on - RStudio Server
+# Version of R - 3.6.1
+#
+# Description - Use this script to return a list of CHIs from the most recent
+#               SLF episode file (service users) in preparation for requesting
+#               a new NSU cohort for the latest 'full year'.
+#
+# Steps for requesting a new NSU extract for SLFs:
+#   1. Send an email to [phs.chi-recordlinkage@phs.scot] to request a new NSU
+#      extract after the JUNE update.
+#   2. Prepare a service use extract. Run the script `get_service_use_cohort.R`
+#      to extract a list of CHIs from the most recent 'full' file.
+#   3. Once the CHILI team come back to us, send the service use extract to
+#      the analyst directly. Do not send the list of CHIs to the mailbox, for
+#      Information Governance purposes.
+#   4. The CHILI team will then process the new NSU extract based on who is
+#      not in the service use extract.
+#   5. Run the script `filter_nsu_duplicates.R` to collect the new NSU
+#      extract from the analyst's SMRA space - update the `analyst` and
+#      `schema` variables there accordingly. Save the extract in:
+#      "/conf/hscdiip/SLF_Extracts/NSU"
+#
+################################################################################
+
+# Setup --------------------------------------------------------------------------
+library(fs)
+library(tidyverse)
+
+## Update ##
+# The year of the new NSU extract we want
+year <- "2324"
+
+nsu_dir <- path("/conf/hscdiip/SLF_Extracts/NSU/")
+
+# Read data ---------------------------------------------------------------------
+episode_file <- slfhelper::read_slf_episode(year, col_select = "anon_chi") %>%
+  # Remove blank CHIs
+  dplyr::filter(!is.na(anon_chi)) %>%
+  # Get the CHI version for sending to the CHILI team.
+  # For saving on disk we want the anon-chi version; save that after sending
+  # to the CHILI team.
+  slfhelper::get_chi()
+
+# Save a parquet file
+episode_file %>%
+  arrow::write_parquet(path(nsu_dir, glue::glue("service_user_extract_{year}.parquet")),
+    compression = "zstd"
+  )
+
+## End of Script ##
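Note on the postcodes.io step: `bulk_postcode_lookup()` accepts at most 100
postcodes per request, so `filter_nsu_duplicates.R` will error if the duplicate
cohort ever grows past that. A minimal batching sketch, assuming only the
PostcodesioR package used in the script; `pcs` is the unique postcode vector
and `batched_lookup` is an illustrative name, not part of the scripts above:

  batched_lookup <- function(pcs) {
    # Split the postcodes into chunks of at most 100
    batches <- split(pcs, ceiling(seq_along(pcs) / 100))
    # Query postcodes.io once per chunk
    results <- lapply(batches, function(batch) {
      PostcodesioR::bulk_postcode_lookup(list(postcodes = batch))
    })
    # Flatten back to a single result list, matching the shape
    # a single bulk_postcode_lookup() call would return
    unlist(results, recursive = FALSE)
  }

The flattened result can then be fed to the same `map_dfr()` parsing used in
the script, since each element still carries `query` and `result`.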