From ae9c38e1f3ebb02c874a8e329c7e9c73243743a0 Mon Sep 17 00:00:00 2001
From: SwiftySalmon <SwiftySalmon@users.noreply.github.com>
Date: Tue, 9 Jan 2024 14:17:03 +0000
Subject: [PATCH 01/15] Style code

---
 R/create_individual_file.R                     | 3 ++-
 R/get_fy_quarter_dates.R                       | 8 ++++----
 Run_SLF_Files_manually/run_episode_file_1718.R | 3 ++-
 Run_SLF_Files_manually/run_episode_file_1819.R | 3 ++-
 Run_SLF_Files_manually/run_episode_file_1920.R | 3 ++-
 Run_SLF_Files_manually/run_episode_file_2021.R | 3 ++-
 Run_SLF_Files_manually/run_episode_file_2122.R | 3 ++-
 Run_SLF_Files_manually/run_episode_file_2223.R | 3 ++-
 Run_SLF_Files_manually/run_episode_file_2324.R | 3 ++-
 9 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index d9316b41b..4ca2f96d7 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -483,7 +483,8 @@ add_ch_columns <- function(episode_file, prefix, condition) {
       ch_ep_end = dplyr::if_else(
         eval(condition),
         .data$record_keydate2,
-        lubridate::NA_Date_  ),
+        lubridate::NA_Date_
+      ),
       # If end date is missing use the first day of next FY quarter
       ch_ep_end = dplyr::if_else(
         eval(condition) & is.na(.data$ch_ep_end),
diff --git a/R/get_fy_quarter_dates.R b/R/get_fy_quarter_dates.R
index cd4c3492c..a772099b8 100644
--- a/R/get_fy_quarter_dates.R
+++ b/R/get_fy_quarter_dates.R
@@ -15,7 +15,7 @@
 start_fy_quarter <- function(quarter) {
   quarter_unique <- unique(quarter)
 
-  #check_quarter_format(quarter)
+  # check_quarter_format(quarter)
 
   cal_quarter_date_unique <- lubridate::yq(quarter_unique)
 
@@ -47,7 +47,7 @@ start_fy_quarter <- function(quarter) {
 end_fy_quarter <- function(quarter) {
   quarter_unique <- unique(quarter)
 
-  #check_quarter_format(quarter)
+  # check_quarter_format(quarter)
 
   cal_quarter_date_unique <- lubridate::yq(quarter_unique)
 
@@ -80,7 +80,7 @@ end_fy_quarter <- function(quarter) {
 start_next_fy_quarter <- function(quarter) {
   quarter_unique <- unique(quarter)
 
-  #check_quarter_format(quarter)
+  # check_quarter_format(quarter)
 
   cal_quarter_date_unique <- lubridate::yq(quarter_unique)
 
@@ -112,7 +112,7 @@ start_next_fy_quarter <- function(quarter) {
 end_next_fy_quarter <- function(quarter) {
   quarter_unique <- unique(quarter)
 
-  #check_quarter_format(quarter)
+  # check_quarter_format(quarter)
 
   cal_quarter_date_unique <- lubridate::yq(quarter_unique)
 
diff --git a/Run_SLF_Files_manually/run_episode_file_1718.R b/Run_SLF_Files_manually/run_episode_file_1718.R
index 9be2eb9c6..ab75b94d7 100644
--- a/Run_SLF_Files_manually/run_episode_file_1718.R
+++ b/Run_SLF_Files_manually/run_episode_file_1718.R
@@ -4,7 +4,8 @@ library(createslf)
 year <- "1718"
 
 processed_data_list <- targets::tar_read("processed_data_list_1718",
-                                         store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets"))
+  store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets")
+)
 
 # Run episode file
 create_episode_file(processed_data_list, year = year) %>%
diff --git a/Run_SLF_Files_manually/run_episode_file_1819.R b/Run_SLF_Files_manually/run_episode_file_1819.R
index 7dec9e5c1..cd5a7435f 100644
--- a/Run_SLF_Files_manually/run_episode_file_1819.R
+++ b/Run_SLF_Files_manually/run_episode_file_1819.R
@@ -4,7 +4,8 @@ library(createslf)
 year <- "1819"
 
 processed_data_list <- targets::tar_read("processed_data_list_1819",
-                                         store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets"))
+  store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets")
+)
 
 # Run episode file
 create_episode_file(processed_data_list, year = year) %>%
diff --git a/Run_SLF_Files_manually/run_episode_file_1920.R b/Run_SLF_Files_manually/run_episode_file_1920.R
index 066bd27b7..a9dc591b1 100644
--- a/Run_SLF_Files_manually/run_episode_file_1920.R
+++ b/Run_SLF_Files_manually/run_episode_file_1920.R
@@ -4,7 +4,8 @@ library(createslf)
 year <- "1920"
 
 processed_data_list <- targets::tar_read("processed_data_list_1920",
-                                         store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets"))
+  store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets")
+)
 
 # Run episode file
 create_episode_file(processed_data_list, year = year) %>%
diff --git a/Run_SLF_Files_manually/run_episode_file_2021.R b/Run_SLF_Files_manually/run_episode_file_2021.R
index 8354f49ae..37708ee8b 100644
--- a/Run_SLF_Files_manually/run_episode_file_2021.R
+++ b/Run_SLF_Files_manually/run_episode_file_2021.R
@@ -4,7 +4,8 @@ library(createslf)
 year <- "2021"
 
 processed_data_list <- targets::tar_read("processed_data_list_2021",
-                                         store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets"))
+  store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets")
+)
 
 # Run episode file
 create_episode_file(processed_data_list, year = year) %>%
diff --git a/Run_SLF_Files_manually/run_episode_file_2122.R b/Run_SLF_Files_manually/run_episode_file_2122.R
index 4057770d1..47400e2d1 100644
--- a/Run_SLF_Files_manually/run_episode_file_2122.R
+++ b/Run_SLF_Files_manually/run_episode_file_2122.R
@@ -4,7 +4,8 @@ library(createslf)
 year <- "2122"
 
 processed_data_list <- targets::tar_read("processed_data_list_2122",
-                                         store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets"))
+  store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets")
+)
 
 # Run episode file
 create_episode_file(processed_data_list, year = year) %>%
diff --git a/Run_SLF_Files_manually/run_episode_file_2223.R b/Run_SLF_Files_manually/run_episode_file_2223.R
index 5df7b5db6..e64a57f32 100644
--- a/Run_SLF_Files_manually/run_episode_file_2223.R
+++ b/Run_SLF_Files_manually/run_episode_file_2223.R
@@ -4,7 +4,8 @@ library(createslf)
 year <- "2223"
 
 processed_data_list <- targets::tar_read("processed_data_list_2223",
-                      store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets"))
+  store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets")
+)
 
 # Run episode file
 create_episode_file(processed_data_list, year = year) %>%
diff --git a/Run_SLF_Files_manually/run_episode_file_2324.R b/Run_SLF_Files_manually/run_episode_file_2324.R
index af9a3efe5..4a7f0ad29 100644
--- a/Run_SLF_Files_manually/run_episode_file_2324.R
+++ b/Run_SLF_Files_manually/run_episode_file_2324.R
@@ -4,7 +4,8 @@ library(createslf)
 year <- "2324"
 
 processed_data_list <- targets::tar_read("processed_data_list_2324",
-                                         store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets"))
+  store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets")
+)
 
 # Run episode file
 create_episode_file(processed_data_list, year = year) %>%

From d4db230ac24aa30a54695f0825264dbc059e9cb4 Mon Sep 17 00:00:00 2001
From: marjom02 <megan.mcnicol2@nhs.scot>
Date: Thu, 11 Jan 2024 17:40:41 +0000
Subject: [PATCH 02/15] # read in sc demographics

Read in a different set of variables: removed extract date as it is not accurate, and now using chi over upi after discussion with social care data management. Also added in date of death (just for fun).
---
 R/read_lookup_sc_demographics.R | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/R/read_lookup_sc_demographics.R b/R/read_lookup_sc_demographics.R
index fcdde5417..af8327a3c 100644
--- a/R/read_lookup_sc_demographics.R
+++ b/R/read_lookup_sc_demographics.R
@@ -12,27 +12,27 @@ read_lookup_sc_demographics <- function(sc_connection = phs_db_connection(dsn =
   ) %>%
     dplyr::select(
       "latest_record_flag",
-      "extract_date",
+      "period",
       "sending_location",
+      "sending_location_name",
       "social_care_id",
-      "upi",
       "chi_upi",
-      "submitted_postcode",
-      "chi_postcode",
-      "submitted_date_of_birth",
       "chi_date_of_birth",
-      "submitted_gender",
+      "date_of_death",
+      "chi_postcode",
+      "submitted_postcode",
       "chi_gender_code"
-    ) %>%
-    dplyr::collect() %>%
+    ) %>% dplyr::collect() %>%
     dplyr::mutate(
       dplyr::across(c(
         "latest_record_flag",
         "sending_location",
-        "submitted_gender",
         "chi_gender_code"
       ), as.integer)
-    )
+    )%>%
+    dplyr::distinct()
 
   return(sc_demog)
 }
+
+

From b159c9d518e9558f43ff41cda2e6d50d15b8ff70 Mon Sep 17 00:00:00 2001
From: marjom02 <megan.mcnicol2@nhs.scot>
Date: Fri, 12 Jan 2024 13:57:42 +0000
Subject: [PATCH 03/15] social care demographics first draft

Removed a lot of the submitted variables; the chi variables from chi seeding are used instead. Other changes:
- Fill in missing values.
- Create a flag for the latest social care id (the one from the database is not accurate). This makes sure that each chi only has ONE sc id as the latest, to stop it creating duplicates.
- Change postcode selection to choose chi over submitted.
---
 R/process_lookup_sc_demographics.R | 98 ++++++++++++++++++++----------
 1 file changed, 65 insertions(+), 33 deletions(-)

diff --git a/R/process_lookup_sc_demographics.R b/R/process_lookup_sc_demographics.R
index 8c363f547..c0523d478 100644
--- a/R/process_lookup_sc_demographics.R
+++ b/R/process_lookup_sc_demographics.R
@@ -28,36 +28,63 @@ process_lookup_sc_demographics <- function(
     dplyr::pull(.data$pc7)
 
 
-  # Data Cleaning ---------------------------------------
-
-  sc_demog <- data %>%
-    dplyr::mutate(
-      # use chi if upi is NA
-      upi = dplyr::coalesce(.data$upi, .data$chi_upi),
-      # check gender code - replace code 99 with 9
-      submitted_gender = replace(.data$submitted_gender, .data$submitted_gender == 99L, 9L)
-    ) %>%
-    dplyr::mutate(
-      # use CHI sex if available
-      gender = dplyr::if_else(
-        is.na(.data$chi_gender_code) | .data$chi_gender_code == 9L,
-        .data$submitted_gender,
-        .data$chi_gender_code
-      ),
-      # Use CHI DoB if available
-      dob = dplyr::coalesce(.data$chi_date_of_birth, .data$submitted_date_of_birth)
-    ) %>%
+  #  Fill in missing data and flag latest cases to keep ---------------------------------------
+  sc_demog1 <- sc_demog %>%
+  # sc_demog <- data %>%
+      dplyr::rename(upi = chi_upi,
+                  gender = chi_gender_code,
+                  dob = chi_date_of_birth) %>%
+    # fill in missing demographic details
+    dplyr::arrange(period, social_care_id) %>%
+    dplyr::group_by(social_care_id, sending_location) %>%
+    tidyr::fill(upi, .direction = ("updown")) %>%
+    tidyr::fill(dob, .direction = ("updown")) %>%
+    tidyr::fill(date_of_death, .direction = ("updown")) %>%
+    tidyr::fill(gender , .direction = ("updown")) %>%
+    tidyr::fill(chi_postcode, .direction = ("updown")) %>%
+    tidyr::fill(submitted_postcode, .direction = ("updown")) %>%
     # format postcodes using `phsmethods`
-    dplyr::mutate(dplyr::across(
-      tidyselect::contains("postcode"),
-      ~ phsmethods::format_postcode(.x, format = "pc7")
-    ))
+    dplyr::mutate(dplyr::across(tidyselect::contains("postcode"), ~ phsmethods::format_postcode(.x, format = "pc7")))# are sc postcodes even used anywhere?
+
+    # 4924132
+    # 4946071
+  # flag unique cases of chi and sc_id, and flag the latest record (sc_demographics latest flag is not accurate)
+  sc_demog2 <- sc_demog1 %>%
+    dplyr::group_by(upi) %>%
+    dplyr::mutate(latest = dplyr::last(period)) %>%  # flag latest period for chi
+    dplyr::group_by(upi, social_care_id) %>%
+    dplyr::mutate(latest_sc_id = dplyr::last(period)) %>% # flag latest period for social care
+    dplyr::group_by(upi) %>%
+    dplyr::mutate(latest_flag = ifelse(latest == period | is.na(upi), 1, 0),
+                  keep = ifelse(latest_sc_id == period, 1, 0))#
+
+ # dplyr::n_distinct(sc_demog2$upi) # 524810
+  #dplyr::n_distinct(sc_demog2$social_care_id) # 636404
+
+  sc_demog3 <- sc_demog2 %>%
+    dplyr::filter(keep == 1) %>% # filter to only keep latest record for sc id and chi
+    dplyr::group_by(upi, social_care_id) %>%
+    dplyr::select(-period, -latest_record_flag, -latest, -latest_sc_id, -keep) %>%
+    dplyr::distinct() %>%
+    dplyr::ungroup()
+
+  test <- sc_demog3 %>%
+    dplyr::group_by(social_care_id, sending_location) %>%
+    dplyr::mutate(count_scid = dplyr::n()) %>%
+    dplyr::group_by(upi)
+
+  # check to make sure all cases of chi are still there
+ # dplyr::n_distinct(sc_demog3$upi) # 524810
+  #dplyr::n_distinct(sc_demog3$social_care_id) # 636404
+
+
+# postcodes ---------------------------------------------------------------
 
   # count number of na postcodes
-  na_postcodes <- sc_demog %>%
+  na_postcodes1 <- sc_demog3 %>%
     dplyr::count(dplyr::across(tidyselect::contains("postcode"), ~ is.na(.x)))
 
-  sc_demog <- sc_demog %>%
+  sc_demog4 <- sc_demog3 %>%
     # remove dummy postcodes invalid postcodes missed by regex check
     dplyr::mutate(dplyr::across(
       tidyselect::ends_with("_postcode"),
@@ -69,15 +96,16 @@ process_lookup_sc_demographics <- function(
       ~ dplyr::if_else(stringr::str_detect(.x, uk_pc_regexp), .x, NA)
     )) %>%
     dplyr::select(
-      "latest_record_flag",
-      "extract_date",
       "sending_location",
       "social_care_id",
       "upi",
       "gender",
       "dob",
+      "date_of_death",
       "submitted_postcode",
-      "chi_postcode"
+      "chi_postcode",
+      "period", "latest_record_flag", "latest", "latest_sc_id", "keep",
+      "latest_flag"
     ) %>%
     # check if submitted_postcode matches with postcode lookup
     dplyr::mutate(
@@ -95,23 +123,27 @@ process_lookup_sc_demographics <- function(
     ))
 
   # Check where the postcodes are coming from
-  sc_demog %>%
+  sc_demog4 %>%
     dplyr::count(.data$postcode_type)
 
   # count number of replaced postcode - compare with count above
-  na_replaced_postcodes <- sc_demog %>%
+  na_replaced_postcodes <- sc_demog4 %>%
     dplyr::count(dplyr::across(tidyselect::ends_with("_postcode"), ~ is.na(.x)))
 
 
-  sc_demog_lookup <- sc_demog %>%
+  sc_demog_lookup <- sc_demog4 %>%
+    dplyr::filter(keep == 1) %>% # filter to only keep latest record for sc id and chi
+      dplyr::select(-period, -latest_record_flag, -latest, -latest_sc_id, -keep) %>%
+    dplyr::group_by(upi, social_care_id) %>%
+    dplyr::distinct() %>%
+   # dplyr::ungroup()
     # group by sending location and ID
     dplyr::group_by(.data$sending_location, .data$social_care_id) %>%
     # arrange so latest submissions are last
     dplyr::arrange(
       .data$sending_location,
       .data$social_care_id,
-      .data$latest_record_flag,
-      .data$extract_date
+      .data$latest_flag
     ) %>%
     # summarise to select the last (non NA) submission
     dplyr::summarise(

From 7fef07327883924f6218ace1abd71429e4b3b982 Mon Sep 17 00:00:00 2001
From: SwiftySalmon <SwiftySalmon@users.noreply.github.com>
Date: Fri, 12 Jan 2024 14:04:55 +0000
Subject: [PATCH 04/15] Style code

---
 R/process_lookup_sc_demographics.R | 40 ++++++++++++++++--------------
 R/read_lookup_sc_demographics.R    |  7 +++---
 2 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/R/process_lookup_sc_demographics.R b/R/process_lookup_sc_demographics.R
index c0523d478..2df526fac 100644
--- a/R/process_lookup_sc_demographics.R
+++ b/R/process_lookup_sc_demographics.R
@@ -30,36 +30,40 @@ process_lookup_sc_demographics <- function(
 
   #  Fill in missing data and flag latest cases to keep ---------------------------------------
   sc_demog1 <- sc_demog %>%
-  # sc_demog <- data %>%
-      dplyr::rename(upi = chi_upi,
-                  gender = chi_gender_code,
-                  dob = chi_date_of_birth) %>%
+    # sc_demog <- data %>%
+    dplyr::rename(
+      upi = chi_upi,
+      gender = chi_gender_code,
+      dob = chi_date_of_birth
+    ) %>%
     # fill in missing demographic details
     dplyr::arrange(period, social_care_id) %>%
     dplyr::group_by(social_care_id, sending_location) %>%
     tidyr::fill(upi, .direction = ("updown")) %>%
     tidyr::fill(dob, .direction = ("updown")) %>%
     tidyr::fill(date_of_death, .direction = ("updown")) %>%
-    tidyr::fill(gender , .direction = ("updown")) %>%
+    tidyr::fill(gender, .direction = ("updown")) %>%
     tidyr::fill(chi_postcode, .direction = ("updown")) %>%
     tidyr::fill(submitted_postcode, .direction = ("updown")) %>%
     # format postcodes using `phsmethods`
-    dplyr::mutate(dplyr::across(tidyselect::contains("postcode"), ~ phsmethods::format_postcode(.x, format = "pc7")))# are sc postcodes even used anywhere?
+    dplyr::mutate(dplyr::across(tidyselect::contains("postcode"), ~ phsmethods::format_postcode(.x, format = "pc7"))) # are sc postcodes even used anywhere?
 
-    # 4924132
-    # 4946071
+  # 4924132
+  # 4946071
   # flag unique cases of chi and sc_id, and flag the latest record (sc_demographics latest flag is not accurate)
   sc_demog2 <- sc_demog1 %>%
     dplyr::group_by(upi) %>%
-    dplyr::mutate(latest = dplyr::last(period)) %>%  # flag latest period for chi
+    dplyr::mutate(latest = dplyr::last(period)) %>% # flag latest period for chi
     dplyr::group_by(upi, social_care_id) %>%
     dplyr::mutate(latest_sc_id = dplyr::last(period)) %>% # flag latest period for social care
     dplyr::group_by(upi) %>%
-    dplyr::mutate(latest_flag = ifelse(latest == period | is.na(upi), 1, 0),
-                  keep = ifelse(latest_sc_id == period, 1, 0))#
+    dplyr::mutate(
+      latest_flag = ifelse(latest == period | is.na(upi), 1, 0),
+      keep = ifelse(latest_sc_id == period, 1, 0)
+    ) #
 
- # dplyr::n_distinct(sc_demog2$upi) # 524810
-  #dplyr::n_distinct(sc_demog2$social_care_id) # 636404
+  # dplyr::n_distinct(sc_demog2$upi) # 524810
+  # dplyr::n_distinct(sc_demog2$social_care_id) # 636404
 
   sc_demog3 <- sc_demog2 %>%
     dplyr::filter(keep == 1) %>% # filter to only keep latest record for sc id and chi
@@ -74,11 +78,11 @@ process_lookup_sc_demographics <- function(
     dplyr::group_by(upi)
 
   # check to make sure all cases of chi are still there
- # dplyr::n_distinct(sc_demog3$upi) # 524810
-  #dplyr::n_distinct(sc_demog3$social_care_id) # 636404
+  # dplyr::n_distinct(sc_demog3$upi) # 524810
+  # dplyr::n_distinct(sc_demog3$social_care_id) # 636404
 
 
-# postcodes ---------------------------------------------------------------
+  # postcodes ---------------------------------------------------------------
 
   # count number of na postcodes
   na_postcodes1 <- sc_demog3 %>%
@@ -133,10 +137,10 @@ process_lookup_sc_demographics <- function(
 
   sc_demog_lookup <- sc_demog4 %>%
     dplyr::filter(keep == 1) %>% # filter to only keep latest record for sc id and chi
-      dplyr::select(-period, -latest_record_flag, -latest, -latest_sc_id, -keep) %>%
+    dplyr::select(-period, -latest_record_flag, -latest, -latest_sc_id, -keep) %>%
     dplyr::group_by(upi, social_care_id) %>%
     dplyr::distinct() %>%
-   # dplyr::ungroup()
+    # dplyr::ungroup()
     # group by sending location and ID
     dplyr::group_by(.data$sending_location, .data$social_care_id) %>%
     # arrange so latest submissions are last
diff --git a/R/read_lookup_sc_demographics.R b/R/read_lookup_sc_demographics.R
index af8327a3c..fe9a5e71f 100644
--- a/R/read_lookup_sc_demographics.R
+++ b/R/read_lookup_sc_demographics.R
@@ -22,17 +22,16 @@ read_lookup_sc_demographics <- function(sc_connection = phs_db_connection(dsn =
       "chi_postcode",
       "submitted_postcode",
       "chi_gender_code"
-    ) %>% dplyr::collect() %>%
+    ) %>%
+    dplyr::collect() %>%
     dplyr::mutate(
       dplyr::across(c(
         "latest_record_flag",
         "sending_location",
         "chi_gender_code"
       ), as.integer)
-    )%>%
+    ) %>%
     dplyr::distinct()
 
   return(sc_demog)
 }
-
-

From ff11e58f8232ae385b94ea2d2eab4c6975a01922 Mon Sep 17 00:00:00 2001
From: marjom02 <megan.mcnicol2@nhs.scot>
Date: Fri, 12 Jan 2024 14:11:31 +0000
Subject: [PATCH 05/15] had a github error? Not sure what happened but
 committing first draft of sc demographics

---
 R/process_lookup_sc_demographics.R | 104 +++++++++++++----------------
 1 file changed, 48 insertions(+), 56 deletions(-)

diff --git a/R/process_lookup_sc_demographics.R b/R/process_lookup_sc_demographics.R
index 2df526fac..4080a5099 100644
--- a/R/process_lookup_sc_demographics.R
+++ b/R/process_lookup_sc_demographics.R
@@ -29,66 +29,54 @@ process_lookup_sc_demographics <- function(
 
 
   #  Fill in missing data and flag latest cases to keep ---------------------------------------
-  sc_demog1 <- sc_demog %>%
-    # sc_demog <- data %>%
-    dplyr::rename(
-      upi = chi_upi,
-      gender = chi_gender_code,
-      dob = chi_date_of_birth
-    ) %>%
+  sc_demog <- data %>%
+    dplyr::rename(chi = chi_upi,
+                  gender = chi_gender_code,
+                  dob = chi_date_of_birth) %>%
     # fill in missing demographic details
     dplyr::arrange(period, social_care_id) %>%
     dplyr::group_by(social_care_id, sending_location) %>%
-    tidyr::fill(upi, .direction = ("updown")) %>%
+    tidyr::fill(chi, .direction = ("updown")) %>%
     tidyr::fill(dob, .direction = ("updown")) %>%
     tidyr::fill(date_of_death, .direction = ("updown")) %>%
-    tidyr::fill(gender, .direction = ("updown")) %>%
+    tidyr::fill(gender , .direction = ("updown")) %>%
     tidyr::fill(chi_postcode, .direction = ("updown")) %>%
     tidyr::fill(submitted_postcode, .direction = ("updown")) %>%
     # format postcodes using `phsmethods`
-    dplyr::mutate(dplyr::across(tidyselect::contains("postcode"), ~ phsmethods::format_postcode(.x, format = "pc7"))) # are sc postcodes even used anywhere?
+    dplyr::mutate(dplyr::across(tidyselect::contains("postcode"), ~ phsmethods::format_postcode(.x, format = "pc7")))# are sc postcodes even used anywhere?
+
 
-  # 4924132
-  # 4946071
   # flag unique cases of chi and sc_id, and flag the latest record (sc_demographics latest flag is not accurate)
-  sc_demog2 <- sc_demog1 %>%
-    dplyr::group_by(upi) %>%
-    dplyr::mutate(latest = dplyr::last(period)) %>% # flag latest period for chi
-    dplyr::group_by(upi, social_care_id) %>%
+  sc_demog <- sc_demog %>%
+    dplyr::group_by(chi) %>%
+    dplyr::mutate(latest = dplyr::last(period)) %>%  # flag latest period for chi
+    dplyr::group_by(chi, social_care_id) %>%
     dplyr::mutate(latest_sc_id = dplyr::last(period)) %>% # flag latest period for social care
-    dplyr::group_by(upi) %>%
-    dplyr::mutate(
-      latest_flag = ifelse(latest == period | is.na(upi), 1, 0),
-      keep = ifelse(latest_sc_id == period, 1, 0)
-    ) #
-
-  # dplyr::n_distinct(sc_demog2$upi) # 524810
-  # dplyr::n_distinct(sc_demog2$social_care_id) # 636404
-
-  sc_demog3 <- sc_demog2 %>%
-    dplyr::filter(keep == 1) %>% # filter to only keep latest record for sc id and chi
-    dplyr::group_by(upi, social_care_id) %>%
-    dplyr::select(-period, -latest_record_flag, -latest, -latest_sc_id, -keep) %>%
-    dplyr::distinct() %>%
+    dplyr::group_by(chi) %>%
+    dplyr::mutate(last_sc_id = dplyr::last(social_care_id)) %>%
+    dplyr::mutate(latest_flag = ifelse((latest == period & last_sc_id == social_care_id) | is.na(chi), 1, 0),
+                  keep = ifelse(latest_sc_id == period, 1, 0)) %>%
     dplyr::ungroup()
 
-  test <- sc_demog3 %>%
-    dplyr::group_by(social_care_id, sending_location) %>%
-    dplyr::mutate(count_scid = dplyr::n()) %>%
-    dplyr::group_by(upi)
+  dplyr::n_distinct(sc_demog2$chi) # 524810
+  dplyr::n_distinct(sc_demog2$social_care_id) # 636404
+
+  sc_demog <- sc_demog %>%
+    dplyr::select(-period, -latest_record_flag, -latest, -last_sc_id, -latest_sc_id) %>%
+    dplyr::distinct()
 
   # check to make sure all cases of chi are still there
-  # dplyr::n_distinct(sc_demog3$upi) # 524810
-  # dplyr::n_distinct(sc_demog3$social_care_id) # 636404
+  dplyr::n_distinct(sc_demog3$chi) # 524810
+  dplyr::n_distinct(sc_demog3$social_care_id) # 636404
 
 
   # postcodes ---------------------------------------------------------------
 
   # count number of na postcodes
-  na_postcodes1 <- sc_demog3 %>%
+  na_postcodes <- sc_demog3 %>%
     dplyr::count(dplyr::across(tidyselect::contains("postcode"), ~ is.na(.x)))
 
-  sc_demog4 <- sc_demog3 %>%
+  sc_demog <- sc_demog %>%
     # remove dummy postcodes invalid postcodes missed by regex check
     dplyr::mutate(dplyr::across(
       tidyselect::ends_with("_postcode"),
@@ -102,47 +90,46 @@ process_lookup_sc_demographics <- function(
     dplyr::select(
       "sending_location",
       "social_care_id",
-      "upi",
+      "chi",
       "gender",
       "dob",
       "date_of_death",
       "submitted_postcode",
       "chi_postcode",
-      "period", "latest_record_flag", "latest", "latest_sc_id", "keep",
+      "keep",
       "latest_flag"
     ) %>%
     # check if submitted_postcode matches with postcode lookup
     dplyr::mutate(
-      valid_pc = .data$submitted_postcode %in% valid_spd_postcodes
+      valid_pc_submitted = .data$submitted_postcode %in% valid_spd_postcodes,
+      valid_pc_chi = .data$chi_postcode %in% valid_spd_postcodes
     ) %>%
     # use submitted_postcode if valid, otherwise use chi_postcode
     dplyr::mutate(postcode = dplyr::case_when(
-      (!is.na(.data$submitted_postcode) & .data$valid_pc) ~ .data$submitted_postcode,
-      (is.na(.data$submitted_postcode) & !.data$valid_pc) ~ .data$chi_postcode
+      (!is.na(.data$chi_postcode) & .data$valid_pc_chi) ~ .data$chi_postcode,
+      ((is.na(.data$chi_postcode) | !(.data$valid_pc_chi)) & !(is.na(.data$submitted_postcode)) & .data$valid_pc_submitted) ~ .data$submitted_postcode,
+      (is.na(.data$submitted_postcode) & !.data$valid_pc_submitted) ~ .data$chi_postcode
     )) %>%
     dplyr::mutate(postcode_type = dplyr::case_when(
-      (!is.na(.data$submitted_postcode) & .data$valid_pc) ~ "submitted",
-      (is.na(.data$submitted_postcode) & !.data$valid_pc) ~ "chi",
-      (is.na(.data$submitted_postcode) & is.na(.data$chi_postcode)) ~ "missing"
+      (postcode == chi_postcode) ~ "chi",
+      (postcode == submitted_postcode )~ "submitted",
+      (is.na(.data$submitted_postcode) & is.na(.data$chi_postcode) | is.na(.data$postcode)) ~ "missing"
     ))
 
   # Check where the postcodes are coming from
-  sc_demog4 %>%
+  sc_demog %>%
     dplyr::count(.data$postcode_type)
 
   # count number of replaced postcode - compare with count above
-  na_replaced_postcodes <- sc_demog4 %>%
+  na_replaced_postcodes <- sc_demog %>%
     dplyr::count(dplyr::across(tidyselect::ends_with("_postcode"), ~ is.na(.x)))
 
-
-  sc_demog_lookup <- sc_demog4 %>%
+  sc_demog_lookup <- sc_demog %>%
     dplyr::filter(keep == 1) %>% # filter to only keep latest record for sc id and chi
-    dplyr::select(-period, -latest_record_flag, -latest, -latest_sc_id, -keep) %>%
-    dplyr::group_by(upi, social_care_id) %>%
+    dplyr::select(-postcode_type, -valid_pc_submitted, -valid_pc_chi, -submitted_postcode, -chi_postcode) %>%
     dplyr::distinct() %>%
-    # dplyr::ungroup()
     # group by sending location and ID
-    dplyr::group_by(.data$sending_location, .data$social_care_id) %>%
+    dplyr::group_by(.data$sending_location, .data$chi, .data$social_care_id, .data$latest_flag) %>%
     # arrange so latest submissions are last
     dplyr::arrange(
       .data$sending_location,
@@ -151,13 +138,18 @@ process_lookup_sc_demographics <- function(
     ) %>%
     # summarise to select the last (non NA) submission
     dplyr::summarise(
-      chi = dplyr::last(.data$upi),
       gender = dplyr::last(.data$gender),
       dob = dplyr::last(.data$dob),
-      postcode = dplyr::last(.data$postcode)
+      postcode = dplyr::last(.data$postcode),
+      date_of_death = dplyr::last(.data$date_of_death)
     ) %>%
     dplyr::ungroup()
 
+  # check to make sure all cases of chi are still there
+  dplyr::n_distinct(sc_demog_lookup$chi) # 524810
+  dplyr::n_distinct(sc_demog_lookup$social_care_id) # 636404
+
+
   if (write_to_disk) {
     write_file(
       sc_demog_lookup,

From 7e3164a2679e4c65bd82e8124b56fb6f5b9f039f Mon Sep 17 00:00:00 2001
From: SwiftySalmon <SwiftySalmon@users.noreply.github.com>
Date: Fri, 12 Jan 2024 14:13:12 +0000
Subject: [PATCH 06/15] Style code

---
 R/process_lookup_sc_demographics.R | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/R/process_lookup_sc_demographics.R b/R/process_lookup_sc_demographics.R
index 4080a5099..1cb38c024 100644
--- a/R/process_lookup_sc_demographics.R
+++ b/R/process_lookup_sc_demographics.R
@@ -30,32 +30,36 @@ process_lookup_sc_demographics <- function(
 
   #  Fill in missing data and flag latest cases to keep ---------------------------------------
   sc_demog <- data %>%
-    dplyr::rename(chi = chi_upi,
-                  gender = chi_gender_code,
-                  dob = chi_date_of_birth) %>%
+    dplyr::rename(
+      chi = chi_upi,
+      gender = chi_gender_code,
+      dob = chi_date_of_birth
+    ) %>%
     # fill in missing demographic details
     dplyr::arrange(period, social_care_id) %>%
     dplyr::group_by(social_care_id, sending_location) %>%
     tidyr::fill(chi, .direction = ("updown")) %>%
     tidyr::fill(dob, .direction = ("updown")) %>%
     tidyr::fill(date_of_death, .direction = ("updown")) %>%
-    tidyr::fill(gender , .direction = ("updown")) %>%
+    tidyr::fill(gender, .direction = ("updown")) %>%
     tidyr::fill(chi_postcode, .direction = ("updown")) %>%
     tidyr::fill(submitted_postcode, .direction = ("updown")) %>%
     # format postcodes using `phsmethods`
-    dplyr::mutate(dplyr::across(tidyselect::contains("postcode"), ~ phsmethods::format_postcode(.x, format = "pc7")))# are sc postcodes even used anywhere?
+    dplyr::mutate(dplyr::across(tidyselect::contains("postcode"), ~ phsmethods::format_postcode(.x, format = "pc7"))) # are sc postcodes even used anywhere?
 
 
   # flag unique cases of chi and sc_id, and flag the latest record (sc_demographics latest flag is not accurate)
   sc_demog <- sc_demog %>%
     dplyr::group_by(chi) %>%
-    dplyr::mutate(latest = dplyr::last(period)) %>%  # flag latest period for chi
+    dplyr::mutate(latest = dplyr::last(period)) %>% # flag latest period for chi
     dplyr::group_by(chi, social_care_id) %>%
     dplyr::mutate(latest_sc_id = dplyr::last(period)) %>% # flag latest period for social care
     dplyr::group_by(chi) %>%
     dplyr::mutate(last_sc_id = dplyr::last(social_care_id)) %>%
-    dplyr::mutate(latest_flag = ifelse((latest == period & last_sc_id == social_care_id) | is.na(chi), 1, 0),
-                  keep = ifelse(latest_sc_id == period, 1, 0)) %>%
+    dplyr::mutate(
+      latest_flag = ifelse((latest == period & last_sc_id == social_care_id) | is.na(chi), 1, 0),
+      keep = ifelse(latest_sc_id == period, 1, 0)
+    ) %>%
     dplyr::ungroup()
 
   dplyr::n_distinct(sc_demog2$chi) # 524810
@@ -112,7 +116,7 @@ process_lookup_sc_demographics <- function(
     )) %>%
     dplyr::mutate(postcode_type = dplyr::case_when(
       (postcode == chi_postcode) ~ "chi",
-      (postcode == submitted_postcode )~ "submitted",
+      (postcode == submitted_postcode) ~ "submitted",
       (is.na(.data$submitted_postcode) & is.na(.data$chi_postcode) | is.na(.data$postcode)) ~ "missing"
     ))
 

From 78987e01d436d65b08b94c447f9e11948e803e79 Mon Sep 17 00:00:00 2001
From: marjom02 <megan.mcnicol2@nhs.scot>
Date: Tue, 16 Jan 2024 18:04:18 +0000
Subject: [PATCH 07/15] first draft sds. No major changes - only how
 demographics is matched on and how latest social care id is selected

---
 R/fix_sc_dates.R                   |  8 ++++----
 R/process_lookup_sc_demographics.R | 14 ++------------
 R/process_sc_all_sds.R             | 22 +++++++++++++++-------
 R/read_sc_all_sds.R                |  8 +++-----
 R/replace_sc_id_with_latest.R      | 25 ++++++++-----------------
 5 files changed, 32 insertions(+), 45 deletions(-)

diff --git a/R/fix_sc_dates.R b/R/fix_sc_dates.R
index c636980a6..117acbaab 100644
--- a/R/fix_sc_dates.R
+++ b/R/fix_sc_dates.R
@@ -9,7 +9,7 @@
 #' @return A date vector with replaced end dates
 fix_sc_start_dates <- function(start_date, period_start) {
   # Fix sds_start_date is missing by setting start_date to be the start of
-  # financial year
+  # financial period
   start_date <- dplyr::if_else(
     is.na(start_date),
     period_start,
@@ -30,12 +30,12 @@ fix_sc_start_dates <- function(start_date, period_start) {
 #' @param period Social care latest submission period.
 #'
 #' @return A date vector with replaced end dates
-fix_sc_end_dates <- function(start_date, end_date, period) {
+fix_sc_end_dates <- function(start_date, end_date, period_end_date) {
   # Fix sds_end_date is earlier than sds_start_date by setting end_date to be
   # the end of financial year
   end_date <- dplyr::if_else(
     start_date > end_date,
-    end_fy(year = stringr::str_sub(period, 1L, 4L), "alternate"),
+    period_end_date,
     end_date
   )
 
@@ -57,7 +57,7 @@ fix_sc_end_dates <- function(start_date, end_date, period) {
 #' @return A date vector with replaced end dates
 fix_sc_missing_end_dates <- function(end_date, period_end) {
   # Fix sds_end_date is earlier than sds_start_date by setting end_date to be
-  # the end of financial year
+  # the end of financial period
   end_date <- dplyr::if_else(
     is.na(end_date),
     period_end,
diff --git a/R/process_lookup_sc_demographics.R b/R/process_lookup_sc_demographics.R
index 4080a5099..7002ac18b 100644
--- a/R/process_lookup_sc_demographics.R
+++ b/R/process_lookup_sc_demographics.R
@@ -56,24 +56,14 @@ process_lookup_sc_demographics <- function(
     dplyr::mutate(last_sc_id = dplyr::last(social_care_id)) %>%
     dplyr::mutate(latest_flag = ifelse((latest == period & last_sc_id == social_care_id) | is.na(chi), 1, 0),
                   keep = ifelse(latest_sc_id == period, 1, 0)) %>%
-    dplyr::ungroup()
-
-  dplyr::n_distinct(sc_demog2$chi) # 524810
-  dplyr::n_distinct(sc_demog2$social_care_id) # 636404
-
-  sc_demog <- sc_demog %>%
+    dplyr::ungroup() %>%
     dplyr::select(-period, -latest_record_flag, -latest, -last_sc_id, -latest_sc_id) %>%
     dplyr::distinct()
 
-  # check to make sure all cases of chi are still there
-  dplyr::n_distinct(sc_demog3$chi) # 524810
-  dplyr::n_distinct(sc_demog3$social_care_id) # 636404
-
-
   # postcodes ---------------------------------------------------------------
 
   # count number of na postcodes
-  na_postcodes <- sc_demog3 %>%
+  na_postcodes <- sc_demog %>%
     dplyr::count(dplyr::across(tidyselect::contains("postcode"), ~ is.na(.x)))
 
   sc_demog <- sc_demog %>%
diff --git a/R/process_sc_all_sds.R b/R/process_sc_all_sds.R
index f9ca52f24..073205d0c 100644
--- a/R/process_sc_all_sds.R
+++ b/R/process_sc_all_sds.R
@@ -14,15 +14,19 @@ process_sc_all_sds <- function(
     data,
     sc_demog_lookup,
     write_to_disk = TRUE) {
+
   # Match on demographics data (chi, gender, dob and postcode)
-  matched_sds_data <- data %>%
-    dplyr::left_join(
-      sc_demog_lookup,
+  matched_sds_data <- sc_sds %>% # data %>%
+    dplyr::filter(.data$sds_start_date_after_period_end_date != 1) %>%
+    dplyr::full_join(
+      sc_demographics_lookup, # sc_demog_lookup,
       by = c("sending_location", "social_care_id")
     ) %>%
     # when multiple social_care_id from sending_location for single CHI
     # replace social_care_id with latest
-    replace_sc_id_with_latest()
+    replace_sc_id_with_latest() %>%
+    dplyr::select(-latest_sc_id, -latest_flag, -sds_start_date_after_period_end_date) %>%
+    dplyr::distinct()
 
   # Data Cleaning ---------------------------------------
   sds_full_clean <- matched_sds_data %>%
@@ -50,7 +54,7 @@ process_sc_all_sds <- function(
         .data$sds_start_date,
         .data$sds_period_start_date
       ),
-      # If SDS end date is missing, assign end of FY
+      # If SDS end date is missing, assign end of financial period
       sds_end_date = fix_sc_missing_end_dates(
         .data$sds_end_date,
         .data$sds_period_end_date
@@ -59,14 +63,17 @@ process_sc_all_sds <- function(
       sds_end_date = fix_sc_end_dates(
         .data$sds_start_date,
         .data$sds_end_date,
-        .data$period
+        .data$sds_period_end_date
       )
     ) %>%
+    dplyr::select(-sds_period_start_date, -sds_period_end_date,
+                  -sds_start_date_after_end_date) %>%
     # rename for matching source variables
     dplyr::rename(
       record_keydate1 = .data$sds_start_date,
       record_keydate2 = .data$sds_end_date
     ) %>%
+      dplyr::distinct() %>%
     # Pivot longer on sds option variables
     tidyr::pivot_longer(
       cols = tidyselect::contains("sds_option_"),
@@ -103,6 +110,7 @@ process_sc_all_sds <- function(
     ) %>%
     dplyr::arrange(.data$period,
       .data$record_keydate1,
+      .data$record_keydate2,
       .by_group = TRUE
     ) %>%
     # Create a flag for episodes that are going to be merged
@@ -111,7 +119,7 @@ process_sc_all_sds <- function(
       distinct_episode = (.data$record_keydate1 > dplyr::lag(.data$record_keydate2)) %>%
         tidyr::replace_na(TRUE),
       episode_counter = cumsum(.data$distinct_episode)
-    ) %>%
+    ) #%>%
     # Group by episode counter and merge episodes
     dplyr::group_by(.data$episode_counter, .add = TRUE) %>%
     dplyr::summarise(
diff --git a/R/read_sc_all_sds.R b/R/read_sc_all_sds.R
index 18c5b52ec..032311da6 100644
--- a/R/read_sc_all_sds.R
+++ b/R/read_sc_all_sds.R
@@ -22,9 +22,8 @@ read_sc_all_sds <- function(sc_dvprod_connection = phs_db_connection(dsn = "DVPR
       "sds_option_1",
       "sds_option_2",
       "sds_option_3",
-      "sds_start_date_after_end_date",
-      "sds_start_date_after_period_end_date",
-      "sds_end_date_not_within_period"
+      "sds_start_date_after_end_date",# get fixed
+      "sds_start_date_after_period_end_date" # get removed
     ) %>%
     dplyr::collect() %>%
     dplyr::distinct() %>%
@@ -33,8 +32,7 @@ read_sc_all_sds <- function(sc_dvprod_connection = phs_db_connection(dsn = "DVPR
       "sds_option_1",
       "sds_option_2",
       "sds_option_3"
-    ), as.integer)) %>%
-    dplyr::filter(.data$sds_start_date_after_period_end_date != 1)
+    ), as.integer))
 
   return(sds_full_data)
 }
diff --git a/R/replace_sc_id_with_latest.R b/R/replace_sc_id_with_latest.R
index 73c1a3706..2c32bbb93 100644
--- a/R/replace_sc_id_with_latest.R
+++ b/R/replace_sc_id_with_latest.R
@@ -7,33 +7,23 @@ replace_sc_id_with_latest <- function(data) {
   # Check for required variables
   check_variables_exist(
     data,
-    c("sending_location", "social_care_id", "chi", "period")
+    c("sending_location", "social_care_id", "chi", "latest_flag")
   )
 
   # select variables we need
   filter_data <- data %>%
     dplyr::select(
-      "sending_location", "social_care_id", "chi", "period"
+      "sending_location", "social_care_id", "chi", "latest_flag"
     ) %>%
-    dplyr::filter(!(is.na(.data$chi)))
+    dplyr::filter(!(is.na(.data$chi))) %>%
+    dplyr::distinct()
 
   change_sc_id <- filter_data %>%
-    # Sort (by sending_location, chi and period) for unique chi/sending location
-    dplyr::arrange(
-      .data$sending_location,
-      .data$chi,
-      dplyr::desc(.data$period)
-    ) %>%
-    # Find the latest sc_id for each chi/sending location by keeping latest period
-    dplyr::distinct(
-      .data$sending_location,
-      .data$chi,
-      .keep_all = TRUE
-    ) %>%
+    dplyr::filter(latest_flag == 1) %>%
     # Rename for latest sc id
     dplyr::rename(latest_sc_id = "social_care_id") %>%
-    # drop period for matching
-    dplyr::select(-"period")
+    # drop latest_flag for matching
+    dplyr::select(-"latest_flag")
 
   return_data <- change_sc_id %>%
     # Match back onto data
@@ -41,6 +31,7 @@ replace_sc_id_with_latest <- function(data) {
       by = c("sending_location", "chi"),
       multiple = "all"
     ) %>%
+    dplyr::filter(!(is.na(period))) %>%
     # Overwrite sc id with the latest
     dplyr::mutate(
       social_care_id = dplyr::if_else(

From 6a8423347df8e391e227f6fae1d3a7f4b70fbba9 Mon Sep 17 00:00:00 2001
From: SwiftySalmon <SwiftySalmon@users.noreply.github.com>
Date: Wed, 17 Jan 2024 10:23:54 +0000
Subject: [PATCH 08/15] Update documentation

---
 DESCRIPTION             | 2 +-
 man/fix_sc_end_dates.Rd | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 5123289dd..4bb0c6f18 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -73,4 +73,4 @@ Encoding: UTF-8
 Language: en-GB
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.0
diff --git a/man/fix_sc_end_dates.Rd b/man/fix_sc_end_dates.Rd
index 1bf808bea..041751319 100644
--- a/man/fix_sc_end_dates.Rd
+++ b/man/fix_sc_end_dates.Rd
@@ -4,7 +4,7 @@
 \alias{fix_sc_end_dates}
 \title{Fix sc end dates}
 \usage{
-fix_sc_end_dates(start_date, end_date, period)
+fix_sc_end_dates(start_date, end_date, period_end_date)
 }
 \arguments{
 \item{start_date}{A vector containing dates.}

From d7f2ee11c17053d58301275c76836e081ac4cf85 Mon Sep 17 00:00:00 2001
From: marjom02 <megan.mcnicol2@nhs.scot>
Date: Tue, 23 Jan 2024 16:47:30 +0000
Subject: [PATCH 09/15] demographics - add sending location to group by

---
 R/process_lookup_sc_demographics.R |  8 ++++----
 R/process_sc_all_sds.R             | 17 +++++++++--------
 R/replace_sc_id_with_latest.R      |  4 ++--
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/R/process_lookup_sc_demographics.R b/R/process_lookup_sc_demographics.R
index da5e839fb..7c6e01766 100644
--- a/R/process_lookup_sc_demographics.R
+++ b/R/process_lookup_sc_demographics.R
@@ -50,11 +50,11 @@ process_lookup_sc_demographics <- function(
 
   # flag unique cases of chi and sc_id, and flag the latest record (sc_demographics latest flag is not accurate)
   sc_demog <- sc_demog %>%
-    dplyr::group_by(chi) %>%
+    dplyr::group_by(chi, sending_location) %>%
     dplyr::mutate(latest = dplyr::last(period)) %>% # flag latest period for chi
-    dplyr::group_by(chi, social_care_id) %>%
+    dplyr::group_by(chi, social_care_id, sending_location) %>%
     dplyr::mutate(latest_sc_id = dplyr::last(period)) %>% # flag latest period for social care
-    dplyr::group_by(chi) %>%
+    dplyr::group_by(chi, sending_location) %>%
     dplyr::mutate(last_sc_id = dplyr::last(social_care_id)) %>%
     dplyr::mutate(
       latest_flag = ifelse((latest == period & last_sc_id == social_care_id) | is.na(chi), 1, 0),
@@ -63,7 +63,7 @@ process_lookup_sc_demographics <- function(
     dplyr::ungroup()
 
   sc_demog <- sc_demog %>%
-  dplyr::select(-period, -latest_record_flag, -latest, -last_sc_id, -latest_sc_id) %>%
+    dplyr::select(-period, -latest_record_flag, -latest, -last_sc_id, -latest_sc_id) %>%
     dplyr::distinct()
 
   # postcodes ---------------------------------------------------------------
diff --git a/R/process_sc_all_sds.R b/R/process_sc_all_sds.R
index 073205d0c..453db3e40 100644
--- a/R/process_sc_all_sds.R
+++ b/R/process_sc_all_sds.R
@@ -14,12 +14,11 @@ process_sc_all_sds <- function(
     data,
     sc_demog_lookup,
     write_to_disk = TRUE) {
-
   # Match on demographics data (chi, gender, dob and postcode)
-  matched_sds_data <- sc_sds %>% # data %>%
+  matched_sds_data <- data %>% #
     dplyr::filter(.data$sds_start_date_after_period_end_date != 1) %>%
-    dplyr::full_join(
-      sc_demographics_lookup, # sc_demog_lookup,
+    dplyr::right_join(
+      sc_demog_lookup,
       by = c("sending_location", "social_care_id")
     ) %>%
     # when multiple social_care_id from sending_location for single CHI
@@ -66,14 +65,16 @@ process_sc_all_sds <- function(
         .data$sds_period_end_date
       )
     ) %>%
-    dplyr::select(-sds_period_start_date, -sds_period_end_date,
-                  -sds_start_date_after_end_date) %>%
+    dplyr::select(
+      -sds_period_start_date, -sds_period_end_date,
+      -sds_start_date_after_end_date
+    ) %>%
     # rename for matching source variables
     dplyr::rename(
       record_keydate1 = .data$sds_start_date,
       record_keydate2 = .data$sds_end_date
     ) %>%
-      dplyr::distinct() %>%
+    dplyr::distinct() %>%
     # Pivot longer on sds option variables
     tidyr::pivot_longer(
       cols = tidyselect::contains("sds_option_"),
@@ -119,7 +120,7 @@ process_sc_all_sds <- function(
       distinct_episode = (.data$record_keydate1 > dplyr::lag(.data$record_keydate2)) %>%
         tidyr::replace_na(TRUE),
       episode_counter = cumsum(.data$distinct_episode)
-    ) #%>%
+    ) %>%
     # Group by episode counter and merge episodes
     dplyr::group_by(.data$episode_counter, .add = TRUE) %>%
     dplyr::summarise(
diff --git a/R/replace_sc_id_with_latest.R b/R/replace_sc_id_with_latest.R
index 2c32bbb93..4ae3c06ed 100644
--- a/R/replace_sc_id_with_latest.R
+++ b/R/replace_sc_id_with_latest.R
@@ -11,7 +11,7 @@ replace_sc_id_with_latest <- function(data) {
   )
 
   # select variables we need
-  filter_data <- data %>%
+  filter_data <- data %>% # matched_sds_data %>% #
     dplyr::select(
       "sending_location", "social_care_id", "chi", "latest_flag"
     ) %>%
@@ -27,7 +27,7 @@ replace_sc_id_with_latest <- function(data) {
 
   return_data <- change_sc_id %>%
     # Match back onto data
-    dplyr::right_join(data,
+    dplyr::right_join(data, # matched_sds_data,#
       by = c("sending_location", "chi"),
       multiple = "all"
     ) %>%

From ab2f5f6160e3e2f8d6b0754b863ed19c3c61133a Mon Sep 17 00:00:00 2001
From: SwiftySalmon <SwiftySalmon@users.noreply.github.com>
Date: Tue, 23 Jan 2024 16:56:25 +0000
Subject: [PATCH 10/15] Style code

---
 R/read_sc_all_sds.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/read_sc_all_sds.R b/R/read_sc_all_sds.R
index 032311da6..ab9bb20e1 100644
--- a/R/read_sc_all_sds.R
+++ b/R/read_sc_all_sds.R
@@ -22,7 +22,7 @@ read_sc_all_sds <- function(sc_dvprod_connection = phs_db_connection(dsn = "DVPR
       "sds_option_1",
       "sds_option_2",
       "sds_option_3",
-      "sds_start_date_after_end_date",# get fixed
+      "sds_start_date_after_end_date", # get fixed
       "sds_start_date_after_period_end_date" # get removed
     ) %>%
     dplyr::collect() %>%

From 245b8c9c2de48d3dc633df1b1a1ba1106d8e3ebb Mon Sep 17 00:00:00 2001
From: Jennit07 <Jennit07@users.noreply.github.com>
Date: Wed, 24 Jan 2024 13:25:39 +0000
Subject: [PATCH 11/15] Update documentation

---
 DESCRIPTION                    | 2 +-
 man/calculate_stay.Rd          | 4 ++--
 man/compute_mid_year_age.Rd    | 4 ++--
 man/convert_date_to_numeric.Rd | 4 ++--
 man/convert_numeric_to_date.Rd | 4 ++--
 man/end_fy.Rd                  | 2 +-
 man/end_fy_quarter.Rd          | 2 +-
 man/end_next_fy_quarter.Rd     | 4 ++--
 man/fy_interval.Rd             | 4 ++--
 man/is_date_in_fyyear.Rd       | 4 ++--
 man/last_date_month.Rd         | 4 ++--
 man/midpoint_fy.Rd             | 4 ++--
 man/next_fy.Rd                 | 4 ++--
 man/start_fy.Rd                | 2 +-
 man/start_fy_quarter.Rd        | 2 +-
 man/start_next_fy_quarter.Rd   | 6 +++---
 16 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 4bb0c6f18..3a75852e2 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -73,4 +73,4 @@ Encoding: UTF-8
 Language: en-GB
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.0
+RoxygenNote: 7.3.1
diff --git a/man/calculate_stay.Rd b/man/calculate_stay.Rd
index 43b7bd166..5e9266b10 100644
--- a/man/calculate_stay.Rd
+++ b/man/calculate_stay.Rd
@@ -34,16 +34,16 @@ Other date functions:
 \code{\link{compute_mid_year_age}()},
 \code{\link{convert_date_to_numeric}()},
 \code{\link{convert_numeric_to_date}()},
-\code{\link{end_fy_quarter}()},
 \code{\link{end_fy}()},
+\code{\link{end_fy_quarter}()},
 \code{\link{end_next_fy_quarter}()},
 \code{\link{fy_interval}()},
 \code{\link{is_date_in_fyyear}()},
 \code{\link{last_date_month}()},
 \code{\link{midpoint_fy}()},
 \code{\link{next_fy}()},
-\code{\link{start_fy_quarter}()},
 \code{\link{start_fy}()},
+\code{\link{start_fy_quarter}()},
 \code{\link{start_next_fy_quarter}()}
 }
 \concept{date functions}
diff --git a/man/compute_mid_year_age.Rd b/man/compute_mid_year_age.Rd
index 142fa4aab..5a50370e0 100644
--- a/man/compute_mid_year_age.Rd
+++ b/man/compute_mid_year_age.Rd
@@ -31,16 +31,16 @@ Other date functions:
 \code{\link{calculate_stay}()},
 \code{\link{convert_date_to_numeric}()},
 \code{\link{convert_numeric_to_date}()},
-\code{\link{end_fy_quarter}()},
 \code{\link{end_fy}()},
+\code{\link{end_fy_quarter}()},
 \code{\link{end_next_fy_quarter}()},
 \code{\link{fy_interval}()},
 \code{\link{is_date_in_fyyear}()},
 \code{\link{last_date_month}()},
 \code{\link{midpoint_fy}()},
 \code{\link{next_fy}()},
-\code{\link{start_fy_quarter}()},
 \code{\link{start_fy}()},
+\code{\link{start_fy_quarter}()},
 \code{\link{start_next_fy_quarter}()}
 }
 \concept{date functions}
diff --git a/man/convert_date_to_numeric.Rd b/man/convert_date_to_numeric.Rd
index 5511fec84..b67eaa778 100644
--- a/man/convert_date_to_numeric.Rd
+++ b/man/convert_date_to_numeric.Rd
@@ -24,16 +24,16 @@ Other date functions:
 \code{\link{calculate_stay}()},
 \code{\link{compute_mid_year_age}()},
 \code{\link{convert_numeric_to_date}()},
-\code{\link{end_fy_quarter}()},
 \code{\link{end_fy}()},
+\code{\link{end_fy_quarter}()},
 \code{\link{end_next_fy_quarter}()},
 \code{\link{fy_interval}()},
 \code{\link{is_date_in_fyyear}()},
 \code{\link{last_date_month}()},
 \code{\link{midpoint_fy}()},
 \code{\link{next_fy}()},
-\code{\link{start_fy_quarter}()},
 \code{\link{start_fy}()},
+\code{\link{start_fy_quarter}()},
 \code{\link{start_next_fy_quarter}()}
 }
 \concept{date functions}
diff --git a/man/convert_numeric_to_date.Rd b/man/convert_numeric_to_date.Rd
index f786e0319..a09b7b9b9 100644
--- a/man/convert_numeric_to_date.Rd
+++ b/man/convert_numeric_to_date.Rd
@@ -24,16 +24,16 @@ Other date functions:
 \code{\link{calculate_stay}()},
 \code{\link{compute_mid_year_age}()},
 \code{\link{convert_date_to_numeric}()},
-\code{\link{end_fy_quarter}()},
 \code{\link{end_fy}()},
+\code{\link{end_fy_quarter}()},
 \code{\link{end_next_fy_quarter}()},
 \code{\link{fy_interval}()},
 \code{\link{is_date_in_fyyear}()},
 \code{\link{last_date_month}()},
 \code{\link{midpoint_fy}()},
 \code{\link{next_fy}()},
-\code{\link{start_fy_quarter}()},
 \code{\link{start_fy}()},
+\code{\link{start_fy_quarter}()},
 \code{\link{start_next_fy_quarter}()}
 }
 \concept{date functions}
diff --git a/man/end_fy.Rd b/man/end_fy.Rd
index 2925ffe60..6220f5f32 100644
--- a/man/end_fy.Rd
+++ b/man/end_fy.Rd
@@ -34,8 +34,8 @@ Other date functions:
 \code{\link{last_date_month}()},
 \code{\link{midpoint_fy}()},
 \code{\link{next_fy}()},
-\code{\link{start_fy_quarter}()},
 \code{\link{start_fy}()},
+\code{\link{start_fy_quarter}()},
 \code{\link{start_next_fy_quarter}()}
 }
 \concept{date functions}
diff --git a/man/end_fy_quarter.Rd b/man/end_fy_quarter.Rd
index 0efe9624a..26c439a04 100644
--- a/man/end_fy_quarter.Rd
+++ b/man/end_fy_quarter.Rd
@@ -33,8 +33,8 @@ Other date functions:
 \code{\link{last_date_month}()},
 \code{\link{midpoint_fy}()},
 \code{\link{next_fy}()},
-\code{\link{start_fy_quarter}()},
 \code{\link{start_fy}()},
+\code{\link{start_fy_quarter}()},
 \code{\link{start_next_fy_quarter}()}
 }
 \concept{date functions}
diff --git a/man/end_next_fy_quarter.Rd b/man/end_next_fy_quarter.Rd
index f9cc1720a..702446e82 100644
--- a/man/end_next_fy_quarter.Rd
+++ b/man/end_next_fy_quarter.Rd
@@ -26,15 +26,15 @@ Other date functions:
 \code{\link{compute_mid_year_age}()},
 \code{\link{convert_date_to_numeric}()},
 \code{\link{convert_numeric_to_date}()},
-\code{\link{end_fy_quarter}()},
 \code{\link{end_fy}()},
+\code{\link{end_fy_quarter}()},
 \code{\link{fy_interval}()},
 \code{\link{is_date_in_fyyear}()},
 \code{\link{last_date_month}()},
 \code{\link{midpoint_fy}()},
 \code{\link{next_fy}()},
-\code{\link{start_fy_quarter}()},
 \code{\link{start_fy}()},
+\code{\link{start_fy_quarter}()},
 \code{\link{start_next_fy_quarter}()}
 }
 \concept{date functions}
diff --git a/man/fy_interval.Rd b/man/fy_interval.Rd
index 12d1d36bb..00b9ea52c 100644
--- a/man/fy_interval.Rd
+++ b/man/fy_interval.Rd
@@ -26,15 +26,15 @@ Other date functions:
 \code{\link{compute_mid_year_age}()},
 \code{\link{convert_date_to_numeric}()},
 \code{\link{convert_numeric_to_date}()},
-\code{\link{end_fy_quarter}()},
 \code{\link{end_fy}()},
+\code{\link{end_fy_quarter}()},
 \code{\link{end_next_fy_quarter}()},
 \code{\link{is_date_in_fyyear}()},
 \code{\link{last_date_month}()},
 \code{\link{midpoint_fy}()},
 \code{\link{next_fy}()},
-\code{\link{start_fy_quarter}()},
 \code{\link{start_fy}()},
+\code{\link{start_fy_quarter}()},
 \code{\link{start_next_fy_quarter}()}
 }
 \concept{date functions}
diff --git a/man/is_date_in_fyyear.Rd b/man/is_date_in_fyyear.Rd
index 97a0f3639..e74bd5734 100644
--- a/man/is_date_in_fyyear.Rd
+++ b/man/is_date_in_fyyear.Rd
@@ -41,15 +41,15 @@ Other date functions:
 \code{\link{compute_mid_year_age}()},
 \code{\link{convert_date_to_numeric}()},
 \code{\link{convert_numeric_to_date}()},
-\code{\link{end_fy_quarter}()},
 \code{\link{end_fy}()},
+\code{\link{end_fy_quarter}()},
 \code{\link{end_next_fy_quarter}()},
 \code{\link{fy_interval}()},
 \code{\link{last_date_month}()},
 \code{\link{midpoint_fy}()},
 \code{\link{next_fy}()},
-\code{\link{start_fy_quarter}()},
 \code{\link{start_fy}()},
+\code{\link{start_fy_quarter}()},
 \code{\link{start_next_fy_quarter}()}
 }
 \concept{date functions}
diff --git a/man/last_date_month.Rd b/man/last_date_month.Rd
index f52305356..3d3b9544e 100644
--- a/man/last_date_month.Rd
+++ b/man/last_date_month.Rd
@@ -25,15 +25,15 @@ Other date functions:
 \code{\link{compute_mid_year_age}()},
 \code{\link{convert_date_to_numeric}()},
 \code{\link{convert_numeric_to_date}()},
-\code{\link{end_fy_quarter}()},
 \code{\link{end_fy}()},
+\code{\link{end_fy_quarter}()},
 \code{\link{end_next_fy_quarter}()},
 \code{\link{fy_interval}()},
 \code{\link{is_date_in_fyyear}()},
 \code{\link{midpoint_fy}()},
 \code{\link{next_fy}()},
-\code{\link{start_fy_quarter}()},
 \code{\link{start_fy}()},
+\code{\link{start_fy_quarter}()},
 \code{\link{start_next_fy_quarter}()}
 }
 \concept{date functions}
diff --git a/man/midpoint_fy.Rd b/man/midpoint_fy.Rd
index 7bac9b6b3..2363df773 100644
--- a/man/midpoint_fy.Rd
+++ b/man/midpoint_fy.Rd
@@ -27,15 +27,15 @@ Other date functions:
 \code{\link{compute_mid_year_age}()},
 \code{\link{convert_date_to_numeric}()},
 \code{\link{convert_numeric_to_date}()},
-\code{\link{end_fy_quarter}()},
 \code{\link{end_fy}()},
+\code{\link{end_fy_quarter}()},
 \code{\link{end_next_fy_quarter}()},
 \code{\link{fy_interval}()},
 \code{\link{is_date_in_fyyear}()},
 \code{\link{last_date_month}()},
 \code{\link{next_fy}()},
-\code{\link{start_fy_quarter}()},
 \code{\link{start_fy}()},
+\code{\link{start_fy_quarter}()},
 \code{\link{start_next_fy_quarter}()}
 }
 \concept{date functions}
diff --git a/man/next_fy.Rd b/man/next_fy.Rd
index 19e1193f4..7524c5f11 100644
--- a/man/next_fy.Rd
+++ b/man/next_fy.Rd
@@ -27,15 +27,15 @@ Other date functions:
 \code{\link{compute_mid_year_age}()},
 \code{\link{convert_date_to_numeric}()},
 \code{\link{convert_numeric_to_date}()},
-\code{\link{end_fy_quarter}()},
 \code{\link{end_fy}()},
+\code{\link{end_fy_quarter}()},
 \code{\link{end_next_fy_quarter}()},
 \code{\link{fy_interval}()},
 \code{\link{is_date_in_fyyear}()},
 \code{\link{last_date_month}()},
 \code{\link{midpoint_fy}()},
-\code{\link{start_fy_quarter}()},
 \code{\link{start_fy}()},
+\code{\link{start_fy_quarter}()},
 \code{\link{start_next_fy_quarter}()}
 }
 \concept{date functions}
diff --git a/man/start_fy.Rd b/man/start_fy.Rd
index 4996bfb72..9951af2ec 100644
--- a/man/start_fy.Rd
+++ b/man/start_fy.Rd
@@ -27,8 +27,8 @@ Other date functions:
 \code{\link{compute_mid_year_age}()},
 \code{\link{convert_date_to_numeric}()},
 \code{\link{convert_numeric_to_date}()},
-\code{\link{end_fy_quarter}()},
 \code{\link{end_fy}()},
+\code{\link{end_fy_quarter}()},
 \code{\link{end_next_fy_quarter}()},
 \code{\link{fy_interval}()},
 \code{\link{is_date_in_fyyear}()},
diff --git a/man/start_fy_quarter.Rd b/man/start_fy_quarter.Rd
index f5729dcb0..9936736a8 100644
--- a/man/start_fy_quarter.Rd
+++ b/man/start_fy_quarter.Rd
@@ -26,8 +26,8 @@ Other date functions:
 \code{\link{compute_mid_year_age}()},
 \code{\link{convert_date_to_numeric}()},
 \code{\link{convert_numeric_to_date}()},
-\code{\link{end_fy_quarter}()},
 \code{\link{end_fy}()},
+\code{\link{end_fy_quarter}()},
 \code{\link{end_next_fy_quarter}()},
 \code{\link{fy_interval}()},
 \code{\link{is_date_in_fyyear}()},
diff --git a/man/start_next_fy_quarter.Rd b/man/start_next_fy_quarter.Rd
index 098f0bf73..fdac297a7 100644
--- a/man/start_next_fy_quarter.Rd
+++ b/man/start_next_fy_quarter.Rd
@@ -26,15 +26,15 @@ Other date functions:
 \code{\link{compute_mid_year_age}()},
 \code{\link{convert_date_to_numeric}()},
 \code{\link{convert_numeric_to_date}()},
-\code{\link{end_fy_quarter}()},
 \code{\link{end_fy}()},
+\code{\link{end_fy_quarter}()},
 \code{\link{end_next_fy_quarter}()},
 \code{\link{fy_interval}()},
 \code{\link{is_date_in_fyyear}()},
 \code{\link{last_date_month}()},
 \code{\link{midpoint_fy}()},
 \code{\link{next_fy}()},
-\code{\link{start_fy_quarter}()},
-\code{\link{start_fy}()}
+\code{\link{start_fy}()},
+\code{\link{start_fy_quarter}()}
 }
 \concept{date functions}

From d0a215118dc8d26e7a834889f2d08cc975cf3e9f Mon Sep 17 00:00:00 2001
From: marjom02 <megan.mcnicol2@nhs.scot>
Date: Mon, 29 Jan 2024 16:15:57 +0000
Subject: [PATCH 12/15] Added ungroup()

---
 R/process_lookup_sc_demographics.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/process_lookup_sc_demographics.R b/R/process_lookup_sc_demographics.R
index 7c6e01766..96adc985e 100644
--- a/R/process_lookup_sc_demographics.R
+++ b/R/process_lookup_sc_demographics.R
@@ -44,6 +44,7 @@ process_lookup_sc_demographics <- function(
     tidyr::fill(gender, .direction = ("updown")) %>%
     tidyr::fill(chi_postcode, .direction = ("updown")) %>%
     tidyr::fill(submitted_postcode, .direction = ("updown")) %>%
+    dplyr::ungroup() %>%
     # format postcodes using `phsmethods`
     dplyr::mutate(dplyr::across(tidyselect::contains("postcode"), ~ phsmethods::format_postcode(.x, format = "pc7"))) # are sc postcodes even used anywhere?
 

From 5574f7455ba7023ff444f38788166c519e7323ab Mon Sep 17 00:00:00 2001
From: Jennit07 <67372904+Jennit07@users.noreply.github.com>
Date: Wed, 7 Feb 2024 14:24:53 +0000
Subject: [PATCH 13/15] Remove comments

---
 R/replace_sc_id_with_latest.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/replace_sc_id_with_latest.R b/R/replace_sc_id_with_latest.R
index 4ae3c06ed..14b44061a 100644
--- a/R/replace_sc_id_with_latest.R
+++ b/R/replace_sc_id_with_latest.R
@@ -27,7 +27,7 @@ replace_sc_id_with_latest <- function(data) {
 
   return_data <- change_sc_id %>%
     # Match back onto data
-    dplyr::right_join(data, # matched_sds_data,#
+    dplyr::right_join(data,
       by = c("sending_location", "chi"),
       multiple = "all"
     ) %>%

From 937943c8d94314b69d3795aad4edacf270c79f73 Mon Sep 17 00:00:00 2001
From: Jennit07 <67372904+Jennit07@users.noreply.github.com>
Date: Wed, 7 Feb 2024 14:25:08 +0000
Subject: [PATCH 14/15] Remove comments

---
 R/replace_sc_id_with_latest.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/replace_sc_id_with_latest.R b/R/replace_sc_id_with_latest.R
index 14b44061a..3a85b0dbc 100644
--- a/R/replace_sc_id_with_latest.R
+++ b/R/replace_sc_id_with_latest.R
@@ -11,7 +11,7 @@ replace_sc_id_with_latest <- function(data) {
   )
 
   # select variables we need
-  filter_data <- data %>% # matched_sds_data %>% #
+  filter_data <- data %>% 
     dplyr::select(
       "sending_location", "social_care_id", "chi", "latest_flag"
     ) %>%

From 5d7f03b89e1345e819fe65e20840d07a77a59961 Mon Sep 17 00:00:00 2001
From: Jennit07 <Jennit07@users.noreply.github.com>
Date: Wed, 7 Feb 2024 14:26:33 +0000
Subject: [PATCH 15/15] Style code

---
 R/replace_sc_id_with_latest.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/replace_sc_id_with_latest.R b/R/replace_sc_id_with_latest.R
index 3a85b0dbc..2c32bbb93 100644
--- a/R/replace_sc_id_with_latest.R
+++ b/R/replace_sc_id_with_latest.R
@@ -11,7 +11,7 @@ replace_sc_id_with_latest <- function(data) {
   )
 
   # select variables we need
-  filter_data <- data %>% 
+  filter_data <- data %>%
     dplyr::select(
       "sending_location", "social_care_id", "chi", "latest_flag"
     ) %>%