Merge branch 'sept-update-23' into add-sc-client-tests

Public-Health-Scotland · Sep 25, 2023 · 633d430 · 633d430
2 parents f103e7a + 15dde68
commit 633d430
Show file tree

Hide file tree

Showing 54 changed files with 726 additions and 76 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,6 +1,9 @@
 # Generated by roxygen2: do not edit by hand
 
 export("%>%")
+export(add_homelessness_date_flags)
+export(add_homelessness_flag)
+export(add_hri_variables)
 export(add_nsu_cohort)
 export(check_year_format)
 export(clean_up_free_text)
@@ -13,6 +16,7 @@ export(convert_hscp_to_hscpnames)
 export(convert_numeric_to_date)
 export(convert_sending_location_to_lca)
 export(convert_year_to_fyyear)
+export(create_homelessness_lookup)
 export(create_individual_file)
 export(create_service_use_cohorts)
 export(end_fy)
@@ -29,6 +33,7 @@ export(get_demographic_cohorts_path)
 export(get_dev_dir)
 export(get_dn_costs_path)
 export(get_dn_raw_costs_path)
+export(get_existing_data_for_tests)
 export(get_file_path)
 export(get_gp_ooh_costs_path)
 export(get_gp_ooh_raw_costs_path)
@@ -133,6 +138,7 @@ export(process_tests_sc_ch_episodes)
 export(process_tests_sc_client_lookup)
 export(process_tests_sc_demographics)
 export(process_tests_sds)
+export(produce_episode_file_tests)
 export(produce_source_extract_tests)
 export(produce_test_comparison)
 export(read_extract_acute)

diff --git a/NEWS.md b/NEWS.md
@@ -1,7 +1,9 @@
 # September 2023 Update - Unreleased
 * Update of 2017/18 onwards to include bug fixes within the files. 
-* New 2023/24 files. 
-* New NSU cohort for 2022/23 file. 
+* New 2023/24 files.
+  *No social care data available for new 2023/24 file.
+* New NSU cohort for 2022/23 file.
+* SPD and SIMD updated.
 * Re addition of:
   * HRIs in individual file.
   * Homelessness Flags.

diff --git a/R/00-update_refs.R b/R/00-update_refs.R
@@ -7,7 +7,7 @@
 #'
 #' @family initialisation
 latest_update <- function() {
-  "Jun_2023"
+  "Sep_2023"
 }
 
 #' Previous update
@@ -61,7 +61,7 @@ previous_update <- function(months_ago = 3L, override = NULL) {
 #'
 #' @family initialisation
 get_dd_period <- function() {
-  "Jul16_Mar23"
+  "Jul16_Jun23"
 }
 
 #' The latest financial year for Cost uplift setting
@@ -74,5 +74,5 @@ get_dd_period <- function() {
 #'
 #' @family initialisation
 latest_cost_year <- function() {
-  "2223"
+  "2324"
 }
diff --git a/R/add_hri_variables.R b/R/add_hri_variables.R
@@ -0,0 +1,142 @@
+#' Flag non-Scottish residents
+#'
+#' @details The variable keep flag can be in the range c(0:4) where
+#' \itemize{
+#' \item{keep_flag = 0 when resident is Scottish}
+#' \item{keep_flag = 1 when resident is not Scottish}
+#' \item{keep_flag = 2 when the postcode is missing or a dummy, and the gpprac is missing}
+#' \item{keep_flag = 3 when the gpprac is not English and the postcode is missing}
+#' \item{keep_flag = 4 when the gpprac is not English and the postcode is a dummy}
+#' }
+#' The intention is to only keep the records where keep_flag = 0
+#'
+#' @inheritParams add_hri_variables
+#'
+#' @return A data frame with the variable 'keep_flag'
+flag_non_scottish_residents <- function(
+    data,
+    slf_pc_lookup) {
+  check_variables_exist(data, c("postcode", "gpprac"))
+
+  # Make a lookup of postcode areas, which consist of the first characters
+  # of the postcode
+  pc_areas <- slf_pc_lookup %>%
+    dplyr::mutate(
+      pc_area = stringr::str_match(postcode, "^[A-Z]{1,3}"),
+      scot_flag = TRUE
+    ) %>%
+    dplyr::distinct(pc_area, scot_flag)
+
+  # Create a flag, 'keep_flag', to determine whether individuals are Scottish
+  # residents or not
+  return_data <- data %>%
+    dplyr::mutate(pc_area = stringr::str_match(postcode, "^[A-Z]{1,3}")) %>%
+    dplyr::left_join(pc_areas, by = "pc_area") %>%
+    dplyr::mutate(
+      dummy_postcode = .data$postcode %in% c("BF010AA", "NF1 1AB", "NK010AA") |
+        stringr::str_sub(.data$postcode, 1, 4) %in% c("ZZ01", "ZZ61"),
+      eng_prac = .data$gpprac %in% c(99942, 99957, 99961, 99976, 99981, 99995, 99999),
+      scottish_resident = dplyr::case_when(
+        .data$scot_flag ~ 0L,
+        (is_missing(.data$postcode) | .data$dummy_postcode) & is.na(.data$gpprac) ~ 2L,
+        !.data$eng_prac & is_missing(.data$postcode) ~ 3L,
+        !.data$eng_prac & .data$dummy_postcode ~ 4L,
+        .default = 1L
+      )
+    ) %>%
+    dplyr::select(-"dummy_postcode", -"eng_prac")
+
+  return(return_data)
+}
+
+#' Add HRI variables to an SLF Individual File
+#'
+#' @details Filters the dataset to only include Scottish residents, then
+#' creates a lookup where HRIs are calculated at Scotland, Health Board, and
+#' LCA level. Then joins on this lookup by chi/anon_chi.
+#'
+#' @param data An SLF individual file.
+#' @param slf_pc_lookup The Source postcode lookup, defaults
+#' to [get_slf_postcode_path()] read using [read_file()].
+#'
+#' @return The individual file with HRI variables matched on
+#' @export
+add_hri_variables <- function(
+    data,
+    chi_variable = "chi",
+    slf_pc_lookup = read_file(
+      get_slf_postcode_path(),
+      col_select = "postcode"
+    )) {
+  hri_lookup <- data %>%
+    dplyr::select(
+      "year",
+      chi_variable,
+      "postcode",
+      "gpprac",
+      "lca",
+      "hbrescode",
+      "health_net_cost",
+      "acute_episodes",
+      "mat_episodes",
+      "mh_episodes",
+      "gls_episodes",
+      "op_newcons_attendances",
+      # op_newcons_dnas,
+      "ae_attendances",
+      "pis_paid_items",
+      "ooh_cases"
+    ) %>%
+    flag_non_scottish_residents(slf_pc_lookup = slf_pc_lookup) %>%
+    dplyr::filter(scottish_resident == 0L) %>%
+    # Scotland cost and proportion
+    dplyr::mutate(
+      scotland_cost = sum(health_net_cost),
+      scotland_pct = (health_net_cost / scotland_cost) * 100
+    ) %>%
+    dplyr::arrange(dplyr::desc(health_net_cost)) %>%
+    dplyr::mutate(hri_scotp = cumsum(scotland_pct)) %>%
+    # Health Board
+    dplyr::group_by(hbrescode) %>%
+    dplyr::mutate(
+      hb_cost = sum(health_net_cost),
+      hb_pct = (health_net_cost / hb_cost) * 100
+    ) %>%
+    dplyr::arrange(dplyr::desc(health_net_cost), .by_group = TRUE) %>%
+    dplyr::mutate(hri_hbp = cumsum(hb_pct)) %>%
+    dplyr::ungroup() %>%
+    # LCA
+    dplyr::group_by(lca) %>%
+    dplyr::mutate(
+      lca_cost = sum(health_net_cost),
+      lca_pct = (health_net_cost / lca_cost) * 100
+    ) %>%
+    dplyr::arrange(dplyr::desc(health_net_cost), .by_group = TRUE) %>%
+    dplyr::mutate(hri_lcap = cumsum(lca_pct)) %>%
+    dplyr::ungroup() %>%
+    # Add HRI flags
+    dplyr::mutate(
+      hri_scot = hri_scotp <= 50.0,
+      hri_hb = hri_hbp <= 50.0,
+      hri_lca = hri_lcap <= 50.0,
+      # Deal with potential missing variables
+      hri_hb = dplyr::if_else(is_missing(hbrescode), FALSE, hri_hb),
+      hri_hbp = dplyr::if_else(is_missing(hbrescode), NA, hri_hbp),
+      hri_lca = dplyr::if_else(is_missing(lca), FALSE, hri_lca),
+      hri_lcap = dplyr::if_else(is_missing(lca), NA, hri_lcap)
+    ) %>%
+    # Select only required variables for the lookup
+    dplyr::select(
+      chi_variable,
+      "hri_scot",
+      "hri_scotp",
+      "hri_hb",
+      "hri_hbp",
+      "hri_lca",
+      "hri_lcap"
+    )
+
+  return_data <- dplyr::left_join(data, hri_lookup, by = chi_variable)
+
+  return(return_data)
+}
diff --git a/R/aggregate_by_chi.R b/R/aggregate_by_chi.R
@@ -7,7 +7,7 @@
 #' @importFrom data.table .SD
 #'
 #' @inheritParams create_individual_file
-aggregate_by_chi <- function(episode_file) {
+aggregate_by_chi <- function(episode_file, exclude_sc_var = FALSE) {
   cli::cli_alert_info("Aggregate by CHI function started at {Sys.time()}")
 
   # Convert to data.table
@@ -28,17 +28,33 @@ aggregate_by_chi <- function(episode_file) {
     )
   )
 
-  data.table::setnames(
-    episode_file,
-    c(
-      "ch_chi_cis", "cij_marker", "ooh_case_id"
-      # ,"hh_in_fy"
-    ),
-    c(
-      "ch_cis_episodes", "cij_total", "ooh_cases"
-      # ,"hl1_in_fy"
+  if (exclude_sc_var) {
+    data.table::setnames(
+      episode_file,
+      c(
+        "cij_marker",
+        "ooh_case_id"
+      ),
+      c(
+        "cij_total",
+        "ooh_cases"
+      )
     )
-  )
+  } else {
+    data.table::setnames(
+      episode_file,
+      c(
+        "ch_chi_cis",
+        "cij_marker",
+        "ooh_case_id"
+      ),
+      c(
+        "ch_cis_episodes",
+        "cij_total",
+        "ooh_cases"
+      )
+    )
+  }
 
   # column specification, grouped by chi
   # columns to select last
@@ -48,6 +64,9 @@ aggregate_by_chi <- function(episode_file) {
     "gpprac",
     vars_start_with(episode_file, "sc_")
   )
+  if (exclude_sc_var) {
+    cols2 <- cols2[!(cols2 %in% vars_start_with(episode_file, "sc_"))]
+  }
   # columns to count unique rows
   cols3 <- c(
     "ch_cis_episodes",
@@ -59,6 +78,9 @@ aggregate_by_chi <- function(episode_file) {
     "ooh_cases",
     "preventable_admissions"
   )
+  if (exclude_sc_var) {
+    cols3 <- cols3[!(cols3 %in% "ch_cis_episodes")]
+  }
   # columns to sum up
   cols4 <- c(
     vars_end_with(
@@ -91,6 +113,22 @@ aggregate_by_chi <- function(episode_file) {
     "health_net_cost_inc_dnas"
   )
   cols4 <- cols4[!(cols4 %in% "ch_cis_episodes")]
+  if (exclude_sc_var) {
+    cols4 <-
+      cols4[!(cols4 %in% c(
+        vars_end_with(
+          episode_file,
+          c(
+            "alarms",
+            "telecare"
+          )
+        ),
+        vars_start_with(
+          episode_file,
+          "sds_option"
+        )
+      ))]
+  }
   # columns to select maximum
   cols5 <- c("nsu", vars_contain(episode_file, "hl1_in_fy"))
   data.table::setnafill(episode_file, fill = 0L, cols = cols5)

diff --git a/R/check_year_valid.R b/R/check_year_valid.R
@@ -46,6 +46,8 @@ check_year_valid <- function(
     return(FALSE)
   } else if (year >= "2324" && type %in% c("SPARRA", "HHG")) {
     return(FALSE)
+  } else if (year >= "2324" && type %in% c("CH", "HC", "SDS", "AT")) {
+    return(FALSE)
   }
 
   return(TRUE)