From af181cc3ec4dcafe6874527a5cc83a45911a49eb Mon Sep 17 00:00:00 2001 From: Jennifer Thom Date: Thu, 7 Nov 2024 09:41:00 +0000 Subject: [PATCH] Create a new folder to organise scripts --- .../00_Sort_BI_Extracts.R | 0 .../filter_nsu_duplicates.R | 0 .../get_service_use_cohort.R | 0 Pre_processing_scripts/write_anon_chi_files.R | 70 +++++++++++++++++++ 4 files changed, 70 insertions(+) rename 00_Sort_BI_Extracts.R => Pre_processing_scripts/00_Sort_BI_Extracts.R (100%) rename {extract_new_nsu_cohort => Pre_processing_scripts/extract_new_nsu_cohort}/filter_nsu_duplicates.R (100%) rename {extract_new_nsu_cohort => Pre_processing_scripts/extract_new_nsu_cohort}/get_service_use_cohort.R (100%) create mode 100644 Pre_processing_scripts/write_anon_chi_files.R diff --git a/00_Sort_BI_Extracts.R b/Pre_processing_scripts/00_Sort_BI_Extracts.R similarity index 100% rename from 00_Sort_BI_Extracts.R rename to Pre_processing_scripts/00_Sort_BI_Extracts.R diff --git a/extract_new_nsu_cohort/filter_nsu_duplicates.R b/Pre_processing_scripts/extract_new_nsu_cohort/filter_nsu_duplicates.R similarity index 100% rename from extract_new_nsu_cohort/filter_nsu_duplicates.R rename to Pre_processing_scripts/extract_new_nsu_cohort/filter_nsu_duplicates.R diff --git a/extract_new_nsu_cohort/get_service_use_cohort.R b/Pre_processing_scripts/extract_new_nsu_cohort/get_service_use_cohort.R similarity index 100% rename from extract_new_nsu_cohort/get_service_use_cohort.R rename to Pre_processing_scripts/extract_new_nsu_cohort/get_service_use_cohort.R diff --git a/Pre_processing_scripts/write_anon_chi_files.R b/Pre_processing_scripts/write_anon_chi_files.R new file mode 100644 index 000000000..a344d68da --- /dev/null +++ b/Pre_processing_scripts/write_anon_chi_files.R @@ -0,0 +1,70 @@ +################################################################################ +# Name of file - Write_anon_chi_files.R +# +# Original Authors - Jennifer Thom, Zihao Li +# Original Date - July 2024 +# Written/run 
# on - R Posit
# Version of R - 4.1.2
#
# Description: Run this script in stages to convert chi to anon chi and save files.
#              By default this is set up to take the delayed discharges file,
#              convert the chi to anon_chi and save to disk. Important for
#              ensuring we do not save chi anywhere on disk.
#
################################################################################

## Stage 1 - Setup environment
# ------------------------------------------------------------------------------

# Set up directory
source_dir <- "/conf/hscdiip/SLF_Extracts/Delayed_Discharges"

# Specify type of files e.g parquet, rds, csv.
# list.files() treats `pattern` as a regular expression, so the dot is escaped
# and the match is anchored to the end of the file name.
pattern <- "\\.parquet$"
cat(stringr::str_glue("Looking in '{source_dir}' for parquet files."), "\n")

# List all files in the directory
parquet_files <- list.files(source_dir, pattern = pattern, full.names = TRUE)

# Skip files this script has already written (prefixed "anon-"), so that a
# re-run does not produce "anon-anon-" copies and then delete the real output.
parquet_files <- parquet_files[!startsWith(basename(parquet_files), "anon-")]
print(stringr::str_glue("Found {length(parquet_files)} parquet files to process."))

# Check whether any column name in a parquet file contains "chi".
#
# Only the file's schema is opened, so no rows are loaded.
#
# filename: path to a parquet file.
# Returns TRUE if at least one column name contains the substring "chi",
# otherwise FALSE.
#
# NOTE(review): this is a substring match, so a column such as "anon_chi" also
# counts as a hit - confirm whether an exact match on "chi" is intended.
is_chi_in_file <- function(filename) {
  column_names <- arrow::ParquetFileReader$create(filename)$GetSchema()$names
  any(grepl("chi", column_names))
}


# Stage 2 - In each file, convert chi to anon_chi and save to disk
# ------------------------------------------------------------------------------

# NOTE(review): read_file() and write_file() are not defined in this script -
# presumably the project's own helpers; make sure they are loaded before
# running this stage.

# Loop over every listed file and write an "anon-" copy next to it
for (data_file in parquet_files) {
  # specify new name and new file path
  save_file_path <- file.path(source_dir, paste0("anon-", basename(data_file)))
  chi_in_file <- is_chi_in_file(data_file)

  file_data <- read_file(data_file)

  if (chi_in_file) {
    # If chi is in the file, convert to anon_chi before saving
    anon_data <- slfhelper::get_anon_chi(file_data)
    write_file(anon_data, save_file_path)

    cat("Replaced chi with anon chi:", data_file, "to", save_file_path, "\n")
  } else {
    # No chi column found: save the data unchanged under the new name
    write_file(file_data, save_file_path)
    cat("renamed file with anon chi:", data_file, "to", save_file_path, "\n")
  }
}


# Stage 3 - Remove files with CHI
# ------------------------------------------------------------------------------

# Remove the old files with CHI.
# An original is deleted only when its "anon-" copy exists in `source_dir`, so
# a failed or skipped Stage 2 cannot lead to the data being lost.
for (data_file in parquet_files) {
  anon_file_path <- file.path(source_dir, paste0("anon-", basename(data_file)))

  if (!file.exists(anon_file_path)) {
    cat("Skipped, no anon file found for:", data_file, "\n")
    next
  }

  # file.remove() returns FALSE (with a warning) when the file cannot be
  # deleted, so only report success when it actually happened.
  if (file.remove(data_file)) {
    cat("Removed chi files:", data_file, "in", source_dir, "\n")
  } else {
    cat("Failed to remove chi file:", data_file, "in", source_dir, "\n")
  }
}

# End of Script #