From d89d6e316f65451d0a1ca457984f84ada8ee8aca Mon Sep 17 00:00:00 2001
From: Milan Wiedemann
Date: Mon, 2 Dec 2024 09:07:08 +0000
Subject: [PATCH] Refactor data loading and tidying into separate scripts

This will improve readability and modularity and make it easier for us
to make further changes later on. It's only a start, there's much more
we could do.
---
 lib/functions/load_opensafely_outputs.R | 88 +++++++++++++++++++++++++
 lib/functions/load_validation_data.R    | 45 +++++++++++++
 lib/functions/plot_measures.R           |  6 ++
 lib/functions/tidy_measures.R           | 35 ++++++++++
 4 files changed, 174 insertions(+)
 create mode 100644 lib/functions/load_opensafely_outputs.R
 create mode 100644 lib/functions/load_validation_data.R

diff --git a/lib/functions/load_opensafely_outputs.R b/lib/functions/load_opensafely_outputs.R
new file mode 100644
index 0000000..46204bd
--- /dev/null
+++ b/lib/functions/load_opensafely_outputs.R
@@ -0,0 +1,88 @@
+# Load and tidy the Pharmacy First measures outputs.
+#
+# Requires `tidy_measures()` and the `pf_measures_*` dictionaries (see
+# lib/functions/tidy_measures.R) to be available when this script is sourced.
+library(readr)
+library(dplyr)
+library(here)
+
+# Load data based on execution environment: on an OpenSAFELY backend read the
+# outputs of the generate_pf_measures action, otherwise read the copies in the
+# released_output directory
+measures_root <- if (Sys.getenv("OPENSAFELY_BACKEND") != "") {
+  "output"
+} else {
+  "released_output"
+}
+
+df_measures <- read_csv(
+  here(measures_root, "measures", "pf_codes_conditions_measures.csv")
+)
+df_descriptive_stats <- read_csv(
+  here(measures_root, "measures", "pf_descriptive_stats_measures.csv")
+)
+df_pfmed <- read_csv(
+  here(measures_root, "measures", "pf_medications_measures.csv")
+)
+df_condition_provider <- read_csv(
+  here(measures_root, "measures", "pf_condition_provider_measures.csv")
+)
+
+df_measures <- tidy_measures(
+  data = df_measures,
+  pf_measures_name_dict = pf_measures_name_dict,
+  pf_measures_name_mapping = pf_measures_name_mapping,
+  pf_measures_groupby_dict = pf_measures_groupby_dict
+)
+
+df_measures$ethnicity <- factor(
+  df_measures$ethnicity,
+  levels = c(
+    "White",
+    "Mixed",
+    "Asian or Asian British",
+    "Black or Black British",
+    "Chinese or Other Ethnic Groups",
+    "Missing"
+  ),
+  ordered = TRUE
+)
+
+df_measures$age_band <- factor(
+  df_measures$age_band,
+  levels = c(
+    "0-19",
+    "20-39",
+    "40-59",
+    "60-79",
+    "80+",
+    "Missing"
+  ),
+  ordered = TRUE
+)
+
+df_measures$region <- factor(
+  df_measures$region,
+  levels = c(
+    "East",
+    "East Midlands",
+    "London",
+    "North East",
+    "North West",
+    "South East",
+    "South West",
+    "West Midlands",
+    "Yorkshire and The Humber",
+    "Missing"
+  ),
+  ordered = TRUE
+)
+
+df_measures <- df_measures %>%
+  mutate(sex = factor(sex,
+    levels = c("female", "male"),
+    labels = c("Female", "Male")
+  ))
+
+# Age bands that are NA at this point are reported under the "Missing" level
+df_measures$age_band[is.na(df_measures$age_band)] <- "Missing"
diff --git a/lib/functions/load_validation_data.R b/lib/functions/load_validation_data.R
new file mode 100644
index 0000000..539dc41
--- /dev/null
+++ b/lib/functions/load_validation_data.R
@@ -0,0 +1,45 @@
+library(readr)
+library(tidyr)
+library(dplyr)
+library(here)
+
+# Consultation validation counts (source "nhs_bsa") in long format: each
+# original count is kept as count_100pct and also scaled to 40% (count_40pct)
+df_bsa_consultation_validation <- read_csv(
+  here("lib", "validation", "data", "pf_consultation_validation_data.csv")
+) %>%
+  rename(count_100pct = count) %>%
+  mutate(count_40pct = round(as.numeric(count_100pct * .4), digits = 0)) %>%
+  mutate(source = "nhs_bsa") %>%
+  pivot_longer(
+    cols = c(count_100pct, count_40pct),
+    names_to = "count_method",
+    values_to = "count"
+  )
+
+# Relabel consultation types with the tidy names used in pf_measures_name_dict
+df_bsa_consultation_validation <- df_bsa_consultation_validation %>%
+  mutate(consultation_type = factor(consultation_type,
+    levels = c(
+      "sinusitis",
+      "infected_insect_bites",
+      "uncomplicated_uti",
+      "acute_otitis_media",
+      "acute_sore_throat",
+      "shingles",
+      "impetigo"
+    ),
+    labels = c(
+      "Acute Sinusitis",
+      "Infected Insect Bite",
+      "UTI",
+      "Acute Otitis Media",
+      "Acute Pharyngitis",
+      "Herpes Zoster",
+      "Impetigo"
+    )
+  ))
+
+df_bsa_medication_validation <- read_csv(
+  here("lib", "validation", "data", "pf_medication_validation_data.csv")
+)
diff --git a/lib/functions/plot_measures.R b/lib/functions/plot_measures.R
index 9525dfa..045ebcb 100644
--- a/lib/functions/plot_measures.R
+++ b/lib/functions/plot_measures.R
@@ -97,3 +97,9 @@ plot_measures <- function(
 
   plot_tmp
 }
+
+# Colour palettes
+gradient_palette <- c("#001F4D", "#0056B3", "#007BFF", "#66B3E2", "#A4D8E1", "grey")
+region_palette <- c("red", "navy", "#018701", "#ffa600ca", "purple", "brown", "#f4a5b2", "cyan", "green", "grey")
+ethnicity_palette <- c("#42db0188", "#0056B3", "#ff0000c2", "#a52a2a5a", "purple", "grey")
+sex_palette <- c("red", "blue")
diff --git a/lib/functions/tidy_measures.R b/lib/functions/tidy_measures.R
index 2689ac4..44ab4e6 100644
--- a/lib/functions/tidy_measures.R
+++ b/lib/functions/tidy_measures.R
@@ -1,3 +1,38 @@
+# Define dictionaries with tidy names and mappings for measures
+pf_measures_name_dict <- list(
+  consultation_service = "Consultation Service",
+  pharmacy_first_service = "Pharmacy First Consultation",
+  combined_pf_service = "Pharmacy First Consultations (Combined)",
+  acute_otitis_media = "Acute Otitis Media",
+  herpes_zoster = "Herpes Zoster",
+  acute_sinusitis = "Acute Sinusitis",
+  impetigo = "Impetigo",
+  infected_insect_bite = "Infected Insect Bite",
+  acute_pharyngitis = "Acute Pharyngitis",
+  uncomplicated_urinary_tract_infection = "UTI"
+)
+
+pf_measures_name_mapping <- list(
+  consultation_service = "clinical_service",
+  pharmacy_first_service = "clinical_service",
+  combined_pf_service = "pharmacy_first_services",
+  acute_otitis_media = "clinical_condition",
+  herpes_zoster = "clinical_condition",
+  acute_sinusitis = "clinical_condition",
+  impetigo = "clinical_condition",
+  infected_insect_bite = "clinical_condition",
+  acute_pharyngitis = "clinical_condition",
+  uncomplicated_urinary_tract_infection = "clinical_condition"
+)
+
+pf_measures_groupby_dict <- list(
+  age_band = "Age band",
+  sex = "Sex",
+  imd = "IMD",
+  region = "Region",
+  ethnicity = "Ethnicity"
+)
+
 #' Tidy measures data
 #'
 #' Creates a tidier dataframe of measures data.