diff --git a/.gitignore b/.gitignore index 5e16945..821e64f 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ model.log __pycache__ .python-version /output/* +/post_check/* metadata/* venv/ .DS_Store diff --git a/analysis/additional_comorbidities.py b/analysis/additional_comorbidities.py index f9bd947..1a335e7 100644 --- a/analysis/additional_comorbidities.py +++ b/analysis/additional_comorbidities.py @@ -181,12 +181,6 @@ def filter_codes_by_category(codelist, include): .is_on_or_after(last_prior_event(codelists .copd_resolved_codelist).date)) ) - -#pulmonary fibrosis diagnosis -has_pulmonary_fibrosis = ( - has_prior_event(codelists - .pulmonary_fibrosis_codelist) -) #cystic fibrosis diagnosis has_cystic_fibrosis = ( @@ -195,6 +189,21 @@ def filter_codes_by_category(codelist, include): .exists_for_patient() ) +#pulmonary fibrosis diagnosis +has_pulmonary_fibrosis = ( + has_prior_event(codelists + .pulmonary_fibrosis_codelist) +) + +#Chronic Respiratory Disease +has_crd = has_prior_event(codelists.crd_codelist) + +#other chronic respiratory disease +has_other_resp = ( + (has_pulmonary_fibrosis | has_crd) & + (~has_asthma & ~ has_copd & ~ has_cystic_fibrosis) +) + #diabetes diagnosis diab_date = last_prior_event(codelists.diabetes_codelist).date dmres_date = last_prior_event(codelists.diabetes_resolved_codelist).date @@ -248,9 +257,13 @@ def filter_codes_by_category(codelist, include): otherwise = False ) -#Chronic Heart Disease -has_chd = has_prior_event(codelists.chd_codelist) - +#Chronic Heart Diseases +has_chd = ( + (has_prior_event(codelists.chd_codelist)) | + (has_prior_event(codelists.heart_failure_codelist)) | + (has_prior_event(codelists.coronary_heart_disease_codelist)) +) + #Chronic Kidney Disease ############################################################################### @@ -275,9 +288,6 @@ def filter_codes_by_category(codelist, include): #Chronic Neurological Disease including Significant Learning Disorder has_cnd = has_prior_event(codelists.cnd_codelist) -#Chronic Respiratory Disease -has_crd = has_prior_event(codelists.crd_codelist) - #Cancer within 3 years has_cancer = ( has_prior_event(codelists.cancer_codelist + @@ -320,12 +330,3 @@ def filter_codes_by_category(codelist, include): #Sickle Cell Disease has_sickle_cell = has_prior_event(codelists.sickle_cell_codelist) - -#Heart Failure -has_heart_failure = has_prior_event(codelists.heart_failure_codelist) - -#Coronary Heart Disease -has_coronary_heart_disease = ( - has_prior_event(codelists - .coronary_heart_disease_codelist) -) diff --git a/analysis/cohort_criteria.R b/analysis/cohort_criteria.R new file mode 100644 index 0000000..3957f68 --- /dev/null +++ b/analysis/cohort_criteria.R @@ -0,0 +1,92 @@ +library(tidyverse) +library(here) +library(arrow) +library(ggplot2) +library(data.table) +library(gtsummary) + +#define study start date and study end date +source(here("analysis", "design", "design.R")) +args <- commandArgs(trailingOnly = TRUE) +if (length(args) == 0) { + study_start_date <- "2016-09-01" + study_end_date <- "2017-08-31" + cohort <- "adults" +} else { + study_start_date <- study_dates[[args[[2]]]] + study_end_date <- study_dates[[args[[3]]]] + cohort <- args[[1]] +} + +patients_df <- read_csv( + here::here("output", "flow_chart", paste0(cohort, "_", year(study_start_date), + "_", year(study_end_date), "_flow_chart", ".csv"))) + +patients_df <- patients_df %>% + mutate( + has_imd = ifelse(is.na(patients_df$imd_rounded), F, T), + is_female_or_male = ifelse(patients_df$sex == "female" | patients_df$sex == "male", T, F) + ) + +if (cohort == "infants" | cohort == "infants_subgroup") { + is_appropriate_age = ifelse(patients_df$age >= 0 & patients_df$age <= 23, T, F) +} else if (cohort == "children_and_adolescents") { + is_appropriate_age = ifelse(patients_df$age >= 2 & patients_df$age <= 17, T, F) +} else if (cohort == "adults") { + is_appropriate_age = ifelse(patients_df$age >= 18 & patients_df$age <= 64, T, F) +} else { + is_appropriate_age = ifelse(patients_df$age >= 65, T, F) +} + +patients_df <- patients_df %>% + mutate(is_appropriate_age = is_appropriate_age) + +# Define counts based on inclusion and exclusion criteria +total <- nrow(patients_df) +registered_count <- sum(patients_df$registered) +non_registered_count <- total - registered_count +age_count <- if (cohort == "infants" | cohort == "infants_subgroup") { + sum(patients_df$is_appropriate_age) + } else { + sum(patients_df$is_appropriate_age & patients_df$registered, na.rm = TRUE) + } +not_age_count <- if (cohort == "infants" | cohort == "infants_subgroup") { + total - age_count +} else { + registered_count - age_count +} + +if (cohort == "older_adults") { + included_count <- sum(patients_df$registered & patients_df$is_female_or_male + & patients_df$is_appropriate_age & patients_df$has_imd + & !patients_df$care_home, na.rm = TRUE) + excluded_count <- sum(!patients_df$is_female_or_male |!patients_df$has_imd + | patients_df$care_home, na.rm = TRUE) - not_age_count +} else if (cohort == "infants" | cohort == "infants_subgroup") { + included_count <- sum(patients_df$is_female_or_male + & patients_df$is_appropriate_age & patients_df$has_imd + & !patients_df$risk_group_infants + & !patients_df$severe_immunodeficiency, na.rm = TRUE) + excluded_count <- sum(!patients_df$is_female_or_male |!patients_df$has_imd + | patients_df$risk_group_infants + | patients_df$severe_immunodeficiency, na.rm = TRUE) - + not_age_count +} else { + included_count <- sum(patients_df$registered & patients_df$is_female_or_male + & patients_df$is_appropriate_age & patients_df$has_imd, + na.rm = TRUE) + excluded_count <- sum(!patients_df$is_female_or_male + |!patients_df$has_imd, na.rm = TRUE) - not_age_count +} + +## create output directories ---- +fs::dir_create(here("output", "flow_chart")) + +#export flow chart numbers +table <- cbind(total, non_registered_count, registered_count, + not_age_count, age_count, excluded_count, included_count) +table <- table %>% + as.data.frame() %>% + write_csv(path = paste0(here::here("output", "flow_chart"), "/", + "flow_chart_processed_", cohort, "_", year(study_start_date), "_", + year(study_end_date), ".csv")) diff --git a/analysis/cohort_description.R b/analysis/cohort_description.R new file mode 100644 index 0000000..6bf7c15 --- /dev/null +++ b/analysis/cohort_description.R @@ -0,0 +1,112 @@ +library(tidyverse) +library(here) +library(arrow) +library(ggplot2) +library(data.table) +library(gtsummary) + +## create output directories ---- +fs::dir_create(here("analysis")) + +#import redaction functions +source(here("analysis", "functions", "redaction.R")) + +#define study start date and study end date +source(here("analysis", "design", "design.R")) +args <- commandArgs(trailingOnly = TRUE) +if (length(args) == 0) { + study_start_date <- "2016-09-01" + study_end_date <- "2017-08-31" + cohort <- "adults" + codelist_type <- "specific" + investigation_type <- "primary" +} else { + study_start_date <- study_dates[[args[[2]]]] + study_end_date <- study_dates[[args[[3]]]] + cohort <- args[[1]] + codelist_type <- args[[4]] + investigation_type <- args[[5]] +} +covid_season_min <- as.Date("2019-09-01") + +df_input <- read_feather( + here::here("output", "data", paste0("input_processed_", cohort, "_", + year(study_start_date), "_", year(study_end_date), "_", + codelist_type, "_", investigation_type,".arrow"))) + +df_datatable <- as.data.table(df_input) + +if (cohort == "infants") { + table <- df_datatable[registered == TRUE, Total := n_distinct(patient_id)] + table <- df_datatable[registered == TRUE, .(Total, age_band, sex, + latest_ethnicity_group, imd_quintile, + rurality_classification)] + setnames(table, c("age_band", "sex", "latest_ethnicity_group", + "imd_quintile", "rurality_classification"), + c("Age Group", "Sex", "Ethnicity", "IMD", "Rurality")) +} else if (cohort == "children_and_adolescents") { + table <- df_datatable[registered == TRUE, Total := n_distinct(patient_id)] + table <- df_datatable[registered == TRUE, Reactive_Airway := ifelse(age <= 5, has_asthma_reactive_airway, F)] + table <- df_datatable[registered == TRUE, Asthma := ifelse(age > 5, has_asthma_reactive_airway, F)] + table <- df_datatable[registered == TRUE, .(Total, age_band, sex, + latest_ethnicity_group, imd_quintile, + rurality_classification, Asthma, Reactive_Airway, + flu_vaccination)] + setnames(table, c("age_band", "sex", "latest_ethnicity_group", + "imd_quintile", "rurality_classification", + "Reactive_Airway", "flu_vaccination"), + c("Age Group", "Sex", "Ethnicity", "IMD", "Rurality", + "Reactive Airway", "Flu Vaccine")) + if (study_start_date >= covid_season_min) { + table[, covid_vaccination_count := df_datatable$covid_vaccination_count] + setnames(table, "covid_vaccination_count", "Covid Vaccine Doses") + } +} else { + table <- df_datatable[registered == TRUE, Total := n_distinct(patient_id)] + table <- df_datatable[registered == TRUE, .(Total, age_band, sex, + latest_ethnicity_group, imd_quintile, + rurality_classification, smoking_status, + hazardous_drinking, drug_usage, has_asthma, + has_copd, has_cystic_fibrosis, + has_other_resp, has_diabetes, has_addisons, + severe_obesity, has_chd, has_ckd, has_cld, has_cnd, + has_cancer, immunosuppressed, has_sickle_cell, + flu_vaccination)] + setnames(table, c("age_band", "sex", "latest_ethnicity_group", + "imd_quintile", "rurality_classification", + "smoking_status", "hazardous_drinking", "drug_usage", + "has_asthma", "has_copd", "has_cystic_fibrosis", + "has_other_resp", "has_diabetes", "has_addisons" , + "severe_obesity", "has_chd", "has_ckd", "has_cld", + "has_cnd", "has_cancer", "immunosuppressed", + "has_sickle_cell", "flu_vaccination"), + c("Age_Group", "Sex", "Ethnicity", "IMD", "Rurality", + "Smoking Status", "Hazardous Drinking", "Drug Usage", + "Asthma", "COPD", "Cystic Fibrosis", "Other Chronic Respiratory Diseases", + "Diabetes", "Addisons", "Severe Obesity", "Chronic Heart Diseases", + "Chronic Kidney Disease", "Chronic Liver Disease", + "Chronic Neurological Disease", "Cancer Within 3 Years", + "Immunosuppressed", "Sickle Cell Disease", "Flu Vaccine")) + if (study_start_date >= covid_season_min) { + table[, covid_vaccination_count := df_datatable$covid_vaccination_count] + setnames(table, "covid_vaccination_count", "Covid Vaccine Doses") + } + } + +## create output directories ---- +fs::dir_create(here("output", "table1")) + +#export +table %>% + tbl_summary() %>% + as_gt() %>% + gt::gtsave(filename = paste0("table1_", cohort, "_", year(study_start_date), + "_", year(study_end_date), ".html"), + path = here::here("output", "table1")) + +table %>% + tbl_summary() %>% + as_tibble() %>% + write_csv(path = paste0(here::here("output", "table1"), "/", "table1_", + cohort, "_", year(study_start_date), "_", + year(study_end_date),".csv")) diff --git a/analysis/data_processing.R b/analysis/data_processing.R index a5905c3..c3c5c8b 100644 --- a/analysis/data_processing.R +++ b/analysis/data_processing.R @@ -1,9 +1,9 @@ -library("tidyverse") -library("here") -library("arrow") -library("ggplot2") -library("data.table") -library("lubridate") +library(tidyverse) +library(here) +library(arrow) +library(ggplot2) +library(data.table) +library(lubridate) ## create output directories ---- fs::dir_create(here("analysis")) @@ -11,16 +11,24 @@ fs::dir_create(here("analysis")) #define study start date and study end date source(here("analysis", "design", "design.R")) args <- commandArgs(trailingOnly = TRUE) -study_start_date <- study_dates[[args[[2]]]] -study_end_date <- study_dates[[args[[3]]]] -cohort <- args[[1]] -codelist_type <- args[[4]] -investigation_type <- args[[5]] +if (length(args) == 0) { + study_start_date <- "2016-09-01" + study_end_date <- "2017-08-31" + cohort <- "adults" + codelist_type <- "specific" + investigation_type <- "primary" +} else { + study_start_date <- study_dates[[args[[2]]]] + study_end_date <- study_dates[[args[[3]]]] + cohort <- args[[1]] + codelist_type <- args[[4]] + investigation_type <- args[[5]] +} df_input <- read_feather( - here::here("output", paste0("input_", cohort, "_", year(study_start_date), - "_", year(study_end_date), "_", codelist_type, "_", - investigation_type,".arrow"))) + here::here("output", "data", paste0("input_", cohort, "_", + year(study_start_date), "_", year(study_end_date), "_", + codelist_type, "_", investigation_type,".arrow"))) #assign ethnicity group df_input <- df_input %>% @@ -140,8 +148,11 @@ df_input <- df_input %>% df_input$time_mild <- difftime(df_input$end_time_mild, study_start_date, df_input, "weeks") df_input$time_severe <- difftime(df_input$end_time_severe, study_start_date, df_input, "weeks") +## create output directories ---- +fs::dir_create(here("output", "data")) + #write the new input file -write_feather(df_input, here::here("output", +write_feather(df_input, here::here("output", "data", paste0("input_processed_", cohort, "_", year(study_start_date), "_", year(study_end_date), "_", codelist_type, "_", investigation_type, ".arrow"))) diff --git a/analysis/dataset_definition.py b/analysis/dataset_definition.py index f28e900..52294ab 100644 --- a/analysis/dataset_definition.py +++ b/analysis/dataset_definition.py @@ -55,6 +55,7 @@ age_at_end = patients.age_on(study_end_date) age_months = (index_date - patients.date_of_birth).months age_at_start_months = (study_start_date - patients.date_of_birth).months +#age_at_end_months = (study_end_date - patients.date_of_birth).months #get patients who are registered, have sex, age, and imd info registered_patients = case( @@ -71,8 +72,8 @@ when(cohort == "older_adults").then((age_at_start <= 110) & (age_at_end >= 65)), when(cohort == "adults").then((age_at_start <= 64) & (age_at_end >= 18)), when(cohort == "children_adolescents").then((age_at_start <= 17) & (age_at_end >= 2)), - when(cohort == "infants").then(age_at_start_months <= 23), - when(cohort == "infants_subgroup").then(age_at_start_months <= 23) + when(cohort == "infants").then((age_at_start_months <= 23) & (age_at_start_months >= 0)), + when(cohort == "infants_subgroup").then((age_at_start_months <= 23) & (age_at_start_months >= 0)) ) has_imd = (addresses.for_patient_on(index_date).imd_rounded.is_not_null()) @@ -849,11 +850,10 @@ def first_infection_event(codelist, where = True): filter_codes_by_category, smoking_status, hazardous_drinking, drug_usage, has_asthma, has_reactive_airway, has_copd, - has_pulmonary_fibrosis, has_cystic_fibrosis, + has_cystic_fibrosis, has_other_resp, has_diabetes, has_addisons, severe_obesity, has_chd, has_ckd, has_cld, has_cnd, has_crd, has_cancer, immunosuppressed, has_sickle_cell, - has_heart_failure, has_coronary_heart_disease ) if cohort == "adults" or cohort == "older_adults" : @@ -863,8 +863,8 @@ def first_infection_event(codelist, where = True): dataset.drug_usage = drug_usage dataset.has_asthma = has_asthma dataset.has_copd = has_copd - dataset.has_pulmonary_fibrosis = has_pulmonary_fibrosis dataset.has_cystic_fibrosis = has_cystic_fibrosis + dataset.has_other_resp = has_other_resp dataset.has_diabetes = has_diabetes dataset.has_addisons = has_addisons dataset.severe_obesity = severe_obesity @@ -875,8 +875,6 @@ def first_infection_event(codelist, where = True): dataset.has_cancer = has_cancer dataset.immunosuppressed = immunosuppressed dataset.has_sickle_cell = has_sickle_cell - dataset.has_heart_failure = has_heart_failure - dataset.has_coronary_heart_disease = has_coronary_heart_disease if cohort == "children_and_adolescents" : diff --git a/analysis/dataset_definition_flow_chart.py b/analysis/dataset_definition_flow_chart.py index 9e0ca43..457c5da 100644 --- a/analysis/dataset_definition_flow_chart.py +++ b/analysis/dataset_definition_flow_chart.py @@ -2,7 +2,7 @@ from pathlib import Path from datetime import date, datetime -from ehrql import create_dataset, case, when, maximum_of, minimum_of, years, days +from ehrql import Dataset, case, when, maximum_of, minimum_of, years, days from ehrql.tables.tpp import ( patients, medications, @@ -27,8 +27,7 @@ import codelists -dataset = create_dataset() -dataset.configure_dummy_data(population_size = 100000) +dataset = Dataset() ####################################################################################### # Import study dates defined in "./analysis/design/study-dates.R" script and then exported @@ -54,6 +53,7 @@ age_at_end = patients.age_on(study_end_date) age_months = (index_date - patients.date_of_birth).months age_at_start_months = (study_start_date - patients.date_of_birth).months +#age_at_end_months = (study_end_date - patients.date_of_birth).months #get patients who are registered, have sex, age, and imd info registered_patients = case( @@ -70,8 +70,8 @@ when(cohort == "older_adults").then((age_at_start <= 110) & (age_at_end >= 65)), when(cohort == "adults").then((age_at_start <= 64) & (age_at_end >= 18)), when(cohort == "children_adolescents").then((age_at_start <= 17) & (age_at_end >= 2)), - when(cohort == "infants").then(age_at_start_months <= 23), - when(cohort == "infants_subgroup").then(age_at_start_months <= 23) + when(cohort == "infants").then((age_at_start_months <= 23) & (age_at_start_months >= 0)), + when(cohort == "infants_subgroup").then((age_at_start_months <= 23) & (age_at_start_months >= 0)) ) has_imd = (addresses.for_patient_on(index_date).imd_rounded.is_not_null()) @@ -215,6 +215,5 @@ def first_infection_event(codelist, where = True): when(cohort == "infants_subgroup").then(age_at_start_months) ) -dataset.is_female_or_male = is_female_or_male -dataset.is_appropriate_age = is_appropriate_age -dataset.has_imd = has_imd +#get patients IMD rank +dataset.imd_rounded = addresses.for_patient_on(index_date).imd_rounded diff --git a/analysis/dummydata/dummydata_adults.R b/analysis/dummydata/dummydata_adults.R index 196c53d..435c788 100644 --- a/analysis/dummydata/dummydata_adults.R +++ b/analysis/dummydata/dummydata_adults.R @@ -1,13 +1,13 @@ ##create a dummy dataset -library("tidyverse") -library("arrow") -library("here") -library("glue") -library("EnvStats") +library(tidyverse) +library(arrow) +library(here) +library(glue) +library(EnvStats) -remotes::install_github("https://github.com/wjchulme/dd4d") -library("dd4d") +#remotes::install_github("https://github.com/wjchulme/dd4d") +library(dd4d) ## create output directories ---- fs::dir_create(here("analysis", "dummydata")) @@ -110,7 +110,7 @@ sim_list = lst( "4", "5", "6" - ), p = c(0.81, 0.03, 0.1, 0.04, 0.02, 0)) + ), p = c(0.81, 0.03, 0.1, 0.04, 0.02, 0.05)) ), #household ID (to determine composition) @@ -156,16 +156,16 @@ sim_list = lst( I(smoking_status == "Former")*-0.1)) ), - #pulmonary fibrosis - has_pulmonary_fibrosis = bn_node( - ~ rbernoulli(n = ..n, p = 0.001) - ), - #cystic fibrosis has_cystic_fibrosis = bn_node( ~ rbernoulli(n = ..n, p = 0.02) ), + #other chronic respiratory diseases + has_other_resp = bn_node( + ~ rbernoulli(n = ..n, p = 0.01) + ), + #diabetes has_diabetes = bn_node( ~ rbernoulli(n = ..n, p = plogis(-1 + age*0.02 + I(sex == "female")*-0.2)) @@ -181,9 +181,9 @@ sim_list = lst( ~ rbernoulli(n = ..n, p = 0.1) ), - #chronic heart disease + #chronic heart diseases has_chd = bn_node( - ~ rbernoulli(n = ..n, p = 0.01) + ~ rbernoulli(n = ..n, p = 0.08) ), #chronic kidney disease @@ -221,16 +221,6 @@ sim_list = lst( ~ rbernoulli(n = ..n, p = 0.01) ), - #heart failure - has_heart_failure = bn_node( - ~ rbernoulli(n = ..n, p = 0.015) - ), - - #coronary heart disease - has_coronary_heart_disease = bn_node( - ~ rbernoulli(n = ..n, p = 0.1) - ), - #flu vaccination flu_vaccination = bn_node( ~ rbernoulli(n = ..n, p = 0.75) diff --git a/analysis/dummydata/dummydata_children_and_adolescents.R b/analysis/dummydata/dummydata_children_and_adolescents.R index 1a9f966..6fe6f8e 100644 --- a/analysis/dummydata/dummydata_children_and_adolescents.R +++ b/analysis/dummydata/dummydata_children_and_adolescents.R @@ -1,13 +1,13 @@ ##create a dummy dataset -library("tidyverse") -library("arrow") -library("here") -library("glue") -library("EnvStats") +library(tidyverse) +library(arrow) +library(here) +library(glue) +library(EnvStats) -remotes::install_github("https://github.com/wjchulme/dd4d") -library("dd4d") +#remotes::install_github("https://github.com/wjchulme/dd4d") +library(dd4d) ## create output directories ---- fs::dir_create(here("analysis", "dummydata")) @@ -110,7 +110,7 @@ sim_list = lst( "4", "5", "6" - ), p = c(0.81, 0.03, 0.1, 0.04, 0.02, 0)) + ), p = c(0.81, 0.03, 0.1, 0.04, 0.02, 0.05)) ), #household ID (to determine composition) diff --git a/analysis/dummydata/dummydata_infants.R b/analysis/dummydata/dummydata_infants.R index 2485d30..e55b416 100644 --- a/analysis/dummydata/dummydata_infants.R +++ b/analysis/dummydata/dummydata_infants.R @@ -1,13 +1,13 @@ ##create a dummy dataset -library("tidyverse") -library("arrow") -library("here") -library("glue") -library("EnvStats") +library(tidyverse) +library(arrow) +library(here) +library(glue) +library(EnvStats) -remotes::install_github("https://github.com/wjchulme/dd4d") -library("dd4d") +#remotes::install_github("https://github.com/wjchulme/dd4d") +library(dd4d) ## create output directories ---- fs::dir_create(here("analysis", "dummydata")) @@ -113,7 +113,7 @@ sim_list = lst( "4", "5", "6" - ), p = c(0.81, 0.03, 0.1, 0.04, 0.02, 0)) + ), p = c(0.81, 0.03, 0.1, 0.04, 0.02, 0.05)) ), #household ID (to determine composition) @@ -258,6 +258,18 @@ sim_list = lst( #date all_cause_mortality_day = bn_node( ~ as.integer(runif(n = ..n, index_day, index_day + 365)) + ), + + ##exclusion criteria + + #part of risk group + risk_group_infants = bn_node( + ~ rbernoulli(n = ..n, p = 0.1) + ), + + #severe immunodeficiency + severe_immunodeficiency = bn_node( + ~ rbernoulli(n = ..n, p = 0.1) ) ) diff --git a/analysis/dummydata/dummydata_infants_subgroup.R b/analysis/dummydata/dummydata_infants_subgroup.R index cd26b63..9b91c2c 100644 --- a/analysis/dummydata/dummydata_infants_subgroup.R +++ b/analysis/dummydata/dummydata_infants_subgroup.R @@ -1,13 +1,13 @@ ##create a dummy dataset -library("tidyverse") -library("arrow") -library("here") -library("glue") -library("EnvStats") +library(tidyverse) +library(arrow) +library(here) +library(glue) +library(EnvStats) -remotes::install_github("https://github.com/wjchulme/dd4d") -library("dd4d") +#remotes::install_github("https://github.com/wjchulme/dd4d") +library(dd4d) ## create output directories ---- fs::dir_create(here("analysis", "dummydata")) @@ -113,7 +113,7 @@ sim_list = lst( "4", "5", "6" - ), p = c(0.81, 0.03, 0.1, 0.04, 0.02, 0)) + ), p = c(0.81, 0.03, 0.1, 0.04, 0.02, 0.05)) ), #household ID (to determine composition) @@ -305,6 +305,18 @@ sim_list = lst( #date all_cause_mortality_day = bn_node( ~ as.integer(runif(n = ..n, index_day, index_day + 365)) + ), + + ##exclusion criteria + + #part of risk group + risk_group_infants = bn_node( + ~ rbernoulli(n = ..n, p = 0.1) + ), + + #severe immunodeficiency + severe_immunodeficiency = bn_node( + ~ rbernoulli(n = ..n, p = 0.1) ) ) diff --git a/analysis/dummydata/dummydata_older_adults.R b/analysis/dummydata/dummydata_older_adults.R index eed7df7..759731d 100644 --- a/analysis/dummydata/dummydata_older_adults.R +++ b/analysis/dummydata/dummydata_older_adults.R @@ -1,13 +1,13 @@ ##create a dummy dataset -library("tidyverse") -library("arrow") -library("here") -library("glue") -library("EnvStats") +library(tidyverse) +library(arrow) +library(here) +library(glue) +library(EnvStats) -remotes::install_github("https://github.com/wjchulme/dd4d") -library("dd4d") +#remotes::install_github("https://github.com/wjchulme/dd4d") +library(dd4d) ## create output directories ---- fs::dir_create(here("analysis", "dummydata")) @@ -110,7 +110,7 @@ sim_list = lst( "4", "5", "6" - ), p = c(0.81, 0.03, 0.1, 0.04, 0.02, 0)) + ), p = c(0.81, 0.03, 0.1, 0.04, 0.02, 0.05)) ), #household ID (to determine composition) @@ -156,16 +156,16 @@ sim_list = lst( I(smoking_status == "Former")*-0.1)) ), - #pulmonary fibrosis - has_pulmonary_fibrosis = bn_node( - ~ rbernoulli(n = ..n, p = 0.001) - ), - #cystic fibrosis has_cystic_fibrosis = bn_node( ~ rbernoulli(n = ..n, p = 0.02) ), + #other chronic respiratory diseases + has_other_resp = bn_node( + ~ rbernoulli(n = ..n, p = 0.01) + ), + #diabetes has_diabetes = bn_node( ~ rbernoulli(n = ..n, p = plogis(-1 + age*0.02 + I(sex == "female")*-0.2)) @@ -181,9 +181,9 @@ sim_list = lst( ~ rbernoulli(n = ..n, p = 0.1) ), - #chronic heart disease + #chronic heart diseases has_chd = bn_node( - ~ rbernoulli(n = ..n, p = 0.01) + ~ rbernoulli(n = ..n, p = 0.08) ), #chronic kidney disease @@ -221,16 +221,6 @@ sim_list = lst( ~ rbernoulli(n = ..n, p = 0.01) ), - #heart failure - has_heart_failure = bn_node( - ~ rbernoulli(n = ..n, p = 0.015) - ), - - #coronary heart disease - has_coronary_heart_disease = bn_node( - ~ rbernoulli(n = ..n, p = 0.1) - ), - #flu vaccination flu_vaccination = bn_node( ~ rbernoulli(n = ..n, p = 0.75) @@ -375,6 +365,13 @@ sim_list = lst( #date all_cause_mortality_day = bn_node( ~ as.integer(runif(n = ..n, index_day, index_day + 365)) + ), + + ##exclusion criteria + + #care home resident + care_home = bn_node( + ~ rbernoulli(n = ..n, p = 0.1) ) ) diff --git a/analysis/dummydata/dummyextract_adults_2016_2017.arrow b/analysis/dummydata/dummyextract_adults_2016_2017.arrow index 8abf466..667e680 100644 Binary files a/analysis/dummydata/dummyextract_adults_2016_2017.arrow and b/analysis/dummydata/dummyextract_adults_2016_2017.arrow differ diff --git a/analysis/dummydata/dummyextract_children_adolescents_2016_2017.arrow b/analysis/dummydata/dummyextract_children_adolescents_2016_2017.arrow deleted file mode 100644 index 411271e..0000000 Binary files a/analysis/dummydata/dummyextract_children_adolescents_2016_2017.arrow and /dev/null differ diff --git a/analysis/dummydata/dummyextract_children_and_adolescents_2016_2017.arrow b/analysis/dummydata/dummyextract_children_and_adolescents_2016_2017.arrow index a176d57..e1a931f 100644 Binary files a/analysis/dummydata/dummyextract_children_and_adolescents_2016_2017.arrow and b/analysis/dummydata/dummyextract_children_and_adolescents_2016_2017.arrow differ diff --git a/analysis/dummydata/dummyextract_infants_2016_2017.arrow b/analysis/dummydata/dummyextract_infants_2016_2017.arrow index e2ddef1..97f52c0 100644 Binary files a/analysis/dummydata/dummyextract_infants_2016_2017.arrow and b/analysis/dummydata/dummyextract_infants_2016_2017.arrow differ diff --git a/analysis/dummydata/dummyextract_infants_subgroup_2016_2017.arrow b/analysis/dummydata/dummyextract_infants_subgroup_2016_2017.arrow index d4113f3..b1b1af4 100644 Binary files a/analysis/dummydata/dummyextract_infants_subgroup_2016_2017.arrow and b/analysis/dummydata/dummyextract_infants_subgroup_2016_2017.arrow differ diff --git a/analysis/dummydata/dummyextract_older_adults_2016_2017.arrow b/analysis/dummydata/dummyextract_older_adults_2016_2017.arrow index e6d21d9..7ed895f 100644 Binary files a/analysis/dummydata/dummyextract_older_adults_2016_2017.arrow and b/analysis/dummydata/dummyextract_older_adults_2016_2017.arrow differ diff --git a/analysis/flow_chart.R b/analysis/flow_chart.R deleted file mode 100644 index 853421c..0000000 --- a/analysis/flow_chart.R +++ /dev/null @@ -1,121 +0,0 @@ -library("tidyverse") -library("here") -library("arrow") -library("ggplot2") -library("data.table") -library("gtsummary") - -#define study start date and study end date -source(here("analysis", "design", "design.R")) -args <- commandArgs(trailingOnly = TRUE) -study_start_date <- study_dates[[args[[2]]]] -study_end_date <- study_dates[[args[[3]]]] -cohort <- args[[1]] - -patients_df <- read_feather( - here::here("output", paste0(cohort, "_", year(study_start_date), "_", - year(study_end_date), "_flow_chart", ".arrow"))) - - -library(Gmisc, quietly = TRUE) -library(glue) -library(htmlTable) -library(grid) -library(magrittr) - -# Define counts based on inclusion and exclusion criteria -registered_count <- sum(patients_df$registered) -gender_count <- sum(patients_df$is_female_or_male) -age_count <- sum(patients_df$is_appropriate_age & registered_count) -imd_count <- sum(patients_df$has_imd) -included_count <- sum(patients_df$registered & patients_df$is_female_or_male & patients_df$is_appropriate_age & patients_df$has_imd) -care_home_count <- sum(patients_df$registered & patients_df$is_appropriate_age & patients_df$care_home) -demographic_excl_count <- sum(patients_df$registered & patients_df$is_appropriate_age & !patients_df$care_home &!patients_df$is_female_or_male |!patients_df$has_imd) - -org_cohort <- boxGrob(glue("Total Patients", - "n = {pop}", - pop = txtInt(nrow(patients_df)), - .sep = "\n")) - -follow_up <- boxGrob(glue("At least one year of follow-up before {start}", - "n = {follow}", - start = study_start_date, - follow = txtInt(registered_count), - .sep = "\n")) - -follow_up_excl <- boxGrob(glue("Less than one year of prior follow-up", - "n = {follow_e}", - follow_e = txtInt(nrow(patients_df) - registered_count), - .sep = "\n")) - -age <- boxGrob(glue("Aged 65 or over", - "n = {age}", - age = txtInt(age_count), - .sep = "\n")) - -age_excl <- boxGrob(glue("Aged under 65 on {start}", - "n = {age_e}", - start = study_start_date, - age_e = txtInt(nrow(patients_df) - age_count), - .sep = "\n")) - -not_care_home <- boxGrob(glue("Not in long-term care facility", - "n = {nocare}", - nocare = txtInt(age_count - care_home_count), - .sep = "\n")) - -care_home <- boxGrob(glue("In long-term care facility", - "n = {care}", - care = txtInt(care_home_count), - .sep = "\n")) - -demographic_excl <- boxGrob(glue("Missing deomgraphic information", - "n = {excl}", - excl = txtInt(demographic_excl_count), - .sep = "\n")) - -included <- boxGrob(glue("Included", - "n = {incl}", - incl = txtInt(included_count), - .sep = "\n")) - -grid.newpage() -vert <- spreadVertical(org_cohort = org_cohort, follow_up = follow_up, - age = age, not_care_home = not_care_home, - included = included, .from = org_cohort, - .to = included, .type = "center") -vert$excluded <- NULL - -follow_up_excl <- moveBox(follow_up_excl, - x = 0.82, - y = coords(vert$follow_up)$top + distance(vert$age, vert$follow_up, half = TRUE, center = FALSE)) - -age_excl <- moveBox(age_excl, - x = 0.82, - y = coords(vert$age)$top + distance(vert$not_care_home, vert$age, half = TRUE, center = FALSE)) - -care_home <- moveBox(care_home, - x = 0.82, - y = coords(vert$not_care_home)$top + distance(vert$included, vert$not_care_home, half = TRUE, center = FALSE)) - -demographic_excl <- moveBox(demographic_excl, - x = 0.82, - y = coords(vert$included)$top + distance(vert$included, vert$not_care_home, half = TRUE, center = FALSE)) - - -for (i in 1:(length(vert) - 1)) { - connectGrob(vert[[i]], vert[[i + 1]], type = "vert") %>% - print -} - -connectGrob(vert$org_cohort, follow_up_excl, type = "L") -connectGrob(vert$follow_up, age_excl, type = "L") -connectGrob(vert$age, care_home, type = "L") -connectGrob(vert$not_care_home, demographic_excl, type = "L") - -# Print boxes -vert -follow_up_excl -age_excl -care_home -demographic_excl diff --git a/analysis/report.R b/analysis/report.R index fc0ef87..eb181d1 100644 --- a/analysis/report.R +++ b/analysis/report.R @@ -1,9 +1,9 @@ -library("tidyverse") -library("here") -library("arrow") -library("ggplot2") -library("data.table") -library("gtsummary") +library(tidyverse) +library(here) +library(arrow) +library(ggplot2) +library(data.table) +library(gtsummary) ## create output directories ---- fs::dir_create(here("analysis")) @@ -14,11 +14,19 @@ source(here("analysis", "functions", "redaction.R")) #define study start date and study end date source(here("analysis", "design", "design.R")) args <- commandArgs(trailingOnly = TRUE) -study_start_date <- study_dates[[args[[2]]]] -study_end_date <- study_dates[[args[[3]]]] -cohort <- args[[1]] -codelist_type <- args[[4]] -investigation_type <- args[[5]] +if (length(args) == 0) { + study_start_date <- "2016-09-01" + study_end_date <- "2017-08-31" + cohort <- "adults" + codelist_type <- "specific" + investigation_type <- "primary" +} else { + study_start_date <- study_dates[[args[[2]]]] + study_end_date <- study_dates[[args[[3]]]] + cohort <- args[[1]] + codelist_type <- args[[4]] + investigation_type <- args[[5]] +} covid_season_min <- as.Date("2019-09-01") # roundmid_any <- function(x, to=6){ @@ -27,153 +35,22 @@ covid_season_min <- as.Date("2019-09-01") # } df_input <- read_feather( - here::here("output", paste0("input_processed_", cohort, "_", year(study_start_date), - "_", year(study_end_date), "_", codelist_type, "_", - investigation_type,".arrow"))) - -# lab <- ifelse(cohort == "infants", "Age (Months)", -# ifelse(cohort == "infants_subgroup", "Age (Months)", "Age (Years)")) -# -# plot_age <- ggplot(data = df_input, aes(age, frequency(age))) + geom_col(width = 0.9) + -# xlab(lab) + ylab("Frequency") -# -# ggsave( -# plot = plot_age, -# filename = paste0("descriptive_", cohort, "_", year(study_start_date), -# "_", year(study_end_date), "_", codelist_type, "_", -# investigation_type,".png"), path = here::here("output"), -# ) - -df_datatable <- as.data.table(df_input) + here::here("output", "data", paste0("input_processed_", cohort, "_", + year(study_start_date), "_", year(study_end_date), "_", + codelist_type, "_", investigation_type,".arrow"))) -if (cohort == "infants") { - table <- df_datatable[registered == TRUE, Total := n_distinct(patient_id)] - table <- df_datatable[registered == TRUE, .(Total, age_band, sex, - latest_ethnicity_group, imd_quintile, - rurality_classification)] - setnames(table, c("age_band", "sex", "latest_ethnicity_group", - "imd_quintile", "rurality_classification"), - c("Age Group", "Sex", "Ethnicity", "IMD", "Rurality")) -} else if (cohort == "children_and_adolescents") { - if(investigation_type == "primary") { - table <- df_datatable[registered == TRUE, Total := n_distinct(patient_id)] - table <- df_datatable[registered == TRUE, .(Total, age_band, sex, - latest_ethnicity_group, imd_quintile, - rurality_classification, flu_vaccination)] - setnames(table, c("age_band", "sex", "latest_ethnicity_group", - "imd_quintile", "rurality_classification", - "flu_vaccination"), - c("Age Group", "Sex", "Ethnicity", "IMD", "Rurality", - "Flu Vaccine")) - if (study_start_date >= covid_season_min) { - table[, covid_vaccination_count := df_datatable$covid_vaccination_count] - setnames(table, "covid_vaccination_count", "Covid Vaccine Doses") - } - } else { - table <- df_datatable[registered == TRUE, Total := n_distinct(patient_id)] - table <- df_datatable[registered == TRUE, Reactive_Airway := ifelse(age <= 5, has_asthma_reactive_airway, F)] - table <- df_datatable[registered == TRUE, Asthma := ifelse(age > 5, has_asthma_reactive_airway, F)] - table <- df_datatable[registered == TRUE, .(Total, age_band, sex, - latest_ethnicity_group, imd_quintile, - rurality_classification, Asthma, Reactive_Airway, - flu_vaccination)] - setnames(table, c("age_band", "sex", "latest_ethnicity_group", - "imd_quintile", "rurality_classification", - "Reactive_Airway", "flu_vaccination"), - c("Age Group", "Sex", "Ethnicity", "IMD", "Rurality", - "Reactive Airway", "Flu Vaccine")) - if (study_start_date >= covid_season_min) { - table[, covid_vaccination_count := df_datatable$covid_vaccination_count] - setnames(table, "covid_vaccination_count", "Covid Vaccine Doses") - } - } -} else { - if (investigation_type == "primary") { - table <- df_datatable[registered == TRUE, Total := n_distinct(patient_id)] - table <- df_datatable[registered == TRUE, .(Total, age_band, sex, - latest_ethnicity_group, imd_quintile, - rurality_classification, flu_vaccination)] - setnames(table, c("age_band", "sex", "latest_ethnicity_group", - "imd_quintile", "rurality_classification", - "flu_vaccination"), - c("Age Group", "Sex", "Ethnicity", "IMD", "Rurality", - "Flu Vaccine")) - if (study_start_date >= covid_season_min) { - table[, covid_vaccination_count := df_datatable$covid_vaccination_count] - setnames(table, "covid_vaccination_count", "Covid Vaccine Doses") - } - } else { - table <- df_datatable[registered == TRUE, Total := n_distinct(patient_id)] - table <- df_datatable[registered == TRUE, .(Total, age_band, sex, - latest_ethnicity_group, imd_quintile, - rurality_classification, smoking_status, - hazardous_drinking, drug_usage, has_asthma, - has_copd, has_pulmonary_fibrosis, - has_cystic_fibrosis, has_diabetes, has_addisons, - severe_obesity, has_chd, has_ckd, has_cld, has_cnd, - has_cancer, immunosuppressed, has_sickle_cell, - has_heart_failure, has_coronary_heart_disease, - flu_vaccination)] - setnames(table, c("age_band", "sex", "latest_ethnicity_group", - "imd_quintile", "rurality_classification", - "smoking_status", "hazardous_drinking", "drug_usage", - "has_asthma", "has_copd", "has_pulmonary_fibrosis", - "has_cystic_fibrosis", "has_diabetes", "has_addisons" , - "severe_obesity", "has_chd", "has_ckd", "has_cld", - "has_cnd", "has_cancer", "immunosuppressed", - "has_sickle_cell", "has_heart_failure", - "has_coronary_heart_disease", "flu_vaccination"), - c("Age_Group", "Sex", "Ethnicity", "IMD", "Rurality", - "Smoking Status", "Hazardous Drinking", "Drug Usage", - "Asthma", "COPD", "Pulmonary Fibrosis", "Cystic Fibrosis", - "Diabetes", "Addisons", "Severe Obesity", "Chronic Heart Disease", - "Chronic Kidney Disease", "Chronic Liver Disease", - "Chronic Neurological Disease", "Cancer Within 3 Years", - "Immunosuppressed", "Sickle Cell Disease", "Heart Failure", - "Coronary Heart Disease", "Flu Vaccine")) - if (study_start_date >= covid_season_min) { - table[, covid_vaccination_count := df_datatable$covid_vaccination_count] - setnames(table, "covid_vaccination_count", "Covid Vaccine Doses") - } - } -} +## create output directories ---- +fs::dir_create(here("output", "models")) -table %>% - tbl_summary() %>% - as_gt() %>% - gt::gtsave(filename = paste0("table1_", cohort, "_", year(study_start_date), - "_", year(study_end_date), "_", codelist_type, "_", - investigation_type,".html"), path = here::here("output")) +lab <- ifelse(cohort == "infants", "Age (Months)", + ifelse(cohort == "infants_subgroup", "Age (Months)", "Age (Years)")) -table %>% - tbl_summary() %>% - as_tibble() %>% - write_csv(path = paste0(here::here("output"), "/", "table1_", cohort, "_", year(study_start_date), - "_", year(study_end_date), "_", codelist_type, "_", - investigation_type,".csv")) +plot_age <- ggplot(data = df_input, aes(age, frequency(age))) + geom_col(width = 0.9) + + xlab(lab) + ylab("Frequency") -# table_sum <- tbl_summary(table) -# table_redacted <- redact_tblsummary(table_sum, 7) -# var_labels <- colnames(table_sum) -# raw_stats <- table_sum$meta_data %>% -# select(var_label, df_stats) %>% -# unnest(df_stats) -# threshold = 7 -# raw_stats_redacted <- raw_stats %>% -# mutate( -# n = roundmid_any(n, threshold), -# N = roundmid_any(N, threshold), -# p = n / N, -# N_miss = roundmid_any(N_miss, threshold), -# N_obs = roundmid_any(N_obs, threshold), -# p_miss = N_miss / N_obs, -# N_nonmiss = roundmid_any(N_nonmiss, threshold), -# p_nonmiss = N_nonmiss / N_obs, -# var_label = factor(var_label, levels = map_chr(var_labels[-c(1, 2)], ~ last(as.character(.)))), -# variable_levels = replace_na(as.character(variable_levels), "") -# ) -# -# raw_stats_redacted %>% -# write.csv(filename = paste0("table1_raw_", cohort, "_", year(study_start_date), -# "_", year(study_end_date), "_", codelist_type, "_", -# investigation_type,".xlsx"), path = here::here("output")) +ggsave( + plot = plot_age, + filename = paste0("descriptive_", cohort, "_", year(study_start_date), + "_", year(study_end_date), "_", codelist_type, "_", + investigation_type,".png"), path = here::here("output", "models"), +) diff --git a/analysis/table_collation.R b/analysis/table_collation.R new file mode 100644 index 0000000..c3a30db --- /dev/null +++ b/analysis/table_collation.R @@ -0,0 +1,53 @@ +library(tidyverse) +library(here) +library(arrow) +library(ggplot2) + +#define cohort +args <- commandArgs(trailingOnly = TRUE) +if (length(args) == 0) { + cohort <- "adults" +} else { + cohort <- args[[1]] +} + +# import flow chart info by cohort +collated_table1 = rbind( + read_csv(here::here("output", paste0("flow_chart_processed_", cohort, + "_2016_2017.csv"))) %>% mutate(subset = "2016_17"), + read_csv(here::here("output", paste0("flow_chart_processed_", cohort, + "_2017_2018.csv"))) %>% mutate(subset = "2017_18"), + read_csv(here::here("output", paste0("flow_chart_processed_", cohort, + "_2018_2019.csv"))) %>% mutate(subset = "2018_19"), + read_csv(here::here("output", paste0("flow_chart_processed_", cohort, + "_2019_2020.csv"))) %>% mutate(subset = "2019_20"), + read_csv(here::here("output", paste0("flow_chart_processed_", cohort, + "_2020_2021.csv"))) %>% mutate(subset = "2020_21"), + read_csv(here::here("output", paste0("flow_chart_processed_", cohort, + "_2021_2022.csv"))) %>% mutate(subset = "2021_22"), + read_csv(here::here("output", paste0("flow_chart_processed_", cohort, + "_2022_2023.csv"))) %>% mutate(subset = "2022_23") +) + +#save as csv +write_csv(collated_table1, paste0(here::here("output"), "/", "table1_collated", + cohort, "_", year(study_start_date), "_", year(study_end_date),".csv")) + +# import table 1 by cohort +collated_table1 = rbind( + read_csv(here::here("output", paste0("table1_", cohort, "_2016_2017.csv"))) %>% mutate(subset = "2016_17"), + read_csv(here::here("output", paste0("table1_", cohort, "_2017_2018.csv"))) %>% mutate(subset = "2017_18"), + read_csv(here::here("output", paste0("table1_", cohort, "_2018_2019.csv"))) %>% mutate(subset = "2018_19"), + read_csv(here::here("output", paste0("table1_", cohort, "_2019_2020.csv"))) %>% mutate(subset = "2019_20"), + read_csv(here::here("output", paste0("table1_", cohort, "_2020_2021.csv"))) %>% mutate(subset = "2020_21"), + read_csv(here::here("output", paste0("table1_", cohort, "_2021_2022.csv"))) %>% mutate(subset = "2021_22"), + read_csv(here::here("output", paste0("table1_", cohort, "_2022_2023.csv"))) %>% mutate(subset = "2022_23") +) + +## create output directories ---- +fs::dir_create(here("output", "table1")) + +#save as csv +write_csv(collated_table1, paste0(here::here("output", "table1"), "/", + "table1_collated", cohort, "_", year(study_start_date), "_", + year(study_end_date),".csv")) diff --git a/project.yaml b/project.yaml index fe5b6b2..c36a939 100644 --- a/project.yaml +++ b/project.yaml @@ -8,336 +8,406 @@ actions: ### exploratory analysis +## season 1 (2016/17) - cohort inclusion + generate_flow_chart_data_older_adults_s1: run: > ehrql:v1 generate-dataset analysis/dataset_definition_flow_chart.py - --output output/older_adults_2016_2017_flow_chart.arrow + --output output/flow_chart/older_adults_2016_2017_flow_chart.csv + --dummy-data-file analysis/dummydata/dummyextract_older_adults_2016_2017.arrow -- older_adults season1_start_date season1_end_date outputs: highly_sensitive: - dataset: output/older_adults_2016_2017_flow_chart.arrow - -# ### primary analysis -# -# ## season 1 (2016/17) -# -# generate_dataset_older_adults_s1_spec: -# run: > -# ehrql:v1 generate-dataset analysis/dataset_definition.py -# --output output/input_older_adults_2016_2017_specific_primary.arrow -# --dummy-data-file analysis/dummydata/dummyextract_older_adults_2016_2017.arrow -# -- older_adults season1_start_date season1_end_date specific primary -# outputs: -# highly_sensitive: -# dataset: output/input_older_adults_2016_2017_specific_primary.arrow -# -# process_dataset_older_adults_s1_spec: -# run: r:latest analysis/data_processing.R older_adults season1_start_date season1_end_date specific primary -# needs: [generate_dataset_older_adults_s1_spec] -# outputs: -# highly_sensitive: -# cohort: output/input_processed_older_adults_2016_2017_specific_primary.arrow -# -# describe_dataset_older_adults_s1_spec: -# run: r:latest analysis/report.R older_adults season1_start_date season1_end_date specific primary -# needs: [process_dataset_older_adults_s1_spec] -# outputs: -# moderately_sensitive: -# png: output/descriptive_older_adults_2016_2017_specific_primary.png -# html: output/table1_older_adults_2016_2017_specific_primary.html -# csv: output/table1_older_adults_2016_2017_specific_primary.csv -# -# generate_dataset_adults_s1_spec: -# run: > -# ehrql:v1 generate-dataset analysis/dataset_definition.py -# --output output/input_adults_2016_2017_specific_primary.arrow -# --dummy-data-file analysis/dummydata/dummyextract_adults_2016_2017.arrow -# -- adults season1_start_date season1_end_date specific primary -# outputs: -# highly_sensitive: -# dataset: output/input_adults_2016_2017_specific_primary.arrow -# -# process_dataset_adults_s1_spec: -# run: r:latest analysis/data_processing.R adults season1_start_date season1_end_date specific primary -# needs: [generate_dataset_adults_s1_spec] -# outputs: -# highly_sensitive: -# cohort: output/input_processed_adults_2016_2017_specific_primary.arrow -# -# describe_dataset_adults_s1_spec: -# run: r:latest analysis/report.R adults season1_start_date season1_end_date specific primary -# needs: [process_dataset_adults_s1_spec] -# outputs: -# moderately_sensitive: -# png: output/descriptive_adults_2016_2017_specific_primary.png -# html: output/table1_adults_2016_2017_specific_primary.html -# csv: output/table1_adults_2016_2017_specific_primary.csv -# -# generate_dataset_children_and_adolescents_s1_spec: -# run: > -# ehrql:v1 generate-dataset analysis/dataset_definition.py -# --output output/input_children_and_adolescents_2016_2017_specific_primary.arrow -# --dummy-data-file analysis/dummydata/dummyextract_children_and_adolescents_2016_2017.arrow -# -- children_and_adolescents season1_start_date season1_end_date specific primary -# outputs: -# highly_sensitive: -# dataset: output/input_children_and_adolescents_2016_2017_specific_primary.arrow -# -# process_dataset_children_and_adolescents_s1_spec: -# run: r:latest analysis/data_processing.R children_and_adolescents season1_start_date season1_end_date specific primary -# needs: [generate_dataset_children_and_adolescents_s1_spec] -# outputs: -# highly_sensitive: -# cohort: output/input_processed_children_and_adolescents_2016_2017_specific_primary.arrow -# -# describe_dataset_children_and_adolescents_s1_spec: -# run: r:latest analysis/report.R children_and_adolescents season1_start_date season1_end_date specific primary -# needs: [process_dataset_children_and_adolescents_s1_spec] -# outputs: -# moderately_sensitive: -# png: output/descriptive_children_and_adolescents_2016_2017_specific_primary.png -# html: output/table1_children_and_adolescents_2016_2017_specific_primary.html -# csv: output/table1_children_and_adolescents_2016_2017_specific_primary.csv -# -# generate_dataset_infants_s1_spec: -# run: > -# ehrql:v1 generate-dataset analysis/dataset_definition.py -# --output output/input_infants_2016_2017_specific_primary.arrow -# --dummy-data-file analysis/dummydata/dummyextract_infants_2016_2017.arrow -# -- infants season1_start_date season1_end_date specific primary -# outputs: -# highly_sensitive: -# dataset: output/input_infants_2016_2017_specific_primary.arrow -# -# process_dataset_infants_s1_spec: -# run: r:latest analysis/data_processing.R infants season1_start_date season1_end_date specific primary -# needs: [generate_dataset_infants_s1_spec] -# outputs: -# highly_sensitive: -# cohort: output/input_processed_infants_2016_2017_specific_primary.arrow -# -# describe_dataset_infants_s1_spec: -# run: r:latest analysis/report.R infants season1_start_date season1_end_date specific primary -# needs: [process_dataset_infants_s1_spec] -# outputs: -# moderately_sensitive: -# png: output/descriptive_infants_2016_2017_specific_primary.png -# html: output/table1_infants_2016_2017_specific_primary.html -# csv: output/table1_infants_2016_2017_specific_primary.csv -# -# ### sensitivity analysis -# -# ## season 1 (2016/17) -# -# generate_dataset_older_adults_s1_sens: -# run: > -# ehrql:v1 generate-dataset analysis/dataset_definition.py -# --output output/input_older_adults_2016_2017_sensitive_primary.arrow -# --dummy-data-file analysis/dummydata/dummyextract_older_adults_2016_2017.arrow -# -- older_adults season1_start_date season1_end_date sensitive primary -# outputs: -# highly_sensitive: -# dataset: output/input_older_adults_2016_2017_sensitive_primary.arrow -# -# process_dataset_older_adults_s1_sens: -# run: r:latest analysis/data_processing.R older_adults season1_start_date season1_end_date sensitive primary -# needs: [generate_dataset_older_adults_s1_sens] -# outputs: -# highly_sensitive: -# cohort: output/input_processed_older_adults_2016_2017_sensitive_primary.arrow -# -# describe_dataset_older_adults_s1_sens: -# run: r:latest analysis/report.R older_adults season1_start_date season1_end_date sensitive primary -# needs: [process_dataset_older_adults_s1_sens] -# outputs: -# moderately_sensitive: -# png: output/descriptive_older_adults_2016_2017_sensitive_primary.png -# html: output/table1_older_adults_2016_2017_sensitive_primary.html -# csv: output/table1_older_adults_2016_2017_sensitive_primary.csv -# -# generate_dataset_adults_s1_sens: -# run: > -# ehrql:v1 generate-dataset analysis/dataset_definition.py -# --output output/input_adults_2016_2017_sensitive_primary.arrow -# --dummy-data-file analysis/dummydata/dummyextract_adults_2016_2017.arrow -# -- adults season1_start_date season1_end_date sensitive primary -# outputs: -# highly_sensitive: -# dataset: output/input_adults_2016_2017_sensitive_primary.arrow -# -# process_dataset_adults_s1_sens: -# run: r:latest analysis/data_processing.R adults season1_start_date season1_end_date sensitive primary -# needs: [generate_dataset_adults_s1_sens] -# outputs: -# highly_sensitive: -# cohort: output/input_processed_adults_2016_2017_sensitive_primary.arrow -# -# describe_dataset_adults_s1_sens: -# run: r:latest analysis/report.R adults season1_start_date season1_end_date sensitive primary -# needs: [process_dataset_adults_s1_sens] -# outputs: -# moderately_sensitive: -# png: output/descriptive_adults_2016_2017_sensitive_primary.png -# html: output/table1_adults_2016_2017_sensitive_primary.html -# csv: output/table1_adults_2016_2017_sensitive_primary.csv -# -# generate_dataset_children_and_adolescents_s1_sens: -# run: > -# ehrql:v1 generate-dataset analysis/dataset_definition.py -# --output output/input_children_and_adolescents_2016_2017_sensitive_primary.arrow -# --dummy-data-file analysis/dummydata/dummyextract_children_and_adolescents_2016_2017.arrow -# -- children_and_adolescents season1_start_date season1_end_date sensitive primary -# outputs: -# highly_sensitive: -# dataset: output/input_children_and_adolescents_2016_2017_sensitive_primary.arrow -# -# process_dataset_children_and_adolescents_s1_sens: -# run: r:latest analysis/data_processing.R children_and_adolescents season1_start_date season1_end_date sensitive primary -# needs: [generate_dataset_children_and_adolescents_s1_sens] -# outputs: -# highly_sensitive: -# cohort: output/input_processed_children_and_adolescents_2016_2017_sensitive_primary.arrow -# -# describe_dataset_children_and_adolescents_s1_sens: -# run: r:latest analysis/report.R children_and_adolescents season1_start_date season1_end_date sensitive primary -# needs: [process_dataset_children_and_adolescents_s1_sens] -# outputs: -# moderately_sensitive: -# png: output/descriptive_children_and_adolescents_2016_2017_sensitive_primary.png -# html: output/table1_children_and_adolescents_2016_2017_sensitive_primary.html -# csv: output/table1_children_and_adolescents_2016_2017_sensitive_primary.csv -# -# generate_dataset_infants_s1_sens: -# run: > -# ehrql:v1 generate-dataset analysis/dataset_definition.py -# --output output/input_infants_2016_2017_sensitive_primary.arrow -# --dummy-data-file analysis/dummydata/dummyextract_infants_2016_2017.arrow -# -- infants season1_start_date season1_end_date sensitive primary -# outputs: -# highly_sensitive: -# dataset: output/input_infants_2016_2017_sensitive_primary.arrow -# -# process_dataset_infants_s1_sens: -# run: r:latest analysis/data_processing.R infants season1_start_date season1_end_date sensitive primary -# needs: [generate_dataset_infants_s1_sens] -# outputs: -# highly_sensitive: -# cohort: output/input_processed_infants_2016_2017_sensitive_primary.arrow -# -# describe_dataset_infants_s1_sens: -# run: r:latest analysis/report.R infants season1_start_date season1_end_date sensitive primary -# needs: [process_dataset_infants_s1_sens] -# outputs: -# moderately_sensitive: -# png: output/descriptive_infants_2016_2017_sensitive_primary.png -# html: output/table1_infants_2016_2017_sensitive_primary.html -# csv: output/table1_infants_2016_2017_sensitive_primary.csv -# -# ### secondary analysis -# -# ## season 1 (2016/17) -# -# generate_dataset_older_adults_s1_sens_sec: -# run: > -# ehrql:v1 generate-dataset analysis/dataset_definition.py -# --output output/input_older_adults_2016_2017_sensitive_secondary.arrow -# --dummy-data-file analysis/dummydata/dummyextract_older_adults_2016_2017.arrow -# -- older_adults season1_start_date season1_end_date sensitive secondary -# outputs: -# highly_sensitive: -# dataset: output/input_older_adults_2016_2017_sensitive_secondary.arrow -# -# process_dataset_older_adults_s1_sens_sec: -# run: r:latest analysis/data_processing.R older_adults season1_start_date season1_end_date sensitive secondary -# needs: [generate_dataset_older_adults_s1_sens_sec] -# outputs: -# highly_sensitive: -# cohort: output/input_processed_older_adults_2016_2017_sensitive_secondary.arrow -# -# describe_dataset_older_adults_s1_sens_sec: -# run: r:latest analysis/report.R older_adults season1_start_date season1_end_date sensitive secondary -# needs: [process_dataset_older_adults_s1_sens_sec] -# outputs: -# moderately_sensitive: -# png: output/descriptive_older_adults_2016_2017_sensitive_secondary.png -# html: output/table1_older_adults_2016_2017_sensitive_secondary.html -# csv: output/table1_older_adults_2016_2017_sensitive_secondary.csv -# -# generate_dataset_adults_s1_sens_sec: -# run: > -# ehrql:v1 generate-dataset analysis/dataset_definition.py -# --output output/input_adults_2016_2017_sensitive_secondary.arrow -# --dummy-data-file analysis/dummydata/dummyextract_adults_2016_2017.arrow -# -- adults season1_start_date season1_end_date sensitive secondary -# outputs: -# highly_sensitive: -# dataset: output/input_adults_2016_2017_sensitive_secondary.arrow -# -# process_dataset_adults_s1_sens_sec: -# run: r:latest analysis/data_processing.R adults season1_start_date season1_end_date sensitive secondary -# needs: [generate_dataset_adults_s1_sens_sec] -# outputs: -# highly_sensitive: -# cohort: output/input_processed_adults_2016_2017_sensitive_secondary.arrow -# -# describe_dataset_adults_s1_sens_sec: -# run: r:latest analysis/report.R adults season1_start_date season1_end_date sensitive secondary -# needs: [process_dataset_adults_s1_sens_sec] -# outputs: -# moderately_sensitive: -# png: output/descriptive_adults_2016_2017_sensitive_secondary.png -# html: output/table1_adults_2016_2017_sensitive_secondary.html -# csv: output/table1_adults_2016_2017_sensitive_secondary.csv -# -# generate_dataset_children_and_adolescents_s1_sens_sec: -# run: > -# ehrql:v1 generate-dataset analysis/dataset_definition.py -# --output output/input_children_and_adolescents_2016_2017_sensitive_secondary.arrow -# --dummy-data-file analysis/dummydata/dummyextract_children_and_adolescents_2016_2017.arrow -# -- children_and_adolescents season1_start_date season1_end_date sensitive secondary -# outputs: -# highly_sensitive: -# dataset: output/input_children_and_adolescents_2016_2017_sensitive_secondary.arrow -# -# process_dataset_children_and_adolescents_s1_sens_sec: -# run: r:latest analysis/data_processing.R children_and_adolescents season1_start_date season1_end_date sensitive secondary -# needs: [generate_dataset_children_and_adolescents_s1_sens_sec] -# outputs: -# highly_sensitive: -# cohort: output/input_processed_children_and_adolescents_2016_2017_sensitive_secondary.arrow -# -# describe_dataset_children_and_adolescents_s1_sens_sec: -# run: r:latest analysis/report.R children_and_adolescents season1_start_date season1_end_date sensitive secondary -# needs: [process_dataset_children_and_adolescents_s1_sens_sec] -# outputs: -# moderately_sensitive: -# png: output/descriptive_children_and_adolescents_2016_2017_sensitive_secondary.png -# html: output/table1_children_and_adolescents_2016_2017_sensitive_secondary.html -# csv: output/table1_children_and_adolescents_2016_2017_sensitive_secondary.csv -# -# generate_dataset_infants_s1_sens_sec: -# run: > -# ehrql:v1 generate-dataset analysis/dataset_definition.py -# --output output/input_infants_2016_2017_sensitive_secondary.arrow -# --dummy-data-file analysis/dummydata/dummyextract_infants_2016_2017.arrow -# -- infants season1_start_date season1_end_date sensitive secondary -# outputs: -# highly_sensitive: -# dataset: output/input_infants_2016_2017_sensitive_secondary.arrow -# -# process_dataset_infants_s1_sens_sec: -# run: r:latest analysis/data_processing.R infants season1_start_date season1_end_date sensitive secondary -# needs: [generate_dataset_infants_s1_sens_sec] -# outputs: -# highly_sensitive: -# cohort: output/input_processed_infants_2016_2017_sensitive_secondary.arrow -# -# describe_dataset_infants_s1_sens_sec: -# run: r:latest analysis/report.R infants season1_start_date season1_end_date sensitive secondary -# needs: [process_dataset_infants_s1_sens_sec] -# outputs: -# moderately_sensitive: -# png: output/descriptive_infants_2016_2017_sensitive_secondary.png -# html: output/table1_infants_2016_2017_sensitive_secondary.html -# csv: output/table1_infants_2016_2017_sensitive_secondary.csv -# \ No newline at end of file + dataset: output/flow_chart/older_adults_2016_2017_flow_chart.csv + + process_flow_chart_older_adults_s1: + run: r:latest analysis/cohort_criteria.R older_adults season1_start_date season1_end_date + needs: [generate_flow_chart_data_older_adults_s1] + outputs: + highly_sensitive: + cohort: output/flow_chart/flow_chart_processed_older_adults_2016_2017.csv + + generate_flow_chart_data_adults_s1: + run: > + ehrql:v1 generate-dataset analysis/dataset_definition_flow_chart.py + --output output/flow_chart/adults_2016_2017_flow_chart.csv + --dummy-data-file analysis/dummydata/dummyextract_adults_2016_2017.arrow + -- adults season1_start_date season1_end_date + outputs: + highly_sensitive: + dataset: output/flow_chart/adults_2016_2017_flow_chart.csv + + process_flow_chart_adults_s1: + run: r:latest analysis/cohort_criteria.R adults season1_start_date season1_end_date + needs: [generate_flow_chart_data_adults_s1] + outputs: + highly_sensitive: + cohort: output/flow_chart/flow_chart_processed_adults_2016_2017.csv + + generate_flow_chart_data_children_and_adolescents_s1: + run: > + ehrql:v1 generate-dataset analysis/dataset_definition_flow_chart.py + --output output/flow_chart/children_and_adolescents_2016_2017_flow_chart.csv + --dummy-data-file analysis/dummydata/dummyextract_children_and_adolescents_2016_2017.arrow + -- children_and_adolescents season1_start_date season1_end_date + outputs: + highly_sensitive: + dataset: output/flow_chart/children_and_adolescents_2016_2017_flow_chart.csv + + process_flow_chart_children_and_adolescents_s1: + run: r:latest analysis/cohort_criteria.R children_and_adolescents season1_start_date season1_end_date + needs: [generate_flow_chart_data_children_and_adolescents_s1] + outputs: + highly_sensitive: + cohort: output/flow_chart/flow_chart_processed_children_and_adolescents_2016_2017.csv + + generate_flow_chart_data_infants_s1: + run: > + ehrql:v1 generate-dataset analysis/dataset_definition_flow_chart.py + --output output/flow_chart/infants_2016_2017_flow_chart.csv + --dummy-data-file analysis/dummydata/dummyextract_infants_2016_2017.arrow + -- infants season1_start_date season1_end_date + outputs: + highly_sensitive: + dataset: output/flow_chart/infants_2016_2017_flow_chart.csv + + process_flow_chart_infants_s1: + run: r:latest analysis/cohort_criteria.R infants season1_start_date season1_end_date + needs: [generate_flow_chart_data_infants_s1] + outputs: + highly_sensitive: + cohort: output/flow_chart/flow_chart_processed_infants_2016_2017.csv + +## season 1 (2016/17) - cohort description + + describe_cohort_older_adults_s1: + run: r:latest analysis/cohort_description.R older_adults season1_start_date season1_end_date sensitive secondary + needs: [process_dataset_older_adults_s1_sens_sec] + outputs: + moderately_sensitive: + html: output/table1/table1_older_adults_2016_2017.html + csv: output/table1/table1_older_adults_2016_2017.csv + + describe_cohort_adults_s1: + run: r:latest analysis/cohort_description.R adults season1_start_date season1_end_date sensitive secondary + needs: [process_dataset_adults_s1_sens_sec] + outputs: + moderately_sensitive: + html: output/table1/table1_adults_2016_2017.html + csv: output/table1/table1_adults_2016_2017.csv + + describe_cohort_children_and_adolescents_s1: + run: r:latest analysis/cohort_description.R children_and_adolescents season1_start_date season1_end_date sensitive secondary + needs: [process_dataset_children_and_adolescents_s1_sens_sec] + outputs: + moderately_sensitive: + html: output/table1/table1_children_and_adolescents_2016_2017.html + csv: output/table1/table1_children_and_adolescents_2016_2017.csv + + describe_cohort_infants_s1: + run: r:latest analysis/cohort_description.R infants season1_start_date season1_end_date sensitive secondary + needs: [process_dataset_infants_s1_sens_sec] + outputs: + moderately_sensitive: + html: output/table1/table1_infants_2016_2017.html + csv: output/table1/table1_infants_2016_2017.csv + +### primary analysis + +## season 1 (2016/17) + + generate_dataset_older_adults_s1_spec: + run: > + ehrql:v1 generate-dataset analysis/dataset_definition.py + --output output/data/input_older_adults_2016_2017_specific_primary.arrow + --dummy-data-file analysis/dummydata/dummyextract_older_adults_2016_2017.arrow + -- older_adults season1_start_date season1_end_date specific primary + outputs: + highly_sensitive: + dataset: output/data/input_older_adults_2016_2017_specific_primary.arrow + + process_dataset_older_adults_s1_spec: + run: r:latest analysis/data_processing.R older_adults season1_start_date season1_end_date specific primary + needs: [generate_dataset_older_adults_s1_spec] + outputs: + highly_sensitive: + cohort: output/data/input_processed_older_adults_2016_2017_specific_primary.arrow + + describe_dataset_older_adults_s1_spec: + run: r:latest analysis/report.R older_adults season1_start_date season1_end_date specific primary + needs: [process_dataset_older_adults_s1_spec] + outputs: + moderately_sensitive: + png: output/models/descriptive_older_adults_2016_2017_specific_primary.png + + generate_dataset_adults_s1_spec: + run: > + ehrql:v1 generate-dataset analysis/dataset_definition.py + --output output/data/input_adults_2016_2017_specific_primary.arrow + --dummy-data-file analysis/dummydata/dummyextract_adults_2016_2017.arrow + -- adults season1_start_date season1_end_date specific primary + outputs: + highly_sensitive: + dataset: output/data/input_adults_2016_2017_specific_primary.arrow + + process_dataset_adults_s1_spec: + run: r:latest analysis/data_processing.R adults season1_start_date season1_end_date specific primary + needs: [generate_dataset_adults_s1_spec] + outputs: + highly_sensitive: + cohort: output/data/input_processed_adults_2016_2017_specific_primary.arrow + + describe_dataset_adults_s1_spec: + run: r:latest analysis/report.R adults season1_start_date season1_end_date specific primary + needs: [process_dataset_adults_s1_spec] + outputs: + moderately_sensitive: + png: output/models/descriptive_adults_2016_2017_specific_primary.png + + generate_dataset_children_and_adolescents_s1_spec: + run: > + ehrql:v1 generate-dataset analysis/dataset_definition.py + --output output/data/input_children_and_adolescents_2016_2017_specific_primary.arrow + --dummy-data-file analysis/dummydata/dummyextract_children_and_adolescents_2016_2017.arrow + -- children_and_adolescents season1_start_date season1_end_date specific primary + outputs: + highly_sensitive: + dataset: output/data/input_children_and_adolescents_2016_2017_specific_primary.arrow + + process_dataset_children_and_adolescents_s1_spec: + run: r:latest analysis/data_processing.R children_and_adolescents season1_start_date season1_end_date specific primary + needs: [generate_dataset_children_and_adolescents_s1_spec] + outputs: + highly_sensitive: + cohort: output/data/input_processed_children_and_adolescents_2016_2017_specific_primary.arrow + + describe_dataset_children_and_adolescents_s1_spec: + run: r:latest analysis/report.R children_and_adolescents season1_start_date season1_end_date specific primary + needs: [process_dataset_children_and_adolescents_s1_spec] + outputs: + moderately_sensitive: + png: output/models/descriptive_children_and_adolescents_2016_2017_specific_primary.png + + generate_dataset_infants_s1_spec: + run: > + ehrql:v1 generate-dataset analysis/dataset_definition.py + --output output/data/input_infants_2016_2017_specific_primary.arrow + --dummy-data-file analysis/dummydata/dummyextract_infants_2016_2017.arrow + -- infants season1_start_date season1_end_date specific primary + outputs: + highly_sensitive: + dataset: output/data/input_infants_2016_2017_specific_primary.arrow + + process_dataset_infants_s1_spec: + run: r:latest analysis/data_processing.R infants season1_start_date season1_end_date specific primary + needs: [generate_dataset_infants_s1_spec] + outputs: + highly_sensitive: + cohort: output/data/input_processed_infants_2016_2017_specific_primary.arrow + + describe_dataset_infants_s1_spec: + run: r:latest analysis/report.R infants season1_start_date season1_end_date specific primary + needs: [process_dataset_infants_s1_spec] + outputs: + moderately_sensitive: + png: output/models/descriptive_infants_2016_2017_specific_primary.png + +### sensitivity analysis + +## season 1 (2016/17) + + generate_dataset_older_adults_s1_sens: + run: > + ehrql:v1 generate-dataset analysis/dataset_definition.py + --output output/data/input_older_adults_2016_2017_sensitive_primary.arrow + --dummy-data-file analysis/dummydata/dummyextract_older_adults_2016_2017.arrow + -- older_adults season1_start_date season1_end_date sensitive primary + outputs: + highly_sensitive: + dataset: output/data/input_older_adults_2016_2017_sensitive_primary.arrow + + process_dataset_older_adults_s1_sens: + run: r:latest analysis/data_processing.R older_adults season1_start_date season1_end_date sensitive primary + needs: [generate_dataset_older_adults_s1_sens] + outputs: + highly_sensitive: + cohort: output/data/input_processed_older_adults_2016_2017_sensitive_primary.arrow + + describe_dataset_older_adults_s1_sens: + run: r:latest analysis/report.R older_adults season1_start_date season1_end_date sensitive primary + needs: [process_dataset_older_adults_s1_sens] + outputs: + moderately_sensitive: + png: output/models/descriptive_older_adults_2016_2017_sensitive_primary.png + + generate_dataset_adults_s1_sens: + run: > + ehrql:v1 generate-dataset analysis/dataset_definition.py + --output output/data/input_adults_2016_2017_sensitive_primary.arrow + --dummy-data-file analysis/dummydata/dummyextract_adults_2016_2017.arrow + -- adults season1_start_date season1_end_date sensitive primary + outputs: + highly_sensitive: + dataset: output/data/input_adults_2016_2017_sensitive_primary.arrow + + process_dataset_adults_s1_sens: + run: r:latest analysis/data_processing.R adults season1_start_date season1_end_date sensitive primary + needs: [generate_dataset_adults_s1_sens] + outputs: + highly_sensitive: + cohort: output/data/input_processed_adults_2016_2017_sensitive_primary.arrow + + describe_dataset_adults_s1_sens: + run: r:latest analysis/report.R adults season1_start_date season1_end_date sensitive primary + needs: [process_dataset_adults_s1_sens] + outputs: + moderately_sensitive: + png: output/models/descriptive_adults_2016_2017_sensitive_primary.png + + generate_dataset_children_and_adolescents_s1_sens: + run: > + ehrql:v1 generate-dataset analysis/dataset_definition.py + --output output/data/input_children_and_adolescents_2016_2017_sensitive_primary.arrow + --dummy-data-file analysis/dummydata/dummyextract_children_and_adolescents_2016_2017.arrow + -- children_and_adolescents season1_start_date season1_end_date sensitive primary + outputs: + highly_sensitive: + dataset: output/data/input_children_and_adolescents_2016_2017_sensitive_primary.arrow + + process_dataset_children_and_adolescents_s1_sens: + run: r:latest analysis/data_processing.R children_and_adolescents season1_start_date season1_end_date sensitive primary + needs: [generate_dataset_children_and_adolescents_s1_sens] + outputs: + highly_sensitive: + cohort: output/data/input_processed_children_and_adolescents_2016_2017_sensitive_primary.arrow + + describe_dataset_children_and_adolescents_s1_sens: + run: r:latest analysis/report.R children_and_adolescents season1_start_date season1_end_date sensitive primary + needs: [process_dataset_children_and_adolescents_s1_sens] + outputs: + moderately_sensitive: + png: output/models/descriptive_children_and_adolescents_2016_2017_sensitive_primary.png + + generate_dataset_infants_s1_sens: + run: > + ehrql:v1 generate-dataset analysis/dataset_definition.py + --output output/data/input_infants_2016_2017_sensitive_primary.arrow + --dummy-data-file analysis/dummydata/dummyextract_infants_2016_2017.arrow + -- infants season1_start_date season1_end_date sensitive primary + outputs: + highly_sensitive: + dataset: output/data/input_infants_2016_2017_sensitive_primary.arrow + + process_dataset_infants_s1_sens: + run: r:latest analysis/data_processing.R infants season1_start_date season1_end_date sensitive primary + needs: [generate_dataset_infants_s1_sens] + outputs: + highly_sensitive: + cohort: output/data/input_processed_infants_2016_2017_sensitive_primary.arrow + + describe_dataset_infants_s1_sens: + run: r:latest analysis/report.R infants season1_start_date season1_end_date sensitive primary + needs: [process_dataset_infants_s1_sens] + outputs: + moderately_sensitive: + png: output/models/descriptive_infants_2016_2017_sensitive_primary.png + +### secondary analysis + +## season 1 (2016/17) + + generate_dataset_older_adults_s1_sens_sec: + run: > + ehrql:v1 generate-dataset analysis/dataset_definition.py + --output output/data/input_older_adults_2016_2017_sensitive_secondary.arrow + --dummy-data-file analysis/dummydata/dummyextract_older_adults_2016_2017.arrow + -- older_adults season1_start_date season1_end_date sensitive secondary + outputs: + highly_sensitive: + dataset: output/data/input_older_adults_2016_2017_sensitive_secondary.arrow + + process_dataset_older_adults_s1_sens_sec: + run: r:latest analysis/data_processing.R older_adults season1_start_date season1_end_date sensitive secondary + needs: [generate_dataset_older_adults_s1_sens_sec] + outputs: + highly_sensitive: + cohort: output/data/input_processed_older_adults_2016_2017_sensitive_secondary.arrow + + describe_dataset_older_adults_s1_sens_sec: + run: r:latest analysis/report.R older_adults season1_start_date season1_end_date sensitive secondary + needs: [process_dataset_older_adults_s1_sens_sec] + outputs: + moderately_sensitive: + png: output/models/descriptive_older_adults_2016_2017_sensitive_secondary.png + + generate_dataset_adults_s1_sens_sec: + run: > + ehrql:v1 generate-dataset analysis/dataset_definition.py + --output output/data/input_adults_2016_2017_sensitive_secondary.arrow + --dummy-data-file analysis/dummydata/dummyextract_adults_2016_2017.arrow + -- adults season1_start_date season1_end_date sensitive secondary + outputs: + highly_sensitive: + dataset: output/data/input_adults_2016_2017_sensitive_secondary.arrow + + process_dataset_adults_s1_sens_sec: + run: r:latest analysis/data_processing.R adults season1_start_date season1_end_date sensitive secondary + needs: [generate_dataset_adults_s1_sens_sec] + outputs: + highly_sensitive: + cohort: output/data/input_processed_adults_2016_2017_sensitive_secondary.arrow + + describe_dataset_adults_s1_sens_sec: + run: r:latest analysis/report.R adults season1_start_date season1_end_date sensitive secondary + needs: [process_dataset_adults_s1_sens_sec] + outputs: + moderately_sensitive: + png: output/models/descriptive_adults_2016_2017_sensitive_secondary.png + + generate_dataset_children_and_adolescents_s1_sens_sec: + run: > + ehrql:v1 generate-dataset analysis/dataset_definition.py + --output output/data/input_children_and_adolescents_2016_2017_sensitive_secondary.arrow + --dummy-data-file analysis/dummydata/dummyextract_children_and_adolescents_2016_2017.arrow + -- children_and_adolescents season1_start_date season1_end_date sensitive secondary + outputs: + highly_sensitive: + dataset: output/data/input_children_and_adolescents_2016_2017_sensitive_secondary.arrow + + process_dataset_children_and_adolescents_s1_sens_sec: + run: r:latest analysis/data_processing.R children_and_adolescents season1_start_date season1_end_date sensitive secondary + needs: [generate_dataset_children_and_adolescents_s1_sens_sec] + outputs: + highly_sensitive: + cohort: output/data/input_processed_children_and_adolescents_2016_2017_sensitive_secondary.arrow + + describe_dataset_children_and_adolescents_s1_sens_sec: + run: r:latest analysis/report.R children_and_adolescents season1_start_date season1_end_date sensitive secondary + needs: [process_dataset_children_and_adolescents_s1_sens_sec] + outputs: + moderately_sensitive: + png: output/models/descriptive_children_and_adolescents_2016_2017_sensitive_secondary.png + + generate_dataset_infants_s1_sens_sec: + run: > + ehrql:v1 generate-dataset analysis/dataset_definition.py + --output output/data/input_infants_2016_2017_sensitive_secondary.arrow + --dummy-data-file analysis/dummydata/dummyextract_infants_2016_2017.arrow + -- infants season1_start_date season1_end_date sensitive secondary + outputs: + highly_sensitive: + dataset: output/data/input_infants_2016_2017_sensitive_secondary.arrow + + process_dataset_infants_s1_sens_sec: + run: r:latest analysis/data_processing.R infants season1_start_date season1_end_date sensitive secondary + needs: [generate_dataset_infants_s1_sens_sec] + outputs: + highly_sensitive: + cohort: output/data/input_processed_infants_2016_2017_sensitive_secondary.arrow + + describe_dataset_infants_s1_sens_sec: + run: r:latest analysis/report.R infants season1_start_date season1_end_date sensitive secondary + needs: [process_dataset_infants_s1_sens_sec] + outputs: + moderately_sensitive: + png: output/models/descriptive_infants_2016_2017_sensitive_secondary.png \ No newline at end of file