Skip to content

Commit

Permalink
create participant characteristic tables
Browse files Browse the repository at this point in the history
  • Loading branch information
emprestige committed Mar 7, 2024
1 parent 4165a06 commit 6e9e35b
Show file tree
Hide file tree
Showing 18 changed files with 330 additions and 141 deletions.
Binary file modified .RData
Binary file not shown.
8 changes: 4 additions & 4 deletions analysis/additional_comorbidities.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,13 +140,13 @@ def filter_codes_by_category(codelist, include):
.exists_for_patient()
)
smoking_status = (case(
when(most_recent_smoking_code == "S").then("S"),
when(most_recent_smoking_code == "S").then("Current"),
when((most_recent_smoking_code == "E")
| ((most_recent_smoking_code == "N")
& (ever_smoked == True))).then("E"),
& (ever_smoked == True))).then("Former"),
when((most_recent_smoking_code == "N")
& (ever_smoked == False)).then("N"),
otherwise = "M")
& (ever_smoked == False)).then("Never"),
otherwise = "Unknown")
)

#drinking
Expand Down
2 changes: 1 addition & 1 deletion analysis/codelists.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,7 @@
)

# coronary heart disease
coronary_heard_disease_codelist = codelist_from_csv(
coronary_heart_disease_codelist = codelist_from_csv(
"codelists/nhsd-primary-care-domain-refsets-chd_cod.csv",
column = "code",
)
30 changes: 30 additions & 0 deletions analysis/data_processing.R
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,36 @@ df_input <- df_input %>%
TRUE ~ NA_character_
))

#reverse order of IMD classifications
#recode() returns a new vector rather than modifying its input, so the result
#has to be written back to the column for the reversal to take effect
df_input <- df_input %>%
  mutate(
    imd_quintile = recode(imd_quintile,
                          "1 (most deprived)" = "5 (most deprived)",
                          "2" = "4", "3" = "3", "4" = "2",
                          "5 (least deprived)" = "1 (least deprived)")
  )

#collapse the 8 rural/urban classification codes into 5 rurality levels
#(codes 3-4, 5-6 and 7-8 are each merged into a single level)
df_input <- mutate(
  df_input,
  rurality_code = recode(
    rural_urban_classification,
    "1" = "1",
    "2" = "2",
    "3" = "3", "4" = "3",
    "5" = "4", "6" = "4",
    "7" = "5", "8" = "5"
  )
)

#assign rurality classification
#case_when() treats a missing rurality_code as "no match", so missing codes are
#labelled "Unknown" (nested ifelse() propagated NA and never reached the
#"Unknown" fallback for them)
df_input <- df_input %>%
  mutate(
    rurality_classification = case_when(
      rurality_code == "1" ~ "Urban Major Conurbation",
      rurality_code == "2" ~ "Urban Minor Conurbation",
      rurality_code == "3" ~ "Urban City and Town",
      rurality_code == "4" ~ "Rural Town",
      rurality_code == "5" ~ "Rural Village",
      TRUE ~ "Unknown"
    )
  )

#define household size categories
#  "1" = 1-2 people, "2" = 3-5 people, "3" = 6 or more people
#anything else - including a missing household size - is labelled "Unknown"
#(nested ifelse() propagated NA, so missing sizes never reached the fallback)
df_input <- df_input %>%
  mutate(
    household_size_cat = case_when(
      household_size >= 1 & household_size <= 2 ~ "1",
      household_size >= 3 & household_size <= 5 ~ "2",
      household_size >= 6 ~ "3",
      TRUE ~ "Unknown"
    )
  )

#define seasons for covid
#ISO-formatted ("%Y-%m-%d") strings are parsed by as.Date() without a format
covid_season_min <- as.Date("2019-09-01")

Expand Down
23 changes: 11 additions & 12 deletions analysis/dataset_definition.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@
.for_patient_on(registration_date).exists_for_patient()),
otherwise = practice_registrations.for_patient_on(index_date).exists_for_patient()
)
is_female_or_male = patients.sex.is_in(["female", "male"])
is_female_or_male = patients.sex.is_in(["Female", "Male"])
is_appropriate_age = case(
when(cohort == "older_adults").then((age_at_start <= 110) & (age_at_end >= 65)),
when(cohort == "adults").then((age_at_start <= 64) & (age_at_end >= 18)),
Expand Down Expand Up @@ -280,17 +280,17 @@ def first_infection_event(codelist, where = True):
covid_season_min = datetime.strptime("2019-09-01", "%Y-%m-%d")

#vaccinations
if cohort == "adults" or cohort == "older_adults" or cohort == "children_adolescents" :
flu_vaccination = (
if cohort == "adults" or cohort == "older_adults" or cohort == "children_and_adolescents" :
dataset.flu_vaccination = (
vaccinations.where(vaccinations.target_disease.is_in(["Influenza"]))
.sort_by(vaccinations.date)
.where(vaccinations.date.is_on_or_between(vaccination_date, index_date))
.exists_for_patient()
)

if datetime.strptime(study_start_date, "%Y-%m-%d") >= covid_season_min :
if cohort == "adults" or cohort == "older_adults" or cohort == "children_adolescents" :
covid_vaccination_count = (
if cohort == "adults" or cohort == "older_adults" or cohort == "children_and_adolescents" :
dataset.covid_vaccination_count = (
vaccinations.where(vaccinations.target_disease.is_in(["SARS-COV-2"]))
.sort_by(vaccinations.date)
.where(vaccinations.date.is_on_or_before(index_date))
Expand Down Expand Up @@ -853,7 +853,7 @@ def first_infection_event(codelist, where = True):
has_diabetes, has_addisons, severe_obesity,
has_chd, has_ckd, has_cld, has_cnd, has_crd,
has_cancer, immunosuppressed, has_sickle_cell,
has_heart_failure, has_prior_mi
has_heart_failure, has_coronary_heart_disease
)

if cohort == "adults" or cohort == "older_adults" :
Expand All @@ -878,11 +878,10 @@ def first_infection_event(codelist, where = True):
dataset.has_heart_failure = has_heart_failure
dataset.has_coronary_heart_disease = has_coronary_heart_disease

if cohort == "children_and_adolscents" :
if cohort == "children_and_adolescents" :

dataset.has_reactive_airway = case(
when(dataset.age < 5).then(has_reactive_airway),
default = False
dataset.has_asthma_reactive_airway = case(
when(dataset.age <= 5).then(has_reactive_airway),
when(dataset.age > 5).then(has_asthma),
otherwise = False
)

dataset.has_asthma = has_asthma
14 changes: 7 additions & 7 deletions analysis/dummydata/dummydata_adults.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ sim_list = lst(
#sex of the patient
sex = bn_node(
~ rfactor(n = ..n, levels = c("female", "male", "intersex", "unknown"),
p = c(0.51, 0.49, 0, 0)), missing_rate = ~ 0.001
p = c(0.51, 0.49, 0, 0))
),

#age of the patient
Expand Down Expand Up @@ -128,10 +128,10 @@ sim_list = lst(
#smoking status
smoking_status = bn_node(
~ rfactor(n = ..n, levels = c(
"S", #smoker
"E", #ever-smoked
"N", #never smoked
"M" #missing
"Current", #smoker
"Former", #ever-smoked
"Never", #never smoked
"Unknown" #missing
), p = c(0.1, 0.2, 0.7, 0))
),

Expand All @@ -152,8 +152,8 @@ sim_list = lst(

#copd
has_copd = bn_node(
~ rbernoulli(n = ..n, p = plogis(-1 + I(smoking_status == "S")*-0.5 +
I(smoking_status == "E")*-0.1))
~ rbernoulli(n = ..n, p = plogis(-1 + I(smoking_status == "Current")*-0.5 +
I(smoking_status == "Former")*-0.1))
),

#pulmonary fibrosis
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ sim_list = lst(
#sex of the patient
sex = bn_node(
~ rfactor(n = ..n, levels = c("female", "male", "intersex", "unknown"),
p = c(0.51, 0.49, 0, 0)), missing_rate = ~ 0.001
p = c(0.51, 0.49, 0, 0))
),

#age of the patient
Expand Down Expand Up @@ -126,20 +126,10 @@ sim_list = lst(
##comorbidities

#has asthma
has_asthma = bn_node(
has_asthma_reactive_airway = bn_node(
~ rbernoulli(n = ..n, p = 0.15)
),

#has reactive airway disease
has_reactive_airway = bn_node(
~ rbernoulli(n = ..n, p = 0.05)
),

# #diabetes
# has_diabetes = bn_node(
# ~ rbernoulli(n = ..n, p = plogis(-1 + age*0.02 + I(sex == "female")*-0.2))
# ),


#flu vaccination
flu_vaccination = bn_node(
~ rbernoulli(n = ..n, p = 0.75)
Expand Down Expand Up @@ -307,4 +297,4 @@ dummydata_processed <- dummydata %>%

fs::dir_create(here("analysis", "dummydata"))
write_feather(dummydata_processed, sink = here("analysis", "dummydata",
paste0("dummyextract_children_adolescents_", year(study_start_date), "_", year(study_end_date), ".arrow")))
paste0("dummyextract_children_and_adolescents_", year(study_start_date), "_", year(study_end_date), ".arrow")))
2 changes: 1 addition & 1 deletion analysis/dummydata/dummydata_infants.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ sim_list = lst(
#sex of the patient
sex = bn_node(
~ rfactor(n = ..n, levels = c("female", "male", "intersex", "unknown"),
p = c(0.51, 0.49, 0, 0)), missing_rate = ~0.001
p = c(0.51, 0.49, 0, 0))
),

#age of the patient (months)
Expand Down
10 changes: 5 additions & 5 deletions analysis/dummydata/dummydata_infants_subgroup.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ sim_list = lst(
#sex of the patient
sex = bn_node(
~ rfactor(n = ..n, levels = c("female", "male", "intersex", "unknown"),
p = c(0.51, 0.49, 0, 0)), missing_rate = ~0.001
p = c(0.51, 0.49, 0, 0))
),

#age of the patient
Expand Down Expand Up @@ -146,10 +146,10 @@ sim_list = lst(
#smoking status
maternal_smoking_code = bn_node(
~ rfactor(n = ..n, levels = c(
"S", #smoker
"E", #ever-smoked
"N", #never smoked
"M" #missing
"Current", #smoker
"Former", #ever-smoked
"Never", #never smoked
"Unknown" #missing
), p = c(0.1, 0.2, 0.7, 0))
),

Expand Down
14 changes: 7 additions & 7 deletions analysis/dummydata/dummydata_older_adults.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ sim_list = lst(
#sex of the patient
sex = bn_node(
~ rfactor(n = ..n, levels = c("female", "male", "intersex", "unknown"),
p = c(0.51, 0.49, 0, 0)), missing_rate = ~ 0.001
p = c(0.51, 0.49, 0, 0))
),

#age of the patient
Expand Down Expand Up @@ -128,10 +128,10 @@ sim_list = lst(
#smoking status
smoking_status = bn_node(
~ rfactor(n = ..n, levels = c(
"S", #smoker
"E", #ever-smoked
"N", #never smoked
"M" #missing
"Current", #smoker
"Former", #ever-smoked
"Never", #never smoked
"Unknown" #missing
), p = c(0.1, 0.2, 0.7, 0))
),

Expand All @@ -152,8 +152,8 @@ sim_list = lst(

#copd
has_copd = bn_node(
~ rbernoulli(n = ..n, p = plogis(-1 + I(smoking_status == "S")*-0.5 +
I(smoking_status == "E")*-0.1))
~ rbernoulli(n = ..n, p = plogis(-1 + I(smoking_status == "Current")*-0.5 +
I(smoking_status == "Former")*-0.1))
),

#pulmonary fibrosis
Expand Down
Binary file modified analysis/dummydata/dummyextract_adults_2016_2017.arrow
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified analysis/dummydata/dummyextract_infants_2016_2017.arrow
Binary file not shown.
Binary file modified analysis/dummydata/dummyextract_infants_subgroup_2016_2017.arrow
Binary file not shown.
Binary file modified analysis/dummydata/dummyextract_older_adults_2016_2017.arrow
Binary file not shown.
101 changes: 101 additions & 0 deletions analysis/report.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ library("here")
library("arrow")
library("ggplot2")
library("data.table")
library("gtsummary")

## create output directories ----
fs::dir_create(here("analysis"))
Expand All @@ -15,6 +16,7 @@ study_end_date <- study_dates[[args[[3]]]]
cohort <- args[[1]]
codelist_type <- args[[4]]
investigation_type <- args[[5]]
covid_season_min <- as.Date("2019-09-01")

df_input <- read_feather(
here::here("output", paste0("input_processed_", cohort, "_", year(study_start_date),
Expand All @@ -34,4 +36,103 @@ ggsave(
investigation_type,".png"), path = here::here("output"),
)

## participant characteristics table ----
# Builds `table`: one row per registered patient, holding the cohort size
# (`Total`) plus the characteristics reported for the given cohort and
# investigation type, with columns renamed to their display labels.
#   - infants: demographics only
#   - other cohorts, primary investigation: demographics + vaccination
#   - other cohorts, otherwise: demographics + comorbidities + vaccination

df_datatable <- as.data.table(df_input)

# Restrict to registered patients once. Subsetting returns a new data.table,
# so the columns derived below are not added to df_datatable itself.
registered_patients <- df_datatable[registered == TRUE]
registered_patients[, Total := uniqueN(patient_id)]

is_primary <- investigation_type == "primary"
is_children <- cohort == "children_and_adolescents"

# source column -> display label, in the order the columns appear in the table
characteristic_labels <- c(
  age_band = "Age Group",
  sex = "Sex",
  latest_ethnicity_group = "Ethnicity",
  imd_quintile = "IMD",
  rurality_classification = "Rurality"
)

if (cohort != "infants") {
  if (!is_primary) {
    if (is_children) {
      # has_asthma_reactive_airway is a single age-dependent flag; split it by
      # age so asthma and reactive airway are reported as separate columns
      registered_patients[, `:=`(
        Reactive_Airway = ifelse(age <= 5, has_asthma_reactive_airway, FALSE),
        Asthma = ifelse(age > 5, has_asthma_reactive_airway, FALSE)
      )]
      characteristic_labels <- c(
        characteristic_labels,
        Asthma = "Asthma",
        Reactive_Airway = "Reactive Airway"
      )
    } else {
      characteristic_labels <- c(
        characteristic_labels,
        smoking_status = "Smoking Status",
        hazardous_drinking = "Hazardous Drinking",
        drug_usage = "Drug Usage",
        has_asthma = "Asthma",
        has_copd = "COPD",
        has_pulmonary_fibrosis = "Pulmonary Fibrosis",
        has_cystic_fibrosis = "Cystic Fibrosis",
        has_diabetes = "Diabetes",
        has_addisons = "Addisons",
        severe_obesity = "Severe Obesity",
        has_chd = "Chronic Heart Disease",
        has_ckd = "Chronic Kidney Disease",
        has_cld = "Chronic Liver Disease",
        has_cnd = "Chronic Neurological Disease",
        has_cancer = "Cancer Within 3 Years",
        immunosuppressed = "Immunosuppressed",
        has_sickle_cell = "Sickle Cell Disease",
        has_heart_failure = "Heart Failure",
        has_coronary_heart_disease = "Coronary Heart Disease"
      )
    }
  }

  characteristic_labels <- c(
    characteristic_labels,
    flu_vaccination = "Flu Vaccine"
  )

  # covid vaccination is only reported for seasons from covid_season_min on.
  # The column is taken from the registered subset so its length always
  # matches the table (the unfiltered column does not once any patient is
  # unregistered).
  if (study_start_date >= covid_season_min) {
    characteristic_labels <- c(
      characteristic_labels,
      covid_vaccination_count = "Covid Vaccine Doses"
    )
  }
}

table <- registered_patients[
  , c("Total", names(characteristic_labels)), with = FALSE
]
setnames(table, names(characteristic_labels), unname(characteristic_labels))

# Summarise the participant characteristics and save them as an HTML table in
# the output directory; the file name identifies cohort, study years, codelist
# type and investigation type.
table1_file <- paste0(
  "table1_", cohort, "_", year(study_start_date), "_", year(study_end_date),
  "_", codelist_type, "_", investigation_type, ".html"
)
table1 <- as_gt(tbl_summary(table))
gt::gtsave(table1, filename = table1_file, path = here::here("output"))
Loading

0 comments on commit 6e9e35b

Please sign in to comment.