Skip to content

Commit

Permalink
update workflow
Browse files Browse the repository at this point in the history
  • Loading branch information
emprestige committed Mar 13, 2024
1 parent e74f3cd commit 30a0d22
Show file tree
Hide file tree
Showing 22 changed files with 834 additions and 730 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ model.log
__pycache__
.python-version
/output/*
/post_check/*
metadata/*
venv/
.DS_Store
Expand Down
43 changes: 22 additions & 21 deletions analysis/additional_comorbidities.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,12 +181,6 @@ def filter_codes_by_category(codelist, include):
.is_on_or_after(last_prior_event(codelists
.copd_resolved_codelist).date))
)

#pulmonary fibrosis diagnosis
has_pulmonary_fibrosis = (
has_prior_event(codelists
.pulmonary_fibrosis_codelist)
)

#cystic fibrosis diagnosis
has_cystic_fibrosis = (
Expand All @@ -195,6 +189,21 @@ def filter_codes_by_category(codelist, include):
.exists_for_patient()
)

#pulmonary fibrosis diagnosis
has_pulmonary_fibrosis = (
has_prior_event(codelists
.pulmonary_fibrosis_codelist)
)

#Chronic Respiratory Disease
has_crd = has_prior_event(codelists.crd_codelist)

#other chronic respiratory disease
has_other_resp = (
(has_pulmonary_fibrosis | has_crd) &
(~has_asthma & ~ has_copd & ~ has_cystic_fibrosis)
)

#diabetes diagnosis
diab_date = last_prior_event(codelists.diabetes_codelist).date
dmres_date = last_prior_event(codelists.diabetes_resolved_codelist).date
Expand Down Expand Up @@ -248,9 +257,13 @@ def filter_codes_by_category(codelist, include):
otherwise = False
)

#Chronic Heart Disease
has_chd = has_prior_event(codelists.chd_codelist)

#Chronic Heart Diseases
has_chd = (
(has_prior_event(codelists.chd_codelist)) |
(has_prior_event(codelists.heart_failure_codelist)) |
(has_prior_event(codelists.coronary_heart_disease_codelist))
)

#Chronic Kidney Disease

###############################################################################
Expand All @@ -275,9 +288,6 @@ def filter_codes_by_category(codelist, include):
#Chronic Neurological Disease including Significant Learning Disorder
has_cnd = has_prior_event(codelists.cnd_codelist)

#Chronic Respiratory Disease
has_crd = has_prior_event(codelists.crd_codelist)

#Cancer within 3 years
has_cancer = (
has_prior_event(codelists.cancer_codelist +
Expand Down Expand Up @@ -320,12 +330,3 @@ def filter_codes_by_category(codelist, include):

#Sickle Cell Disease
has_sickle_cell = has_prior_event(codelists.sickle_cell_codelist)

#Heart Failure
has_heart_failure = has_prior_event(codelists.heart_failure_codelist)

#Coronary Heart Disease
has_coronary_heart_disease = (
has_prior_event(codelists
.coronary_heart_disease_codelist)
)
92 changes: 92 additions & 0 deletions analysis/cohort_criteria.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
library(tidyverse)
library(here)
library(arrow)
library(ggplot2)
library(data.table)
library(gtsummary)

# Load the study design script. The lookups below index into `study_dates`,
# which is not created in this file (presumably design.R defines it - confirm).
source(here("analysis", "design", "design.R"))

# Positional command-line arguments:
#   1: cohort name
#   2: name of the study start date entry in `study_dates`
#   3: name of the study end date entry in `study_dates`
# When no arguments are supplied, fall back to the adults cohort for the
# 2016-09-01 to 2017-08-31 season.
args <- commandArgs(trailingOnly = TRUE)
if (length(args) > 0) {
  cohort <- args[[1]]
  study_start_date <- study_dates[[args[[2]]]]
  study_end_date <- study_dates[[args[[3]]]]
} else {
  cohort <- "adults"
  study_start_date <- "2016-09-01"
  study_end_date <- "2017-08-31"
}

# Read the flow chart extract for this cohort and season; the file name is
# built from the cohort and the start/end years.
flow_chart_file <- paste0(
  cohort, "_", year(study_start_date), "_", year(study_end_date),
  "_flow_chart", ".csv"
)
patients_df <- read_csv(here::here("output", "flow_chart", flow_chart_file))

# Derive the eligibility flags used by the flow chart counts.
#
# Inclusive age limits per cohort. Any cohort not named here falls through to
# the final default (65 and over). The infant limits are 0-23; the unit of
# `age` for infants is not visible here (presumably months - confirm).
age_limits <- switch(
  cohort,
  infants = ,
  infants_subgroup = c(0, 23),
  children_and_adolescents = c(2, 17),
  adults = c(18, 64),
  c(65, Inf)
)

patients_df <- patients_df %>%
  mutate(
    # TRUE when an IMD value is recorded
    has_imd = !is.na(imd_rounded),
    # %in% never returns NA, so a missing sex is flagged FALSE (a failed
    # criterion) rather than NA, which the na.rm sums would silently drop
    is_female_or_male = sex %in% c("female", "male"),
    # NA when age itself is missing
    is_appropriate_age = age >= age_limits[1] & age <= age_limits[2]
  )

# Define counts based on inclusion and exclusion criteria.
#
# The counts form a chain so that each step reconciles with the previous one:
#   total            = non_registered_count + registered_count
#   (previous step)  = not_age_count + age_count
#   age_count        = excluded_count + included_count
# For the infant cohorts the age step is applied to all rows (registration is
# not part of their criteria below); for every other cohort it is applied to
# registered rows only.
is_infant_cohort <- cohort %in% c("infants", "infants_subgroup")

total <- nrow(patients_df)
registered_count <- sum(patients_df$registered, na.rm = TRUE)
non_registered_count <- total - registered_count

# Rows that reach, and pass, the age step
passes_age_step <- if (is_infant_cohort) {
  patients_df$is_appropriate_age
} else {
  patients_df$registered & patients_df$is_appropriate_age
}
age_count <- sum(passes_age_step, na.rm = TRUE)
not_age_count <- if (is_infant_cohort) {
  total - age_count
} else {
  registered_count - age_count
}

# Remaining criteria: recorded female/male sex and a recorded IMD for every
# cohort, plus the cohort-specific exclusions.
meets_criteria <- patients_df$is_female_or_male & patients_df$has_imd
if (cohort == "older_adults") {
  meets_criteria <- meets_criteria & !patients_df$care_home
} else if (is_infant_cohort) {
  meets_criteria <- meets_criteria &
    !patients_df$risk_group_infants &
    !patients_df$severe_immunodeficiency
}

included_count <- sum(passes_age_step & meets_criteria, na.rm = TRUE)

# Everyone who passed the age step but was not included. Taking the
# difference keeps the tally restricted to rows that reached this step and
# counts rows with a missing criterion as excluded, so the two numbers always
# add up to age_count.
excluded_count <- age_count - included_count

## create output directories ----
fs::dir_create(here("output", "flow_chart"))

# export flow chart numbers as a one-row table, one column per count
flow_chart_out <- here::here(
  "output", "flow_chart",
  paste0("flow_chart_processed_", cohort, "_", year(study_start_date), "_",
         year(study_end_date), ".csv")
)
table <- cbind(total, non_registered_count, registered_count,
               not_age_count, age_count, excluded_count, included_count)
# The output location is passed positionally: readr renamed this argument
# from `path` to `file`, and the positional form works under either name.
table <- table %>%
  as.data.frame() %>%
  write_csv(flow_chart_out)
112 changes: 112 additions & 0 deletions analysis/cohort_description.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
library(tidyverse)
library(here)
library(arrow)
library(ggplot2)
library(data.table)
library(gtsummary)

## create output directories ----
fs::dir_create(here("analysis"))

# Pull in the project's redaction functions and the study design. The date
# lookups below index into `study_dates`, which is not created in this file
# (presumably design.R defines it - confirm).
source(here("analysis", "functions", "redaction.R"))
source(here("analysis", "design", "design.R"))

# Positional command-line arguments:
#   1: cohort name
#   2: name of the study start date entry in `study_dates`
#   3: name of the study end date entry in `study_dates`
#   4: codelist type
#   5: investigation type
# When no arguments are supplied, fall back to the adults cohort for the
# 2016-09-01 to 2017-08-31 season with specific codelists and the primary
# investigation.
args <- commandArgs(trailingOnly = TRUE)
if (length(args) > 0) {
  cohort <- args[[1]]
  study_start_date <- study_dates[[args[[2]]]]
  study_end_date <- study_dates[[args[[3]]]]
  codelist_type <- args[[4]]
  investigation_type <- args[[5]]
} else {
  cohort <- "adults"
  study_start_date <- "2016-09-01"
  study_end_date <- "2017-08-31"
  codelist_type <- "specific"
  investigation_type <- "primary"
}

# Seasons starting on or after this date also report covid vaccine doses
covid_season_min <- as.Date("2019-09-01")

# Read the processed input for this cohort/season/codelist/investigation and
# convert it to a data.table for the by-reference operations that follow.
processed_file <- paste0(
  "input_processed_", cohort, "_", year(study_start_date), "_",
  year(study_end_date), "_", codelist_type, "_", investigation_type, ".arrow"
)
df_input <- read_feather(here::here("output", "data", processed_file))

df_datatable <- as.data.table(df_input)

# Build the table 1 input: one row per registered patient, holding the
# columns to summarise for this cohort under their display labels.
#
# Each branch only decides WHICH columns are reported (as a named vector,
# source column -> display label). The row filter, column selection and
# renaming are then done once, so every column - including the covid dose
# count - comes from the same registered-only subset and stays row-aligned.

# Demographic columns reported for every cohort
demographic_labels <- c(
  age_band = "Age Group",
  sex = "Sex",
  latest_ethnicity_group = "Ethnicity",
  imd_quintile = "IMD",
  rurality_classification = "Rurality"
)

# Number of distinct patient ids among registered rows, stored on each of
# those rows (added to df_datatable by reference)
df_datatable[registered == TRUE, Total := n_distinct(patient_id)]

if (cohort == "infants") {
  column_labels <- demographic_labels
  reports_covid_doses <- FALSE
} else if (cohort == "children_and_adolescents") {
  # Split the single asthma/reactive-airway flag by age: it is reported as
  # reactive airway at age <= 5 and as asthma above 5
  df_datatable[
    registered == TRUE,
    Reactive_Airway := ifelse(age <= 5, has_asthma_reactive_airway, FALSE)
  ]
  df_datatable[
    registered == TRUE,
    Asthma := ifelse(age > 5, has_asthma_reactive_airway, FALSE)
  ]
  column_labels <- c(
    demographic_labels,
    Asthma = "Asthma",
    Reactive_Airway = "Reactive Airway",
    flu_vaccination = "Flu Vaccine"
  )
  reports_covid_doses <- TRUE
} else {
  column_labels <- c(
    demographic_labels,
    smoking_status = "Smoking Status",
    hazardous_drinking = "Hazardous Drinking",
    drug_usage = "Drug Usage",
    has_asthma = "Asthma",
    has_copd = "COPD",
    has_cystic_fibrosis = "Cystic Fibrosis",
    has_other_resp = "Other Chronic Respiratory Diseases",
    has_diabetes = "Diabetes",
    has_addisons = "Addisons",
    severe_obesity = "Severe Obesity",
    has_chd = "Chronic Heart Diseases",
    has_ckd = "Chronic Kidney Disease",
    has_cld = "Chronic Liver Disease",
    has_cnd = "Chronic Neurological Disease",
    has_cancer = "Cancer Within 3 Years",
    immunosuppressed = "Immunosuppressed",
    has_sickle_cell = "Sickle Cell Disease",
    flu_vaccination = "Flu Vaccine"
  )
  reports_covid_doses <- TRUE
}

# Covid vaccine doses are only reported from the 2019-09-01 season onwards,
# and never for the infants cohort
if (reports_covid_doses && study_start_date >= covid_season_min) {
  column_labels <- c(
    column_labels,
    covid_vaccination_count = "Covid Vaccine Doses"
  )
}

# Keep registered rows only (data.table treats NA in `i` as FALSE), select
# the reported columns in order, then switch to the display labels
table <- df_datatable[
  registered == TRUE,
  .SD,
  .SDcols = c("Total", names(column_labels))
]
setnames(table, names(column_labels), unname(column_labels))

## create output directories ----
fs::dir_create(here("output", "table1"))

# export
# Both outputs share the same file stem and the same summary, so the summary
# is computed once and then rendered twice (HTML via gt, CSV via a tibble).
table1_dir <- here::here("output", "table1")
table1_stem <- paste0("table1_", cohort, "_", year(study_start_date),
                      "_", year(study_end_date))
table1_summary <- tbl_summary(table)

table1_summary %>%
  as_gt() %>%
  gt::gtsave(filename = paste0(table1_stem, ".html"), path = table1_dir)

# The output location is passed positionally: readr renamed this argument
# from `path` to `file`, and the positional form works under either name.
table1_summary %>%
  as_tibble() %>%
  write_csv(here::here("output", "table1", paste0(table1_stem, ".csv")))
41 changes: 26 additions & 15 deletions analysis/data_processing.R
Original file line number Diff line number Diff line change
@@ -1,26 +1,34 @@
library("tidyverse")
library("here")
library("arrow")
library("ggplot2")
library("data.table")
library("lubridate")
library(tidyverse)
library(here)
library(arrow)
library(ggplot2)
library(data.table)
library(lubridate)

## create output directories ----
fs::dir_create(here("analysis"))

#define study start date and study end date
source(here("analysis", "design", "design.R"))
args <- commandArgs(trailingOnly = TRUE)
study_start_date <- study_dates[[args[[2]]]]
study_end_date <- study_dates[[args[[3]]]]
cohort <- args[[1]]
codelist_type <- args[[4]]
investigation_type <- args[[5]]
if (length(args) == 0) {
study_start_date <- "2016-09-01"
study_end_date <- "2017-08-31"
cohort <- "adults"
codelist_type <- "specific"
investigation_type <- "primary"
} else {
study_start_date <- study_dates[[args[[2]]]]
study_end_date <- study_dates[[args[[3]]]]
cohort <- args[[1]]
codelist_type <- args[[4]]
investigation_type <- args[[5]]
}

df_input <- read_feather(
here::here("output", paste0("input_", cohort, "_", year(study_start_date),
"_", year(study_end_date), "_", codelist_type, "_",
investigation_type,".arrow")))
here::here("output", "data", paste0("input_", cohort, "_",
year(study_start_date), "_", year(study_end_date), "_",
codelist_type, "_", investigation_type,".arrow")))

#assign ethnicity group
df_input <- df_input %>%
Expand Down Expand Up @@ -140,8 +148,11 @@ df_input <- df_input %>%
# Follow-up time, in weeks, from the study start to each outcome's end time.
# difftime()'s third positional argument is `tz`, so only the two time values
# are passed positionally and the unit is given by name.
# NOTE(review): assumes end_time_mild / end_time_severe and study_start_date
# are dates or date-times (or strings difftime can convert) - confirm.
df_input$time_mild <- difftime(
  df_input$end_time_mild, study_start_date, units = "weeks"
)
df_input$time_severe <- difftime(
  df_input$end_time_severe, study_start_date, units = "weeks"
)

## create output directories ----
fs::dir_create(here("output", "data"))

#write the new input file
write_feather(df_input, here::here("output",
write_feather(df_input, here::here("output", "data",
paste0("input_processed_", cohort, "_", year(study_start_date),
"_", year(study_end_date), "_", codelist_type,
"_", investigation_type, ".arrow")))
Loading

0 comments on commit 30a0d22

Please sign in to comment.