Skip to content

Commit

Permalink
update data processing method
Browse files Browse the repository at this point in the history
  • Loading branch information
emprestige committed Feb 13, 2024
1 parent efa46dc commit 2ef3a48
Show file tree
Hide file tree
Showing 16 changed files with 2,588 additions and 85 deletions.
Binary file modified .RData
Binary file not shown.
24 changes: 21 additions & 3 deletions analysis/codelists.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@
column = "code",
)
copd_medications = codelist_from_csv(
"codelists/bristol-copd-medications-bnf.csv",
column = "code",
"codelists/user-emprestige-copd-medications-new-dmd.csv",
column = "dmd_id",
)

# pulmonary fibrosis
Expand Down Expand Up @@ -131,6 +131,12 @@

# rsv primary - sensitive

# rsv primary - sensitive (prescriptions)
rsv_prescriptions_codelist = codelist_from_csv(
"codelists/user-emprestige-rsv-identification-prescriptions-maximal-sensitivity-dmd.csv",
column = "dmd_id",
)

# rsv secondary - sensitive

# covid primary - sensitive
Expand All @@ -139,6 +145,12 @@
column = "CTV3ID",
)

# covid primary - sensitive (prescriptions)
covid_prescriptions_codelist = codelist_from_csv(
"codelists/user-emprestige-covid-19-identification-prescriptions-dmd.csv",
column = "dmd_id",
)

# covid secondary - sensitive
covid_secondary_codelist = codelist_from_csv(
"codelists/opensafely-covid-identification.csv",
Expand All @@ -147,12 +159,18 @@

# flu primary - sensitive

# flu primary - sensitive (prescriptions)
flu_prescriptions_codelist = codelist_from_csv(
"codelists/user-emprestige-influenza-identification-prescriptions-maximal-sensitivity-dmd.csv",
column = "dmd_id",
)

# flu secondary - sensitive

##exclusion criteria

# care home
care_home_codelist = codelist_from_csv(
carehome_codelist = codelist_from_csv(
"codelists/nhsd-primary-care-domain-refsets-carehome_cod.csv",
column = "code",
)
Expand Down
81 changes: 81 additions & 0 deletions analysis/data_processing.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
library("tidyverse")
library("here")
library("arrow")
library("ggplot2")
library("data.table")
library("lubridate")

## create output directories ----
fs::dir_create(here("analysis"))

# load the study design script (presumably defines `study_dates` - confirm
# against analysis/design/design.R) and read the command-line arguments:
# args[1] is the cohort name, args[2] / args[3] name the entries of
# `study_dates` holding the study start and end dates
source(here("analysis", "design", "design.R"))
args <- commandArgs(trailingOnly = TRUE)

# look up each date by name and convert the numeric value to a Date
study_start_date <- as.Date(
  as.numeric(study_dates[args[2]]),
  format = "%Y-%m-%d", origin = "1970-01-01"
)
study_end_date <- as.Date(
  as.numeric(study_dates[args[3]]),
  format = "%Y-%m-%d", origin = "1970-01-01"
)

# read the extract for this cohort and study period from the output folder
input_file <- paste0(
  "input_", args[[1]], "_", year(study_start_date),
  "_", year(study_end_date), ".arrow"
)
df_input <- read_feather(here::here("output", input_file))

#assign ethnicity group
# Map the numeric ethnicity code to a labelled group. case_when() is used
# instead of nested ifelse() so that a missing (NA) code also falls through
# to "Unknown" - ifelse() would propagate the NA and skip the fallback.
df_input <- df_input %>%
  mutate(
    latest_ethnicity_group = case_when(
      latest_ethnicity_code == "1" ~ "White",
      latest_ethnicity_code == "2" ~ "Mixed",
      latest_ethnicity_code == "3" ~ "Asian or Asian British",
      latest_ethnicity_code == "4" ~ "Black or Black British",
      latest_ethnicity_code == "5" ~ "Other Ethnic Groups",
      TRUE ~ "Unknown"
    )
  )

#calculate age bands
# The cohort name (first command-line argument) selects the banding scheme.
# Ages outside every band for the cohort are set to NA.
cohort <- args[[1]]
if (cohort == "older_adults") {
  df_input <- df_input %>%
    mutate(age_band = case_when(
      age >= 65 & age <= 74 ~ "65-74y",
      age >= 75 & age <= 89 ~ "75-89y",
      age >= 90 ~ "90+y",
      TRUE ~ NA_character_
    ))
} else if (cohort == "adults") {
  df_input <- df_input %>%
    mutate(age_band = case_when(
      # label corrected: this band covers ages 18 to 39 inclusive
      age >= 18 & age <= 39 ~ "18-39y",
      age >= 40 & age <= 64 ~ "40-64y",
      TRUE ~ NA_character_
    ))
} else if (cohort %in% c("children_adolescents", "children_adults")) {
  # the dataset definition names this cohort "children_adolescents"; the
  # earlier spelling "children_adults" is still accepted so existing calls
  # keep working
  df_input <- df_input %>%
    mutate(age_band = case_when(
      age >= 2 & age <= 5 ~ "2-5y",
      age >= 6 & age <= 9 ~ "6-9y",
      age >= 10 & age <= 13 ~ "10-13y",
      age >= 14 & age <= 17 ~ "14-17y",
      TRUE ~ NA_character_
    ))
} else {
  # any other cohort (e.g. infants): bands are expressed in months
  df_input <- df_input %>%
    mutate(age_band = case_when(
      age >= 0 & age <= 2 ~ "0-2m",
      age >= 3 & age <= 5 ~ "3-5m",
      age >= 6 & age <= 11 ~ "6-11m",
      age >= 12 & age <= 23 ~ "12-23m",
      TRUE ~ NA_character_
    ))
}

#calculate IMD quintile
# Split the rounded IMD rank (0 to 32800) into five equal-width bands.
# Missing or out-of-range values are set to NA.
df_input <- df_input %>%
  mutate(imd_quintile = case_when(
    # negative values are invalid; catch them first so they are not picked
    # up by the open-ended "less than" clauses below
    imd_rounded < 0 ~ NA_character_,
    imd_rounded < as.integer(32800 * 1 / 5) ~ "1",
    imd_rounded < as.integer(32800 * 2 / 5) ~ "2",
    imd_rounded < as.integer(32800 * 3 / 5) ~ "3",
    imd_rounded < as.integer(32800 * 4 / 5) ~ "4",
    # inclusive upper bound: a rounded rank of exactly 32800 is the top of
    # the scale and belongs to the least deprived quintile
    imd_rounded <= as.integer(32800 * 5 / 5) ~ "5 (least deprived)",
    TRUE ~ NA_character_
  ))

#write the new input file
# same naming scheme as the input extract, with "processed_" inserted
processed_file <- paste0(
  "input_processed_", args[[1]], "_", year(study_start_date),
  "_", year(study_end_date), ".arrow"
)
write_feather(df_input, here::here("output", processed_file))
21 changes: 12 additions & 9 deletions analysis/dataset_definition.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
dataset.registered = registered_patients
dataset.sex = patients.sex
dataset.age = case(
when(args[1] == "older_adults").then(age_at_start),
when(args[1] == "adults").then(age_at_start),
when(args[1] == "children_adolescents").then(age_at_start),
when(args[1] == "infants").then(age_at_start_months),
Expand Down Expand Up @@ -109,14 +110,15 @@
#medication date
medication_date = index_date - years(1)

#has asthma if there is a recent asthma diagnosis and a medication prescribed
dataset.has_asthma = (
clinical_events.where(clinical_events.snomedct_code.is_in(codelists.asthma_codelist))
.exists_for_patient() & medications.where(medications.dmd_code
.is_in(codelists.asthma_medications))
.where(medications.date.is_on_or_between(medication_date, index_date))
.exists_for_patient()
)
#has asthma if there is an asthma diagnosis and a recent medication prescribed
if args[1] != "infants" and args[1] != "infants_subgroup" :
dataset.has_asthma = (
clinical_events.where(clinical_events.snomedct_code.is_in(codelists.asthma_codelist))
.exists_for_patient() & medications.where(medications.dmd_code
.is_in(codelists.asthma_medications))
.where(medications.date.is_on_or_between(medication_date, index_date))
.exists_for_patient()
)

#reactive airway disease diagnosis
if args[1] == "children_adolescents" :
Expand Down Expand Up @@ -307,5 +309,6 @@ def has_prior_event(codelist, where=True):

#care home resident
if args[1] == "older_adults" :
dataset.care_home_tpp = (addresses.care_home_is_potential_match.if_null_then(False))
dataset.care_home_tpp = (addresses.for_patient_on(index_date)
.care_home_is_potential_match.when_null_then(False))
dataset.care_home_code = (has_prior_event(codelists.carehome_codelist))
2 changes: 1 addition & 1 deletion analysis/dummydata/dummydata_infants.R
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ sim_list = lst(
##exclusion criteria

#severe combined immunodeficiency syndrome
severe_immmunodeficiency = bn_node(
severe_immunodeficiency = bn_node(
~ rbernoulli(n = ..n, p = 0.02)
),

Expand Down
Binary file modified analysis/dummydata/dummyextract_infants_2016_2017.arrow
Binary file not shown.
54 changes: 0 additions & 54 deletions analysis/processing/data_processing_adults.R

This file was deleted.

24 changes: 24 additions & 0 deletions codelists/codelists.json
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,30 @@
"url": "https://www.opencodelists.org/codelist/nhsd-primary-care-domain-refsets/tempcarhome_cod/20200812/",
"downloaded_at": "2024-02-09 10:54:25.774557Z",
"sha": "931872c7b2c96884a1b4bdb5062356985875cea4"
},
"user-emprestige-rsv-identification-prescriptions-maximal-sensitivity-dmd.csv": {
"id": "user/emprestige/rsv-identification-prescriptions-maximal-sensitivity-dmd/65062298",
"url": "https://www.opencodelists.org/codelist/user/emprestige/rsv-identification-prescriptions-maximal-sensitivity-dmd/65062298/",
"downloaded_at": "2024-02-12 11:06:07.464507Z",
"sha": "3df89edeab0e2e010cda91cf8f00c03cd91c739d"
},
"user-emprestige-influenza-identification-prescriptions-maximal-sensitivity-dmd.csv": {
"id": "user/emprestige/influenza-identification-prescriptions-maximal-sensitivity-dmd/4bf6af63",
"url": "https://www.opencodelists.org/codelist/user/emprestige/influenza-identification-prescriptions-maximal-sensitivity-dmd/4bf6af63/",
"downloaded_at": "2024-02-12 11:06:07.639146Z",
"sha": "000e8053fa45aa912ac09bd9e7c51758fe69e04e"
},
"user-emprestige-covid-19-identification-prescriptions-dmd.csv": {
"id": "user/emprestige/covid-19-identification-prescriptions-dmd/32e73c2e",
"url": "https://www.opencodelists.org/codelist/user/emprestige/covid-19-identification-prescriptions-dmd/32e73c2e/",
"downloaded_at": "2024-02-12 11:13:31.540363Z",
"sha": "7c17a141fc9c02fa85d885fa6c21bc398e9a957e"
},
"user-emprestige-copd-medications-new-dmd.csv": {
"id": "user/emprestige/copd-medications-new-dmd/1f3d48e5",
"url": "https://www.opencodelists.org/codelist/user/emprestige/copd-medications-new-dmd/1f3d48e5/",
"downloaded_at": "2024-02-13 11:49:39.628031Z",
"sha": "8ca158de8df88cb9d1a961792c89ed650f5bd445"
}
}
}
6 changes: 5 additions & 1 deletion codelists/codelists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,8 @@ opensafely/covid-identification/2020-06-03
nhsd-primary-care-domain-refsets/carehome_cod/20211221
opensafely/chronic-cardiac-disease/2020-04-08
nhsd-primary-care-domain-refsets/pulmohyp_cod/20210127
nhsd-primary-care-domain-refsets/tempcarhome_cod/20200812
nhsd-primary-care-domain-refsets/tempcarhome_cod/20200812
user/emprestige/rsv-identification-prescriptions-maximal-sensitivity-dmd/65062298
user/emprestige/influenza-identification-prescriptions-maximal-sensitivity-dmd/4bf6af63
user/emprestige/covid-19-identification-prescriptions-dmd/32e73c2e
user/emprestige/copd-medications-new-dmd/1f3d48e5/
Loading

0 comments on commit 2ef3a48

Please sign in to comment.