Skip to content

Commit

Permalink
Trial for two diseases using more efficient code
Browse files Browse the repository at this point in the history
  • Loading branch information
markdrussell committed Jan 7, 2025
1 parent dd3a49b commit eaac9b3
Show file tree
Hide file tree
Showing 14 changed files with 1,246 additions and 1,136 deletions.
28 changes: 20 additions & 8 deletions analysis/100_incidence_graphs.do
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,17 @@ log using "$logdir/descriptive_tables.log", replace
**Set Ado file path
adopath + "$projectdir/analysis/extra_ados"

**Import and append measures datasets - could local this
import delimited "$projectdir/output/measures/measures_dataset_2016.csv", clear
**Import and append measures datasets for diseases
*local diseases "asthma copd chd stroke heart_failure dementia multiple_sclerosis epilepsy crohns_disease ulcerative_colitis dm_type2 dm_type1 ckd psoriasis atopic_dermatitis osteoporosis hiv depression coeliac pmr"
local diseases "copd chd"
local first_disease: word 1 of `diseases'

import delimited "$projectdir/output/measures/measures_dataset_`first_disease'.csv", clear
save "$projectdir/output/data/measures_appended.dta", replace

forval year = 2017(1)2024{
import delimited "$projectdir/output/measures/measures_dataset_`year'.csv", clear
foreach disease in `diseases' {
if "`disease'" != "`first_disease'" {
import delimited "$projectdir/output/measures/measures_dataset_`disease'.csv", clear
append using "$projectdir/output/data/measures_appended.dta"
save "$projectdir/output/data/measures_appended.dta", replace
}
Expand Down Expand Up @@ -72,18 +77,22 @@ gen measure_inc = 1 if substr(measure,-10,.) == "_incidence"
recode measure_inc .=0
gen measure_prev = 1 if substr(measure,-11,.) == "_prevalence"
recode measure_prev .=0

/*
gen measure_imd = 1 if substr(measure,-4,.) == "_imd"
recode measure_imd .=0
gen measure_ethnicity = 1 if substr(measure,-10,.) == "_ethnicity"
recode measure_ethnicity .=0
*/

gen measure_inc_any = 1 if measure_inc ==1 | measure_imd==1 | measure_ethnicity==1
gen measure_inc_any = 1 if measure_inc ==1
*gen measure_inc_any = 1 if measure_inc ==1 | measure_imd==1 | measure_ethnicity==1
recode measure_inc_any .=0

gen diseases_ = substr(measure, 1, strlen(measure) - 10) if measure_inc==1
replace diseases_ = substr(measure, 1, strlen(measure) - 11) if measure_prev==1
replace diseases_ = substr(measure, 1, strlen(measure) - 14) if measure_imd==1
replace diseases_ = substr(measure, 1, strlen(measure) - 20) if measure_ethnicity==1
*replace diseases_ = substr(measure, 1, strlen(measure) - 14) if measure_imd==1
*replace diseases_ = substr(measure, 1, strlen(measure) - 20) if measure_ethnicity==1
gen disease = strproper(subinstr(diseases_, "_", " ",.))

*Generate incidence and prevalence by months across ages, sexes, IMD and ethnicity
Expand Down Expand Up @@ -360,7 +369,10 @@ foreach disease_ of local levels {
twoway line asr_0_9_ma mo_year_diagn, lcolor(yellow) lstyle(solid) ytitle("Monthly incidence per 100,000 population", size(med)) || line asr_10_19_ma mo_year_diagn, lcolor(orange) lstyle(solid) || line asr_20_29_ma mo_year_diagn, lcolor(red) lstyle(solid) || line asr_30_39_ma mo_year_diagn, lcolor(ltblue) lstyle(solid) || line asr_40_49_ma mo_year_diagn, lcolor(eltblue) lstyle(solid) || line asr_50_59_ma mo_year_diagn, lcolor(blue) lstyle(solid) || line asr_60_69_ma mo_year_diagn, lcolor(purple) lstyle(solid) || line asr_70_79_ma mo_year_diagn, lcolor(brown) lstyle(solid) || line asr_80_ma mo_year_diagn, lcolor(black) lstyle(solid) ylabel(, nogrid labsize(small)) xtitle("Date of diagnosis", size(medium) margin(medsmall)) xlabel(671 "2016" 683 "2017" 695 "2018" 707 "2019" 719 "2020" 731 "2021" 743 "2022" 755 "2023" 767 "2024" 779 "2025", nogrid labsize(small)) title("`dis_title'", size(medium)) xline(722) legend(region(fcolor(white%0)) order(1 "0-9" 2 "10-19" 3 "20-29" 4 "30-39" 5 "40-49" 6 "50-59" 7 "60-69" 8 "70-79" 9 "80+")) name(`disease_'_adj_ma_age2, replace) saving("$projectdir/output/figures/`disease_'_adj_ma_age2.gph", replace)
graph export "$projectdir/output/figures/adj_ma_age2_`disease_'.svg", replace
restore


}

log close
*******Also need by IMD quintile +/- ethnicity**************

/*
Expand Down
682 changes: 682 additions & 0 deletions analysis/100_incidence_graphs_disease.do

Large diffs are not rendered by default.

51 changes: 13 additions & 38 deletions analysis/dataset_definition.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,18 @@
from datetime import date, datetime
import codelists_ehrQL as codelists

# # Arguments (from project.yaml)
# from argparse import ArgumentParser

# parser = ArgumentParser()
# parser.add_argument("--diseases", type=str)
# args = parser.parse_args()
# diseases = args.diseases.split(", ")

# diseases = ["asthma", "copd", "chd", "stroke", "heart_failure", "dementia", "multiple_sclerosis", "epilepsy", "crohns_disease", "ulcerative_colitis", "dm_type2", "dm_type1", "ckd", "psoriasis", "atopic_dermatitis", "osteoporosis", "hiv", "depression", "coeliac", "pmr"]
diseases = ["copd", "chd"]
codelist_types = ["snomed", "icd", "resolved"]

dataset = create_dataset()
dataset.configure_dummy_data(population_size=1000)

Expand Down Expand Up @@ -66,38 +78,6 @@ def preceding_registration(dx_date):
dataset.sex = patients.sex
dataset.date_of_death = patients.date_of_death

# # Define patient ethnicity
# latest_ethnicity_code = (
# clinical_events.where(clinical_events.ctv3_code.is_in(codelists.ethnicity_codes))
# .where(clinical_events.date.is_on_or_before(end_date))
# .sort_by(clinical_events.date)
# .last_for_patient()
# .ctv3_code
# )

# latest_ethnicity = latest_ethnicity_code.to_category(codelists.ethnicity_codes)

# dataset.ethnicity = case(
# when(latest_ethnicity == "1").then("White"),
# when(latest_ethnicity == "2").then("Mixed"),
# when(latest_ethnicity == "3").then("Asian or Asian British"),
# when(latest_ethnicity == "4").then("Black or Black British"),
# when(latest_ethnicity == "5").then("Other"),
# otherwise="missing",
# )

# # Define patient IMD, using latest data for each patient
# latest_address_per_patient = addresses.sort_by(addresses.start_date).last_for_patient()
# imd_rounded = latest_address_per_patient.imd_rounded
# dataset.imd_quintile = case(
# when((imd_rounded >= 0) & (imd_rounded < int(32844 * 1 / 5))).then("1"),
# when(imd_rounded < int(32844 * 2 / 5)).then("2"),
# when(imd_rounded < int(32844 * 3 / 5)).then("3"),
# when(imd_rounded < int(32844 * 4 / 5)).then("4"),
# when(imd_rounded < int(32844 * 5 / 5)).then("5"),
# otherwise="Missing",
# )

# Any practice registration before study end date
any_registration = practice_registrations.where(
practice_registrations.start_date <= end_date
Expand All @@ -111,18 +91,13 @@ def preceding_registration(dx_date):
& ((dataset.sex == "male") | (dataset.sex == "female"))
)

# List of diseases and codelists to cycle through
# diseases = ["copd", "chd", "stroke", "heart_failure", "dementia"]
diseases = ["asthma", "copd", "chd", "stroke", "heart_failure", "dementia", "multiple_sclerosis", "epilepsy", "crohns_disease", "ulcerative_colitis", "dm_type2", "dm_type1", "ckd", "psoriasis", "atopic_dermatitis", "osteoporosis", "hiv", "depression", "coeliac", "pmr"]
codelist_types = ["snomed", "icd", "resolved"]

for disease in diseases:

snomed_inc_date = {} # Dictionary to store dates
snomed_last_date = {}
icd_inc_date = {}
icd_last_date = {}
last_date = {} # Dictionary to store dates
last_date = {}

for codelist_type in codelist_types:

Expand Down
208 changes: 80 additions & 128 deletions analysis/dataset_definition_measures.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from ehrql.tables.tpp import patients, medications, practice_registrations, clinical_events, apcs, addresses, ons_deaths, appointments
from datetime import date, datetime
import codelists_ehrQL as codelists
from analysis.dataset_definition import dataset, diseases
from analysis.dataset_definition import dataset
import sys

# Arguments (from project.yaml)
Expand All @@ -11,11 +11,14 @@
parser = ArgumentParser()
parser.add_argument("--start-date", type=str)
parser.add_argument("--intervals", type=int)
parser.add_argument("--intervals_years", type=int)
parser.add_argument("--disease", type=str)
args = parser.parse_args()

start_date = args.start_date
intervals = args.intervals
intervals_years = int(intervals/12)
intervals_years = args.intervals_years
disease = args.disease

index_date = INTERVAL.start_date
end_date = INTERVAL.end_date
Expand All @@ -37,35 +40,10 @@ def preceding_registration(dx_date):
# 12m preceding registration exist at interval start
preceding_reg_index = preceding_registration(index_date).exists_for_patient()

# Currently registered at interval start
currently_registered = practice_registrations.for_patient_on(index_date).exists_for_patient()

# Age at interval start
age = patients.age_on(index_date)

# age_band = case(
# when((age >= 0) & (age < 4)).then("age_0_4"),
# when((age >= 5) & (age < 9)).then("age_5_9"),
# when((age >= 10) & (age < 14)).then("age_10_14"),
# when((age >= 15) & (age < 19)).then("age_15_19"),
# when((age >= 20) & (age < 24)).then("age_20_24"),
# when((age >= 25) & (age < 29)).then("age_25_29"),
# when((age >= 30) & (age < 34)).then("age_30_34"),
# when((age >= 35) & (age < 39)).then("age_35_39"),
# when((age >= 40) & (age < 44)).then("age_40_44"),
# when((age >= 45) & (age < 49)).then("age_45_49"),
# when((age >= 50) & (age < 54)).then("age_50_54"),
# when((age >= 55) & (age < 59)).then("age_55_59"),
# when((age >= 60) & (age < 64)).then("age_60_64"),
# when((age >= 65) & (age < 69)).then("age_65_69"),
# when((age >= 70) & (age < 74)).then("age_70_74"),
# when((age >= 75) & (age < 79)).then("age_75_79"),
# when((age >= 80) & (age < 84)).then("age_80_84"),
# when((age >= 85) & (age < 89)).then("age_85_89"),
# when((age >= 90)).then("age_greater_equal_90"),
# )

age_band2 = case(
age_band = case(
when((age >= 0) & (age < 9)).then("age_0_9"),
when((age >= 10) & (age < 19)).then("age_10_19"),
when((age >= 20) & (age < 29)).then("age_20_29"),
Expand All @@ -79,117 +57,91 @@ def preceding_registration(dx_date):

measures = create_measures()
measures.configure_dummy_data(population_size=1000, legacy=True)
measures.configure_disclosure_control(enabled=False) # Consider changing this in final output
measures.configure_disclosure_control(enabled=False)
measures.define_defaults(intervals=months(intervals).starting_on(start_date))

## Prevalence denominator - people currently registered on index date (Nb. sex already selected for in dataset definition)
## Prevalence denominator - people registered for more than one year on index date (Nb. sex already selected for in dataset definition)
prev_denominator = (
((age >= 0) & (age < 110))
& (dataset.date_of_death.is_after(index_date) | dataset.date_of_death.is_null())
& (currently_registered == True)
& (preceding_reg_index == True)
)

for disease in diseases:

# Dictionary to store the values
prev = {}
inc_case = {}
inc_case_12m_alive = {}
prev_numerators = {}
incidence_numerators = {}
incidence_denominators = {}

# Prevalent diagnosis (at interval start)
prev[f"{disease}_prev"] = (
case(
when(
(getattr(dataset, f"{disease}_inc_date", None) < index_date)
& ((getattr(dataset, f"{disease}_resolved") == False) | ((getattr(dataset, f"{disease}_resolved") == True) & (getattr(dataset, f"{disease}_resolved_date", None) > index_date)))
).then(True),
otherwise=False,
)
# Dictionaries to store the values
prev = {}
prev_numerators = {}
inc_case = {}
inc_case_12m_alive = {}
incidence_numerators = {}
incidence_denominators = {}

# Prevalent diagnosis (at interval start)
prev[disease + "_prev"] = (
case(
when(
(getattr(dataset, disease + "_inc_date", None) < index_date)
& ((getattr(dataset, disease + "_resolved") == False) | ((getattr(dataset, disease + "_resolved") == True) & (getattr(dataset, disease + "_resolved_date", None) > index_date)))
).then(True),
otherwise=False,
)

## Prevalence numerator - people currently registered on index date who have an Dx code on or before index date
prev_numerators[f"{disease}_prev_num"] = (
((prev[f"{disease}_prev"]) == True)
& ((age >= 0) & (age < 110))
& (dataset.date_of_death.is_after(index_date) | dataset.date_of_death.is_null())
& (currently_registered == True)
)
)

# Incident case (i.e. incident date within interval window)
inc_case[f"{disease}_inc_case"] = (
case(
when(((getattr(dataset, f"{disease}_inc_date", None)) >= index_date) & ((getattr(dataset, f"{disease}_inc_date", None)) <= end_date)).then(True),
otherwise=False,
)
## Prevalence numerator - people registered for more than one year on index date who have an Dx code on or before index date
prev_numerators[disease + "_prev_num"] = (
((prev[disease + "_prev"]) == True)
& (prev_denominator == True)
)

# Preceding registration and alive at incident diagnosis date
inc_case_12m_alive[f"{disease}_inc_case_12m_alive"] = (
case(
when(((inc_case[f"{disease}_inc_case"]) == True)
& ((getattr(dataset, f"{disease}_preceding_reg_inc") == True))
& ((getattr(dataset, f"{disease}_alive_inc") == True))
).then(True),
otherwise=False,
)
# Incident case (i.e. incident date within interval window)
inc_case[disease + "_inc_case"] = (
case(
when(((getattr(dataset, disease + "_inc_date", None)) >= index_date) & ((getattr(dataset, disease + "_inc_date", None)) <= end_date)).then(True),
otherwise=False,
)

## Incidence numerator - people with new diagnostic codes in the 1 month after index date who have 12m+ prior registration and alive
incidence_numerators[f"{disease}_inc_num"] = (
((inc_case_12m_alive[f"{disease}_inc_case_12m_alive"]) == True)
& ((age >= 0) & (age < 110))
)
)

## Incidence denominator - people with 12m+ registration prior to index date who do not have a Dx code on or before index date
incidence_denominators[f"{disease}_inc_denom"] = (
((prev[f"{disease}_prev"]) == False)
& (dataset.date_of_death.is_after(index_date) | dataset.date_of_death.is_null())
& (preceding_reg_index == True)
& ((age >= 0) & (age < 110))
)
# Preceding registration and alive at incident diagnosis date
inc_case_12m_alive[disease + "_inc_case_12m_alive"] = (
case(
when(((inc_case[disease + "_inc_case"]) == True)
& ((getattr(dataset, disease + "_preceding_reg_inc") == True))
& ((getattr(dataset, disease + "_alive_inc") == True))
).then(True),
otherwise=False,
)
)

# Prevalence
measures.define_measure(
name=f"{disease}_prevalence",
numerator=prev_numerators[f"{disease}_prev_num"],
denominator=prev_denominator,
intervals=years(intervals_years).starting_on(start_date),
group_by={
"sex": dataset.sex,
"age": age_band2,
},
)

# Incidence by age and sex
measures.define_measure(
name=f"{disease}_incidence",
numerator=incidence_numerators[f"{disease}_inc_num"],
denominator=incidence_denominators[f"{disease}_inc_denom"],
group_by={
"sex": dataset.sex,
"age": age_band2,
},
)

# # Incidence by ethnicity (Nb. not age and sex)
# measures.define_measure(
# name=disease + "_incidence_ethnicity",
# numerator=incidence_numerators[f"{disease}_inc_num"],
# denominator=incidence_denominators[f"{disease}_inc_denom"],
# group_by={
# "ethnicity": dataset.ethnicity,
# },
# )

# # Incidence by IMD quintile (Nb. not age and sex)
# measures.define_measure(
# name=disease + "_incidence_imd",
# numerator=incidence_numerators[f"{disease}_inc_num"],
# denominator=incidence_denominators[f"{disease}_inc_denom"],
# group_by={
# "imd": dataset.imd_quintile,
# },
# )
## Incidence numerator - people with new diagnostic codes in the 1 month after index date who have 12m+ prior registration and alive
incidence_numerators[disease + "_inc_num"] = (
((inc_case_12m_alive[disease + "_inc_case_12m_alive"]) == True)
& ((age >= 0) & (age < 110))
)

## Incidence denominator - people with 12m+ registration prior to index date who do not have a Dx code on or before index date
incidence_denominators[disease + "_inc_denom"] = (
((prev[disease + "_prev"]) == False)
& (prev_denominator == True)
)

# Prevalence by age and sex
measures.define_measure(
name=disease + "_prevalence",
numerator=prev_numerators[disease + "_prev_num"],
denominator=prev_denominator,
intervals=years(intervals_years).starting_on(start_date),
group_by={
"sex": dataset.sex,
"age": age_band,
},
)

# Incidence by age and sex
measures.define_measure(
name=disease + "_incidence",
numerator=incidence_numerators[disease + "_inc_num"],
denominator=incidence_denominators[disease + "_inc_denom"],
group_by={
"sex": dataset.sex,
"age": age_band,
},
)
Loading

0 comments on commit eaac9b3

Please sign in to comment.