From 150b8fac8f29ac2372c158378bc267c5582dd26a Mon Sep 17 00:00:00 2001 From: Milan Wiedemann Date: Mon, 6 Jan 2025 16:46:21 +0000 Subject: [PATCH 1/4] Take reading codelists out of dataset definition Creating a separate module will make it easier to share this code across different dataset definitions that we might be working on for this project. This also fixes the path to the methylphenidate_codelist; before, the path was pointing to the same NHSD Primary Care Refset for ADHD. --- analysis/codelists.py | 11 +++++++++++ analysis/dataset_definition_core.py | 6 ++---- 2 files changed, 13 insertions(+), 4 deletions(-) create mode 100644 analysis/codelists.py diff --git a/analysis/codelists.py b/analysis/codelists.py new file mode 100644 index 0000000..7cbad72 --- /dev/null +++ b/analysis/codelists.py @@ -0,0 +1,11 @@ +from ehrql import codelist_from_csv + +adhd_codelist = codelist_from_csv( + "codelists/nhsd-primary-care-domain-refsets-adhd_cod.csv", + column="code", +) + +methylphenidate_codelist = codelist_from_csv( + "codelists/opensafely-methylphenidate-dmd.csv", + column="code", +) diff --git a/analysis/dataset_definition_core.py b/analysis/dataset_definition_core.py index c40df8f..cb5bc1e 100644 --- a/analysis/dataset_definition_core.py +++ b/analysis/dataset_definition_core.py @@ -1,9 +1,7 @@ -from ehrql import create_dataset, codelist_from_csv +from ehrql import create_dataset from ehrql.tables.tpp import patients, practice_registrations, clinical_events, medications -# Codelists -adhd_codelist = codelist_from_csv("codelists/nhsd-primary-care-domain-refsets-adhd_cod.csv", column="code") -methylphenidate_codelist = codelist_from_csv("codelists/nhsd-primary-care-domain-refsets-adhd_cod.csv", column="code") +from codelists import adhd_codelist, methylphenidate_codelist dataset = create_dataset() dataset.configure_dummy_data(population_size=10000) From cfb79a31f508ee15475e9e6ddc5d868e2dc3cdf9 Mon Sep 17 00:00:00 2001 From: Milan Wiedemann Date: Mon, 6 Jan 2025 
17:16:35 +0000 Subject: [PATCH 2/4] Simplify dataset definition The main change here is to take out looping through the years; there is another framework in ehrQL, measures, that does this for us. I also switched the naming of patient attributes to have the prefix has (instead of had), as we often use this naming convention for our studies. To shorten the code I also removed some of the variables that were only used to define the population and explicitly mention them only inside the define_population function. --- analysis/dataset_definition_core.py | 77 +++++++++++------------------ project.yaml | 8 +-- 2 files changed, 33 insertions(+), 52 deletions(-) diff --git a/analysis/dataset_definition_core.py b/analysis/dataset_definition_core.py index cb5bc1e..47f88f3 100644 --- a/analysis/dataset_definition_core.py +++ b/analysis/dataset_definition_core.py @@ -1,68 +1,47 @@ from ehrql import create_dataset -from ehrql.tables.tpp import patients, practice_registrations, clinical_events, medications +from ehrql.tables.tpp import ( + patients, + practice_registrations, + clinical_events, + medications, +) from codelists import adhd_codelist, methylphenidate_codelist dataset = create_dataset() -dataset.configure_dummy_data(population_size=10000) +dataset.configure_dummy_data(population_size=10) # Date range -start_date = f"2016-01-01" -end_date = f"2023-12-31" +start_date = "2016-01-01" +end_date = "2023-12-31" # Population variables -was_registered = practice_registrations.for_patient_on( - start_date -).exists_for_patient() & practice_registrations.for_patient_on( - end_date +has_registration = practice_registrations.spanning( + start_date, end_date ).exists_for_patient() +dataset.sex = patients.sex +dataset.age = patients.age_on(start_date) -is_female_or_male = patients.sex.is_in(["female", "male"]) - -was_adult = (patients.age_on(start_date) >= 18) & ( - patients.age_on(start_date) <= 120 +selected_events = clinical_events.where( + 
clinical_events.date.is_on_or_between(start_date, end_date) ) -was_alive = ( - patients.date_of_death.is_after(end_date) - | patients.date_of_death.is_null() +selected_medications = medications.where( + medications.date.is_on_or_between(start_date, end_date) ) -had_adhd_event = clinical_events.where( +dataset.has_adhd_event = selected_events.where( clinical_events.snomedct_code.is_in(adhd_codelist) - & clinical_events.date.is_on_or_between( - start_date, end_date - ) ).exists_for_patient() -dataset.define_population( - is_female_or_male - & was_adult - & was_alive - & was_registered - & had_adhd_event - ) - - -# Exposure variables -years = list(range(2016, 2023 + 1)) -# Iterate over each year and set the attribute on the dataset -for year in years: - start_date = f"{year}-01-01" - end_date = f"{year}-12-31" - - # Construct the attribute name dynamically for each year - attribute_name = f"num_adhd_events_{year}" - - # Calculate the number of ADHD events for the given year - num_adhd_events_year = clinical_events.where( - clinical_events.snomedct_code.is_in(adhd_codelist) - & clinical_events.date.is_on_or_between(start_date, end_date) - ).count_for_patient() - - # Set the attribute on the dataset - setattr(dataset, attribute_name, num_adhd_events_year) - +dataset.has_mph_med = selected_medications.where( + medications.dmd_code.is_in(methylphenidate_codelist) +).exists_for_patient() -dataset.sex = patients.sex -dataset.dob = patients.date_of_birth \ No newline at end of file +dataset.define_population( + has_registration + & dataset.sex.is_in(["male", "female"]) + & (dataset.age >= 18) + & (dataset.age <= 120) + & patients.is_alive_on(start_date), +) diff --git a/project.yaml b/project.yaml index c9ea3fe..f9493a2 100644 --- a/project.yaml +++ b/project.yaml @@ -5,11 +5,13 @@ expectations: population_size: 5000 actions: - generate_dataset_full: - run: ehrql:v1 generate-dataset analysis/dataset_definition_core.py --output output/full_dataset_test.csv.gz + 
generate_dataset: + run: > + ehrql:v1 generate-dataset analysis/dataset_definition_core.py + --output output/full_dataset_test.csv.gz outputs: highly_sensitive: - full_dataset: output/full_dataset_test.csv.gz + adhd_dataset: output/adhd_dataset.csv.gz generate_charts: run: python:v2 python analysis/report.py From 82e7861099b7f4a405fb80975f8cb4b2f305ca9e Mon Sep 17 00:00:00 2001 From: Milan Wiedemann Date: Mon, 6 Jan 2025 17:38:56 +0000 Subject: [PATCH 3/4] Add measures definition calculating yearly ADHD prevalence This is using the ehrQL measures framework to run the same query for multiple time intervals. I tried to match your original study period; note that the 7 yearly intervals starting on 2016-01-01 used here cover 2016-2022, one year short of the original 2016-2023 range. I'd recommend reducing this to 3 years for the first run to keep the run times short. --- analysis/measures_definition.py | 54 +++++++++++++++++++++++++++++++++ project.yaml | 12 ++++++-- 2 files changed, 64 insertions(+), 2 deletions(-) create mode 100644 analysis/measures_definition.py diff --git a/analysis/measures_definition.py b/analysis/measures_definition.py new file mode 100644 index 0000000..22dd766 --- /dev/null +++ b/analysis/measures_definition.py @@ -0,0 +1,54 @@ +from ehrql import INTERVAL, case, create_measures, when, years +from ehrql.tables.tpp import ( + patients, + practice_registrations, + clinical_events, + medications, +) + +from codelists import adhd_codelist, methylphenidate_codelist + +measures = create_measures() +measures.configure_dummy_data(population_size=10) + +# Population variables +has_registration = practice_registrations.spanning( + INTERVAL.start_date, INTERVAL.end_date +).exists_for_patient() + +sex = patients.sex +age = patients.age_on(INTERVAL.start_date) +age_band = case( + when((age >= 0) & (age < 20)).then("0-19"), + when((age >= 20) & (age < 40)).then("20-39"), + when((age >= 40) & (age < 60)).then("40-59"), + when((age >= 60) & (age < 80)).then("60-79"), + when(age >= 80).then("80+"), + when(age.is_null()).then("Missing"), +) + +selected_events = clinical_events.where( 
+ clinical_events.date.is_on_or_between(INTERVAL.start_date, INTERVAL.end_date) +) + +selected_medications = medications.where( + medications.date.is_on_or_between(INTERVAL.start_date, INTERVAL.end_date) +) + +has_adhd_event = selected_events.where( + clinical_events.snomedct_code.is_in(adhd_codelist) +).exists_for_patient() + +measures.define_measure( + name=f"adhd_prevalence", + numerator=has_adhd_event, + denominator=( + has_registration + & patients.sex.is_in(["male", "female"]) + & (age >= 18) + & (age <= 120) + & patients.is_alive_on(INTERVAL.start_date) + ), + group_by={"sex": sex, "age_band": age_band}, + intervals=years(7).starting_on("2016-01-01"), +) diff --git a/project.yaml b/project.yaml index f9493a2..d1a2ffa 100644 --- a/project.yaml +++ b/project.yaml @@ -8,14 +8,22 @@ actions: generate_dataset: run: > ehrql:v1 generate-dataset analysis/dataset_definition_core.py - --output output/full_dataset_test.csv.gz + --output output/adhd_dataset.csv.gz outputs: highly_sensitive: adhd_dataset: output/adhd_dataset.csv.gz + generate_adhd_prevalence: + run: > + ehrql:v1 generate-measures analysis/measures_definition.py + --output output/adhd_prevalence.csv + outputs: + moderately_sensitive: + measure: output/adhd_prevalence.csv + generate_charts: run: python:v2 python analysis/report.py - needs: [generate_dataset_full] + needs: [generate_dataset] outputs: moderately_sensitive: table1: output/results_saved.csv \ No newline at end of file From c0911a18fbdadd1fb61753027b82fd1c349f8bcc Mon Sep 17 00:00:00 2001 From: Milan Wiedemann Date: Mon, 6 Jan 2025 17:40:56 +0000 Subject: [PATCH 4/4] Remove `report.py` for now as it may no longer be needed --- analysis/report.py | 39 --------------------------------------- project.yaml | 7 ------- 2 files changed, 46 deletions(-) delete mode 100644 analysis/report.py diff --git a/analysis/report.py b/analysis/report.py deleted file mode 100644 index 743d0c4..0000000 --- a/analysis/report.py +++ /dev/null @@ -1,39 +0,0 @@ 
-import os -import pandas as pd -import matplotlib.pyplot as plt -import numpy as np - -# Ensure the 'outputs' directory exists -output_dir = "output" -os.makedirs(output_dir, exist_ok=True) - -# Load the data -data = pd.read_csv("output/full_dataset_test.csv.gz") - -# Melt the ADHD events data -adhd_events = data.melt( - id_vars=['patient_id', 'sex','dob'], - value_vars=[f'num_adhd_events_{year}' for year in range(2016, 2024)], - var_name='year', - value_name='num_adhd_events' -) - -#Computing ADHD disgnosis - -#First this is the total -adhd_total_sex_table = data.groupby(['sex']).count() -adhd_total_sex_table = adhd_total_sex_table.drop(['patient_id','dob'],axis=1) - -#Second the the number of adhd dia -#Need to binaries the dia -adhd_dia_sex_table = data.copy() -col_years = [f'num_adhd_events_{year}' for year in range(2016, 2024)] -adhd_dia_sex_table[col_years] = adhd_dia_sex_table[col_years] > 0 -adhd_dia_sex_table = adhd_dia_sex_table.groupby(['sex']).sum() -adhd_dia_sex_table = adhd_dia_sex_table.drop(['patient_id','dob'],axis=1) - -#Caulcate the prelavence -prevelnce = adhd_dia_sex_table/adhd_total_sex_table - -#Need to save the table -prevelnce.to_csv('output/results_saved.csv') \ No newline at end of file diff --git a/project.yaml b/project.yaml index d1a2ffa..cf04a37 100644 --- a/project.yaml +++ b/project.yaml @@ -20,10 +20,3 @@ actions: outputs: moderately_sensitive: measure: output/adhd_prevalence.csv - - generate_charts: - run: python:v2 python analysis/report.py - needs: [generate_dataset] - outputs: - moderately_sensitive: - table1: output/results_saved.csv \ No newline at end of file