From 150b8fac8f29ac2372c158378bc267c5582dd26a Mon Sep 17 00:00:00 2001
From: Milan Wiedemann <milan.wiedemann@gmail.com>
Date: Mon, 6 Jan 2025 16:46:21 +0000
Subject: [PATCH 1/4] Take reading codelists out of dataset definition

Creating a separate module will make it easier to share this code across the different dataset definitions that we might be working on for this project. This also fixes the path for `methylphenidate_codelist`: previously it pointed to the same NHSD Primary Care Refset for ADHD as `adhd_codelist`.
---
 analysis/codelists.py               | 11 +++++++++++
 analysis/dataset_definition_core.py |  6 ++----
 2 files changed, 13 insertions(+), 4 deletions(-)
 create mode 100644 analysis/codelists.py

diff --git a/analysis/codelists.py b/analysis/codelists.py
new file mode 100644
index 0000000..7cbad72
--- /dev/null
+++ b/analysis/codelists.py
@@ -0,0 +1,11 @@
+from ehrql import codelist_from_csv
+
+adhd_codelist = codelist_from_csv(
+    "codelists/nhsd-primary-care-domain-refsets-adhd_cod.csv",
+    column="code",
+)
+
+methylphenidate_codelist = codelist_from_csv(
+    "codelists/opensafely-methylphenidate-dmd.csv",
+    column="code",
+)
diff --git a/analysis/dataset_definition_core.py b/analysis/dataset_definition_core.py
index c40df8f..cb5bc1e 100644
--- a/analysis/dataset_definition_core.py
+++ b/analysis/dataset_definition_core.py
@@ -1,9 +1,7 @@
-from ehrql import create_dataset, codelist_from_csv
+from ehrql import create_dataset
 from ehrql.tables.tpp import patients, practice_registrations, clinical_events, medications
 
-# Codelists
-adhd_codelist = codelist_from_csv("codelists/nhsd-primary-care-domain-refsets-adhd_cod.csv", column="code")
-methylphenidate_codelist = codelist_from_csv("codelists/nhsd-primary-care-domain-refsets-adhd_cod.csv", column="code")
+from codelists import adhd_codelist, methylphenidate_codelist
 
 dataset = create_dataset()
 dataset.configure_dummy_data(population_size=10000)

From cfb79a31f508ee15475e9e6ddc5d868e2dc3cdf9 Mon Sep 17 00:00:00 2001
From: Milan Wiedemann <milan.wiedemann@gmail.com>
Date: Mon, 6 Jan 2025 17:16:35 +0000
Subject: [PATCH 2/4] Simplify dataset definition

The main change here is to take out the loop over the years; there is another framework in ehrQL, measures, that does this for us. I also switched the naming of patient attributes to use the prefix `has` (instead of `had`), as we often use this naming convention for our studies. To shorten the code, I also removed some of the variables that were only used to define the population, and now state those conditions explicitly inside the `define_population` call.
---
 analysis/dataset_definition_core.py | 77 +++++++++++------------------
 project.yaml                        |  8 +--
 2 files changed, 33 insertions(+), 52 deletions(-)

diff --git a/analysis/dataset_definition_core.py b/analysis/dataset_definition_core.py
index cb5bc1e..47f88f3 100644
--- a/analysis/dataset_definition_core.py
+++ b/analysis/dataset_definition_core.py
@@ -1,68 +1,47 @@
 from ehrql import create_dataset
-from ehrql.tables.tpp import patients, practice_registrations, clinical_events, medications
+from ehrql.tables.tpp import (
+    patients,
+    practice_registrations,
+    clinical_events,
+    medications,
+)
 
 from codelists import adhd_codelist, methylphenidate_codelist
 
 dataset = create_dataset()
-dataset.configure_dummy_data(population_size=10000)
+dataset.configure_dummy_data(population_size=10)
 
 # Date range
-start_date = f"2016-01-01"
-end_date = f"2023-12-31"
+start_date = "2016-01-01"
+end_date = "2023-12-31"
 
 # Population variables
-was_registered = practice_registrations.for_patient_on(
-    start_date
-).exists_for_patient() & practice_registrations.for_patient_on(
-    end_date
+has_registration = practice_registrations.spanning(
+    start_date, end_date
 ).exists_for_patient()
+dataset.sex = patients.sex
+dataset.age = patients.age_on(start_date)
 
-is_female_or_male = patients.sex.is_in(["female", "male"])
-
-was_adult = (patients.age_on(start_date) >= 18) & (
-    patients.age_on(start_date) <= 120
+selected_events = clinical_events.where(
+    clinical_events.date.is_on_or_between(start_date, end_date)
 )
 
-was_alive = (
-    patients.date_of_death.is_after(end_date)
-    | patients.date_of_death.is_null()
+selected_medications = medications.where(
+    medications.date.is_on_or_between(start_date, end_date)
 )
 
-had_adhd_event = clinical_events.where(
+dataset.has_adhd_event = selected_events.where(
     clinical_events.snomedct_code.is_in(adhd_codelist)
-    & clinical_events.date.is_on_or_between(
-        start_date, end_date
-    )
 ).exists_for_patient()
 
-dataset.define_population(
-                        is_female_or_male
-                        & was_adult
-                        & was_alive
-                        & was_registered
-                        & had_adhd_event
-                    )
-
-
-# Exposure variables
-years = list(range(2016, 2023 + 1))
-# Iterate over each year and set the attribute on the dataset
-for year in years:
-    start_date = f"{year}-01-01"
-    end_date = f"{year}-12-31"
-    
-    # Construct the attribute name dynamically for each year
-    attribute_name = f"num_adhd_events_{year}"
-    
-    # Calculate the number of ADHD events for the given year
-    num_adhd_events_year = clinical_events.where(
-        clinical_events.snomedct_code.is_in(adhd_codelist)
-        & clinical_events.date.is_on_or_between(start_date, end_date)
-    ).count_for_patient()
-    
-    # Set the attribute on the dataset
-    setattr(dataset, attribute_name, num_adhd_events_year)
-
+dataset.has_mph_med = selected_medications.where(
+    medications.dmd_code.is_in(methylphenidate_codelist)
+).exists_for_patient()
 
-dataset.sex = patients.sex
-dataset.dob = patients.date_of_birth
\ No newline at end of file
+dataset.define_population(
+    has_registration
+    & dataset.sex.is_in(["male", "female"])
+    & (dataset.age >= 18)
+    & (dataset.age <= 120)
+    & patients.is_alive_on(start_date),
+)
diff --git a/project.yaml b/project.yaml
index c9ea3fe..f9493a2 100644
--- a/project.yaml
+++ b/project.yaml
@@ -5,11 +5,13 @@ expectations:
   population_size: 5000
 
 actions:
-  generate_dataset_full:
-    run: ehrql:v1 generate-dataset analysis/dataset_definition_core.py --output output/full_dataset_test.csv.gz
+  generate_dataset:
+    run: > 
+      ehrql:v1 generate-dataset analysis/dataset_definition_core.py 
+      --output output/full_dataset_test.csv.gz
     outputs:
       highly_sensitive:
-        full_dataset: output/full_dataset_test.csv.gz
+        adhd_dataset: output/adhd_dataset.csv.gz
   
   generate_charts:
     run: python:v2 python analysis/report.py

From 82e7861099b7f4a405fb80975f8cb4b2f305ca9e Mon Sep 17 00:00:00 2001
From: Milan Wiedemann <milan.wiedemann@gmail.com>
Date: Mon, 6 Jan 2025 17:38:56 +0000
Subject: [PATCH 3/4] Add measures definition calculating yearly ADHD
 prevalence

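This uses the ehrQL measures framework to run the same query for multiple time intervals. I tried to match your original yearly loop with 7 yearly intervals starting on 2016-01-01, although I'd recommend reducing this to 3 years for the first run to keep the run times short. Note that 7 intervals cover 2016-2022, whereas the original loop also included 2023.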
---
 analysis/measures_definition.py | 54 +++++++++++++++++++++++++++++++++
 project.yaml                    | 12 ++++++--
 2 files changed, 64 insertions(+), 2 deletions(-)
 create mode 100644 analysis/measures_definition.py

diff --git a/analysis/measures_definition.py b/analysis/measures_definition.py
new file mode 100644
index 0000000..22dd766
--- /dev/null
+++ b/analysis/measures_definition.py
@@ -0,0 +1,54 @@
+from ehrql import INTERVAL, case, create_measures, when, years
+from ehrql.tables.tpp import (
+    patients,
+    practice_registrations,
+    clinical_events,
+    medications,
+)
+
+from codelists import adhd_codelist, methylphenidate_codelist
+
+measures = create_measures()
+measures.configure_dummy_data(population_size=10)
+
+# Population variables
+has_registration = practice_registrations.spanning(
+    INTERVAL.start_date, INTERVAL.end_date
+).exists_for_patient()
+
+sex = patients.sex
+age = patients.age_on(INTERVAL.start_date)
+age_band = case(
+    when((age >= 0) & (age < 20)).then("0-19"),
+    when((age >= 20) & (age < 40)).then("20-39"),
+    when((age >= 40) & (age < 60)).then("40-59"),
+    when((age >= 60) & (age < 80)).then("60-79"),
+    when(age >= 80).then("80+"),
+    when(age.is_null()).then("Missing"),
+)
+
+selected_events = clinical_events.where(
+    clinical_events.date.is_on_or_between(INTERVAL.start_date, INTERVAL.end_date)
+)
+
+selected_medications = medications.where(
+    medications.date.is_on_or_between(INTERVAL.start_date, INTERVAL.end_date)
+)
+
+has_adhd_event = selected_events.where(
+    clinical_events.snomedct_code.is_in(adhd_codelist)
+).exists_for_patient()
+
+measures.define_measure(
+    name=f"adhd_prevalence",
+    numerator=has_adhd_event,
+    denominator=(
+        has_registration
+        & patients.sex.is_in(["male", "female"])
+        & (age >= 18)
+        & (age <= 120)
+        & patients.is_alive_on(INTERVAL.start_date)
+    ),
+    group_by={"sex": sex, "age_band": age_band},
+    intervals=years(7).starting_on("2016-01-01"),
+)
diff --git a/project.yaml b/project.yaml
index f9493a2..d1a2ffa 100644
--- a/project.yaml
+++ b/project.yaml
@@ -8,14 +8,22 @@ actions:
   generate_dataset:
     run: > 
       ehrql:v1 generate-dataset analysis/dataset_definition_core.py 
-      --output output/full_dataset_test.csv.gz
+      --output output/adhd_dataset.csv.gz
     outputs:
       highly_sensitive:
         adhd_dataset: output/adhd_dataset.csv.gz
   
+  generate_adhd_prevalence:
+    run: > 
+      ehrql:v1 generate-measures analysis/measures_definition.py 
+      --output output/adhd_prevalence.csv
+    outputs:
+      moderately_sensitive:
+        measure: output/adhd_prevalence.csv
+  
   generate_charts:
     run: python:v2 python analysis/report.py
-    needs: [generate_dataset_full]
+    needs: [generate_dataset]
     outputs:
       moderately_sensitive:
         table1: output/results_saved.csv
\ No newline at end of file

From c0911a18fbdadd1fb61753027b82fd1c349f8bcc Mon Sep 17 00:00:00 2001
From: Milan Wiedemann <milan.wiedemann@gmail.com>
Date: Mon, 6 Jan 2025 17:40:56 +0000
Subject: [PATCH 4/4] Remove `report.py` for now as it may no longer be needed

---
 analysis/report.py | 39 ---------------------------------------
 project.yaml       |  7 -------
 2 files changed, 46 deletions(-)
 delete mode 100644 analysis/report.py

diff --git a/analysis/report.py b/analysis/report.py
deleted file mode 100644
index 743d0c4..0000000
--- a/analysis/report.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import os
-import pandas as pd
-import matplotlib.pyplot as plt
-import numpy as np
-
-# Ensure the 'outputs' directory exists
-output_dir = "output"
-os.makedirs(output_dir, exist_ok=True)
-
-# Load the data
-data = pd.read_csv("output/full_dataset_test.csv.gz")
-
-# Melt the ADHD events data
-adhd_events = data.melt(
-    id_vars=['patient_id', 'sex','dob'],
-    value_vars=[f'num_adhd_events_{year}' for year in range(2016, 2024)],
-    var_name='year',
-    value_name='num_adhd_events'
-)
-
-#Computing ADHD disgnosis 
-
-#First this is the total
-adhd_total_sex_table = data.groupby(['sex']).count()
-adhd_total_sex_table = adhd_total_sex_table.drop(['patient_id','dob'],axis=1)
-
-#Second the the number of adhd dia
-#Need to binaries the dia
-adhd_dia_sex_table = data.copy()
-col_years = [f'num_adhd_events_{year}' for year in range(2016, 2024)]
-adhd_dia_sex_table[col_years] = adhd_dia_sex_table[col_years] > 0
-adhd_dia_sex_table = adhd_dia_sex_table.groupby(['sex']).sum()
-adhd_dia_sex_table = adhd_dia_sex_table.drop(['patient_id','dob'],axis=1)
-
-#Caulcate the prelavence
-prevelnce = adhd_dia_sex_table/adhd_total_sex_table
-
-#Need to save the table
-prevelnce.to_csv('output/results_saved.csv')
\ No newline at end of file
diff --git a/project.yaml b/project.yaml
index d1a2ffa..cf04a37 100644
--- a/project.yaml
+++ b/project.yaml
@@ -20,10 +20,3 @@ actions:
     outputs:
       moderately_sensitive:
         measure: output/adhd_prevalence.csv
-  
-  generate_charts:
-    run: python:v2 python analysis/report.py
-    needs: [generate_dataset]
-    outputs:
-      moderately_sensitive:
-        table1: output/results_saved.csv
\ No newline at end of file