From fc8a4f631384ee1258c03003e8027bc617860d1a Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Mon, 11 Nov 2024 16:40:01 +0000
Subject: [PATCH 01/32] Create preprocess_data.R

---
 analysis/preprocess/preprocess_data.R | 172 ++++++++++++++++++++++++++
 1 file changed, 172 insertions(+)
 create mode 100644 analysis/preprocess/preprocess_data.R

diff --git a/analysis/preprocess/preprocess_data.R b/analysis/preprocess/preprocess_data.R
new file mode 100644
index 0000000..67265e2
--- /dev/null
+++ b/analysis/preprocess/preprocess_data.R
@@ -0,0 +1,172 @@
+# Load libraries ---------------------------------------------------------------
+
+library(magrittr)
+library(dplyr)
+library(tidyverse)
+library(lubridate)
+library(data.table)
+library(readr)
+
+# Specify command arguments ----------------------------------------------------
+
+args <- commandArgs(trailingOnly=TRUE)
+print(length(args))
+if(length(args)==0){
+  cohort_name <- "prevax_extf"
+} else {
+  cohort_name <- args[[1]]
+}
+
+# Get column names -------------------------------------------------------------
+
+all_cols <- fread(paste0("output/input_",cohort_name,".csv.gz"), 
+                  header = TRUE, sep = ",", nrows = 0, 
+                  stringsAsFactors = FALSE) %>%
+  names()
+
+message("Column names found")
+
+# Identify column classes ------------------------------------------------------
+
+cat_cols <- c("patient_id", grep("_cat", all_cols, value = TRUE))
+
+bin_cols <- c(grep("_bin", all_cols, value = TRUE), 
+              grep("prostate_cancer_", all_cols, value = TRUE),
+              "has_follow_up_previous_6months", "has_died", "registered_at_start",
+              "tmp_cocp","tmp_hrt")
+
+num_cols <- c(grep("_num", all_cols, value = TRUE),
+              grep("vax_jcvi_age_", all_cols, value = TRUE))
+
+date_cols <- grep("_date", all_cols, value = TRUE)
+
+message("Column classes identified")
+
+# Define column classes --------------------------------------------------------
+
+col_classes <- setNames(
+  c(rep("c", length(cat_cols)),
+    rep("l", length(bin_cols)),
+    rep("d", length(num_cols)),
+    rep("D", length(date_cols))
+  ), 
+  all_cols[match(c(cat_cols, bin_cols, num_cols, date_cols), all_cols)]
+)
+
+message("Column classes defined")
+
+# Read cohort dataset ---------------------------------------------------------- 
+
+df <- read_csv(paste0("output/input_",cohort_name,".csv.gz"), 
+               col_types = col_classes)
+
+message(paste0("Dataset has been read successfully with N = ", nrow(df), " rows"))
+
+# Add death_date and deregistration_date from prelim data ----------------------
+
+prelim_data <- read_csv("output/index_dates.csv.gz")
+prelim_data <- prelim_data[,c("patient_id","death_date","deregistration_date")]
+prelim_data$patient_id <- as.character(prelim_data$patient_id)
+prelim_data$death_date <- as.Date(prelim_data$death_date)
+prelim_data$deregistration_date <- as.Date(prelim_data$deregistration_date)
+
+df <- df %>% inner_join(prelim_data,by="patient_id")
+
+message("Death and deregistration dates added!")
+
+# Format columns ---------------------------------------------------------------
+
+df <- df %>%
+  mutate(across(c(contains("_date")),
+                ~ floor_date(as.Date(., format="%Y-%m-%d"), unit = "days")),
+         across(contains('_birth_year'),
+                ~ format(as.Date(., origin = "1970-01-01"), "%Y")),
+         across(contains('_num') & !contains('date'), ~ as.numeric(.)),
+         across(contains('_cat'), ~ as.factor(.)),
+         across(contains('_bin'), ~ as.logical(.)))
+
+# Overwrite vaccination information for dummy data and vax cohort only ---------
+
+if(Sys.getenv("OPENSAFELY_BACKEND") %in% c("", "expectations") &&
+   cohort_name %in% c("vax")) {
+  source("analysis/modify_dummy_vax_data.R")
+  message("Vaccine information overwritten successfully")
+}
+
+# Describe data ----------------------------------------------------------------
+
+sink(paste0("output/describe_",cohort_name,".txt"))
+print(Hmisc::describe(df))
+sink()
+message ("Cohort ",cohort_name, " description written successfully!")
+
+# QC for consultation variable and set max to 365 (i.e., one per day) ----------
+
+df <- df %>%
+  mutate(cov_num_consulation_rate = replace(cov_num_consulation_rate, 
+                                            cov_num_consulation_rate > 365, 365))
+
+# Define COVID-19 severity -----------------------------------------------------
+
+df <- df %>%
+  mutate(sub_cat_covid19_hospital = 
+           ifelse(!is.na(exp_date_covid19_confirmed) &
+                    !is.na(sub_date_covid19_hospital) &
+                    sub_date_covid19_hospital - exp_date_covid19_confirmed >= 0 &
+                    sub_date_covid19_hospital - exp_date_covid19_confirmed < 29, "hospitalised",
+                  ifelse(!is.na(exp_date_covid19_confirmed), "non_hospitalised", 
+                         ifelse(is.na(exp_date_covid19_confirmed), "no_infection", NA)))) %>%
+  mutate(across(sub_cat_covid19_hospital, factor))
+
+df <- df[!is.na(df$patient_id),]
+df[,c("sub_date_covid19_hospital")] <- NULL
+
+message("COVID19 severity determined successfully")
+
+# Restrict columns and save analysis dataset -----------------------------------
+
+df1 <- df %>% 
+  select(patient_id,
+         death_date,
+         starts_with("index_date_"),
+         has_follow_up_previous_6months,
+         deregistration_date,
+         starts_with("end_date_"),
+         contains("sub_"), # Subgroups
+         contains("exp_"), # Exposures
+         contains("out_"), # Outcomes
+         contains("cov_"), # Covariates
+         contains("qa_"), # Quality assurance
+         contains("step"), # diabetes steps
+         contains("vax_date_eligible"), # Vaccination eligibility
+         contains("vax_date_"), # Vaccination dates and vax type 
+         contains("vax_cat_") # Vaccination products
+  )
+
+df1[,colnames(df)[grepl("tmp_",colnames(df))]] <- NULL
+
+# Save input -------------------------------------------------------------------
+
+saveRDS(df1, file = paste0("output/input_",cohort_name,".rds"), compress = TRUE)
+message(paste0("Input data saved successfully with N = ", nrow(df1), " rows"))
+
+# Describe data ----------------------------------------------------------------
+
+sink(paste0("output/describe_input_",cohort_name,"_stage0.txt"))
+print(Hmisc::describe(df1))
+sink()
+
+# Restrict columns and save Venn diagram input dataset -------------------------
+
+df2 <- df %>% select(starts_with(c("patient_id","tmp_out_date","out_date")))
+
+# Describe data ----------------------------------------------------------------
+
+sink(paste0("output/describe_venn_",cohort_name,".txt"))
+print(Hmisc::describe(df2))
+sink()
+
+saveRDS(df2, file = paste0("output/venn_",cohort_name,".rds"), compress = TRUE)
+
+message("Venn diagram data saved successfully")
+tictoc::toc()
\ No newline at end of file

From 641e0bc0156b5f1921a05101922610614c0523ea Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Mon, 2 Dec 2024 12:57:56 +0000
Subject: [PATCH 02/32] Create modify_dummy_vax_data.R

---
 analysis/preprocess/modify_dummy_vax_data.R | 173 ++++++++++++++++++++
 1 file changed, 173 insertions(+)
 create mode 100644 analysis/preprocess/modify_dummy_vax_data.R

diff --git a/analysis/preprocess/modify_dummy_vax_data.R b/analysis/preprocess/modify_dummy_vax_data.R
new file mode 100644
index 0000000..9230d8c
--- /dev/null
+++ b/analysis/preprocess/modify_dummy_vax_data.R
@@ -0,0 +1,173 @@
+# Modified from https://github.com/opensafely/waning-ve-2dose-1year/blob/main/analysis/dummy_data_vax.R
+# And https://github.com/opensafely/post-covid-vaccinated/blob/main/analysis/modify_dummy_vax_data.R
+
+# Set seed ---------------------------------------------------------------------
+set.seed(1)
+
+# Change first jab date so that they have roughly correct distribution  
+df_vax<- df %>%
+  mutate(
+    vax_date_Pfizer_1 = as.Date(vax_date_eligible) + days(round(rnorm(nrow(.), mean = 10, sd = 3))),
+    vax_date_AstraZeneca_1 = as.Date(vax_date_eligible) + days(round(rnorm(nrow(.), mean = 10, sd = 3))),
+    vax_date_Moderna_1 = as.Date(vax_date_eligible) + days(round(rnorm(nrow(.), mean = 10, sd = 3)))
+  ) %>%
+  #Pick one vaccine type
+  mutate(
+    vaccine_1_type = sample(
+      x = c("Pfizer", "AstraZeneca", "Moderna",  "None"),
+      size = nrow(.),
+      replace = TRUE,
+      prob = c(0.4, 0.4, 0.05, 0.1)
+    ),
+    # jabs missingness probabilities 
+    missing_pfizer_2 = rbernoulli(nrow(.), p=0.05),
+    missing_az_2 = rbernoulli(nrow(.), p=0.05),
+    missing_moderna_2 = rbernoulli(nrow(.), p=0.05),
+    missing_pfizer_3 = rbernoulli(nrow(.), p=0.9),
+    missing_az_3 = rbernoulli(nrow(.), p=0.9),
+    missing_moderna_3 = rbernoulli(nrow(.), p=0.9)
+  )%>%
+  #Set first jab date according to type and set others to NA 
+  mutate(across(vax_date_Pfizer_1,
+                ~if_else(
+                  vaccine_1_type %in% "Pfizer",
+                  .x,
+                  NA_Date_))) %>%
+  mutate(across(vax_date_AstraZeneca_1,
+                ~if_else(
+                  vaccine_1_type %in% "AstraZeneca",
+                  .x,
+                  NA_Date_))) %>%
+  mutate(across(vax_date_Moderna_1,
+                ~if_else(
+                  vaccine_1_type %in% "Moderna",
+                  .x,
+                  NA_Date_))) %>%
+  
+  mutate(across(matches("vax_date\\w+_1"),
+                ~ if_else(
+                  vaccine_1_type %in% "None",
+                  NA_Date_,
+                  .x
+                ))) %>%
+  
+  #Change date for the second jab
+  mutate(
+    vax_date_Pfizer_2 = vax_date_Pfizer_1 + days(round(rnorm(nrow(.), mean = 10*7, sd = 3))),
+    vax_date_AstraZeneca_2 = vax_date_AstraZeneca_1 + days(round(rnorm(nrow(.), mean = 10*7, sd = 3))),
+    vax_date_Moderna_2 = vax_date_Moderna_1  + days(round(rnorm(nrow(.), mean = 10*7, sd = 3))),
+  ) %>%
+  
+  # Set 2nd vaccine type
+  mutate(vaccine_2_type =  ifelse(runif(nrow(df),0,1)>0.95 & vaccine_1_type!="None",
+                                  sample(
+                                    x = c("Pfizer", "AstraZeneca", "Moderna",  "None"),
+                                    size = nrow(.),
+                                    replace = TRUE,
+                                    prob = c(0.4, 0.4, 0.05, 0.1)
+                                  ),
+                                  vaccine_1_type)
+  ) %>%
+  
+  #Set second jab date according to type and set others to NA 
+  
+  mutate(across(vax_date_Pfizer_2,
+                ~if_else(
+                  vaccine_2_type %in% "Pfizer",
+                  .x,
+                  NA_Date_))) %>%
+  mutate(across(vax_date_AstraZeneca_2,
+                ~if_else(
+                  vaccine_2_type %in% "AstraZeneca",
+                  .x,
+                  NA_Date_))) %>%
+  mutate(across(vax_date_Moderna_2,
+                ~if_else(
+                  vaccine_2_type %in% "Moderna", # dose-2 product, matching the Pfizer/AstraZeneca steps
+                  .x,
+                  NA_Date_))) %>%
+  
+  mutate(across(matches("vax_date\\w+_2"),
+                ~ if_else(
+                  vaccine_2_type %in% "None",
+                  NA_Date_,
+                  .x
+                ))) %>%
+  
+  # Set to NA if jab is missing
+  mutate(across(vax_date_Pfizer_2,
+                ~if_else(
+                  missing_pfizer_2,
+                  NA_Date_,
+                  .x))) %>%
+  mutate(across(vax_date_AstraZeneca_2,
+                ~if_else(
+                  missing_az_2,
+                  NA_Date_,
+                  .x))) %>%
+  mutate(across(vax_date_Moderna_2,
+                ~if_else(
+                  missing_moderna_2,
+                  NA_Date_,
+                  .x))) %>%
+  
+  #Set 3rd jab type
+  mutate(vaccine_3_type =  ifelse( vaccine_2_type!="None",
+                                   sample(
+                                     x = c("Pfizer", "AstraZeneca" ,"Moderna",  "None"),
+                                     size = nrow(.),
+                                     replace = TRUE,
+                                     prob = c(0.6, 0.1, 0.3, 0.1)
+                                   ),vaccine_2_type
+  )
+  ) %>%
+  
+  #Change 3rd jab date
+  mutate(
+    vax_date_Pfizer_3 = vax_date_Pfizer_2 + days(round(rnorm(nrow(.), mean = 6*4*7, sd = 7))),
+    vax_date_AstraZeneca_3 = vax_date_AstraZeneca_2 + days(round(rnorm(nrow(.), mean = 6*4*7, sd = 7))),
+    vax_date_Moderna_3 = vax_date_Moderna_2 + days(round(rnorm(nrow(.), mean = 6*4*7, sd = 7))),
+  ) %>%
+  
+  #Set 3rd jab date according to type and set others to NA 
+  
+  mutate(across(vax_date_Pfizer_3,
+                ~if_else(
+                  vaccine_3_type %in% "Pfizer",
+                  .x,
+                  NA_Date_))) %>%
+  mutate(across(vax_date_AstraZeneca_3,
+                ~if_else(
+                  vaccine_3_type %in% "AstraZeneca",
+                  .x,
+                  NA_Date_))) %>%
+  mutate(across(vax_date_Moderna_3,
+                ~if_else(
+                  vaccine_3_type %in% "Moderna", # dose-3 product, matching the Pfizer/AstraZeneca steps
+                  .x,
+                  NA_Date_))) %>%
+  
+  mutate(across(matches("vax_date\\w+_3"),
+                ~ if_else(
+                  vaccine_3_type %in% "None",
+                  NA_Date_,
+                  .x
+                ))) %>%
+  # Set to NA if jab is missing
+  mutate(across(vax_date_Pfizer_3,
+                ~if_else(
+                  missing_pfizer_3,
+                  NA_Date_,
+                  .x))) %>%
+  mutate(across(vax_date_AstraZeneca_3,
+                ~if_else(
+                  missing_az_3,
+                  NA_Date_,
+                  .x))) %>%
+  mutate(across(vax_date_Moderna_3,
+                ~if_else(
+                  missing_moderna_3,
+                  NA_Date_,
+                  .x)))%>%
+  
+  select(-starts_with("missing"),-matches("vaccine_\\d_type"))
\ No newline at end of file

From 9c47b0d823d05a7e93f876f8f90d7971a6389677 Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Mon, 2 Dec 2024 15:44:43 +0000
Subject: [PATCH 03/32] Update modify_dummy_vax_data.R

---
 analysis/preprocess/modify_dummy_vax_data.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/analysis/preprocess/modify_dummy_vax_data.R b/analysis/preprocess/modify_dummy_vax_data.R
index 9230d8c..fb6480c 100644
--- a/analysis/preprocess/modify_dummy_vax_data.R
+++ b/analysis/preprocess/modify_dummy_vax_data.R
@@ -5,7 +5,7 @@
 set.seed(1)
 
 # Change first jab date so that they have roughly correct distribution  
-df_vax<- df %>%
+df <- df %>%
   mutate(
     vax_date_Pfizer_1 = as.Date(vax_date_eligible) + days(round(rnorm(nrow(.), mean = 10, sd = 3))),
     vax_date_AstraZeneca_1 = as.Date(vax_date_eligible) + days(round(rnorm(nrow(.), mean = 10, sd = 3))),

From 00a8c03c8538e282340ee0ff1bbf67470da7ba93 Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Mon, 2 Dec 2024 15:44:45 +0000
Subject: [PATCH 04/32] Update preprocess_data.R

---
 analysis/preprocess/preprocess_data.R | 22 ++++------------------
 1 file changed, 4 insertions(+), 18 deletions(-)

diff --git a/analysis/preprocess/preprocess_data.R b/analysis/preprocess/preprocess_data.R
index 67265e2..b2447ae 100644
--- a/analysis/preprocess/preprocess_data.R
+++ b/analysis/preprocess/preprocess_data.R
@@ -12,7 +12,7 @@ library(readr)
 args <- commandArgs(trailingOnly=TRUE)
 print(length(args))
 if(length(args)==0){
-  cohort_name <- "prevax_extf"
+  cohort_name <- "vax"
 } else {
   cohort_name <- args[[1]]
 }
@@ -30,10 +30,8 @@ message("Column names found")
 
 cat_cols <- c("patient_id", grep("_cat", all_cols, value = TRUE))
 
-bin_cols <- c(grep("_bin", all_cols, value = TRUE), 
-              grep("prostate_cancer_", all_cols, value = TRUE),
-              "has_follow_up_previous_6months", "has_died", "registered_at_start",
-              "tmp_cocp","tmp_hrt")
+bin_cols <- c(grep("_bin", all_cols, value = TRUE),
+              "has_follow_up_previous_6months", "was_alive", "has_died")
 
 num_cols <- c(grep("_num", all_cols, value = TRUE),
               grep("vax_jcvi_age_", all_cols, value = TRUE))
@@ -62,18 +60,6 @@ df <- read_csv(paste0("output/input_",cohort_name,".csv.gz"),
 
 message(paste0("Dataset has been read successfully with N = ", nrow(df), " rows"))
 
-# Add death_date and deregistration_date from prelim data ----------------------
-
-prelim_data <- read_csv("output/index_dates.csv.gz")
-prelim_data <- prelim_data[,c("patient_id","death_date","deregistration_date")]
-prelim_data$patient_id <- as.character(prelim_data$patient_id)
-prelim_data$death_date <- as.Date(prelim_data$death_date)
-prelim_data$deregistration_date <- as.Date(prelim_data$deregistration_date)
-
-df <- df %>% inner_join(prelim_data,by="patient_id")
-
-message("Death and deregistration dates added!")
-
 # Format columns ---------------------------------------------------------------
 
 df <- df %>%
@@ -89,7 +75,7 @@ df <- df %>%
 
 if(Sys.getenv("OPENSAFELY_BACKEND") %in% c("", "expectations") &&
    cohort_name %in% c("vax")) {
-  source("analysis/modify_dummy_vax_data.R")
+  source("analysis/preprocess/modify_dummy_vax_data.R")
   message("Vaccine information overwritten successfully")
 }
 

From 50364ae1eeec2097ab54499aa2d70a289b5e71ec Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Mon, 2 Dec 2024 15:44:52 +0000
Subject: [PATCH 05/32] Create specify_paths.R

---
 analysis/specify_paths.R | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 analysis/specify_paths.R

diff --git a/analysis/specify_paths.R b/analysis/specify_paths.R
new file mode 100644
index 0000000..eb9eeaf
--- /dev/null
+++ b/analysis/specify_paths.R
@@ -0,0 +1,13 @@
+# This file is used to specify paths. It is in the .gitignore to keep your information secret.
+# To use, please remove "_example" from the file name and add your specific file paths below.
+
+release <- "" # Specify path to release directory  
+#path_aer_input <- paste0(release,"20230807/aer_input-main-rounded.csv")
+#path_consort <- paste0(release,"20230807/consort_output_rounded.csv")
+#path_median_iqr_age <- paste0(release,"20230807/median_iqr_age.csv")
+path_model_output <- paste0(release,"20231116/model_output_midpoint6.csv")
+path_stata_model_output <- paste0(release,"20231116/stata_model_output_midpoint6.csv")
+path_table1 <- paste0(release,"18_05_2023/") # Paths as multiple files
+path_table2 <- paste0(release,"18_05_2023/") # Paths as multiple files
+#path_venn <- paste0(release,"20230807/venn_output_rounded.csv")
+#path_extendedtable1 <- paste0(release,"20230810/extendedtable1_output_rounded.csv")
\ No newline at end of file

From 7460ad3013421bf2e25dd02ffc80e422bebd87bc Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Tue, 10 Dec 2024 13:09:59 +0000
Subject: [PATCH 06/32] Update preprocess_data.R

---
 analysis/preprocess/preprocess_data.R | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/analysis/preprocess/preprocess_data.R b/analysis/preprocess/preprocess_data.R
index b2447ae..08303de 100644
--- a/analysis/preprocess/preprocess_data.R
+++ b/analysis/preprocess/preprocess_data.R
@@ -154,5 +154,4 @@ sink()
 
 saveRDS(df2, file = paste0("output/venn_",cohort_name,".rds"), compress = TRUE)
 
-message("Venn diagram data saved successfully")
-tictoc::toc()
\ No newline at end of file
+message("Venn diagram data saved successfully")
\ No newline at end of file

From 3b0c18df586f2caef9ce23c0b4124e2a168af3b3 Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Thu, 12 Dec 2024 13:43:01 +0000
Subject: [PATCH 07/32] Delete specify_paths.R

---
 analysis/specify_paths.R | 13 -------------
 1 file changed, 13 deletions(-)
 delete mode 100644 analysis/specify_paths.R

diff --git a/analysis/specify_paths.R b/analysis/specify_paths.R
deleted file mode 100644
index eb9eeaf..0000000
--- a/analysis/specify_paths.R
+++ /dev/null
@@ -1,13 +0,0 @@
-# This file is used to specify paths. It is in the .gitignore to keep your information secret.
-# To use, please remove "_example" from the file name and add your specific file paths below.
-
-release <- "" # Specify path to release directory  
-#path_aer_input <- paste0(release,"20230807/aer_input-main-rounded.csv")
-#path_consort <- paste0(release,"20230807/consort_output_rounded.csv")
-#path_median_iqr_age <- paste0(release,"20230807/median_iqr_age.csv")
-path_model_output <- paste0(release,"20231116/model_output_midpoint6.csv")
-path_stata_model_output <- paste0(release,"20231116/stata_model_output_midpoint6.csv")
-path_table1 <- paste0(release,"18_05_2023/") # Paths as multiple files
-path_table2 <- paste0(release,"18_05_2023/") # Paths as multiple files
-#path_venn <- paste0(release,"20230807/venn_output_rounded.csv")
-#path_extendedtable1 <- paste0(release,"20230810/extendedtable1_output_rounded.csv")
\ No newline at end of file

From 20aa35bc519c0e06c5487ac295687d351bfc967e Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Thu, 12 Dec 2024 13:47:44 +0000
Subject: [PATCH 08/32] Create specify_paths_example.R

---
 analysis/specify_paths_example.R | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 analysis/specify_paths_example.R

diff --git a/analysis/specify_paths_example.R b/analysis/specify_paths_example.R
new file mode 100644
index 0000000..4c9dea1
--- /dev/null
+++ b/analysis/specify_paths_example.R
@@ -0,0 +1,15 @@
+# This file is used to specify paths. It is in the .gitignore to keep your information secret.
+# To use, please remove "_example" from the file name and add your specific file paths below.
+
+release <- "" # Specify path to release directory  
+path_aer_input <- paste0(release,"20230807/aer_input-main-rounded.csv")
+path_consort <- paste0(release,"20230807/consort_output_rounded.csv")
+path_median_iqr_age <- paste0(release,"20230807/median_iqr_age.csv")
+path_table1 <- paste0(release,"20230807/table1_output_rounded.csv")
+path_table2 <- paste0(release,"20230927/table2_output_rounded.csv")
+path_venn <- paste0(release,"20230807/venn_output_rounded.csv")
+path_extendedtable1 <- paste0(release,"20230810/extendedtable1_output_rounded.csv")
+path_model_output <- paste0(release,"20231115/model_output_rounded.csv")
+path_stata_model_output <- paste0(release,"20231117/stata_model_output_rounded.csv")
+path_cohortcovid <- paste0(release,"20240503/cohortcovid_midpoint6.csv")
+path_cohortoverlap <- paste0(release,"20240503/cohortoverlap_midpoint6.csv")
\ No newline at end of file

From 44e3d8933994a4422f4239bf8daa3ad9f352b60e Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Thu, 12 Dec 2024 14:45:20 +0000
Subject: [PATCH 09/32] Update preprocess_data.R

---
 analysis/preprocess/preprocess_data.R | 34 ++++++++-------------------
 1 file changed, 10 insertions(+), 24 deletions(-)

diff --git a/analysis/preprocess/preprocess_data.R b/analysis/preprocess/preprocess_data.R
index 08303de..2541ed9 100644
--- a/analysis/preprocess/preprocess_data.R
+++ b/analysis/preprocess/preprocess_data.R
@@ -26,6 +26,8 @@ all_cols <- fread(paste0("output/input_",cohort_name,".csv.gz"),
 
 message("Column names found")
 
+print(all_cols)
+
 # Identify column classes ------------------------------------------------------
 
 cat_cols <- c("patient_id", grep("_cat", all_cols, value = TRUE))
@@ -63,13 +65,13 @@ message(paste0("Dataset has been read successfully with N = ", nrow(df), " rows"
 # Format columns ---------------------------------------------------------------
 
 df <- df %>%
-  mutate(across(c(contains("_date")),
+  mutate(across(all_of(date_cols),
                 ~ floor_date(as.Date(., format="%Y-%m-%d"), unit = "days")),
          across(contains('_birth_year'),
                 ~ format(as.Date(., origin = "1970-01-01"), "%Y")),
-         across(contains('_num') & !contains('date'), ~ as.numeric(.)),
-         across(contains('_cat'), ~ as.factor(.)),
-         across(contains('_bin'), ~ as.logical(.)))
+         across(all_of(num_cols), ~ as.numeric(.)),
+         across(all_of(cat_cols), ~ as.factor(.)),
+         across(all_of(bin_cols), ~ as.logical(.)))
 
 # Overwrite vaccination information for dummy data and vax cohort only ---------
 
@@ -86,28 +88,11 @@ print(Hmisc::describe(df))
 sink()
 message ("Cohort ",cohort_name, " description written successfully!")
 
-# QC for consultation variable and set max to 365 (i.e., one per day) ----------
-
-df <- df %>%
-  mutate(cov_num_consulation_rate = replace(cov_num_consulation_rate, 
-                                            cov_num_consulation_rate > 365, 365))
-
-# Define COVID-19 severity -----------------------------------------------------
-
-df <- df %>%
-  mutate(sub_cat_covid19_hospital = 
-           ifelse(!is.na(exp_date_covid19_confirmed) &
-                    !is.na(sub_date_covid19_hospital) &
-                    sub_date_covid19_hospital - exp_date_covid19_confirmed >= 0 &
-                    sub_date_covid19_hospital - exp_date_covid19_confirmed < 29, "hospitalised",
-                  ifelse(!is.na(exp_date_covid19_confirmed), "non_hospitalised", 
-                         ifelse(is.na(exp_date_covid19_confirmed), "no_infection", NA)))) %>%
-  mutate(across(sub_cat_covid19_hospital, factor))
+# Remove records with missing patient id ---------------------------------------
 
 df <- df[!is.na(df$patient_id),]
-df[,c("sub_date_covid19_hospital")] <- NULL
 
-message("COVID19 severity determined successfully")
+message("All records with valid patient IDs retained.")
 
 # Restrict columns and save analysis dataset -----------------------------------
 
@@ -122,8 +107,9 @@ df1 <- df %>%
          contains("exp_"), # Exposures
          contains("out_"), # Outcomes
          contains("cov_"), # Covariates
+         contains("inex_"), # Inclusion/exclusion
+         contains("cens_"), # Censor
          contains("qa_"), # Quality assurance
-         contains("step"), # diabetes steps
          contains("vax_date_eligible"), # Vaccination eligibility
          contains("vax_date_"), # Vaccination dates and vax type 
          contains("vax_cat_") # Vaccination products

From 6c1d51322ad0ebea8b67136c47c64c834a70ff0c Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Thu, 12 Dec 2024 14:45:23 +0000
Subject: [PATCH 10/32] Delete specify_paths_example.R

---
 analysis/specify_paths_example.R | 15 ---------------
 1 file changed, 15 deletions(-)
 delete mode 100644 analysis/specify_paths_example.R

diff --git a/analysis/specify_paths_example.R b/analysis/specify_paths_example.R
deleted file mode 100644
index 4c9dea1..0000000
--- a/analysis/specify_paths_example.R
+++ /dev/null
@@ -1,15 +0,0 @@
-# This file is used to specify paths. It is in the .gitignore to keep your information secret.
-# To use, please remove "_example" from the file name and add your specific file paths below.
-
-release <- "" # Specify path to release directory  
-path_aer_input <- paste0(release,"20230807/aer_input-main-rounded.csv")
-path_consort <- paste0(release,"20230807/consort_output_rounded.csv")
-path_median_iqr_age <- paste0(release,"20230807/median_iqr_age.csv")
-path_table1 <- paste0(release,"20230807/table1_output_rounded.csv")
-path_table2 <- paste0(release,"20230927/table2_output_rounded.csv")
-path_venn <- paste0(release,"20230807/venn_output_rounded.csv")
-path_extendedtable1 <- paste0(release,"20230810/extendedtable1_output_rounded.csv")
-path_model_output <- paste0(release,"20231115/model_output_rounded.csv")
-path_stata_model_output <- paste0(release,"20231117/stata_model_output_rounded.csv")
-path_cohortcovid <- paste0(release,"20240503/cohortcovid_midpoint6.csv")
-path_cohortoverlap <- paste0(release,"20240503/cohortoverlap_midpoint6.csv")
\ No newline at end of file

From 5348c8545af18013f48b306f30710d6449f70aa2 Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Thu, 12 Dec 2024 15:32:28 +0000
Subject: [PATCH 11/32] Create post-covid-respiratory.Rproj

---
 post-covid-respiratory/post-covid-respiratory.Rproj | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 post-covid-respiratory/post-covid-respiratory.Rproj

diff --git a/post-covid-respiratory/post-covid-respiratory.Rproj b/post-covid-respiratory/post-covid-respiratory.Rproj
new file mode 100644
index 0000000..8e3c2eb
--- /dev/null
+++ b/post-covid-respiratory/post-covid-respiratory.Rproj
@@ -0,0 +1,13 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX

From 9c134bbe1d27cd214f82cdd260637fdecf8efc52 Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Thu, 12 Dec 2024 15:38:53 +0000
Subject: [PATCH 12/32] Rproject

---
 post-covid-respiratory/post-covid-respiratory.Rproj | 13 -------------
 1 file changed, 13 deletions(-)
 delete mode 100644 post-covid-respiratory/post-covid-respiratory.Rproj

diff --git a/post-covid-respiratory/post-covid-respiratory.Rproj b/post-covid-respiratory/post-covid-respiratory.Rproj
deleted file mode 100644
index 8e3c2eb..0000000
--- a/post-covid-respiratory/post-covid-respiratory.Rproj
+++ /dev/null
@@ -1,13 +0,0 @@
-Version: 1.0
-
-RestoreWorkspace: Default
-SaveWorkspace: Default
-AlwaysSaveHistory: Default
-
-EnableCodeIndexing: Yes
-UseSpacesForTab: Yes
-NumSpacesForTab: 2
-Encoding: UTF-8
-
-RnwWeave: Sweave
-LaTeX: pdfLaTeX

From 2c3d57ae8532eaec1d93de806ed78d73cf912be7 Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Thu, 12 Dec 2024 16:20:59 +0000
Subject: [PATCH 13/32] Update preprocess_data.R

---
 analysis/preprocess/preprocess_data.R | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/analysis/preprocess/preprocess_data.R b/analysis/preprocess/preprocess_data.R
index 2541ed9..2218a20 100644
--- a/analysis/preprocess/preprocess_data.R
+++ b/analysis/preprocess/preprocess_data.R
@@ -68,10 +68,7 @@ df <- df %>%
   mutate(across(all_of(date_cols),
                 ~ floor_date(as.Date(., format="%Y-%m-%d"), unit = "days")),
          across(contains('_birth_year'),
-                ~ format(as.Date(., origin = "1970-01-01"), "%Y")),
-         across(all_of(num_cols), ~ as.numeric(.)),
-         across(all_of(cat_cols), ~ as.factor(.)),
-         across(all_of(bin_cols), ~ as.logical(.)))
+                ~ format(as.Date(., origin = "1970-01-01"), "%Y")))
 
 # Overwrite vaccination information for dummy data and vax cohort only ---------
 

From 24ffc94d1a89d60ffbaee6520ebd64f5db2a0326 Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Thu, 12 Dec 2024 17:51:46 +0000
Subject: [PATCH 14/32] Update create_project_actions.R

---
 analysis/create_project_actions.R | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/analysis/create_project_actions.R b/analysis/create_project_actions.R
index ebbfb87..fef22da 100644
--- a/analysis/create_project_actions.R
+++ b/analysis/create_project_actions.R
@@ -79,6 +79,27 @@ generate_study_population <- function(cohort){
   )
 }
 
+# Create function to preprocess data -------------------------------------------
+
+preprocess_data <- function(cohort){
+  splice(
+    comment(glue("Preprocess data - {cohort}")),
+    action(
+      name = glue("preprocess_data_{cohort}"),
+      run = glue("r:latest analysis/preprocess/preprocess_data.R"),
+      arguments = c(cohort),
+      needs = list("generate_index_dates",glue("generate_study_population_{cohort}")),
+      moderately_sensitive = list(
+        describe = glue("output/not-for-review/describe_input_{cohort}_stage0.txt"),
+        describe_venn = glue("output/not-for-review/describe_venn_{cohort}.txt")
+      ),
+      highly_sensitive = list(
+        cohort = glue("output/input_{cohort}.rds"),
+        venn = glue("output/venn_{cohort}.rds")
+      )
+    )
+  )
+}
 
 # Define and combine all actions into a list of actions ------------------------------0
 
@@ -123,9 +144,19 @@ actions_list <- splice(
                   function(x) generate_study_population(cohort = x)), 
            recursive = FALSE
     )
+  ),
+  
+  ## Preprocess data -----------------------------------------------------------
+  
+  splice(
+    unlist(lapply(cohorts, 
+                  function(x) preprocess_data(cohort = x)), 
+           recursive = FALSE
+    )
   )
 )
 
+
 # Combine actions into project list --------------------------------------------
 
 project_list <- splice(

From 41ec521ff66ebe8f5bb6a9e6df7da13da4913386 Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Thu, 12 Dec 2024 17:51:54 +0000
Subject: [PATCH 15/32] Update dataset_definition_cohorts.py

---
 analysis/dataset_definition_cohorts.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/analysis/dataset_definition_cohorts.py b/analysis/dataset_definition_cohorts.py
index 86189ad..0add1dc 100644
--- a/analysis/dataset_definition_cohorts.py
+++ b/analysis/dataset_definition_cohorts.py
@@ -34,7 +34,7 @@ def generate_dataset(index_date, end_date_exp, end_date_out):
     for var_name, var_value in variables.items():
         setattr(dataset, var_name, var_value)
     
-    # Extract index dates for cohorts from index_dates.csv
+    # Extract index dates for cohorts from index_dates.csv, also extract vax_date_eligible for modifying dummy vax dates
 
     @table_from_file("output/index_dates.csv.gz")
     
@@ -48,6 +48,7 @@ class index_dates(PatientFrame):
         index_unvax = Series(date)
         end_unvax_exposure = Series(date)
         end_unvax_outcome = Series(date)
+        vax_date_eligible = Series(date)
 
     dataset.index_prevax = index_dates.index_prevax
     dataset.end_prevax_exposure = index_dates.end_prevax_exposure
@@ -61,4 +62,7 @@ class index_dates(PatientFrame):
     dataset.end_vax_exposure = index_dates.end_vax_exposure
     dataset.end_vax_outcome = index_dates.end_vax_outcome
 
+    dataset.vax_date_eligible = index_dates.vax_date_eligible
+
+
     return dataset
\ No newline at end of file

From 64ba7df4e8dac96a7b03818d4078fee224c175c9 Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Thu, 12 Dec 2024 17:52:16 +0000
Subject: [PATCH 16/32] Update preprocess_data.R

---
 analysis/preprocess/preprocess_data.R | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/analysis/preprocess/preprocess_data.R b/analysis/preprocess/preprocess_data.R
index 2218a20..a5702ab 100644
--- a/analysis/preprocess/preprocess_data.R
+++ b/analysis/preprocess/preprocess_data.R
@@ -32,8 +32,7 @@ print(all_cols)
 
 cat_cols <- c("patient_id", grep("_cat", all_cols, value = TRUE))
 
-bin_cols <- c(grep("_bin", all_cols, value = TRUE),
-              "has_follow_up_previous_6months", "was_alive", "has_died")
+bin_cols <- c(grep("_bin", all_cols, value = TRUE))
 
 num_cols <- c(grep("_num", all_cols, value = TRUE),
               grep("vax_jcvi_age_", all_cols, value = TRUE))
@@ -66,9 +65,7 @@ message(paste0("Dataset has been read successfully with N = ", nrow(df), " rows"
 
 df <- df %>%
   mutate(across(all_of(date_cols),
-                ~ floor_date(as.Date(., format="%Y-%m-%d"), unit = "days")),
-         across(contains('_birth_year'),
-                ~ format(as.Date(., origin = "1970-01-01"), "%Y")))
+                ~ floor_date(as.Date(., format="%Y-%m-%d"), unit = "days")))
 
 # Overwrite vaccination information for dummy data and vax cohort only ---------
 

From 149424c90f17e096d7894bae7a7ff67d850afe97 Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Thu, 12 Dec 2024 17:52:28 +0000
Subject: [PATCH 17/32] Update variables_cohorts.py

---
 analysis/variables_cohorts.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/analysis/variables_cohorts.py b/analysis/variables_cohorts.py
index 2bfa792..c7be3a1 100644
--- a/analysis/variables_cohorts.py
+++ b/analysis/variables_cohorts.py
@@ -531,8 +531,7 @@ def generate_variables(index_date, end_date_exp, end_date_out):
         ),
 
     ## Covid_19 severity
-
-        sub_date_covid19_hospital = sub_date_covid19_hospital,
+    
         # case(*when_thens, otherwise=None) the conditions are evaluated in order https://docs.opensafely.org/ehrql/reference/language/#case
         sub_cat_covid19_hospital = case(
             when(

From 1e8e6771aa0745d3e2be70a7a4f3dc50a586f3a1 Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Thu, 12 Dec 2024 17:52:47 +0000
Subject: [PATCH 18/32] Update variables_dates.py

---
 analysis/variables_dates.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/analysis/variables_dates.py b/analysis/variables_dates.py
index b62e8b6..c73bff5 100644
--- a/analysis/variables_dates.py
+++ b/analysis/variables_dates.py
@@ -546,7 +546,7 @@
 
 # Define a dictionary of preliminary date variables (Death, Vaccination) created above 
 prelim_date_variables = dict(
-    cens_date_death=death_date,
+    death_date=death_date,
     vax_date_covid_1=vax_date_covid_1,
     vax_date_covid_2=vax_date_covid_2,
     vax_date_covid_3=vax_date_covid_3,

From 74082abc2df2f440e955c72578e911a0044eaa6e Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Fri, 13 Dec 2024 12:06:24 +0000
Subject: [PATCH 19/32] Update project actions

---
 analysis/create_project_actions.R |  2 +-
 project.yaml                      | 45 +++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/analysis/create_project_actions.R b/analysis/create_project_actions.R
index fef22da..8036f41 100644
--- a/analysis/create_project_actions.R
+++ b/analysis/create_project_actions.R
@@ -88,7 +88,7 @@ preprocess_data <- function(cohort){
       name = glue("preprocess_data_{cohort}"),
       run = glue("r:latest analysis/preprocess/preprocess_data.R"),
       arguments = c(cohort),
-      needs = list("generate_index_dates",glue("generate_study_population_{cohort}")),
+      needs = list("generate_dataset_index_dates",glue("generate_study_population_{cohort}")),
       moderately_sensitive = list(
         describe = glue("output/not-for-review/describe_input_{cohort}_stage0.txt"),
         describe_venn = glue("output/not-for-review/describe_venn_{cohort}.txt")
diff --git a/project.yaml b/project.yaml
index aa9f11b..e9ab2b6 100644
--- a/project.yaml
+++ b/project.yaml
@@ -60,3 +60,48 @@ actions:
       highly_sensitive:
         cohort: output/input_unvax.csv.gz
 
+  ## Preprocess data - prevax 
+
+  preprocess_data_prevax:
+    run: r:latest analysis/preprocess/preprocess_data.R prevax
+    needs:
+    - generate_dataset_index_dates
+    - generate_study_population_prevax
+    outputs:
+      moderately_sensitive:
+        describe: output/not-for-review/describe_input_prevax_stage0.txt
+        describe_venn: output/not-for-review/describe_venn_prevax.txt
+      highly_sensitive:
+        cohort: output/input_prevax.rds
+        venn: output/venn_prevax.rds
+
+  ## Preprocess data - vax 
+
+  preprocess_data_vax:
+    run: r:latest analysis/preprocess/preprocess_data.R vax
+    needs:
+    - generate_dataset_index_dates
+    - generate_study_population_vax
+    outputs:
+      moderately_sensitive:
+        describe: output/not-for-review/describe_input_vax_stage0.txt
+        describe_venn: output/not-for-review/describe_venn_vax.txt
+      highly_sensitive:
+        cohort: output/input_vax.rds
+        venn: output/venn_vax.rds
+
+  ## Preprocess data - unvax 
+
+  preprocess_data_unvax:
+    run: r:latest analysis/preprocess/preprocess_data.R unvax
+    needs:
+    - generate_dataset_index_dates
+    - generate_study_population_unvax
+    outputs:
+      moderately_sensitive:
+        describe: output/not-for-review/describe_input_unvax_stage0.txt
+        describe_venn: output/not-for-review/describe_venn_unvax.txt
+      highly_sensitive:
+        cohort: output/input_unvax.rds
+        venn: output/venn_unvax.rds
+

From 3f57e883c46f75ca9d4d5ae24ea67f5ac9f5c0d6 Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Fri, 13 Dec 2024 12:18:18 +0000
Subject: [PATCH 20/32] Extract vax dates/type variables to cohort datasets for
 later pipelines

The vax_date_eligible variable will be used to modify the vaccination dummy data. Other useful variables (JCVI group, vaccination dates and the death censoring date) are also extracted into the cohort dataset.

Index dates for different cohorts will be extracted in their respective scripts, rather than being defined here. While the values for these index dates will differ across cohorts, their variable names should remain the same in each cohort:
index_date
end_date_exposure
end_date_outcome
---
 analysis/dataset_definition_cohorts.py | 60 +++++++++++++++++---------
 1 file changed, 39 insertions(+), 21 deletions(-)

diff --git a/analysis/dataset_definition_cohorts.py b/analysis/dataset_definition_cohorts.py
index 0add1dc..2db14d7 100644
--- a/analysis/dataset_definition_cohorts.py
+++ b/analysis/dataset_definition_cohorts.py
@@ -33,36 +33,54 @@ def generate_dataset(index_date, end_date_exp, end_date_out):
 
     for var_name, var_value in variables.items():
         setattr(dataset, var_name, var_value)
-    
-    # Extract index dates for cohorts from index_dates.csv, also extract vax_date_eligible for modifying dummy vax dates
+
+# Extract date variables for later pipelines
 
     @table_from_file("output/index_dates.csv.gz")
     
     class index_dates(PatientFrame):
-        index_prevax = Series(date)
-        end_prevax_exposure = Series(date)
-        end_prevax_outcome = Series(date)
-        index_vax = Series(date)
-        end_vax_exposure = Series(date)
-        end_vax_outcome = Series(date)
-        index_unvax = Series(date)
-        end_unvax_exposure = Series(date)
-        end_unvax_outcome = Series(date)
+    # Vaccine category and eligibility variables
+        vax_cat_jcvi_group = Series(str)
         vax_date_eligible = Series(date)
 
-    dataset.index_prevax = index_dates.index_prevax
-    dataset.end_prevax_exposure = index_dates.end_prevax_exposure
-    dataset.end_prevax_outcome = index_dates.end_prevax_outcome
+    # General COVID vaccination dates
+        vax_date_covid_1 = Series(date)
+        vax_date_covid_2 = Series(date)
+        vax_date_covid_3 = Series(date)
 
-    dataset.index_unvax = index_dates.index_unvax
-    dataset.end_unvax_exposure = index_dates.end_unvax_exposure
-    dataset.end_unvax_outcome = index_dates.end_unvax_outcome
+    # Pfizer vaccine-specific dates
+        vax_date_Pfizer_1 = Series(date)
+        vax_date_Pfizer_2 = Series(date)
+        vax_date_Pfizer_3 = Series(date)
 
-    dataset.index_vax = index_dates.index_vax
-    dataset.end_vax_exposure = index_dates.end_vax_exposure
-    dataset.end_vax_outcome = index_dates.end_vax_outcome
+    # AstraZeneca vaccine-specific dates
+        vax_date_AstraZeneca_1 = Series(date)
+        vax_date_AstraZeneca_2 = Series(date)
+        vax_date_AstraZeneca_3 = Series(date)
 
-    dataset.vax_date_eligible = index_dates.vax_date_eligible
+    # Moderna vaccine-specific dates
+        vax_date_Moderna_1 = Series(date)
+        vax_date_Moderna_2 = Series(date)
+        vax_date_Moderna_3 = Series(date)
 
+    # Censoring date due to death
+        cens_date_death = Series(date)
+
+    # Mapping all variables from index_dates to the dataset
+    dataset.vax_cat_jcvi_group = index_dates.vax_cat_jcvi_group
+    dataset.vax_date_eligible = index_dates.vax_date_eligible
+    dataset.vax_date_covid_1 = index_dates.vax_date_covid_1
+    dataset.vax_date_covid_2 = index_dates.vax_date_covid_2
+    dataset.vax_date_covid_3 = index_dates.vax_date_covid_3
+    dataset.vax_date_Pfizer_1 = index_dates.vax_date_Pfizer_1
+    dataset.vax_date_Pfizer_2 = index_dates.vax_date_Pfizer_2
+    dataset.vax_date_Pfizer_3 = index_dates.vax_date_Pfizer_3
+    dataset.vax_date_AstraZeneca_1 = index_dates.vax_date_AstraZeneca_1
+    dataset.vax_date_AstraZeneca_2 = index_dates.vax_date_AstraZeneca_2
+    dataset.vax_date_AstraZeneca_3 = index_dates.vax_date_AstraZeneca_3
+    dataset.vax_date_Moderna_1 = index_dates.vax_date_Moderna_1
+    dataset.vax_date_Moderna_2 = index_dates.vax_date_Moderna_2
+    dataset.vax_date_Moderna_3 = index_dates.vax_date_Moderna_3
+    dataset.cens_date_death = index_dates.cens_date_death
 
     return dataset
\ No newline at end of file

From 18b85e283a10cc8e156248dcedd843b28d14181f Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Fri, 13 Dec 2024 12:18:42 +0000
Subject: [PATCH 21/32] Rename death_date

---
 analysis/dataset_definition_dates.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/analysis/dataset_definition_dates.py b/analysis/dataset_definition_dates.py
index 9017dd6..59ede12 100644
--- a/analysis/dataset_definition_dates.py
+++ b/analysis/dataset_definition_dates.py
@@ -64,11 +64,11 @@
 dataset.index_prevax = minimum_of(pandemic_start, pandemic_start)
 
 dataset.end_prevax_exposure = minimum_of(
-    dataset.death_date, dataset.vax_date_covid_1, dataset.vax_date_eligible, all_eligible
+    dataset.cens_date_death, dataset.vax_date_covid_1, dataset.vax_date_eligible, all_eligible
 )
 
 dataset.end_prevax_outcome = minimum_of(
-    dataset.death_date, omicron_date
+    dataset.cens_date_death, omicron_date
 )
 
 dataset.index_vax = maximum_of(
@@ -76,7 +76,7 @@
     delta_date
 )
 dataset.end_vax_exposure = minimum_of(
-    dataset.death_date, omicron_date
+    dataset.cens_date_death, omicron_date
 )
 
 dataset.end_vax_outcome = dataset.end_vax_exposure
@@ -86,8 +86,8 @@
     delta_date
 )
 dataset.end_unvax_exposure = minimum_of(
-    dataset.death_date, omicron_date, dataset.vax_date_covid_1
+    dataset.cens_date_death, omicron_date, dataset.vax_date_covid_1
 )
 dataset.end_unvax_outcome = minimum_of(
-    dataset.death_date, omicron_date
+    dataset.cens_date_death, omicron_date
 )
\ No newline at end of file

From 32ae8be7b1026bc1f82a3817cc5eb7c799d27341 Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Fri, 13 Dec 2024 12:19:54 +0000
Subject: [PATCH 22/32] Extract index dates to each cohort

Extract index dates to each cohort:
index_date
end_date_exposure
end_date_outcome
---
 analysis/dataset_definition_prevax.py | 6 +++++-
 analysis/dataset_definition_unvax.py  | 6 +++++-
 analysis/dataset_definition_vax.py    | 6 +++++-
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/analysis/dataset_definition_prevax.py b/analysis/dataset_definition_prevax.py
index 2ba4930..6a8b404 100644
--- a/analysis/dataset_definition_prevax.py
+++ b/analysis/dataset_definition_prevax.py
@@ -19,4 +19,8 @@ class index_dates(PatientFrame):
 
 # Create dataset
 
-dataset = generate_dataset(index_date, end_date_exposure, end_date_outcome)
\ No newline at end of file
+dataset = generate_dataset(index_date, end_date_exposure, end_date_outcome)
+
+dataset.index_date = index_date
+dataset.end_date_exposure = end_date_exposure
+dataset.end_date_outcome = end_date_outcome
\ No newline at end of file
diff --git a/analysis/dataset_definition_unvax.py b/analysis/dataset_definition_unvax.py
index 8b4c849..e6035cf 100644
--- a/analysis/dataset_definition_unvax.py
+++ b/analysis/dataset_definition_unvax.py
@@ -19,4 +19,8 @@ class index_dates(PatientFrame):
 
 # Create dataset
 
-dataset = generate_dataset(index_date, end_date_exposure, end_date_outcome)
\ No newline at end of file
+dataset = generate_dataset(index_date, end_date_exposure, end_date_outcome)
+
+dataset.index_date = index_date
+dataset.end_date_exposure = end_date_exposure
+dataset.end_date_outcome = end_date_outcome
\ No newline at end of file
diff --git a/analysis/dataset_definition_vax.py b/analysis/dataset_definition_vax.py
index 600dace..72cf8be 100644
--- a/analysis/dataset_definition_vax.py
+++ b/analysis/dataset_definition_vax.py
@@ -19,4 +19,8 @@ class index_dates(PatientFrame):
 
 # Create dataset
 
-dataset = generate_dataset(index_date, end_date_exposure, end_date_outcome)
\ No newline at end of file
+dataset = generate_dataset(index_date, end_date_exposure, end_date_outcome)
+
+dataset.index_date = index_date
+dataset.end_date_exposure = end_date_exposure
+dataset.end_date_outcome = end_date_outcome
\ No newline at end of file

From f9e6c899969796ad8d53b7302eccc80ed72b5b5f Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Fri, 13 Dec 2024 12:20:32 +0000
Subject: [PATCH 23/32] Update preprocess_data.R

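Remove death_date, has_follow_up_previous_6months and deregistration_date from the selected columns: these variables have been renamed and classed under either the inex_ or cens_ prefix.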
---
 analysis/preprocess/preprocess_data.R | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/analysis/preprocess/preprocess_data.R b/analysis/preprocess/preprocess_data.R
index a5702ab..c66621a 100644
--- a/analysis/preprocess/preprocess_data.R
+++ b/analysis/preprocess/preprocess_data.R
@@ -92,10 +92,7 @@ message("All records with valid patient IDs retained.")
 
 df1 <- df %>% 
   select(patient_id,
-         death_date,
          starts_with("index_date_"),
-         has_follow_up_previous_6months,
-         deregistration_date,
          starts_with("end_date_"),
          contains("sub_"), # Subgroups
          contains("exp_"), # Exposures

From 1daf7f7e36e0a933519b5380221721fe56e647a4 Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Fri, 13 Dec 2024 12:20:41 +0000
Subject: [PATCH 24/32] Update variables_dates.py

---
 analysis/variables_dates.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/analysis/variables_dates.py b/analysis/variables_dates.py
index c73bff5..b62e8b6 100644
--- a/analysis/variables_dates.py
+++ b/analysis/variables_dates.py
@@ -546,7 +546,7 @@
 
 # Define a dictionary of preliminary date variables (Death, Vaccination) created above 
 prelim_date_variables = dict(
-    death_date=death_date,
+    cens_date_death=death_date,
     vax_date_covid_1=vax_date_covid_1,
     vax_date_covid_2=vax_date_covid_2,
     vax_date_covid_3=vax_date_covid_3,

From 632fda9d3faa0931408cdc86f26849bd4bdec157 Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Fri, 13 Dec 2024 12:38:32 +0000
Subject: [PATCH 25/32] Update preprocess_data.R

---
 analysis/preprocess/preprocess_data.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/analysis/preprocess/preprocess_data.R b/analysis/preprocess/preprocess_data.R
index c66621a..ba34e82 100644
--- a/analysis/preprocess/preprocess_data.R
+++ b/analysis/preprocess/preprocess_data.R
@@ -92,7 +92,7 @@ message("All records with valid patient IDs retained.")
 
 df1 <- df %>% 
   select(patient_id,
-         starts_with("index_date_"),
+         starts_with("index_date"),
          starts_with("end_date_"),
          contains("sub_"), # Subgroups
          contains("exp_"), # Exposures

From 468cd66e559b9808a63fa7cd65b1c7da8df6fffe Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Fri, 13 Dec 2024 13:49:48 +0000
Subject: [PATCH 26/32] Update YAML

---
 analysis/create_project_actions.R |  4 ++--
 project.yaml                      | 12 ++++++------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/analysis/create_project_actions.R b/analysis/create_project_actions.R
index 8036f41..314343b 100644
--- a/analysis/create_project_actions.R
+++ b/analysis/create_project_actions.R
@@ -90,8 +90,8 @@ preprocess_data <- function(cohort){
       arguments = c(cohort),
       needs = list("generate_dataset_index_dates",glue("generate_study_population_{cohort}")),
       moderately_sensitive = list(
-        describe = glue("output/not-for-review/describe_input_{cohort}_stage0.txt"),
-        describe_venn = glue("output/not-for-review/describe_venn_{cohort}.txt")
+        describe = glue("output/describe_input_{cohort}_stage0.txt"),
+        describe_venn = glue("output/describe_venn_{cohort}.txt")
       ),
       highly_sensitive = list(
         cohort = glue("output/input_{cohort}.rds"),
diff --git a/project.yaml b/project.yaml
index e9ab2b6..8ececf4 100644
--- a/project.yaml
+++ b/project.yaml
@@ -69,8 +69,8 @@ actions:
     - generate_study_population_prevax
     outputs:
       moderately_sensitive:
-        describe: output/not-for-review/describe_input_prevax_stage0.txt
-        describe_venn: output/not-for-review/describe_venn_prevax.txt
+        describe: output/describe_input_prevax_stage0.txt
+        describe_venn: output/describe_venn_prevax.txt
       highly_sensitive:
         cohort: output/input_prevax.rds
         venn: output/venn_prevax.rds
@@ -84,8 +84,8 @@ actions:
     - generate_study_population_vax
     outputs:
       moderately_sensitive:
-        describe: output/not-for-review/describe_input_vax_stage0.txt
-        describe_venn: output/not-for-review/describe_venn_vax.txt
+        describe: output/describe_input_vax_stage0.txt
+        describe_venn: output/describe_venn_vax.txt
       highly_sensitive:
         cohort: output/input_vax.rds
         venn: output/venn_vax.rds
@@ -99,8 +99,8 @@ actions:
     - generate_study_population_unvax
     outputs:
       moderately_sensitive:
-        describe: output/not-for-review/describe_input_unvax_stage0.txt
-        describe_venn: output/not-for-review/describe_venn_unvax.txt
+        describe: output/describe_input_unvax_stage0.txt
+        describe_venn: output/describe_venn_unvax.txt
       highly_sensitive:
         cohort: output/input_unvax.rds
         venn: output/venn_unvax.rds

From ea769776850e8799057fa5557fa6b1c636e8b949 Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Fri, 13 Dec 2024 13:49:52 +0000
Subject: [PATCH 27/32] Update preprocess_data.R

---
 analysis/preprocess/preprocess_data.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/analysis/preprocess/preprocess_data.R b/analysis/preprocess/preprocess_data.R
index ba34e82..83225e5 100644
--- a/analysis/preprocess/preprocess_data.R
+++ b/analysis/preprocess/preprocess_data.R
@@ -12,7 +12,7 @@ library(readr)
 args <- commandArgs(trailingOnly=TRUE)
 print(length(args))
 if(length(args)==0){
-  cohort_name <- "vax"
+  cohort_name <- "prevax"
 } else {
   cohort_name <- args[[1]]
 }

From 9bff56f3d0d399fd20e12c32982d4b06d49f4ae2 Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Fri, 13 Dec 2024 14:52:38 +0000
Subject: [PATCH 28/32] Update README.md

---
 README.md | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index b27bffd..2d1a433 100644
--- a/README.md
+++ b/README.md
@@ -9,17 +9,6 @@ This repository may reflect an incomplete or incorrect analysis with no further
 The content has ONLY been made public to support the OpenSAFELY [open science and transparency principles](https://www.opensafely.org/about/#contributing-to-best-practice-around-open-science) and to support the sharing of re-usable code for other subsequent users.
 No clinical, policy or safety conclusions must be drawn from the contents of this repository.
 
-# About the OpenSAFELY framework
-
-The OpenSAFELY framework is a Trusted Research Environment (TRE) for electronic
-health records research in the NHS, with a focus on public accountability and
-research quality.
-
-Read more at [OpenSAFELY.org](https://opensafely.org).
-
-# Licences
-As standard, research projects have a MIT license. 
-
 ## Repository navigation
 
 -   If you are interested in how we defined our code lists, look in the [`codelists`](./codelists) folder.
@@ -36,3 +25,14 @@ As standard, research projects have a MIT license.
 -   The [`project.yaml`](./project.yaml) defines run-order and dependencies for all the analysis scripts. This file should not be edited directly. To make changes to the yaml, edit and run the [`create_project.R`](analysis/create_project.R) script which generates all the actions.
 
 -   Descriptive and Model outputs, including figures and tables are in the [`released_outputs`](./release_outputs) directory.
+
+# About the OpenSAFELY framework
+
+The OpenSAFELY framework is a Trusted Research Environment (TRE) for electronic
+health records research in the NHS, with a focus on public accountability and
+research quality.
+
+Read more at [OpenSAFELY.org](https://opensafely.org).
+
+# Licences
+As standard, research projects have a MIT license. 
\ No newline at end of file

From 0fe6452f27e627503b12a1688883eab035d901b1 Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Mon, 16 Dec 2024 16:32:48 +0000
Subject: [PATCH 29/32] Update preprocess_data.R

---
 analysis/preprocess/preprocess_data.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/analysis/preprocess/preprocess_data.R b/analysis/preprocess/preprocess_data.R
index 83225e5..4a76ac1 100644
--- a/analysis/preprocess/preprocess_data.R
+++ b/analysis/preprocess/preprocess_data.R
@@ -65,7 +65,9 @@ message(paste0("Dataset has been read successfully with N = ", nrow(df), " rows"
 
 df <- df %>%
   mutate(across(all_of(date_cols),
-                ~ floor_date(as.Date(., format="%Y-%m-%d"), unit = "days")))
+                ~ floor_date(as.Date(., format="%Y-%m-%d"), unit = "days")),
+         across(contains('_birth_year'), 
+                 ~ format(as.Date(., origin = "1970-01-01"), "%Y")))
 
 # Overwrite vaccination information for dummy data and vax cohort only ---------
 

From aca510e883769c341c549d86a033a7497b9c2eb9 Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Mon, 16 Dec 2024 16:35:31 +0000
Subject: [PATCH 30/32] Update preprocess_data.R

---
 analysis/preprocess/preprocess_data.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/analysis/preprocess/preprocess_data.R b/analysis/preprocess/preprocess_data.R
index 4a76ac1..61708c6 100644
--- a/analysis/preprocess/preprocess_data.R
+++ b/analysis/preprocess/preprocess_data.R
@@ -67,7 +67,7 @@ df <- df %>%
   mutate(across(all_of(date_cols),
                 ~ floor_date(as.Date(., format="%Y-%m-%d"), unit = "days")),
          across(contains('_birth_year'), 
-                 ~ format(as.Date(., origin = "1970-01-01"), "%Y")))
+                ~ format(as.Date(., origin = "1970-01-01"), "%Y")))
 
 # Overwrite vaccination information for dummy data and vax cohort only ---------
 

From 0cf4af3d5c5ab5fc9fcc46f33a224f904ed3fe91 Mon Sep 17 00:00:00 2001
From: ZoeMZou <mz16609@bristol.ac.uk>
Date: Mon, 16 Dec 2024 17:11:52 +0000
Subject: [PATCH 31/32] Move scripts to directory for dataset_definition

---
 README.md                                         |  8 ++++----
 analysis/create_project_actions.R                 |  6 +++---
 .../{ => dataset_definition}/active_analyses.R    |  0
 analysis/{ => dataset_definition}/codelists.py    |  0
 .../dataset_definition_cohorts.py                 |  0
 .../dataset_definition_dates.py                   |  0
 .../dataset_definition_prevax.py                  |  0
 .../dataset_definition_unvax.py                   |  0
 .../dataset_definition_vax.py                     |  0
 analysis/{ => dataset_definition}/metadates.R     |  0
 analysis/{ => dataset_definition}/utility.R       |  0
 .../variable_helper_functions.py                  |  0
 .../{ => dataset_definition}/variables_cohorts.py |  0
 .../{ => dataset_definition}/variables_dates.py   |  0
 project.yaml                                      | 15 +++++++++------
 15 files changed, 16 insertions(+), 13 deletions(-)
 rename analysis/{ => dataset_definition}/active_analyses.R (100%)
 rename analysis/{ => dataset_definition}/codelists.py (100%)
 rename analysis/{ => dataset_definition}/dataset_definition_cohorts.py (100%)
 rename analysis/{ => dataset_definition}/dataset_definition_dates.py (100%)
 rename analysis/{ => dataset_definition}/dataset_definition_prevax.py (100%)
 rename analysis/{ => dataset_definition}/dataset_definition_unvax.py (100%)
 rename analysis/{ => dataset_definition}/dataset_definition_vax.py (100%)
 rename analysis/{ => dataset_definition}/metadates.R (100%)
 rename analysis/{ => dataset_definition}/utility.R (100%)
 rename analysis/{ => dataset_definition}/variable_helper_functions.py (100%)
 rename analysis/{ => dataset_definition}/variables_cohorts.py (100%)
 rename analysis/{ => dataset_definition}/variables_dates.py (100%)

diff --git a/README.md b/README.md
index 2d1a433..d74e0d6 100644
--- a/README.md
+++ b/README.md
@@ -15,14 +15,14 @@ No clinical, policy or safety conclusions must be drawn from the contents of thi
 
 -   Analyses scripts are in the [`analysis`](./analysis) directory:
 
-    -   If you are interested in how we defined our variables, we use the variable script [variable_helper_fuctions](analysis/variable_helper_functions.py) to define functions that generate variables. We then apply these functions in [variables_cohorts](analysis/variables_cohorts.py) to create a dictionary of variables for cohort definitions, and in [variables_dates](analysis/variables_dates.py) to create a dictionary of variables for calculating study start dates and end dates.
-    -   If you are interested in how we defined study dates (e.g., index and end dates), these vary by cohort and are described in the protocol. We use the script [dataset_definition_dates](analysis/dataset_definition_dates.py) to generate a dataset with all required dates for each cohort. This script imported all variables generated from [variables_dates](analysis/variables_dates.py).
-    -   If you are interested in how we defined our cohorts, we use the dataset definition script [dataset_definition_cohorts](analysis/dataset_definition_cohorts.py) to define a function that generates cohorts. This script imports all variables generated from [variables_cohorts](analysis/variables_cohorts.py) using the patient's index date, the cohort start date and the cohort end date. This approach is used to generate three cohorts: pre-vaccination, vaccinated, and unvaccinated—found in [dataset_definition_prevax](analysis/dataset_definition_prevax.py), [dataset_definition_vax](analysis/dataset_definition_vax.py), and [dataset_definition_unvax](analysis/dataset_definition_unvax.py), respectively. For each cohort, the extracted data is initially processed in the preprocess data script [preprocess data script](analysis/preprocess_data.R), which generates a flag variable for pre-existing respiratory conditions and restricts the data to relevant variables.
+    -   If you are interested in how we defined our variables, we use the variable script [variable_helper_functions](analysis/dataset_definition/variable_helper_functions.py) to define functions that generate variables. We then apply these functions in [variables_cohorts](analysis/dataset_definition/variables_cohorts.py) to create a dictionary of variables for cohort definitions, and in [variables_dates](analysis/dataset_definition/variables_dates.py) to create a dictionary of variables for calculating study start dates and end dates.
+    -   If you are interested in how we defined study dates (e.g., index and end dates), these vary by cohort and are described in the protocol. We use the script [dataset_definition_dates](analysis/dataset_definition/dataset_definition_dates.py) to generate a dataset with all required dates for each cohort. This script imports all variables generated from [variables_dates](analysis/dataset_definition/variables_dates.py).
+    -   If you are interested in how we defined our cohorts, we use the dataset definition script [dataset_definition_cohorts](analysis/dataset_definition/dataset_definition_cohorts.py) to define a function that generates cohorts. This script imports all variables generated from [variables_cohorts](analysis/dataset_definition/variables_cohorts.py) using the patient's index date, the cohort start date and the cohort end date. This approach is used to generate three cohorts: pre-vaccination, vaccinated, and unvaccinated—found in [dataset_definition_prevax](analysis/dataset_definition/dataset_definition_prevax.py), [dataset_definition_vax](analysis/dataset_definition/dataset_definition_vax.py), and [dataset_definition_unvax](analysis/dataset_definition/dataset_definition_unvax.py), respectively. For each cohort, the extracted data is initially processed by the [preprocess data script](analysis/preprocess/preprocess_data.R), which generates a flag variable for pre-existing respiratory conditions and restricts the data to relevant variables.
     -   This directory also contains all the R scripts that process, describe, and analyse the extracted data.
 
 -   The [active_analyses](lib/active_analyses.rds) contains a list of active analyses.
 
--   The [`project.yaml`](./project.yaml) defines run-order and dependencies for all the analysis scripts. This file should not be edited directly. To make changes to the yaml, edit and run the [`create_project.R`](analysis/create_project.R) script which generates all the actions.
+-   The [`project.yaml`](./project.yaml) defines run-order and dependencies for all the analysis scripts. This file should not be edited directly. To make changes to the yaml, edit and run the [`create_project_actions.R`](analysis/create_project_actions.R) script which generates all the actions.
 
 -   Descriptive and Model outputs, including figures and tables are in the [`released_outputs`](./release_outputs) directory.
 
diff --git a/analysis/create_project_actions.R b/analysis/create_project_actions.R
index 314343b..e6c8c28 100644
--- a/analysis/create_project_actions.R
+++ b/analysis/create_project_actions.R
@@ -70,7 +70,7 @@ generate_study_population <- function(cohort){
     comment(glue("Generate study population - {cohort}")),
     action(
       name = glue("generate_study_population_{cohort}"),
-      run = glue("ehrql:v1 generate-dataset analysis/dataset_definition_{cohort}.py --output output/input_{cohort}.csv.gz"),
+      run = glue("ehrql:v1 generate-dataset analysis/dataset_definition/dataset_definition_{cohort}.py --output output/input_{cohort}.csv.gz"),
       needs = list("generate_dataset_index_dates"),
       highly_sensitive = list(
         cohort = glue("output/input_{cohort}.csv.gz")
@@ -119,7 +119,7 @@ actions_list <- splice(
   
   action(
     name = glue("vax_eligibility_inputs"),
-    run = "r:latest analysis/metadates.R",
+    run = "r:latest analysis/dataset_definition/metadates.R",
     highly_sensitive = list(
       study_dates_json = glue("output/study_dates.json")
     )
@@ -130,7 +130,7 @@ actions_list <- splice(
   
   action(
     name = "generate_dataset_index_dates",
-    run = "ehrql:v1 generate-dataset analysis/dataset_definition_dates.py --output output/index_dates.csv.gz",
+    run = "ehrql:v1 generate-dataset analysis/dataset_definition/dataset_definition_dates.py --output output/index_dates.csv.gz",
     needs = list("vax_eligibility_inputs"),
     highly_sensitive = list(
       dataset = glue("output/index_dates.csv.gz")
diff --git a/analysis/active_analyses.R b/analysis/dataset_definition/active_analyses.R
similarity index 100%
rename from analysis/active_analyses.R
rename to analysis/dataset_definition/active_analyses.R
diff --git a/analysis/codelists.py b/analysis/dataset_definition/codelists.py
similarity index 100%
rename from analysis/codelists.py
rename to analysis/dataset_definition/codelists.py
diff --git a/analysis/dataset_definition_cohorts.py b/analysis/dataset_definition/dataset_definition_cohorts.py
similarity index 100%
rename from analysis/dataset_definition_cohorts.py
rename to analysis/dataset_definition/dataset_definition_cohorts.py
diff --git a/analysis/dataset_definition_dates.py b/analysis/dataset_definition/dataset_definition_dates.py
similarity index 100%
rename from analysis/dataset_definition_dates.py
rename to analysis/dataset_definition/dataset_definition_dates.py
diff --git a/analysis/dataset_definition_prevax.py b/analysis/dataset_definition/dataset_definition_prevax.py
similarity index 100%
rename from analysis/dataset_definition_prevax.py
rename to analysis/dataset_definition/dataset_definition_prevax.py
diff --git a/analysis/dataset_definition_unvax.py b/analysis/dataset_definition/dataset_definition_unvax.py
similarity index 100%
rename from analysis/dataset_definition_unvax.py
rename to analysis/dataset_definition/dataset_definition_unvax.py
diff --git a/analysis/dataset_definition_vax.py b/analysis/dataset_definition/dataset_definition_vax.py
similarity index 100%
rename from analysis/dataset_definition_vax.py
rename to analysis/dataset_definition/dataset_definition_vax.py
diff --git a/analysis/metadates.R b/analysis/dataset_definition/metadates.R
similarity index 100%
rename from analysis/metadates.R
rename to analysis/dataset_definition/metadates.R
diff --git a/analysis/utility.R b/analysis/dataset_definition/utility.R
similarity index 100%
rename from analysis/utility.R
rename to analysis/dataset_definition/utility.R
diff --git a/analysis/variable_helper_functions.py b/analysis/dataset_definition/variable_helper_functions.py
similarity index 100%
rename from analysis/variable_helper_functions.py
rename to analysis/dataset_definition/variable_helper_functions.py
diff --git a/analysis/variables_cohorts.py b/analysis/dataset_definition/variables_cohorts.py
similarity index 100%
rename from analysis/variables_cohorts.py
rename to analysis/dataset_definition/variables_cohorts.py
diff --git a/analysis/variables_dates.py b/analysis/dataset_definition/variables_dates.py
similarity index 100%
rename from analysis/variables_dates.py
rename to analysis/dataset_definition/variables_dates.py
diff --git a/project.yaml b/project.yaml
index 8ececf4..dcf0c44 100644
--- a/project.yaml
+++ b/project.yaml
@@ -14,7 +14,7 @@ actions:
   ## Generate vaccination eligibility information 
 
   vax_eligibility_inputs:
-    run: r:latest analysis/metadates.R
+    run: r:latest analysis/dataset_definition/metadates.R
     outputs:
       highly_sensitive:
         study_dates_json: output/study_dates.json
@@ -22,7 +22,8 @@ actions:
   ## Generate dates for all cohorts 
 
   generate_dataset_index_dates:
-    run: ehrql:v1 generate-dataset analysis/dataset_definition_dates.py --output output/index_dates.csv.gz
+    run: ehrql:v1 generate-dataset analysis/dataset_definition/dataset_definition_dates.py
+      --output output/index_dates.csv.gz
     needs:
     - vax_eligibility_inputs
     outputs:
@@ -32,8 +33,8 @@ actions:
   ## Generate study population - prevax 
 
   generate_study_population_prevax:
-    run: ehrql:v1 generate-dataset analysis/dataset_definition_prevax.py --output
-      output/input_prevax.csv.gz
+    run: ehrql:v1 generate-dataset analysis/dataset_definition/dataset_definition_prevax.py
+      --output output/input_prevax.csv.gz
     needs:
     - generate_dataset_index_dates
     outputs:
@@ -43,7 +44,8 @@ actions:
   ## Generate study population - vax 
 
   generate_study_population_vax:
-    run: ehrql:v1 generate-dataset analysis/dataset_definition_vax.py --output output/input_vax.csv.gz
+    run: ehrql:v1 generate-dataset analysis/dataset_definition/dataset_definition_vax.py
+      --output output/input_vax.csv.gz
     needs:
     - generate_dataset_index_dates
     outputs:
@@ -53,7 +55,8 @@ actions:
   ## Generate study population - unvax 
 
   generate_study_population_unvax:
-    run: ehrql:v1 generate-dataset analysis/dataset_definition_unvax.py --output output/input_unvax.csv.gz
+    run: ehrql:v1 generate-dataset analysis/dataset_definition/dataset_definition_unvax.py
+      --output output/input_unvax.csv.gz
     needs:
     - generate_dataset_index_dates
     outputs:

From ff24a27a237ea2a80b72c38b34e6908c543f47ff Mon Sep 17 00:00:00 2001
From: Venexia Walker <venexia.walker@bristol.ac.uk>
Date: Tue, 17 Dec 2024 09:23:29 +0000
Subject: [PATCH 32/32] Move utility and active_analyses up a directory

---
 analysis/{dataset_definition => }/active_analyses.R | 0
 analysis/{dataset_definition => }/utility.R         | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename analysis/{dataset_definition => }/active_analyses.R (100%)
 rename analysis/{dataset_definition => }/utility.R (100%)

diff --git a/analysis/dataset_definition/active_analyses.R b/analysis/active_analyses.R
similarity index 100%
rename from analysis/dataset_definition/active_analyses.R
rename to analysis/active_analyses.R
diff --git a/analysis/dataset_definition/utility.R b/analysis/utility.R
similarity index 100%
rename from analysis/dataset_definition/utility.R
rename to analysis/utility.R