From 0cf4af3d5c5ab5fc9fcc46f33a224f904ed3fe91 Mon Sep 17 00:00:00 2001 From: ZoeMZou Date: Mon, 16 Dec 2024 17:11:52 +0000 Subject: [PATCH] Move scripts to directory for dataset_definition --- README.md | 8 ++++---- analysis/create_project_actions.R | 6 +++--- .../{ => dataset_definition}/active_analyses.R | 0 analysis/{ => dataset_definition}/codelists.py | 0 .../dataset_definition_cohorts.py | 0 .../dataset_definition_dates.py | 0 .../dataset_definition_prevax.py | 0 .../dataset_definition_unvax.py | 0 .../dataset_definition_vax.py | 0 analysis/{ => dataset_definition}/metadates.R | 0 analysis/{ => dataset_definition}/utility.R | 0 .../variable_helper_functions.py | 0 .../{ => dataset_definition}/variables_cohorts.py | 0 .../{ => dataset_definition}/variables_dates.py | 0 project.yaml | 15 +++++++++------ 15 files changed, 16 insertions(+), 13 deletions(-) rename analysis/{ => dataset_definition}/active_analyses.R (100%) rename analysis/{ => dataset_definition}/codelists.py (100%) rename analysis/{ => dataset_definition}/dataset_definition_cohorts.py (100%) rename analysis/{ => dataset_definition}/dataset_definition_dates.py (100%) rename analysis/{ => dataset_definition}/dataset_definition_prevax.py (100%) rename analysis/{ => dataset_definition}/dataset_definition_unvax.py (100%) rename analysis/{ => dataset_definition}/dataset_definition_vax.py (100%) rename analysis/{ => dataset_definition}/metadates.R (100%) rename analysis/{ => dataset_definition}/utility.R (100%) rename analysis/{ => dataset_definition}/variable_helper_functions.py (100%) rename analysis/{ => dataset_definition}/variables_cohorts.py (100%) rename analysis/{ => dataset_definition}/variables_dates.py (100%) diff --git a/README.md b/README.md index 2d1a433..d74e0d6 100644 --- a/README.md +++ b/README.md @@ -15,14 +15,14 @@ No clinical, policy or safety conclusions must be drawn from the contents of thi - Analyses scripts are in the [`analysis`](./analysis) directory: - - If you are interested 
in how we defined our variables, we use the variable script [variable_helper_fuctions](analysis/variable_helper_functions.py) to define functions that generate variables. We then apply these functions in [variables_cohorts](analysis/variables_cohorts.py) to create a dictionary of variables for cohort definitions, and in [variables_dates](analysis/variables_dates.py) to create a dictionary of variables for calculating study start dates and end dates. - - If you are interested in how we defined study dates (e.g., index and end dates), these vary by cohort and are described in the protocol. We use the script [dataset_definition_dates](analysis/dataset_definition_dates.py) to generate a dataset with all required dates for each cohort. This script imported all variables generated from [variables_dates](analysis/variables_dates.py). - - If you are interested in how we defined our cohorts, we use the dataset definition script [dataset_definition_cohorts](analysis/dataset_definition_cohorts.py) to define a function that generates cohorts. This script imports all variables generated from [variables_cohorts](analysis/variables_cohorts.py) using the patient's index date, the cohort start date and the cohort end date. This approach is used to generate three cohorts: pre-vaccination, vaccinated, and unvaccinated—found in [dataset_definition_prevax](analysis/dataset_definition_prevax.py), [dataset_definition_vax](analysis/dataset_definition_vax.py), and [dataset_definition_unvax](analysis/dataset_definition_unvax.py), respectively. For each cohort, the extracted data is initially processed in the preprocess data script [preprocess data script](analysis/preprocess_data.R), which generates a flag variable for pre-existing respiratory conditions and restricts the data to relevant variables. 
+ - If you are interested in how we defined our variables, we use the variable script [variable_helper_functions](analysis/dataset_definition/variable_helper_functions.py) to define functions that generate variables. We then apply these functions in [variables_cohorts](analysis/dataset_definition/variables_cohorts.py) to create a dictionary of variables for cohort definitions, and in [variables_dates](analysis/dataset_definition/variables_dates.py) to create a dictionary of variables for calculating study start dates and end dates. + - If you are interested in how we defined study dates (e.g., index and end dates), these vary by cohort and are described in the protocol. We use the script [dataset_definition_dates](analysis/dataset_definition/dataset_definition_dates.py) to generate a dataset with all required dates for each cohort. This script imports all variables generated from [variables_dates](analysis/dataset_definition/variables_dates.py). + - If you are interested in how we defined our cohorts, we use the dataset definition script [dataset_definition_cohorts](analysis/dataset_definition/dataset_definition_cohorts.py) to define a function that generates cohorts. This script imports all variables generated from [variables_cohorts](analysis/dataset_definition/variables_cohorts.py) using the patient's index date, the cohort start date and the cohort end date. This approach is used to generate three cohorts: pre-vaccination, vaccinated, and unvaccinated—found in [dataset_definition_prevax](analysis/dataset_definition/dataset_definition_prevax.py), [dataset_definition_vax](analysis/dataset_definition/dataset_definition_vax.py), and [dataset_definition_unvax](analysis/dataset_definition/dataset_definition_unvax.py), respectively. 
For each cohort, the extracted data is initially processed in the preprocess data script [preprocess data script](analysis/preprocess/preprocess_data.R), which generates a flag variable for pre-existing respiratory conditions and restricts the data to relevant variables. - This directory also contains all the R scripts that process, describe, and analyse the extracted data. - The [active_analyses](lib/active_analyses.rds) contains a list of active analyses. -- The [`project.yaml`](./project.yaml) defines run-order and dependencies for all the analysis scripts. This file should not be edited directly. To make changes to the yaml, edit and run the [`create_project.R`](analysis/create_project.R) script which generates all the actions. +- The [`project.yaml`](./project.yaml) defines run-order and dependencies for all the analysis scripts. This file should not be edited directly. To make changes to the yaml, edit and run the [`create_project_actions.R`](analysis/create_project_actions.R) script which generates all the actions. - Descriptive and Model outputs, including figures and tables are in the [`released_outputs`](./release_outputs) directory. 
diff --git a/analysis/create_project_actions.R b/analysis/create_project_actions.R index 314343b..e6c8c28 100644 --- a/analysis/create_project_actions.R +++ b/analysis/create_project_actions.R @@ -70,7 +70,7 @@ generate_study_population <- function(cohort){ comment(glue("Generate study population - {cohort}")), action( name = glue("generate_study_population_{cohort}"), - run = glue("ehrql:v1 generate-dataset analysis/dataset_definition_{cohort}.py --output output/input_{cohort}.csv.gz"), + run = glue("ehrql:v1 generate-dataset analysis/dataset_definition/dataset_definition_{cohort}.py --output output/input_{cohort}.csv.gz"), needs = list("generate_dataset_index_dates"), highly_sensitive = list( cohort = glue("output/input_{cohort}.csv.gz") @@ -119,7 +119,7 @@ actions_list <- splice( action( name = glue("vax_eligibility_inputs"), - run = "r:latest analysis/metadates.R", + run = "r:latest analysis/dataset_definition/metadates.R", highly_sensitive = list( study_dates_json = glue("output/study_dates.json") ) @@ -130,7 +130,7 @@ actions_list <- splice( action( name = "generate_dataset_index_dates", - run = "ehrql:v1 generate-dataset analysis/dataset_definition_dates.py --output output/index_dates.csv.gz", + run = "ehrql:v1 generate-dataset analysis/dataset_definition/dataset_definition_dates.py --output output/index_dates.csv.gz", needs = list("vax_eligibility_inputs"), highly_sensitive = list( dataset = glue("output/index_dates.csv.gz") diff --git a/analysis/active_analyses.R b/analysis/dataset_definition/active_analyses.R similarity index 100% rename from analysis/active_analyses.R rename to analysis/dataset_definition/active_analyses.R diff --git a/analysis/codelists.py b/analysis/dataset_definition/codelists.py similarity index 100% rename from analysis/codelists.py rename to analysis/dataset_definition/codelists.py diff --git a/analysis/dataset_definition_cohorts.py b/analysis/dataset_definition/dataset_definition_cohorts.py similarity index 100% rename from 
analysis/dataset_definition_cohorts.py rename to analysis/dataset_definition/dataset_definition_cohorts.py diff --git a/analysis/dataset_definition_dates.py b/analysis/dataset_definition/dataset_definition_dates.py similarity index 100% rename from analysis/dataset_definition_dates.py rename to analysis/dataset_definition/dataset_definition_dates.py diff --git a/analysis/dataset_definition_prevax.py b/analysis/dataset_definition/dataset_definition_prevax.py similarity index 100% rename from analysis/dataset_definition_prevax.py rename to analysis/dataset_definition/dataset_definition_prevax.py diff --git a/analysis/dataset_definition_unvax.py b/analysis/dataset_definition/dataset_definition_unvax.py similarity index 100% rename from analysis/dataset_definition_unvax.py rename to analysis/dataset_definition/dataset_definition_unvax.py diff --git a/analysis/dataset_definition_vax.py b/analysis/dataset_definition/dataset_definition_vax.py similarity index 100% rename from analysis/dataset_definition_vax.py rename to analysis/dataset_definition/dataset_definition_vax.py diff --git a/analysis/metadates.R b/analysis/dataset_definition/metadates.R similarity index 100% rename from analysis/metadates.R rename to analysis/dataset_definition/metadates.R diff --git a/analysis/utility.R b/analysis/dataset_definition/utility.R similarity index 100% rename from analysis/utility.R rename to analysis/dataset_definition/utility.R diff --git a/analysis/variable_helper_functions.py b/analysis/dataset_definition/variable_helper_functions.py similarity index 100% rename from analysis/variable_helper_functions.py rename to analysis/dataset_definition/variable_helper_functions.py diff --git a/analysis/variables_cohorts.py b/analysis/dataset_definition/variables_cohorts.py similarity index 100% rename from analysis/variables_cohorts.py rename to analysis/dataset_definition/variables_cohorts.py diff --git a/analysis/variables_dates.py b/analysis/dataset_definition/variables_dates.py 
similarity index 100% rename from analysis/variables_dates.py rename to analysis/dataset_definition/variables_dates.py diff --git a/project.yaml b/project.yaml index 8ececf4..dcf0c44 100644 --- a/project.yaml +++ b/project.yaml @@ -14,7 +14,7 @@ actions: ## Generate vaccination eligibility information vax_eligibility_inputs: - run: r:latest analysis/metadates.R + run: r:latest analysis/dataset_definition/metadates.R outputs: highly_sensitive: study_dates_json: output/study_dates.json @@ -22,7 +22,8 @@ actions: ## Generate dates for all cohorts generate_dataset_index_dates: - run: ehrql:v1 generate-dataset analysis/dataset_definition_dates.py --output output/index_dates.csv.gz + run: ehrql:v1 generate-dataset analysis/dataset_definition/dataset_definition_dates.py + --output output/index_dates.csv.gz needs: - vax_eligibility_inputs outputs: @@ -32,8 +33,8 @@ actions: ## Generate study population - prevax generate_study_population_prevax: - run: ehrql:v1 generate-dataset analysis/dataset_definition_prevax.py --output - output/input_prevax.csv.gz + run: ehrql:v1 generate-dataset analysis/dataset_definition/dataset_definition_prevax.py + --output output/input_prevax.csv.gz needs: - generate_dataset_index_dates outputs: @@ -43,7 +44,8 @@ actions: ## Generate study population - vax generate_study_population_vax: - run: ehrql:v1 generate-dataset analysis/dataset_definition_vax.py --output output/input_vax.csv.gz + run: ehrql:v1 generate-dataset analysis/dataset_definition/dataset_definition_vax.py + --output output/input_vax.csv.gz needs: - generate_dataset_index_dates outputs: @@ -53,7 +55,8 @@ actions: ## Generate study population - unvax generate_study_population_unvax: - run: ehrql:v1 generate-dataset analysis/dataset_definition_unvax.py --output output/input_unvax.csv.gz + run: ehrql:v1 generate-dataset analysis/dataset_definition/dataset_definition_unvax.py + --output output/input_unvax.csv.gz needs: - generate_dataset_index_dates outputs: