Move scripts to directory for dataset_definition

opensafely · Dec 16, 2024 · 0cf4af3 · 0cf4af3
1 parent aca510e
commit 0cf4af3
Show file tree

Hide file tree

Showing 15 changed files with 16 additions and 13 deletions.
diff --git a/README.md b/README.md
@@ -15,14 +15,14 @@ No clinical, policy or safety conclusions must be drawn from the contents of thi
 
 -   Analyses scripts are in the [`analysis`](./analysis) directory:
 
-    -   If you are interested in how we defined our variables, we use the variable script [variable_helper_fuctions](analysis/variable_helper_functions.py) to define functions that generate variables. We then apply these functions in [variables_cohorts](analysis/variables_cohorts.py) to create a dictionary of variables for cohort definitions, and in [variables_dates](analysis/variables_dates.py) to create a dictionary of variables for calculating study start dates and end dates.
-    -   If you are interested in how we defined study dates (e.g., index and end dates), these vary by cohort and are described in the protocol. We use the script [dataset_definition_dates](analysis/dataset_definition_dates.py) to generate a dataset with all required dates for each cohort. This script imported all variables generated from [variables_dates](analysis/variables_dates.py).
-    -   If you are interested in how we defined our cohorts, we use the dataset definition script [dataset_definition_cohorts](analysis/dataset_definition_cohorts.py) to define a function that generates cohorts. This script imports all variables generated from [variables_cohorts](analysis/variables_cohorts.py) using the patient's index date, the cohort start date and the cohort end date. This approach is used to generate three cohorts: pre-vaccination, vaccinated, and unvaccinated—found in [dataset_definition_prevax](analysis/dataset_definition_prevax.py), [dataset_definition_vax](analysis/dataset_definition_vax.py), and [dataset_definition_unvax](analysis/dataset_definition_unvax.py), respectively. For each cohort, the extracted data is initially processed in the preprocess data script [preprocess data script](analysis/preprocess_data.R), which generates a flag variable for pre-existing respiratory conditions and restricts the data to relevant variables.
+    -   If you are interested in how we defined our variables, we use the variable script [variable_helper_functions](analysis/dataset_definition/variable_helper_functions.py) to define functions that generate variables. We then apply these functions in [variables_cohorts](analysis/dataset_definition/variables_cohorts.py) to create a dictionary of variables for cohort definitions, and in [variables_dates](analysis/dataset_definition/variables_dates.py) to create a dictionary of variables for calculating study start dates and end dates.
+    -   If you are interested in how we defined study dates (e.g., index and end dates), these vary by cohort and are described in the protocol. We use the script [dataset_definition_dates](analysis/dataset_definition/dataset_definition_dates.py) to generate a dataset with all required dates for each cohort. This script imports all variables generated from [variables_dates](analysis/dataset_definition/variables_dates.py).
+    -   If you are interested in how we defined our cohorts, we use the dataset definition script [dataset_definition_cohorts](analysis/dataset_definition/dataset_definition_cohorts.py) to define a function that generates cohorts. This script imports all variables generated from [variables_cohorts](analysis/dataset_definition/variables_cohorts.py) using the patient's index date, the cohort start date and the cohort end date. This approach is used to generate three cohorts: pre-vaccination, vaccinated, and unvaccinated—found in [dataset_definition_prevax](analysis/dataset_definition/dataset_definition_prevax.py), [dataset_definition_vax](analysis/dataset_definition/dataset_definition_vax.py), and [dataset_definition_unvax](analysis/dataset_definition/dataset_definition_unvax.py), respectively. For each cohort, the extracted data is initially processed in the [preprocess data script](analysis/preprocess/preprocess_data.R), which generates a flag variable for pre-existing respiratory conditions and restricts the data to relevant variables.
     -   This directory also contains all the R scripts that process, describe, and analyse the extracted data.
 
 -   The [active_analyses](lib/active_analyses.rds) contains a list of active analyses.
 
--   The [`project.yaml`](./project.yaml) defines run-order and dependencies for all the analysis scripts. This file should not be edited directly. To make changes to the yaml, edit and run the [`create_project.R`](analysis/create_project.R) script which generates all the actions.
+-   The [`project.yaml`](./project.yaml) defines run-order and dependencies for all the analysis scripts. This file should not be edited directly. To make changes to the yaml, edit and run the [`create_project_actions.R`](analysis/create_project_actions.R) script which generates all the actions.
 
 -   Descriptive and Model outputs, including figures and tables are in the [`released_outputs`](./release_outputs) directory.
 

diff --git a/analysis/create_project_actions.R b/analysis/create_project_actions.R
@@ -70,7 +70,7 @@ generate_study_population <- function(cohort){
     comment(glue("Generate study population - {cohort}")),
     action(
       name = glue("generate_study_population_{cohort}"),
-      run = glue("ehrql:v1 generate-dataset analysis/dataset_definition_{cohort}.py --output output/input_{cohort}.csv.gz"),
+      run = glue("ehrql:v1 generate-dataset analysis/dataset_definition/dataset_definition_{cohort}.py --output output/input_{cohort}.csv.gz"),
       needs = list("generate_dataset_index_dates"),
       highly_sensitive = list(
         cohort = glue("output/input_{cohort}.csv.gz")
@@ -119,7 +119,7 @@ actions_list <- splice(
 
   action(
     name = glue("vax_eligibility_inputs"),
-    run = "r:latest analysis/metadates.R",
+    run = "r:latest analysis/dataset_definition/metadates.R",
     highly_sensitive = list(
       study_dates_json = glue("output/study_dates.json")
     )
@@ -130,7 +130,7 @@ actions_list <- splice(
 
   action(
     name = "generate_dataset_index_dates",
-    run = "ehrql:v1 generate-dataset analysis/dataset_definition_dates.py --output output/index_dates.csv.gz",
+    run = "ehrql:v1 generate-dataset analysis/dataset_definition/dataset_definition_dates.py --output output/index_dates.csv.gz",
     needs = list("vax_eligibility_inputs"),
     highly_sensitive = list(
       dataset = glue("output/index_dates.csv.gz")

diff --git a/analysis/active_analyses.R → ...ysis/dataset_definition/active_analyses.R b/analysis/active_analyses.R → ...ysis/dataset_definition/active_analyses.R
diff --git a/analysis/codelists.py → analysis/dataset_definition/codelists.py b/analysis/codelists.py → analysis/dataset_definition/codelists.py
diff --git a/analysis/dataset_definition_cohorts.py → ..._definition/dataset_definition_cohorts.py b/analysis/dataset_definition_cohorts.py → ..._definition/dataset_definition_cohorts.py
diff --git a/analysis/dataset_definition_dates.py → ...et_definition/dataset_definition_dates.py b/analysis/dataset_definition_dates.py → ...et_definition/dataset_definition_dates.py
diff --git a/analysis/dataset_definition_prevax.py → ...t_definition/dataset_definition_prevax.py b/analysis/dataset_definition_prevax.py → ...t_definition/dataset_definition_prevax.py
diff --git a/analysis/dataset_definition_unvax.py → ...et_definition/dataset_definition_unvax.py b/analysis/dataset_definition_unvax.py → ...et_definition/dataset_definition_unvax.py
diff --git a/analysis/dataset_definition_vax.py → ...aset_definition/dataset_definition_vax.py b/analysis/dataset_definition_vax.py → ...aset_definition/dataset_definition_vax.py
diff --git a/analysis/metadates.R → analysis/dataset_definition/metadates.R b/analysis/metadates.R → analysis/dataset_definition/metadates.R
diff --git a/analysis/utility.R → analysis/dataset_definition/utility.R b/analysis/utility.R → analysis/dataset_definition/utility.R
diff --git a/analysis/variable_helper_functions.py → ...t_definition/variable_helper_functions.py b/analysis/variable_helper_functions.py → ...t_definition/variable_helper_functions.py
diff --git a/analysis/variables_cohorts.py → ...s/dataset_definition/variables_cohorts.py b/analysis/variables_cohorts.py → ...s/dataset_definition/variables_cohorts.py
diff --git a/analysis/variables_dates.py → ...sis/dataset_definition/variables_dates.py b/analysis/variables_dates.py → ...sis/dataset_definition/variables_dates.py
diff --git a/project.yaml b/project.yaml
@@ -14,15 +14,16 @@ actions:
   ## Generate vaccination eligibility information 
 
   vax_eligibility_inputs:
-    run: r:latest analysis/metadates.R
+    run: r:latest analysis/dataset_definition/metadates.R
     outputs:
       highly_sensitive:
         study_dates_json: output/study_dates.json
 
   ## Generate dates for all cohorts 
 
   generate_dataset_index_dates:
-    run: ehrql:v1 generate-dataset analysis/dataset_definition_dates.py --output output/index_dates.csv.gz
+    run: ehrql:v1 generate-dataset analysis/dataset_definition/dataset_definition_dates.py
+      --output output/index_dates.csv.gz
     needs:
     - vax_eligibility_inputs
     outputs:
@@ -32,8 +33,8 @@ actions:
   ## Generate study population - prevax 
 
   generate_study_population_prevax:
-    run: ehrql:v1 generate-dataset analysis/dataset_definition_prevax.py --output
-      output/input_prevax.csv.gz
+    run: ehrql:v1 generate-dataset analysis/dataset_definition/dataset_definition_prevax.py
+      --output output/input_prevax.csv.gz
     needs:
     - generate_dataset_index_dates
     outputs:
@@ -43,7 +44,8 @@ actions:
   ## Generate study population - vax 
 
   generate_study_population_vax:
-    run: ehrql:v1 generate-dataset analysis/dataset_definition_vax.py --output output/input_vax.csv.gz
+    run: ehrql:v1 generate-dataset analysis/dataset_definition/dataset_definition_vax.py
+      --output output/input_vax.csv.gz
     needs:
     - generate_dataset_index_dates
     outputs:
@@ -53,7 +55,8 @@ actions:
   ## Generate study population - unvax 
 
   generate_study_population_unvax:
-    run: ehrql:v1 generate-dataset analysis/dataset_definition_unvax.py --output output/input_unvax.csv.gz
+    run: ehrql:v1 generate-dataset analysis/dataset_definition/dataset_definition_unvax.py
+      --output output/input_unvax.csv.gz
     needs:
     - generate_dataset_index_dates
     outputs: