Merge pull request mmcdermott#223 from mmcdermott/dev

Adds examples for AUMCdb, eICU, and increases test coverage significantly.
VectorInstitute · Nov 8, 2024 · f38901d · f38901d
2 parents ae5b155 + 04c82a6
commit f38901d
Show file tree

Hide file tree

Showing 49 changed files with 1,714 additions and 218 deletions.
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
@@ -35,7 +35,7 @@ jobs:
       #----------------------------------------------
       - name: Run tests
         run: |
-          pytest -v --doctest-modules --cov=src --junitxml=junit.xml -s --ignore=docs
+          pytest src/ tests/ -v --doctest-modules --cov=src --junitxml=junit.xml -s
 
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v4.0.1

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,11 +1,11 @@
 default_language_version:
   python: python3.12
 
-exclude: "docs/index.md|MIMIC-IV_Example/README.md|eICU_Example/README.md"
+exclude: "docs/index.md|MIMIC-IV_Example/README.md|eICU_Example/README.md|AUMCdb_Example/README.md"
 
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.4.0
+    rev: v5.0.0
     hooks:
       # list of supported hooks: https://pre-commit.com/hooks.html
       - id: trailing-whitespace
@@ -22,27 +22,27 @@ repos:
 
   # python code formatting
   - repo: https://github.com/psf/black
-    rev: 23.7.0
+    rev: 24.10.0
     hooks:
       - id: black
         args: [--line-length, "110"]
 
   # python import sorting
   - repo: https://github.com/PyCQA/isort
-    rev: 5.12.0
+    rev: 5.13.2
     hooks:
       - id: isort
         args: ["--profile", "black", "--filter-files", "-o", "wandb"]
 
   - repo: https://github.com/PyCQA/autoflake
-    rev: v2.2.0
+    rev: v2.3.1
     hooks:
       - id: autoflake
         args: [--in-place, --remove-all-unused-imports]
 
   # python upgrading syntax to newer version
   - repo: https://github.com/asottile/pyupgrade
-    rev: v3.10.1
+    rev: v3.19.0
     hooks:
       - id: pyupgrade
         args: [--py311-plus]
@@ -56,7 +56,7 @@ repos:
 
   # python check (PEP8), programming errors and code complexity
   - repo: https://github.com/PyCQA/flake8
-    rev: 6.1.0
+    rev: 7.1.1
     hooks:
       - id: flake8
         args:
@@ -73,21 +73,21 @@ repos:
 
   # yaml formatting
   - repo: https://github.com/pre-commit/mirrors-prettier
-    rev: v3.0.3
+    rev: v4.0.0-alpha.8
     hooks:
       - id: prettier
         types: [yaml]
         exclude: "environment.yaml"
 
   # shell scripts linter
   - repo: https://github.com/shellcheck-py/shellcheck-py
-    rev: v0.9.0.5
+    rev: v0.10.0.1
     hooks:
       - id: shellcheck
 
   # md formatting
   - repo: https://github.com/executablebooks/mdformat
-    rev: 0.7.17
+    rev: 0.7.18
     hooks:
       - id: mdformat
         args: ["--number"]
@@ -104,7 +104,7 @@ repos:
 
   # word spelling linter
   - repo: https://github.com/codespell-project/codespell
-    rev: v2.2.5
+    rev: v2.3.0
     hooks:
       - id: codespell
         args:
@@ -113,17 +113,21 @@ repos:
 
   # jupyter notebook cell output clearing
   - repo: https://github.com/kynan/nbstripout
-    rev: 0.6.1
+    rev: 0.7.1
     hooks:
       - id: nbstripout
 
   # jupyter notebook linting
   - repo: https://github.com/nbQA-dev/nbQA
-    rev: 1.7.0
+    rev: 1.8.7
     hooks:
       - id: nbqa-black
         args: ["--line-length=110"]
       - id: nbqa-isort
         args: ["--profile=black"]
       - id: nbqa-flake8
-        args: ["--extend-ignore=E203,E402,E501,F401,F841", "--exclude=logs/*,data/*"]
+        args:
+          [
+            "--extend-ignore=E203,E402,E501,F401,F841",
+            "--exclude=logs/*,data/*",
+          ]
diff --git a/AUMCdb_Example/README.md b/AUMCdb_Example/README.md
@@ -0,0 +1,89 @@
+# AUMC Example
+
+This is an example of how to extract a MEDS dataset from AUMCdb (https://github.com/AmsterdamUMC/AmsterdamUMCdb). All scripts in this README are assumed to be run from this directory or from the directory in which the files in Step 0.5 were downloaded.
+
+## Step 0: Installation
+
+```bash
+conda create -n MEDS python=3.12
+conda activate MEDS
+pip install "MEDS_transforms[local_parallelism,slurm_parallelism]"
+```
+
+If you want to profile the time and memory costs of your ETL, also install: `pip install hydra-profiler`.
+
+## Step 0.5: Set-up
+
+Set some environment variables and download the necessary files:
+
+```bash
+export AUMC_RAW_DIR=??? # set to the directory in which you want to store the raw data
+export AUMC_PRE_MEDS_DIR=??? # set to the directory in which you want to store the intermediate MEDS data
+export AUMC_MEDS_COHORT_DIR=??? # set to the directory in which you want to store the final MEDS data
+
+export VERSION=0.0.8 # or whatever version you want
+export URL="https://raw.githubusercontent.com/mmcdermott/MEDS_transforms/$VERSION/AUMCdb_Example"
+
+wget $URL/run.sh
+wget $URL/pre_MEDS.py
+wget $URL/local_parallelism_runner.yaml
+wget $URL/slurm_runner.yaml
+mkdir configs
+cd configs
+wget $URL/configs/extract_AUMC.yaml
+cd ..
+chmod +x run.sh
+chmod +x pre_MEDS.py
+```
+
+## Step 1: Download AUMC
+
+Download the AUMC dataset by following the instructions at https://github.com/AmsterdamUMC/AmsterdamUMCdb?tab=readme-ov-file. You will need the raw `.csv` files for this example. We will use `$AUMC_RAW_DIR` to denote the root directory in which the resulting _core data files_ are stored.
+
+## Step 2: Run the MEDS ETL
+
+To run the MEDS ETL, run the following command:
+
+```bash
+./run.sh $AUMC_RAW_DIR $AUMC_PRE_MEDS_DIR $AUMC_MEDS_COHORT_DIR
+```
+
+> \[!NOTE\]
+> This can take up large amounts of memory if not parallelized. You can reduce the shard size to reduce memory usage by setting the `shard_size` parameter in the `extract_AUMC.yaml` file.
+> Check that your environment variables are set correctly.
+
+To use a specific stage runner file (e.g., to set different parallelism options), you can specify it as an
+additional argument:
+
+```bash
+export N_WORKERS=5
+./run.sh $AUMC_RAW_DIR $AUMC_PRE_MEDS_DIR $AUMC_MEDS_COHORT_DIR \
+    stage_runner_fp=slurm_runner.yaml
+```
+
+The `N_WORKERS` environment variable set before the command controls how many parallel workers should be used
+at maximum.
+
+The `slurm_runner.yaml` file (downloaded above) runs each stage across several workers on separate slurm
+worker nodes using the `submitit` launcher. _**You will need to customize this file to your own slurm system
+so that the partition names are correct before use.**_ The memory and time costs are viable in the current
+configuration, but if your nodes are sufficiently different you may need to adjust those as well.
+
+The `local_parallelism_runner.yaml` file (downloaded above) runs each stage via separate processes on the
+launching machine. There are no additional arguments needed for this stage beyond the `N_WORKERS` environment
+variable and there is nothing to customize in this file.
+
+To profile the time and memory costs of your ETL, add the `do_profile=true` flag at the end.
+
+## Notes
+
+Note: If you use the slurm system and you launch the hydra submitit jobs from an interactive slurm node, you
+may need to run `unset SLURM_CPU_BIND` in your terminal first to avoid errors.
+
+## Future Work
+
+Check with AUMCdb authors:
+
+- How should we deal with `registeredat` and `updatedat`?
+- We **IGNORE** several flags for the `drugitems` -- this may be a mistake!
+- When is the administered dose recorded? Is this done after the fact?
diff --git a/AUMCdb_Example/__init__.py b/AUMCdb_Example/__init__.py
diff --git a/AUMCdb_Example/configs/event_configs.yaml b/AUMCdb_Example/configs/event_configs.yaml
@@ -0,0 +1,123 @@
+subject_id_col: patientid
+
+patient:
+  dob:
+    code: "MEDS_BIRTH"
+    time: col(dateofbirth)
+  gender:
+    code: ["GENDER", "col(gender)"]
+    time: null
+  dod:
+    code: "MEDS_DEATH"
+    time: col(dateofdeath)
+
+admissions:
+  icu_admission:
+    code:
+      - "ICU_ADMISSION"
+      - col(location)
+      - col(urgency)
+      - col(origin)
+      - col(specialty)
+    time: col(admittedattime)
+  icu_discharge:
+    code:
+      - "ICU_DISCHARGE"
+      - col(destination)
+    time: col(dischargedattime)
+  weight:
+    code:
+      - "WEIGHT_AT_ADMISSION"
+      - col(weightsource)
+      - col(weightgroup)
+    time: col(admittedattime)
+  height:
+    code:
+      - "HEIGHT_AT_ADMISSION"
+      - col(heightsource)
+      - col(heightgroup)
+    time: col(admittedattime)
+
+numericitems:
+  event:
+    code:
+      - MEASURE
+      - col(item)
+      - col(unit)
+    time: col(measuredattime)
+    numeric_value: value
+
+listitems:
+  event:
+    code:
+      - MEASURE
+      - col(item)
+      - col(islabresult)
+      - col(value)
+    time: col(measuredattime)
+
+freetextitems:
+  event:
+    code:
+      - MEASURE
+      - col(item)
+      - col(islabresult)
+    time: col(measuredattime)
+    text_value: value
+
+procedureorderitems:
+  event:
+    code:
+      - PROCEDURE
+      - col(ordercategoryname)
+      - col(item)
+    time: col(registeredattime)
+
+processitems:
+  start:
+    code:
+      - PROCESS
+      - START
+      - col(item)
+    time: col(starttime)
+  end:
+    code:
+      - PROCESS
+      - END
+      - col(item)
+    time: col(stoptime)
+
+drugitems:  # four event types per drugitems row: START, RATE and DOSE at starttime, END at stoptime
+  start:
+    code:
+      - DRUG
+      - START
+      - col(ordercategory)
+      - col(item)
+      - col(action)
+    time: col(starttime)
+  rate:
+    code:
+      - DRUG
+      - RATE
+      - col(ordercategory)
+      - col(item)
+      - col(rateunit)  # the unit is part of the code, so each unit yields a distinct code
+    time: col(starttime)
+    numeric_value: rate  # bare source column name, matching `numeric_value: value` in numericitems
+  dose:
+    code:
+      - DRUG
+      - DOSE
+      - col(ordercategory)
+      - col(item)
+      - col(doseunit)  # the unit is part of the code, so each unit yields a distinct code
+    time: col(starttime)
+    numeric_value: dose  # bare source column name, matching `numeric_value: value` in numericitems
+  end:
+    code:
+      - DRUG
+      - END
+      - col(ordercategory)
+      - col(item)
+    time: col(stoptime)
diff --git a/AUMCdb_Example/configs/extract_AUMC.yaml b/AUMCdb_Example/configs/extract_AUMC.yaml
@@ -0,0 +1,35 @@
+defaults:
+  - _extract
+  - _self_
+
+description: |-
+  This pipeline extracts the AUMCdb dataset in longitudinal, sparse form from an input dataset meeting
+  select criteria and converts them to the flattened, MEDS format. You can control the key arguments to this
+  pipeline by setting environment variables:
+  ```bash
+    export EVENT_CONVERSION_CONFIG_FP=# Path to your event conversion config
+    export AUMC_PRE_MEDS_DIR=# Path to the output dir of the pre-MEDS step
+    export AUMC_MEDS_COHORT_DIR=# Path to where you want the dataset to live
+  ```
+
+# The event conversion configuration file is used throughout the pipeline to define the events to extract.
+event_conversion_config_fp: ${oc.env:EVENT_CONVERSION_CONFIG_FP}
+
+input_dir: ${oc.env:AUMC_PRE_MEDS_DIR}
+cohort_dir: ${oc.env:AUMC_MEDS_COHORT_DIR}
+
+etl_metadata:
+  dataset_name: AUMCdb
+  dataset_version: 1.0.2
+
+stage_configs:
+  split_and_shard_subjects:
+    n_subjects_per_shard: 1000
+
+stages:
+  - shard_events
+  - split_and_shard_subjects
+  - convert_to_sharded_events
+  - merge_to_MEDS_cohort
+  - finalize_MEDS_metadata
+  - finalize_MEDS_data
diff --git a/AUMCdb_Example/configs/pre_MEDS.yaml b/AUMCdb_Example/configs/pre_MEDS.yaml
@@ -0,0 +1,13 @@
+input_dir: ${oc.env:AUMC_RAW_DIR}
+cohort_dir: ${oc.env:AUMC_PRE_MEDS_DIR}
+
+log_dir: ${cohort_dir}/.logs
+
+# Hydra
+hydra:
+  job:
+    name: pre_MEDS_${now:%Y-%m-%d_%H-%M-%S}
+  run:
+    dir: ${log_dir}
+  sweep:
+    dir: ${log_dir}