From f87cc30a06d4360a6b083d08d7dd742038b26e13 Mon Sep 17 00:00:00 2001 From: prockenschaub Date: Fri, 2 Aug 2024 22:00:29 +0200 Subject: [PATCH 01/76] add first version of aumc-meds --- AUMC_Example/configs/event_configs.yaml | 120 +++++++++++ AUMC_Example/configs/pre_MEDS.yaml | 11 + AUMC_Example/configs/table_preprocessors.yaml | 116 ++++++++++ AUMC_Example/joint_script.sh | 101 +++++++++ AUMC_Example/pre_MEDS.py | 204 ++++++++++++++++++ 5 files changed, 552 insertions(+) create mode 100644 AUMC_Example/configs/event_configs.yaml create mode 100644 AUMC_Example/configs/pre_MEDS.yaml create mode 100644 AUMC_Example/configs/table_preprocessors.yaml create mode 100755 AUMC_Example/joint_script.sh create mode 100755 AUMC_Example/pre_MEDS.py diff --git a/AUMC_Example/configs/event_configs.yaml b/AUMC_Example/configs/event_configs.yaml new file mode 100644 index 00000000..c44bceae --- /dev/null +++ b/AUMC_Example/configs/event_configs.yaml @@ -0,0 +1,120 @@ +patient_id_col: patientid + +patient: + dob: + code: "DOB" + timestamp: col(dateofbirth) + gender: + code: ["GENDER", "col(gender)"] + timestamp: null + +admissions: + icu_admission: + code: + - "ICU_ADMISSION" + - col(location) + - col(urgency) + - col(origin) + - col(specialty) + timestamp: col(admittedattimestamp) + icu_discharge: + code: + - "ICU_DISCHARGE" + - col(destination) + timestamp: col(dischargedattimestamp) + weight: + code: + - "WEIGHT_AT_ADMISSION" + - col(weightsource) + - col(weightgroup) + timestamp: col(admittedattimestamp) + height: + code: + - "HEIGHT_AT_ADMISSION" + - col(heightsource) + - col(heightgroup) + timestamp: col(admittedattimestamp) + +numericitems: + event: + code: + - MEASURE + - col(item) + - col(unit) + timestamp: col(measuredattimestamp) + numerical_value: value + +listitems: + event: + code: + - MEASURE + - col(item) + - col(islabresult) + - col(value) + timestamp: col(measuredattimestamp) + +freetextitems: + event: + code: + - MEASURE + - col(item) + - col(islabresult) + timestamp: col(measuredattimestamp) + text_value: value + +procedureorderitems: + event: + code: + - PROCEDURE + - col(ordercategoryname) + - col(item) + timestamp: col(registeredattimestamp) + +processitems: + start: + code: + - PROCESS + - START + - col(item) + timestamp: col(starttimestamp) + end: + code: + - PROCESS + - END + - col(item) + timestamp: col(stoptimestamp) + +drugitems: + start: + code: + - DRUG + - START + - col(ordercategory) + - col(item) + - col(action) + timestamp: col(starttimestamp) + rate: + code: + - DRUG + - RATE + - col(ordercategory) + - col(item) + - col(rateunit) + timestamp: col(starttimestamp) + numerical_value: col(rate) + dose: + code: + - DRUG + - DOSE + - col(ordercategory) + - col(item) + - col(doseunit) + timestamp: col(starttimestamp) + numerical_value: col(dose) + end: + code: + - DRUG + - END + - col(ordercategory) + - col(item) + timestamp: col(stoptimestamp) diff --git a/AUMC_Example/configs/pre_MEDS.yaml b/AUMC_Example/configs/pre_MEDS.yaml new file mode 100644 index 00000000..b5cfa4cb --- /dev/null +++ b/AUMC_Example/configs/pre_MEDS.yaml @@ -0,0 +1,11 @@ +raw_cohort_dir: ??? +output_dir: ??? 
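+# `???` is Hydra's mandatory-value marker (a fact of Hydra, not of this PR); both directories must be
+# supplied at runtime, e.g. `./AUMC_Example/pre_MEDS.py raw_cohort_dir=/path/to/AUMC/raw output_dir=/path/to/AUMC/pre_meds`.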
+
+# Hydra
+hydra:
+  job:
+    name: pre_MEDS_${now:%Y-%m-%d_%H-%M-%S}
+  run:
+    dir: ${output_dir}/.logs/${hydra.job.name}
+  sweep:
+    dir: ${output_dir}/.logs/${hydra.job.name}
diff --git a/AUMC_Example/configs/table_preprocessors.yaml b/AUMC_Example/configs/table_preprocessors.yaml
new file mode 100644
index 00000000..777410ee
--- /dev/null
+++ b/AUMC_Example/configs/table_preprocessors.yaml
@@ -0,0 +1,116 @@
+admissions:
+  offset_col:
+    - "admittedat"
+    - "dischargedat"
+  pseudotime_col:
+    - "admittedattimestamp"
+    - "dischargedattimestamp"
+  output_data_cols:
+    - "location"
+    - "urgency"
+    - "origin"
+    - "destination"
+    - "weightgroup"
+    - "weightsource"
+    - "heightgroup"
+    - "heightsource"
+    - "specialty"
+
+numericitems:
+  offset_col:
+    - "measuredat"
+    - "registeredat"
+    - "updatedat"
+  pseudotime_col:
+    - "measuredattimestamp"
+    - "registeredattimestamp"
+    - "updatedattimestamp"
+  output_data_cols:
+    - "item"
+    - "value"
+    - "unit"
+    - "registeredby"
+    - "updatedby"
+  warning_items:
+    - "How should we deal with `registeredat` and `updatedat`?"
+
+listitems:
+  offset_col:
+    - "measuredat"
+    - "registeredat"
+    - "updatedat"
+  pseudotime_col:
+    - "measuredattimestamp"
+    - "registeredattimestamp"
+    - "updatedattimestamp"
+  output_data_cols:
+    - "item"
+    - "value"
+    - "islabresult"
+    - "registeredby"
+    - "updatedby"
+  warning_items:
+    - "How should we deal with `registeredat` and `updatedat`?"
+
+freetextitems:
+  offset_col:
+    - "measuredat"
+    - "registeredat"
+    - "updatedat"
+  pseudotime_col:
+    - "measuredattimestamp"
+    - "registeredattimestamp"
+    - "updatedattimestamp"
+  output_data_cols:
+    - "item"
+    - "value"
+    - "comment"
+    - "islabresult"
+    - "registeredby"
+    - "updatedby"
+  warning_items:
+    - "How should we deal with `registeredat` and `updatedat`?"
+
+drugitems:
+  offset_col:
+    - "start"
+    - "stop"
+  pseudotime_col:
+    - "starttimestamp"
+    - "stoptimestamp"
+  output_data_cols:
+    - "orderid"
+    - "ordercategory"
+    - "item"
+    - "rate"
+    - "rateunit"
+    - "ratetimeunitid"
+    - "dose"
+    - "doseunit"
+    - "doserateunit"
+    - "duration"
+    - "administered"
+    - "administeredunit"
+    - "action"
+  warning_items:
+    - "We **IGNORE** several flags here -- this may be a mistake!"
+    - "When is the administered dose recorded? Is this done after the fact?"
+
+procedureorderitems:
+  offset_col: "registeredat"
+  pseudotime_col: "registeredattimestamp"
+  output_data_cols:
+    - "orderid"
+    - "ordercategoryname"
+    - "item"
+    - "registeredby"
+
+processitems:
+  offset_col:
+    - "start"
+    - "stop"
+  pseudotime_col:
+    - "starttimestamp"
+    - "stoptimestamp"
+  output_data_cols:
+    - "item"
\ No newline at end of file
diff --git a/AUMC_Example/joint_script.sh b/AUMC_Example/joint_script.sh
new file mode 100755
index 00000000..a710b0a0
--- /dev/null
+++ b/AUMC_Example/joint_script.sh
@@ -0,0 +1,101 @@
+#!/usr/bin/env bash
+
+# This makes the script fail if any internal script fails
+set -e
+
+# Function to display help message
+function display_help() {
+    echo "Usage: $0 <AUMC_RAW_DIR> <AUMC_PREMEDS_DIR> <AUMC_MEDS_DIR> <N_PARALLEL_WORKERS>"
+    echo
+    echo "This script processes AUMCdb data through several steps, handling raw data conversion,"
+    echo "sharding events, splitting patients, converting to sharded events, and merging into a MEDS cohort."
+    echo
+    echo "Arguments:"
+    echo "  AUMC_RAW_DIR        Directory containing raw AUMCdb data files."
+    echo "  AUMC_PREMEDS_DIR    Output directory for pre-MEDS data."
+    echo "  AUMC_MEDS_DIR       Output directory for processed MEDS data."
+    echo "  N_PARALLEL_WORKERS  Number of parallel workers for processing."
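+    echo
+    # Illustrative example only; the paths and worker count below are placeholders.
+    echo "Example: $0 /path/to/AUMC/raw /path/to/AUMC/pre_meds /path/to/AUMC/meds 4"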
+ echo + echo "Options:" + echo " -h, --help Display this help message and exit." + exit 1 +} + +# Check if the first parameter is '-h' or '--help' +if [[ "$1" == "-h" || "$1" == "--help" ]]; then + display_help +fi + +# Check for mandatory parameters +if [ "$#" -lt 4 ]; then + echo "Error: Incorrect number of arguments provided." + display_help +fi + +AUMC_RAW_DIR="$1" +AUMC_PREMEDS_DIR="$2" +AUMC_MEDS_DIR="$3" +N_PARALLEL_WORKERS="$4" + +shift 4 + +echo "Note that AUMCdb has a lot of observations in the numericitems, so to keep to a reasonable " +echo "memory burden (e.g., < 150GB per worker), you will want a smaller shard size, as well as to turn off " +echo "the final unique check (which should not be necessary given the structure of AUMCdb and is expensive) " +echo "in the merge stage. You can do this by setting the following parameters at the end of the mandatory " +echo "args when running this script:" +echo " * stage_configs.split_and_shard_patients.n_patients_per_shard=10000" +echo " * stage_configs.merge_to_MEDS_cohort.unique_by=null" + + +echo "Running pre-MEDS conversion." +./AUMC_Example/pre_MEDS.py raw_cohort_dir="$AUMC_RAW_DIR" output_dir="$AUMC_PREMEDS_DIR" + +echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel" +MEDS_extract-shard_events \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + input_dir="$AUMC_PREMEDS_DIR" \ + cohort_dir="$AUMC_MEDS_DIR" \ + stage="shard_events" \ + event_conversion_config_fp=./AUMC_Example/configs/event_configs.yaml "$@" + +echo "Splitting patients in serial" +MEDS_extract-split_and_shard_patients \ + input_dir="$AUMC_PREMEDS_DIR" \ + cohort_dir="$AUMC_MEDS_DIR" \ + stage="split_and_shard_patients" \ + stage_configs.split_and_shard_patients.n_patients_per_shard=10000 \ + event_conversion_config_fp=./AUMC_Example/configs/event_configs.yaml "$@" + +echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" +MEDS_extract-convert_to_sharded_events \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + input_dir="$AUMC_PREMEDS_DIR" \ + cohort_dir="$AUMC_MEDS_DIR" \ + stage="convert_to_sharded_events" \ + event_conversion_config_fp=./AUMC_Example/configs/event_configs.yaml "$@" + +echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" +MEDS_extract-merge_to_MEDS_cohort \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + input_dir="$AUMC_PREMEDS_DIR" \ + cohort_dir="$AUMC_MEDS_DIR" \ + stage="merge_to_MEDS_cohort" \ + event_conversion_config_fp=./AUMC_Example/configs/event_configs.yaml "$@" + +echo "Aggregating initial code stats with $N_PARALLEL_WORKERS workers in parallel" +MEDS_transform-aggregate_code_metadata \ + --config-name="extract" \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + input_dir="$AUMC_PREMEDS_DIR" \ + cohort_dir="$AUMC_MEDS_DIR" \ + stage="aggregate_code_metadata" \ + event_conversion_config_fp=./AUMC_Example/configs/event_configs.yaml "$@" \ No newline at end of file diff --git a/AUMC_Example/pre_MEDS.py b/AUMC_Example/pre_MEDS.py new file mode 100755 index 00000000..09e6ccd2 --- /dev/null +++ b/AUMC_Example/pre_MEDS.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python + +"""Performs pre-MEDS data wrangling for AUMCdb. + +See the docstring of `main` for more information. 
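+
+A sketch of the expected invocation, mirroring how `joint_script.sh` calls this script (the paths are
+placeholders):
+
+    ./AUMC_Example/pre_MEDS.py raw_cohort_dir=/path/to/AUMC/raw output_dir=/path/to/AUMC/pre_meds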
+""" +import rootutils + +root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True) + +import gzip +from collections.abc import Callable +from datetime import datetime +from pathlib import Path + +import hydra +import polars as pl +from loguru import logger +from omegaconf import DictConfig, OmegaConf + +from MEDS_transforms.utils import get_shard_prefix, hydra_loguru_init, write_lazyframe + +ADMISSION_ID = "admissionid" +PATIENT_ID = "patientid" + +def load_raw_aumc_file(fp: Path, **kwargs) -> pl.LazyFrame: + """Load a raw AUMCdb file into a Polars DataFrame. + + Args: + fp: The path to the AUMCdb file. + + Returns: + The Polars DataFrame containing the AUMCdb data. + """ + + return pl.scan_csv(fp, infer_schema_length=10000000, encoding="utf8-lossy", **kwargs) + + +def process_patient_and_admissions(df: pl.LazyFrame) -> pl.LazyFrame: + """Takes the admissions table and converts it to a form that includes timestamps. + + As AUMCdb stores only offset times, note here that we add a CONSTANT TIME ACROSS ALL PATIENTS for the true + timestamp of their health system admission. This is acceptable because in AUMCdb ONLY RELATIVE TIME + DIFFERENCES ARE MEANINGFUL, NOT ABSOLUTE TIMES. + + The output of this process is ultimately converted to events via the `patient` key in the + `configs/event_configs.yaml` file. + """ + + origin_pseudotime = pl.datetime( + year = pl.col("admissionyeargroup").str.extract(r"(2003|2010)").cast(pl.Int32), + month = 1, day = 1 + ) + + # TODO: consider using better logic to infer date of birth for patients + # with more than one admission. + age_in_years = (( + pl.col("agegroup").str.extract("(\\d{2}).?$").cast(pl.Int32) + + pl.col("agegroup").str.extract("^(\\d{2})").cast(pl.Int32) + ) / 2).ceil() + age_in_days = age_in_years * 365.25 + # We assume that the patient was born at the midpoint of the year as we don't know the actual birthdate + pseudo_date_of_birth = origin_pseudotime - pl.duration(days=(age_in_days - 365.25 / 2)) + + return df.filter(pl.col("admissioncount") == 1).select( + PATIENT_ID, + pseudo_date_of_birth.alias("dateofbirth"), + "gender", + origin_pseudotime.alias("firstadmittedattimestamp"), + ), df.select(PATIENT_ID, ADMISSION_ID) + + +def join_and_get_pseudotime_fntr( + table_name: str, + offset_col: str | list[str], + pseudotime_col: str | list[str], + output_data_cols: list[str] | None = None, + warning_items: list[str] | None = None, +) -> Callable[[pl.LazyFrame, pl.LazyFrame], pl.LazyFrame]: + """Returns a function that joins a dataframe to the `patient` table and adds pseudotimes. + + Also raises specified warning strings via the logger for uncertain columns. + + TODO + """ + + if output_data_cols is None: + output_data_cols = [] + + if isinstance(offset_col, str): + offset_col = [offset_col] + if isinstance(pseudotime_col, str): + pseudotime_col = [pseudotime_col] + + if len(offset_col) != len(pseudotime_col): + raise ValueError( + "There must be the same number of `offset_col`s and `pseudotime_col`s specified. Got " + f"{len(offset_col)} and {len(pseudotime_col)}, respectively." + ) + + def fn(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: + f"""Takes the {table_name} table and converts it to a form that includes pseudo-timestamps. + + The output of this process is ultimately converted to events via the `{table_name}` key in the + `configs/event_configs.yaml` file. 
+ """ + pseudotimes = [ + (pl.col("firstadmittedattimestamp") + pl.duration(milliseconds=pl.col(offset))).alias(pseudotime) + for pseudotime, offset in zip(pseudotime_col, offset_col) + ] + + if warning_items: + warning_lines = [ + f"NOT SURE ABOUT THE FOLLOWING for {table_name} table. Check with the AUMCdb team:", + *(f" - {item}" for item in warning_items), + ] + logger.warning("\n".join(warning_lines)) + + return df.join(patient_df, on=ADMISSION_ID, how="inner").select( + PATIENT_ID, + ADMISSION_ID, + *pseudotimes, + *output_data_cols, + ) + + return fn + + +@hydra.main(version_base=None, config_path="configs", config_name="pre_MEDS") +def main(cfg: DictConfig): + """Performs pre-MEDS data wrangling for AUMCdb. + + + """ + + hydra_loguru_init() + + table_preprocessors_config_fp = Path("./AUMC_Example/configs/table_preprocessors.yaml") + logger.info(f"Loading table preprocessors from {str(table_preprocessors_config_fp.resolve())}...") + preprocessors = OmegaConf.load(table_preprocessors_config_fp) + functions = {} + for table_name, preprocessor_cfg in preprocessors.items(): + logger.info(f" Adding preprocessor for {table_name}:\n{OmegaConf.to_yaml(preprocessor_cfg)}") + functions[table_name] = join_and_get_pseudotime_fntr(table_name=table_name, **preprocessor_cfg) + + raw_cohort_dir = Path(cfg.raw_cohort_dir) + MEDS_input_dir = Path(cfg.output_dir) + + patient_out_fp = MEDS_input_dir / "patient.parquet" + link_out_fp = MEDS_input_dir / "link_patient_to_admission.parquet" + + if patient_out_fp.is_file(): + logger.info(f"Reloading processed patient df from {str(patient_out_fp.resolve())}") + patient_df = pl.read_parquet(patient_out_fp, use_pyarrow=True).lazy() + link_df = pl.read_parquet(link_out_fp, use_pyarrow=True).lazy() + else: + logger.info("Processing patient table first...") + + admissions_fp = raw_cohort_dir / "admissions.csv" + logger.info(f"Loading {str(admissions_fp.resolve())}...") + raw_admissions_df = load_raw_aumc_file(admissions_fp) + + logger.info("Processing patient table...") + patient_df, link_df = process_patient_and_admissions(raw_admissions_df) + write_lazyframe(patient_df, patient_out_fp) + write_lazyframe(link_df, link_out_fp) + + patient_df = patient_df.join(link_df, on=PATIENT_ID) + + all_fps = [fp for fp in raw_cohort_dir.glob("*.csv")] + + unused_tables = {} + + for in_fp in all_fps: + pfx = get_shard_prefix(raw_cohort_dir, in_fp) + if pfx in unused_tables: + logger.warning(f"Skipping {pfx} as it is not supported in this pipeline.") + continue + elif pfx not in functions: + logger.warning(f"No function needed for {pfx}. For AUMCdb, THIS IS UNEXPECTED") + continue + + out_fp = MEDS_input_dir / f"{pfx}.parquet" + + if out_fp.is_file(): + print(f"Done with {pfx}. Continuing") + continue + + out_fp.parent.mkdir(parents=True, exist_ok=True) + + fn = functions[pfx] + + st = datetime.now() + logger.info(f"Processing {pfx}...") + df = load_raw_aumc_file(in_fp) + processed_df = fn(df, patient_df) + processed_df.sink_parquet(out_fp) + logger.info(f" * Processed and wrote to {str(out_fp.resolve())} in {datetime.now() - st}") + + logger.info(f"Done! 
All dataframes processed and written to {str(MEDS_input_dir.resolve())}") + + +if __name__ == "__main__": + main() From 4c6a069605f1993c9848124f00cea6f937e65df8 Mon Sep 17 00:00:00 2001 From: prockenschaub Date: Mon, 5 Aug 2024 13:15:22 +0200 Subject: [PATCH 02/76] update aumc to meds==0.3 and MEDS-transforms==0.0.2 --- AUMC_Example/configs/event_configs.yaml | 32 +++++++++---------- AUMC_Example/configs/table_preprocessors.yaml | 32 +++++++++---------- AUMC_Example/pre_MEDS.py | 4 +-- 3 files changed, 34 insertions(+), 34 deletions(-) diff --git a/AUMC_Example/configs/event_configs.yaml b/AUMC_Example/configs/event_configs.yaml index c44bceae..bd4b8a1b 100644 --- a/AUMC_Example/configs/event_configs.yaml +++ b/AUMC_Example/configs/event_configs.yaml @@ -3,10 +3,10 @@ patient_id_col: patientid patient: dob: code: "DOB" - timestamp: col(dateofbirth) + time: col(dateofbirth) gender: code: ["GENDER", "col(gender)"] - timestamp: null + time: null admissions: icu_admission: @@ -16,24 +16,24 @@ admissions: - col(urgency) - col(origin) - col(specialty) - timestamp: col(admittedattimestamp) + time: col(admittedattime) icu_discharge: code: - "ICU_DISCHARGE" - col(destination) - timestamp: col(dischargedattimestamp) + time: col(dischargedattime) weight: code: - "WEIGHT_AT_ADMISSION" - col(weightsource) - col(weightgroup) - timestamp: col(admittedattimestamp) + time: col(admittedattime) height: code: - "HEIGHT_AT_ADMISSION" - col(heightsource) - col(heightgroup) - timestamp: col(admittedattimestamp) + time: col(admittedattime) numericitems: event: @@ -41,7 +41,7 @@ numericitems: - MEASURE - col(item) - col(unit) - timestamp: col(measuredattimestamp) + time: col(measuredattime) numerical_value: value listitems: @@ -51,7 +51,7 @@ listitems: - col(item) - col(islabresult) - col(value) - timestamp: col(measuredattimestamp) + time: col(measuredattime) freetextitems: event: @@ -59,7 +59,7 @@ freetextitems: - MEASURE - col(item) - col(islabresult) - timestamp: col(measuredattimestamp) + time: col(measuredattime) text_value: value procedureorderitems: @@ -68,7 +68,7 @@ procedureorderitems: - PROCEDURE - col(ordercategoryname) - col(item) - timestamp: col(registeredattimestamp) + time: col(registeredattime) processitems: start: @@ -76,13 +76,13 @@ processitems: - PROCESS - START - col(item) - timestamp: col(starttimestamp) + time: col(starttime) end: code: - PROCESS - END - col(item) - timestamp: col(stoptimestamp) + time: col(stoptime) drugitems: start: @@ -92,7 +92,7 @@ drugitems: - col(ordercategory) - col(item) - col(action) - timestamp: col(starttimestamp) + time: col(starttime) rate: code: - DRUG @@ -100,7 +100,7 @@ drugitems: - col(ordercategory) - col(item) - col(rateunit) - timestamp: col(starttimestamp) + time: col(starttime) numerical_value: col(rate) dose: code: @@ -109,7 +109,7 @@ drugitems: - col(ordercategory) - col(item) - col(doseunit) - timestamp: col(starttimestamp) + time: col(starttime) numerical_value: col(dose) end: code: @@ -117,4 +117,4 @@ drugitems: - END - col(ordercategory) - col(item) - timestamp: col(stoptimestamp) + time: col(stoptime) diff --git a/AUMC_Example/configs/table_preprocessors.yaml b/AUMC_Example/configs/table_preprocessors.yaml index 777410ee..048f758e 100644 --- a/AUMC_Example/configs/table_preprocessors.yaml +++ b/AUMC_Example/configs/table_preprocessors.yaml @@ -3,8 +3,8 @@ admissions: - "admittedat" - "dischargedat" pseudotime_col: - - "admittedattimestamp" - - "dischargedattimestamp" + - "admittedattime" + - "dischargedattime" output_data_cols: - "location" - 
"urgency" @@ -22,9 +22,9 @@ numericitems: - "registeredat" - "updatedat" pseudotime_col: - - "measuredattimestamp" - - "registeredattimestamp" - - "updatedattimestamp" + - "measuredattime" + - "registeredattime" + - "updatedattime" output_data_cols: - "item" - "value" @@ -40,9 +40,9 @@ listitems: - "registeredat" - "updatedat" pseudotime_col: - - "measuredattimestamp" - - "registeredattimestamp" - - "updatedattimestamp" + - "measuredattime" + - "registeredattime" + - "updatedattime" output_data_cols: - "item" - "value" @@ -58,9 +58,9 @@ freetextitems: - "registeredat" - "updatedat" pseudotime_col: - - "measuredattimestamp" - - "registeredattimestamp" - - "updatedattimestamp" + - "measuredattime" + - "registeredattime" + - "updatedattime" output_data_cols: - "item" - "value" @@ -76,8 +76,8 @@ drugitems: - "start" - "stop" pseudotime_col: - - "starttimestamp" - - "stoptimestamp" + - "starttime" + - "stoptime" output_data_cols: - "orderid" - "ordercategory" @@ -98,7 +98,7 @@ drugitems: procedureorderitems: offset_col: "registeredat" - pseudotime_col: "registeredattimestamp" + pseudotime_col: "registeredattime" output_data_cols: - "orderid" - "ordercategoryname" @@ -110,7 +110,7 @@ processitems: - "start" - "stop" pseudotime_col: - - "starttimestamp" - - "stoptimestamp" + - "starttime" + - "stoptime" output_data_cols: - "item" \ No newline at end of file diff --git a/AUMC_Example/pre_MEDS.py b/AUMC_Example/pre_MEDS.py index 09e6ccd2..addb5c0e 100755 --- a/AUMC_Example/pre_MEDS.py +++ b/AUMC_Example/pre_MEDS.py @@ -66,7 +66,7 @@ def process_patient_and_admissions(df: pl.LazyFrame) -> pl.LazyFrame: PATIENT_ID, pseudo_date_of_birth.alias("dateofbirth"), "gender", - origin_pseudotime.alias("firstadmittedattimestamp"), + origin_pseudotime.alias("firstadmittedattime"), ), df.select(PATIENT_ID, ADMISSION_ID) @@ -105,7 +105,7 @@ def fn(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: `configs/event_configs.yaml` file. """ pseudotimes = [ - (pl.col("firstadmittedattimestamp") + pl.duration(milliseconds=pl.col(offset))).alias(pseudotime) + (pl.col("firstadmittedattime") + pl.duration(milliseconds=pl.col(offset))).alias(pseudotime) for pseudotime, offset in zip(pseudotime_col, offset_col) ] From 1ab62b2095a1972d9fefe5db1de41c1e59ad63aa Mon Sep 17 00:00:00 2001 From: rvandewater Date: Tue, 6 Aug 2024 11:10:02 +0200 Subject: [PATCH 03/76] New readme (WIP) --- AUMC_Example/README.md | 157 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 AUMC_Example/README.md diff --git a/AUMC_Example/README.md b/AUMC_Example/README.md new file mode 100644 index 00000000..235e835a --- /dev/null +++ b/AUMC_Example/README.md @@ -0,0 +1,157 @@ +# AUMC Example + +This is an example of how to extract a MEDS dataset from AUMC. All scripts in this README are assumed to +be run **not** from this directory but from the root directory of this entire repository (e.g., one directory +up from this one). 
+ +## Step 0: Installation + +Download this repository and install the requirements: +If you want to install via pypi, (note that for now, you still need to copy some files locally even with a +pypi installation, which is covered below, so make sure you are in a suitable directory) use: + +```bash +conda create -n MEDS python=3.12 +conda activate MEDS +pip install MEDS_transforms[examples,local_parallelism] +mkdir AUMC_Example +cd AUMC_Example +wget https://raw.githubusercontent.com/mmcdermott/MEDS_transforms/main/AUMC_Example/joint_script.sh +wget https://raw.githubusercontent.com/mmcdermott/MEDS_transforms/main/AUMC_Example/joint_script_slurm.sh +wget https://raw.githubusercontent.com/mmcdermott/MEDS_transforms/main/AUMC_Example/pre_MEDS.py +chmod +x joint_script.sh +chmod +x joint_script_slurm.sh +chmod +x pre_MEDS.py +cd .. +``` + +If you want to install locally, use: + +```bash +git clone git@github.com:mmcdermott/MEDS_transforms.git +cd MEDS_transforms +conda create -n MEDS python=3.12 +conda activate MEDS +pip install .[examples,local_parallelism] +``` + +## Step 1: Download AUMC + +Download the AUMC dataset from following the instructions on https://github.com/AmsterdamUMC/AmsterdamUMCdb?tab=readme-ov-file. You will need the raw `.csv` files for this example. We will use `$MIMICIV_RAW_DIR` to denote +the root directory of where the resulting _core data files_ are stored -- e.g., there should be a `hosp` and +`icu` subdirectory of `$MIMICIV_RAW_DIR`. + +## Step 1.5: Download MIMIC-IV Metadata files + +```bash +cd $MIMIC_RAW_DIR +wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/d_labitems_to_loinc.csv +wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/inputevents_to_rxnorm.csv +wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/lab_itemid_to_loinc.csv +wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/meas_chartevents_main.csv +wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/meas_chartevents_value.csv +wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/numerics-summary.csv +wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/outputevents_to_loinc.csv +wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/proc_datetimeevents.csv +wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/proc_itemid.csv +wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/waveforms-summary.csv +``` + +## Step 2: Run the basic MEDS ETL + +This step contains several sub-steps; luckily, all these substeps can be run via a single script, with the +`joint_script.sh` script which uses the Hydra `joblib` launcher to run things with local parallelism (make +sure you enable this feature by including the `[local_parallelism]` option during installation) or via +`joint_script_slurm.sh` which uses the Hydra `submitit` launcher to run things through slurm (make sure you +enable this feature by including the `[slurm_parallelism]` option during installation). This script entails +several steps: + +### Step 2.1: Get the data ready for base MEDS extraction + +This is a step in a few parts: + +1. Join a few tables by `hadm_id` to get the right times in the right rows for processing. 
In
+   particular, we need to join:
+   - the `hosp/diagnoses_icd` table with the `hosp/admissions` table to get the `dischtime` for each
+     `hadm_id`.
+   - the `hosp/drgcodes` table with the `hosp/admissions` table to get the `dischtime` for each `hadm_id`.
+2. Convert the patient's static data to a more parseable form. This entails:
+   - Get the patient's DOB in a format that is usable for MEDS, rather than the integral `anchor_year` and
+     `anchor_offset` fields.
+   - Merge the patient's `dod` with the `deathtime` from the `admissions` table.
+
+After these steps, modified files or symlinks to the original files will be written in a new directory which
+will be used as the input to the actual MEDS extraction ETL. We'll use `$AUMC_PREMEDS_DIR` to denote this
+directory.
+
+This step is run in the `joint_script.sh` script or the `joint_script_slurm.sh` script, but in either case the
+base command that is run is as follows (assumed to be run **not** from this directory but from the
+root directory of this repository):
+
+```bash
+export AUMC_RAW_DIR=/path/to/AUMC/raw
+export AUMC_PREMEDS_DIR=/path/to/AUMC/pre_meds
+./AUMC_Example/pre_MEDS.py raw_cohort_dir=$AUMC_RAW_DIR output_dir=$AUMC_PREMEDS_DIR
+```
+
+In practice, on a machine with 150 GB of RAM and 10 cores, this step takes less than 5 minutes in total.
+
+### Step 2.2: Run the MEDS extraction ETL
+
+We will assume you want to output the final MEDS dataset into a directory we'll denote as `$AUMC_MEDS_DIR`.
+Note this is a different directory than the pre-MEDS directory (though, of course, they can both be
+subdirectories of the same root directory).
+
+This is a step in 4 parts:
+
+1. Sub-shard the raw files. Run this command as many times simultaneously as you would like to have workers
+   performing this sub-sharding step. See below for how to automate this parallelism using hydra launchers.
+
+   This step uses the `./scripts/extraction/shard_events.py` script. See `joint_script*.sh` for the expected
+   format of the command.
+
+2. Extract and form the patient splits and sub-shards. The `./scripts/extraction/split_and_shard_patients.py`
+   script is used for this step. See `joint_script*.sh` for the expected format of the command.
+
+3. Extract patient sub-shards and convert to MEDS events. The
+   `./scripts/extraction/convert_to_sharded_events.py` script is used for this step. See `joint_script*.sh` for
+   the expected format of the command.
+
+4. Merge the MEDS events into a single file per patient sub-shard. The
+   `./scripts/extraction/merge_to_MEDS_cohort.py` script is used for this step. See `joint_script*.sh` for the
+   expected format of the command.
+
+5. (Optional) Generate preliminary code statistics and merge to external metadata. This is not performed
+   currently in the `joint_script*.sh` scripts.
+
+## Limitations / TO-DOs:
+
+Currently, some tables are ignored, including:
+
+1. `hosp/emar_detail`
+2. `hosp/microbiologyevents`
+3. `hosp/services`
+4. `icu/datetimeevents`
+5. `icu/ingredientevents`
+
+Lots of questions remain about how to appropriately handle times of the data -- e.g., things like HCPCS
+events are stored at the level of the _date_, not the _datetime_. How should those be slotted into the
+timeline which is otherwise stored at the _datetime_ resolution?
+
+Other questions:
+
+1. How to handle merging the deathtimes between the hosp table and the patients table?
+2. How to handle the dob nonsense MIMIC has?
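+
+For reference, the sketch below mirrors how the `pre_MEDS.py` script in this directory derives a pseudo
+date of birth from AUMCdb's coarse `agegroup` and `admissionyeargroup` columns (the column names and
+regexes come from that script; the input rows here are invented purely for illustration):
+
+```python
+import polars as pl
+
+df = pl.DataFrame({"agegroup": ["18-39", "70-79"], "admissionyeargroup": ["2003-2009", "2010-2016"]})
+origin = pl.datetime(
+    year=pl.col("admissionyeargroup").str.extract(r"(2003|2010)").cast(pl.Int32), month=1, day=1
+)
+age_in_years = (
+    (
+        pl.col("agegroup").str.extract(r"(\d{2}).?$").cast(pl.Int32)
+        + pl.col("agegroup").str.extract(r"^(\d{2})").cast(pl.Int32)
+    )
+    / 2
+).ceil()
+# Assume birth at the midpoint of the inferred year, exactly as pre_MEDS.py does.
+dob = origin - pl.duration(days=age_in_years * 365.25 - 365.25 / 2)
+print(df.select(dob.alias("pseudo_dateofbirth")))
+```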
+
+## Notes
+
+Note: If you use the slurm system and you launch the hydra submitit jobs from an interactive slurm node, you
+may need to run `unset SLURM_CPU_BIND` in your terminal first to avoid errors.
+
+## Future Work
+
+### Pre-MEDS Processing
+
+If you wanted, some other processing could also be done here, such as:
+
+1. Converting the patient's dynamically recorded race into a static, most commonly recorded race field.

From e5a76220725653974bfbbe4de8fadd48423391d5 Mon Sep 17 00:00:00 2001
From: rvandewater
Date: Tue, 6 Aug 2024 11:42:44 +0200
Subject: [PATCH 04/76] small corrections

---
 AUMC_Example/README.md | 29 +++--------------------------
 1 file changed, 3 insertions(+), 26 deletions(-)

diff --git a/AUMC_Example/README.md b/AUMC_Example/README.md
index 235e835a..895e6d15 100644
--- a/AUMC_Example/README.md
+++ b/AUMC_Example/README.md
@@ -1,6 +1,6 @@
 # AUMC Example
 
-This is an example of how to extract a MEDS dataset from AUMC. All scripts in this README are assumed to
+This is an example of how to extract a MEDS dataset from AUMCdb (https://github.com/AmsterdamUMC/AmsterdamUMCdb). All scripts in this README are assumed to
 be run **not** from this directory but from the root directory of this entire repository (e.g., one directory
 up from this one).
 
@@ -37,25 +37,7 @@ pip install .[examples,local_parallelism]
 
 ## Step 1: Download AUMC
 
-Download the AUMC dataset from following the instructions on https://github.com/AmsterdamUMC/AmsterdamUMCdb?tab=readme-ov-file. You will need the raw `.csv` files for this example. We will use `$MIMICIV_RAW_DIR` to denote
-the root directory of where the resulting _core data files_ are stored -- e.g., there should be a `hosp` and
-`icu` subdirectory of `$MIMICIV_RAW_DIR`.
-
-## Step 1.5: Download MIMIC-IV Metadata files
-
-```bash
-cd $MIMIC_RAW_DIR
-wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/d_labitems_to_loinc.csv
-wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/inputevents_to_rxnorm.csv
-wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/lab_itemid_to_loinc.csv
-wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/meas_chartevents_main.csv
-wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/meas_chartevents_value.csv
-wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/numerics-summary.csv
-wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/outputevents_to_loinc.csv
-wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/proc_datetimeevents.csv
-wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/proc_itemid.csv
-wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/waveforms-summary.csv
-```
+Download the AUMC dataset by following the instructions on https://github.com/AmsterdamUMC/AmsterdamUMCdb?tab=readme-ov-file. You will need the raw `.csv` files for this example. We will use `$AUMC_RAW_DIR` to denote the root directory of where the resulting _core data files_ are stored.
 
 ## Step 2: Run the basic MEDS ETL
 
@@ -128,11 +110,7 @@ This is a step in 4 parts:
 
 Currently, some tables are ignored, including:
 
-1. `hosp/emar_detail`
-2. `hosp/microbiologyevents`
-3. `hosp/services`
-4. `icu/datetimeevents`
-5. 
`icu/ingredientevents` +INSERT TABLES IGNORED HERE Lots of questions remain about how to appropriately handle times of the data -- e.g., things like HCPCS events are stored at the level of the _date_, not the _datetime_. How should those be slotted into the @@ -154,4 +132,3 @@ may need to run `unset SLURM_CPU_BIND` in your terminal first to avoid errors. If you wanted, some other processing could also be done here, such as: -1. Converting the patient's dynamically recorded race into a static, most commonly recorded race field. From ee79278be1fb0192868cdd339d397013754fcce1 Mon Sep 17 00:00:00 2001 From: rvandewater Date: Wed, 7 Aug 2024 11:30:03 +0200 Subject: [PATCH 05/76] rootutils needed in pre_MEDS.py for AUMC --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e512e7e5..8bf85a7c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ classifiers = [ "Operating System :: OS Independent", ] dependencies = [ - "polars~=1.1.0", "pyarrow", "nested_ragged_tensors", "loguru", "hydra-core", "numpy", "meds==0.3", + "polars~=1.1.0", "pyarrow", "nested_ragged_tensors", "loguru", "hydra-core", "numpy", "meds==0.3", "rootutils" ] [tool.setuptools_scm] From e427579c3baca1bc5359ce7a9f02b05f117dce84 Mon Sep 17 00:00:00 2001 From: prockenschaub Date: Wed, 7 Aug 2024 12:03:27 +0200 Subject: [PATCH 06/76] remove agg meta data and add missing finalize --- AUMC_Example/joint_script.sh | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/AUMC_Example/joint_script.sh b/AUMC_Example/joint_script.sh index a710b0a0..28bbe855 100755 --- a/AUMC_Example/joint_script.sh +++ b/AUMC_Example/joint_script.sh @@ -89,13 +89,14 @@ MEDS_extract-merge_to_MEDS_cohort \ stage="merge_to_MEDS_cohort" \ event_conversion_config_fp=./AUMC_Example/configs/event_configs.yaml "$@" -echo "Aggregating initial code stats with $N_PARALLEL_WORKERS workers in parallel" -MEDS_transform-aggregate_code_metadata \ - --config-name="extract" \ +echo "Finalizing MEDS data with $N_PARALLEL_WORKERS workers in parallel" +MEDS_extract-finalize_MEDS_data \ --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - input_dir="$AUMC_PREMEDS_DIR" \ + input_dir="$AUMC_RAW_DIR" \ cohort_dir="$AUMC_MEDS_DIR" \ - stage="aggregate_code_metadata" \ + stage="finalize_MEDS_data" \ + etl_metadata.dataset_name="AUMCdb" \ + etl_metadata.dataset_version="1.0.2" \ event_conversion_config_fp=./AUMC_Example/configs/event_configs.yaml "$@" \ No newline at end of file From 5ba4126c4715381b6402f2d62e7456c7c530d527 Mon Sep 17 00:00:00 2001 From: prockenschaub Date: Fri, 23 Aug 2024 14:13:38 +0200 Subject: [PATCH 07/76] add missing (derived) dateofdeath --- AUMC_Example/configs/event_configs.yaml | 3 +++ AUMC_Example/pre_MEDS.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/AUMC_Example/configs/event_configs.yaml b/AUMC_Example/configs/event_configs.yaml index bd4b8a1b..c828f2d5 100644 --- a/AUMC_Example/configs/event_configs.yaml +++ b/AUMC_Example/configs/event_configs.yaml @@ -7,6 +7,9 @@ patient: gender: code: ["GENDER", "col(gender)"] time: null + dod: + code: "DEATH" + time: col(dateofdeath) admissions: icu_admission: diff --git a/AUMC_Example/pre_MEDS.py b/AUMC_Example/pre_MEDS.py index addb5c0e..9212d97f 100755 --- a/AUMC_Example/pre_MEDS.py +++ b/AUMC_Example/pre_MEDS.py @@ -61,12 +61,15 @@ def process_patient_and_admissions(df: pl.LazyFrame) -> pl.LazyFrame: age_in_days = age_in_years * 365.25 # We assume that the patient was born 
at the midpoint of the year as we don't know the actual birthdate pseudo_date_of_birth = origin_pseudotime - pl.duration(days=(age_in_days - 365.25 / 2)) + pseudo_date_of_death = origin_pseudotime + pl.duration(milliseconds=pl.col("dateofdeath")) + return df.filter(pl.col("admissioncount") == 1).select( PATIENT_ID, pseudo_date_of_birth.alias("dateofbirth"), "gender", origin_pseudotime.alias("firstadmittedattime"), + pseudo_date_of_death.alias("dateofdeath") ), df.select(PATIENT_ID, ADMISSION_ID) From 6a5a47eadaeab03804b0da8f9e81c5d0257d6bc5 Mon Sep 17 00:00:00 2001 From: rvandewater Date: Thu, 29 Aug 2024 10:39:25 +0200 Subject: [PATCH 08/76] numeric items --- AUMC_Example/configs/event_configs.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/AUMC_Example/configs/event_configs.yaml b/AUMC_Example/configs/event_configs.yaml index c828f2d5..b7a24e02 100644 --- a/AUMC_Example/configs/event_configs.yaml +++ b/AUMC_Example/configs/event_configs.yaml @@ -45,7 +45,7 @@ numericitems: - col(item) - col(unit) time: col(measuredattime) - numerical_value: value + numeric_value: value listitems: event: @@ -104,7 +104,7 @@ drugitems: - col(item) - col(rateunit) time: col(starttime) - numerical_value: col(rate) + numeric_value: col(rate) dose: code: - DRUG @@ -113,7 +113,7 @@ drugitems: - col(item) - col(doseunit) time: col(starttime) - numerical_value: col(dose) + numeric_value: col(dose) end: code: - DRUG From 6bd54c3426f561fa8e3b3d5fdf916d264f4aaba3 Mon Sep 17 00:00:00 2001 From: rvandewater Date: Mon, 16 Sep 2024 14:53:13 +0200 Subject: [PATCH 09/76] Changed sharding to more reasonable amount --- AUMC_Example/joint_script.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/AUMC_Example/joint_script.sh b/AUMC_Example/joint_script.sh index 28bbe855..a9e2992b 100755 --- a/AUMC_Example/joint_script.sh +++ b/AUMC_Example/joint_script.sh @@ -66,7 +66,7 @@ MEDS_extract-split_and_shard_patients \ input_dir="$AUMC_PREMEDS_DIR" \ cohort_dir="$AUMC_MEDS_DIR" \ stage="split_and_shard_patients" \ - stage_configs.split_and_shard_patients.n_patients_per_shard=10000 \ + stage_configs.split_and_shard_patients.n_patients_per_shard=1000 \ event_conversion_config_fp=./AUMC_Example/configs/event_configs.yaml "$@" echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" @@ -99,4 +99,4 @@ MEDS_extract-finalize_MEDS_data \ stage="finalize_MEDS_data" \ etl_metadata.dataset_name="AUMCdb" \ etl_metadata.dataset_version="1.0.2" \ - event_conversion_config_fp=./AUMC_Example/configs/event_configs.yaml "$@" \ No newline at end of file + event_conversion_config_fp=./AUMC_Example/configs/event_configs.yaml "$@" From 1b81003fd016ba06f5e1c237b66f6f37acc19457 Mon Sep 17 00:00:00 2001 From: rvandewater Date: Mon, 16 Sep 2024 15:35:26 +0200 Subject: [PATCH 10/76] Initial streamline with main --- AUMC_Example/configs/event_configs.yaml | 2 +- AUMC_Example/configs/pre_MEDS.yaml | 4 +- AUMC_Example/local_parallelism_runner.yaml | 3 + AUMC_Example/run.sh | 105 +++++++++++++++++++++ AUMC_Example/slurm_runner.yaml | 61 ++++++++++++ 5 files changed, 172 insertions(+), 3 deletions(-) create mode 100644 AUMC_Example/local_parallelism_runner.yaml create mode 100755 AUMC_Example/run.sh create mode 100644 AUMC_Example/slurm_runner.yaml diff --git a/AUMC_Example/configs/event_configs.yaml b/AUMC_Example/configs/event_configs.yaml index b7a24e02..7abe661b 100644 --- a/AUMC_Example/configs/event_configs.yaml +++ b/AUMC_Example/configs/event_configs.yaml @@ -1,4 +1,4 @@ 
-patient_id_col: patientid
+patient_id_col: subject_id
 
 patient:
   dob:
diff --git a/AUMC_Example/configs/pre_MEDS.yaml b/AUMC_Example/configs/pre_MEDS.yaml
index b5cfa4cb..524d63f8 100644
--- a/AUMC_Example/configs/pre_MEDS.yaml
+++ b/AUMC_Example/configs/pre_MEDS.yaml
@@ -1,5 +1,5 @@
-raw_cohort_dir: ???
-output_dir: ???
+input_dir: ${oc.env:AUMC_RAW_DIR}
+cohort_dir: ${oc.env:AUMC_PRE_MEDS_DIR}
 
 # Hydra
 hydra:
diff --git a/AUMC_Example/local_parallelism_runner.yaml b/AUMC_Example/local_parallelism_runner.yaml
new file mode 100644
index 00000000..a1d9a6c1
--- /dev/null
+++ b/AUMC_Example/local_parallelism_runner.yaml
@@ -0,0 +1,3 @@
+parallelize:
+  n_workers: ${oc.env:N_WORKERS}
+  launcher: "joblib"
diff --git a/AUMC_Example/run.sh b/AUMC_Example/run.sh
new file mode 100755
index 00000000..9c06c7e9
--- /dev/null
+++ b/AUMC_Example/run.sh
@@ -0,0 +1,105 @@
+#!/usr/bin/env bash
+
+# This makes the script fail if any internal script fails
+set -e
+
+# Function to display help message
+function display_help() {
+    echo "Usage: $0 <MIMICIV_RAW_DIR> <MIMICIV_PREMEDS_DIR> <MIMICIV_MEDS_DIR> [do_unzip=true|do_unzip=false]"
+    echo
+    echo "This script processes MIMIC-IV data through several steps, handling raw data conversion,"
+    echo "sharding events, splitting subjects, converting to sharded events, and merging into a MEDS cohort."
+    echo
+    echo "Arguments:"
+    echo "  MIMICIV_RAW_DIR                             Directory containing raw MIMIC-IV data files."
+    echo "  MIMICIV_PREMEDS_DIR                         Output directory for pre-MEDS data."
+    echo "  MIMICIV_MEDS_DIR                            Output directory for processed MEDS data."
+    echo "  (OPTIONAL) do_unzip=true OR do_unzip=false  Optional flag to unzip files before processing."
+    echo
+    echo "Options:"
+    echo "  -h, --help     Display this help message and exit."
+    exit 1
+}
+
+echo "Unsetting SLURM_CPU_BIND in case you're running this on a slurm interactive node with slurm parallelism"
+unset SLURM_CPU_BIND
+
+# Check if the first parameter is '-h' or '--help'
+if [[ "$1" == "-h" || "$1" == "--help" ]]; then
+    display_help
+fi
+
+# Check for mandatory parameters
+if [ "$#" -lt 3 ]; then
+    echo "Error: Incorrect number of arguments provided."
+    display_help
+fi
+
+export MIMICIV_RAW_DIR=$1
+export MIMICIV_PRE_MEDS_DIR=$2
+export MIMICIV_MEDS_COHORT_DIR=$3
+shift 3
+
+# Defaults
+_DO_UNZIP_ARG_STR=""
+
+if [ $# -ge 1 ]; then
+    case "$1" in
+        do_unzip=*)
+            _DO_UNZIP_ARG_STR="$1"
+            shift 1
+            ;;
+    esac
+fi
+
+DO_UNZIP="false"
+
+if [ -n "$_DO_UNZIP_ARG_STR" ]; then
+    case "$_DO_UNZIP_ARG_STR" in
+        do_unzip=true)
+            DO_UNZIP="true"
+            ;;
+        do_unzip=false)
+            DO_UNZIP="false"
+            ;;
+        *)
+            echo "Error: Invalid do_unzip value. Use 'do_unzip=true' or 'do_unzip=false'."
+            exit 1
+            ;;
+    esac
+    echo "Setting DO_UNZIP=$DO_UNZIP"
+fi
+
+# TODO: Add wget blocks once testing is validated.
+
+EVENT_CONVERSION_CONFIG_FP="$(pwd)/configs/event_configs.yaml"
+PIPELINE_CONFIG_FP="$(pwd)/configs/extract_MIMIC.yaml"
+PRE_MEDS_PY_FP="$(pwd)/pre_MEDS.py"
+
+# We export these variables separately from their assignment so that any errors during assignment are caught.
+export EVENT_CONVERSION_CONFIG_FP
+export PIPELINE_CONFIG_FP
+export PRE_MEDS_PY_FP
+
+if [ "$DO_UNZIP" == "true" ]; then
+    GZ_FILES="${MIMICIV_RAW_DIR}/*/*.csv.gz"
+    if compgen -G "$GZ_FILES" > /dev/null; then
+        echo "Unzipping csv.gz files matching $GZ_FILES."
+        for file in $GZ_FILES; do gzip -d --force "$file"; done
+    else
+        echo "No csv.gz files to unzip at $GZ_FILES."
+    fi
+else
+    echo "Skipping unzipping."
+fi
+
+echo "Running pre-MEDS conversion."
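+# The explicit input_dir / cohort_dir arguments below override the env-var-driven defaults
+# (AUMC_RAW_DIR / AUMC_PRE_MEDS_DIR) declared in configs/pre_MEDS.yaml.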
+python "$PRE_MEDS_PY_FP" input_dir="$MIMICIV_RAW_DIR" cohort_dir="$MIMICIV_PRE_MEDS_DIR" + +if [ -z "$N_WORKERS" ]; then + echo "Setting N_WORKERS to 1 to avoid issues with the runners." + export N_WORKERS="1" +fi + +echo "Running extraction pipeline." +MEDS_transform-runner "pipeline_config_fp=$PIPELINE_CONFIG_FP" "$@" diff --git a/AUMC_Example/slurm_runner.yaml b/AUMC_Example/slurm_runner.yaml new file mode 100644 index 00000000..4dbed261 --- /dev/null +++ b/AUMC_Example/slurm_runner.yaml @@ -0,0 +1,61 @@ +parallelize: + n_workers: ${oc.env:N_WORKERS} + launcher: "submitit_slurm" + +shard_events: + parallelize: + launcher_params: + timeout_min: 50 + cpus_per_task: 10 + mem_gb: 40 + partition: "short" + +split_and_shard_subjects: + parallelize: + n_workers: 1 + launcher_params: + timeout_min: 10 + cpus_per_task: 10 + mem_gb: 7 + partition: "short" + +convert_to_sharded_events: + parallelize: + launcher_params: + timeout_min: 10 + cpus_per_task: 10 + mem_gb: 25 + partition: "short" + +merge_to_MEDS_cohort: + parallelize: + launcher_params: + timeout_min: 15 + cpus_per_task: 10 + mem_gb: 85 + partition: "short" + +extract_code_metadata: + parallelize: + launcher_params: + timeout_min: 10 + cpus_per_task: 10 + mem_gb: 25 + partition: "short" + +finalize_MEDS_metadata: + parallelize: + n_workers: 1 + launcher_params: + timeout_min: 10 + cpus_per_task: 5 + mem_gb: 10 + partition: "short" + +finalize_MEDS_data: + parallelize: + launcher_params: + timeout_min: 10 + cpus_per_task: 10 + mem_gb: 70 + partition: "short" From dc30f461ea979cc2bdc12802e56d26c9013e7eaf Mon Sep 17 00:00:00 2001 From: rvandewater Date: Wed, 18 Sep 2024 13:07:28 +0200 Subject: [PATCH 11/76] MIMIC changes --- MIMIC-IV_Example/README.md | 12 +++++++----- MIMIC-IV_Example/run.sh | 6 ++++++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/MIMIC-IV_Example/README.md b/MIMIC-IV_Example/README.md index dbfebf9e..516a16d6 100644 --- a/MIMIC-IV_Example/README.md +++ b/MIMIC-IV_Example/README.md @@ -18,8 +18,8 @@ If you want to profile the time and memory costs of your ETL, also install: `pip Set some environment variables and download the necessary files: ```bash export MIMICIV_RAW_DIR=??? # set to the directory in which you want to store the raw MIMIC-IV data -export MIMICIV_PRE_MEDS_DIR=??? # set to the directory in which you want to store the raw MIMIC-IV data -export MIMICIV_MEDS_COHORT_DIR=??? # set to the directory in which you want to store the raw MIMIC-IV data +export MIMICIV_PRE_MEDS_DIR=??? # set to the directory in which you want to store the intermediate MEDS MIMIC-IV data +export MIMICIV_MEDS_COHORT_DIR=??? 
# set to the directory in which you want to store the final MEDS MIMIC-IV data export VERSION=0.0.6 # or whatever version you want export URL="https://raw.githubusercontent.com/mmcdermott/MEDS_transforms/$VERSION/MIMIC-IV_Example" @@ -46,7 +46,7 @@ the root directory of where the resulting _core data files_ are stored -- e.g., ## Step 1.5: Download MIMIC-IV Metadata files ```bash -cd $MIMIC_RAW_DIR +cd $MIMICIV_RAW_DIR export MIMIC_URL=https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map wget $MIMIC_URL/d_labitems_to_loinc.csv wget $MIMIC_URL/inputevents_to_rxnorm.csv @@ -65,9 +65,11 @@ wget $MIMIC_URL/waveforms-summary.csv To run the MEDS ETL, run the following command: ```bash -./run.sh $MIMICIV_RAW_DIR $MIMICIV_PRE_MEDS_DIR $MIMICIV_MEDS_DIR do_unzip=true +./run.sh $MIMICIV_RAW_DIR $MIMICIV_PRE_MEDS_DIR $MIMICIV_MEDS_COHORT_DIR do_unzip=true ``` - +> [!NOTE] +> This can take up large amounts of memory if not parallelized. You can reduce the shard size to reduce memory usage by setting the `shard_size` parameter in the `extract_MIMIC.yaml` file. +> Check that your environment variables are set correctly. To not unzip the `.csv.gz` files, set `do_unzip=false` instead of `do_unzip=true`. To use a specific stage runner file (e.g., to set different parallelism options), you can specify it as an diff --git a/MIMIC-IV_Example/run.sh b/MIMIC-IV_Example/run.sh index 9c06c7e9..e5c0513b 100755 --- a/MIMIC-IV_Example/run.sh +++ b/MIMIC-IV_Example/run.sh @@ -35,6 +35,12 @@ if [ "$#" -lt 3 ]; then display_help fi +# Check that the do_unzip flag is not set as a positional argument +if [[ "$1" == "do_unzip=true" || "$1" == "do_unzip=false" || "$2" == "do_unzip=true" || "$2" == "do_unzip=false" || "$3" == "do_unzip=true" || "$3" == "do_unzip=false" ]]; then + echo "Error: Incorrect number of arguments provided. Check if your environment variables are set correctly." 
+ display_help +fi + export MIMICIV_RAW_DIR=$1 export MIMICIV_PRE_MEDS_DIR=$2 export MIMICIV_MEDS_COHORT_DIR=$3 From 48e03882fc05cfb495dd5dca3766d9c768c33de8 Mon Sep 17 00:00:00 2001 From: kingrc15 Date: Tue, 15 Oct 2024 15:51:34 -0500 Subject: [PATCH 12/76] corrected timestamps in event_configs --- eICU_Example/configs/event_configs.yaml | 20 +++--- eICU_Example/configs/extract_eICU.yaml | 36 +++++++++++ eICU_Example/run.sh | 84 +++++++++++++++++++++++++ 3 files changed, 130 insertions(+), 10 deletions(-) create mode 100644 eICU_Example/configs/extract_eICU.yaml create mode 100644 eICU_Example/run.sh diff --git a/eICU_Example/configs/event_configs.yaml b/eICU_Example/configs/event_configs.yaml index fb7901cf..0b111615 100644 --- a/eICU_Example/configs/event_configs.yaml +++ b/eICU_Example/configs/event_configs.yaml @@ -21,42 +21,42 @@ patient: - col(hospitalregion) - col(hospitalteachingstatus) - col(hospitalnumbedscategory) - time: col(hospitaladmittime) + time: col(hospitaladmittimestamp) # missing hospitaladmittime hospital_id: "hospitalid" hosp_discharge: code: - "HOSPITAL_DISCHARGE" - col(hospitaldischargestatus) - col(hospitaldischargelocation) - time: col(hospitaldischargetime) + time: col(hospitaldischargetimestamp) # missing hospitaldischargetime unit_admission: code: - "UNIT_ADMISSION" - col(unitadmitsource) - col(unitstaytype) - time: col(unitadmittime) + time: col(unitadmittimestamp) # missing unitadmittime ward_id: "wardid" unit_stay_id: "patientunitstayid" unit_admission_weight: code: - "UNIT_ADMISSION_WEIGHT" - time: col(unitadmittime) + time: col(unitadmittimestamp) # missing unitadmittime numeric_value: "unitadmissionweight" unit_admission_height: code: - "UNIT_ADMISSION_HEIGHT" - time: col(unitadmittime) + time: col(unitadmittimestamp) # missing unitadmittime numeric_value: "unitadmissionheight" unit_discharge: code: - "UNIT_DISCHARGE" - col(unitdischargestatus) - col(unitdischargelocation) - time: col(unitdischargetime) + time: col(unitdischargetimestamp) # missing unitdischargetime unit_discharge_weight: code: - "UNIT_DISCHARGE_WEIGHT" - time: col(unitdischargetime) + time: col(unitdischargetimestamp) # missing unitdischargetime numeric_value: "unitdischargeweight" admissiondx: @@ -153,7 +153,7 @@ medication: - "MEDICATION" - "ORDERED" - col(drugname) - time: col(drugordertime) + time: col(drugordertimestamp) # missing drugordertime medication_id: "medicationid" drug_iv_admixture: "drugivadmixture" dosage: "dosage" @@ -167,14 +167,14 @@ medication: - "MEDICATION" - "STARTED" - col(drugname) - time: col(drugstarttime) + time: col(drugstarttimestamp) # missing drugstarttime medication_id: "medicationid" drug_stopped: code: - "MEDICATION" - "STOPPED" - col(drugname) - time: col(drugstoptime) + time: col(drugstoptimestamp) # missing drugstoptime medication_id: "medicationid" nurseAssessment: diff --git a/eICU_Example/configs/extract_eICU.yaml b/eICU_Example/configs/extract_eICU.yaml new file mode 100644 index 00000000..47350337 --- /dev/null +++ b/eICU_Example/configs/extract_eICU.yaml @@ -0,0 +1,36 @@ +defaults: + - _extract + - _self_ + +description: |- + This pipeline extracts the eICU dataset in longitudinal, sparse form from an input dataset meeting + select criteria and converts them to the flattened, MEDS format. 
You can control the key arguments to this
+  pipeline by setting environment variables:
+  ```bash
+  export EVENT_CONVERSION_CONFIG_FP=# Path to your event conversion config
+  export EICU_PRE_MEDS_DIR=# Path to the output dir of the pre-MEDS step
+  export EICU_MEDS_COHORT_DIR=# Path to where you want the dataset to live
+  ```
+
+# The event conversion configuration file is used throughout the pipeline to define the events to extract.
+event_conversion_config_fp: ${oc.env:EVENT_CONVERSION_CONFIG_FP}
+
+input_dir: ${oc.env:EICU_PRE_MEDS_DIR}
+cohort_dir: ${oc.env:EICU_MEDS_COHORT_DIR}
+
+etl_metadata:
+  dataset_name: eICU
+  dataset_version: 2.0
+
+stage_configs:
+  shard_events:
+    infer_schema_length: 999999999
+
+stages:
+  - shard_events
+  - split_and_shard_subjects
+  - convert_to_sharded_events
+  - merge_to_MEDS_cohort
+  - extract_code_metadata
+  - finalize_MEDS_metadata
+  - finalize_MEDS_data
diff --git a/eICU_Example/run.sh b/eICU_Example/run.sh
new file mode 100644
index 00000000..8eaf5b92
--- /dev/null
+++ b/eICU_Example/run.sh
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+
+# This makes the script fail if any internal script fails
+set -e
+
+# Function to display help message
+function display_help() {
+    echo "Usage: $0 <EICU_RAW_DIR> <EICU_PREMEDS_DIR> <EICU_MEDS_DIR> <N_PARALLEL_WORKERS>"
+    echo
+    echo "This script processes eICU data through several steps, handling raw data conversion,"
+    echo "sharding events, splitting subjects, converting to sharded events, and merging into a MEDS cohort."
+    echo
+    echo "Arguments:"
+    echo "  EICU_RAW_DIR        Directory containing raw eICU data files."
+    echo "  EICU_PREMEDS_DIR    Output directory for pre-MEDS data."
+    echo "  EICU_MEDS_DIR       Output directory for processed MEDS data."
+    echo "  N_PARALLEL_WORKERS  Number of parallel workers for processing."
+    echo
+    echo "Options:"
+    echo "  -h, --help          Display this help message and exit."
+    exit 1
+}
+
+# Check if the first parameter is '-h' or '--help'
+if [[ "$1" == "-h" || "$1" == "--help" ]]; then
+    display_help
+fi
+
+# Check for mandatory parameters
+if [ "$#" -lt 4 ]; then
+    echo "Error: Incorrect number of arguments provided."
+    display_help
+fi
+
+EICU_RAW_DIR="$1"
+EICU_PREMEDS_DIR="$2"
+EICU_MEDS_DIR="$3"
+N_PARALLEL_WORKERS="$4"
+
+shift 4
+
+echo "Note that eICU has a lot more observations per subject than does MIMIC-IV, so to keep to a reasonable "
+echo "memory burden (e.g., < 150GB per worker), you will want a smaller shard size, as well as to turn off "
+echo "the final unique check (which should not be necessary given the structure of eICU and is expensive) "
+echo "in the merge stage. You can do this by setting the following parameters at the end of the mandatory "
+echo "args when running this script:"
+echo "  * stage_configs.split_and_shard_subjects.n_subjects_per_shard=10000"
+echo "  * stage_configs.merge_to_MEDS_cohort.unique_by=null"
+
+echo "Running pre-MEDS conversion."
+# ./eICU_Example/pre_MEDS.py raw_cohort_dir="$EICU_RAW_DIR" output_dir="$EICU_PREMEDS_DIR" + +echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel" +./src/MEDS_transforms/extract/shard_events.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + input_dir="$EICU_PREMEDS_DIR" \ + cohort_dir="$EICU_MEDS_DIR" \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" + +echo "Splitting subjects in serial" +./src/MEDS_transforms/extract/split_and_shard_subjects.py \ + input_dir="$EICU_PREMEDS_DIR" \ + cohort_dir="$EICU_MEDS_DIR" \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" + +echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" +./src/MEDS_transforms/extract/convert_to_sharded_events.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + input_dir="$EICU_PREMEDS_DIR" \ + cohort_dir="$EICU_MEDS_DIR" \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" + +echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" +./src/MEDS_transforms/extract/merge_to_MEDS_cohort.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + input_dir="$EICU_PREMEDS_DIR" \ + cohort_dir="$EICU_MEDS_DIR" \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" From c612d6fcef5418ca78b253ea3e32ad3a6a5d38a9 Mon Sep 17 00:00:00 2001 From: kingrc15 Date: Wed, 16 Oct 2024 16:37:35 -0500 Subject: [PATCH 13/76] fixed table names and reduced number of subsets per shard --- eICU_Example/configs/event_configs.yaml | 8 +++++--- eICU_Example/configs/table_preprocessors.yaml | 4 ++-- eICU_Example/run.sh | 18 +++++++++++++----- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/eICU_Example/configs/event_configs.yaml b/eICU_Example/configs/event_configs.yaml index 0b111615..e807b7d8 100644 --- a/eICU_Example/configs/event_configs.yaml +++ b/eICU_Example/configs/event_configs.yaml @@ -1,6 +1,8 @@ # Note that there is no "subject_id" for eICU -- patients are only differentiable during the course of a # single health system stay. 
Accordingly, we set the "patient" id here as the "patientHealthSystemStayID" + + subject_id_col: patienthealthsystemstayid patient: @@ -59,14 +61,14 @@ patient: time: col(unitdischargetimestamp) # missing unitdischargetime numeric_value: "unitdischargeweight" -admissiondx: +admissionDx: admission_diagnosis: code: - "ADMISSION_DX" - col(admitdxname) time: col(admitDxEnteredTimestamp) - admission_dx_id: "admitDxID" - unit_stay_id: "patientunitstayid" + admission_dx_id: "admissiondxid" # "admitDxID" + unit_stay_id: "patientunitstayid" # not created allergy: allergy: diff --git a/eICU_Example/configs/table_preprocessors.yaml b/eICU_Example/configs/table_preprocessors.yaml index a3ad2c30..43533a83 100644 --- a/eICU_Example/configs/table_preprocessors.yaml +++ b/eICU_Example/configs/table_preprocessors.yaml @@ -1,7 +1,7 @@ -admissiondx: +admissionDx: offset_col: "admitdxenteredoffset" pseudotime_col: "admitDxEnteredTimestamp" - output_data_cols: ["admitdxname", "admitdxid"] + output_data_cols: ["admitdxname", "admissiondxid"] # "admitDxID" warning_items: ["How should we use `admitdxtest`?", "How should we use `admitdxpath`?"] allergy: diff --git a/eICU_Example/run.sh b/eICU_Example/run.sh index 8eaf5b92..a81f8b77 100644 --- a/eICU_Example/run.sh +++ b/eICU_Example/run.sh @@ -48,7 +48,7 @@ echo " * stage_configs.split_and_shard_subjects.n_subjects_per_shard=10000" echo " * stage_configs.merge_to_MEDS_cohort.unique_by=null" echo "Running pre-MEDS conversion." -# ./eICU_Example/pre_MEDS.py raw_cohort_dir="$EICU_RAW_DIR" output_dir="$EICU_PREMEDS_DIR" +./eICU_Example/pre_MEDS.py raw_cohort_dir="$EICU_RAW_DIR" output_dir="$EICU_PREMEDS_DIR" echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel" ./src/MEDS_transforms/extract/shard_events.py \ @@ -57,13 +57,17 @@ echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel" hydra/launcher=joblib \ input_dir="$EICU_PREMEDS_DIR" \ cohort_dir="$EICU_MEDS_DIR" \ - event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml \ + stage_configs.split_and_shard_subjects.n_subjects_per_shard=10000 \ + stage_configs.merge_to_MEDS_cohort.unique_by=null "$@" echo "Splitting subjects in serial" ./src/MEDS_transforms/extract/split_and_shard_subjects.py \ input_dir="$EICU_PREMEDS_DIR" \ cohort_dir="$EICU_MEDS_DIR" \ - event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml \ + stage_configs.split_and_shard_subjects.n_subjects_per_shard=10000 \ + stage_configs.merge_to_MEDS_cohort.unique_by=null "$@" echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" ./src/MEDS_transforms/extract/convert_to_sharded_events.py \ @@ -72,7 +76,9 @@ echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" hydra/launcher=joblib \ input_dir="$EICU_PREMEDS_DIR" \ cohort_dir="$EICU_MEDS_DIR" \ - event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml \ + stage_configs.split_and_shard_subjects.n_subjects_per_shard=10000 \ + stage_configs.merge_to_MEDS_cohort.unique_by=null "$@" echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" ./src/MEDS_transforms/extract/merge_to_MEDS_cohort.py \ @@ -81,4 +87,6 @@ echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" hydra/launcher=joblib \ 
input_dir="$EICU_PREMEDS_DIR" \ cohort_dir="$EICU_MEDS_DIR" \ - event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml \ + stage_configs.split_and_shard_subjects.n_subjects_per_shard=10000 \ + stage_configs.merge_to_MEDS_cohort.unique_by=null "$@" From df59f5456a7ff550777ef38be8ec94cbbbaf6661 Mon Sep 17 00:00:00 2001 From: kingrc15 Date: Wed, 16 Oct 2024 16:41:15 -0500 Subject: [PATCH 14/76] reduce number of workers for merge_to_MEDS_cohort --- eICU_Example/run.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/eICU_Example/run.sh b/eICU_Example/run.sh index a81f8b77..82132156 100644 --- a/eICU_Example/run.sh +++ b/eICU_Example/run.sh @@ -46,6 +46,7 @@ echo "in the merge stage. You can do this by setting the following parameters at echo "args when running this script:" echo " * stage_configs.split_and_shard_subjects.n_subjects_per_shard=10000" echo " * stage_configs.merge_to_MEDS_cohort.unique_by=null" +echo "Additionally, consider reducing N_PARALLEL_WORKERS if > 1" echo "Running pre-MEDS conversion." ./eICU_Example/pre_MEDS.py raw_cohort_dir="$EICU_RAW_DIR" output_dir="$EICU_PREMEDS_DIR" From 728b844d49572914aa432a682bbd9f406772c93d Mon Sep 17 00:00:00 2001 From: kingrc15 Date: Thu, 17 Oct 2024 09:14:17 -0500 Subject: [PATCH 15/76] clean configs --- eICU_Example/configs/event_configs.yaml | 24 +++++++++---------- eICU_Example/configs/table_preprocessors.yaml | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/eICU_Example/configs/event_configs.yaml b/eICU_Example/configs/event_configs.yaml index e807b7d8..030f8d37 100644 --- a/eICU_Example/configs/event_configs.yaml +++ b/eICU_Example/configs/event_configs.yaml @@ -23,42 +23,42 @@ patient: - col(hospitalregion) - col(hospitalteachingstatus) - col(hospitalnumbedscategory) - time: col(hospitaladmittimestamp) # missing hospitaladmittime + time: col(hospitaladmittimestamp) hospital_id: "hospitalid" hosp_discharge: code: - "HOSPITAL_DISCHARGE" - col(hospitaldischargestatus) - col(hospitaldischargelocation) - time: col(hospitaldischargetimestamp) # missing hospitaldischargetime + time: col(hospitaldischargetimestamp) unit_admission: code: - "UNIT_ADMISSION" - col(unitadmitsource) - col(unitstaytype) - time: col(unitadmittimestamp) # missing unitadmittime + time: col(unitadmittimestamp) ward_id: "wardid" unit_stay_id: "patientunitstayid" unit_admission_weight: code: - "UNIT_ADMISSION_WEIGHT" - time: col(unitadmittimestamp) # missing unitadmittime + time: col(unitadmittimestamp) numeric_value: "unitadmissionweight" unit_admission_height: code: - "UNIT_ADMISSION_HEIGHT" - time: col(unitadmittimestamp) # missing unitadmittime + time: col(unitadmittimestamp) numeric_value: "unitadmissionheight" unit_discharge: code: - "UNIT_DISCHARGE" - col(unitdischargestatus) - col(unitdischargelocation) - time: col(unitdischargetimestamp) # missing unitdischargetime + time: col(unitdischargetimestamp) unit_discharge_weight: code: - "UNIT_DISCHARGE_WEIGHT" - time: col(unitdischargetimestamp) # missing unitdischargetime + time: col(unitdischargetimestamp) numeric_value: "unitdischargeweight" admissionDx: @@ -67,8 +67,8 @@ admissionDx: - "ADMISSION_DX" - col(admitdxname) time: col(admitDxEnteredTimestamp) - admission_dx_id: "admissiondxid" # "admitDxID" - unit_stay_id: "patientunitstayid" # not created + admission_dx_id: "admissiondxid" + unit_stay_id: "patientunitstayid" allergy: allergy: @@ -155,7 +155,7 @@ medication: - "MEDICATION" - "ORDERED" - 
col(drugname) - time: col(drugordertimestamp) # missing drugordertime + time: col(drugordertimestamp) medication_id: "medicationid" drug_iv_admixture: "drugivadmixture" dosage: "dosage" @@ -169,14 +169,14 @@ medication: - "MEDICATION" - "STARTED" - col(drugname) - time: col(drugstarttimestamp) # missing drugstarttime + time: col(drugstarttimestamp) medication_id: "medicationid" drug_stopped: code: - "MEDICATION" - "STOPPED" - col(drugname) - time: col(drugstoptimestamp) # missing drugstoptime + time: col(drugstoptimestamp) medication_id: "medicationid" nurseAssessment: diff --git a/eICU_Example/configs/table_preprocessors.yaml b/eICU_Example/configs/table_preprocessors.yaml index 43533a83..13cfe4ad 100644 --- a/eICU_Example/configs/table_preprocessors.yaml +++ b/eICU_Example/configs/table_preprocessors.yaml @@ -1,7 +1,7 @@ admissionDx: offset_col: "admitdxenteredoffset" pseudotime_col: "admitDxEnteredTimestamp" - output_data_cols: ["admitdxname", "admissiondxid"] # "admitDxID" + output_data_cols: ["admitdxname", "admissiondxid"] warning_items: ["How should we use `admitdxtest`?", "How should we use `admitdxpath`?"] allergy: From 08ee8fc3149e4db02704bd935611a7e0aac495ef Mon Sep 17 00:00:00 2001 From: kingrc15 Date: Thu, 17 Oct 2024 09:35:02 -0500 Subject: [PATCH 16/76] clean configs --- eICU_Example/configs/event_configs.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/eICU_Example/configs/event_configs.yaml b/eICU_Example/configs/event_configs.yaml index 030f8d37..708863c1 100644 --- a/eICU_Example/configs/event_configs.yaml +++ b/eICU_Example/configs/event_configs.yaml @@ -1,8 +1,6 @@ # Note that there is no "subject_id" for eICU -- patients are only differentiable during the course of a # single health system stay. Accordingly, we set the "patient" id here as the "patientHealthSystemStayID" - - subject_id_col: patienthealthsystemstayid patient: From acf72200ec10e145510fd24db5e0c77e34247f76 Mon Sep 17 00:00:00 2001 From: rvandewater Date: Fri, 18 Oct 2024 14:55:25 +0200 Subject: [PATCH 17/76] AUMC start --- AUMC_Example/configs/event_configs.yaml | 2 +- AUMC_Example/configs/extract_AUMC.yaml | 40 ++++++++++++++ AUMC_Example/configs/pre_MEDS.yaml | 1 + AUMC_Example/pre_MEDS.py | 2 +- AUMC_Example/run.sh | 60 ++++----------------- MIMIC-IV_Example/configs/extract_MIMIC.yaml | 2 + 6 files changed, 54 insertions(+), 53 deletions(-) create mode 100644 AUMC_Example/configs/extract_AUMC.yaml diff --git a/AUMC_Example/configs/event_configs.yaml b/AUMC_Example/configs/event_configs.yaml index 7abe661b..d10c370d 100644 --- a/AUMC_Example/configs/event_configs.yaml +++ b/AUMC_Example/configs/event_configs.yaml @@ -1,4 +1,4 @@ -patient_id_col: subject_id +subject_id_col: patientid patient: dob: diff --git a/AUMC_Example/configs/extract_AUMC.yaml b/AUMC_Example/configs/extract_AUMC.yaml new file mode 100644 index 00000000..4a913b78 --- /dev/null +++ b/AUMC_Example/configs/extract_AUMC.yaml @@ -0,0 +1,40 @@ +defaults: + - _extract + - _self_ + +description: |- + This pipeline extracts the AUMCdb dataset in longitudinal, sparse form from an input dataset meeting + select criteria and converts them to the flattened, MEDS format. 
You can control the key arguments to this + pipeline by setting environment variables: + ```bash + export EVENT_CONVERSION_CONFIG_FP=# Path to your event conversion config + export AUMC_PRE_MEDS_DIR=# Path to the output dir of the pre-MEDS step + export AUMC_MEDS_COHORT_DIR=# Path to where you want the dataset to live + ``` + +# The event conversion configuration file is used throughout the pipeline to define the events to extract. +event_conversion_config_fp: ${oc.env:EVENT_CONVERSION_CONFIG_FP} + +input_dir: ${oc.env:AUMC_PRE_MEDS_DIR} +cohort_dir: ${oc.env:AUMC_MEDS_COHORT_DIR} + +etl_metadata: + dataset_name: AUMCdb + dataset_version: 1.0.2 + +stage_configs: + shard_events: + infer_schema_length: 999999999 + # data_input_dir: ${oc.env:AUMC_PRE_MEDS_DIR} + # raw_cohort_dir: ${oc.env:AUMC_MEDS_COHORT_DIR} + split_and_shard_subjects: + n_patients_per_shard=1000 + +stages: + - shard_events + - split_and_shard_subjects + - convert_to_sharded_events + - merge_to_MEDS_cohort + - extract_code_metadata + - finalize_MEDS_metadata + - finalize_MEDS_data diff --git a/AUMC_Example/configs/pre_MEDS.yaml b/AUMC_Example/configs/pre_MEDS.yaml index 524d63f8..e29ccfa9 100644 --- a/AUMC_Example/configs/pre_MEDS.yaml +++ b/AUMC_Example/configs/pre_MEDS.yaml @@ -1,5 +1,6 @@ input_dir: ${oc.env:AUMC_RAW_DIR} cohort_dir: ${oc.env:AUMC_PRE_MEDS_DIR} +output_dir: ${oc.env:AUMC_MEDS_COHORT_DIR} # Hydra hydra: diff --git a/AUMC_Example/pre_MEDS.py b/AUMC_Example/pre_MEDS.py index 9212d97f..3dd63e2d 100755 --- a/AUMC_Example/pre_MEDS.py +++ b/AUMC_Example/pre_MEDS.py @@ -146,7 +146,7 @@ def main(cfg: DictConfig): logger.info(f" Adding preprocessor for {table_name}:\n{OmegaConf.to_yaml(preprocessor_cfg)}") functions[table_name] = join_and_get_pseudotime_fntr(table_name=table_name, **preprocessor_cfg) - raw_cohort_dir = Path(cfg.raw_cohort_dir) + raw_cohort_dir = Path(cfg.input_dir) MEDS_input_dir = Path(cfg.output_dir) patient_out_fp = MEDS_input_dir / "patient.parquet" diff --git a/AUMC_Example/run.sh b/AUMC_Example/run.sh index 9c06c7e9..c83bfaaf 100755 --- a/AUMC_Example/run.sh +++ b/AUMC_Example/run.sh @@ -5,16 +5,15 @@ set -e # Function to display help message function display_help() { - echo "Usage: $0 " + echo "Usage: $0 " echo echo "This script processes MIMIC-IV data through several steps, handling raw data conversion," echo "sharding events, splitting subjects, converting to sharded events, and merging into a MEDS cohort." echo echo "Arguments:" - echo " MIMICIV_RAW_DIR Directory containing raw MIMIC-IV data files." - echo " MIMICIV_PREMEDS_DIR Output directory for pre-MEDS data." - echo " MIMICIV_MEDS_DIR Output directory for processed MEDS data." - echo " (OPTIONAL) do_unzip=true OR do_unzip=false Optional flag to unzip files before processing." + echo " AUMC_RAW_DIR Directory containing raw AUMCdb data files." + echo " AUMC_PREMEDS_DIR Output directory for pre-MEDS data." + echo " AUMC_MEDS_DIR Output directory for processed MEDS data." echo echo "Options:" echo " -h, --help Display this help message and exit." 
@@ -35,45 +34,15 @@ if [ "$#" -lt 3 ]; then display_help fi -export MIMICIV_RAW_DIR=$1 -export MIMICIV_PRE_MEDS_DIR=$2 -export MIMICIV_MEDS_COHORT_DIR=$3 +export AUMC_RAW_DIR=$1 +export AUMC_PRE_MEDS_DIR=$2 +export AUMC_MEDS_COHORT_DIR=$3 shift 3 -# Defaults -_DO_UNZIP_ARG_STR="" - -if [ $# -ge 1 ]; then - case "$1" in - do_unzip=*) - _DO_UNZIP_ARG_STR="$1" - shift 1 - ;; - esac -fi - -DO_UNZIP="false" - -if [ -n "$_DO_UNZIP_ARG_STR" ]; then - case "$_DO_UNZIP_ARG_STR" in - do_unzip=true) - DO_UNZIP="true" - ;; - do_unzip=false) - DO_UNZIP="false" - ;; - *) - echo "Error: Invalid do_unzip value. Use 'do_unzip=true' or 'do_unzip=false'." - exit 1 - ;; - esac - echo "Setting DO_UNZIP=$DO_UNZIP" -fi - # TODO: Add wget blocks once testing is validated. EVENT_CONVERSION_CONFIG_FP="$(pwd)/configs/event_configs.yaml" -PIPELINE_CONFIG_FP="$(pwd)/configs/extract_MIMIC.yaml" +PIPELINE_CONFIG_FP="$(pwd)/configs/extract_AUMC.yaml" PRE_MEDS_PY_FP="$(pwd)/pre_MEDS.py" # We export these variables separately from their assignment so that any errors during assignment are caught. @@ -81,20 +50,9 @@ export EVENT_CONVERSION_CONFIG_FP export PIPELINE_CONFIG_FP export PRE_MEDS_PY_FP -if [ "$DO_UNZIP" == "true" ]; then - GZ_FILES="${MIMICIV_RAW_DIR}/*/*.csv.gz" - if compgen -G "$GZ_FILES" > /dev/null; then - echo "Unzipping csv.gz files matching $GZ_FILES." - for file in $GZ_FILES; do gzip -d --force "$file"; done - else - echo "No csz.gz files to unzip at $GZ_FILES." - fi -else - echo "Skipping unzipping." -fi echo "Running pre-MEDS conversion." -python "$PRE_MEDS_PY_FP" input_dir="$MIMICIV_RAW_DIR" cohort_dir="$MIMICIV_PRE_MEDS_DIR" +python "$PRE_MEDS_PY_FP" input_dir="$AUMC_RAW_DIR" cohort_dir="$AUMC_PRE_MEDS_DIR" if [ -z "$N_WORKERS" ]; then echo "Setting N_WORKERS to 1 to avoid issues with the runners." 
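The `${oc.env:...}` interpolations that `pre_MEDS.yaml` and `extract_AUMC.yaml` lean on are resolved by OmegaConf at the moment a config value is read, which is why `run.sh` exports the directories before invoking anything. A minimal sketch of that resolution, with placeholder paths rather than real ones:

```python
import os

from omegaconf import OmegaConf

# Placeholder values -- in practice run.sh exports the real directories.
os.environ["AUMC_RAW_DIR"] = "/data/AUMC/raw"
os.environ["AUMC_PRE_MEDS_DIR"] = "/data/AUMC/pre_meds"

# Mirrors the interpolations used in pre_MEDS.yaml.
cfg = OmegaConf.create(
    {
        "input_dir": "${oc.env:AUMC_RAW_DIR}",
        "cohort_dir": "${oc.env:AUMC_PRE_MEDS_DIR}",
    }
)

print(cfg.input_dir)  # /data/AUMC/raw
print(cfg.cohort_dir)  # /data/AUMC/pre_meds
```

A missing variable raises an interpolation error at access time, so a typo in the exports fails loudly rather than silently pointing a stage at the wrong place.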
diff --git a/MIMIC-IV_Example/configs/extract_MIMIC.yaml b/MIMIC-IV_Example/configs/extract_MIMIC.yaml index eb9b32ee..275683fd 100644 --- a/MIMIC-IV_Example/configs/extract_MIMIC.yaml +++ b/MIMIC-IV_Example/configs/extract_MIMIC.yaml @@ -25,6 +25,8 @@ etl_metadata: stage_configs: shard_events: infer_schema_length: 999999999 + split_and_shard_subjects: + n_patients_per_shard: 1000 stages: - shard_events From fc2458c3be790d46610dc4a9cc7fe1868a4d5c02 Mon Sep 17 00:00:00 2001 From: Robin van de Water Date: Fri, 18 Oct 2024 16:58:41 +0200 Subject: [PATCH 18/76] AUMC yaml fix --- AUMC_Example/configs/extract_AUMC.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/AUMC_Example/configs/extract_AUMC.yaml b/AUMC_Example/configs/extract_AUMC.yaml index 4a913b78..adb51168 100644 --- a/AUMC_Example/configs/extract_AUMC.yaml +++ b/AUMC_Example/configs/extract_AUMC.yaml @@ -25,10 +25,8 @@ etl_metadata: stage_configs: shard_events: infer_schema_length: 999999999 - # data_input_dir: ${oc.env:AUMC_PRE_MEDS_DIR} - # raw_cohort_dir: ${oc.env:AUMC_MEDS_COHORT_DIR} split_and_shard_subjects: - n_patients_per_shard=1000 + n_patients_per_shard: 1000 stages: - shard_events @@ -38,3 +36,4 @@ stages: - extract_code_metadata - finalize_MEDS_metadata - finalize_MEDS_data + From 7296b0416ece9b69c6a9b1a2bbcf26f31cae100a Mon Sep 17 00:00:00 2001 From: Robin van de Water Date: Fri, 18 Oct 2024 17:31:19 +0200 Subject: [PATCH 19/76] Harmonization to MIMIC-IV --- AUMC_Example/configs/pre_MEDS.yaml | 8 +++++--- AUMC_Example/pre_MEDS.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/AUMC_Example/configs/pre_MEDS.yaml b/AUMC_Example/configs/pre_MEDS.yaml index e29ccfa9..ac1967a0 100644 --- a/AUMC_Example/configs/pre_MEDS.yaml +++ b/AUMC_Example/configs/pre_MEDS.yaml @@ -1,12 +1,14 @@ input_dir: ${oc.env:AUMC_RAW_DIR} cohort_dir: ${oc.env:AUMC_PRE_MEDS_DIR} -output_dir: ${oc.env:AUMC_MEDS_COHORT_DIR} + +log_dir: ${cohort_dir}/.logs # Hydra hydra: job: name: pre_MEDS_${now:%Y-%m-%d_%H-%M-%S} run: - dir: ${output_dir}/.logs/${hydra.job.name} + dir: ${log_dir} sweep: - dir: ${output_dir}/.logs/${hydra.job.name} + dir: ${log_dir} + diff --git a/AUMC_Example/pre_MEDS.py b/AUMC_Example/pre_MEDS.py index 3dd63e2d..5a785dde 100755 --- a/AUMC_Example/pre_MEDS.py +++ b/AUMC_Example/pre_MEDS.py @@ -147,7 +147,7 @@ def main(cfg: DictConfig): functions[table_name] = join_and_get_pseudotime_fntr(table_name=table_name, **preprocessor_cfg) raw_cohort_dir = Path(cfg.input_dir) - MEDS_input_dir = Path(cfg.output_dir) + MEDS_input_dir = Path(cfg.cohort_dir) patient_out_fp = MEDS_input_dir / "patient.parquet" link_out_fp = MEDS_input_dir / "link_patient_to_admission.parquet" From cedffab71c49c1ad12bd0860885761f19e25e4ac Mon Sep 17 00:00:00 2001 From: Robin van de Water Date: Fri, 18 Oct 2024 17:40:00 +0200 Subject: [PATCH 20/76] Update readme --- AUMC_Example/README.md | 122 ++++++++++++++++------------------------- 1 file changed, 48 insertions(+), 74 deletions(-) diff --git a/AUMC_Example/README.md b/AUMC_Example/README.md index 895e6d15..fa35914c 100644 --- a/AUMC_Example/README.md +++ b/AUMC_Example/README.md @@ -6,105 +6,79 @@ up from this one). 
## Step 0: Installation -Download this repository and install the requirements: -If you want to install via pypi, (note that for now, you still need to copy some files locally even with a -pypi installation, which is covered below, so make sure you are in a suitable directory) use: - ```bash conda create -n MEDS python=3.12 conda activate MEDS -pip install MEDS_transforms[examples,local_parallelism] -mkdir AUMC_Example -cd AUMC_Example -wget https://raw.githubusercontent.com/mmcdermott/MEDS_transforms/main/AUMC_Example/joint_script.sh -wget https://raw.githubusercontent.com/mmcdermott/MEDS_transforms/main/AUMC_Example/joint_script_slurm.sh -wget https://raw.githubusercontent.com/mmcdermott/MEDS_transforms/main/AUMC_Example/pre_MEDS.py -chmod +x joint_script.sh -chmod +x joint_script_slurm.sh -chmod +x pre_MEDS.py -cd .. +pip install "MEDS_transforms[local_parallelism,slurm_parallelism]" ``` -If you want to install locally, use: +If you want to profile the time and memory costs of your ETL, also install: `pip install hydra-profiler`. +## Step 0.5: Set-up +Set some environment variables and download the necessary files: ```bash -git clone git@github.com:mmcdermott/MEDS_transforms.git -cd MEDS_transforms -conda create -n MEDS python=3.12 -conda activate MEDS -pip install .[examples,local_parallelism] +export AUMC_RAW_DIR=??? # set to the directory in which you want to store the raw data +export AUMC_PRE_MEDS_DIR=??? # set to the directory in which you want to store the intermediate MEDS data +export AUMC_MEDS_COHORT_DIR=??? # set to the directory in which you want to store the final MEDS data + +export VERSION=0.0.8 # or whatever version you want +export URL="https://raw.githubusercontent.com/mmcdermott/MEDS_transforms/$VERSION/AUMC_Example" + +wget $URL/run.sh +wget $URL/pre_MEDS.py +wget $URL/local_parallelism_runner.yaml +wget $URL/slurm_runner.yaml +mkdir configs +cd configs +wget $URL/configs/extract_AUMC.yaml +cd .. +chmod +x run.sh +chmod +x pre_MEDS.py ``` + ## Step 1: Download AUMC Download the AUMC dataset from following the instructions on https://github.com/AmsterdamUMC/AmsterdamUMCdb?tab=readme-ov-file. You will need the raw `.csv` files for this example. We will use `$AUMC_RAW_DIR` to denote the root directory of where the resulting _core data files_ are stored. -## Step 2: Run the basic MEDS ETL - -This step contains several sub-steps; luckily, all these substeps can be run via a single script, with the -`joint_script.sh` script which uses the Hydra `joblib` launcher to run things with local parallelism (make -sure you enable this feature by including the `[local_parallelism]` option during installation) or via -`joint_script_slurm.sh` which uses the Hydra `submitit` launcher to run things through slurm (make sure you -enable this feature by including the `[slurm_parallelism]` option during installation). This script entails -several steps: -### Step 2.1: Get the data ready for base MEDS extraction +## Step 2: Run the MEDS ETL -This is a step in a few parts: +To run the MEDS ETL, run the following command: -1. Join a few tables by `hadm_id` to get the right times in the right rows for processing. In - particular, we need to join: - - the `hosp/diagnoses_icd` table with the `hosp/admissions` table to get the `dischtime` for each - `hadm_id`. - - the `hosp/drgcodes` table with the `hosp/admissions` table to get the `dischtime` for each `hadm_id`. -2. Convert the patient's static data to a more parseable form. 
This entails: - - Get the patient's DOB in a format that is usable for MEDS, rather than the integral `anchor_year` and - `anchor_offset` fields. - - Merge the patient's `dod` with the `deathtime` from the `admissions` table. - -After these steps, modified files or symlinks to the original files will be written in a new directory which -will be used as the input to the actual MEDS extraction ETL. We'll use `$AUMC_PREMEDS_DIR` to denote this -directory. - -This step is run in the `joint_script.sh` script or the `joint_script_slurm.sh` script, but in either case the -base command that is run is as follows (assumed to be run **not** from this directory but from the -root directory of this repository): ```bash -export AUMC_RAW_DIR=/path/to/AUMC/raw -export AUMC_PREMEDS_DIR=/path/to/AUMC/pre_meds -```bash -./AUMC_Example/pre_MEDS.py raw_cohort_dir=$AUMC_RAW_DIR output_dir=$AUMC_PREMEDS_DIR +./run.sh $AUMC_RAW_DIR $AUMC_PRE_MEDS_DIR $AUMC_MEDS_COHORT_DIR ``` +> [!NOTE] +> This can take up large amounts of memory if not parallelized. You can reduce the shard size to reduce memory usage by setting the `shard_size` parameter in the `extract_MIMIC.yaml` file. +> Check that your environment variables are set correctly. +To not unzip the `.csv.gz` files, set `do_unzip=false` instead of `do_unzip=true`. -In practice, on a machine with 150 GB of RAM and 10 cores, this step takes less than 5 minutes in total. - -### Step 2.2: Run the MEDS extraction ETL +To use a specific stage runner file (e.g., to set different parallelism options), you can specify it as an +additional argument -We will assume you want to output the final MEDS dataset into a directory we'll denote as `$AUMC_MEDS_DIR`. -Note this is a different directory than the pre-MEDS directory (though, of course, they can both be -subdirectories of the same root directory). - -This is a step in 4 parts: - -1. Sub-shard the raw files. Run this command as many times simultaneously as you would like to have workers - performing this sub-sharding step. See below for how to automate this parallelism using hydra launchers. +```bash +export N_WORKERS=5 +./run.sh $AUMC_RAW_DIR $AUMC_PRE_MEDS_DIR $AUMC_MEDS_DIR \ + stage_runner_fp=slurm_runner.yaml +``` - This step uses the `./scripts/extraction/shard_events.py` script. See `joint_script*.sh` for the expected - format of the command. +The `N_WORKERS` environment variable set before the command controls how many parallel workers should be used +at maximum. -2. Extract and form the patient splits and sub-shards. The `./scripts/extraction/split_and_shard_patients.py` - script is used for this step. See `joint_script*.sh` for the expected format of the command. +The `N_WORKERS` environment variable set before the command controls how many parallel workers should be used +at maximum. -3. Extract patient sub-shards and convert to MEDS events. The - `./scripts/extraction/convert_to_sharded_events.py` script is used for this step. See `joint_script*.sh` for - the expected format of the command. +The `slurm_runner.yaml` file (downloaded above) runs each stage across several workers on separate slurm +worker nodes using the `submitit` launcher. _**You will need to customize this file to your own slurm system +so that the partition names are correct before use.**_ The memory and time costs are viable in the current +configuration, but if your nodes are sufficiently different you may need to adjust those as well. -4. Merge the MEDS events into a single file per patient sub-shard. 
The - `./scripts/extraction/merge_to_MEDS_cohort.py` script is used for this step. See `joint_script*.sh` for the - expected format of the command. +The `local_parallelism_runner.yaml` file (downloaded above) runs each stage via separate processes on the +launching machine. There are no additional arguments needed for this stage beyond the `N_WORKERS` environment +variable and there is nothing to customize in this file. -5. (Optional) Generate preliminary code statistics and merge to external metadata. This is not performed - currently in the `joint_script*.sh` scripts. +To profile the time and memory costs of your ETL, add the `do_profile=true` flag at the end. ## Limitations / TO-DOs: From 111bc39123418f673a626f765164ef049c891adc Mon Sep 17 00:00:00 2001 From: Robin van de Water Date: Sun, 20 Oct 2024 13:55:38 +0200 Subject: [PATCH 21/76] Cleanup --- AUMC_Example/README.md | 3 +-- AUMC_Example/configs/table_preprocessors.yaml | 18 +++++++++--------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/AUMC_Example/README.md b/AUMC_Example/README.md index fa35914c..222b3b9a 100644 --- a/AUMC_Example/README.md +++ b/AUMC_Example/README.md @@ -50,9 +50,8 @@ To run the MEDS ETL, run the following command: ./run.sh $AUMC_RAW_DIR $AUMC_PRE_MEDS_DIR $AUMC_MEDS_COHORT_DIR ``` > [!NOTE] -> This can take up large amounts of memory if not parallelized. You can reduce the shard size to reduce memory usage by setting the `shard_size` parameter in the `extract_MIMIC.yaml` file. +> This can take up large amounts of memory if not parallelized. You can reduce the shard size to reduce memory usage by setting the `shard_size` parameter in the `extract_AUMC.yaml` file. > Check that your environment variables are set correctly. -To not unzip the `.csv.gz` files, set `do_unzip=false` instead of `do_unzip=true`. To use a specific stage runner file (e.g., to set different parallelism options), you can specify it as an additional argument diff --git a/AUMC_Example/configs/table_preprocessors.yaml b/AUMC_Example/configs/table_preprocessors.yaml index 048f758e..ccf125f8 100644 --- a/AUMC_Example/configs/table_preprocessors.yaml +++ b/AUMC_Example/configs/table_preprocessors.yaml @@ -2,7 +2,7 @@ admissions: offset_col: - "admittedat" - "dischargedat" - pseudotime_col: + pseudotime_col: - "admittedattime" - "dischargedattime" output_data_cols: @@ -21,7 +21,7 @@ numericitems: - "measuredat" - "registeredat" - "updatedat" - pseudotime_col: + pseudotime_col: - "measuredattime" - "registeredattime" - "updatedattime" @@ -35,11 +35,11 @@ numericitems: - "How should we deal with `registeredat` and `updatedat`?" listitems: - offset_col: + offset_col: - "measuredat" - "registeredat" - "updatedat" - pseudotime_col: + pseudotime_col: - "measuredattime" - "registeredattime" - "updatedattime" @@ -53,11 +53,11 @@ listitems: - "How should we deal with `registeredat` and `updatedat`?" freetextitems: - offset_col: + offset_col: - "measuredat" - "registeredat" - "updatedat" - pseudotime_col: + pseudotime_col: - "measuredattime" - "registeredattime" - "updatedattime" @@ -75,7 +75,7 @@ drugitems: offset_col: - "start" - "stop" - pseudotime_col: + pseudotime_col: - "starttime" - "stoptime" output_data_cols: @@ -92,7 +92,7 @@ drugitems: - "administered" - "administeredunit" - "action" - warning_items: + warning_items: - "We **IGNORE** several flags here -- this may be a mistake!" - "When is the administered dose recorded? Is this done after the fact?" 
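Every `offset_col`/`pseudotime_col` pair in this file drives the same computation in `pre_MEDS.py`: the raw AUMCdb offset is added to a per-patient anchor timestamp taken from the joined admission data to produce a pseudo-time. A minimal polars sketch of that step, assuming millisecond offsets and using an illustrative anchor column name:

```python
from datetime import datetime

import polars as pl

# "anchortime" is an illustrative stand-in for the admission timestamp that
# pre_MEDS.py joins in from the patient table; "start" holds raw ms offsets.
df = pl.DataFrame(
    {
        "patientid": [1, 1],
        "start": [0, 3_600_000],  # 0 ms and 1 hour after the anchor
        "anchortime": [datetime(2010, 1, 1), datetime(2010, 1, 1)],
    }
)

df = df.with_columns(
    (pl.col("anchortime") + pl.duration(milliseconds=pl.col("start"))).alias("starttime")
)
print(df)
```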
@@ -109,7 +109,7 @@ processitems: offset_col: - "start" - "stop" - pseudotime_col: + pseudotime_col: - "starttime" - "stoptime" output_data_cols: From 92f7e7aacd8ad81d9cd5ebdcd2008f8e7da5ceeb Mon Sep 17 00:00:00 2001 From: Robin van de Water Date: Sun, 20 Oct 2024 14:02:24 +0200 Subject: [PATCH 22/76] PR refactorings/fixes --- AUMC_Example/README.md | 7 +-- AUMC_Example/joint_script.sh | 102 ----------------------------------- AUMC_Example/pre_MEDS.py | 3 -- 3 files changed, 4 insertions(+), 108 deletions(-) delete mode 100755 AUMC_Example/joint_script.sh diff --git a/AUMC_Example/README.md b/AUMC_Example/README.md index 222b3b9a..3e2308ec 100644 --- a/AUMC_Example/README.md +++ b/AUMC_Example/README.md @@ -65,9 +65,6 @@ export N_WORKERS=5 The `N_WORKERS` environment variable set before the command controls how many parallel workers should be used at maximum. -The `N_WORKERS` environment variable set before the command controls how many parallel workers should be used -at maximum. - The `slurm_runner.yaml` file (downloaded above) runs each stage across several workers on separate slurm worker nodes using the `submitit` launcher. _**You will need to customize this file to your own slurm system so that the partition names are correct before use.**_ The memory and time costs are viable in the current @@ -100,6 +97,10 @@ Note: If you use the slurm system and you launch the hydra submitit jobs from an may need to run `unset SLURM_CPU_BIND` in your terminal first to avoid errors. ## Future Work +Check with AUMCdb authors: +- How should we deal with `registeredat` and `updatedat`? +- We **IGNORE** several flags for the `drugitems` -- this may be a mistake! +- When is the administered dose recorded? Is this done after the fact? ### Pre-MEDS Processing diff --git a/AUMC_Example/joint_script.sh b/AUMC_Example/joint_script.sh deleted file mode 100755 index a9e2992b..00000000 --- a/AUMC_Example/joint_script.sh +++ /dev/null @@ -1,102 +0,0 @@ -#!/usr/bin/env bash - -# This makes the script fail if any internal script fails -set -e - -# Function to display help message -function display_help() { - echo "Usage: $0 " - echo - echo "This script processes AUMCdb data through several steps, handling raw data conversion," - echo "sharding events, splitting patients, converting to sharded events, and merging into a MEDS cohort." - echo - echo "Arguments:" - echo " AUMC_RAW_DIR Directory containing raw AUMCdb data files." - echo " AUMC_PREMEDS_DIR Output directory for pre-MEDS data." - echo " AUMC_MEDS_DIR Output directory for processed MEDS data." - echo " N_PARALLEL_WORKERS Number of parallel workers for processing." - echo - echo "Options:" - echo " -h, --help Display this help message and exit." - exit 1 -} - -# Check if the first parameter is '-h' or '--help' -if [[ "$1" == "-h" || "$1" == "--help" ]]; then - display_help -fi - -# Check for mandatory parameters -if [ "$#" -lt 4 ]; then - echo "Error: Incorrect number of arguments provided." - display_help -fi - -AUMC_RAW_DIR="$1" -AUMC_PREMEDS_DIR="$2" -AUMC_MEDS_DIR="$3" -N_PARALLEL_WORKERS="$4" - -shift 4 - -echo "Note that AUMCdb has a lot of observations in the numericitems, so to keep to a reasonable " -echo "memory burden (e.g., < 150GB per worker), you will want a smaller shard size, as well as to turn off " -echo "the final unique check (which should not be necessary given the structure of AUMCdb and is expensive) " -echo "in the merge stage. 
You can do this by setting the following parameters at the end of the mandatory " -echo "args when running this script:" -echo " * stage_configs.split_and_shard_patients.n_patients_per_shard=10000" -echo " * stage_configs.merge_to_MEDS_cohort.unique_by=null" - - -echo "Running pre-MEDS conversion." -./AUMC_Example/pre_MEDS.py raw_cohort_dir="$AUMC_RAW_DIR" output_dir="$AUMC_PREMEDS_DIR" - -echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel" -MEDS_extract-shard_events \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=joblib \ - input_dir="$AUMC_PREMEDS_DIR" \ - cohort_dir="$AUMC_MEDS_DIR" \ - stage="shard_events" \ - event_conversion_config_fp=./AUMC_Example/configs/event_configs.yaml "$@" - -echo "Splitting patients in serial" -MEDS_extract-split_and_shard_patients \ - input_dir="$AUMC_PREMEDS_DIR" \ - cohort_dir="$AUMC_MEDS_DIR" \ - stage="split_and_shard_patients" \ - stage_configs.split_and_shard_patients.n_patients_per_shard=1000 \ - event_conversion_config_fp=./AUMC_Example/configs/event_configs.yaml "$@" - -echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" -MEDS_extract-convert_to_sharded_events \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=joblib \ - input_dir="$AUMC_PREMEDS_DIR" \ - cohort_dir="$AUMC_MEDS_DIR" \ - stage="convert_to_sharded_events" \ - event_conversion_config_fp=./AUMC_Example/configs/event_configs.yaml "$@" - -echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" -MEDS_extract-merge_to_MEDS_cohort \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=joblib \ - input_dir="$AUMC_PREMEDS_DIR" \ - cohort_dir="$AUMC_MEDS_DIR" \ - stage="merge_to_MEDS_cohort" \ - event_conversion_config_fp=./AUMC_Example/configs/event_configs.yaml "$@" - -echo "Finalizing MEDS data with $N_PARALLEL_WORKERS workers in parallel" -MEDS_extract-finalize_MEDS_data \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=joblib \ - input_dir="$AUMC_RAW_DIR" \ - cohort_dir="$AUMC_MEDS_DIR" \ - stage="finalize_MEDS_data" \ - etl_metadata.dataset_name="AUMCdb" \ - etl_metadata.dataset_version="1.0.2" \ - event_conversion_config_fp=./AUMC_Example/configs/event_configs.yaml "$@" diff --git a/AUMC_Example/pre_MEDS.py b/AUMC_Example/pre_MEDS.py index 5a785dde..18fab619 100755 --- a/AUMC_Example/pre_MEDS.py +++ b/AUMC_Example/pre_MEDS.py @@ -8,7 +8,6 @@ root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True) -import gzip from collections.abc import Callable from datetime import datetime from pathlib import Path @@ -132,8 +131,6 @@ def fn(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: @hydra.main(version_base=None, config_path="configs", config_name="pre_MEDS") def main(cfg: DictConfig): """Performs pre-MEDS data wrangling for AUMCdb. - - """ hydra_loguru_init() From 295ab2f5b5f5865d0b4fcd54d5ce8f2a57279d8a Mon Sep 17 00:00:00 2001 From: prockenschaub Date: Wed, 23 Oct 2024 10:36:16 +0200 Subject: [PATCH 23/76] remove left-over items from aumc README --- AUMC_Example/README.md | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/AUMC_Example/README.md b/AUMC_Example/README.md index 3e2308ec..ea8165ad 100644 --- a/AUMC_Example/README.md +++ b/AUMC_Example/README.md @@ -76,21 +76,6 @@ variable and there is nothing to customize in this file. To profile the time and memory costs of your ETL, add the `do_profile=true` flag at the end. 
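Once the run completes, one quick sanity check is to open a single output shard and confirm it carries the core MEDS columns. The sketch below assumes the standard MEDS layout of parquet shards under `data/` inside the cohort directory; the path itself is a placeholder:

```python
from pathlib import Path

import polars as pl

cohort_dir = Path("/data/AUMC/meds")  # placeholder for $AUMC_MEDS_COHORT_DIR
shards = sorted((cohort_dir / "data").rglob("*.parquet"))

df = pl.read_parquet(shards[0])
print(df.columns)  # expect subject_id, time, code, numeric_value
print(df.head())
```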
-## Limitations / TO-DOs: - -Currently, some tables are ignored, including: - -INSERT TABLES IGNORED HERE - -Lots of questions remain about how to appropriately handle times of the data -- e.g., things like HCPCS -events are stored at the level of the _date_, not the _datetime_. How should those be slotted into the -timeline which is otherwise stored at the _datetime_ resolution? - -Other questions: - -1. How to handle merging the deathtimes between the hosp table and the patients table? -2. How to handle the dob nonsense MIMIC has? - ## Notes Note: If you use the slurm system and you launch the hydra submitit jobs from an interactive slurm node, you @@ -102,7 +87,3 @@ Check with AUMCdb authors: - We **IGNORE** several flags for the `drugitems` -- this may be a mistake! - When is the administered dose recorded? Is this done after the fact? -### Pre-MEDS Processing - -If you wanted, some other processing could also be done here, such as: - From 9b6ec7db16b22ed90460b09f8919ee622c6b7082 Mon Sep 17 00:00:00 2001 From: prockenschaub Date: Wed, 23 Oct 2024 10:36:46 +0200 Subject: [PATCH 24/76] update birth and death event names --- AUMC_Example/configs/event_configs.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/AUMC_Example/configs/event_configs.yaml b/AUMC_Example/configs/event_configs.yaml index d10c370d..b4a5647f 100644 --- a/AUMC_Example/configs/event_configs.yaml +++ b/AUMC_Example/configs/event_configs.yaml @@ -2,13 +2,13 @@ subject_id_col: patientid patient: dob: - code: "DOB" + code: "MEDS_BIRTH" time: col(dateofbirth) gender: code: ["GENDER", "col(gender)"] time: null dod: - code: "DEATH" + code: "MEDS_DEATH" time: col(dateofdeath) admissions: From 16785d96313de1ca4d93bd262ffd7eaeaf86107d Mon Sep 17 00:00:00 2001 From: prockenschaub Date: Wed, 23 Oct 2024 10:59:28 +0200 Subject: [PATCH 25/76] add missing comments to aumc premeds step --- AUMC_Example/pre_MEDS.py | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/AUMC_Example/pre_MEDS.py b/AUMC_Example/pre_MEDS.py index 18fab619..c732768d 100755 --- a/AUMC_Example/pre_MEDS.py +++ b/AUMC_Example/pre_MEDS.py @@ -83,7 +83,39 @@ def join_and_get_pseudotime_fntr( Also raises specified warning strings via the logger for uncertain columns. - TODO + Args: + table_name: name of the AUMCdb table that should be joined + offset_col: list of all columns that contain time offsets since the patient's first admission + pseudotime_col: list of all timestamp columns derived from `offset_col` and the linked `patient` + table + output_data_cols: list of all data columns included in the output + warning_items: any warnings noted in the table_preprocessors.yaml + + Example: + All args except `table_name` are taken from the table_preprocessors.yaml. For example, for the + table `numericitems`, we have the following yaml configuration: + + numericitems: + offset_col: + - "measuredat" + - "registeredat" + - "updatedat" + pseudotime_col: + - "measuredattime" + - "registeredattime" + - "updatedattime" + output_data_cols: + - "item" + - "value" + - "unit" + - "registeredby" + - "updatedby" + warning_items: + - "How should we deal with `registeredat` and `updatedat`?" + + Returns: + Function that expects the raw data stored in the `table_name` table and the joined output of the + `process_patient_and_admissions` function. Both inputs are expected to be `pl.DataFrame`s. 
""" if output_data_cols is None: @@ -131,6 +163,10 @@ def fn(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: @hydra.main(version_base=None, config_path="configs", config_name="pre_MEDS") def main(cfg: DictConfig): """Performs pre-MEDS data wrangling for AUMCdb. + + Inputs are the raw AUMCdb files, read from the `input_dir` config parameter. Output files are written + in processed form and as Parquet files to the `cohort_dir` config parameter. Hydra is used to manage + configuration parameters and logging. """ hydra_loguru_init() From d63c8df77c7389c6e81bff38e3be4ff0782df659 Mon Sep 17 00:00:00 2001 From: prockenschaub Date: Wed, 23 Oct 2024 11:20:45 +0200 Subject: [PATCH 26/76] remove rootutils from aumc premeds step --- AUMC_Example/pre_MEDS.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/AUMC_Example/pre_MEDS.py b/AUMC_Example/pre_MEDS.py index c732768d..799c9282 100755 --- a/AUMC_Example/pre_MEDS.py +++ b/AUMC_Example/pre_MEDS.py @@ -4,9 +4,6 @@ See the docstring of `main` for more information. """ -import rootutils - -root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True) from collections.abc import Callable from datetime import datetime @@ -171,7 +168,7 @@ def main(cfg: DictConfig): hydra_loguru_init() - table_preprocessors_config_fp = Path("./AUMC_Example/configs/table_preprocessors.yaml") + table_preprocessors_config_fp = Path("./configs/table_preprocessors.yaml") logger.info(f"Loading table preprocessors from {str(table_preprocessors_config_fp.resolve())}...") preprocessors = OmegaConf.load(table_preprocessors_config_fp) functions = {} From 2efe7fcefeb61c3b70a652f8ccc31bad81e66a88 Mon Sep 17 00:00:00 2001 From: prockenschaub Date: Wed, 23 Oct 2024 11:21:15 +0200 Subject: [PATCH 27/76] update mimic and aumc readme to clarify where to run --- AUMC_Example/README.md | 4 +--- MIMIC-IV_Example/README.md | 3 +-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/AUMC_Example/README.md b/AUMC_Example/README.md index ea8165ad..9e4c025a 100644 --- a/AUMC_Example/README.md +++ b/AUMC_Example/README.md @@ -1,8 +1,6 @@ # AUMC Example -This is an example of how to extract a MEDS dataset from AUMCdb (https://github.com/AmsterdamUMC/AmsterdamUMCdb). All scripts in this README are assumed to -be run **not** from this directory but from the root directory of this entire repository (e.g., one directory -up from this one). +This is an example of how to extract a MEDS dataset from AUMCdb (https://github.com/AmsterdamUMC/AmsterdamUMCdb). All scripts in this README are assumed to be run from this directory or from the directory in which the files in Step 0.5. were downloaded. ## Step 0: Installation diff --git a/MIMIC-IV_Example/README.md b/MIMIC-IV_Example/README.md index 516a16d6..1c6c14c1 100644 --- a/MIMIC-IV_Example/README.md +++ b/MIMIC-IV_Example/README.md @@ -1,8 +1,7 @@ # MIMIC-IV Example This is an example of how to extract a MEDS dataset from MIMIC-IV. All scripts in this README are assumed to -be run **not** from this directory but from the root directory of this entire repository (e.g., one directory -up from this one). +be run from this directory or from the directory in which the files in Step 0.5. were downloaded. 
## Step 0: Installation From f1e5223a2a7d4a67173fa259f4a1235a3a7f7ea8 Mon Sep 17 00:00:00 2001 From: prockenschaub Date: Wed, 23 Oct 2024 13:29:44 +0200 Subject: [PATCH 28/76] fix incorrect default config for n_subjects_per_shard (mimic & aumc) --- AUMC_Example/configs/extract_AUMC.yaml | 4 +--- MIMIC-IV_Example/configs/extract_MIMIC.yaml | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/AUMC_Example/configs/extract_AUMC.yaml b/AUMC_Example/configs/extract_AUMC.yaml index adb51168..a3af9393 100644 --- a/AUMC_Example/configs/extract_AUMC.yaml +++ b/AUMC_Example/configs/extract_AUMC.yaml @@ -23,10 +23,8 @@ etl_metadata: dataset_version: 1.0.2 stage_configs: - shard_events: - infer_schema_length: 999999999 split_and_shard_subjects: - n_patients_per_shard: 1000 + n_subjects_per_shard: 1000 stages: - shard_events diff --git a/MIMIC-IV_Example/configs/extract_MIMIC.yaml b/MIMIC-IV_Example/configs/extract_MIMIC.yaml index 275683fd..650d6e56 100644 --- a/MIMIC-IV_Example/configs/extract_MIMIC.yaml +++ b/MIMIC-IV_Example/configs/extract_MIMIC.yaml @@ -26,7 +26,7 @@ stage_configs: shard_events: infer_schema_length: 999999999 split_and_shard_subjects: - n_patients_per_shard: 1000 + n_subjects_per_shard: 1000 stages: - shard_events From 769f685e0b394892107f785c37ae4719eed6e32d Mon Sep 17 00:00:00 2001 From: kingrc15 Date: Wed, 23 Oct 2024 13:07:22 -0500 Subject: [PATCH 29/76] use extraction pipeline --- MIMIC-IV_Example/README.md | 29 ++++--- eICU_Example/README.md | 31 +++---- eICU_Example/configs/extract_eICU.yaml | 5 +- eICU_Example/joint_script.sh | 84 ------------------- eICU_Example/run.sh | 108 +++++++++++++++---------- 5 files changed, 95 insertions(+), 162 deletions(-) delete mode 100755 eICU_Example/joint_script.sh diff --git a/MIMIC-IV_Example/README.md b/MIMIC-IV_Example/README.md index dbfebf9e..e8aa14bb 100644 --- a/MIMIC-IV_Example/README.md +++ b/MIMIC-IV_Example/README.md @@ -22,7 +22,8 @@ export MIMICIV_PRE_MEDS_DIR=??? # set to the directory in which you want to stor export MIMICIV_MEDS_COHORT_DIR=??? # set to the directory in which you want to store the raw MIMIC-IV data export VERSION=0.0.6 # or whatever version you want -export URL="https://raw.githubusercontent.com/mmcdermott/MEDS_transforms/$VERSION/MIMIC-IV_Example" +# export URL="https://raw.githubusercontent.com/mmcdermott/MEDS_transforms/$VERSION/MIMIC-IV_Example" +export URL="https://raw.githubusercontent.com/mmcdermott/MEDS_transforms/refs/heads/main/MIMIC-IV_Example" wget $URL/run.sh wget $URL/pre_MEDS.py @@ -31,6 +32,8 @@ wget $URL/slurm_runner.yaml mkdir configs cd configs wget $URL/configs/extract_MIMIC.yaml +wget $URL/configs/pre_MEDS.yaml +wget $URL/configs/event_configs.yaml cd .. 
chmod +x run.sh chmod +x pre_MEDS.py @@ -46,18 +49,18 @@ the root directory of where the resulting _core data files_ are stored -- e.g., ## Step 1.5: Download MIMIC-IV Metadata files ```bash -cd $MIMIC_RAW_DIR -export MIMIC_URL=https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map -wget $MIMIC_URL/d_labitems_to_loinc.csv -wget $MIMIC_URL/inputevents_to_rxnorm.csv -wget $MIMIC_URL/lab_itemid_to_loinc.csv -wget $MIMIC_URL/meas_chartevents_main.csv -wget $MIMIC_URL/meas_chartevents_value.csv -wget $MIMIC_URL/numerics-summary.csv -wget $MIMIC_URL/outputevents_to_loinc.csv -wget $MIMIC_URL/proc_datetimeevents.csv -wget $MIMIC_URL/proc_itemid.csv -wget $MIMIC_URL/waveforms-summary.csv +cd $MIMICIV_RAW_DIR +export MIMICIV_RAW_DIR=https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map +wget $MIMICIV_RAW_DIR/d_labitems_to_loinc.csv +wget $MIMICIV_RAW_DIR/inputevents_to_rxnorm.csv +wget $MIMICIV_RAW_DIR/lab_itemid_to_loinc.csv +wget $MIMICIV_RAW_DIR/meas_chartevents_main.csv +wget $MIMICIV_RAW_DIR/meas_chartevents_value.csv +wget $MIMICIV_RAW_DIR/numerics-summary.csv +wget $MIMICIV_RAW_DIR/outputevents_to_loinc.csv +wget $MIMICIV_RAW_DIR/proc_datetimeevents.csv +wget $MIMICIV_RAW_DIR/proc_itemid.csv +wget $MIMICIV_RAW_DIR/waveforms-summary.csv ``` ## Step 2: Run the MEDS ETL diff --git a/eICU_Example/README.md b/eICU_Example/README.md index cc820067..fb1fb036 100644 --- a/eICU_Example/README.md +++ b/eICU_Example/README.md @@ -49,35 +49,22 @@ cd .. Download the eICU-CRD dataset (version 2.0) from https://physionet.org/content/eicu-crd/2.0/ following the instructions on that page. You will need the raw `.csv.gz` files for this example. We will use -`$EICU_RAW_DIR` to denote the root directory of where the resulting _core data files_ are stored -- e.g., -there should be a `hosp` and `icu` subdirectory of `$EICU_RAW_DIR`. +`$EICU_RAW_DIR` to denote the root directory of where the resulting _core data files_ are stored -## Step 2: Get the data ready for base MEDS extraction +## Step 2: Run the MEDS extraction ETL -This is a step in a few parts: - -1. Join a few tables by `hadm_id` to get the right timestamps in the right rows for processing. In - particular, we need to join: - - TODO -2. Convert the subject's static data to a more parseable form. This entails: - - Get the subject's DOB in a format that is usable for MEDS, rather than the integral `anchor_year` and - `anchor_offset` fields. - - Merge the subject's `dod` with the `deathtime` from the `admissions` table. - -After these steps, modified files or symlinks to the original files will be written in a new directory which -will be used as the input to the actual MEDS extraction ETL. We'll use `$EICU_PREMEDS_DIR` to denote this -directory. - -To run this step, you can use the following script (assumed to be run **not** from this directory but from the -root directory of this repository): +To run the MEDS ETL, run the following command: ```bash -./eICU_Example/pre_MEDS.py raw_cohort_dir=$EICU_RAW_DIR output_dir=$EICU_PREMEDS_DIR +./run.sh $EICU_RAW_DIR $EICU_PRE_MEDS_DIR $EICU_MEDS_COHORT_DIR $N_PARALLEL_WORKERS do_unzip=true ``` -In practice, on a machine with 150 GB of RAM and 10 cores, this step takes less than 5 minutes in total. +To not unzip the `.csv.gz` files, set `do_unzip=false` instead of `do_unzip=true`. 
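The trailing overrides these scripts pass through (for example `stage_configs.split_and_shard_subjects.n_subjects_per_shard=10000`) follow Hydra's dot-path override grammar. OmegaConf's dotlist API is a close analogue, and the sketch below shows how such strings merge into a nested config; the base value is illustrative, not taken from `extract_eICU.yaml`:

```python
from omegaconf import OmegaConf

# Illustrative base config standing in for the pipeline defaults.
base = OmegaConf.create(
    {"stage_configs": {"split_and_shard_subjects": {"n_subjects_per_shard": 50000}}}
)
overrides = OmegaConf.from_dotlist(
    [
        "stage_configs.split_and_shard_subjects.n_subjects_per_shard=10000",
        "stage_configs.merge_to_MEDS_cohort.unique_by=null",
    ]
)

cfg = OmegaConf.merge(base, overrides)
print(cfg.stage_configs.split_and_shard_subjects.n_subjects_per_shard)  # 10000
print(cfg.stage_configs.merge_to_MEDS_cohort.unique_by)  # None
```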
+ +To use a specific stage runner file (e.g., to set different parallelism options), you can specify it as an +additional argument -## Step 3: Run the MEDS extraction ETL +The `N_PARALLEL_WORKERS` variable controls how many parallel workers should be used at maximum. Note that eICU has a lot more observations per subject than does MIMIC-IV, so to keep to a reasonable memory burden (e.g., \< 150GB per worker), you will want a smaller shard size, as well as to turn off the final unique diff --git a/eICU_Example/configs/extract_eICU.yaml b/eICU_Example/configs/extract_eICU.yaml index 47350337..9abe9153 100644 --- a/eICU_Example/configs/extract_eICU.yaml +++ b/eICU_Example/configs/extract_eICU.yaml @@ -25,12 +25,15 @@ etl_metadata: stage_configs: shard_events: infer_schema_length: 999999999 + split_and_shard_subjects: + n_subjects_per_shard: 10000 + merge_to_MEDS_cohort: + unique_by: null stages: - shard_events - split_and_shard_subjects - convert_to_sharded_events - merge_to_MEDS_cohort - - extract_code_metadata - finalize_MEDS_metadata - finalize_MEDS_data diff --git a/eICU_Example/joint_script.sh b/eICU_Example/joint_script.sh deleted file mode 100755 index 0b3ad6c5..00000000 --- a/eICU_Example/joint_script.sh +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env bash - -# This makes the script fail if any internal script fails -set -e - -# Function to display help message -function display_help() { - echo "Usage: $0 " - echo - echo "This script processes eICU data through several steps, handling raw data conversion," - echo "sharding events, splitting subjects, converting to sharded events, and merging into a MEDS cohort." - echo - echo "Arguments:" - echo " EICU_RAW_DIR Directory containing raw eICU data files." - echo " EICU_PREMEDS_DIR Output directory for pre-MEDS data." - echo " EICU_MEDS_DIR Output directory for processed MEDS data." - echo " N_PARALLEL_WORKERS Number of parallel workers for processing." - echo - echo "Options:" - echo " -h, --help Display this help message and exit." - exit 1 -} - -# Check if the first parameter is '-h' or '--help' -if [[ "$1" == "-h" || "$1" == "--help" ]]; then - display_help -fi - -# Check for mandatory parameters -if [ "$#" -lt 4 ]; then - echo "Error: Incorrect number of arguments provided." - display_help -fi - -EICU_RAW_DIR="$1" -EICU_PREMEDS_DIR="$2" -EICU_MEDS_DIR="$3" -N_PARALLEL_WORKERS="$4" - -shift 4 - -echo "Note that eICU has a lot more observations per subject than does MIMIC-IV, so to keep to a reasonable " -echo "memory burden (e.g., < 150GB per worker), you will want a smaller shard size, as well as to turn off " -echo "the final unique check (which should not be necessary given the structure of eICU and is expensive) " -echo "in the merge stage. You can do this by setting the following parameters at the end of the mandatory " -echo "args when running this script:" -echo " * stage_configs.split_and_shard_subjects.n_subjects_per_shard=10000" -echo " * stage_configs.merge_to_MEDS_cohort.unique_by=null" - -echo "Running pre-MEDS conversion." 
-./eICU_Example/pre_MEDS.py raw_cohort_dir="$EICU_RAW_DIR" output_dir="$EICU_PREMEDS_DIR" - -echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel" -./scripts/extraction/shard_events.py \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=joblib \ - input_dir="$EICU_PREMEDS_DIR" \ - cohort_dir="$EICU_MEDS_DIR" \ - event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" - -echo "Splitting subjects in serial" -./scripts/extraction/split_and_shard_subjects.py \ - input_dir="$EICU_PREMEDS_DIR" \ - cohort_dir="$EICU_MEDS_DIR" \ - event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" - -echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" -./scripts/extraction/convert_to_sharded_events.py \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=joblib \ - input_dir="$EICU_PREMEDS_DIR" \ - cohort_dir="$EICU_MEDS_DIR" \ - event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" - -echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" -./scripts/extraction/merge_to_MEDS_cohort.py \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=joblib \ - input_dir="$EICU_PREMEDS_DIR" \ - cohort_dir="$EICU_MEDS_DIR" \ - event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" diff --git a/eICU_Example/run.sh b/eICU_Example/run.sh index 82132156..f8c3a661 100644 --- a/eICU_Example/run.sh +++ b/eICU_Example/run.sh @@ -15,6 +15,7 @@ function display_help() { echo " EICU_PREMEDS_DIR Output directory for pre-MEDS data." echo " EICU_MEDS_DIR Output directory for processed MEDS data." echo " N_PARALLEL_WORKERS Number of parallel workers for processing." + echo " (OPTIONAL) do_unzip=true OR do_unzip=false Optional flag to unzip files before processing." echo echo "Options:" echo " -h, --help Display this help message and exit." @@ -33,10 +34,13 @@ if [ "$#" -lt 4 ]; then fi EICU_RAW_DIR="$1" -EICU_PREMEDS_DIR="$2" -EICU_MEDS_DIR="$3" +EICU_PRE_MEDS_DIR="$2" +EICU_MEDS_COHORT_DIR="$3" N_PARALLEL_WORKERS="$4" +export EICU_PRE_MEDS_DIR="$2" +export EICU_MEDS_COHORT_DIR="$3" + shift 4 echo "Note that eICU has a lot more observations per subject than does MIMIC-IV, so to keep to a reasonable " @@ -48,46 +52,66 @@ echo " * stage_configs.split_and_shard_subjects.n_subjects_per_shard=10000" echo " * stage_configs.merge_to_MEDS_cohort.unique_by=null" echo "Additionally, consider reducing N_PARALLEL_WORKERS if > 1" +# Defaults +_DO_UNZIP_ARG_STR="" + +if [ $# -ge 1 ]; then + case "$1" in + do_unzip=*) + _DO_UNZIP_ARG_STR="$1" + shift 1 + ;; + esac +fi + +DO_UNZIP="false" + +if [ -n "$_DO_UNZIP_ARG_STR" ]; then + case "$_DO_UNZIP_ARG_STR" in + do_unzip=true) + DO_UNZIP="true" + ;; + do_unzip=false) + DO_UNZIP="false" + ;; + *) + echo "Error: Invalid do_unzip value. Use 'do_unzip=true' or 'do_unzip=false'." + exit 1 + ;; + esac + echo "Setting DO_UNZIP=$DO_UNZIP" +fi + +# TODO: Add wget blocks once testing is validated. +EVENT_CONVERSION_CONFIG_FP="$(pwd)/configs/event_configs.yaml" +PIPELINE_CONFIG_FP="$(pwd)/configs/extract_eICU.yaml" +PRE_MEDS_PY_FP="$(pwd)/pre_MEDS.py" + +# We export these variables separately from their assignment so that any errors during assignment are caught. 
+export EVENT_CONVERSION_CONFIG_FP +export PIPELINE_CONFIG_FP +export PRE_MEDS_PY_FP + + +if [ "$DO_UNZIP" == "true" ]; then + GZ_FILES="${EICU_RAW_DIR}/*.csv.gz" + if compgen -G "$GZ_FILES" > /dev/null; then + echo "Unzipping csv.gz files matching $GZ_FILES." + for file in $GZ_FILES; do gzip -d --force "$file"; done + else + echo "No csz.gz files to unzip at $GZ_FILES." + fi +else + echo "Skipping unzipping." +fi + echo "Running pre-MEDS conversion." ./eICU_Example/pre_MEDS.py raw_cohort_dir="$EICU_RAW_DIR" output_dir="$EICU_PREMEDS_DIR" -echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel" -./src/MEDS_transforms/extract/shard_events.py \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=joblib \ - input_dir="$EICU_PREMEDS_DIR" \ - cohort_dir="$EICU_MEDS_DIR" \ - event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml \ - stage_configs.split_and_shard_subjects.n_subjects_per_shard=10000 \ - stage_configs.merge_to_MEDS_cohort.unique_by=null "$@" - -echo "Splitting subjects in serial" -./src/MEDS_transforms/extract/split_and_shard_subjects.py \ - input_dir="$EICU_PREMEDS_DIR" \ - cohort_dir="$EICU_MEDS_DIR" \ - event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml \ - stage_configs.split_and_shard_subjects.n_subjects_per_shard=10000 \ - stage_configs.merge_to_MEDS_cohort.unique_by=null "$@" - -echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" -./src/MEDS_transforms/extract/convert_to_sharded_events.py \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=joblib \ - input_dir="$EICU_PREMEDS_DIR" \ - cohort_dir="$EICU_MEDS_DIR" \ - event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml \ - stage_configs.split_and_shard_subjects.n_subjects_per_shard=10000 \ - stage_configs.merge_to_MEDS_cohort.unique_by=null "$@" - -echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" -./src/MEDS_transforms/extract/merge_to_MEDS_cohort.py \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=joblib \ - input_dir="$EICU_PREMEDS_DIR" \ - cohort_dir="$EICU_MEDS_DIR" \ - event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml \ - stage_configs.split_and_shard_subjects.n_subjects_per_shard=10000 \ - stage_configs.merge_to_MEDS_cohort.unique_by=null "$@" +if [ -z "$N_WORKERS" ]; then + echo "Setting N_WORKERS to 1 to avoid issues with the runners." + export N_WORKERS="1" +fi + +echo "Running extraction pipeline." +MEDS_transform-runner "pipeline_config_fp=$PIPELINE_CONFIG_FP" "$@" \ No newline at end of file From adcbc3e0a5719f55bd3a14aa8a0486a88029f87b Mon Sep 17 00:00:00 2001 From: kingrc15 Date: Wed, 23 Oct 2024 13:09:14 -0500 Subject: [PATCH 30/76] cleanup --- eICU_Example/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eICU_Example/run.sh b/eICU_Example/run.sh index f8c3a661..dfa473e7 100644 --- a/eICU_Example/run.sh +++ b/eICU_Example/run.sh @@ -114,4 +114,4 @@ if [ -z "$N_WORKERS" ]; then fi echo "Running extraction pipeline." 
-MEDS_transform-runner "pipeline_config_fp=$PIPELINE_CONFIG_FP" "$@"
\ No newline at end of file
+MEDS_transform-runner "pipeline_config_fp=$PIPELINE_CONFIG_FP" "$@"

From 0f236d0d3eeb4b48cb3158ddb272e9d95670468d Mon Sep 17 00:00:00 2001
From: kingrc15
Date: Wed, 23 Oct 2024 13:11:17 -0500
Subject: [PATCH 31/76] cleanup

---
 eICU_Example/run.sh | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/eICU_Example/run.sh b/eICU_Example/run.sh
index dfa473e7..9f3d91d5 100644
--- a/eICU_Example/run.sh
+++ b/eICU_Example/run.sh
@@ -5,7 +5,7 @@ set -e

 # Function to display help message
 function display_help() {
-    echo "Usage: $0 <EICU_RAW_DIR> <EICU_PREMEDS_DIR> <EICU_MEDS_DIR> <N_PARALLEL_WORKERS>"
+    echo "Usage: $0 <EICU_RAW_DIR> <EICU_PREMEDS_DIR> <EICU_MEDS_DIR>"
     echo
     echo "This script processes eICU data through several steps, handling raw data conversion,"
     echo "sharding events, splitting subjects, converting to sharded events, and merging into a MEDS cohort."
@@ -14,7 +14,6 @@ function display_help() {
     echo "  EICU_RAW_DIR        Directory containing raw eICU data files."
     echo "  EICU_PREMEDS_DIR    Output directory for pre-MEDS data."
    echo "  EICU_MEDS_DIR       Output directory for processed MEDS data."
-    echo "  N_PARALLEL_WORKERS  Number of parallel workers for processing."
     echo "  (OPTIONAL) do_unzip=true OR do_unzip=false  Optional flag to unzip files before processing."
     echo
     echo "Options:"
@@ -36,7 +35,6 @@ fi
 EICU_RAW_DIR="$1"
 EICU_PRE_MEDS_DIR="$2"
 EICU_MEDS_COHORT_DIR="$3"
-N_PARALLEL_WORKERS="$4"

 export EICU_PRE_MEDS_DIR="$2"
 export EICU_MEDS_COHORT_DIR="$3"

From acba46c16d6644bf6b3772d273378bed676bf0f3 Mon Sep 17 00:00:00 2001
From: kingrc15
Date: Wed, 23 Oct 2024 14:16:24 -0500
Subject: [PATCH 32/76] cleanup

---
 eICU_Example/pre_MEDS.py | 2 +-
 eICU_Example/run.sh      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/eICU_Example/pre_MEDS.py b/eICU_Example/pre_MEDS.py
index 5ebe0582..e00fe72a 100755
--- a/eICU_Example/pre_MEDS.py
+++ b/eICU_Example/pre_MEDS.py
@@ -274,7 +274,7 @@ def main(cfg: DictConfig):

     hydra_loguru_init()

-    table_preprocessors_config_fp = Path("./eICU_Example/configs/table_preprocessors.yaml")
+    table_preprocessors_config_fp = Path("./configs/table_preprocessors.yaml")
     logger.info(f"Loading table preprocessors from {str(table_preprocessors_config_fp.resolve())}...")
     preprocessors = OmegaConf.load(table_preprocessors_config_fp)
     functions = {}
diff --git a/eICU_Example/run.sh b/eICU_Example/run.sh
index 9f3d91d5..c236d3db 100644
--- a/eICU_Example/run.sh
+++ b/eICU_Example/run.sh
@@ -104,7 +104,7 @@ else
 fi

 echo "Running pre-MEDS conversion."
-./eICU_Example/pre_MEDS.py raw_cohort_dir="$EICU_RAW_DIR" output_dir="$EICU_PREMEDS_DIR"
+./pre_MEDS.py raw_cohort_dir="$EICU_RAW_DIR" output_dir="$EICU_PRE_MEDS_DIR"

 if [ -z "$N_WORKERS" ]; then
     echo "Setting N_WORKERS to 1 to avoid issues with the runners."

From 8d272c893df37f66e6647e412238aea2391f7651 Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sat, 26 Oct 2024 08:13:41 -0500
Subject: [PATCH 33/76] Made tests not run on non src or tests directories.
---
 .github/workflows/tests.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 95fa40b1..bbc64b9c 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -35,7 +35,7 @@ jobs:
       #----------------------------------------------
       - name: Run tests
         run: |
-          pytest -v --doctest-modules --cov=src --junitxml=junit.xml -s --ignore=docs
+          pytest src/ tests/ -v --doctest-modules --cov=src --junitxml=junit.xml -s

       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v4.0.1

From 18747010726c405cdf8fdffc527542385383e055 Mon Sep 17 00:00:00 2001
From: Robin van de Water
Date: Sun, 27 Oct 2024 13:01:15 +0100
Subject: [PATCH 34/76] removed extract code metadata for aumc

---
 AUMC_Example/configs/extract_AUMC.yaml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/AUMC_Example/configs/extract_AUMC.yaml b/AUMC_Example/configs/extract_AUMC.yaml
index a3af9393..cf364327 100644
--- a/AUMC_Example/configs/extract_AUMC.yaml
+++ b/AUMC_Example/configs/extract_AUMC.yaml
@@ -31,7 +31,5 @@ stages:
   - split_and_shard_subjects
   - convert_to_sharded_events
   - merge_to_MEDS_cohort
-  - extract_code_metadata
   - finalize_MEDS_metadata
   - finalize_MEDS_data
-

From cbd2b3d9037c73ef4c18d2b8137a3e45902acf64 Mon Sep 17 00:00:00 2001
From: Robin van de Water
Date: Sun, 27 Oct 2024 13:13:04 +0100
Subject: [PATCH 35/76] linting pre-comit

---
 AUMC_Example/README.md                        | 10 +++--
 AUMC_Example/configs/event_configs.yaml       | 24 ++++++------
 AUMC_Example/configs/pre_MEDS.yaml            |  1 -
 AUMC_Example/configs/table_preprocessors.yaml |  6 +--
 AUMC_Example/pre_MEDS.py                      | 39 ++++++++++---------
 5 files changed, 42 insertions(+), 38 deletions(-)

diff --git a/AUMC_Example/README.md b/AUMC_Example/README.md
index 9e4c025a..85a378c3 100644
--- a/AUMC_Example/README.md
+++ b/AUMC_Example/README.md
@@ -13,7 +13,9 @@ pip install "MEDS_transforms[local_parallelism,slurm_parallelism]"
 If you want to profile the time and memory costs of your ETL, also install: `pip install hydra-profiler`.

 ## Step 0.5: Set-up
+
 Set some environment variables and download the necessary files:
+
 ```bash
 export AUMC_RAW_DIR=??? # set to the directory in which you want to store the raw data
 export AUMC_PRE_MEDS_DIR=??? # set to the directory in which you want to store the intermediate MEDS data
@@ -34,12 +36,10 @@ chmod +x run.sh
 chmod +x pre_MEDS.py
 ```

-
 ## Step 1: Download AUMC

 Download the AUMC dataset by following the instructions at https://github.com/AmsterdamUMC/AmsterdamUMCdb?tab=readme-ov-file. You will need the raw `.csv` files for this example. We will use `$AUMC_RAW_DIR` to denote the root directory of where the resulting _core data files_ are stored.

-
 ## Step 2: Run the MEDS ETL

 To run the MEDS ETL, run the following command:

@@ -47,7 +47,8 @@ To run the MEDS ETL, run the following command:
 ```bash
 ./run.sh $AUMC_RAW_DIR $AUMC_PRE_MEDS_DIR $AUMC_MEDS_COHORT_DIR
 ```
-> [!NOTE]
+
+> \[!NOTE\]
 > This can take up large amounts of memory if not parallelized. You can reduce the shard size to reduce memory usage by setting the `shard_size` parameter in the `extract_AUMC.yaml` file.
 > Check that your environment variables are set correctly.
@@ -80,8 +81,9 @@ Note: If you use the slurm system and you launch the hydra submitit jobs from an
 may need to run `unset SLURM_CPU_BIND` in your terminal first to avoid errors.

 ## Future Work
+
 Check with AUMCdb authors:
+
 - How should we deal with `registeredat` and `updatedat`?
- We **IGNORE** several flags for the `drugitems` -- this may be a mistake! - When is the administered dose recorded? Is this done after the fact? - diff --git a/AUMC_Example/configs/event_configs.yaml b/AUMC_Example/configs/event_configs.yaml index b4a5647f..7233e20c 100644 --- a/AUMC_Example/configs/event_configs.yaml +++ b/AUMC_Example/configs/event_configs.yaml @@ -1,13 +1,13 @@ subject_id_col: patientid patient: - dob: + dob: code: "MEDS_BIRTH" time: col(dateofbirth) gender: code: ["GENDER", "col(gender)"] time: null - dod: + dod: code: "MEDS_DEATH" time: col(dateofdeath) @@ -21,7 +21,7 @@ admissions: - col(specialty) time: col(admittedattime) icu_discharge: - code: + code: - "ICU_DISCHARGE" - col(destination) time: col(dischargedattime) @@ -31,7 +31,7 @@ admissions: - col(weightsource) - col(weightgroup) time: col(admittedattime) - height: + height: code: - "HEIGHT_AT_ADMISSION" - col(heightsource) @@ -55,7 +55,7 @@ listitems: - col(islabresult) - col(value) time: col(measuredattime) - + freetextitems: event: code: @@ -66,7 +66,7 @@ freetextitems: text_value: value procedureorderitems: - event: + event: code: - PROCEDURE - col(ordercategoryname) @@ -74,21 +74,21 @@ procedureorderitems: time: col(registeredattime) processitems: - start: + start: code: - PROCESS - START - col(item) time: col(starttime) end: - code: + code: - PROCESS - END - col(item) time: col(stoptime) -drugitems: - start: +drugitems: + start: code: - DRUG - START @@ -96,7 +96,7 @@ drugitems: - col(item) - col(action) time: col(starttime) - rate: + rate: code: - DRUG - RATE @@ -105,7 +105,7 @@ drugitems: - col(rateunit) time: col(starttime) numeric_value: col(rate) - dose: + dose: code: - DRUG - DOSE diff --git a/AUMC_Example/configs/pre_MEDS.yaml b/AUMC_Example/configs/pre_MEDS.yaml index ac1967a0..ac311776 100644 --- a/AUMC_Example/configs/pre_MEDS.yaml +++ b/AUMC_Example/configs/pre_MEDS.yaml @@ -11,4 +11,3 @@ hydra: dir: ${log_dir} sweep: dir: ${log_dir} - diff --git a/AUMC_Example/configs/table_preprocessors.yaml b/AUMC_Example/configs/table_preprocessors.yaml index ccf125f8..6c253bce 100644 --- a/AUMC_Example/configs/table_preprocessors.yaml +++ b/AUMC_Example/configs/table_preprocessors.yaml @@ -7,7 +7,7 @@ admissions: - "dischargedattime" output_data_cols: - "location" - - "urgency" + - "urgency" - "origin" - "destination" - "weightgroup" @@ -17,7 +17,7 @@ admissions: - "specialty" numericitems: - offset_col: + offset_col: - "measuredat" - "registeredat" - "updatedat" @@ -113,4 +113,4 @@ processitems: - "starttime" - "stoptime" output_data_cols: - - "item" \ No newline at end of file + - "item" diff --git a/AUMC_Example/pre_MEDS.py b/AUMC_Example/pre_MEDS.py index 799c9282..6826cf7e 100755 --- a/AUMC_Example/pre_MEDS.py +++ b/AUMC_Example/pre_MEDS.py @@ -19,6 +19,7 @@ ADMISSION_ID = "admissionid" PATIENT_ID = "patientid" + def load_raw_aumc_file(fp: Path, **kwargs) -> pl.LazyFrame: """Load a raw AUMCdb file into a Polars DataFrame. @@ -44,28 +45,29 @@ def process_patient_and_admissions(df: pl.LazyFrame) -> pl.LazyFrame: """ origin_pseudotime = pl.datetime( - year = pl.col("admissionyeargroup").str.extract(r"(2003|2010)").cast(pl.Int32), - month = 1, day = 1 + year=pl.col("admissionyeargroup").str.extract(r"(2003|2010)").cast(pl.Int32), month=1, day=1 ) - # TODO: consider using better logic to infer date of birth for patients + # TODO: consider using better logic to infer date of birth for patients # with more than one admission. 
- age_in_years = (( - pl.col("agegroup").str.extract("(\\d{2}).?$").cast(pl.Int32) + - pl.col("agegroup").str.extract("^(\\d{2})").cast(pl.Int32) - ) / 2).ceil() + age_in_years = ( + ( + pl.col("agegroup").str.extract("(\\d{2}).?$").cast(pl.Int32) + + pl.col("agegroup").str.extract("^(\\d{2})").cast(pl.Int32) + ) + / 2 + ).ceil() age_in_days = age_in_years * 365.25 # We assume that the patient was born at the midpoint of the year as we don't know the actual birthdate pseudo_date_of_birth = origin_pseudotime - pl.duration(days=(age_in_days - 365.25 / 2)) pseudo_date_of_death = origin_pseudotime + pl.duration(milliseconds=pl.col("dateofdeath")) - return df.filter(pl.col("admissioncount") == 1).select( - PATIENT_ID, + PATIENT_ID, pseudo_date_of_birth.alias("dateofbirth"), "gender", origin_pseudotime.alias("firstadmittedattime"), - pseudo_date_of_death.alias("dateofdeath") + pseudo_date_of_death.alias("dateofdeath"), ), df.select(PATIENT_ID, ADMISSION_ID) @@ -80,20 +82,21 @@ def join_and_get_pseudotime_fntr( Also raises specified warning strings via the logger for uncertain columns. - Args: + Args: table_name: name of the AUMCdb table that should be joined offset_col: list of all columns that contain time offsets since the patient's first admission - pseudotime_col: list of all timestamp columns derived from `offset_col` and the linked `patient` + pseudotime_col: list of all timestamp columns derived from `offset_col` and the linked `patient` table - output_data_cols: list of all data columns included in the output + output_data_cols: list of all data columns included in the output warning_items: any warnings noted in the table_preprocessors.yaml - Example: - All args except `table_name` are taken from the table_preprocessors.yaml. For example, for the + Example: + All args except `table_name` are taken from the table_preprocessors.yaml. For example, for the table `numericitems`, we have the following yaml configuration: + numericitems: - offset_col: + offset_col: - "measuredat" - "registeredat" - "updatedat" @@ -110,7 +113,7 @@ def join_and_get_pseudotime_fntr( warning_items: - "How should we deal with `registeredat` and `updatedat`?" - Returns: + Returns: Function that expects the raw data stored in the `table_name` table and the joined output of the `process_patient_and_admissions` function. Both inputs are expected to be `pl.DataFrame`s. """ @@ -163,7 +166,7 @@ def main(cfg: DictConfig): Inputs are the raw AUMCdb files, read from the `input_dir` config parameter. Output files are written in processed form and as Parquet files to the `cohort_dir` config parameter. Hydra is used to manage - configuration parameters and logging. + configuration parameters and logging. """ hydra_loguru_init() From a7b614bbb874ceed79075d06eb1b1353da2c053d Mon Sep 17 00:00:00 2001 From: Robin van de Water Date: Mon, 28 Oct 2024 10:31:49 +0100 Subject: [PATCH 36/76] added examples and made aumc package --- AUMC_Example/__init__.py | 0 AUMC_Example/pre_MEDS.py | 65 ++++++++++++++++++++++++---------------- 2 files changed, 40 insertions(+), 25 deletions(-) create mode 100644 AUMC_Example/__init__.py diff --git a/AUMC_Example/__init__.py b/AUMC_Example/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/AUMC_Example/pre_MEDS.py b/AUMC_Example/pre_MEDS.py index 6826cf7e..8341f897 100755 --- a/AUMC_Example/pre_MEDS.py +++ b/AUMC_Example/pre_MEDS.py @@ -28,8 +28,16 @@ def load_raw_aumc_file(fp: Path, **kwargs) -> pl.LazyFrame: Returns: The Polars DataFrame containing the AUMCdb data. 
+ Example: + >>> load_raw_aumc_file(Path("processitems.csv")).collect() + ┌─────────────┬────────┬──────────────────────┬──────────┬───────────┬──────────┐ + │ admissionid ┆ itemid ┆ item ┆ start ┆ stop ┆ duration │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪════════╪══════════════════════╪══════════╪═══════════╪══════════╡ + │ 1 ┆ 1 ┆ "Pulse" ┆ 0 ┆ 100000 ┆ 100000 │ + └─────────────┴────────┴──────────────────────┴──────────┴───────────┴──────────┘ """ - return pl.scan_csv(fp, infer_schema_length=10000000, encoding="utf8-lossy", **kwargs) @@ -79,8 +87,26 @@ def join_and_get_pseudotime_fntr( warning_items: list[str] | None = None, ) -> Callable[[pl.LazyFrame, pl.LazyFrame], pl.LazyFrame]: """Returns a function that joins a dataframe to the `patient` table and adds pseudotimes. - Also raises specified warning strings via the logger for uncertain columns. + All args except `table_name` are taken from the table_preprocessors.yaml. For example, for the + table `numericitems`, we have the following yaml configuration: + numericitems: + offset_col: + - "measuredat" + - "registeredat" + - "updatedat" + pseudotime_col: + - "measuredattime" + - "registeredattime" + - "updatedattime" + output_data_cols: + - "item" + - "value" + - "unit" + - "registeredby" + - "updatedby" + warning_items: + - "How should we deal with `registeredat` and `updatedat`?" Args: table_name: name of the AUMCdb table that should be joined @@ -90,32 +116,21 @@ def join_and_get_pseudotime_fntr( output_data_cols: list of all data columns included in the output warning_items: any warnings noted in the table_preprocessors.yaml - Example: - All args except `table_name` are taken from the table_preprocessors.yaml. For example, for the - table `numericitems`, we have the following yaml configuration: - - - numericitems: - offset_col: - - "measuredat" - - "registeredat" - - "updatedat" - pseudotime_col: - - "measuredattime" - - "registeredattime" - - "updatedattime" - output_data_cols: - - "item" - - "value" - - "unit" - - "registeredby" - - "updatedby" - warning_items: - - "How should we deal with `registeredat` and `updatedat`?" - Returns: Function that expects the raw data stored in the `table_name` table and the joined output of the `process_patient_and_admissions` function. Both inputs are expected to be `pl.DataFrame`s. 
+
+
+    Examples:
+        >>> func = join_and_get_pseudotime_fntr("numericitems", ["measuredat", "registeredat", "updatedat"],
+        ...     ["measuredattime", "registeredattime", "updatedattime"],
+        ...     ["item", "value", "unit", "registeredby", "updatedby"],
+        ...     ["How should we deal with `registeredat` and `updatedat`?"])
+        >>> df = load_raw_aumc_file(Path("numericitems.csv"))
+        >>> raw_admissions_df = load_raw_aumc_file(Path("admissions.csv"))
+        >>> patient_df, link_df = process_patient_and_admissions(raw_admissions_df)
+        >>> processed_df = func(df, patient_df)
+        >>> type(processed_df)
+        <class 'polars.lazyframe.frame.LazyFrame'>
     """

     if output_data_cols is None:

From 329a09a764904d19b0ce621426c43d5753be8c78 Mon Sep 17 00:00:00 2001
From: Robin van de Water
Date: Mon, 28 Oct 2024 10:34:01 +0100
Subject: [PATCH 37/76] Rename to AUMCdb

---
 AUMC_Example/run.sh                           | 63 ------------------
 {AUMC_Example => AUMCdb_Example}/README.md    |  0
 {AUMC_Example => AUMCdb_Example}/__init__.py  |  0
 .../configs/event_configs.yaml                |  0
 .../configs/extract_AUMC.yaml                 |  0
 .../configs/pre_MEDS.yaml                     |  0
 .../configs/table_preprocessors.yaml          |  0
 .../local_parallelism_runner.yaml             |  0
 {AUMC_Example => AUMCdb_Example}/pre_MEDS.py  |  0
 .../slurm_runner.yaml                         |  0
 10 files changed, 63 deletions(-)
 delete mode 100755 AUMC_Example/run.sh
 rename {AUMC_Example => AUMCdb_Example}/README.md (100%)
 rename {AUMC_Example => AUMCdb_Example}/__init__.py (100%)
 rename {AUMC_Example => AUMCdb_Example}/configs/event_configs.yaml (100%)
 rename {AUMC_Example => AUMCdb_Example}/configs/extract_AUMC.yaml (100%)
 rename {AUMC_Example => AUMCdb_Example}/configs/pre_MEDS.yaml (100%)
 rename {AUMC_Example => AUMCdb_Example}/configs/table_preprocessors.yaml (100%)
 rename {AUMC_Example => AUMCdb_Example}/local_parallelism_runner.yaml (100%)
 rename {AUMC_Example => AUMCdb_Example}/pre_MEDS.py (100%)
 rename {AUMC_Example => AUMCdb_Example}/slurm_runner.yaml (100%)

diff --git a/AUMC_Example/run.sh b/AUMC_Example/run.sh
deleted file mode 100755
index c83bfaaf..00000000
--- a/AUMC_Example/run.sh
+++ /dev/null
@@ -1,63 +0,0 @@
-#!/usr/bin/env bash
-
-# This makes the script fail if any internal script fails
-set -e
-
-# Function to display help message
-function display_help() {
-    echo "Usage: $0 <AUMC_RAW_DIR> <AUMC_PREMEDS_DIR> <AUMC_MEDS_DIR>"
-    echo
-    echo "This script processes MIMIC-IV data through several steps, handling raw data conversion,"
-    echo "sharding events, splitting subjects, converting to sharded events, and merging into a MEDS cohort."
-    echo
-    echo "Arguments:"
-    echo "  AUMC_RAW_DIR        Directory containing raw AUMCdb data files."
-    echo "  AUMC_PREMEDS_DIR    Output directory for pre-MEDS data."
-    echo "  AUMC_MEDS_DIR       Output directory for processed MEDS data."
-    echo
-    echo "Options:"
-    echo "  -h, --help          Display this help message and exit."
-    exit 1
-}
-
-echo "Unsetting SLURM_CPU_BIND in case you're running this on a slurm interactive node with slurm parallelism"
-unset SLURM_CPU_BIND
-
-# Check if the first parameter is '-h' or '--help'
-if [[ "$1" == "-h" || "$1" == "--help" ]]; then
-    display_help
-fi
-
-# Check for mandatory parameters
-if [ "$#" -lt 3 ]; then
-    echo "Error: Incorrect number of arguments provided."
-    display_help
-fi
-
-export AUMC_RAW_DIR=$1
-export AUMC_PRE_MEDS_DIR=$2
-export AUMC_MEDS_COHORT_DIR=$3
-shift 3
-
-# TODO: Add wget blocks once testing is validated.
-
-EVENT_CONVERSION_CONFIG_FP="$(pwd)/configs/event_configs.yaml"
-PIPELINE_CONFIG_FP="$(pwd)/configs/extract_AUMC.yaml"
-PRE_MEDS_PY_FP="$(pwd)/pre_MEDS.py"
-
-# We export these variables separately from their assignment so that any errors during assignment are caught.
-export EVENT_CONVERSION_CONFIG_FP
-export PIPELINE_CONFIG_FP
-export PRE_MEDS_PY_FP
-
-
-echo "Running pre-MEDS conversion."
-python "$PRE_MEDS_PY_FP" input_dir="$AUMC_RAW_DIR" cohort_dir="$AUMC_PRE_MEDS_DIR"
-
-if [ -z "$N_WORKERS" ]; then
-    echo "Setting N_WORKERS to 1 to avoid issues with the runners."
-    export N_WORKERS="1"
-fi
-
-echo "Running extraction pipeline."
-MEDS_transform-runner "pipeline_config_fp=$PIPELINE_CONFIG_FP" "$@"
diff --git a/AUMC_Example/README.md b/AUMCdb_Example/README.md
similarity index 100%
rename from AUMC_Example/README.md
rename to AUMCdb_Example/README.md
diff --git a/AUMC_Example/__init__.py b/AUMCdb_Example/__init__.py
similarity index 100%
rename from AUMC_Example/__init__.py
rename to AUMCdb_Example/__init__.py
diff --git a/AUMC_Example/configs/event_configs.yaml b/AUMCdb_Example/configs/event_configs.yaml
similarity index 100%
rename from AUMC_Example/configs/event_configs.yaml
rename to AUMCdb_Example/configs/event_configs.yaml
diff --git a/AUMC_Example/configs/extract_AUMC.yaml b/AUMCdb_Example/configs/extract_AUMC.yaml
similarity index 100%
rename from AUMC_Example/configs/extract_AUMC.yaml
rename to AUMCdb_Example/configs/extract_AUMC.yaml
diff --git a/AUMC_Example/configs/pre_MEDS.yaml b/AUMCdb_Example/configs/pre_MEDS.yaml
similarity index 100%
rename from AUMC_Example/configs/pre_MEDS.yaml
rename to AUMCdb_Example/configs/pre_MEDS.yaml
diff --git a/AUMC_Example/configs/table_preprocessors.yaml b/AUMCdb_Example/configs/table_preprocessors.yaml
similarity index 100%
rename from AUMC_Example/configs/table_preprocessors.yaml
rename to AUMCdb_Example/configs/table_preprocessors.yaml
diff --git a/AUMC_Example/local_parallelism_runner.yaml b/AUMCdb_Example/local_parallelism_runner.yaml
similarity index 100%
rename from AUMC_Example/local_parallelism_runner.yaml
rename to AUMCdb_Example/local_parallelism_runner.yaml
diff --git a/AUMC_Example/pre_MEDS.py b/AUMCdb_Example/pre_MEDS.py
similarity index 100%
rename from AUMC_Example/pre_MEDS.py
rename to AUMCdb_Example/pre_MEDS.py
diff --git a/AUMC_Example/slurm_runner.yaml b/AUMCdb_Example/slurm_runner.yaml
similarity index 100%
rename from AUMC_Example/slurm_runner.yaml
rename to AUMCdb_Example/slurm_runner.yaml

From e52337390c825e81e18681d682f5c52a7cb212c9 Mon Sep 17 00:00:00 2001
From: Robin van de Water
Date: Mon, 28 Oct 2024 10:34:16 +0100
Subject: [PATCH 38/76] rename folder

---
 AUMCdb_Example/run.sh | 63 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100755 AUMCdb_Example/run.sh

diff --git a/AUMCdb_Example/run.sh b/AUMCdb_Example/run.sh
new file mode 100755
index 00000000..c83bfaaf
--- /dev/null
+++ b/AUMCdb_Example/run.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+
+# This makes the script fail if any internal script fails
+set -e
+
+# Function to display help message
+function display_help() {
+    echo "Usage: $0 <AUMC_RAW_DIR> <AUMC_PREMEDS_DIR> <AUMC_MEDS_DIR>"
+    echo
+    echo "This script processes MIMIC-IV data through several steps, handling raw data conversion,"
+    echo "sharding events, splitting subjects, converting to sharded events, and merging into a MEDS cohort."
+    echo
+    echo "Arguments:"
+    echo "  AUMC_RAW_DIR        Directory containing raw AUMCdb data files."
+    echo "  AUMC_PREMEDS_DIR    Output directory for pre-MEDS data."
+    echo "  AUMC_MEDS_DIR       Output directory for processed MEDS data."
+    echo
+    echo "Options:"
+    echo "  -h, --help          Display this help message and exit."
+    exit 1
+}
+
+echo "Unsetting SLURM_CPU_BIND in case you're running this on a slurm interactive node with slurm parallelism"
+unset SLURM_CPU_BIND
+
+# Check if the first parameter is '-h' or '--help'
+if [[ "$1" == "-h" || "$1" == "--help" ]]; then
+    display_help
+fi
+
+# Check for mandatory parameters
+if [ "$#" -lt 3 ]; then
+    echo "Error: Incorrect number of arguments provided."
+    display_help
+fi
+
+export AUMC_RAW_DIR=$1
+export AUMC_PRE_MEDS_DIR=$2
+export AUMC_MEDS_COHORT_DIR=$3
+shift 3
+
+# TODO: Add wget blocks once testing is validated.
+
+EVENT_CONVERSION_CONFIG_FP="$(pwd)/configs/event_configs.yaml"
+PIPELINE_CONFIG_FP="$(pwd)/configs/extract_AUMC.yaml"
+PRE_MEDS_PY_FP="$(pwd)/pre_MEDS.py"
+
+# We export these variables separately from their assignment so that any errors during assignment are caught.
+export EVENT_CONVERSION_CONFIG_FP
+export PIPELINE_CONFIG_FP
+export PRE_MEDS_PY_FP
+
+
+echo "Running pre-MEDS conversion."
+python "$PRE_MEDS_PY_FP" input_dir="$AUMC_RAW_DIR" cohort_dir="$AUMC_PRE_MEDS_DIR"
+
+if [ -z "$N_WORKERS" ]; then
+    echo "Setting N_WORKERS to 1 to avoid issues with the runners."
+    export N_WORKERS="1"
+fi
+
+echo "Running extraction pipeline."
+MEDS_transform-runner "pipeline_config_fp=$PIPELINE_CONFIG_FP" "$@"

From de2435d40960b23930a743b2768c6592565ce407 Mon Sep 17 00:00:00 2001
From: Robin van de Water
Date: Mon, 28 Oct 2024 10:43:09 +0100
Subject: [PATCH 39/76] corrected run

---
 AUMCdb_Example/run.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/AUMCdb_Example/run.sh b/AUMCdb_Example/run.sh
index c83bfaaf..4097ab7c 100755
--- a/AUMCdb_Example/run.sh
+++ b/AUMCdb_Example/run.sh
@@ -7,8 +7,8 @@ set -e
 function display_help() {
     echo "Usage: $0 <AUMC_RAW_DIR> <AUMC_PREMEDS_DIR> <AUMC_MEDS_DIR>"
     echo
-    echo "This script processes MIMIC-IV data through several steps, handling raw data conversion,"
-    echo "sharding events, splitting subjects, converting to sharded events, and merging into a MEDS cohort."
+    echo "This script processes the AUMCdb (AmsterdamUMCdb, Amsterdam University Medical Center database, short version: AUMC) data through several steps,"
+    echo "handling raw data conversion, sharding events, splitting subjects, converting to sharded events, and merging into a MEDS cohort."
     echo
     echo "Arguments:"
     echo "  AUMC_RAW_DIR        Directory containing raw AUMCdb data files."
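For orientation, the sketch below shows one hypothetical way the `AUMCdb_Example/run.sh` script assembled in the two patches above might be invoked once the raw data are downloaded. The directory paths and the `N_WORKERS` value are illustrative placeholders, not part of any patch in this series.

```bash
# Illustrative invocation only -- substitute your own directories for these placeholder paths.
# The script resolves its configs relative to $(pwd), so run it from inside AUMCdb_Example.
cd AUMCdb_Example

# Optional: parallelize the extraction stages; run.sh exports N_WORKERS=1 if this is unset.
export N_WORKERS=4

# Three positional arguments: raw AUMCdb dir, pre-MEDS output dir, MEDS cohort output dir.
# Anything after the third argument is forwarded to MEDS_transform-runner via "$@".
./run.sh /data/AUMCdb/raw /data/AUMCdb/pre_meds /data/AUMCdb/meds_cohort
```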
From 4c48897217402f624c4e79504a6901a831e27ce0 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 29 Oct 2024 16:29:36 -0400 Subject: [PATCH 40/76] Upgraded pre-commit --- .pre-commit-config.yaml | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 61bde520..4f12be1f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,7 +5,7 @@ exclude: "docs/index.md|MIMIC-IV_Example/README.md|eICU_Example/README.md" repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v5.0.0 hooks: # list of supported hooks: https://pre-commit.com/hooks.html - id: trailing-whitespace @@ -22,27 +22,27 @@ repos: # python code formatting - repo: https://github.com/psf/black - rev: 23.7.0 + rev: 24.10.0 hooks: - id: black args: [--line-length, "110"] # python import sorting - repo: https://github.com/PyCQA/isort - rev: 5.12.0 + rev: 5.13.2 hooks: - id: isort args: ["--profile", "black", "--filter-files", "-o", "wandb"] - repo: https://github.com/PyCQA/autoflake - rev: v2.2.0 + rev: v2.3.1 hooks: - id: autoflake args: [--in-place, --remove-all-unused-imports] # python upgrading syntax to newer version - repo: https://github.com/asottile/pyupgrade - rev: v3.10.1 + rev: v3.19.0 hooks: - id: pyupgrade args: [--py311-plus] @@ -56,7 +56,7 @@ repos: # python check (PEP8), programming errors and code complexity - repo: https://github.com/PyCQA/flake8 - rev: 6.1.0 + rev: 7.1.1 hooks: - id: flake8 args: @@ -73,7 +73,7 @@ repos: # yaml formatting - repo: https://github.com/pre-commit/mirrors-prettier - rev: v3.0.3 + rev: v4.0.0-alpha.8 hooks: - id: prettier types: [yaml] @@ -81,13 +81,13 @@ repos: # shell scripts linter - repo: https://github.com/shellcheck-py/shellcheck-py - rev: v0.9.0.5 + rev: v0.10.0.1 hooks: - id: shellcheck # md formatting - repo: https://github.com/executablebooks/mdformat - rev: 0.7.17 + rev: 0.7.18 hooks: - id: mdformat args: ["--number"] @@ -104,7 +104,7 @@ repos: # word spelling linter - repo: https://github.com/codespell-project/codespell - rev: v2.2.5 + rev: v2.3.0 hooks: - id: codespell args: @@ -113,17 +113,21 @@ repos: # jupyter notebook cell output clearing - repo: https://github.com/kynan/nbstripout - rev: 0.6.1 + rev: 0.7.1 hooks: - id: nbstripout # jupyter notebook linting - repo: https://github.com/nbQA-dev/nbQA - rev: 1.7.0 + rev: 1.8.7 hooks: - id: nbqa-black args: ["--line-length=110"] - id: nbqa-isort args: ["--profile=black"] - id: nbqa-flake8 - args: ["--extend-ignore=E203,E402,E501,F401,F841", "--exclude=logs/*,data/*"] + args: + [ + "--extend-ignore=E203,E402,E501,F401,F841", + "--exclude=logs/*,data/*", + ] From 6fb2540663d38d83ac0f130f4892a15c4f85b475 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 29 Oct 2024 16:32:48 -0400 Subject: [PATCH 41/76] Added doctests for getting the script from the stage. --- src/MEDS_transforms/runner.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/MEDS_transforms/runner.py b/src/MEDS_transforms/runner.py index e99e014a..e10ac5bf 100755 --- a/src/MEDS_transforms/runner.py +++ b/src/MEDS_transforms/runner.py @@ -33,6 +33,20 @@ def get_script_from_name(stage_name: str) -> str | None: Returns: The script name for the given stage name. 
+ + Examples: + >>> get_script_from_name("shard_events") + 'MEDS_extract-shard_events' + >>> get_script_from_name("fit_vocabulary_indices") + 'MEDS_transform-fit_vocabulary_indices' + >>> get_script_from_name("filter_subjects") + 'MEDS_transform-filter_subjects' + >>> get_script_from_name("reorder_measurements") + 'MEDS_transform-reorder_measurements' + >>> get_script_from_name("nonexistent_stage") + Traceback (most recent call last): + ... + ValueError: Could not find a script for stage nonexistent_stage. """ try: From fac58c4aa18b325a6ef432c1cc7bc0183369ea3b Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 29 Oct 2024 16:46:19 -0400 Subject: [PATCH 42/76] Added doctests for getting the parallelization arguments --- src/MEDS_transforms/runner.py | 47 ++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/src/MEDS_transforms/runner.py b/src/MEDS_transforms/runner.py index e10ac5bf..3acd23ba 100755 --- a/src/MEDS_transforms/runner.py +++ b/src/MEDS_transforms/runner.py @@ -68,7 +68,41 @@ def get_script_from_name(stage_name: str) -> str | None: def get_parallelization_args( parallelization_cfg: dict | DictConfig | None, default_parallelization_cfg: dict | DictConfig ) -> list[str]: - """Gets the parallelization args.""" + """Extracts the specific parallelization arguments given the default and stage-specific configurations. + + Args: + parallelization_cfg: The stage-specific parallelization configuration. + default_parallelization_cfg: The default parallelization configuration. + + Returns: + A list of command-line arguments for parallelization. + + Examples: + >>> get_parallelization_args({}, {}) + [] + >>> get_parallelization_args(None, {"n_workers": 4}) + [] + >>> get_parallelization_args({"n_workers": 2, "launcher_params": 'foo'}, {}) + Traceback (most recent call last): + ... + ValueError: If launcher_params is provided, launcher must also be provided. + >>> get_parallelization_args({"n_workers": 2}, {}) + ['--multirun', 'worker="range(0,2)"'] + >>> get_parallelization_args( + ... {"launcher": "slurm"}, + ... {"n_workers": 3, "launcher": "joblib"} + ... ) + ['--multirun', 'worker="range(0,3)"', 'hydra/launcher=slurm'] + >>> get_parallelization_args( + ... {"n_workers": 2, "launcher": "joblib"}, + ... {"n_workers": 5, "launcher_params": {"foo": "bar"}}, + ... ) + ['--multirun', 'worker="range(0,2)"', 'hydra/launcher=joblib', 'hydra.launcher.foo=bar'] + >>> get_parallelization_args( + ... {"n_workers": 5, "launcher_params": {"biz": "baz"}, "launcher": "slurm"}, {} + ... ) + ['--multirun', 'worker="range(0,5)"', 'hydra/launcher=slurm', 'hydra.launcher.biz=baz'] + """ if parallelization_cfg is None: return [] @@ -96,11 +130,11 @@ def get_parallelization_args( launcher = None if launcher is None: - return parallelization_args - if "launcher_params" in parallelization_cfg: raise ValueError("If launcher_params is provided, launcher must also be provided.") + return parallelization_args + parallelization_args.append(f"hydra/launcher={launcher}") if "launcher_params" in parallelization_cfg: @@ -116,7 +150,12 @@ def get_parallelization_args( return parallelization_args -def run_stage(cfg: DictConfig, stage_name: str, default_parallelization_cfg: dict | DictConfig | None = None): +def run_stage( + cfg: DictConfig, + stage_name: str, + default_parallelization_cfg: dict | DictConfig | None = None, + runner_fn: callable = subprocess.run, # For dependency injection +): """Runs a single stage of the pipeline. 
Args: From c905f2ae06a7546e8b70840c24c0d1831d49de74 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Wed, 30 Oct 2024 10:21:05 -0400 Subject: [PATCH 43/76] Added tests to run command. --- src/MEDS_transforms/runner.py | 84 ++++++++++++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 1 deletion(-) diff --git a/src/MEDS_transforms/runner.py b/src/MEDS_transforms/runner.py index 3acd23ba..434e09a7 100755 --- a/src/MEDS_transforms/runner.py +++ b/src/MEDS_transforms/runner.py @@ -161,6 +161,57 @@ def run_stage( Args: cfg: The configuration for the entire pipeline. stage_name: The name of the stage to run. + + Raises: + ValueError: If the stage fails to run. + + Examples: + >>> def fake_shell_succeed(cmd, shell, capture_output): + ... print(cmd) + ... return subprocess.CompletedProcess(args=cmd, returncode=0, stdout=b"", stderr=b"") + >>> def fake_shell_fail(cmd, shell, capture_output): + ... print(cmd) + ... return subprocess.CompletedProcess(args=cmd, returncode=1, stdout=b"", stderr=b"") + >>> cfg = OmegaConf.create({ + ... "pipeline_config_fp": "pipeline_config.yaml", + ... "do_profile": False, + ... "_local_pipeline_config": { + ... "stage_configs": { + ... "shard_events": {}, + ... "fit_vocabulary_indices": {"_script": "foobar"}, + ... }, + ... }, + ... "_stage_runners": { + ... "shard_events": {"_script": "not used"}, + ... "fit_vocabulary_indices": {}, + ... "baz": {"script": "baz_script"}, + ... }, + ... }) + >>> run_stage(cfg, "shard_events", runner_fn=fake_shell_succeed) # doctest: +NORMALIZE_WHITESPACE + MEDS_extract-shard_events --config-dir=... --config-name=pipeline_config + 'hydra.searchpath=[pkg://MEDS_transforms.configs]' stage=shard_events + >>> run_stage( + ... cfg, "fit_vocabulary_indices", runner_fn=fake_shell_succeed + ... ) # doctest: +NORMALIZE_WHITESPACE + foobar --config-dir=... --config-name=pipeline_config + 'hydra.searchpath=[pkg://MEDS_transforms.configs]' stage=fit_vocabulary_indices + >>> run_stage(cfg, "baz", runner_fn=fake_shell_succeed) # doctest: +NORMALIZE_WHITESPACE + baz_script --config-dir=... --config-name=pipeline_config + 'hydra.searchpath=[pkg://MEDS_transforms.configs]' stage=baz + >>> cfg.do_profile = True + >>> run_stage(cfg, "baz", runner_fn=fake_shell_succeed) # doctest: +NORMALIZE_WHITESPACE + baz_script --config-dir=... --config-name=pipeline_config + 'hydra.searchpath=[pkg://MEDS_transforms.configs]' stage=baz + ++hydra.callbacks.profiler._target_=hydra_profiler.profiler.ProfilerCallback + >>> cfg._stage_runners.baz.parallelize = {"n_workers": 2} + >>> cfg.do_profile = False + >>> run_stage(cfg, "baz", runner_fn=fake_shell_succeed) # doctest: +NORMALIZE_WHITESPACE + baz_script --config-dir=... --config-name=pipeline_config --multirun + 'hydra.searchpath=[pkg://MEDS_transforms.configs]' stage=baz worker="range(0,2)" + >>> run_stage(cfg, "baz", runner_fn=fake_shell_fail) + Traceback (most recent call last): + ... + ValueError: Stage baz failed via ... 
""" if default_parallelization_cfg is None: @@ -200,7 +251,7 @@ def run_stage( full_cmd = " ".join(command_parts) logger.info(f"Running command: {full_cmd}") - command_out = subprocess.run(full_cmd, shell=True, capture_output=True) + command_out = runner_fn(full_cmd, shell=True, capture_output=True) # https://stackoverflow.com/questions/21953835/run-subprocess-and-print-output-to-logging # https://loguru.readthedocs.io/en/stable/api/logger.html#loguru._logger.Logger.parse @@ -282,6 +333,37 @@ def main(cfg: DictConfig): def load_yaml_file(path: str | None) -> dict | DictConfig: + """Loads a YAML file as an OmegaConf object. + + Args: + path: The path to the YAML file. + + Returns: + The OmegaConf object representing the YAML file, or None if no path is provided. + + Raises: + FileNotFoundError: If the file does not exist. + + Examples: + >>> load_yaml_file(None) + {} + >>> load_yaml_file("nonexistent_file.yaml") + Traceback (most recent call last): + ... + FileNotFoundError: File nonexistent_file.yaml does not exist. + >>> import tempfile + >>> with tempfile.NamedTemporaryFile(suffix=".yaml") as f: + ... _ = f.write(b"foo: bar") + ... f.flush() + ... load_yaml_file(f.name) + {'foo': 'bar'} + >>> with tempfile.NamedTemporaryFile(suffix=".yaml") as f: + ... cfg = OmegaConf.create({"foo": "bar"}) + ... OmegaConf.save(cfg, f.name) + ... load_yaml_file(f.name) + {'foo': 'bar'} + """ + if not path: return {} From ba0c559cfb05443decfe9cf3cd0e05ad5cc48057 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Wed, 30 Oct 2024 10:24:06 -0400 Subject: [PATCH 44/76] Added a no-cover case --- src/MEDS_transforms/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/MEDS_transforms/runner.py b/src/MEDS_transforms/runner.py index 434e09a7..037db005 100755 --- a/src/MEDS_transforms/runner.py +++ b/src/MEDS_transforms/runner.py @@ -297,7 +297,7 @@ def main(cfg: DictConfig): log_dir = Path(cfg.log_dir) - if cfg.get("do_profile", False): + if cfg.get("do_profile", False): # pragma: no cover try: import hydra_profiler # noqa: F401 except ImportError as e: From 33efafb889b13f1f7aeba9903fec5e830692aade Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Wed, 30 Oct 2024 10:26:59 -0400 Subject: [PATCH 45/76] Corrected typo in reshard to split and added a no-cover case. --- src/MEDS_transforms/reshard_to_split.py | 2 +- tests/MEDS_Transforms/test_reshard_to_split.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/MEDS_transforms/reshard_to_split.py b/src/MEDS_transforms/reshard_to_split.py index deccc49f..650aa57e 100644 --- a/src/MEDS_transforms/reshard_to_split.py +++ b/src/MEDS_transforms/reshard_to_split.py @@ -112,7 +112,7 @@ def main(cfg: DictConfig): max_iters = cfg.get("max_iters", 10) iters = 0 - while not valid_json_file(shards_fp) and iters < max_iters: + while not valid_json_file(shards_fp) and iters < max_iters: # pragma: no cover logger.info(f"Waiting to begin until shards map is written. Iteration {iters}/{max_iters}...") time.sleep(cfg.polling_time) iters += 1 diff --git a/tests/MEDS_Transforms/test_reshard_to_split.py b/tests/MEDS_Transforms/test_reshard_to_split.py index d0094a96..bd6d1f29 100644 --- a/tests/MEDS_Transforms/test_reshard_to_split.py +++ b/tests/MEDS_Transforms/test_reshard_to_split.py @@ -4,7 +4,6 @@ scripts. 
""" - from meds import subject_id_field from tests.MEDS_Transforms import RESHARD_TO_SPLIT_SCRIPT @@ -207,7 +206,7 @@ def test_reshard_to_split(): single_stage_transform_tester( transform_script=RESHARD_TO_SPLIT_SCRIPT, stage_name="reshard_to_split", - transform_stage_kwargs={"n_patients_per_shard": 2, "+train_only": True}, + transform_stage_kwargs={"n_subjects_per_shard": 2, "+train_only": True}, want_data=WANT_SHARDS, input_shards=IN_SHARDS, input_shards_map=IN_SHARDS_MAP, From d35f08aefc90818a76a23085f46145d2d5eb8bad Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Wed, 30 Oct 2024 10:32:49 -0400 Subject: [PATCH 46/76] Added some tests for some utils and some no-cover cases. --- src/MEDS_transforms/utils.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/MEDS_transforms/utils.py b/src/MEDS_transforms/utils.py index 871b90a0..ebffb03b 100644 --- a/src/MEDS_transforms/utils.py +++ b/src/MEDS_transforms/utils.py @@ -110,7 +110,21 @@ def get_package_version() -> str: def get_script_docstring(filename: str | None = None) -> str: - """Returns the docstring of the main function of the script from which this function was called.""" + """Returns the docstring of the main function of the calling script or the file specified. + + Args: + filename: The name of the file to get the docstring from. If None, the calling script's docstring is + returned. + + Returns: + str: The docstring of the main function of the specified file, if it exists. + + Examples: + >>> get_script_docstring() + '' + >>> get_script_docstring("reshard_to_split") + 'Re-shard a MEDS cohort to in a manner that subdivides subject splits.' + """ if filename is not None: main_module = importlib.import_module(f"MEDS_transforms.{filename}") @@ -129,7 +143,7 @@ def current_script_name() -> str: main_func = getattr(main_module, "main", None) if main_func and callable(main_func): func_module = main_func.__module__ - if func_module == "__main__": + if func_module == "__main__": # pragma: no cover return Path(sys.argv[0]).stem else: return func_module.split(".")[-1] From db0d4f3cfe58a98074a0b677eac57d51ffa392e0 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Wed, 30 Oct 2024 10:33:49 -0400 Subject: [PATCH 47/76] Added a no-cover case for a reduce --- src/MEDS_transforms/aggregate_code_metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/MEDS_transforms/aggregate_code_metadata.py b/src/MEDS_transforms/aggregate_code_metadata.py index 1ac8cdfd..c235d2c7 100755 --- a/src/MEDS_transforms/aggregate_code_metadata.py +++ b/src/MEDS_transforms/aggregate_code_metadata.py @@ -695,7 +695,7 @@ def run_map_reduce(cfg: DictConfig): logger.info("Starting reduction process") - while not all(is_complete_parquet_file(fp) for fp in all_out_fps): + while not all(is_complete_parquet_file(fp) for fp in all_out_fps): # pragma: no cover logger.info("Waiting to begin reduction for all files to be written...") time.sleep(cfg.polling_time) From d77c14252136d12adb4d53d5c9968b7096b811d4 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Wed, 30 Oct 2024 10:39:49 -0400 Subject: [PATCH 48/76] added some error cases tests for add time derived measurements --- .../add_time_derived_measurements.py | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/src/MEDS_transforms/transforms/add_time_derived_measurements.py b/src/MEDS_transforms/transforms/add_time_derived_measurements.py index d1d3d1e1..38adae85 100644 --- 
a/src/MEDS_transforms/transforms/add_time_derived_measurements.py +++ b/src/MEDS_transforms/transforms/add_time_derived_measurements.py @@ -330,6 +330,26 @@ def time_of_day_fntr(cfg: DictConfig) -> Callable[[pl.DataFrame], pl.DataFrame]: │ 2 ┆ 2023-01-03 12:00:00 ┆ time_of_day//[12,18) │ │ 3 ┆ 2022-01-01 18:00:00 ┆ time_of_day//[18,24) │ └────────────┴─────────────────────┴──────────────────────┘ + >>> time_of_day_fntr(DictConfig({"endpoints": []})) + Traceback (most recent call last): + ... + ValueError: The 'endpoints' key must contain at least one endpoint for time of day categories. + >>> time_of_day_fntr(DictConfig({"endpoints": [6, 12, 36]})) + Traceback (most recent call last): + ... + ValueError: All endpoints must be between 0 and 24 inclusive. Got: [6, 12, 36] + >>> time_of_day_fntr(DictConfig({"endpoints": [6, 1.2]})) + Traceback (most recent call last): + ... + ValueError: All endpoints must be integer, whole-hour boundaries, but got: [6, 1.2] + >>> time_of_day_fntr(DictConfig({"endpoints": [6, 6]})) + Traceback (most recent call last): + ... + ValueError: All endpoints must be unique. Got: [6, 6] + >>> time_of_day_fntr(DictConfig({"endpoints": [6, 12, 10]})) + Traceback (most recent call last): + ... + ValueError: All endpoints must be in sorted order. Got: [6, 12, 10] """ if not cfg.endpoints: raise ValueError("The 'endpoints' key must contain at least one endpoint for time of day categories.") @@ -337,8 +357,10 @@ def time_of_day_fntr(cfg: DictConfig) -> Callable[[pl.DataFrame], pl.DataFrame]: raise ValueError(f"All endpoints must be between 0 and 24 inclusive. Got: {cfg.endpoints}") if not all(isinstance(endpoint, int) for endpoint in cfg.endpoints): raise ValueError(f"All endpoints must be integer, whole-hour boundaries, but got: {cfg.endpoints}") - if len(cfg.endpoints) != len(set(cfg.endpoints)) or cfg.endpoints != sorted(cfg.endpoints): - raise ValueError(f"All endpoints must be unique and in sorted order. Got: {cfg.endpoints}") + if len(cfg.endpoints) != len(set(cfg.endpoints)): + raise ValueError(f"All endpoints must be unique. Got: {cfg.endpoints}") + if cfg.endpoints != sorted(cfg.endpoints): + raise ValueError(f"All endpoints must be in sorted order. Got: {cfg.endpoints}") def fn(df: pl.LazyFrame) -> pl.LazyFrame: hour = pl.col("time").dt.hour() From d40ee04f872c4a0c07eec5ea07f6e631aa6c3335 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Wed, 30 Oct 2024 10:41:48 -0400 Subject: [PATCH 49/76] added some error cases tests for add time derived measurements --- .../add_time_derived_measurements.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/MEDS_transforms/transforms/add_time_derived_measurements.py b/src/MEDS_transforms/transforms/add_time_derived_measurements.py index 38adae85..a785336c 100644 --- a/src/MEDS_transforms/transforms/add_time_derived_measurements.py +++ b/src/MEDS_transforms/transforms/add_time_derived_measurements.py @@ -387,6 +387,27 @@ def tod_code(start: int, end: int) -> str: def add_time_derived_measurements_fntr(stage_cfg: DictConfig) -> Callable[[pl.LazyFrame], pl.LazyFrame]: + """Adds all requested time-derived measurements to a DataFrame. + + Args: + stage_cfg: The configuration for the time-derived measurements. Recognized time derived functors + include the following keys: + - "age": The configuration for the age function. + - "time_of_day": The configuration for the time of day function. + + Returns: + A function that adds all requested time-derived measurements to a DataFrame. 
+ + Raises: + ValueError: If an unrecognized time-derived measurement is requested. + + Examples: + >>> add_time_derived_measurements_fntr(DictConfig({"buzz": {}})) + Traceback (most recent call last): + ... + ValueError: Unknown time-derived measurement: buzz + """ + compute_fns = [] # We use the raw stages object as the induced `stage_cfg` has extra properties like the input and output # directories. From fab36fe3aff3e1f8ae0e7e925d53ab0569df159e Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Wed, 30 Oct 2024 10:43:41 -0400 Subject: [PATCH 50/76] Added some more no-cover cases for logging checks. --- src/MEDS_transforms/transforms/extract_values.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/MEDS_transforms/transforms/extract_values.py b/src/MEDS_transforms/transforms/extract_values.py index f2335504..033dcc6f 100644 --- a/src/MEDS_transforms/transforms/extract_values.py +++ b/src/MEDS_transforms/transforms/extract_values.py @@ -95,18 +95,18 @@ def extract_values_fntr(stage_cfg: DictConfig) -> Callable[[pl.LazyFrame], pl.La match out_col_n: case str() if out_col_n in MANDATORY_TYPES: expr = expr.cast(MANDATORY_TYPES[out_col_n]) - if out_col_n == subject_id_field: + if out_col_n == subject_id_field: # pragma: no cover logger.warning( f"You should almost CERTAINLY not be extracting {subject_id_field} as a value." ) - if out_col_n == "time": + if out_col_n == "time": # pragma: no cover logger.warning("Warning: `time` is being extracted post-hoc!") - case str() if out_col_n in DEPRECATED_NAMES: + case str() if out_col_n in DEPRECATED_NAMES: # pragma: no cover logger.warning( f"Deprecated column name: {out_col_n} -> {DEPRECATED_NAMES[out_col_n]}. " "This column name will not be re-typed." ) - case str(): + case str(): # pragma: no cover pass case _: raise ValueError(f"Invalid column name: {out_col_n}") From 0ecf8b1984e21a329b21b6d25e23ae8b69c46609 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Wed, 30 Oct 2024 10:48:53 -0400 Subject: [PATCH 51/76] Added a row-index collission check and a note about columns retained. --- src/MEDS_transforms/transforms/normalization.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/MEDS_transforms/transforms/normalization.py b/src/MEDS_transforms/transforms/normalization.py index 5109861a..9e5e9262 100644 --- a/src/MEDS_transforms/transforms/normalization.py +++ b/src/MEDS_transforms/transforms/normalization.py @@ -165,6 +165,20 @@ def normalize( │ 2 ┆ 2022-10-02 00:00:00 ┆ 2 ┆ null │ │ 3 ┆ 2022-10-02 00:00:00 ┆ 5 ┆ null │ └────────────┴─────────────────────┴──────┴───────────────┘ + + Note that while this function is robust to the inclusion of the default row index column name, it + doesn't retain any extra columns after the operation. If you want to retain the row index, you should + file a GitHub issue with this request and we can add it in a future release. 
+ >>> MEDS_df = MEDS_df.with_columns(pl.lit(1).alias("_row_idx"), pl.lit(2).alias("foobar")) + >>> normalize(MEDS_df.head(1).lazy(), code_metadata, ["unit"]).collect() + shape: (1, 4) + ┌────────────┬─────────────────────┬──────┬───────────────┐ + │ subject_id ┆ time ┆ code ┆ numeric_value │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 ┆ f64 │ + ╞════════════╪═════════════════════╪══════╪═══════════════╡ + │ 1 ┆ 2021-01-01 00:00:00 ┆ 0 ┆ -2.0 │ + └────────────┴─────────────────────┴──────┴───────────────┘ """ if code_modifiers is None: From 0be11ca43d94f1ea80dd1a9a7c5ec8dfcdd7279e Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Wed, 30 Oct 2024 10:51:26 -0400 Subject: [PATCH 52/76] Added a null-case test to occulde outliers --- .../transforms/occlude_outliers.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/MEDS_transforms/transforms/occlude_outliers.py b/src/MEDS_transforms/transforms/occlude_outliers.py index d65ecd59..b670f9e7 100644 --- a/src/MEDS_transforms/transforms/occlude_outliers.py +++ b/src/MEDS_transforms/transforms/occlude_outliers.py @@ -54,6 +54,22 @@ def occlude_outliers_fntr( │ 2 ┆ A ┆ 2 ┆ null ┆ false │ │ 2 ┆ C ┆ 2 ┆ 1.0 ┆ true │ └────────────┴──────┴───────────┴───────────────┴─────────────────────────┘ + + If no standard deviation cutoff is provided, the function should return the input DataFrame unchanged: + >>> stage_cfg = DictConfig({}) + >>> fn = occlude_outliers_fntr(stage_cfg, code_metadata_df, ["modifier1"]) + >>> fn(data).collect() + shape: (4, 4) + ┌────────────┬──────┬───────────┬───────────────┐ + │ subject_id ┆ code ┆ modifier1 ┆ numeric_value │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ f64 │ + ╞════════════╪══════╪═══════════╪═══════════════╡ + │ 1 ┆ A ┆ 1 ┆ 15.0 │ + │ 1 ┆ B ┆ 1 ┆ 16.0 │ + │ 2 ┆ A ┆ 2 ┆ 3.9 │ + │ 2 ┆ C ┆ 2 ┆ 1.0 │ + └────────────┴──────┴───────────┴───────────────┘ """ stddev_cutoff = stage_cfg.get("stddev_cutoff", None) From b35798894df6bdf931d939f0e705762b545686d9 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Wed, 30 Oct 2024 11:55:45 -0400 Subject: [PATCH 53/76] Added some error case tests and removed an impossible line. --- src/MEDS_transforms/transforms/tensorization.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/MEDS_transforms/transforms/tensorization.py b/src/MEDS_transforms/transforms/tensorization.py index 5fd73899..1c780663 100644 --- a/src/MEDS_transforms/transforms/tensorization.py +++ b/src/MEDS_transforms/transforms/tensorization.py @@ -75,6 +75,19 @@ def convert_to_NRT(df: pl.LazyFrame) -> JointNestedRaggedTensorDict: time_delta_days [[nan 12.] [nan 0.]] + + With the wrong number of time delta columns, it doesn't work: + >>> nrt = convert_to_NRT(df.drop("time_delta_days").lazy()) + Traceback (most recent call last): + ... + ValueError: Expected at least one time delta column, found none + >>> nrt = convert_to_NRT( + ... df.with_columns(pl.lit([1, 2]).alias("time_delta_hours")).lazy() + ... ) # doctest: +NORMALIZE_WHITESPACE + Traceback (most recent call last): + ... + ValueError: Expected exactly one time delta column, found columns: + ['time_delta_days', 'time_delta_hours'] """ # There should only be one time delta column, but this ensures we catch it regardless of the unit of time @@ -94,10 +107,6 @@ def convert_to_NRT(df: pl.LazyFrame) -> JointNestedRaggedTensorDict: logger.warning("All columns are empty. 
Returning an empty tensor dict.") return JointNestedRaggedTensorDict({}) - for k, v in tensors_dict.items(): - if not v: - raise ValueError(f"Column {k} is empty") - return JointNestedRaggedTensorDict(tensors_dict) From 8dfeb7d0b24ae264be21755d4ecd0b0a874fc600 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Wed, 30 Oct 2024 11:59:53 -0400 Subject: [PATCH 54/76] Corrected a test typo that was checking the wrong error. --- tests/MEDS_Transforms/test_tokenization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/MEDS_Transforms/test_tokenization.py b/tests/MEDS_Transforms/test_tokenization.py index 4a416c83..802ada2d 100644 --- a/tests/MEDS_Transforms/test_tokenization.py +++ b/tests/MEDS_Transforms/test_tokenization.py @@ -296,7 +296,7 @@ def test_tokenization(): single_stage_transform_tester( transform_script=TOKENIZATION_SCRIPT, stage_name="tokenization", - transform_stage_kwargs={"train_only": True}, + transform_stage_kwargs={"++train_only": True}, input_shards=NORMALIZED_SHARDS, want_data={**WANT_SCHEMAS, **WANT_EVENT_SEQS}, should_error=True, From d8be8984c4c21f1ad368fdeeb530b032ab172aad Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Wed, 30 Oct 2024 12:03:12 -0400 Subject: [PATCH 55/76] Added some no-cover cases. --- src/MEDS_transforms/extract/extract_code_metadata.py | 4 ++-- src/MEDS_transforms/extract/finalize_MEDS_metadata.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/MEDS_transforms/extract/extract_code_metadata.py b/src/MEDS_transforms/extract/extract_code_metadata.py index 3460cfbd..279ce513 100644 --- a/src/MEDS_transforms/extract/extract_code_metadata.py +++ b/src/MEDS_transforms/extract/extract_code_metadata.py @@ -397,13 +397,13 @@ def main(cfg: DictConfig): logger.info("Extracted metadata for all events. Merging.") - if cfg.worker != 0: + if cfg.worker != 0: # pragma: no cover logger.info("Code metadata extraction completed. Exiting") return logger.info("Starting reduction process") - while not all(fp.exists() for fp in all_out_fps): + while not all(fp.exists() for fp in all_out_fps): # pragma: no cover missing_files_str = "\n".join(f" - {str(fp.resolve())}" for fp in all_out_fps if not fp.exists()) logger.info("Waiting to begin reduction for all files to be written...\n" f"{missing_files_str}") time.sleep(cfg.polling_time) diff --git a/src/MEDS_transforms/extract/finalize_MEDS_metadata.py b/src/MEDS_transforms/extract/finalize_MEDS_metadata.py index a0201803..81f840a4 100755 --- a/src/MEDS_transforms/extract/finalize_MEDS_metadata.py +++ b/src/MEDS_transforms/extract/finalize_MEDS_metadata.py @@ -147,7 +147,7 @@ def main(cfg: DictConfig): etl_metadata.dataset_version: The version of the dataset being extracted. """ - if cfg.worker != 0: + if cfg.worker != 0: # pragma: no cover logger.info("Non-zero worker found in reduce-only stage. 
Exiting") return @@ -219,7 +219,7 @@ def main(cfg: DictConfig): for split, cnt in seen_splits.items(): if cnt: logger.info(f"Split {split} has {cnt} subjects") - else: + else: # pragma: no cover logger.warning(f"Split {split} not found in shards map") subject_splits_tbl = pa.Table.from_pylist(subject_splits, schema=subject_split_schema) From 42d4d9df57a0741e996a28a846e422ca424b4ddf Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Wed, 30 Oct 2024 14:12:05 -0400 Subject: [PATCH 56/76] Added edge case tests to rwlock_wrap --- src/MEDS_transforms/mapreduce/utils.py | 101 +++++++++++++++++++++++-- 1 file changed, 95 insertions(+), 6 deletions(-) diff --git a/src/MEDS_transforms/mapreduce/utils.py b/src/MEDS_transforms/mapreduce/utils.py index 716ddc09..f203e083 100644 --- a/src/MEDS_transforms/mapreduce/utils.py +++ b/src/MEDS_transforms/mapreduce/utils.py @@ -134,6 +134,7 @@ def rwlock_wrap( compute_fn: Callable[[DF_T], DF_T], do_overwrite: bool = False, out_fp_checker: Callable[[Path], bool] = default_file_checker, + register_lock_fn: Callable[[Path], tuple[datetime, Path]] = register_lock, # For dependency injection ) -> bool: """Wrap a series of file-in file-out map transformations on a dataframe with caching and locking. @@ -161,6 +162,8 @@ def rwlock_wrap( >>> import polars as pl >>> import tempfile >>> directory = tempfile.TemporaryDirectory() + >>> read_fn = pl.read_csv + >>> write_fn = pl.DataFrame.write_csv >>> root = Path(directory.name) >>> # For this example we'll use a simple CSV file, but in practice we *strongly* recommend using >>> # Parquet files for performance reasons. @@ -168,9 +171,8 @@ def rwlock_wrap( >>> out_fp = root / "output.csv" >>> in_df = pl.DataFrame({"a": [1, 3, 3], "b": [2, 4, 5], "c": [3, -1, 6]}) >>> in_df.write_csv(in_fp) - >>> read_fn = pl.read_csv - >>> write_fn = pl.DataFrame.write_csv - >>> compute_fn = lambda df: df.with_columns(pl.col("c") * 2).filter(pl.col("c") > 4) + >>> def compute_fn(df: pl.DataFrame) -> pl.DataFrame: + ... return df.with_columns(pl.col("c") * 2).filter(pl.col("c") > 4) >>> result_computed = rwlock_wrap(in_fp, out_fp, read_fn, write_fn, compute_fn) >>> assert result_computed >>> print(out_fp.read_text()) @@ -178,21 +180,108 @@ def rwlock_wrap( 1,2,6 3,5,12 + >>> in_df_2 = pl.DataFrame({"a": [1], "b": [3], "c": [-1]}) + >>> in_fp_2 = root / "input_2.csv" + >>> in_df_2.write_csv(in_fp_2) + >>> compute_fn = lambda df: df + >>> result_computed = rwlock_wrap(in_fp_2, out_fp, read_fn, write_fn, compute_fn, do_overwrite=True) + >>> assert result_computed + >>> print(out_fp.read_text()) + a,b,c + 1,3,-1 + >>> out_fp.unlink() >>> compute_fn = lambda df: df.with_columns(pl.col("c") * 2).filter(pl.col("d") > 4) >>> rwlock_wrap(in_fp, out_fp, read_fn, write_fn, compute_fn) Traceback (most recent call last): ... polars.exceptions.ColumnNotFoundError: unable to find column "d"; valid columns: ["a", "b", "c"] + >>> assert not out_fp.is_file() # Out file should not be created when the process crashes + + If we check the locks during computation, one should be present >>> cache_directory = root / f".output.csv_cache" >>> lock_dir = cache_directory / "locks" - >>> assert not list(lock_dir.iterdir()) + >>> assert not list(lock_dir.iterdir()), "Lock dir starts empty" >>> def lock_dir_checker_fn(df: pl.DataFrame) -> pl.DataFrame: ... print(f"Lock dir empty? {not (list(lock_dir.iterdir()))}") ... return df >>> result_computed = rwlock_wrap(in_fp, out_fp, read_fn, write_fn, lock_dir_checker_fn) Lock dir empty? 
False - >>> assert result_computed + >>> result_computed + True + >>> assert not list(lock_dir.iterdir()), "Lock dir should be empty again" + >>> out_fp.unlink() + + If we register a lock before we run, the process won't actually compute + >>> compute_fn = lambda df: df + >>> lock_time, lock_fp = register_lock(cache_directory) + >>> result_computed = rwlock_wrap(in_fp, out_fp, read_fn, write_fn, compute_fn) + >>> result_computed + False + >>> len(list(lock_dir.iterdir())) # The lock file at lock_fp should still exist + 1 + >>> lock_fp.unlink() + >>> assert not list(lock_dir.iterdir()), "Lock dir should be empty again" + + If two processes collide when writing locks during lock registration before reading, the one that + writes a lock with an earlier timestamp wins and the later one does not read: + >>> def read_fn_and_print(in_fp: Path) -> pl.DataFrame: + ... print("Reading!") + ... return read_fn(in_fp) + >>> def register_lock_with_conflict_fntr(early: bool) -> callable: + ... fake_lock_time = datetime(2021, 1, 1, 0, 0, 0) if early else datetime(5000, 1, 2, 0, 0, 0) + ... def fn(cache_directory: Path) -> tuple[datetime, Path]: + ... lock_fp = cache_directory / "locks" / f"{fake_lock_time.strftime(LOCK_TIME_FMT)}.json" + ... lock_fp.write_text(json.dumps({"start": fake_lock_time.strftime(LOCK_TIME_FMT)})) + ... return register_lock(cache_directory) + ... return fn + >>> result_computed = rwlock_wrap( + ... in_fp, out_fp, read_fn_and_print, write_fn, compute_fn, + ... register_lock_fn=register_lock_with_conflict_fntr(early=True) + ... ) + >>> result_computed + False + >>> len(list(lock_dir.iterdir())) # The lock file added during the registration should still exist. + 1 + >>> next(lock_dir.iterdir()).unlink() + >>> result_computed = rwlock_wrap( + ... in_fp, out_fp, read_fn_and_print, write_fn, compute_fn, + ... register_lock_fn=register_lock_with_conflict_fntr(early=False) + ... ) + Reading! + >>> result_computed + True + >>> len(list(lock_dir.iterdir())) # The lock file added during the registration should still exist. + 1 + >>> next(lock_dir.iterdir()).unlink() + >>> out_fp.unlink() + + If two processes collide when writing locks during reading, the one that writes a lock with an earlier + timestamp wins: + >>> def read_fn_with_lock_fntr(early: bool) -> callable: + ... fake_lock_time = datetime(2021, 1, 1, 0, 0, 0) if early else datetime(5000, 1, 2, 0, 0, 0) + ... def fn(in_fp: Path) -> pl.DataFrame: + ... print("Reading!") + ... df = read_fn(in_fp) + ... lock_fp = lock_dir / f"{fake_lock_time.strftime(LOCK_TIME_FMT)}.json" + ... lock_fp.write_text(json.dumps({"start": fake_lock_time.strftime(LOCK_TIME_FMT)})) + ... return df + ... return fn + >>> result_computed = rwlock_wrap(in_fp, out_fp, read_fn_with_lock_fntr(True), write_fn, compute_fn) + Reading! + >>> result_computed + False + >>> len(list(lock_dir.iterdir())) # The lock file added during the read should still exist. + 1 + >>> next(lock_dir.iterdir()).unlink() + >>> result_computed = rwlock_wrap(in_fp, out_fp, read_fn_with_lock_fntr(False), write_fn, compute_fn) + Reading! + >>> result_computed + True + >>> len(list(lock_dir.iterdir())) # The lock file added during the read should still exist. + 1 + >>> next(lock_dir.iterdir()).unlink() + >>> out_fp.unlink() >>> directory.cleanup() """ @@ -212,7 +301,7 @@ def rwlock_wrap( logger.info(f"{out_fp} is in progress as of {earliest_lock_time}. 
Returning.") return False - st_time, lock_fp = register_lock(cache_directory) + st_time, lock_fp = register_lock_fn(cache_directory) logger.info(f"Registered lock at {st_time}. Double checking no earlier locks have been registered.") earliest_lock_time = get_earliest_lock(cache_directory) From 277e657a6712df0d1d557b4fc6a99d0f3e173241 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Wed, 30 Oct 2024 14:15:52 -0400 Subject: [PATCH 57/76] Corrected a small bug and added tests for shard shuffling. --- src/MEDS_transforms/mapreduce/utils.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/MEDS_transforms/mapreduce/utils.py b/src/MEDS_transforms/mapreduce/utils.py index f203e083..cfd184c6 100644 --- a/src/MEDS_transforms/mapreduce/utils.py +++ b/src/MEDS_transforms/mapreduce/utils.py @@ -352,13 +352,23 @@ def shuffle_shards(shards: list[str], cfg: DictConfig) -> list[str]: The shuffled list of shards. Examples: - >>> cfg = DictConfig({"worker": 1}) >>> shards = ["train/0", "train/1", "tuning", "held_out"] - >>> shuffle_shards(shards, cfg) + >>> shuffle_shards(shards, DictConfig({"worker": 1})) ['train/1', 'held_out', 'tuning', 'train/0'] - >>> cfg = DictConfig({"worker": 2}) - >>> shuffle_shards(shards, cfg) + >>> shuffle_shards(shards, DictConfig({"worker": 2})) ['tuning', 'held_out', 'train/1', 'train/0'] + + It can also shuffle the shards without a worker ID, but the order is then based on the time, which + is not consistent across runs. + >>> sorted(shuffle_shards(shards, DictConfig({}))) + ['held_out', 'train/0', 'train/1', 'tuning'] + + If the shards aren't unique, it will error + >>> shards = ["train/0", "train/0", "tuning", "held_out"] + >>> shuffle_shards(shards, DictConfig({"worker": 1})) + Traceback (most recent call last): + ... + ValueError: Hash collision for shard train/0 with add_str 1! """ if "worker" in cfg: @@ -368,10 +378,10 @@ def shuffle_shards(shards: list[str], cfg: DictConfig) -> list[str]: shard_keys = [] for shard in shards: - shard_hash = hashlib.sha256((add_str + shard).encode("utf-8")).hexdigest() + shard_hash = int(hashlib.sha256((add_str + shard).encode("utf-8")).hexdigest(), 16) if shard_hash in shard_keys: raise ValueError(f"Hash collision for shard {shard} with add_str {add_str}!") - shard_keys.append(int(shard_hash, 16)) + shard_keys.append(shard_hash) return [shard for _, shard in sorted(zip(shard_keys, shards))] From 125076a9346bd415e29584b058b67d66d780b92d Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Wed, 30 Oct 2024 14:19:43 -0400 Subject: [PATCH 58/76] Added an error test for a mapper function. --- src/MEDS_transforms/mapreduce/mapper.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/MEDS_transforms/mapreduce/mapper.py b/src/MEDS_transforms/mapreduce/mapper.py index ade9910a..79e1da13 100644 --- a/src/MEDS_transforms/mapreduce/mapper.py +++ b/src/MEDS_transforms/mapreduce/mapper.py @@ -426,12 +426,26 @@ def match_revise_fntr(cfg: DictConfig, stage_cfg: DictConfig, compute_fn: ANY_CO ValueError: Missing needed columns {'missing'} for local matcher 0: [(col("missing")) == (String(CODE//TEMP_2))].all_horizontal() Columns available: 'code', 'initial_idx', 'subject_id', 'time' + + It will throw an error if the match and revise configuration is missing. >>> stage_cfg = DictConfig({"global_code_end": "foo"}) >>> cfg = DictConfig({"stage_cfg": stage_cfg}) >>> match_revise_fn = match_revise_fntr(cfg, stage_cfg, compute_fn) Traceback (most recent call last): ... 
ValueError: Invalid match and revise configuration... + + It does not accept invalid modes. + >>> stage_cfg = DictConfig({ + ... "global_code_end": "foo", + ... "_match_revise_mode": "foobar", + ... "_match_revise": [{"_matcher": {"code": "CODE//TEMP_2"}}] + ... }) + >>> cfg = DictConfig({"stage_cfg": stage_cfg}) + >>> match_revise_fn = match_revise_fntr(cfg, stage_cfg, compute_fn) + Traceback (most recent call last): + ... + ValueError: Invalid match and revise mode: foobar """ try: validate_match_revise(stage_cfg) From 8207c2f3f1065814c5183715d62ede1b417ddd5c Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Wed, 30 Oct 2024 14:29:41 -0400 Subject: [PATCH 59/76] added some edge cases to the doctests for convert to sharded events. --- .../extract/convert_to_sharded_events.py | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/src/MEDS_transforms/extract/convert_to_sharded_events.py b/src/MEDS_transforms/extract/convert_to_sharded_events.py index 39aea54f..e459d91d 100755 --- a/src/MEDS_transforms/extract/convert_to_sharded_events.py +++ b/src/MEDS_transforms/extract/convert_to_sharded_events.py @@ -344,6 +344,52 @@ def extract_event( │ 2 ┆ DISCHARGE//Home ┆ 2021-01-05 15:23:45 ┆ AOx4 ┆ Home │ │ 3 ┆ DISCHARGE//SNF ┆ 2021-01-06 16:34:56 ┆ AOx4 ┆ SNF │ └────────────┴─────────────────┴─────────────────────┴───────────────────┴────────────┘ + + If we make a non-key field use the `col(...)` syntax, it will log a warning but parse the field. + >>> valid_discharge_event_cfg = { + ... "code": ["DISCHARGE", "col(discharge_location)"], + ... "time": "col(discharge_time)", + ... "categorical_value": "col(discharge_status)", # Note the raw dtype of this col is str + ... "text_value": "discharge_location", # Note the raw dtype of this col is categorical + ... } + >>> extract_event(complex_raw_data, valid_discharge_event_cfg) + shape: (6, 5) + ┌────────────┬─────────────────┬─────────────────────┬───────────────────┬────────────┐ + │ subject_id ┆ code ┆ time ┆ categorical_value ┆ text_value │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ str ┆ datetime[μs] ┆ str ┆ str │ + ╞════════════╪═════════════════╪═════════════════════╪═══════════════════╪════════════╡ + │ 1 ┆ DISCHARGE//Home ┆ 2021-01-01 11:23:45 ┆ AOx4 ┆ Home │ + │ 1 ┆ DISCHARGE//SNF ┆ 2021-01-02 12:34:56 ┆ AO ┆ SNF │ + │ 2 ┆ DISCHARGE//Home ┆ 2021-01-03 13:45:56 ┆ AAO ┆ Home │ + │ 2 ┆ DISCHARGE//SNF ┆ 2021-01-04 14:56:45 ┆ AOx3 ┆ SNF │ + │ 2 ┆ DISCHARGE//Home ┆ 2021-01-05 15:23:45 ┆ AOx4 ┆ Home │ + │ 3 ┆ DISCHARGE//SNF ┆ 2021-01-06 16:34:56 ┆ AOx4 ┆ SNF │ + └────────────┴─────────────────┴─────────────────────┴───────────────────┴────────────┘ + + If a `categorical_value` field is of non-string type, it will be converted. + >>> valid_admission_event_cfg = { + ... "code": ["ADMISSION", "col(admission_type)"], + ... "time": "col(admission_time)", + ... "time_format": "%Y-%m-%d %H:%M:%S", + ... "categorical_value": "severity_score", + ... 
} + >>> extract_event(complex_raw_data, valid_admission_event_cfg) + shape: (6, 4) + ┌────────────┬──────────────┬─────────────────────┬───────────────────┐ + │ subject_id ┆ code ┆ time ┆ categorical_value │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ str ┆ datetime[μs] ┆ str │ + ╞════════════╪══════════════╪═════════════════════╪═══════════════════╡ + │ 1 ┆ ADMISSION//A ┆ 2021-01-01 00:00:00 ┆ 1.0 │ + │ 1 ┆ ADMISSION//B ┆ 2021-01-02 00:00:00 ┆ 2.0 │ + │ 2 ┆ ADMISSION//C ┆ 2021-01-03 00:00:00 ┆ 3.0 │ + │ 2 ┆ ADMISSION//D ┆ 2021-01-04 00:00:00 ┆ 4.0 │ + │ 2 ┆ ADMISSION//E ┆ 2021-01-05 00:00:00 ┆ 5.0 │ + │ 3 ┆ ADMISSION//F ┆ 2021-01-06 00:00:00 ┆ 6.0 │ + └────────────┴──────────────┴─────────────────────┴───────────────────┘ + + More examples: >>> extract_event(complex_raw_data, valid_death_event_cfg) shape: (3, 3) ┌────────────┬───────┬─────────────────────┐ @@ -395,6 +441,10 @@ def extract_event( Traceback (most recent call last): ... ValueError: Source column 'discharge_time' for event column foobar is not numeric, string, or categorical! Cannot be used as an event col. + >>> extract_event(complex_raw_data, {"code": "col(NOT_PRESENT)", "time": None}) + Traceback (most recent call last): + ... + KeyError: "Source column 'NOT_PRESENT' for event column code not found in DataFrame schema." """ # noqa: E501 event_cfg = copy.deepcopy(event_cfg) event_exprs = {"subject_id": pl.col("subject_id")} From 6246ad98e606ad6db024157e483ca51739d81bc3 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Wed, 30 Oct 2024 14:34:52 -0400 Subject: [PATCH 60/76] Added error case tests for missing event config file for all extract stages. --- .../test_convert_to_sharded_events.py | 18 +++++++++++++++++ .../test_extract_code_metadata.py | 18 ++++++++++++++++- .../MEDS_Extract/test_merge_to_MEDS_cohort.py | 17 +++++++++++++++- tests/MEDS_Extract/test_shard_events.py | 15 ++++++++++++++ .../test_split_and_shard_subjects.py | 20 +++++++++++++++++++ 5 files changed, 86 insertions(+), 2 deletions(-) diff --git a/tests/MEDS_Extract/test_convert_to_sharded_events.py b/tests/MEDS_Extract/test_convert_to_sharded_events.py index 653ce737..6b0de6bd 100644 --- a/tests/MEDS_Extract/test_convert_to_sharded_events.py +++ b/tests/MEDS_Extract/test_convert_to_sharded_events.py @@ -336,6 +336,24 @@ def test_convert_to_sharded_events(): df_check_kwargs={"check_row_order": False, "check_column_order": False, "check_dtypes": False}, ) + # If we don't provide the event_cfgs.yaml file, the script should error. 
+ single_stage_tester( + script=CONVERT_TO_SHARDED_EVENTS_SCRIPT, + stage_name="convert_to_sharded_events", + stage_kwargs={"do_dedup_text_and_numeric": True}, + config_name="extract", + input_files={ + "data/subjects/[0-6).parquet": pl.read_csv(StringIO(SUBJECTS_CSV)), + "data/admit_vitals/[0-10).parquet": pl.read_csv(StringIO(ADMIT_VITALS_0_10_CSV)), + "data/admit_vitals/[10-16).parquet": pl.read_csv(StringIO(ADMIT_VITALS_10_16_CSV)), + "metadata/.shards.json": SHARDS_JSON, + }, + event_conversion_config_fp="{input_dir}/event_cfgs.yaml", + shards_map_fp="{input_dir}/metadata/.shards.json", + test_name="Stage tester: convert_to_sharded_events ; with dedup", + should_error=True, + ) + single_stage_tester( script=CONVERT_TO_SHARDED_EVENTS_SCRIPT, stage_name="convert_to_sharded_events", diff --git a/tests/MEDS_Extract/test_extract_code_metadata.py b/tests/MEDS_Extract/test_extract_code_metadata.py index 7700426f..d75f9ea8 100644 --- a/tests/MEDS_Extract/test_extract_code_metadata.py +++ b/tests/MEDS_Extract/test_extract_code_metadata.py @@ -4,7 +4,6 @@ scripts. """ - import polars as pl from tests.MEDS_Extract import EXTRACT_CODE_METADATA_SCRIPT @@ -202,3 +201,20 @@ def test_convert_to_sharded_events(): df_check_kwargs={"check_row_order": False, "check_column_order": False, "check_dtypes": True}, assert_no_other_outputs=False, ) + + # The script should error if the event config file is missing. + single_stage_tester( + script=EXTRACT_CODE_METADATA_SCRIPT, + stage_name="extract_code_metadata", + stage_kwargs=None, + config_name="extract", + input_files={ + **INPUT_SHARDS, + "demo_metadata.csv": DEMO_METADATA_FILE, + "input_metadata.csv": INPUT_METADATA_FILE, + "metadata/.shards.json": SHARDS_JSON, + }, + event_conversion_config_fp="{input_dir}/event_cfgs.yaml", + shards_map_fp="{input_dir}/metadata/.shards.json", + should_error=True, + ) diff --git a/tests/MEDS_Extract/test_merge_to_MEDS_cohort.py b/tests/MEDS_Extract/test_merge_to_MEDS_cohort.py index 74688043..6339da79 100644 --- a/tests/MEDS_Extract/test_merge_to_MEDS_cohort.py +++ b/tests/MEDS_Extract/test_merge_to_MEDS_cohort.py @@ -250,7 +250,7 @@ ) -def test_convert_to_sharded_events(): +def test_merge_to_MEDS_cohort(): single_stage_tester( script=MERGE_TO_MEDS_COHORT_SCRIPT, stage_name="merge_to_MEDS_cohort", @@ -266,3 +266,18 @@ def test_convert_to_sharded_events(): want_outputs=WANT_OUTPUTS, df_check_kwargs={"check_column_order": False}, ) + + # Should error without event conversion file + single_stage_tester( + script=MERGE_TO_MEDS_COHORT_SCRIPT, + stage_name="merge_to_MEDS_cohort", + stage_kwargs=None, + config_name="extract", + input_files={ + **INPUT_SHARDS, + "metadata/.shards.json": SHARDS_JSON, + }, + event_conversion_config_fp="{input_dir}/event_cfgs.yaml", + shards_map_fp="{input_dir}/metadata/.shards.json", + should_error=True, + ) diff --git a/tests/MEDS_Extract/test_shard_events.py b/tests/MEDS_Extract/test_shard_events.py index f19746ec..0da979c5 100644 --- a/tests/MEDS_Extract/test_shard_events.py +++ b/tests/MEDS_Extract/test_shard_events.py @@ -112,3 +112,18 @@ def test_shard_events(): }, df_check_kwargs={"check_column_order": False}, ) + + # Should error without event conversion config. 
+ single_stage_tester( + script=SHARD_EVENTS_SCRIPT, + stage_name="shard_events", + stage_kwargs={"row_chunksize": 10}, + config_name="extract", + input_files={ + "subjects.csv": SUBJECTS_CSV, + "admit_vitals.csv": ADMIT_VITALS_CSV, + "admit_vitals.parquet": pl.read_csv(StringIO(ADMIT_VITALS_CSV)), + }, + event_conversion_config_fp="{input_dir}/event_cfgs.yaml", + should_error=True, + ) diff --git a/tests/MEDS_Extract/test_split_and_shard_subjects.py b/tests/MEDS_Extract/test_split_and_shard_subjects.py index db74896d..66b54212 100644 --- a/tests/MEDS_Extract/test_split_and_shard_subjects.py +++ b/tests/MEDS_Extract/test_split_and_shard_subjects.py @@ -131,3 +131,23 @@ def test_split_and_shard(): event_conversion_config_fp="{input_dir}/event_cfgs.yaml", want_outputs={"metadata/.shards.json": EXPECTED_SPLITS}, ) + + # Should error without event config file. + single_stage_tester( + script=SPLIT_AND_SHARD_SCRIPT, + stage_name="split_and_shard_subjects", + stage_kwargs={ + "split_fracs.train": 4 / 6, + "split_fracs.tuning": 1 / 6, + "split_fracs.held_out": 1 / 6, + "n_subjects_per_shard": 2, + }, + config_name="extract", + input_files={ + "data/subjects/[0-6).parquet": pl.read_csv(StringIO(SUBJECTS_CSV)), + "data/admit_vitals/[0-10).parquet": pl.read_csv(StringIO(ADMIT_VITALS_0_10_CSV)), + "data/admit_vitals/[10-16).parquet": pl.read_csv(StringIO(ADMIT_VITALS_10_16_CSV)), + }, + event_conversion_config_fp="{input_dir}/event_cfgs.yaml", + should_error=True, + ) From b72b9a6993cca9e72bbda761e704e2c41d161318 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Wed, 30 Oct 2024 14:36:23 -0400 Subject: [PATCH 61/76] Added a no-cover case. --- src/MEDS_transforms/extract/convert_to_sharded_events.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/MEDS_transforms/extract/convert_to_sharded_events.py b/src/MEDS_transforms/extract/convert_to_sharded_events.py index e459d91d..1cbb665e 100755 --- a/src/MEDS_transforms/extract/convert_to_sharded_events.py +++ b/src/MEDS_transforms/extract/convert_to_sharded_events.py @@ -811,7 +811,7 @@ def compute_fn(df: pl.LazyFrame) -> pl.LazyFrame: event_cfgs=copy.deepcopy(event_cfgs), do_dedup_text_and_numeric=cfg.stage_cfg.get("do_dedup_text_and_numeric", False), ) - except Exception as e: + except Exception as e: # pragma: no cover raise ValueError( f"Error converting {str(shard_fp.resolve())} for {sp}/{input_prefix}: {e}" ) from e From e96210f809b86f297a41da9fbe7dad16e1f88fb7 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Wed, 6 Nov 2024 15:57:46 -0500 Subject: [PATCH 62/76] Added no worker specified case to the runner parallelization args getter doctest. --- src/MEDS_transforms/runner.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/MEDS_transforms/runner.py b/src/MEDS_transforms/runner.py index 037db005..e40a1877 100755 --- a/src/MEDS_transforms/runner.py +++ b/src/MEDS_transforms/runner.py @@ -82,6 +82,8 @@ def get_parallelization_args( [] >>> get_parallelization_args(None, {"n_workers": 4}) [] + >>> get_parallelization_args({"launcher": "joblib"}, {}) + ['--multirun', 'worker="range(0,1)"', 'hydra/launcher=joblib'] >>> get_parallelization_args({"n_workers": 2, "launcher_params": 'foo'}, {}) Traceback (most recent call last): ... From 11b7e5711bf65b0470738ee786316bdcfa3de3db Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Wed, 6 Nov 2024 16:21:44 -0500 Subject: [PATCH 63/76] Added more tests and a no-cover case for an odd edge-case that has to do with inheritance. 
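
The no-cover case here guards the plain-YAML fallback in `load_yaml_file`: OmegaConf
parses essentially any YAML our tests can produce, so the except branch is
unreachable in CI. Condensed from the diff, the helper now looks roughly like the
sketch below (the None-handling branch is an assumption and is not shown in the
hunk):

    def load_yaml_file(path: str | None) -> dict | DictConfig:
        if not path:  # assumed guard; not part of this diff
            return {}
        path = Path(path)
        try:
            return OmegaConf.load(path)
        except Exception as e:  # pragma: no cover
            logger.warning(f"Failed to load {path} as an OmegaConf: {e}. Trying as a plain YAML file.")
            return yaml.load(path.read_text(), Loader=Loader)

The inheritance wrinkle is exercised at the runner level instead: a pipeline YAML
can inherit from a packaged base config via Hydra defaults (e.g. `_preprocess`)
and still be rejected, either because its file name collides with a reserved
config name or because, after inheritance, it declares no stages of its own.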
--- src/MEDS_transforms/runner.py | 2 +- tests/test_with_runner.py | 81 ++++++++++++++++++++++++++++++++++- 2 files changed, 81 insertions(+), 2 deletions(-) diff --git a/src/MEDS_transforms/runner.py b/src/MEDS_transforms/runner.py index e40a1877..c2436f5f 100755 --- a/src/MEDS_transforms/runner.py +++ b/src/MEDS_transforms/runner.py @@ -375,7 +375,7 @@ def load_yaml_file(path: str | None) -> dict | DictConfig: try: return OmegaConf.load(path) - except Exception as e: + except Exception as e: # pragma: no cover logger.warning(f"Failed to load {path} as an OmegaConf: {e}. Trying as a plain YAML file.") yaml_text = path.read_text() return yaml.load(yaml_text, Loader=Loader) diff --git a/tests/test_with_runner.py b/tests/test_with_runner.py index 7913cb13..17443274 100644 --- a/tests/test_with_runner.py +++ b/tests/test_with_runner.py @@ -17,7 +17,6 @@ The stage configuration arguments will be as given in the yaml block below: """ - from functools import partial from meds import code_metadata_filepath, subject_splits_filepath @@ -87,6 +86,14 @@ {STAGE_RUNNER_YAML} """ +PIPELINE_NO_STAGES_YAML = """ +defaults: + - _preprocess + - _self_ + +input_dir: {{input_dir}} +cohort_dir: {{cohort_dir}} +""" PIPELINE_YAML = f""" defaults: @@ -273,3 +280,75 @@ def test_pipeline(): do_include_dirs=False, df_check_kwargs={"check_column_order": False}, ) + + single_stage_tester( + script=RUNNER_SCRIPT, + config_name="runner", + stage_name=None, + stage_kwargs=None, + do_pass_stage_name=False, + do_use_config_yaml=False, + input_files={ + **{f"data/{k}": v for k, v in MEDS_SHARDS.items()}, + code_metadata_filepath: MEDS_CODE_METADATA, + subject_splits_filepath: SPLITS_DF, + }, + should_error=True, + pipeline_config_fp="{input_dir}/pipeline.yaml", + test_name="Runner should error without pipeline.yaml", + do_include_dirs=False, + ) + + single_stage_tester( + script=RUNNER_SCRIPT, + config_name="runner", + stage_name=None, + stage_kwargs=None, + do_pass_stage_name=False, + do_use_config_yaml=False, + input_files={ + **{f"data/{k}": v for k, v in MEDS_SHARDS.items()}, + code_metadata_filepath: MEDS_CODE_METADATA, + subject_splits_filepath: SPLITS_DF, + "pipeline.json": partial(add_params, PIPELINE_YAML), + }, + should_error=True, + pipeline_config_fp="{input_dir}/pipeline.json", + test_name="Runner should error when pipeline is not in yaml format", + ) + + single_stage_tester( + script=RUNNER_SCRIPT, + config_name="runner", + stage_name=None, + stage_kwargs=None, + do_pass_stage_name=False, + do_use_config_yaml=False, + input_files={ + **{f"data/{k}": v for k, v in MEDS_SHARDS.items()}, + code_metadata_filepath: MEDS_CODE_METADATA, + subject_splits_filepath: SPLITS_DF, + "_preprocess.yaml": partial(add_params, PIPELINE_YAML), + }, + should_error=True, + pipeline_config_fp="{input_dir}/_preprocess.yaml", + test_name="Runner should fail if the pipeline config has an invalid name", + ) + + single_stage_tester( + script=RUNNER_SCRIPT, + config_name="runner", + stage_name=None, + stage_kwargs=None, + do_pass_stage_name=False, + do_use_config_yaml=False, + input_files={ + **{f"data/{k}": v for k, v in MEDS_SHARDS.items()}, + code_metadata_filepath: MEDS_CODE_METADATA, + subject_splits_filepath: SPLITS_DF, + "pipeline.yaml": partial(add_params, PIPELINE_NO_STAGES_YAML), + }, + should_error=True, + pipeline_config_fp="{input_dir}/pipeline.yaml", + test_name="Runner should fail if the pipeline has no stages", + ) From 95df7103ffdc57b7fa744cded03bfabd4116b22e Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 8 
Nov 2024 10:47:18 -0500 Subject: [PATCH 64/76] Removed some edge case errors that could never actually trigger and added more runner tests. --- src/MEDS_transforms/runner.py | 8 ++------ tests/test_with_runner.py | 38 ++--------------------------------- 2 files changed, 4 insertions(+), 42 deletions(-) diff --git a/src/MEDS_transforms/runner.py b/src/MEDS_transforms/runner.py index c2436f5f..ca281223 100755 --- a/src/MEDS_transforms/runner.py +++ b/src/MEDS_transforms/runner.py @@ -279,13 +279,7 @@ def main(cfg: DictConfig): pipeline. """ - hydra_loguru_init() - pipeline_config_fp = Path(cfg.pipeline_config_fp) - if not pipeline_config_fp.exists(): - raise FileNotFoundError(f"Pipeline configuration file {pipeline_config_fp} does not exist.") - if not pipeline_config_fp.suffix == ".yaml": - raise ValueError(f"Pipeline configuration file {pipeline_config_fp} must have a .yaml extension.") if pipeline_config_fp.stem in RESERVED_CONFIG_NAMES: raise ValueError( f"Pipeline configuration file {pipeline_config_fp} must not have a name in " @@ -297,6 +291,8 @@ def main(cfg: DictConfig): if not stages: raise ValueError("Pipeline configuration must specify at least one stage.") + hydra_loguru_init() + log_dir = Path(cfg.log_dir) if cfg.get("do_profile", False): # pragma: no cover diff --git a/tests/test_with_runner.py b/tests/test_with_runner.py index 17443274..84d9ad63 100644 --- a/tests/test_with_runner.py +++ b/tests/test_with_runner.py @@ -281,42 +281,6 @@ def test_pipeline(): df_check_kwargs={"check_column_order": False}, ) - single_stage_tester( - script=RUNNER_SCRIPT, - config_name="runner", - stage_name=None, - stage_kwargs=None, - do_pass_stage_name=False, - do_use_config_yaml=False, - input_files={ - **{f"data/{k}": v for k, v in MEDS_SHARDS.items()}, - code_metadata_filepath: MEDS_CODE_METADATA, - subject_splits_filepath: SPLITS_DF, - }, - should_error=True, - pipeline_config_fp="{input_dir}/pipeline.yaml", - test_name="Runner should error without pipeline.yaml", - do_include_dirs=False, - ) - - single_stage_tester( - script=RUNNER_SCRIPT, - config_name="runner", - stage_name=None, - stage_kwargs=None, - do_pass_stage_name=False, - do_use_config_yaml=False, - input_files={ - **{f"data/{k}": v for k, v in MEDS_SHARDS.items()}, - code_metadata_filepath: MEDS_CODE_METADATA, - subject_splits_filepath: SPLITS_DF, - "pipeline.json": partial(add_params, PIPELINE_YAML), - }, - should_error=True, - pipeline_config_fp="{input_dir}/pipeline.json", - test_name="Runner should error when pipeline is not in yaml format", - ) - single_stage_tester( script=RUNNER_SCRIPT, config_name="runner", @@ -330,6 +294,7 @@ def test_pipeline(): subject_splits_filepath: SPLITS_DF, "_preprocess.yaml": partial(add_params, PIPELINE_YAML), }, + do_include_dirs=False, should_error=True, pipeline_config_fp="{input_dir}/_preprocess.yaml", test_name="Runner should fail if the pipeline config has an invalid name", @@ -348,6 +313,7 @@ def test_pipeline(): subject_splits_filepath: SPLITS_DF, "pipeline.yaml": partial(add_params, PIPELINE_NO_STAGES_YAML), }, + do_include_dirs=False, should_error=True, pipeline_config_fp="{input_dir}/pipeline.yaml", test_name="Runner should fail if the pipeline has no stages", From d643eae85c8d9b99d30dbba062c83961bc21b4ab Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 8 Nov 2024 10:53:09 -0500 Subject: [PATCH 65/76] Added (properly) failing test for broken non-split shard filtering --- .../test_aggregate_code_metadata.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git 
a/tests/MEDS_Transforms/test_aggregate_code_metadata.py b/tests/MEDS_Transforms/test_aggregate_code_metadata.py index a2abce52..21cb1cbe 100644 --- a/tests/MEDS_Transforms/test_aggregate_code_metadata.py +++ b/tests/MEDS_Transforms/test_aggregate_code_metadata.py @@ -9,6 +9,7 @@ from tests.MEDS_Transforms import AGGREGATE_CODE_METADATA_SCRIPT from tests.MEDS_Transforms.transform_tester_base import ( MEDS_CODE_METADATA_SCHEMA, + MEDS_SHARDS, single_stage_transform_tester, ) @@ -186,3 +187,17 @@ def test_aggregate_code_metadata(): assert_no_other_outputs=False, df_check_kwargs={"check_column_order": False}, ) + + # Test with shards re-mapped so it has to use the splits file. + remapped_shards = {str(i): v for i, v in enumerate(MEDS_SHARDS.values())} + single_stage_transform_tester( + transform_script=AGGREGATE_CODE_METADATA_SCRIPT, + stage_name="aggregate_code_metadata", + transform_stage_kwargs={"aggregations": AGGREGATIONS, "do_summarize_over_all_codes": True}, + want_metadata=WANT_OUTPUT_CODE_METADATA_FILE, + input_code_metadata=MEDS_CODE_METADATA_FILE, + do_use_config_yaml=True, + assert_no_other_outputs=False, + df_check_kwargs={"check_column_order": False}, + input_shards=remapped_shards, + ) From 329fbfa998fb7e8717777e69c3921f89e5a10507 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 8 Nov 2024 10:54:43 -0500 Subject: [PATCH 66/76] Closes #221 --- src/MEDS_transforms/mapreduce/mapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/MEDS_transforms/mapreduce/mapper.py b/src/MEDS_transforms/mapreduce/mapper.py index 79e1da13..61ef6124 100644 --- a/src/MEDS_transforms/mapreduce/mapper.py +++ b/src/MEDS_transforms/mapreduce/mapper.py @@ -653,7 +653,7 @@ def map_over( .collect()[subject_id_field] .to_list() ) - read_fn = read_and_filter_fntr(train_subjects, read_fn) + read_fn = read_and_filter_fntr(pl.col("subject_id").is_in(train_subjects), read_fn) else: raise FileNotFoundError( f"Train split requested, but shard prefixes can't be used and " From d6da577455ac9bbf43ce15a71583a8a355f36a9f Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 8 Nov 2024 10:58:40 -0500 Subject: [PATCH 67/76] Added an error case checker for missing mapreduce files. --- .../MEDS_Transforms/test_aggregate_code_metadata.py | 12 ++++++++++++ tests/MEDS_Transforms/transform_tester_base.py | 6 ++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/tests/MEDS_Transforms/test_aggregate_code_metadata.py b/tests/MEDS_Transforms/test_aggregate_code_metadata.py index 21cb1cbe..90850c3c 100644 --- a/tests/MEDS_Transforms/test_aggregate_code_metadata.py +++ b/tests/MEDS_Transforms/test_aggregate_code_metadata.py @@ -201,3 +201,15 @@ def test_aggregate_code_metadata(): df_check_kwargs={"check_column_order": False}, input_shards=remapped_shards, ) + + single_stage_transform_tester( + transform_script=AGGREGATE_CODE_METADATA_SCRIPT, + stage_name="aggregate_code_metadata", + transform_stage_kwargs={"aggregations": AGGREGATIONS, "do_summarize_over_all_codes": True}, + want_metadata=WANT_OUTPUT_CODE_METADATA_FILE, + input_code_metadata=MEDS_CODE_METADATA_FILE, + do_use_config_yaml=True, + input_shards=remapped_shards, + splits_fp=None, + should_error=True, + ) diff --git a/tests/MEDS_Transforms/transform_tester_base.py b/tests/MEDS_Transforms/transform_tester_base.py index 7a26c855..e955946c 100644 --- a/tests/MEDS_Transforms/transform_tester_base.py +++ b/tests/MEDS_Transforms/transform_tester_base.py @@ -4,7 +4,6 @@ scripts. 
""" - from collections import defaultdict from io import StringIO from pathlib import Path @@ -158,6 +157,7 @@ def remap_inputs_for_transform( input_shards: dict[str, pl.DataFrame] | None = None, input_shards_map: dict[str, list[int]] | None = None, input_splits_map: dict[str, list[int]] | None = None, + splits_fp: Path | str | None = subject_splits_filepath, ) -> dict[str, FILE_T]: unified_inputs = {} @@ -192,7 +192,9 @@ def remap_inputs_for_transform( input_splits_df = pl.DataFrame(input_splits_as_df) - unified_inputs[subject_splits_filepath] = input_splits_df + if splits_fp is not None: + # This case is added for error testing; not for general use. + unified_inputs[splits_fp] = input_splits_df return unified_inputs From d80b12bc52947f47f4f610221d5daedfc1f3042e Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 8 Nov 2024 10:59:18 -0500 Subject: [PATCH 68/76] Added a no-cover case for a very odd edge case checker. --- src/MEDS_transforms/mapreduce/mapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/MEDS_transforms/mapreduce/mapper.py b/src/MEDS_transforms/mapreduce/mapper.py index 61ef6124..9ce69c8d 100644 --- a/src/MEDS_transforms/mapreduce/mapper.py +++ b/src/MEDS_transforms/mapreduce/mapper.py @@ -659,7 +659,7 @@ def map_over( f"Train split requested, but shard prefixes can't be used and " f"subject split file not found at {str(split_fp.resolve())}." ) - elif includes_only_train: + elif includes_only_train: # pragma: no cover raise ValueError("All splits should be used, but shard iterator is returning only train splits?!?") if is_match_revise(cfg.stage_cfg): From 2f0cc1d50b75103d37c240cbbfa92d20bcce927a Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 8 Nov 2024 11:07:36 -0500 Subject: [PATCH 69/76] Added error case testers for merging and a test for extracting metadata. 
--- .../extract/finalize_MEDS_metadata.py | 8 +--- .../extract/merge_to_MEDS_cohort.py | 38 ++++++++++++++++++- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/src/MEDS_transforms/extract/finalize_MEDS_metadata.py b/src/MEDS_transforms/extract/finalize_MEDS_metadata.py index 81f840a4..a02932b8 100755 --- a/src/MEDS_transforms/extract/finalize_MEDS_metadata.py +++ b/src/MEDS_transforms/extract/finalize_MEDS_metadata.py @@ -2,6 +2,7 @@ """Utilities for finalizing the metadata files for extracted MEDS datasets.""" import json +from collections import defaultdict from datetime import datetime from pathlib import Path @@ -17,12 +18,9 @@ code_metadata_schema, dataset_metadata_filepath, dataset_metadata_schema, - held_out_split, subject_id_field, subject_split_schema, subject_splits_filepath, - train_split, - tuning_split, ) from omegaconf import DictConfig @@ -206,12 +204,10 @@ def main(cfg: DictConfig): logger.info("Creating subject splits from {str(shards_map_fp.resolve())}") shards_map = json.loads(shards_map_fp.read_text()) subject_splits = [] - seen_splits = {train_split: 0, tuning_split: 0, held_out_split: 0} + seen_splits = defaultdict(int) for shard, subject_ids in shards_map.items(): split = "/".join(shard.split("/")[:-1]) - if split not in seen_splits: - seen_splits[split] = 0 seen_splits[split] += len(subject_ids) subject_splits.extend([{subject_id_field: pid, "split": split} for pid in subject_ids]) diff --git a/src/MEDS_transforms/extract/merge_to_MEDS_cohort.py b/src/MEDS_transforms/extract/merge_to_MEDS_cohort.py index 2b75eb0f..adf36cfc 100755 --- a/src/MEDS_transforms/extract/merge_to_MEDS_cohort.py +++ b/src/MEDS_transforms/extract/merge_to_MEDS_cohort.py @@ -137,7 +137,7 @@ def merge_subdirs_and_sort( ... merge_subdirs_and_sort( ... sp_dir, ... event_subsets=["subdir1", "subdir2"], - ... unique_by=["subject_id", "time", "code"], + ... unique_by=["subject_id", "time", "code", "missing_col_will_not_error"], ... additional_sort_by=["code", "numeric_value"] ... ).select("subject_id", "time", "code").collect() shape: (6, 3) @@ -153,6 +153,42 @@ def merge_subdirs_and_sort( │ 2 ┆ 20 ┆ B │ │ 3 ┆ 8 ┆ E │ └────────────┴──────┴──────┘ + >>> with TemporaryDirectory() as tmpdir: + ... sp_dir = Path(tmpdir) + ... (sp_dir / "subdir1").mkdir() + ... df1.write_parquet(sp_dir / "subdir1" / "file1.parquet") + ... df2.write_parquet(sp_dir / "subdir1" / "file2.parquet") + ... (sp_dir / "subdir2").mkdir() + ... df3.write_parquet(sp_dir / "subdir2" / "df.parquet") + ... # We just display the subject ID, time, and code columns as the numeric value column + ... # is not guaranteed to be deterministic in the output given some rows will be dropped due to + ... # the unique-by constraint. + ... merge_subdirs_and_sort( + ... sp_dir, + ... event_subsets=["subdir1", "subdir2"], + ... unique_by=352.2, # This will error + ... ) + Traceback (most recent call last): + ... + ValueError: Invalid unique_by value: 352.2 + >>> with TemporaryDirectory() as tmpdir: + ... sp_dir = Path(tmpdir) + ... (sp_dir / "subdir1").mkdir() + ... df1.write_parquet(sp_dir / "subdir1" / "file1.parquet") + ... df2.write_parquet(sp_dir / "subdir1" / "file2.parquet") + ... (sp_dir / "subdir2").mkdir() + ... df3.write_parquet(sp_dir / "subdir2" / "df.parquet") + ... # We just display the subject ID, time, and code columns as the numeric value column + ... # is not guaranteed to be deterministic in the output given some rows will be dropped due to + ... # the unique-by constraint. + ... merge_subdirs_and_sort( + ... 
sp_dir, + ... event_subsets=["subdir1", "subdir2", "subdir3", "this is missing so will error"], + ... unique_by=None, + ... ) + Traceback (most recent call last): + ... + RuntimeError: Number of found subsets (2) does not match number of subsets in event_config (4): ... """ files_to_read = [fp for es in event_subsets for fp in (sp_dir / es).glob("*.parquet")] if not files_to_read: From 5f85c6aa0290b143a867b6fa215b916a31b0b701 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 8 Nov 2024 11:12:21 -0500 Subject: [PATCH 70/76] Added some error tests to shard events. --- tests/MEDS_Extract/test_shard_events.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tests/MEDS_Extract/test_shard_events.py b/tests/MEDS_Extract/test_shard_events.py index 0da979c5..f4afa280 100644 --- a/tests/MEDS_Extract/test_shard_events.py +++ b/tests/MEDS_Extract/test_shard_events.py @@ -21,6 +21,10 @@ 68729,03/09/1978,HAZEL,160.3953106166676 """ +EMPTY_SUBJECTS_CSV = """ +MRN,dob,eye_color,height +""" + ADMIT_VITALS_CSV = """ subject_id,admit_date,disch_date,department,vitals_date,HR,temp 239684,"05/11/2010, 17:41:51","05/11/2010, 19:27:19",CARDIAC,"05/11/2010, 18:57:18",112.6,95.5 @@ -113,7 +117,6 @@ def test_shard_events(): df_check_kwargs={"check_column_order": False}, ) - # Should error without event conversion config. single_stage_tester( script=SHARD_EVENTS_SCRIPT, stage_name="shard_events", @@ -122,8 +125,19 @@ def test_shard_events(): input_files={ "subjects.csv": SUBJECTS_CSV, "admit_vitals.csv": ADMIT_VITALS_CSV, - "admit_vitals.parquet": pl.read_csv(StringIO(ADMIT_VITALS_CSV)), }, event_conversion_config_fp="{input_dir}/event_cfgs.yaml", should_error=True, + test_name="Shard events should error without event conversion config", + ) + + single_stage_tester( + script=SHARD_EVENTS_SCRIPT, + stage_name="shard_events", + stage_kwargs={"row_chunksize": 10}, + config_name="extract", + input_files={"subjects.csv": EMPTY_SUBJECTS_CSV, "event_cfgs.yaml": EVENT_CFGS_YAML}, + event_conversion_config_fp="{input_dir}/event_cfgs.yaml", + should_error=True, + test_name="Shard events should error when an input file is empty", ) From 2b676c1a03055f15fdc4fe1a8e9ea746ded16d77 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 8 Nov 2024 11:17:16 -0500 Subject: [PATCH 71/76] Added (properly) failing test for passing an external splits file --- .../test_split_and_shard_subjects.py | 25 ++++++++++++++++++- tests/utils.py | 3 +++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/tests/MEDS_Extract/test_split_and_shard_subjects.py b/tests/MEDS_Extract/test_split_and_shard_subjects.py index 66b54212..a5b0fd5b 100644 --- a/tests/MEDS_Extract/test_split_and_shard_subjects.py +++ b/tests/MEDS_Extract/test_split_and_shard_subjects.py @@ -132,7 +132,29 @@ def test_split_and_shard(): want_outputs={"metadata/.shards.json": EXPECTED_SPLITS}, ) - # Should error without event config file. 
+ single_stage_tester( + script=SPLIT_AND_SHARD_SCRIPT, + stage_name="split_and_shard_subjects", + stage_kwargs={ + "split_fracs.train": 4 / 6, + "split_fracs.tuning": 1 / 6, + "split_fracs.held_out": 1 / 6, + "n_subjects_per_shard": 2, + "external_splits_json_fp": "{input_dir}/external_splits.json", + }, + config_name="extract", + input_files={ + "external_splits.json": EXPECTED_SPLITS, + "data/subjects/[0-6).parquet": pl.read_csv(StringIO(SUBJECTS_CSV)), + "data/admit_vitals/[0-10).parquet": pl.read_csv(StringIO(ADMIT_VITALS_0_10_CSV)), + "data/admit_vitals/[10-16).parquet": pl.read_csv(StringIO(ADMIT_VITALS_10_16_CSV)), + "event_cfgs.yaml": EVENT_CFGS_YAML, + }, + event_conversion_config_fp="{input_dir}/event_cfgs.yaml", + want_outputs={"metadata/.shards.json": EXPECTED_SPLITS}, + test_name="Split and shard events should work with an external splits file.", + ) + single_stage_tester( script=SPLIT_AND_SHARD_SCRIPT, stage_name="split_and_shard_subjects", @@ -150,4 +172,5 @@ def test_split_and_shard(): }, event_conversion_config_fp="{input_dir}/event_cfgs.yaml", should_error=True, + test_name="Split and shard events should error without an event config file.", ) diff --git a/tests/utils.py b/tests/utils.py index 0e9ae943..71e60d64 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -409,6 +409,9 @@ def single_stage_tester( for k, v in pipeline_kwargs.items(): if type(v) is str and "{input_dir}" in v: pipeline_kwargs[k] = v.format(input_dir=str(input_dir.resolve())) + for k, v in stage_kwargs.items(): + if type(v) is str and "{input_dir}" in v: + stage_kwargs[k] = v.format(input_dir=str(input_dir.resolve())) pipeline_config_kwargs = { "hydra.verbose": hydra_verbose, From 51e8f8787821671d6d852c53189668febde93c9a Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 8 Nov 2024 11:19:34 -0500 Subject: [PATCH 72/76] Closes #222 --- src/MEDS_transforms/extract/split_and_shard_subjects.py | 2 +- tests/MEDS_Extract/test_split_and_shard_subjects.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/MEDS_transforms/extract/split_and_shard_subjects.py b/src/MEDS_transforms/extract/split_and_shard_subjects.py index 0dc3342f..cf968e52 100755 --- a/src/MEDS_transforms/extract/split_and_shard_subjects.py +++ b/src/MEDS_transforms/extract/split_and_shard_subjects.py @@ -251,7 +251,7 @@ def main(cfg: DictConfig): if not external_splits_json_fp.exists(): raise FileNotFoundError(f"External splits JSON file not found at {external_splits_json_fp}") - logger.info(f"Reading external splits from {str(cfg.stage_cfg.external_splits_json_fp.resolve())}") + logger.info(f"Reading external splits from {str(external_splits_json_fp.resolve())}") external_splits = json.loads(external_splits_json_fp.read_text()) size_strs = ", ".join(f"{k}: {len(v)}" for k, v in external_splits.items()) diff --git a/tests/MEDS_Extract/test_split_and_shard_subjects.py b/tests/MEDS_Extract/test_split_and_shard_subjects.py index a5b0fd5b..9de3136a 100644 --- a/tests/MEDS_Extract/test_split_and_shard_subjects.py +++ b/tests/MEDS_Extract/test_split_and_shard_subjects.py @@ -103,6 +103,12 @@ "held_out/0": [1500733], } +EXTERNAL_SPLITS = { + "train": [239684, 1195293, 68729, 814703], + "tuning": [754281], + "held_out": [1500733], +} + SUBJECT_SPLITS_DF = pl.DataFrame( { "subject_id": [239684, 1195293, 68729, 814703, 754281, 1500733], @@ -144,7 +150,7 @@ def test_split_and_shard(): }, config_name="extract", input_files={ - "external_splits.json": EXPECTED_SPLITS, + "external_splits.json": EXTERNAL_SPLITS, 
"data/subjects/[0-6).parquet": pl.read_csv(StringIO(SUBJECTS_CSV)), "data/admit_vitals/[0-10).parquet": pl.read_csv(StringIO(ADMIT_VITALS_0_10_CSV)), "data/admit_vitals/[10-16).parquet": pl.read_csv(StringIO(ADMIT_VITALS_10_16_CSV)), From cab7b32ae83d172325b3391beefa8664c64b4a27 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 8 Nov 2024 11:21:03 -0500 Subject: [PATCH 73/76] Added error test case for external splits --- .../test_split_and_shard_subjects.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/MEDS_Extract/test_split_and_shard_subjects.py b/tests/MEDS_Extract/test_split_and_shard_subjects.py index 9de3136a..ef80f7b9 100644 --- a/tests/MEDS_Extract/test_split_and_shard_subjects.py +++ b/tests/MEDS_Extract/test_split_and_shard_subjects.py @@ -180,3 +180,25 @@ def test_split_and_shard(): should_error=True, test_name="Split and shard events should error without an event config file.", ) + + single_stage_tester( + script=SPLIT_AND_SHARD_SCRIPT, + stage_name="split_and_shard_subjects", + stage_kwargs={ + "split_fracs.train": 4 / 6, + "split_fracs.tuning": 1 / 6, + "split_fracs.held_out": 1 / 6, + "n_subjects_per_shard": 2, + "external_splits_json_fp": "{input_dir}/external_splits.json", + }, + config_name="extract", + input_files={ + "data/subjects/[0-6).parquet": pl.read_csv(StringIO(SUBJECTS_CSV)), + "data/admit_vitals/[0-10).parquet": pl.read_csv(StringIO(ADMIT_VITALS_0_10_CSV)), + "data/admit_vitals/[10-16).parquet": pl.read_csv(StringIO(ADMIT_VITALS_10_16_CSV)), + "event_cfgs.yaml": EVENT_CFGS_YAML, + }, + event_conversion_config_fp="{input_dir}/event_cfgs.yaml", + should_error=True, + test_name="Split and shard events should error if an external splits file is requested but absent.", + ) From 7c5d3bfe2a624cc088f2700b7d8693825bddf957 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 8 Nov 2024 11:32:44 -0500 Subject: [PATCH 74/76] Fixed small test typo --- tests/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/utils.py b/tests/utils.py index 71e60d64..4bb9741c 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -405,6 +405,9 @@ def single_stage_tester( if df_check_kwargs is None: df_check_kwargs = {} + if stage_kwargs is None: + stage_kwargs = {} + with input_dataset(input_files) as (input_dir, cohort_dir): for k, v in pipeline_kwargs.items(): if type(v) is str and "{input_dir}" in v: From 105f8e121420dc63498ff768e06718f6a2345fdb Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 8 Nov 2024 11:51:04 -0500 Subject: [PATCH 75/76] Added a test that must have been accidentally deleted or overwritten. 
--- tests/MEDS_Extract/test_shard_events.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/MEDS_Extract/test_shard_events.py b/tests/MEDS_Extract/test_shard_events.py index f4afa280..7e1c696a 100644 --- a/tests/MEDS_Extract/test_shard_events.py +++ b/tests/MEDS_Extract/test_shard_events.py @@ -131,6 +131,17 @@ def test_shard_events(): test_name="Shard events should error without event conversion config", ) + single_stage_tester( + script=SHARD_EVENTS_SCRIPT, + stage_name="shard_events", + stage_kwargs={"row_chunksize": 10}, + config_name="extract", + input_files={"event_cfgs.yaml": EVENT_CFGS_YAML}, + event_conversion_config_fp="{input_dir}/event_cfgs.yaml", + should_error=True, + test_name="Shard events should error when missing all input files", + ) + single_stage_tester( script=SHARD_EVENTS_SCRIPT, stage_name="shard_events", From 04c82a65503737aa08a898a856f32a920e642cc7 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 8 Nov 2024 11:56:25 -0500 Subject: [PATCH 76/76] Corrected pre-commit errors --- .pre-commit-config.yaml | 2 +- eICU_Example/configs/table_preprocessors.yaml | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4f12be1f..0d7f40da 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,7 @@ default_language_version: python: python3.12 -exclude: "docs/index.md|MIMIC-IV_Example/README.md|eICU_Example/README.md" +exclude: "docs/index.md|MIMIC-IV_Example/README.md|eICU_Example/README.md|AUMCdb_Example/README.md" repos: - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/eICU_Example/configs/table_preprocessors.yaml b/eICU_Example/configs/table_preprocessors.yaml index 13cfe4ad..29049887 100644 --- a/eICU_Example/configs/table_preprocessors.yaml +++ b/eICU_Example/configs/table_preprocessors.yaml @@ -2,7 +2,9 @@ admissionDx: offset_col: "admitdxenteredoffset" pseudotime_col: "admitDxEnteredTimestamp" output_data_cols: ["admitdxname", "admissiondxid"] - warning_items: ["How should we use `admitdxtest`?", "How should we use `admitdxpath`?"] + warning_items: + - "How should we use `admitdxtest`?" + - "How should we use `admitdxpath`?" allergy: offset_col: "allergyenteredoffset"