diff --git a/.github/workflows/code-quality-main.yaml b/.github/workflows/code-quality-main.yaml
index ba2caf4..874da12 100644
--- a/.github/workflows/code-quality-main.yaml
+++ b/.github/workflows/code-quality-main.yaml
@@ -13,12 +13,16 @@ jobs:
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

-      - name: Set up Python 3.10
-        uses: actions/setup-python@v3
+      - name: Set up Python
+        uses: actions/setup-python@v5
         with:
           python-version: "3.10"

+      - name: Install packages
+        run: |
+          pip install .[dev]
+
       - name: Run pre-commits
         uses: pre-commit/action@v3.0.1
diff --git a/.github/workflows/code-quality-pr.yaml b/.github/workflows/code-quality-pr.yaml
index 9a33678..bee2e11 100644
--- a/.github/workflows/code-quality-pr.yaml
+++ b/.github/workflows/code-quality-pr.yaml
@@ -16,13 +16,17 @@ jobs:
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Set up Python 3.10
-        uses: actions/setup-python@v3
+        uses: actions/setup-python@v5
         with:
           python-version: "3.10"

+      - name: Install packages
+        run: |
+          pip install .[dev]
+
       - name: Find modified files
         id: file_changes
         uses: trilom/file-changes-action@v1.2.4
diff --git a/.github/workflows/python-build.yaml b/.github/workflows/python-build.yaml
index 3f3c96d..b22ff87 100644
--- a/.github/workflows/python-build.yaml
+++ b/.github/workflows/python-build.yaml
@@ -10,7 +10,7 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - name: Set up Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
           python-version: "3.10"
       - name: Install pypa/build
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 3faf789..31a46da 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -17,10 +17,10 @@ jobs:
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Set up Python 3.10
-        uses: actions/setup-python@v3
+        uses: actions/setup-python@v5
         with:
           python-version: "3.10"
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 5c5591c..38d66f1 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -5,7 +5,7 @@ exclude: "docs/index.md"
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.4.0
+    rev: v5.0.0
     hooks:
       # list of supported hooks: https://pre-commit.com/hooks.html
       - id: trailing-whitespace
diff --git a/README.md b/README.md
index c7460b6..0d9c209 100644
--- a/README.md
+++ b/README.md
@@ -4,10 +4,129 @@ This repository contains the dataset, task, model training recipes, and results
 effort for EHR machine learning.

 Note that this repository is _not_ a place where functional code is stored. Rather, this repository stores
-configuration files, training recipes, results, etc. for the MEDS-DEV benchmarking effort -- runnable code will
+configuration files, training recipes, results, etc. for the MEDS-DEV benchmarking effort -- runnable code
+will
 often come from other repositories, with suitable permalinks being present in the various configuration
 files or commit messages for associated contributions to this repository.

+## Example workflow
+
+### (Optional) Set up the MEDS project with environment
+
+```bash
+# Create and enter a MEDS project directory
+mkdir $MY_MEDS_PROJECT_ROOT
+cd $MY_MEDS_PROJECT_ROOT
+
+conda create -n $MY_MEDS_CONDA_ENV python=3.10
+conda activate $MY_MEDS_CONDA_ENV
+```
+
+Additionally install any model-related dependencies.
+
+### Install MEDS-DEV
+
+Clone the MEDS-DEV GitHub repo and install it locally.
+This will additionally install some MEDS data processing dependencies:
+
+```bash
+git clone https://github.com/mmcdermott/MEDS-DEV.git
+cd ./MEDS-DEV
+pip install -e .
+```
+
+Install the MEDS evaluation package:
+
+```bash
+git clone https://github.com/kamilest/meds-evaluation.git
+pip install -e ./meds-evaluation
+```
+
+Additionally, make sure any model-related dependencies are installed.
+
+### Extract a task from the MEDS dataset
+
+This step prepares the MEDS dataset for a task by extracting a cohort using inclusion/exclusion criteria and
+processing the data to create the label files.
+
+### Find the task configuration file
+
+Task-related information is stored in Hydra configuration files (in `.yaml` format) under
+`MEDS-DEV/src/MEDS_DEV/tasks/criteria`.
+
+Task names correspond to the path to their configuration file, starting from the
+`MEDS-DEV/src/MEDS_DEV/tasks/criteria` directory. For example, the file
+`MEDS-DEV/src/MEDS_DEV/tasks/criteria/mortality/in_icu/first_24h.yaml` corresponds to a `$TASK_NAME` of
+`mortality/in_icu/first_24h`.
+
+**To add a task**
+
+If your task is not supported, you will need to add a directory and define an appropriate configuration
+file in the corresponding location.
+
+### Dataset configuration file
+
+Task configuration files are incomplete, because some concepts (predicates) have to be defined in a
+dataset-specific way (e.g. `icu_admission` in `mortality/in_icu/first_24h`).
+
+These dataset-specific predicate definitions are found in
+`MEDS-DEV/src/MEDS_DEV/datasets/$DATASET_NAME/predicates.yaml` Hydra configuration files.
+
+In addition to `$DATASET_NAME` (e.g. `MIMIC-IV`), you will also need to have your MEDS dataset directory
+ready (i.e. `$MEDS_ROOT_DIR`).
+
+**To add a dataset configuration file**
+
+If your dataset is not supported, you will need to add a directory and define an appropriate configuration
+file in the corresponding location.
+
+### Run the MEDS task extraction helper
+
+From your project directory (`$MY_MEDS_PROJECT_ROOT`) where `MEDS-DEV` is located, run
+
+```bash
+./MEDS-DEV/src/MEDS_DEV/helpers/extract_task.sh $MEDS_ROOT_DIR $DATASET_NAME $TASK_NAME
+```
+
+This will use information from task and dataset-specific predicate configs to extract cohorts and labels from
+`$MEDS_ROOT_DIR/data`, and place them in `$MEDS_ROOT_DIR/task_labels/$TASK_NAME/` subdirectories, retaining
+the same sharded structure as the `$MEDS_ROOT_DIR/data` directory.
+
+### Train the model
+
+This step depends on the API of your particular model.
+
+For example, the command below calls a helper script that generates random outputs for binary
+classification, conforming to the MEDS binary classification prediction schema:
+
+```bash
+./MEDS-DEV/src/MEDS_DEV/helpers/generate_predictions.sh $MEDS_ROOT_DIR $DATASET_NAME $TASK_NAME
+```
+
+### Evaluate the model
+
+You can use the `meds-evaluation` package by running `meds-evaluation-cli` and providing the path to the
+predictions dataframe as well as the output directory. For example,
+
+```bash
+meds-evaluation-cli \
+    predictions_path="./<$MEDS_ROOT_DIR>/task_predictions/$TASK_NAME//*.parquet" \
+    output_dir="./<$MEDS_ROOT_DIR>/task_evaluation/$TASK_NAME//..."
+```
+
+This will create a JSON file with the results in the directory provided by the `output_dir` argument.
+
+Note that this package currently supports binary classification only.
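+
+For illustration only (this sketch is not part of the `meds-evaluation` package or of the MEDS-DEV helpers),
+a predictions dataframe conforming to the binary classification schema could be assembled with `polars`. The
+column names match those used by the `generate_random_predictions` helper in this repository; the label rows,
+probabilities, and output file name below are placeholders:
+
+```python
+import polars as pl
+
+# A tiny stand-in for one label shard produced by the task extraction step.
+labels = pl.DataFrame(
+    {"subject_id": [1, 2, 3], "prediction_time": [0, 1, 2], "boolean_value": [True, False, True]}
+)
+
+# `scores` stands in for whatever per-row probabilities your model produces.
+scores = [0.9, 0.2, 0.6]
+predictions = labels.with_columns(
+    pl.Series("predicted_boolean_value", [s > 0.5 for s in scores]),
+    pl.Series("predicted_boolean_probability", scores),
+)
+
+# Write the file under $MEDS_ROOT_DIR/task_predictions/$TASK_NAME/, mirroring the label sharding.
+predictions.write_parquet("predictions_shard.parquet")
+```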
+
 ## Contributing to MEDS-DEV

 ### To Add a Model
diff --git a/pyproject.toml b/pyproject.toml
index cc51b8b..649eef6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,7 +33,7 @@ dependencies = ["meds==0.3.3", "es-aces==0.5.0"]
 [tool.setuptools_scm]

 [project.optional-dependencies]
-dev = ["pre-commit"]
+dev = ["pre-commit<4"]
 tests = ["pytest", "pytest-cov", "rootutils"]
 docs = [
   "mkdocs==1.6.0", "mkdocs-material==9.5.31", "mkdocstrings[python,shell]==0.25.2", "mkdocs-gen-files==0.5.0",
diff --git a/src/MEDS_DEV/configs/predictions.yaml b/src/MEDS_DEV/configs/predictions.yaml
new file mode 100644
index 0000000..736ae72
--- /dev/null
+++ b/src/MEDS_DEV/configs/predictions.yaml
@@ -0,0 +1,6 @@
+defaults:
+  - _ACES_MD
+  - _self_
+  - override hydra/hydra_logging: disabled
+
+cohort_predictions_dir: "${oc.env:MEDS_ROOT_DIR}/task_predictions"
diff --git a/src/MEDS_DEV/helpers/generate_predictions.sh b/src/MEDS_DEV/helpers/generate_predictions.sh
new file mode 100755
index 0000000..606d0f4
--- /dev/null
+++ b/src/MEDS_DEV/helpers/generate_predictions.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+export MEDS_ROOT_DIR=$1
+export MEDS_DATASET_NAME=$2
+export MEDS_TASK_NAME=$3
+
+shift 3
+
+MEDS_DEV_REPO_DIR=$(python -c "from importlib.resources import files; print(files(\"MEDS_DEV\"))")
+export MEDS_DEV_REPO_DIR
+
+# TODO improve efficiency of prediction generator by using this
+# SHARDS=$(expand_shards "$MEDS_ROOT_DIR"/data)
+
+python -m MEDS_DEV.helpers.generate_random_predictions --config-path="$MEDS_DEV_REPO_DIR"/configs \
+  --config-name="predictions" "hydra.searchpath=[pkg://aces.configs]" "$@"
diff --git a/src/MEDS_DEV/helpers/generate_random_predictions.py b/src/MEDS_DEV/helpers/generate_random_predictions.py
new file mode 100644
index 0000000..5502438
--- /dev/null
+++ b/src/MEDS_DEV/helpers/generate_random_predictions.py
@@ -0,0 +1,94 @@
+import os
+from importlib.resources import files
+from pathlib import Path
+
+import hydra
+import numpy as np
+import polars as pl
+from omegaconf import DictConfig
+
+SUBJECT_ID = "subject_id"
+PREDICTION_TIME = "prediction_time"
+
+BOOLEAN_VALUE_COLUMN = "boolean_value"
+PREDICTED_BOOLEAN_VALUE_COLUMN = "predicted_boolean_value"
+PREDICTED_BOOLEAN_PROBABILITY_COLUMN = "predicted_boolean_probability"
+
+CONFIG = files("MEDS_DEV").joinpath("configs/predictions.yaml")
+
+
+@hydra.main(version_base=None, config_path=str(CONFIG.parent.resolve()), config_name=CONFIG.stem)
+def generate_random_predictions(cfg: DictConfig) -> None:
+    cohort_dir = cfg.cohort_dir  # cohort_dir: "${oc.env:MEDS_ROOT_DIR}/task_labels"
+    cohort_name = cfg.cohort_name  # cohort_name: ${task_name}; task_name: ${oc.env:MEDS_TASK_NAME}
+
+    cohort_dir = Path(cohort_dir) / cohort_name
+    cohort_predictions_dir = (
+        cfg.cohort_predictions_dir
+    )  # cohort_predictions_dir: "${oc.env:MEDS_ROOT_DIR}/task_predictions"
+
+    # TODO: use expand_shards helper from the script to access sharded dataframes directly
+    for split in cohort_dir.iterdir():
+        if split.is_dir() and split.name in {"train", "tuning", "held_out"}:  # train | tuning | held_out
+            for file in split.iterdir():
+                if file.is_file():
+                    dataframe = pl.read_parquet(file)
+                    predictions = _generate_random_predictions(dataframe)  # sharded dataframes
+
+                    # $MEDS_ROOT_DIR/task_predictions/$TASK_NAME/<split>/<shard>.parquet
+                    predictions_path = Path(cohort_predictions_dir) / cohort_name / split.name
+                    os.makedirs(predictions_path, exist_ok=True)
+
+                    predictions.write_parquet(predictions_path / file.name)
+        elif split.is_file():
+            dataframe = pl.read_parquet(split)
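+            # Labels stored as a single unsharded parquet file: predictions are written directly
+            # under $MEDS_ROOT_DIR/task_predictions/$TASK_NAME/ with the same file name.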
+            predictions = _generate_random_predictions(dataframe)
+
+            predictions_path = Path(cohort_predictions_dir) / cohort_name
+            os.makedirs(predictions_path, exist_ok=True)
+
+            predictions.write_parquet(predictions_path / split.name)
+
+
+def _generate_random_predictions(dataframe: pl.DataFrame, seed: int = 1) -> pl.DataFrame:
+    """Augments the input dataframe with random predictions.
+
+    Args:
+        dataframe: Input dataframe with at least the columns: [subject_id, prediction_time, boolean_value]
+        seed: Seed for the random number generator.
+
+    Returns:
+        An augmented dataframe with the boolean value and probability columns.
+
+    Example:
+        >>> df = pl.DataFrame({
+        ...     "subject_id": [1, 2, 3],
+        ...     "prediction_time": [0, 1, 2],
+        ...     "boolean_value": [True, False, True]
+        ... })
+        >>> _generate_random_predictions(df).drop(["prediction_time", "boolean_value"])
+        shape: (3, 3)
+        ┌────────────┬─────────────────────────┬───────────────────────────────┐
+        │ subject_id ┆ predicted_boolean_value ┆ predicted_boolean_probability │
+        │ ---        ┆ ---                     ┆ ---                           │
+        │ i64        ┆ bool                    ┆ f64                           │
+        ╞════════════╪═════════════════════════╪═══════════════════════════════╡
+        │ 1          ┆ true                    ┆ 0.511822                      │
+        │ 2          ┆ true                    ┆ 0.950464                      │
+        │ 3          ┆ false                   ┆ 0.14416                       │
+        └────────────┴─────────────────────────┴───────────────────────────────┘
+    """
+
+    output = dataframe.select([SUBJECT_ID, PREDICTION_TIME, BOOLEAN_VALUE_COLUMN])
+    rng = np.random.default_rng(seed)
+    probabilities = rng.uniform(0, 1, len(dataframe))
+    # TODO: meds-evaluation currently cares about the order of columns and types, so the new columns have to
+    # be inserted at the correct position and cast to the correct type
+    output.insert_column(3, pl.Series(PREDICTED_BOOLEAN_VALUE_COLUMN, probabilities.round()).cast(pl.Boolean))
+    output.insert_column(4, pl.Series(PREDICTED_BOOLEAN_PROBABILITY_COLUMN, probabilities))
+
+    return output
+
+
+if __name__ == "__main__":
+    generate_random_predictions()
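+
+# Usage note (illustrative, not part of the module's API): this script is normally invoked through
+# helpers/generate_predictions.sh, which exports MEDS_ROOT_DIR, MEDS_DATASET_NAME, and MEDS_TASK_NAME and
+# passes "hydra.searchpath=[pkg://aces.configs]" so that the _ACES_MD entry in configs/predictions.yaml
+# can be found.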