From 91d30eeb96bc8fdd76a873e53599ce96215e87c1 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Sat, 14 Dec 2024 17:14:23 -0800 Subject: [PATCH] fixed e2e meds transform bug with eicu where the extraction config still had the infusionDrug table. Added ACES and meds-dev label extraction --- .gitignore | 2 + demo/aces.ipynb | 662 +++++++++++++++++++++++------ demo/configs/extract_MIMIC.yaml | 8 +- demo/extract_meds_data.ipynb | 3 +- demo/meds_tab.ipynb | 77 ---- pyproject.toml | 2 +- src/MEDS_DEV/demo/meds_cehrbert.py | 397 ----------------- src/MEDS_DEV/demo/meds_tab.ipynb | 485 --------------------- src/MEDS_DEV/demo/meds_tab.py | 240 ----------- 9 files changed, 551 insertions(+), 1325 deletions(-) delete mode 100644 src/MEDS_DEV/demo/meds_cehrbert.py delete mode 100644 src/MEDS_DEV/demo/meds_tab.ipynb delete mode 100644 src/MEDS_DEV/demo/meds_tab.py diff --git a/.gitignore b/.gitignore index c2b1661..3a41f4d 100644 --- a/.gitignore +++ b/.gitignore @@ -361,3 +361,5 @@ $RECYCLE.BIN/ meds_env/* src/MEDS_DEV/demo/download/* src/MEDS_DEV/demo/content/* + +demo/work_dir \ No newline at end of file diff --git a/demo/aces.ipynb b/demo/aces.ipynb index 8d03cbf..7b31586 100644 --- a/demo/aces.ipynb +++ b/demo/aces.ipynb @@ -30,20 +30,48 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo/\n" + ] + } + ], "source": [ - "ROOT_DIR = \"\"" + "#@title Download E-ICU demo\n", + "import tempfile\n", + "import os\n", + "from pathlib import Path\n", + "notebook_dir = os.getcwd()\n", + "\n", + "# Choose MIMICIV or eicu\n", + "ROOT_DIR=f\"{notebook_dir}/work_dir/mimiciv_demo/\"\n", + "# ROOT_DIR=f\"{notebook_dir}/work_dir/eicu_demo/\"\n", + "Path(ROOT_DIR).mkdir(parents=True, exist_ok=True)\n", + "\n", + "!echo {ROOT_DIR}" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 60, "metadata": { "id": "H6fqe217XDhi" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TASK_DIR\n", + "mkdir: -p: File exists\n" + ] + } + ], "source": [ "# From the ACES documentation\n", "\n", @@ -88,16 +116,19 @@ " end_inclusive: True\n", " label: discharge_or_death\n", "\"\"\"\n", - "!mkdir /content/tasks/ -p\n", - "TASK_NAME = \"in_hospital_3d_los_after_48h\"\n", - "TASK_CONFIG_FP = f\"/content/tasks/{TASK_NAME}.yaml\"\n", + "MEDS_DIR = ROOT_DIR + \"/meds\"\n", + "TASK_DIR = MEDS_DIR + \"/task_labels\"\n", + "! echo TASK_DIR\n", + "TASK_NAME = \"los_in_hospital_first_48h\"\n", + "TASK_CONFIG_FP = f\"{TASK_DIR}/{TASK_NAME}.yaml\"\n", + "!mkdir {TASK_DIR}/{TASK_NAME} -p\n", "with open(TASK_CONFIG_FP, 'w') as f:\n", " f.write(task_config)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 61, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -105,14 +136,178 @@ "id": "bXLiJGEry-Gb", "outputId": "7d954ab4-cf5c-4d02-a99c-669b5822bf44" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-14 17:02:13,334][HYDRA] Launching 3 jobs locally\n", + "[2024-12-14 17:02:13,334][HYDRA] \t#0 : data=sharded data.standard=meds data.root=/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/data data.shard=held_out/0 cohort_dir=/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/task_labels cohort_name=los_in_hospital_first_48h config_path=/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/task_labels/los_in_hospital_first_48h.yaml\n", + "\u001b[32m2024-12-14 17:02:13.542\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mmain\u001b[0m:\u001b[36m149\u001b[0m - \u001b[1mLoading config from '/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/task_labels/los_in_hospital_first_48h.yaml'\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.545\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.config\u001b[0m:\u001b[36mload\u001b[0m:\u001b[36m1341\u001b[0m - \u001b[1mParsing windows...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.545\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.config\u001b[0m:\u001b[36mload\u001b[0m:\u001b[36m1350\u001b[0m - \u001b[1mParsing trigger event...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.545\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.config\u001b[0m:\u001b[36mload\u001b[0m:\u001b[36m1392\u001b[0m - \u001b[1mParsing predicates...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.547\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mmain\u001b[0m:\u001b[36m159\u001b[0m - \u001b[1mAttempting to get predicates dataframe given:\n", + "standard: meds\n", + "ts_format: '%m/%d/%Y %H:%M'\n", + "root: /Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/data\n", + "shard: held_out/0\n", + "path: ${data.root}/${data.shard}.parquet\n", + "_prefix: /${data.shard}\n", + "\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.547\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m269\u001b[0m - \u001b[1mLoading MEDS data...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.561\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m273\u001b[0m - \u001b[1mGenerating plain predicate columns...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.566\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m277\u001b[0m - \u001b[1mAdded predicate column 'hospital_admission'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.567\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m277\u001b[0m - \u001b[1mAdded predicate column 'hospital_discharge'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.568\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m277\u001b[0m - \u001b[1mAdded predicate column 'death'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.568\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m280\u001b[0m - \u001b[1mCleaning up predicates dataframe...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.575\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mget_predicates_df\u001b[0m:\u001b[36m703\u001b[0m - \u001b[1mLoaded plain predicates. Generating derived predicate columns...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.576\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mget_predicates_df\u001b[0m:\u001b[36m717\u001b[0m - \u001b[1mAdded predicate column 'discharge_or_death'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.576\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mget_predicates_df\u001b[0m:\u001b[36m724\u001b[0m - \u001b[1mGenerating special predicate columns...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.576\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m76\u001b[0m - \u001b[1mChecking if '(subject_id, timestamp)' columns are unique...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.578\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.utils\u001b[0m:\u001b[36mlog_tree\u001b[0m:\u001b[36m67\u001b[0m - \u001b[1m\n", + "trigger\n", + "┣━━ input.end\n", + "┃ ┣━━ input.start\n", + "┃ ┗━━ gap.end\n", + "┗━━ target.end\n", + "\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.578\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m85\u001b[0m - \u001b[1mBeginning query...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.578\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m92\u001b[0m - \u001b[1mNo static variable criteria specified, removing all rows with null timestamps...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.579\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m99\u001b[0m - \u001b[1mIdentifying possible trigger nodes based on the specified trigger event...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.580\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.constraints\u001b[0m:\u001b[36mcheck_constraints\u001b[0m:\u001b[36m110\u001b[0m - \u001b[1mExcluding 4,155 rows as they failed to satisfy '1 <= hospital_admission <= None'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.580\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.extract_subtree\u001b[0m:\u001b[36mextract_subtree\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mSummarizing subtree rooted at 'input.end'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.593\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.extract_subtree\u001b[0m:\u001b[36mextract_subtree\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mSummarizing subtree rooted at 'input.start'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.622\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.extract_subtree\u001b[0m:\u001b[36mextract_subtree\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mSummarizing subtree rooted at 'gap.end'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.630\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.constraints\u001b[0m:\u001b[36mcheck_constraints\u001b[0m:\u001b[36m110\u001b[0m - \u001b[1mExcluding 0 rows as they failed to satisfy 'None <= hospital_admission <= 0'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.630\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.constraints\u001b[0m:\u001b[36mcheck_constraints\u001b[0m:\u001b[36m110\u001b[0m - \u001b[1mExcluding 4 rows as they failed to satisfy 'None <= discharge_or_death <= 0'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.634\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.extract_subtree\u001b[0m:\u001b[36mextract_subtree\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mSummarizing subtree rooted at 'target.end'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.642\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m114\u001b[0m - \u001b[1mDone. 16 valid rows returned corresponding to 10 subjects.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.642\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m129\u001b[0m - \u001b[1mExtracting label 'discharge_or_death' from window 'target'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.642\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m150\u001b[0m - \u001b[1mSetting index timestamp as 'end' of window 'input'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.645\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mget_and_validate_label_schema\u001b[0m:\u001b[36m114\u001b[0m - \u001b[33m\u001b[1mOutput contains columns that are not valid MEDS label columns. For now, we are dropping them.\n", + "If you need these columns, please comment on https://github.com/justin13601/ACES/issues/97\n", + "Columns:\n", + " - trigger\n", + " - input.end_summary\n", + " - input.start_summary\n", + " - gap.end_summary\n", + " - target.end_summary\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.776\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mmain\u001b[0m:\u001b[36m191\u001b[0m - \u001b[1mCompleted in 0:00:00.233536. Results saved to '/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/task_labels/los_in_hospital_first_48h/held_out/0.parquet'.\u001b[0m\n", + "[2024-12-14 17:02:13,777][HYDRA] \t#1 : data=sharded data.standard=meds data.root=/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/data data.shard=train/0 cohort_dir=/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/task_labels cohort_name=los_in_hospital_first_48h config_path=/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/task_labels/los_in_hospital_first_48h.yaml\n", + "\u001b[32m2024-12-14 17:02:13.842\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mmain\u001b[0m:\u001b[36m149\u001b[0m - \u001b[1mLoading config from '/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/task_labels/los_in_hospital_first_48h.yaml'\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.845\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.config\u001b[0m:\u001b[36mload\u001b[0m:\u001b[36m1341\u001b[0m - \u001b[1mParsing windows...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.845\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.config\u001b[0m:\u001b[36mload\u001b[0m:\u001b[36m1350\u001b[0m - \u001b[1mParsing trigger event...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.845\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.config\u001b[0m:\u001b[36mload\u001b[0m:\u001b[36m1392\u001b[0m - \u001b[1mParsing predicates...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.846\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mmain\u001b[0m:\u001b[36m159\u001b[0m - \u001b[1mAttempting to get predicates dataframe given:\n", + "standard: meds\n", + "ts_format: '%m/%d/%Y %H:%M'\n", + "root: /Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/data\n", + "shard: train/0\n", + "path: ${data.root}/${data.shard}.parquet\n", + "_prefix: /${data.shard}\n", + "\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.847\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m269\u001b[0m - \u001b[1mLoading MEDS data...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.891\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m273\u001b[0m - \u001b[1mGenerating plain predicate columns...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.899\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m277\u001b[0m - \u001b[1mAdded predicate column 'hospital_admission'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.907\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m277\u001b[0m - \u001b[1mAdded predicate column 'hospital_discharge'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.910\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m277\u001b[0m - \u001b[1mAdded predicate column 'death'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.910\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m280\u001b[0m - \u001b[1mCleaning up predicates dataframe...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.927\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mget_predicates_df\u001b[0m:\u001b[36m703\u001b[0m - \u001b[1mLoaded plain predicates. Generating derived predicate columns...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.927\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mget_predicates_df\u001b[0m:\u001b[36m717\u001b[0m - \u001b[1mAdded predicate column 'discharge_or_death'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.927\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mget_predicates_df\u001b[0m:\u001b[36m724\u001b[0m - \u001b[1mGenerating special predicate columns...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.928\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m76\u001b[0m - \u001b[1mChecking if '(subject_id, timestamp)' columns are unique...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.930\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.utils\u001b[0m:\u001b[36mlog_tree\u001b[0m:\u001b[36m67\u001b[0m - \u001b[1m\n", + "trigger\n", + "┣━━ input.end\n", + "┃ ┣━━ input.start\n", + "┃ ┗━━ gap.end\n", + "┗━━ target.end\n", + "\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.930\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m85\u001b[0m - \u001b[1mBeginning query...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.930\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m92\u001b[0m - \u001b[1mNo static variable criteria specified, removing all rows with null timestamps...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.930\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m99\u001b[0m - \u001b[1mIdentifying possible trigger nodes based on the specified trigger event...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.930\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.constraints\u001b[0m:\u001b[36mcheck_constraints\u001b[0m:\u001b[36m110\u001b[0m - \u001b[1mExcluding 72,649 rows as they failed to satisfy '1 <= hospital_admission <= None'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.931\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.extract_subtree\u001b[0m:\u001b[36mextract_subtree\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mSummarizing subtree rooted at 'input.end'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:13.972\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.extract_subtree\u001b[0m:\u001b[36mextract_subtree\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mSummarizing subtree rooted at 'input.start'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.059\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.extract_subtree\u001b[0m:\u001b[36mextract_subtree\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mSummarizing subtree rooted at 'gap.end'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.144\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.constraints\u001b[0m:\u001b[36mcheck_constraints\u001b[0m:\u001b[36m110\u001b[0m - \u001b[1mExcluding 0 rows as they failed to satisfy 'None <= hospital_admission <= 0'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.144\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.constraints\u001b[0m:\u001b[36mcheck_constraints\u001b[0m:\u001b[36m110\u001b[0m - \u001b[1mExcluding 26 rows as they failed to satisfy 'None <= discharge_or_death <= 0'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.148\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.extract_subtree\u001b[0m:\u001b[36mextract_subtree\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mSummarizing subtree rooted at 'target.end'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.238\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m114\u001b[0m - \u001b[1mDone. 212 valid rows returned corresponding to 75 subjects.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.238\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m129\u001b[0m - \u001b[1mExtracting label 'discharge_or_death' from window 'target'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.238\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m150\u001b[0m - \u001b[1mSetting index timestamp as 'end' of window 'input'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.240\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mget_and_validate_label_schema\u001b[0m:\u001b[36m114\u001b[0m - \u001b[33m\u001b[1mOutput contains columns that are not valid MEDS label columns. For now, we are dropping them.\n", + "If you need these columns, please comment on https://github.com/justin13601/ACES/issues/97\n", + "Columns:\n", + " - trigger\n", + " - input.end_summary\n", + " - input.start_summary\n", + " - gap.end_summary\n", + " - target.end_summary\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.243\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mmain\u001b[0m:\u001b[36m191\u001b[0m - \u001b[1mCompleted in 0:00:00.399837. Results saved to '/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/task_labels/los_in_hospital_first_48h/train/0.parquet'.\u001b[0m\n", + "[2024-12-14 17:02:14,243][HYDRA] \t#2 : data=sharded data.standard=meds data.root=/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/data data.shard=tuning/0 cohort_dir=/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/task_labels cohort_name=los_in_hospital_first_48h config_path=/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/task_labels/los_in_hospital_first_48h.yaml\n", + "\u001b[32m2024-12-14 17:02:14.309\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mmain\u001b[0m:\u001b[36m149\u001b[0m - \u001b[1mLoading config from '/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/task_labels/los_in_hospital_first_48h.yaml'\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.312\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.config\u001b[0m:\u001b[36mload\u001b[0m:\u001b[36m1341\u001b[0m - \u001b[1mParsing windows...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.312\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.config\u001b[0m:\u001b[36mload\u001b[0m:\u001b[36m1350\u001b[0m - \u001b[1mParsing trigger event...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.312\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.config\u001b[0m:\u001b[36mload\u001b[0m:\u001b[36m1392\u001b[0m - \u001b[1mParsing predicates...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.313\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mmain\u001b[0m:\u001b[36m159\u001b[0m - \u001b[1mAttempting to get predicates dataframe given:\n", + "standard: meds\n", + "ts_format: '%m/%d/%Y %H:%M'\n", + "root: /Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/data\n", + "shard: tuning/0\n", + "path: ${data.root}/${data.shard}.parquet\n", + "_prefix: /${data.shard}\n", + "\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.313\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m269\u001b[0m - \u001b[1mLoading MEDS data...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.315\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m273\u001b[0m - \u001b[1mGenerating plain predicate columns...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.316\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m277\u001b[0m - \u001b[1mAdded predicate column 'hospital_admission'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.317\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m277\u001b[0m - \u001b[1mAdded predicate column 'hospital_discharge'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.317\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m277\u001b[0m - \u001b[1mAdded predicate column 'death'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.318\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m280\u001b[0m - \u001b[1mCleaning up predicates dataframe...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.320\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mget_predicates_df\u001b[0m:\u001b[36m703\u001b[0m - \u001b[1mLoaded plain predicates. Generating derived predicate columns...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.320\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mget_predicates_df\u001b[0m:\u001b[36m717\u001b[0m - \u001b[1mAdded predicate column 'discharge_or_death'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.320\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mget_predicates_df\u001b[0m:\u001b[36m724\u001b[0m - \u001b[1mGenerating special predicate columns...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.320\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m76\u001b[0m - \u001b[1mChecking if '(subject_id, timestamp)' columns are unique...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.321\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.utils\u001b[0m:\u001b[36mlog_tree\u001b[0m:\u001b[36m67\u001b[0m - \u001b[1m\n", + "trigger\n", + "┣━━ input.end\n", + "┃ ┣━━ input.start\n", + "┃ ┗━━ gap.end\n", + "┗━━ target.end\n", + "\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.321\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m85\u001b[0m - \u001b[1mBeginning query...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.321\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m92\u001b[0m - \u001b[1mNo static variable criteria specified, removing all rows with null timestamps...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.321\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m99\u001b[0m - \u001b[1mIdentifying possible trigger nodes based on the specified trigger event...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.321\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.constraints\u001b[0m:\u001b[36mcheck_constraints\u001b[0m:\u001b[36m110\u001b[0m - \u001b[1mExcluding 6,240 rows as they failed to satisfy '1 <= hospital_admission <= None'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.321\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.extract_subtree\u001b[0m:\u001b[36mextract_subtree\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mSummarizing subtree rooted at 'input.end'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.331\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.extract_subtree\u001b[0m:\u001b[36mextract_subtree\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mSummarizing subtree rooted at 'input.start'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.342\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.extract_subtree\u001b[0m:\u001b[36mextract_subtree\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mSummarizing subtree rooted at 'gap.end'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.349\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.constraints\u001b[0m:\u001b[36mcheck_constraints\u001b[0m:\u001b[36m110\u001b[0m - \u001b[1mExcluding 0 rows as they failed to satisfy 'None <= hospital_admission <= 0'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.349\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.constraints\u001b[0m:\u001b[36mcheck_constraints\u001b[0m:\u001b[36m110\u001b[0m - \u001b[1mExcluding 2 rows as they failed to satisfy 'None <= discharge_or_death <= 0'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.352\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.extract_subtree\u001b[0m:\u001b[36mextract_subtree\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mSummarizing subtree rooted at 'target.end'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.360\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m114\u001b[0m - \u001b[1mDone. 15 valid rows returned corresponding to 9 subjects.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.360\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m129\u001b[0m - \u001b[1mExtracting label 'discharge_or_death' from window 'target'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.360\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m150\u001b[0m - \u001b[1mSetting index timestamp as 'end' of window 'input'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.361\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mget_and_validate_label_schema\u001b[0m:\u001b[36m114\u001b[0m - \u001b[33m\u001b[1mOutput contains columns that are not valid MEDS label columns. For now, we are dropping them.\n", + "If you need these columns, please comment on https://github.com/justin13601/ACES/issues/97\n", + "Columns:\n", + " - trigger\n", + " - input.end_summary\n", + " - input.start_summary\n", + " - gap.end_summary\n", + " - target.end_summary\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:14.364\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mmain\u001b[0m:\u001b[36m191\u001b[0m - \u001b[1mCompleted in 0:00:00.054431. Results saved to '/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/task_labels/los_in_hospital_first_48h/tuning/0.parquet'.\u001b[0m\n" + ] + } + ], "source": [ - "!aces-cli --multirun data=sharded data.standard=meds data.root=\"$MIMICIV_MEDS_DIR/data\" \"data.shard=$(expand_shards /content/meds/data/)\" cohort_dir=\" /content/tasks\" cohort_name=\"$TASK_NAME\" config_path=\"$TASK_CONFIG_FP\"" + "!aces-cli --multirun data=sharded data.standard=meds data.root={MEDS_DIR}/data data.shard=$(expand_shards {MEDS_DIR}/data/) cohort_dir={TASK_DIR} cohort_name={TASK_NAME} config_path={TASK_CONFIG_FP}" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 62, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -121,145 +316,364 @@ "id": "7Vvac7DIWyRT", "outputId": "40493f0e-48ba-4f5e-9d9a-401e26f1a9b7" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "train prevalence: 0.231\n", + "tuning prevalence: 0.133\n", + "held_out prevalence: 0.25\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (243, 6)
subject_idprediction_timeboolean_valueinteger_valuefloat_valuecategorical_value
i64datetime[μs]booli64f64str
100128532175-04-07 15:36:00falsenullnullnull
100128532176-11-27 21:28:00falsenullnullnull
100147292125-03-01 07:15:00falsenullnullnull
100147292125-03-21 16:58:00falsenullnullnull
100167422178-07-05 21:13:00falsenullnullnull
100399972135-11-09 02:42:00truenullnullnull
100400252143-03-20 12:34:00truenullnullnull
100400252145-07-05 23:46:00truenullnullnull
100207402150-09-17 14:09:00truenullnullnull
100207402151-01-17 15:25:00truenullnullnull
" + ], + "text/plain": [ + "shape: (243, 6)\n", + "┌────────────┬───────────────────┬───────────────┬───────────────┬─────────────┬───────────────────┐\n", + "│ subject_id ┆ prediction_time ┆ boolean_value ┆ integer_value ┆ float_value ┆ categorical_value │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ datetime[μs] ┆ bool ┆ i64 ┆ f64 ┆ str │\n", + "╞════════════╪═══════════════════╪═══════════════╪═══════════════╪═════════════╪═══════════════════╡\n", + "│ 10012853 ┆ 2175-04-07 ┆ false ┆ null ┆ null ┆ null │\n", + "│ ┆ 15:36:00 ┆ ┆ ┆ ┆ │\n", + "│ 10012853 ┆ 2176-11-27 ┆ false ┆ null ┆ null ┆ null │\n", + "│ ┆ 21:28:00 ┆ ┆ ┆ ┆ │\n", + "│ 10014729 ┆ 2125-03-01 ┆ false ┆ null ┆ null ┆ null │\n", + "│ ┆ 07:15:00 ┆ ┆ ┆ ┆ │\n", + "│ 10014729 ┆ 2125-03-21 ┆ false ┆ null ┆ null ┆ null │\n", + "│ ┆ 16:58:00 ┆ ┆ ┆ ┆ │\n", + "│ 10016742 ┆ 2178-07-05 ┆ false ┆ null ┆ null ┆ null │\n", + "│ ┆ 21:13:00 ┆ ┆ ┆ ┆ │\n", + "│ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ 10039997 ┆ 2135-11-09 ┆ true ┆ null ┆ null ┆ null │\n", + "│ ┆ 02:42:00 ┆ ┆ ┆ ┆ │\n", + "│ 10040025 ┆ 2143-03-20 ┆ true ┆ null ┆ null ┆ null │\n", + "│ ┆ 12:34:00 ┆ ┆ ┆ ┆ │\n", + "│ 10040025 ┆ 2145-07-05 ┆ true ┆ null ┆ null ┆ null │\n", + "│ ┆ 23:46:00 ┆ ┆ ┆ ┆ │\n", + "│ 10020740 ┆ 2150-09-17 ┆ true ┆ null ┆ null ┆ null │\n", + "│ ┆ 14:09:00 ┆ ┆ ┆ ┆ │\n", + "│ 10020740 ┆ 2151-01-17 ┆ true ┆ null ┆ null ┆ null │\n", + "│ ┆ 15:25:00 ┆ ┆ ┆ ┆ │\n", + "└────────────┴───────────────────┴───────────────┴───────────────┴─────────────┴───────────────────┘" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import polars as pl\n", "\n", "# execute query and get results\n", - "df = pl.read_parquet(f\"/content/tasks/{TASK_NAME}/**/*.parquet\")\n", + "df = pl.read_parquet(f\"{TASK_DIR}/{TASK_NAME}/**/*.parquet\")\n", "\n", - "print(\"train prevalence: \" + str(round(pl.read_parquet(f\"/content/tasks/{TASK_NAME}/train/*.parquet\")['boolean_value'].mean(), 3)))\n", - "print(\"tuning prevalence: \" + str(round(pl.read_parquet(f\"/content/tasks/{TASK_NAME}/tuning/*.parquet\")['boolean_value'].mean(), 3)))\n", - "print(\"held_out prevalence: \" + str(round(pl.read_parquet(f\"/content/tasks/{TASK_NAME}/held_out/*.parquet\")['boolean_value'].mean(), 3)))\n", + "print(\"train prevalence: \" + str(round(pl.read_parquet(f\"{TASK_DIR}/{TASK_NAME}/train/*.parquet\")['boolean_value'].mean(), 3)))\n", + "print(\"tuning prevalence: \" + str(round(pl.read_parquet(f\"{TASK_DIR}/{TASK_NAME}/tuning/*.parquet\")['boolean_value'].mean(), 3)))\n", + "print(\"held_out prevalence: \" + str(round(pl.read_parquet(f\"{TASK_DIR}/{TASK_NAME}/held_out/*.parquet\")['boolean_value'].mean(), 3)))\n", "\n", "\n", "df.sort('boolean_value')" ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "XWB7O1UGhRIo", - "outputId": "e3416d5e-7427-4cf4-c0ab-20053a9d3430" - }, - "outputs": [], - "source": [ - "#@title Install meds-tab\n", - "\n", - "!pip uninstall es-aces -y\n", - "!pip install meds-tab" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SeGawIqli0nn" - }, - "outputs": [], - "source": [ - "MIMICIV_MEDS_DIR = \"/content/meds/\"\n", - "OUTPUT_TABULARIZATION_DIR=\"/content/tabularized/\"\n", - "TASK_DIR=\"/content/tasks/\"\n", - "TASK_NAME=\"in_hospital_3d_los_after_48h\"\n", - "OUTPUT_MODEL_DIR=\"/content/output/meds_tab/\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Tud0_0cgjljP", - "outputId": "fb3417e0-3ba4-4f9a-ab95-ce3ba8731ca1" - }, - "outputs": [], - "source": [ - "!meds-tab-describe input_dir={MIMICIV_MEDS_DIR}/data output_dir={OUTPUT_TABULARIZATION_DIR}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "RVLBdOn1mnV5" - }, - "outputs": [], + "cell_type": "markdown", + "metadata": {}, "source": [ - "# Define the window sizes and aggregations to generate features for\n", - "WINDOW_SIZES = \"tabularization.window_sizes=[1d,30d,365d]\"\n", - "AGGREGATIONS = \"tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]\"" + "### MEDS-DEV Has tons of pre-defined tasks we can use!!!" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "KhCPqBmduNYK" - }, - "outputs": [], + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running task mortality/in_icu/first_24h on dataset MIMIC-IV with MEDS_ROOT_DIR=/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds and SHARDS=held_out/0,train/0,tuning/0\n", + "[2024-12-14 17:02:21,042][HYDRA] Launching 3 jobs locally\n", + "[2024-12-14 17:02:21,042][HYDRA] \t#0 : data.shard=held_out/0\n", + "\u001b[32m2024-12-14 17:02:21.188\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mmain\u001b[0m:\u001b[36m149\u001b[0m - \u001b[1mLoading config from '/Users/sim/Documents/projects/MEDS-DEV/src/MEDS_DEV/tasks/criteria/mortality/in_icu/first_24h.yaml'\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.190\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mmain\u001b[0m:\u001b[36m151\u001b[0m - \u001b[1mOverriding predicates and/or demographics from '/Users/sim/Documents/projects/MEDS-DEV/src/MEDS_DEV/datasets/MIMIC-IV/predicates.yaml'\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.207\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.config\u001b[0m:\u001b[36mload\u001b[0m:\u001b[36m1341\u001b[0m - \u001b[1mParsing windows...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.207\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.config\u001b[0m:\u001b[36mload\u001b[0m:\u001b[36m1350\u001b[0m - \u001b[1mParsing trigger event...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.207\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.config\u001b[0m:\u001b[36mload\u001b[0m:\u001b[36m1392\u001b[0m - \u001b[1mParsing predicates...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.209\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mmain\u001b[0m:\u001b[36m159\u001b[0m - \u001b[1mAttempting to get predicates dataframe given:\n", + "standard: meds\n", + "ts_format: '%m/%d/%Y %H:%M'\n", + "root: ${oc.env:MEDS_ROOT_DIR}/data\n", + "shard: held_out/0\n", + "path: ${data.root}/${data.shard}.parquet\n", + "_prefix: /${data.shard}\n", + "\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.209\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m269\u001b[0m - \u001b[1mLoading MEDS data...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.223\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m273\u001b[0m - \u001b[1mGenerating plain predicate columns...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.229\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m277\u001b[0m - \u001b[1mAdded predicate column 'icu_admission'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.230\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m277\u001b[0m - \u001b[1mAdded predicate column 'icu_discharge'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.231\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m277\u001b[0m - \u001b[1mAdded predicate column 'death'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.231\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m280\u001b[0m - \u001b[1mCleaning up predicates dataframe...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.237\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mget_predicates_df\u001b[0m:\u001b[36m703\u001b[0m - \u001b[1mLoaded plain predicates. Generating derived predicate columns...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.238\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mget_predicates_df\u001b[0m:\u001b[36m717\u001b[0m - \u001b[1mAdded predicate column 'discharge_or_death'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.238\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mget_predicates_df\u001b[0m:\u001b[36m724\u001b[0m - \u001b[1mGenerating special predicate columns...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.238\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m76\u001b[0m - \u001b[1mChecking if '(subject_id, timestamp)' columns are unique...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.240\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.utils\u001b[0m:\u001b[36mlog_tree\u001b[0m:\u001b[36m67\u001b[0m - \u001b[1m\n", + "trigger\n", + "┣━━ input.end\n", + "┃ ┗━━ input.start\n", + "┗━━ gap.end\n", + " ┗━━ target.end\n", + "\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.240\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m85\u001b[0m - \u001b[1mBeginning query...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.240\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m92\u001b[0m - \u001b[1mNo static variable criteria specified, removing all rows with null timestamps...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.242\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m99\u001b[0m - \u001b[1mIdentifying possible trigger nodes based on the specified trigger event...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.242\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.constraints\u001b[0m:\u001b[36mcheck_constraints\u001b[0m:\u001b[36m110\u001b[0m - \u001b[1mExcluding 4,163 rows as they failed to satisfy '1 <= icu_admission <= None'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.243\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.extract_subtree\u001b[0m:\u001b[36mextract_subtree\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mSummarizing subtree rooted at 'input.end'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.255\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.extract_subtree\u001b[0m:\u001b[36mextract_subtree\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mSummarizing subtree rooted at 'input.start'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.277\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.extract_subtree\u001b[0m:\u001b[36mextract_subtree\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mSummarizing subtree rooted at 'gap.end'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.285\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.constraints\u001b[0m:\u001b[36mcheck_constraints\u001b[0m:\u001b[36m110\u001b[0m - \u001b[1mExcluding 0 rows as they failed to satisfy 'None <= icu_admission <= 0'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.285\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.constraints\u001b[0m:\u001b[36mcheck_constraints\u001b[0m:\u001b[36m110\u001b[0m - \u001b[1mExcluding 6 rows as they failed to satisfy 'None <= discharge_or_death <= 0'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.286\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.extract_subtree\u001b[0m:\u001b[36mextract_subtree\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mSummarizing subtree rooted at 'target.end'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.302\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m114\u001b[0m - \u001b[1mDone. 6 valid rows returned corresponding to 4 subjects.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.302\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m129\u001b[0m - \u001b[1mExtracting label 'death' from window 'target'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.303\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m142\u001b[0m - \u001b[33m\u001b[1mAll labels in the extracted cohort are the same: '0'. This may indicate an issue with the task logic. Please double-check your configuration file if this is not expected.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.303\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m150\u001b[0m - \u001b[1mSetting index timestamp as 'end' of window 'input'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.306\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mget_and_validate_label_schema\u001b[0m:\u001b[36m114\u001b[0m - \u001b[33m\u001b[1mOutput contains columns that are not valid MEDS label columns. For now, we are dropping them.\n", + "If you need these columns, please comment on https://github.com/justin13601/ACES/issues/97\n", + "Columns:\n", + " - trigger\n", + " - input.end_summary\n", + " - input.start_summary\n", + " - gap.end_summary\n", + " - target.end_summary\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.398\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mmain\u001b[0m:\u001b[36m191\u001b[0m - \u001b[1mCompleted in 0:00:00.209584. Results saved to '/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/task_labels/mortality/in_icu/first_24h/held_out/0.parquet'.\u001b[0m\n", + "[2024-12-14 17:02:21,399][HYDRA] \t#1 : data.shard=train/0\n", + "\u001b[32m2024-12-14 17:02:21.469\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mmain\u001b[0m:\u001b[36m149\u001b[0m - \u001b[1mLoading config from '/Users/sim/Documents/projects/MEDS-DEV/src/MEDS_DEV/tasks/criteria/mortality/in_icu/first_24h.yaml'\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.470\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mmain\u001b[0m:\u001b[36m151\u001b[0m - \u001b[1mOverriding predicates and/or demographics from '/Users/sim/Documents/projects/MEDS-DEV/src/MEDS_DEV/datasets/MIMIC-IV/predicates.yaml'\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.485\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.config\u001b[0m:\u001b[36mload\u001b[0m:\u001b[36m1341\u001b[0m - \u001b[1mParsing windows...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.485\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.config\u001b[0m:\u001b[36mload\u001b[0m:\u001b[36m1350\u001b[0m - \u001b[1mParsing trigger event...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.485\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.config\u001b[0m:\u001b[36mload\u001b[0m:\u001b[36m1392\u001b[0m - \u001b[1mParsing predicates...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.486\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mmain\u001b[0m:\u001b[36m159\u001b[0m - \u001b[1mAttempting to get predicates dataframe given:\n", + "standard: meds\n", + "ts_format: '%m/%d/%Y %H:%M'\n", + "root: ${oc.env:MEDS_ROOT_DIR}/data\n", + "shard: train/0\n", + "path: ${data.root}/${data.shard}.parquet\n", + "_prefix: /${data.shard}\n", + "\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.487\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m269\u001b[0m - \u001b[1mLoading MEDS data...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.516\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m273\u001b[0m - \u001b[1mGenerating plain predicate columns...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.531\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m277\u001b[0m - \u001b[1mAdded predicate column 'icu_admission'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.546\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m277\u001b[0m - \u001b[1mAdded predicate column 'icu_discharge'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.549\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m277\u001b[0m - \u001b[1mAdded predicate column 'death'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.549\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m280\u001b[0m - \u001b[1mCleaning up predicates dataframe...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.564\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mget_predicates_df\u001b[0m:\u001b[36m703\u001b[0m - \u001b[1mLoaded plain predicates. Generating derived predicate columns...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.565\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mget_predicates_df\u001b[0m:\u001b[36m717\u001b[0m - \u001b[1mAdded predicate column 'discharge_or_death'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.565\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mget_predicates_df\u001b[0m:\u001b[36m724\u001b[0m - \u001b[1mGenerating special predicate columns...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.565\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m76\u001b[0m - \u001b[1mChecking if '(subject_id, timestamp)' columns are unique...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.567\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.utils\u001b[0m:\u001b[36mlog_tree\u001b[0m:\u001b[36m67\u001b[0m - \u001b[1m\n", + "trigger\n", + "┣━━ input.end\n", + "┃ ┗━━ input.start\n", + "┗━━ gap.end\n", + " ┗━━ target.end\n", + "\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.568\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m85\u001b[0m - \u001b[1mBeginning query...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.568\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m92\u001b[0m - \u001b[1mNo static variable criteria specified, removing all rows with null timestamps...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.568\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m99\u001b[0m - \u001b[1mIdentifying possible trigger nodes based on the specified trigger event...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.568\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.constraints\u001b[0m:\u001b[36mcheck_constraints\u001b[0m:\u001b[36m110\u001b[0m - \u001b[1mExcluding 72,774 rows as they failed to satisfy '1 <= icu_admission <= None'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.569\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.extract_subtree\u001b[0m:\u001b[36mextract_subtree\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mSummarizing subtree rooted at 'input.end'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.622\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.extract_subtree\u001b[0m:\u001b[36mextract_subtree\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mSummarizing subtree rooted at 'input.start'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.680\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.extract_subtree\u001b[0m:\u001b[36mextract_subtree\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mSummarizing subtree rooted at 'gap.end'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.714\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.constraints\u001b[0m:\u001b[36mcheck_constraints\u001b[0m:\u001b[36m110\u001b[0m - \u001b[1mExcluding 2 rows as they failed to satisfy 'None <= icu_admission <= 0'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.715\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.constraints\u001b[0m:\u001b[36mcheck_constraints\u001b[0m:\u001b[36m110\u001b[0m - \u001b[1mExcluding 53 rows as they failed to satisfy 'None <= discharge_or_death <= 0'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.715\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.extract_subtree\u001b[0m:\u001b[36mextract_subtree\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mSummarizing subtree rooted at 'target.end'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.804\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m114\u001b[0m - \u001b[1mDone. 60 valid rows returned corresponding to 47 subjects.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.804\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m129\u001b[0m - \u001b[1mExtracting label 'death' from window 'target'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.805\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m150\u001b[0m - \u001b[1mSetting index timestamp as 'end' of window 'input'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.806\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mget_and_validate_label_schema\u001b[0m:\u001b[36m114\u001b[0m - \u001b[33m\u001b[1mOutput contains columns that are not valid MEDS label columns. For now, we are dropping them.\n", + "If you need these columns, please comment on https://github.com/justin13601/ACES/issues/97\n", + "Columns:\n", + " - trigger\n", + " - input.end_summary\n", + " - input.start_summary\n", + " - gap.end_summary\n", + " - target.end_summary\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.810\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mmain\u001b[0m:\u001b[36m191\u001b[0m - \u001b[1mCompleted in 0:00:00.340355. Results saved to '/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/task_labels/mortality/in_icu/first_24h/train/0.parquet'.\u001b[0m\n", + "[2024-12-14 17:02:21,810][HYDRA] \t#2 : data.shard=tuning/0\n", + "\u001b[32m2024-12-14 17:02:21.878\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mmain\u001b[0m:\u001b[36m149\u001b[0m - \u001b[1mLoading config from '/Users/sim/Documents/projects/MEDS-DEV/src/MEDS_DEV/tasks/criteria/mortality/in_icu/first_24h.yaml'\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.879\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mmain\u001b[0m:\u001b[36m151\u001b[0m - \u001b[1mOverriding predicates and/or demographics from '/Users/sim/Documents/projects/MEDS-DEV/src/MEDS_DEV/datasets/MIMIC-IV/predicates.yaml'\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.894\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.config\u001b[0m:\u001b[36mload\u001b[0m:\u001b[36m1341\u001b[0m - \u001b[1mParsing windows...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.895\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.config\u001b[0m:\u001b[36mload\u001b[0m:\u001b[36m1350\u001b[0m - \u001b[1mParsing trigger event...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.895\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.config\u001b[0m:\u001b[36mload\u001b[0m:\u001b[36m1392\u001b[0m - \u001b[1mParsing predicates...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.895\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mmain\u001b[0m:\u001b[36m159\u001b[0m - \u001b[1mAttempting to get predicates dataframe given:\n", + "standard: meds\n", + "ts_format: '%m/%d/%Y %H:%M'\n", + "root: ${oc.env:MEDS_ROOT_DIR}/data\n", + "shard: tuning/0\n", + "path: ${data.root}/${data.shard}.parquet\n", + "_prefix: /${data.shard}\n", + "\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.896\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m269\u001b[0m - \u001b[1mLoading MEDS data...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.897\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m273\u001b[0m - \u001b[1mGenerating plain predicate columns...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.899\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m277\u001b[0m - \u001b[1mAdded predicate column 'icu_admission'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.901\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m277\u001b[0m - \u001b[1mAdded predicate column 'icu_discharge'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.901\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m277\u001b[0m - \u001b[1mAdded predicate column 'death'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.901\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mgenerate_plain_predicates_from_meds\u001b[0m:\u001b[36m280\u001b[0m - \u001b[1mCleaning up predicates dataframe...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.903\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mget_predicates_df\u001b[0m:\u001b[36m703\u001b[0m - \u001b[1mLoaded plain predicates. Generating derived predicate columns...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.903\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mget_predicates_df\u001b[0m:\u001b[36m717\u001b[0m - \u001b[1mAdded predicate column 'discharge_or_death'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.904\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.predicates\u001b[0m:\u001b[36mget_predicates_df\u001b[0m:\u001b[36m724\u001b[0m - \u001b[1mGenerating special predicate columns...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.904\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m76\u001b[0m - \u001b[1mChecking if '(subject_id, timestamp)' columns are unique...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.904\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.utils\u001b[0m:\u001b[36mlog_tree\u001b[0m:\u001b[36m67\u001b[0m - \u001b[1m\n", + "trigger\n", + "┣━━ input.end\n", + "┃ ┗━━ input.start\n", + "┗━━ gap.end\n", + " ┗━━ target.end\n", + "\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.904\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m85\u001b[0m - \u001b[1mBeginning query...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.904\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m92\u001b[0m - \u001b[1mNo static variable criteria specified, removing all rows with null timestamps...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.905\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m99\u001b[0m - \u001b[1mIdentifying possible trigger nodes based on the specified trigger event...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.905\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.constraints\u001b[0m:\u001b[36mcheck_constraints\u001b[0m:\u001b[36m110\u001b[0m - \u001b[1mExcluding 6,242 rows as they failed to satisfy '1 <= icu_admission <= None'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.905\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.extract_subtree\u001b[0m:\u001b[36mextract_subtree\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mSummarizing subtree rooted at 'input.end'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.911\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.extract_subtree\u001b[0m:\u001b[36mextract_subtree\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mSummarizing subtree rooted at 'input.start'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.924\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.extract_subtree\u001b[0m:\u001b[36mextract_subtree\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mSummarizing subtree rooted at 'gap.end'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.931\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.constraints\u001b[0m:\u001b[36mcheck_constraints\u001b[0m:\u001b[36m110\u001b[0m - \u001b[1mExcluding 0 rows as they failed to satisfy 'None <= icu_admission <= 0'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.931\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.constraints\u001b[0m:\u001b[36mcheck_constraints\u001b[0m:\u001b[36m110\u001b[0m - \u001b[1mExcluding 7 rows as they failed to satisfy 'None <= discharge_or_death <= 0'.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.932\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.extract_subtree\u001b[0m:\u001b[36mextract_subtree\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mSummarizing subtree rooted at 'target.end'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.945\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m114\u001b[0m - \u001b[1mDone. 8 valid rows returned corresponding to 5 subjects.\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.945\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m129\u001b[0m - \u001b[1mExtracting label 'death' from window 'target'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.945\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.query\u001b[0m:\u001b[36mquery\u001b[0m:\u001b[36m150\u001b[0m - \u001b[1mSetting index timestamp as 'end' of window 'input'...\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.946\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mget_and_validate_label_schema\u001b[0m:\u001b[36m114\u001b[0m - \u001b[33m\u001b[1mOutput contains columns that are not valid MEDS label columns. For now, we are dropping them.\n", + "If you need these columns, please comment on https://github.com/justin13601/ACES/issues/97\n", + "Columns:\n", + " - trigger\n", + " - input.end_summary\n", + " - input.start_summary\n", + " - gap.end_summary\n", + " - target.end_summary\u001b[0m\n", + "\u001b[32m2024-12-14 17:02:21.950\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36maces.__main__\u001b[0m:\u001b[36mmain\u001b[0m:\u001b[36m191\u001b[0m - \u001b[1mCompleted in 0:00:00.071263. Results saved to '/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/task_labels/mortality/in_icu/first_24h/tuning/0.parquet'.\u001b[0m\n" + ] + } + ], "source": [ - "!rm -rf /content/tabularized/tabularize/" + "TASK_NAME=\"mortality/in_icu/first_24h\"\n", + "!../src/MEDS_DEV/helpers/extract_task.sh {MEDS_DIR} \"MIMIC-IV\" {TASK_NAME}" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "p_D07KzxjVUl", - "outputId": "8836b076-cf64-4f29-da81-ac5125ab7608" - }, - "outputs": [], + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/task_labels/mortality/in_icu/first_24h/**/*.parquet\n" + ] + } + ], "source": [ - "!meds-tab-tabularize-static \"input_dir=$MIMICIV_MEDS_DIR/data\" \"output_dir=$OUTPUT_TABULARIZATION_DIR\" do_overwrite=False $WINDOW_SIZES $AGGREGATIONS" + "!echo \"{TASK_DIR}/{TASK_NAME}/**/*.parquet\"" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "train prevalence: 0.133\n", + "tuning prevalence: 0.125\n", + "held_out prevalence: 0.0\n" + ] }, - "id": "u-e-mV2Hk-Qf", - "outputId": "c292be12-ff74-44e4-f039-758e10ccc909" - }, - "outputs": [], + { + "data": { + "text/html": [ + "
\n", + "shape: (74, 6)
subject_idprediction_timeboolean_valueinteger_valuefloat_valuecategorical_value
i64datetime[μs]booli64f64str
100128532176-11-27 02:34:49falsenullnullnull
100147292125-02-28 10:03:08falsenullnullnull
100167422178-07-04 22:45:00falsenullnullnull
100167422178-07-14 08:16:00falsenullnullnull
100167422178-07-23 08:19:00falsenullnullnull
100104712155-12-03 20:33:00truenullnullnull
100159312177-03-25 21:48:07truenullnullnull
100378612117-03-15 16:34:58truenullnullnull
100379752185-01-18 19:12:12truenullnullnull
100380812115-10-10 10:15:25truenullnullnull
" + ], + "text/plain": [ + "shape: (74, 6)\n", + "┌────────────┬───────────────────┬───────────────┬───────────────┬─────────────┬───────────────────┐\n", + "│ subject_id ┆ prediction_time ┆ boolean_value ┆ integer_value ┆ float_value ┆ categorical_value │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ datetime[μs] ┆ bool ┆ i64 ┆ f64 ┆ str │\n", + "╞════════════╪═══════════════════╪═══════════════╪═══════════════╪═════════════╪═══════════════════╡\n", + "│ 10012853 ┆ 2176-11-27 ┆ false ┆ null ┆ null ┆ null │\n", + "│ ┆ 02:34:49 ┆ ┆ ┆ ┆ │\n", + "│ 10014729 ┆ 2125-02-28 ┆ false ┆ null ┆ null ┆ null │\n", + "│ ┆ 10:03:08 ┆ ┆ ┆ ┆ │\n", + "│ 10016742 ┆ 2178-07-04 ┆ false ┆ null ┆ null ┆ null │\n", + "│ ┆ 22:45:00 ┆ ┆ ┆ ┆ │\n", + "│ 10016742 ┆ 2178-07-14 ┆ false ┆ null ┆ null ┆ null │\n", + "│ ┆ 08:16:00 ┆ ┆ ┆ ┆ │\n", + "│ 10016742 ┆ 2178-07-23 ┆ false ┆ null ┆ null ┆ null │\n", + "│ ┆ 08:19:00 ┆ ┆ ┆ ┆ │\n", + "│ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ 10010471 ┆ 2155-12-03 ┆ true ┆ null ┆ null ┆ null │\n", + "│ ┆ 20:33:00 ┆ ┆ ┆ ┆ │\n", + "│ 10015931 ┆ 2177-03-25 ┆ true ┆ null ┆ null ┆ null │\n", + "│ ┆ 21:48:07 ┆ ┆ ┆ ┆ │\n", + "│ 10037861 ┆ 2117-03-15 ┆ true ┆ null ┆ null ┆ null │\n", + "│ ┆ 16:34:58 ┆ ┆ ┆ ┆ │\n", + "│ 10037975 ┆ 2185-01-18 ┆ true ┆ null ┆ null ┆ null │\n", + "│ ┆ 19:12:12 ┆ ┆ ┆ ┆ │\n", + "│ 10038081 ┆ 2115-10-10 ┆ true ┆ null ┆ null ┆ null │\n", + "│ ┆ 10:15:25 ┆ ┆ ┆ ┆ │\n", + "└────────────┴───────────────────┴───────────────┴───────────────┴─────────────┴───────────────────┘" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "!meds-tab-tabularize-time-series --multirun \"worker=range(0,2)\" \"hydra/launcher=joblib\" \"input_dir=$MIMICIV_MEDS_DIR/data\" \"output_dir=$OUTPUT_TABULARIZATION_DIR\" do_overwrite=False $WINDOW_SIZES $AGGREGATIONS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NmaR_-Fik4eH" - }, - "outputs": [], - "source": [ - "!meds-tab-cache-task \"input_dir={MIMICIV_MEDS_DIR}/data\" \"output_dir=$OUTPUT_TABULARIZATION_DIR\" \"input_label_dir=$TASK_DIR/$TASK_NAME/\" \"task_name=$TASK_NAME\" do_overwrite=False $WINDOW_SIZES $AGGREGATIONS" + "import polars as pl\n", + "\n", + "\n", + "# execute query and get results\n", + "df = pl.read_parquet(f\"{TASK_DIR}/{TASK_NAME}/**/*.parquet\")\n", + "\n", + "print(\"train prevalence: \" + str(round(pl.read_parquet(f\"{TASK_DIR}/{TASK_NAME}/train/*.parquet\")['boolean_value'].mean(), 3)))\n", + "print(\"tuning prevalence: \" + str(round(pl.read_parquet(f\"{TASK_DIR}/{TASK_NAME}/tuning/*.parquet\")['boolean_value'].mean(), 3)))\n", + "print(\"held_out prevalence: \" + str(round(pl.read_parquet(f\"{TASK_DIR}/{TASK_NAME}/held_out/*.parquet\")['boolean_value'].mean(), 3)))\n", + "\n", + "\n", + "df.sort('boolean_value')" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "dLIkOzTblBB2" - }, + "metadata": {}, "outputs": [], - "source": [ - "!meds-tab-xgboost --multirun \"input_dir=$MIMICIV_MEDS_DIR/data\" \"output_dir=$OUTPUT_TABULARIZATION_DIR\" \"output_model_dir=$OUTPUT_MODEL_DIR/$TASK_NAME/\" \"task_name=$TASK_NAME\" do_overwrite=False \"hydra.sweeper.n_trials=10\" $WINDOW_SIZES $AGGREGATIONS \"tabularization.min_code_inclusion_count=10\"" - ] + "source": [] } ], "metadata": { @@ -267,11 +681,21 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "dev", + "language": "python", "name": "python3" }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" } }, "nbformat": 4, diff --git a/demo/configs/extract_MIMIC.yaml b/demo/configs/extract_MIMIC.yaml index 53577f5..4002089 100644 --- a/demo/configs/extract_MIMIC.yaml +++ b/demo/configs/extract_MIMIC.yaml @@ -27,10 +27,10 @@ stage_configs: infer_schema_length: 999999999 split_and_shard_subjects: n_subjects_per_shard: 1000 - split_fracs: - train: 0.5 - tuning: 0.25 - held_out: 0.25 + split_fracs: + train: 0.5 + tuning: 0.25 + held_out: 0.25 stages: - shard_events diff --git a/demo/extract_meds_data.ipynb b/demo/extract_meds_data.ipynb index 55a228f..f6c55b6 100644 --- a/demo/extract_meds_data.ipynb +++ b/demo/extract_meds_data.ipynb @@ -3619,7 +3619,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -3635,7 +3635,6 @@ "import tempfile\n", "import os\n", "from pathlib import Path\n", - "temp_dir = tempfile.TemporaryDirectory()\n", "notebook_dir = os.getcwd()\n", "\n", "ROOT_DIR=f\"{notebook_dir}/work_dir/eicu_demo/\"\n", diff --git a/demo/meds_tab.ipynb b/demo/meds_tab.ipynb index cde4110..20de31f 100644 --- a/demo/meds_tab.ipynb +++ b/demo/meds_tab.ipynb @@ -9,83 +9,6 @@ "# Using an example MEDS tool, ACES for labeling" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "collapsed": true, - "id": "z3_pG9YAWpKy", - "outputId": "efa4c286-413d-4a91-a53d-fb41769cd4f2" - }, - "outputs": [], - "source": [ - "#@title Install ACES\n", - "\n", - "\n", - "!pip install es-aces" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "H6fqe217XDhi" - }, - "outputs": [], - "source": [ - "# From the ACES documentation\n", - "\n", - "task_config = \"\"\"\n", - "description: >-\n", - " This file specifies the base configuration for the prediction of a hospital los being greater than 3days,\n", - " leveraging only the first 48 hours of data after admission, with a 24 hour gap between the input window\n", - " and the target window. Patients who die or are discharged in the gap window are excluded. Note that this\n", - " task is in-**hospital** los, not in-**ICU** los which is a different task.\n", - "\n", - "predicates:\n", - " hospital_admission:\n", - " code: {regex: \"HOSPITAL_ADMISSION//.*\"}\n", - " hospital_discharge:\n", - " code: {regex: \"HOSPITAL_DISCHARGE//.*\"}\n", - " death:\n", - " code: MEDS_DEATH\n", - " discharge_or_death:\n", - " expr: or(hospital_discharge, death)\n", - "\n", - "trigger: hospital_admission\n", - "\n", - "windows:\n", - " input:\n", - " start: NULL\n", - " end: trigger + 48h\n", - " start_inclusive: True\n", - " end_inclusive: True\n", - " index_timestamp: end\n", - " gap:\n", - " start: input.end\n", - " end: start + 24h\n", - " start_inclusive: False\n", - " end_inclusive: True\n", - " has:\n", - " hospital_admission: (None, 0)\n", - " discharge_or_death: (None, 0)\n", - " target:\n", - " start: trigger\n", - " end: start + 3d\n", - " start_inclusive: False\n", - " end_inclusive: True\n", - " label: discharge_or_death\n", - "\"\"\"\n", - "!mkdir /content/tasks/ -p\n", - "TASK_NAME = \"in_hospital_3d_los_after_48h\"\n", - "TASK_CONFIG_FP = f\"/content/tasks/{TASK_NAME}.yaml\"\n", - "with open(TASK_CONFIG_FP, 'w') as f:\n", - " f.write(task_config)" - ] - }, { "cell_type": "code", "execution_count": null, diff --git a/pyproject.toml b/pyproject.toml index 649eef6..152d01d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ classifiers = [ "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ] -dependencies = ["meds==0.3.3", "es-aces==0.5.0"] +dependencies = ["meds>=0.3.3", "es-aces>=0.5.0"] [tool.setuptools_scm] diff --git a/src/MEDS_DEV/demo/meds_cehrbert.py b/src/MEDS_DEV/demo/meds_cehrbert.py deleted file mode 100644 index de1c37f..0000000 --- a/src/MEDS_DEV/demo/meds_cehrbert.py +++ /dev/null @@ -1,397 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.16.4 -# kernelspec: -# display_name: Python 3 -# name: python3 -# --- - -# %% [Colab-only] Switch Colab to python 3.12 -# !sudo apt-get install python3.12 python3.12-venv -# import sys -# !python3.12 -m venv meds_env -# import os -# os.environ['PATH'] = '/content/meds_env/bin:' + os.environ['PATH'] -# !pip install --upgrade pip - -# # Then in a new code cell: -# import sys -# sys.executable = '/content/meds_env/bin/python' - -# # Confirm python version is 3.12 -# !python --version - -# %% [markdown] -# ## Install dependencies - -# %% -!pip -q install meds_etl==0.3.6 meds_transforms==0.0.7 - -# %% [markdown] -# # Download MIMIC-IV demo - -# %% -# macOS users should install wget (e.g. through brew) -!wget -q -r -N -c --no-host-directories --cut-dirs=1 -np -P ./content/download https://physionet.org/files/mimic-iv-demo/2.2/ - -# %% -# Download pre-meds script, event config (defining how raw data is converted to meds data), and meds-transform config -!mkdir -p ./content/meds-transform/ -!git clone --depth 1 https://github.com/mmcdermott/MEDS_transforms.git ./content/tmp/ -!mv ./content/tmp/MIMIC-IV_Example ./content/MIMIC-IV_Example - -# %% -# Download MIMIC IV metadata -MIMICIV_RAW_DIR = "https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map" -MIMICIV_PRE_MEDS_DIR = "./content/pre_meds/" -!mkdir {MIMICIV_PRE_MEDS_DIR} - -OUTPUT_DIR = "./content/download/mimic-iv-demo/2.2" - -files = [ - 'd_labitems_to_loinc.csv', - 'inputevents_to_rxnorm.csv', - 'lab_itemid_to_loinc.csv', - 'meas_chartevents_main.csv', - 'meas_chartevents_value.csv', - 'numerics-summary.csv', - 'outputevents_to_loinc.csv', - 'proc_datetimeevents.csv', - 'proc_itemid.csv', - 'waveforms-summary.csv' -] - -for file in files: - !wget -O {OUTPUT_DIR}/{file} {MIMICIV_RAW_DIR}/{file} - !wget -O {MIMICIV_PRE_MEDS_DIR}/{file} {MIMICIV_RAW_DIR}/{file} - -# %% -# Convert to MEDS -CURRENT_DIR = !pwd -CURRENT_DIR = CURRENT_DIR[0] - -# %% -# Convert to MEDS -TUTORIAL_DIR = CURRENT_DIR + "/content/MIMIC-IV_Example" -MIMICIV_RAW_DIR = CURRENT_DIR + "/content/download/mimic-iv-demo/2.2" -MIMICIV_PRE_MEDS_DIR = CURRENT_DIR + "/content/pre_meds" -MIMICIV_MEDS_DIR = CURRENT_DIR + "/content/meds" - -EVENT_CONVERSION_CONFIG_FP = CURRENT_DIR + "/content/MIMIC-IV_Example/configs/event_config.yaml" -PIPELINE_CONFIG_PATH = CURRENT_DIR + "/content/MIMIC-IV_Example/configs/pipeline_config.yaml" -!cd {TUTORIAL_DIR} && ./run.sh {MIMICIV_RAW_DIR} {MIMICIV_PRE_MEDS_DIR} {MIMICIV_MEDS_DIR} do_unzip=true - -# %% [markdown] -# # Examine MEDS data - -# %% -import polars as pl - -data = pl.read_parquet('./content/meds/data/**/*.parquet') -data[['subject_id', 'time', 'code', 'numeric_value']] - -# %% [markdown] -# # A simple Polars analysis - -# %% -icd10_events = data.filter(pl.col('code').str.starts_with('DIAGNOSIS//ICD//10//')) -icd10_events.group_by('code').count().sort('count', descending=True) - -# %% -df = pl.read_parquet("./content/meds/metadata/codes.parquet") -df - -# %% [markdown] -# ## Using an example MEDS tool, ACES for labeling - -# %% [markdown] -# ## Install ACES - -# %% -!pip install es-aces - -# %% - -# From ACES documentation -task_config = """ -description: >- - This file specifies the base configuration for the prediction of a hospital los being greater than 3days, - leveraging only the first 48 hours of data after admission, with a 24 hour gap between the input window - and the target window. Patients who die or are discharged in the gap window are excluded. Note that this - task is in-**hospital** los, not in-**ICU** los which is a different task. - -predicates: - hospital_admission: - code: {regex: "HOSPITAL_ADMISSION//.*"} - hospital_discharge: - code: {regex: "HOSPITAL_DISCHARGE//.*"} - death: - code: MEDS_DEATH - discharge_or_death: - expr: or(hospital_discharge, death) - -trigger: hospital_admission - -windows: - input: - start: NULL - end: trigger + 48h - start_inclusive: True - end_inclusive: True - index_timestamp: end - gap: - start: input.end - end: start + 24h - start_inclusive: False - end_inclusive: True - has: - hospital_admission: (None, 0) - discharge_or_death: (None, 0) - target: - start: trigger - end: start + 3d - start_inclusive: False - end_inclusive: True - label: discharge_or_death -""" - -!mkdir ./content/tasks/ -p -TASK_NAME = "in_hospital_3d_los_after_48h" -TASK_CONFIG_FP = f"./content/tasks/{TASK_NAME}.yaml" -with open(TASK_CONFIG_FP, 'w') as f: - f.write(task_config) - - -# %% -!pip install es-aces - -# %% -!echo $TASK_NAME -!echo $TASK_CONFIG_FP - -# %% -!aces-cli --multirun data=sharded data.standard=meds data.root="$MIMICIV_MEDS_DIR/data" "data.shard=$(expand_shards ./content/meds/data/)" cohort_dir=" ./content/tasks" cohort_name="$TASK_NAME" config_path="$TASK_CONFIG_FP" - -# %% -# TODO: reimporting polars due to dependencies? -import polars as pl - -# Execute query and get results -df = pl.read_parquet(f"./content/tasks/{TASK_NAME}/**/*.parquet") - -print("train prevalence: " + str(round(pl.read_parquet(f"./content/tasks/{TASK_NAME}/train/*.parquet")['boolean_value'].mean(), 3))) -print("tuning prevalence: " + str(round(pl.read_parquet(f"./content/tasks/{TASK_NAME}/tuning/*.parquet")['boolean_value'].mean(), 3))) -print("held_out prevalence: " + str(round(pl.read_parquet(f"./content/tasks/{TASK_NAME}/held_out/*.parquet")['boolean_value'].mean(), 3))) - - -df.sort('boolean_value') - -# %% [markdown] -# ## Switch Colab to python 3.11 for cehrbert -# %% -# %%capture -# !sudo apt-get install python3.11 python3.11-venv -# import sys -# !python3.11 -m venv cehrbert -# import os -# os.environ['PATH'] = './content/cehrbert/bin:' + os.environ['PATH'] -# !pip install --upgrade pip - -# %% -# import sys -# sys.executable = './content/cehrbert/bin/python' - -# %% [markdown] -# ## Install cehrbert and its dependencies - -# %% -!pip install meds_reader==0.1.9 -!pip install setuptools -!pip install cehrbert==1.3.1 - -# %% -MIMICIV_MEDS_DIR = "./content/meds/" -MIMICIV_MEDS_READER_DIR = "./content/meds_reader/" -TASK_DIR="./content/tasks/" -TASK_NAME="in_hospital_3d_los_after_48h" -OUTPUT_PRETRAIN_MODEL_DIR="./content/output/cehrbert/" -# TODO this variable has an identical name? -OUTPUT_PRETRAIN_MODEL_DIR="./content/output/cehrbert_finetuned/" - -# %% [markdown] -# Run meds_reader on the MEDS data - -# %% -!meds_reader_convert $MIMICIV_MEDS_DIR $MIMICIV_MEDS_READER_DIR - -# %% -!mkdir -p ./content/output/cehrbert/ -!mkdir -p ./content/output/cehrbert_dataset_prepared/ -!mkdir -p ./content/output/cehrbert_finetuned/ - -# %% -# !mkdir ./content/github_repo;cd ./content/github_repo;git clone https://github.com/cumc-dbmi/cehrbert.git;cd cehrbert;git checkout fix/meds_evaluation;pip install .; - -# %% [markdown] -# Create the cehrbert pretraining configuration yaml file - -# %% -cehrbert_pretrain_config = """ -#Model arguments -model_name_or_path: "./content/output/cehrbert/" -tokenizer_name_or_path: "./content/output/cehrbert/" -num_hidden_layers: 6 -max_position_embeddings: 1024 -hidden_size: 768 -vocab_size: 100000 -min_frequency: 50 -include_value_prediction: false # additional CEHR-BERT learning objective - -#Data arguments -data_folder: "./content/meds_reader/" -dataset_prepared_path: "./content/output/cehrbert_dataset_prepared/" - -# Below is a list of Med-to-CehrBert related arguments -preprocessing_num_workers: 2 -preprocessing_batch_size: 128 -# if is_data_in_med is false, it assumes the data is in the cehrbert format -is_data_in_meds: true -att_function_type: "cehr_bert" -inpatient_att_function_type: "mix" -include_auxiliary_token: true -include_demographic_prompt: false -# if the data is in the meds format, the validation split will be omitted -# as the meds already provide train/tuning/held_out splits -validation_split_percentage: 0.05 - -# Huggingface Arguments -dataloader_num_workers: 2 -dataloader_prefetch_factor: 2 - -overwrite_output_dir: false -resume_from_checkpoint: # automatically infer the latest checkpoint from the output folder -seed: 42 - -output_dir: "./content/output/cehrbert/" -evaluation_strategy: "epoch" -save_strategy: "epoch" -eval_accumulation_steps: 10 - -learning_rate: 0.00005 -per_device_train_batch_size: 8 -per_device_eval_batch_size: 8 -gradient_accumulation_steps: 2 - -num_train_epochs: 50 # for large datasets, 5-10 epochs should suffice -warmup_steps: 10 -weight_decay: 0.01 -logging_dir: "./logs" -logging_steps: 10 - -save_total_limit: -load_best_model_at_end: true -metric_for_best_model: "eval_loss" -greater_is_better: false - -report_to: "none" -""" -PRETRAIN_CONFIG_FP = f"./content/output/cehrbert/cehrbert_pretrain_config.yaml" -with open(PRETRAIN_CONFIG_FP, 'w') as f: - f.write(cehrbert_pretrain_config) - -# %% [markdown] -# ## Pretrain cehrbert using MLM -!python3.11 -m cehrbert.runners.hf_cehrbert_pretrain_runner ./content/output/cehrbert/cehrbert_pretrain_config.yaml - -# %% [markdown] -# ## Create the cehrbert finetuning configuration yaml file -cehrbert_finetune_config = f""" -#Model arguments -model_name_or_path: "./content/output/cehrbert/" -tokenizer_name_or_path: "./content/output/cehrbert/" -num_hidden_layers: 6 -max_position_embeddings: 1024 -hidden_size: 768 -vocab_size: 100000 -min_frequency: 50 -include_value_prediction: false # additional CEHR-BERT learning objective - -#Data arguments -cohort_folder: "./content/tasks/{TASK_NAME}/" -data_folder: "./content/meds_reader/" -dataset_prepared_path: "./content/output/cehrbert_dataset_prepared/" - -#LORA -use_lora: True -lora_rank: 64 -lora_alpha: 16 -target_modules: [ "query", "value" ] -lora_dropout: 0.1 - -# Below is a list of Med-to-CehrBert related arguments -preprocessing_num_workers: 2 -preprocessing_batch_size: 128 -# if is_data_in_med is false, it assumes the data is in the cehrbert format -is_data_in_meds: true -att_function_type: "cehr_bert" -inpatient_att_function_type: "mix" -include_auxiliary_token: true -include_demographic_prompt: false -# if the data is in the meds format, the validation split will be omitted -# as the meds already provide train/tuning/held_out splits -validation_split_percentage: 0.05 - -# Huggingface Arguments -dataloader_num_workers: 2 -dataloader_prefetch_factor: 2 - -overwrite_output_dir: false -resume_from_checkpoint: # automatically infer the latest checkpoint from the output folder -seed: 42 - -output_dir: "./content/output/cehrbert_finetuned" -evaluation_strategy: "epoch" -save_strategy: "epoch" -eval_accumulation_steps: 10 - -do_train: True -do_predict: True - -learning_rate: 0.00005 -per_device_train_batch_size: 8 -per_device_eval_batch_size: 8 -gradient_accumulation_steps: 2 - -num_train_epochs: 10 -warmup_steps: 10 -weight_decay: 0.01 -logging_dir: "./logs" -logging_steps: 10 - -save_total_limit: -load_best_model_at_end: true -metric_for_best_model: "eval_loss" -greater_is_better: false - -report_to: "none" -""" -FINETUNE_CONFIG_FP = f"./content/output/cehrbert/cehrbert_finetune_config.yaml" -with open(FINETUNE_CONFIG_FP, 'w') as f: - f.write(cehrbert_finetune_config) - -# %% -# ## Finetune cehrbert for the downstream task -!python3.11 -m cehrbert.runners.hf_cehrbert_finetune_runner ./content/output/cehrbert/cehrbert_finetune_config.yaml - -# %% -import pandas as pd - -pd.read_parquet("./content/output/cehrbert_finetuned/test_predictions") - -# %% -!cat ./content/output/cehrbert_finetuned/test_results.json diff --git a/src/MEDS_DEV/demo/meds_tab.ipynb b/src/MEDS_DEV/demo/meds_tab.ipynb deleted file mode 100644 index 86e5791..0000000 --- a/src/MEDS_DEV/demo/meds_tab.ipynb +++ /dev/null @@ -1,485 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xLJYigczPaTG" - }, - "outputs": [], - "source": [ - "#@title Swtich Colab to python 3.12\n", - "%%capture\n", - "!sudo apt-get install python3.12 python3.12-venv\n", - "import sys\n", - "!python3.12 -m venv meds_env\n", - "import os\n", - "os.environ['PATH'] = '/content/meds_env/bin:' + os.environ['PATH']\n", - "!pip install --upgrade pip\n", - "\n", - "# Then in a new code cell:\n", - "import sys\n", - "sys.executable = '/content/meds_env/bin/python'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "w8Zrf0NePwfs", - "outputId": "6aa313bc-5141-453c-88f5-8c1d22956f3d" - }, - "outputs": [], - "source": [ - "# confirm python version is 3.12\n", - "!python --version" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ikPVQZOnPcI0" - }, - "outputs": [], - "source": [ - "#@title Install dependencies\n", - "!pip -q install meds_etl==0.3.6 meds_transforms==0.0.7" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "id": "rjqK4CuRPfnE" - }, - "outputs": [], - "source": [ - "#@title Download MIMIC-IV demo\n", - "\n", - "!wget -q -r -N -c --no-host-directories --cut-dirs=1 -np -P download https://physionet.org/files/mimic-iv-demo/2.2/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "qHOBI1_5StBb", - "outputId": "eb0ef7ec-54c8-4cac-b1ff-d176c986a447" - }, - "outputs": [], - "source": [ - "# Download pre-meds script, event config (defining how raw data is converted to meds data), and meds-transform config\n", - "!mkdir /content/meds-transform/\n", - "!git clone --depth 1 https://github.com/mmcdermott/MEDS_transforms.git /content/tmp/\n", - "!mv /content/tmp/MIMIC-IV_Example /content/MIMIC-IV_Example" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Sr2QdvNxpd0p", - "outputId": "7877300f-afc5-4583-95f2-e4f7089356b6" - }, - "outputs": [], - "source": [ - "# download MIMIC IV metadata\n", - "MIMICIV_RAW_DIR = \"https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map\"\n", - "MIMICIV_PRE_MEDS_DIR = \"/content/pre_meds/\"\n", - "!mkdir {MIMICIV_PRE_MEDS_DIR}\n", - "\n", - "OUTPUT_DIR = \"/content/download/mimic-iv-demo/2.2\"\n", - "\n", - "files = [\n", - " 'd_labitems_to_loinc.csv',\n", - " 'inputevents_to_rxnorm.csv',\n", - " 'lab_itemid_to_loinc.csv',\n", - " 'meas_chartevents_main.csv',\n", - " 'meas_chartevents_value.csv',\n", - " 'numerics-summary.csv',\n", - " 'outputevents_to_loinc.csv',\n", - " 'proc_datetimeevents.csv',\n", - " 'proc_itemid.csv',\n", - " 'waveforms-summary.csv'\n", - "]\n", - "\n", - "for file in files:\n", - " !wget -O {OUTPUT_DIR}/{file} {MIMICIV_RAW_DIR}/{file}\n", - " !wget -O {MIMICIV_PRE_MEDS_DIR}/{file} {MIMICIV_RAW_DIR}/{file}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "pQSLxYJhRPxm", - "outputId": "41ab56f5-512c-4489-adfc-614644c6c632" - }, - "outputs": [], - "source": [ - "# Convert to MEDS\n", - "TUTORIAL_DIR = \"/content/MIMIC-IV_Example\"\n", - "MIMICIV_RAW_DIR = \"/content/download/mimic-iv-demo/2.2\"\n", - "MIMICIV_PRE_MEDS_DIR = \"/content/pre_meds/\"\n", - "MIMICIV_MEDS_DIR = \"/content/meds/\"\n", - "\n", - "EVENT_CONVERSION_CONFIG_FP=\"/content/MIMIC-IV_Example/configs/event_config.yaml\"\n", - "PIPELINE_CONFIG_PATH=\"/content/MIMIC-IV_Example/configs/pipeline_config.yaml\"\n", - "!cd {TUTORIAL_DIR} && /content/MIMIC-IV_Example/run.sh {MIMICIV_RAW_DIR} {MIMICIV_PRE_MEDS_DIR} {MIMICIV_MEDS_DIR} do_unzip=true" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 443 - }, - "id": "u2f6socuWhjd", - "outputId": "289bc4ae-e374-4ed1-fd98-58c803f14e26" - }, - "outputs": [], - "source": [ - "#@title Examine MEDS data\n", - "\n", - "import polars as pl\n", - "data = pl.read_parquet('/content/meds/data/**/*.parquet')\n", - "\n", - "data[['subject_id', 'time', 'code', 'numeric_value']]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 479 - }, - "id": "CZD9xpSxd1Wp", - "outputId": "ea758e42-b71d-464f-f931-df7eec7a4415" - }, - "outputs": [], - "source": [ - "#@title A Simple Polars Analysis\n", - "\n", - "icd10_events = data.filter(pl.col('code').str.starts_with('DIAGNOSIS//ICD//10//'))\n", - "\n", - "icd10_events.group_by('code').count().sort('count', descending=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 443 - }, - "id": "u7EXKCZelho-", - "outputId": "27e81b86-1195-4c6c-f7c7-993665b826d7" - }, - "outputs": [], - "source": [ - "df = pl.read_parquet(\"/content/meds/metadata/codes.parquet\")\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PZmYRVX2W8m7" - }, - "source": [ - "# Using an example MEDS tool, ACES for labeling" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "collapsed": true, - "id": "z3_pG9YAWpKy", - "outputId": "efa4c286-413d-4a91-a53d-fb41769cd4f2" - }, - "outputs": [], - "source": [ - "#@title Install ACES\n", - "\n", - "\n", - "!pip install es-aces" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "H6fqe217XDhi" - }, - "outputs": [], - "source": [ - "# From the ACES documentation\n", - "\n", - "task_config = \"\"\"\n", - "description: >-\n", - " This file specifies the base configuration for the prediction of a hospital los being greater than 3days,\n", - " leveraging only the first 48 hours of data after admission, with a 24 hour gap between the input window\n", - " and the target window. Patients who die or are discharged in the gap window are excluded. Note that this\n", - " task is in-**hospital** los, not in-**ICU** los which is a different task.\n", - "\n", - "predicates:\n", - " hospital_admission:\n", - " code: {regex: \"HOSPITAL_ADMISSION//.*\"}\n", - " hospital_discharge:\n", - " code: {regex: \"HOSPITAL_DISCHARGE//.*\"}\n", - " death:\n", - " code: MEDS_DEATH\n", - " discharge_or_death:\n", - " expr: or(hospital_discharge, death)\n", - "\n", - "trigger: hospital_admission\n", - "\n", - "windows:\n", - " input:\n", - " start: NULL\n", - " end: trigger + 48h\n", - " start_inclusive: True\n", - " end_inclusive: True\n", - " index_timestamp: end\n", - " gap:\n", - " start: input.end\n", - " end: start + 24h\n", - " start_inclusive: False\n", - " end_inclusive: True\n", - " has:\n", - " hospital_admission: (None, 0)\n", - " discharge_or_death: (None, 0)\n", - " target:\n", - " start: trigger\n", - " end: start + 3d\n", - " start_inclusive: False\n", - " end_inclusive: True\n", - " label: discharge_or_death\n", - "\"\"\"\n", - "!mkdir /content/tasks/ -p\n", - "TASK_NAME = \"in_hospital_3d_los_after_48h\"\n", - "TASK_CONFIG_FP = f\"/content/tasks/{TASK_NAME}.yaml\"\n", - "with open(TASK_CONFIG_FP, 'w') as f:\n", - " f.write(task_config)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "es-39eHOyp5a", - "outputId": "9d5e2468-fdd5-4c4b-8615-fe24f5a9310f" - }, - "outputs": [], - "source": [ - "!pip install es-aces" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "bXLiJGEry-Gb", - "outputId": "7d954ab4-cf5c-4d02-a99c-669b5822bf44" - }, - "outputs": [], - "source": [ - "!aces-cli --multirun data=sharded data.standard=meds data.root=\"$MIMICIV_MEDS_DIR/data\" \"data.shard=$(expand_shards /content/meds/data/)\" cohort_dir=\" /content/tasks\" cohort_name=\"$TASK_NAME\" config_path=\"$TASK_CONFIG_FP\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 497 - }, - "id": "7Vvac7DIWyRT", - "outputId": "40493f0e-48ba-4f5e-9d9a-401e26f1a9b7" - }, - "outputs": [], - "source": [ - "import polars as pl\n", - "\n", - "# execute query and get results\n", - "df = pl.read_parquet(f\"/content/tasks/{TASK_NAME}/**/*.parquet\")\n", - "\n", - "print(\"train prevalence: \" + str(round(pl.read_parquet(f\"/content/tasks/{TASK_NAME}/train/*.parquet\")['boolean_value'].mean(), 3)))\n", - "print(\"tuning prevalence: \" + str(round(pl.read_parquet(f\"/content/tasks/{TASK_NAME}/tuning/*.parquet\")['boolean_value'].mean(), 3)))\n", - "print(\"held_out prevalence: \" + str(round(pl.read_parquet(f\"/content/tasks/{TASK_NAME}/held_out/*.parquet\")['boolean_value'].mean(), 3)))\n", - "\n", - "\n", - "df.sort('boolean_value')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "XWB7O1UGhRIo", - "outputId": "e3416d5e-7427-4cf4-c0ab-20053a9d3430" - }, - "outputs": [], - "source": [ - "#@title Install meds-tab\n", - "\n", - "!pip uninstall es-aces -y\n", - "!pip install meds-tab" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SeGawIqli0nn" - }, - "outputs": [], - "source": [ - "MIMICIV_MEDS_DIR = \"/content/meds/\"\n", - "OUTPUT_TABULARIZATION_DIR=\"/content/tabularized/\"\n", - "TASK_DIR=\"/content/tasks/\"\n", - "TASK_NAME=\"in_hospital_3d_los_after_48h\"\n", - "OUTPUT_MODEL_DIR=\"/content/output/meds_tab/\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Tud0_0cgjljP", - "outputId": "fb3417e0-3ba4-4f9a-ab95-ce3ba8731ca1" - }, - "outputs": [], - "source": [ - "!meds-tab-describe input_dir={MIMICIV_MEDS_DIR}/data output_dir={OUTPUT_TABULARIZATION_DIR}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "RVLBdOn1mnV5" - }, - "outputs": [], - "source": [ - "# Define the window sizes and aggregations to generate features for\n", - "WINDOW_SIZES = \"tabularization.window_sizes=[1d,30d,365d]\"\n", - "AGGREGATIONS = \"tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "KhCPqBmduNYK" - }, - "outputs": [], - "source": [ - "!rm -rf /content/tabularized/tabularize/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "p_D07KzxjVUl", - "outputId": "8836b076-cf64-4f29-da81-ac5125ab7608" - }, - "outputs": [], - "source": [ - "!meds-tab-tabularize-static \"input_dir=$MIMICIV_MEDS_DIR/data\" \"output_dir=$OUTPUT_TABULARIZATION_DIR\" do_overwrite=False $WINDOW_SIZES $AGGREGATIONS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "u-e-mV2Hk-Qf", - "outputId": "c292be12-ff74-44e4-f039-758e10ccc909" - }, - "outputs": [], - "source": [ - "!meds-tab-tabularize-time-series --multirun \"worker=range(0,2)\" \"hydra/launcher=joblib\" \"input_dir=$MIMICIV_MEDS_DIR/data\" \"output_dir=$OUTPUT_TABULARIZATION_DIR\" do_overwrite=False $WINDOW_SIZES $AGGREGATIONS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NmaR_-Fik4eH" - }, - "outputs": [], - "source": [ - "!meds-tab-cache-task \"input_dir={MIMICIV_MEDS_DIR}/data\" \"output_dir=$OUTPUT_TABULARIZATION_DIR\" \"input_label_dir=$TASK_DIR/$TASK_NAME/\" \"task_name=$TASK_NAME\" do_overwrite=False $WINDOW_SIZES $AGGREGATIONS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dLIkOzTblBB2" - }, - "outputs": [], - "source": [ - "!meds-tab-xgboost --multirun \"input_dir=$MIMICIV_MEDS_DIR/data\" \"output_dir=$OUTPUT_TABULARIZATION_DIR\" \"output_model_dir=$OUTPUT_MODEL_DIR/$TASK_NAME/\" \"task_name=$TASK_NAME\" do_overwrite=False \"hydra.sweeper.n_trials=10\" $WINDOW_SIZES $AGGREGATIONS \"tabularization.min_code_inclusion_count=10\"" - ] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/src/MEDS_DEV/demo/meds_tab.py b/src/MEDS_DEV/demo/meds_tab.py deleted file mode 100644 index 4fae3ae..0000000 --- a/src/MEDS_DEV/demo/meds_tab.py +++ /dev/null @@ -1,240 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.16.4 -# kernelspec: -# display_name: Python 3 -# name: python3 -# --- - -# %% [Colab-only] Switch Colab to python 3.12 -# !sudo apt-get install python3.12 python3.12-venv -# import sys -# !python3.12 -m venv meds_env -# import os -# os.environ['PATH'] = '/content/meds_env/bin:' + os.environ['PATH'] -# !pip install --upgrade pip - -# # Then in a new code cell: -# import sys -# sys.executable = '/content/meds_env/bin/python' - -# # Confirm python version is 3.12 -# !python --version - -# %% -!pwd # Should be .../src/MEDS_DEV/demo - -# %% [markdown] -# ## Install dependencies - -# %% -!pip -q install meds_etl==0.3.6 meds_transforms==0.0.7 - -# TODO install meds-evaluation - -# %% [markdown] -# # Download MIMIC-IV demo - -# %% -# macOS users should install wget (e.g. through brew) -!wget -q -r -N -c --no-host-directories --cut-dirs=1 -np -P ./content/download https://physionet.org/files/mimic-iv-demo/2.2/ - -# %% -# Download pre-meds script, event config (defining how raw data is converted to meds data), and meds-transform config -!mkdir -p ./content/meds-transform/ -!git clone --depth 1 https://github.com/mmcdermott/MEDS_transforms.git ./content/tmp/ -!mv ./content/tmp/MIMIC-IV_Example ./content/MIMIC-IV_Example - -# %% -# Download MIMIC-IV metadata -MIMICIV_RAW_DIR = "https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map" -MIMICIV_PRE_MEDS_DIR = "./content/pre_meds/" -!mkdir {MIMICIV_PRE_MEDS_DIR} - -OUTPUT_DIR = "./content/download/mimic-iv-demo/2.2" - -files = [ - 'd_labitems_to_loinc.csv', - 'inputevents_to_rxnorm.csv', - 'lab_itemid_to_loinc.csv', - 'meas_chartevents_main.csv', - 'meas_chartevents_value.csv', - 'numerics-summary.csv', - 'outputevents_to_loinc.csv', - 'proc_datetimeevents.csv', - 'proc_itemid.csv', - 'waveforms-summary.csv' -] - -for file in files: - !wget -O {OUTPUT_DIR}/{file} {MIMICIV_RAW_DIR}/{file} - !wget -O {MIMICIV_PRE_MEDS_DIR}/{file} {MIMICIV_RAW_DIR}/{file} - -# %% -# Convert to MEDS -CURRENT_DIR = !pwd -CURRENT_DIR = CURRENT_DIR[0] -# %% -TUTORIAL_DIR = CURRENT_DIR + "/content/MIMIC-IV_Example" -MIMICIV_RAW_DIR = CURRENT_DIR + "/content/download/mimic-iv-demo/2.2" -MIMICIV_PRE_MEDS_DIR = CURRENT_DIR + "/content/pre_meds" -MIMICIV_MEDS_DIR = CURRENT_DIR + "/content/meds" - -EVENT_CONVERSION_CONFIG_FP = CURRENT_DIR + "/content/MIMIC-IV_Example/configs/event_config.yaml" -PIPELINE_CONFIG_PATH = CURRENT_DIR + "/content/MIMIC-IV_Example/configs/pipeline_config.yaml" -!cd {TUTORIAL_DIR} && ./run.sh {MIMICIV_RAW_DIR} {MIMICIV_PRE_MEDS_DIR} {MIMICIV_MEDS_DIR} do_unzip=true - -# %% [markdown] -# # Examine MEDS data - -# %% -import polars as pl - -data = pl.read_parquet('./content/meds/data/**/*.parquet') - -data[['subject_id', 'time', 'code', 'numeric_value']] - -# %% [markdown] -# # A simple Polars analysis - -# %% -icd10_events = data.filter(pl.col('code').str.starts_with('DIAGNOSIS//ICD//10//')) -icd10_events.group_by('code').count().sort('count', descending=True) - -# %% -df = pl.read_parquet("./content/meds/metadata/codes.parquet") -df - -# %% [markdown] -# ## Using an example MEDS tool, ACES for labeling - -# %% [markdown] -# ## Install ACES - -# %% -!pip install es-aces - -# %% - -# From ACES documentation -task_config = """ -description: >- - This file specifies the base configuration for the prediction of a hospital los being greater than 3days, - leveraging only the first 48 hours of data after admission, with a 24 hour gap between the input window - and the target window. Patients who die or are discharged in the gap window are excluded. Note that this - task is in-**hospital** los, not in-**ICU** los which is a different task. - -predicates: - hospital_admission: - code: {regex: "HOSPITAL_ADMISSION//.*"} - hospital_discharge: - code: {regex: "HOSPITAL_DISCHARGE//.*"} - death: - code: MEDS_DEATH - discharge_or_death: - expr: or(hospital_discharge, death) - -trigger: hospital_admission - -windows: - input: - start: NULL - end: trigger + 48h - start_inclusive: True - end_inclusive: True - index_timestamp: end - gap: - start: input.end - end: start + 24h - start_inclusive: False - end_inclusive: True - has: - hospital_admission: (None, 0) - discharge_or_death: (None, 0) - target: - start: trigger - end: start + 3d - start_inclusive: False - end_inclusive: True - label: discharge_or_death -""" - -!mkdir ./content/tasks/ -p -TASK_NAME = "in_hospital_3d_los_after_48h" -TASK_CONFIG_FP = f"./content/tasks/{TASK_NAME}.yaml" -with open(TASK_CONFIG_FP, 'w') as f: - f.write(task_config) - -# %% -!pip install es-aces - -# %% -!aces-cli --multirun data=sharded data.standard=meds data.root="$MIMICIV_MEDS_DIR/data" "data.shard=$(expand_shards ./content/meds/data/)" cohort_dir=" ./content/tasks" cohort_name="$TASK_NAME" config_path="$TASK_CONFIG_FP" - -# %% -# TODO: reimporting polars due to dependencies? -import polars as pl - -# Execute query and get results -df = pl.read_parquet(f"./content/tasks/{TASK_NAME}/**/*.parquet") - -print("train prevalence: " + str(round(pl.read_parquet(f"./content/tasks/{TASK_NAME}/train/*.parquet")['boolean_value'].mean(), 3))) -print("tuning prevalence: " + str(round(pl.read_parquet(f"./content/tasks/{TASK_NAME}/tuning/*.parquet")['boolean_value'].mean(), 3))) -print("held_out prevalence: " + str(round(pl.read_parquet(f"./content/tasks/{TASK_NAME}/held_out/*.parquet")['boolean_value'].mean(), 3))) - - -df.sort('boolean_value') - -# %% -# ## Install meds-tab - -!pip uninstall es-aces -y # TODO ??? -!pip install meds-tab - -# %% -MIMICIV_MEDS_DIR = "./content/meds/" -OUTPUT_TABULARIZATION_DIR="./content/tabularized/" -TASK_DIR="./content/tasks/" -TASK_NAME="in_hospital_3d_los_after_48h" -OUTPUT_MODEL_DIR="./content/output/meds_tab/" - -# %% -!meds-tab-describe input_dir={MIMICIV_MEDS_DIR}/data output_dir={OUTPUT_TABULARIZATION_DIR} - -# %% -# Define the window sizes and aggregations to generate features for -# TODO define this as system variables or make sure the shell -# commands can find these -WINDOW_SIZES = "tabularization.window_sizes=[1d,30d,365d]" -AGGREGATIONS = "tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" - -# %% -!rm -rf ./content/tabularized/tabularize/ - -# %% -# TODO shell vs python variables -!echo {OUTPUT_TABULARIZATION_DIR} - -# %% -# TODO shell vs python variables -!echo WINDOW_SIZES -# %% -# TODO shell vs python variables -!meds-tab-tabularize-static "input_dir=$MIMICIV_MEDS_DIR/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" do_overwrite=False $WINDOW_SIZES $AGGREGATIONS - -# %% -# TODO shell vs python variables -!meds-tab-tabularize-time-series --multirun "worker=range(0,2)" "hydra/launcher=joblib" "input_dir=$MIMICIV_MEDS_DIR/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" do_overwrite=False $WINDOW_SIZES $AGGREGATIONS - -# %% -# TODO shell vs python variables -!meds-tab-cache-task "input_dir={MIMICIV_MEDS_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" "input_label_dir=$TASK_DIR/$TASK_NAME/" "task_name=$TASK_NAME" do_overwrite=False $WINDOW_SIZES $AGGREGATIONS - -# %% -# TODO shell vs python variables -!meds-tab-xgboost --multirun "input_dir=$MIMICIV_MEDS_DIR/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" "output_model_dir=$OUTPUT_MODEL_DIR/$TASK_NAME/" "task_name=$TASK_NAME" do_overwrite=False "hydra.sweeper.n_trials=10" $WINDOW_SIZES $AGGREGATIONS "tabularization.min_code_inclusion_count=10"