From c41a9b895d6d480376bd4493d442c71c17008151 Mon Sep 17 00:00:00 2001 From: Swarad Gat <43824694+swaradgat19@users.noreply.github.com> Date: Tue, 19 Sep 2023 23:10:53 -0400 Subject: [PATCH] Fix/geojson command (#191) * generated geojson outputs by default in wsinfer run command * generating geojson files on run command, parallelized geojson conversion using tqdm * changed tests according to updated command * modified open() with separate filename variable * changed make_geojson parameter csv from a string to a Path variable * convert only new csv files to geojson instead of all * handled condition if output directory doesn't exist * changed function name to write_geojsons, fixed minor issues * raise CannotReadSpacing if page0 is none, fixed style issue * order issue fixed * added a try catch to handle pytorch-nightly error * styled using isort and black * removed redundant libraries * removed redundant lines in wsi.py * changed folder name in ci tests for windows, ubuntu and docker * shortened length of FileExistsError message --------- Co-authored-by: Jakub Kaczmarzyk --- .github/workflows/ci.yml | 10 +-- tests/test_all.py | 16 ++-- wsinfer/cli/cli.py | 2 - wsinfer/cli/infer.py | 4 + wsinfer/modellib/run_inference.py | 2 +- ...ert_csv_to_geojson.py => write_geojson.py} | 75 ++++++++----------- 6 files changed, 49 insertions(+), 60 deletions(-) rename wsinfer/{cli/convert_csv_to_geojson.py => write_geojson.py} (58%) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d0e0c87..3d2601f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -68,8 +68,8 @@ jobs: --wsi-dir slides/ --results-dir results/ --model breast-tumor-resnet34.tcga-brca test -f results/run_metadata_*.json test -f results/patches/JP2K-33003-1.h5 - test -f results/model-outputs/JP2K-33003-1.csv - test $(wc -l < results/model-outputs/JP2K-33003-1.csv) -eq 675 + test -f results/model-outputs-csv/JP2K-33003-1.csv + test $(wc -l < 
results/model-outputs-csv/JP2K-33003-1.csv) -eq 675 # This is run on multiple operating systems. test-package: @@ -104,8 +104,8 @@ jobs: wsinfer run --wsi-dir slides/ --results-dir results/ --model breast-tumor-resnet34.tcga-brca test -f results/run_metadata_*.json test -f results/patches/JP2K-33003-1.h5 - test -f results/model-outputs/JP2K-33003-1.csv - test $(wc -l < results/model-outputs/JP2K-33003-1.csv) -eq 675 + test -f results/model-outputs-csv/JP2K-33003-1.csv + test $(wc -l < results/model-outputs-csv/JP2K-33003-1.csv) -eq 675 # FIXME: tissue segmentation has different outputs on Windows. The patch sizes # are the same but the coordinates found are different. - name: Run 'wsinfer run' on Windows @@ -118,7 +118,7 @@ jobs: wsinfer run --wsi-dir slides/ --results-dir results/ --model breast-tumor-resnet34.tcga-brca Test-Path -Path results/run_metadata_*.json -PathType Leaf Test-Path -Path results/patches/JP2K-33003-1.h5 -PathType Leaf - Test-Path -Path results/model-outputs/JP2K-33003-1.csv -PathType Leaf + Test-Path -Path results/model-outputs-csv/JP2K-33003-1.csv -PathType Leaf # test $(python -c "print(sum(1 for _ in open('results/model-outputs/JP2K-33003-1.csv')))") -eq 675 style-and-types: diff --git a/tests/test_all.py b/tests/test_all.py index 39d7da0..551b9b2 100644 --- a/tests/test_all.py +++ b/tests/test_all.py @@ -107,8 +107,8 @@ def test_cli_run_with_registered_models( ], ) assert result.exit_code == 0 - assert (results_dir / "model-outputs").exists() - df = pd.read_csv(results_dir / "model-outputs" / "purple.csv") + assert (results_dir / "model-outputs-csv").exists() + df = pd.read_csv(results_dir / "model-outputs-csv" / "purple.csv") df_ref = pd.read_csv(reference_csv) assert set(df.columns) == set(df_ref.columns) @@ -141,8 +141,8 @@ def test_cli_run_with_registered_models( del metadata_path, meta # Test conversion to geojson. 
- geojson_dir = results_dir / "geojson" - result = runner.invoke(cli, ["togeojson", str(results_dir), str(geojson_dir)]) + geojson_dir = results_dir / "model-outputs-geojson" + # result = runner.invoke(cli, ["togeojson", str(results_dir), str(geojson_dir)]) assert result.exit_code == 0 with open(geojson_dir / "purple.json") as f: d: geojsonlib.GeoJSON = geojsonlib.load(f) @@ -228,8 +228,8 @@ def test_cli_run_with_local_model(tmp_path: Path, tiff_image: Path): ], ) assert result.exit_code == 0 - assert (results_dir / "model-outputs").exists() - df = pd.read_csv(results_dir / "model-outputs" / "purple.csv") + assert (results_dir / "model-outputs-csv").exists() + df = pd.read_csv(results_dir / "model-outputs-csv" / "purple.csv") df_ref = pd.read_csv(reference_csv) assert set(df.columns) == set(df_ref.columns) @@ -457,8 +457,8 @@ def test_issue_94(tmp_path: Path, tiff_image: Path): # Important part is that we run through all of the files, despite the unreadble # file. assert result.exit_code == 0 - assert results_dir.joinpath("model-outputs").joinpath("purple.csv").exists() - assert not results_dir.joinpath("model-outputs").joinpath("bad.csv").exists() + assert results_dir.joinpath("model-outputs-csv").joinpath("purple.csv").exists() + assert not results_dir.joinpath("model-outputs-csv").joinpath("bad.csv").exists() def test_issue_97(tmp_path: Path, tiff_image: Path): diff --git a/wsinfer/cli/cli.py b/wsinfer/cli/cli.py index c8c60cf..5fcda16 100644 --- a/wsinfer/cli/cli.py +++ b/wsinfer/cli/cli.py @@ -6,7 +6,6 @@ import click from ..wsi import set_backend -from .convert_csv_to_geojson import togeojson from .convert_csv_to_sbubmi import tosbu from .infer import run from .patch import patch @@ -45,6 +44,5 @@ def cli( cli.add_command(run) -cli.add_command(togeojson) cli.add_command(tosbu) cli.add_command(patch) diff --git a/wsinfer/cli/infer.py b/wsinfer/cli/infer.py index 8e17422..1c78f03 100644 --- a/wsinfer/cli/infer.py +++ b/wsinfer/cli/infer.py @@ -23,6 +23,7 @@ 
from ..modellib import models from ..modellib.run_inference import run_inference from ..patchlib import segment_and_patch_directory_of_slides +from ..write_geojson import write_geojsons def _num_cpus() -> int: @@ -374,3 +375,6 @@ def run( json.dump(run_metadata, f, indent=2) click.secho("Finished.", fg="green") + + csvs = list((results_dir / "model-outputs-csv").glob("*.csv")) + write_geojsons(csvs, results_dir, num_workers) diff --git a/wsinfer/modellib/run_inference.py b/wsinfer/modellib/run_inference.py index acdda2a..b27805c 100644 --- a/wsinfer/modellib/run_inference.py +++ b/wsinfer/modellib/run_inference.py @@ -86,7 +86,7 @@ def run_inference( # create patch paths if the whole slide image patch exists. patch_paths = [patch_dir / p.with_suffix(".h5").name for p in wsi_paths] - model_output_dir = results_dir / "model-outputs" + model_output_dir = results_dir / "model-outputs-csv" model_output_dir.mkdir(exist_ok=True) model = get_pretrained_torch_module(model=model_info) diff --git a/wsinfer/cli/convert_csv_to_geojson.py b/wsinfer/write_geojson.py similarity index 58% rename from wsinfer/cli/convert_csv_to_geojson.py rename to wsinfer/write_geojson.py index df821d6..e281f08 100644 --- a/wsinfer/cli/convert_csv_to_geojson.py +++ b/wsinfer/write_geojson.py @@ -7,11 +7,11 @@ import json import uuid +from functools import partial from pathlib import Path -import click import pandas as pd -import tqdm +from tqdm.contrib.concurrent import process_map def _box_to_polygon( @@ -64,62 +64,49 @@ def _dataframe_to_geojson(df: pd.DataFrame, prob_cols: list[str]) -> dict: } -def convert(input: str | Path, output: str | Path) -> None: - df = pd.read_csv(input) +def make_geojson(csv: Path, results_dir: Path) -> None: + filename = csv.stem + df = pd.read_csv(csv) prob_cols = [col for col in df.columns.tolist() if col.startswith("prob_")] if not prob_cols: - raise click.ClickException("Did not find any columns with prob_ prefix.") + raise KeyError("Did not find any columns with 
prob_ prefix.") geojson = _dataframe_to_geojson(df, prob_cols) - with open(output, "w") as f: + with open(results_dir / "model-outputs-geojson" / f"{filename}.json", "w") as f: json.dump(geojson, f) -@click.command() -@click.argument( - "results_dir", - type=click.Path( - exists=True, file_okay=False, dir_okay=True, path_type=Path, resolve_path=True - ), -) -@click.argument( - "output", - type=click.Path(exists=False, path_type=Path, resolve_path=True), -) -def togeojson(*, results_dir: Path, output: Path) -> None: - """Convert model outputs to GeoJSON format. +def write_geojsons(csvs: list[Path], results_dir: Path, num_workers: int) -> None: + output = results_dir / "model-outputs-geojson" - GeoJSON files can be used with pathology viewers like QuPath. - - RESULTS_DIR Path to results directory (containing model-outputs dir). - - OUTPUT Path to output directory in which to save GeoJSON files. - """ if not results_dir.exists(): - raise click.ClickException(f"results_dir does not exist: {results_dir}") - if output.exists(): - raise click.ClickException("Output directory already exists.") + raise FileExistsError(f"results_dir does not exist: {results_dir}") if ( - not (results_dir / "model-outputs").exists() + not (results_dir / "model-outputs-csv").exists() and (results_dir / "patches").exists() ): - raise click.ClickException( + raise FileExistsError( "Model outputs have not been generated yet. Please run model inference." ) - if not (results_dir / "model-outputs").exists(): - raise click.ClickException( - "Expected results_dir to contain a 'model-outputs' directory but it does" - " not. Please provide the path to the directory that contains" - " model-outputs, masks, and patches." + if not (results_dir / "model-outputs-csv").exists(): + raise FileExistsError( + "Expected results_dir to contain a 'model-outputs-csv' " + "directory but it does not." + "Please provide the path to the directory" + "that contains model-outputs, masks, and patches." 
) + if output.exists(): + geojsons = list((results_dir / "model-outputs-geojson").glob("*.json")) - csvs = list((results_dir / "model-outputs").glob("*.csv")) - if not csvs: - raise click.ClickException("No CSVs found. Did you generate model outputs?") - - output.mkdir(exist_ok=False) + # Makes a list of filenames for both geojsons and csvs + geojson_filenames = [filename.stem for filename in geojsons] + csv_filenames = [filename.stem for filename in csvs] - for input_csv in tqdm.tqdm(csvs): - output_path = output / input_csv.with_suffix(".json").name - convert(input=input_csv, output=output_path) + # Makes a list of new csvs that need to be converted to geojson + csvs_new = [csv for csv in csv_filenames if csv not in geojson_filenames] + csvs = [path for path in csvs if path.stem in csvs_new] + else: + # If output directory doesn't exist, make one and set csvs_final to csvs + output.mkdir(parents=True, exist_ok=True) - click.secho(f"Saved outputs to {output}", fg="green") + func = partial(make_geojson, results_dir=results_dir) + process_map(func, csvs, max_workers=num_workers)