From c41a9b895d6d480376bd4493d442c71c17008151 Mon Sep 17 00:00:00 2001
From: Swarad Gat <43824694+swaradgat19@users.noreply.github.com>
Date: Tue, 19 Sep 2023 23:10:53 -0400
Subject: [PATCH] Fix/geojson command (#191)

* generated geojson outputs by default in wsinfer run command

* generating geojson files on run command, parallelized geojson conversion using tqdm

* changed tests according to updated command

* modified open() with separate filename variable

* changed make_geojson parameter csv from a string to a Path variable

* convert only new csv files to geojson instead of all

* handled condition if output directory doesn't exist

* changed function name to write_geojsons, fixed minor issues

* raise CannotReadSpacing if page0 is none, fixed style issue

* order issue fixed

* added a try/except to handle pytorch-nightly error

* styled using isort and black

* removed redundant libraries

* removed redundant lines in wsi.py

* changed folder name in ci tests for windows, ubuntu and docker

* shortened length of FileExistsError message

---------

Co-authored-by: Jakub Kaczmarzyk <jakub.kaczmarzyk@gmail.com>
---
 .github/workflows/ci.yml                      | 10 +--
 tests/test_all.py                             | 16 ++--
 wsinfer/cli/cli.py                            |  2 -
 wsinfer/cli/infer.py                          |  4 +
 wsinfer/modellib/run_inference.py             |  2 +-
 ...ert_csv_to_geojson.py => write_geojson.py} | 75 ++++++++-----------
 6 files changed, 49 insertions(+), 60 deletions(-)
 rename wsinfer/{cli/convert_csv_to_geojson.py => write_geojson.py} (58%)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d0e0c87..3d2601f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -68,8 +68,8 @@ jobs:
             --wsi-dir slides/ --results-dir results/ --model breast-tumor-resnet34.tcga-brca
           test -f results/run_metadata_*.json
           test -f results/patches/JP2K-33003-1.h5
-          test -f results/model-outputs/JP2K-33003-1.csv
-          test $(wc -l < results/model-outputs/JP2K-33003-1.csv) -eq 675
+          test -f results/model-outputs-csv/JP2K-33003-1.csv
+          test $(wc -l < results/model-outputs-csv/JP2K-33003-1.csv) -eq 675
 
   # This is run on multiple operating systems.
   test-package:
@@ -104,8 +104,8 @@ jobs:
           wsinfer run --wsi-dir slides/ --results-dir results/ --model breast-tumor-resnet34.tcga-brca
           test -f results/run_metadata_*.json
           test -f results/patches/JP2K-33003-1.h5
-          test -f results/model-outputs/JP2K-33003-1.csv
-          test $(wc -l < results/model-outputs/JP2K-33003-1.csv) -eq 675
+          test -f results/model-outputs-csv/JP2K-33003-1.csv
+          test $(wc -l < results/model-outputs-csv/JP2K-33003-1.csv) -eq 675
       # FIXME: tissue segmentation has different outputs on Windows. The patch sizes
       # are the same but the coordinates found are different.
       - name: Run 'wsinfer run' on Windows
@@ -118,7 +118,7 @@ jobs:
           wsinfer run --wsi-dir slides/ --results-dir results/ --model breast-tumor-resnet34.tcga-brca
           Test-Path -Path results/run_metadata_*.json -PathType Leaf
           Test-Path -Path results/patches/JP2K-33003-1.h5 -PathType Leaf
-          Test-Path -Path results/model-outputs/JP2K-33003-1.csv -PathType Leaf
+          Test-Path -Path results/model-outputs-csv/JP2K-33003-1.csv -PathType Leaf
           # test $(python -c "print(sum(1 for _ in open('results/model-outputs/JP2K-33003-1.csv')))") -eq 675
 
   style-and-types:
diff --git a/tests/test_all.py b/tests/test_all.py
index 39d7da0..551b9b2 100644
--- a/tests/test_all.py
+++ b/tests/test_all.py
@@ -107,8 +107,8 @@ def test_cli_run_with_registered_models(
         ],
     )
     assert result.exit_code == 0
-    assert (results_dir / "model-outputs").exists()
-    df = pd.read_csv(results_dir / "model-outputs" / "purple.csv")
+    assert (results_dir / "model-outputs-csv").exists()
+    df = pd.read_csv(results_dir / "model-outputs-csv" / "purple.csv")
     df_ref = pd.read_csv(reference_csv)
 
     assert set(df.columns) == set(df_ref.columns)
@@ -141,8 +141,8 @@ def test_cli_run_with_registered_models(
     del metadata_path, meta
 
     # Test conversion to geojson.
-    geojson_dir = results_dir / "geojson"
-    result = runner.invoke(cli, ["togeojson", str(results_dir), str(geojson_dir)])
+    geojson_dir = results_dir / "model-outputs-geojson"
+    # result = runner.invoke(cli, ["togeojson", str(results_dir), str(geojson_dir)])
     assert result.exit_code == 0
     with open(geojson_dir / "purple.json") as f:
         d: geojsonlib.GeoJSON = geojsonlib.load(f)
@@ -228,8 +228,8 @@ def test_cli_run_with_local_model(tmp_path: Path, tiff_image: Path):
         ],
     )
     assert result.exit_code == 0
-    assert (results_dir / "model-outputs").exists()
-    df = pd.read_csv(results_dir / "model-outputs" / "purple.csv")
+    assert (results_dir / "model-outputs-csv").exists()
+    df = pd.read_csv(results_dir / "model-outputs-csv" / "purple.csv")
     df_ref = pd.read_csv(reference_csv)
 
     assert set(df.columns) == set(df_ref.columns)
@@ -457,8 +457,8 @@ def test_issue_94(tmp_path: Path, tiff_image: Path):
     # Important part is that we run through all of the files, despite the unreadble
     # file.
     assert result.exit_code == 0
-    assert results_dir.joinpath("model-outputs").joinpath("purple.csv").exists()
-    assert not results_dir.joinpath("model-outputs").joinpath("bad.csv").exists()
+    assert results_dir.joinpath("model-outputs-csv").joinpath("purple.csv").exists()
+    assert not results_dir.joinpath("model-outputs-csv").joinpath("bad.csv").exists()
 
 
 def test_issue_97(tmp_path: Path, tiff_image: Path):
diff --git a/wsinfer/cli/cli.py b/wsinfer/cli/cli.py
index c8c60cf..5fcda16 100644
--- a/wsinfer/cli/cli.py
+++ b/wsinfer/cli/cli.py
@@ -6,7 +6,6 @@
 import click
 
 from ..wsi import set_backend
-from .convert_csv_to_geojson import togeojson
 from .convert_csv_to_sbubmi import tosbu
 from .infer import run
 from .patch import patch
@@ -45,6 +44,5 @@ def cli(
 
 
 cli.add_command(run)
-cli.add_command(togeojson)
 cli.add_command(tosbu)
 cli.add_command(patch)
diff --git a/wsinfer/cli/infer.py b/wsinfer/cli/infer.py
index 8e17422..1c78f03 100644
--- a/wsinfer/cli/infer.py
+++ b/wsinfer/cli/infer.py
@@ -23,6 +23,7 @@
 from ..modellib import models
 from ..modellib.run_inference import run_inference
 from ..patchlib import segment_and_patch_directory_of_slides
+from ..write_geojson import write_geojsons
 
 
 def _num_cpus() -> int:
@@ -374,3 +375,6 @@ def run(
         json.dump(run_metadata, f, indent=2)
 
     click.secho("Finished.", fg="green")
+
+    csvs = list((results_dir / "model-outputs-csv").glob("*.csv"))
+    write_geojsons(csvs, results_dir, num_workers)
diff --git a/wsinfer/modellib/run_inference.py b/wsinfer/modellib/run_inference.py
index acdda2a..b27805c 100644
--- a/wsinfer/modellib/run_inference.py
+++ b/wsinfer/modellib/run_inference.py
@@ -86,7 +86,7 @@ def run_inference(
     # create patch paths if the whole slide image patch exists.
     patch_paths = [patch_dir / p.with_suffix(".h5").name for p in wsi_paths]
 
-    model_output_dir = results_dir / "model-outputs"
+    model_output_dir = results_dir / "model-outputs-csv"
     model_output_dir.mkdir(exist_ok=True)
 
     model = get_pretrained_torch_module(model=model_info)
diff --git a/wsinfer/cli/convert_csv_to_geojson.py b/wsinfer/write_geojson.py
similarity index 58%
rename from wsinfer/cli/convert_csv_to_geojson.py
rename to wsinfer/write_geojson.py
index df821d6..e281f08 100644
--- a/wsinfer/cli/convert_csv_to_geojson.py
+++ b/wsinfer/write_geojson.py
@@ -7,11 +7,11 @@
 
 import json
 import uuid
+from functools import partial
 from pathlib import Path
 
-import click
 import pandas as pd
-import tqdm
+from tqdm.contrib.concurrent import process_map
 
 
 def _box_to_polygon(
@@ -64,62 +64,49 @@ def _dataframe_to_geojson(df: pd.DataFrame, prob_cols: list[str]) -> dict:
     }
 
 
-def convert(input: str | Path, output: str | Path) -> None:
-    df = pd.read_csv(input)
+def make_geojson(csv: Path, results_dir: Path) -> None:
+    filename = csv.stem
+    df = pd.read_csv(csv)
     prob_cols = [col for col in df.columns.tolist() if col.startswith("prob_")]
     if not prob_cols:
-        raise click.ClickException("Did not find any columns with prob_ prefix.")
+        raise KeyError("Did not find any columns with prob_ prefix.")
     geojson = _dataframe_to_geojson(df, prob_cols)
-    with open(output, "w") as f:
+    with open(results_dir / "model-outputs-geojson" / f"{filename}.json", "w") as f:
         json.dump(geojson, f)
 
 
-@click.command()
-@click.argument(
-    "results_dir",
-    type=click.Path(
-        exists=True, file_okay=False, dir_okay=True, path_type=Path, resolve_path=True
-    ),
-)
-@click.argument(
-    "output",
-    type=click.Path(exists=False, path_type=Path, resolve_path=True),
-)
-def togeojson(*, results_dir: Path, output: Path) -> None:
-    """Convert model outputs to GeoJSON format.
+def write_geojsons(csvs: list[Path], results_dir: Path, num_workers: int) -> None:
+    output = results_dir / "model-outputs-geojson"
 
-    GeoJSON files can be used with pathology viewers like QuPath.
-
-    RESULTS_DIR     Path to results directory (containing model-outputs dir).
-
-    OUTPUT          Path to output directory in which to save GeoJSON files.
-    """
     if not results_dir.exists():
-        raise click.ClickException(f"results_dir does not exist: {results_dir}")
-    if output.exists():
-        raise click.ClickException("Output directory already exists.")
+        raise FileExistsError(f"results_dir does not exist: {results_dir}")
     if (
-        not (results_dir / "model-outputs").exists()
+        not (results_dir / "model-outputs-csv").exists()
         and (results_dir / "patches").exists()
     ):
-        raise click.ClickException(
+        raise FileExistsError(
             "Model outputs have not been generated yet. Please run model inference."
         )
-    if not (results_dir / "model-outputs").exists():
-        raise click.ClickException(
-            "Expected results_dir to contain a 'model-outputs' directory but it does"
-            " not. Please provide the path to the directory that contains"
-            " model-outputs, masks, and patches."
+    if not (results_dir / "model-outputs-csv").exists():
+        raise FileExistsError(
+            "Expected results_dir to contain a 'model-outputs-csv' "
+            "directory but it does not."
+            "Please provide the path to the directory"
+            "that contains model-outputs, masks, and patches."
         )
+    if output.exists():
+        geojsons = list((results_dir / "model-outputs-geojson").glob("*.json"))
 
-    csvs = list((results_dir / "model-outputs").glob("*.csv"))
-    if not csvs:
-        raise click.ClickException("No CSVs found. Did you generate model outputs?")
-
-    output.mkdir(exist_ok=False)
+        # Makes a list of filenames for both geojsons and csvs
+        geojson_filenames = [filename.stem for filename in geojsons]
+        csv_filenames = [filename.stem for filename in csvs]
 
-    for input_csv in tqdm.tqdm(csvs):
-        output_path = output / input_csv.with_suffix(".json").name
-        convert(input=input_csv, output=output_path)
+        # Makes a list of new csvs that need to be converted to geojson
+        csvs_new = [csv for csv in csv_filenames if csv not in geojson_filenames]
+        csvs = [path for path in csvs if path.stem in csvs_new]
+    else:
+        # If output directory doesn't exist, make one and set csvs_final to csvs
+        output.mkdir(parents=True, exist_ok=True)
 
-    click.secho(f"Saved outputs to {output}", fg="green")
+    func = partial(make_geojson, results_dir=results_dir)
+    process_map(func, csvs, max_workers=num_workers)