Skip to content

Commit

Permalink
Fix/geojson command (#191)
Browse files Browse the repository at this point in the history
* generated geojson outputs by default in wsinfer run command

* generating geojson files on run command, parallelized geojson conversion using tqdm

* changed tests according to updated command

* modified open() with separate filename variable

* changed make_geojson parameter csv from a string to a Path variable

* convert only new csv files to geojson instead of all

* handled condition if output directory doesn't exist

* changed function name to write_geojsons, fixed minor issues

* raise CannotReadSpacing if page0 is none, fixed style issue

* order issue fixed

* added a try catch to handle pytorch-nightly error

* styled using isort and black

* removed redundant libraries

* removed redundant lines in wsi.py

* changed folder name in ci tests for windows, ubuntu and docker

* shortened length of FileExistsError message

---------

Co-authored-by: Jakub Kaczmarzyk <[email protected]>
  • Loading branch information
swaradgat19 and kaczmarj authored Sep 20, 2023
1 parent 59c0f5c commit c41a9b8
Show file tree
Hide file tree
Showing 6 changed files with 49 additions and 60 deletions.
10 changes: 5 additions & 5 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ jobs:
--wsi-dir slides/ --results-dir results/ --model breast-tumor-resnet34.tcga-brca
test -f results/run_metadata_*.json
test -f results/patches/JP2K-33003-1.h5
test -f results/model-outputs/JP2K-33003-1.csv
test $(wc -l < results/model-outputs/JP2K-33003-1.csv) -eq 675
test -f results/model-outputs-csv/JP2K-33003-1.csv
test $(wc -l < results/model-outputs-csv/JP2K-33003-1.csv) -eq 675
# This is run on multiple operating systems.
test-package:
Expand Down Expand Up @@ -104,8 +104,8 @@ jobs:
wsinfer run --wsi-dir slides/ --results-dir results/ --model breast-tumor-resnet34.tcga-brca
test -f results/run_metadata_*.json
test -f results/patches/JP2K-33003-1.h5
test -f results/model-outputs/JP2K-33003-1.csv
test $(wc -l < results/model-outputs/JP2K-33003-1.csv) -eq 675
test -f results/model-outputs-csv/JP2K-33003-1.csv
test $(wc -l < results/model-outputs-csv/JP2K-33003-1.csv) -eq 675
# FIXME: tissue segmentation has different outputs on Windows. The patch sizes
# are the same but the coordinates found are different.
- name: Run 'wsinfer run' on Windows
Expand All @@ -118,7 +118,7 @@ jobs:
wsinfer run --wsi-dir slides/ --results-dir results/ --model breast-tumor-resnet34.tcga-brca
Test-Path -Path results/run_metadata_*.json -PathType Leaf
Test-Path -Path results/patches/JP2K-33003-1.h5 -PathType Leaf
Test-Path -Path results/model-outputs/JP2K-33003-1.csv -PathType Leaf
Test-Path -Path results/model-outputs-csv/JP2K-33003-1.csv -PathType Leaf
# test $(python -c "print(sum(1 for _ in open('results/model-outputs/JP2K-33003-1.csv')))") -eq 675
style-and-types:
Expand Down
16 changes: 8 additions & 8 deletions tests/test_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,8 @@ def test_cli_run_with_registered_models(
],
)
assert result.exit_code == 0
assert (results_dir / "model-outputs").exists()
df = pd.read_csv(results_dir / "model-outputs" / "purple.csv")
assert (results_dir / "model-outputs-csv").exists()
df = pd.read_csv(results_dir / "model-outputs-csv" / "purple.csv")
df_ref = pd.read_csv(reference_csv)

assert set(df.columns) == set(df_ref.columns)
Expand Down Expand Up @@ -141,8 +141,8 @@ def test_cli_run_with_registered_models(
del metadata_path, meta

# Test conversion to geojson.
geojson_dir = results_dir / "geojson"
result = runner.invoke(cli, ["togeojson", str(results_dir), str(geojson_dir)])
geojson_dir = results_dir / "model-outputs-geojson"
# result = runner.invoke(cli, ["togeojson", str(results_dir), str(geojson_dir)])
assert result.exit_code == 0
with open(geojson_dir / "purple.json") as f:
d: geojsonlib.GeoJSON = geojsonlib.load(f)
Expand Down Expand Up @@ -228,8 +228,8 @@ def test_cli_run_with_local_model(tmp_path: Path, tiff_image: Path):
],
)
assert result.exit_code == 0
assert (results_dir / "model-outputs").exists()
df = pd.read_csv(results_dir / "model-outputs" / "purple.csv")
assert (results_dir / "model-outputs-csv").exists()
df = pd.read_csv(results_dir / "model-outputs-csv" / "purple.csv")
df_ref = pd.read_csv(reference_csv)

assert set(df.columns) == set(df_ref.columns)
Expand Down Expand Up @@ -457,8 +457,8 @@ def test_issue_94(tmp_path: Path, tiff_image: Path):
# Important part is that we run through all of the files, despite the unreadable
# file.
assert result.exit_code == 0
assert results_dir.joinpath("model-outputs").joinpath("purple.csv").exists()
assert not results_dir.joinpath("model-outputs").joinpath("bad.csv").exists()
assert results_dir.joinpath("model-outputs-csv").joinpath("purple.csv").exists()
assert not results_dir.joinpath("model-outputs-csv").joinpath("bad.csv").exists()


def test_issue_97(tmp_path: Path, tiff_image: Path):
Expand Down
2 changes: 0 additions & 2 deletions wsinfer/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import click

from ..wsi import set_backend
from .convert_csv_to_geojson import togeojson
from .convert_csv_to_sbubmi import tosbu
from .infer import run
from .patch import patch
Expand Down Expand Up @@ -45,6 +44,5 @@ def cli(


cli.add_command(run)
cli.add_command(togeojson)
cli.add_command(tosbu)
cli.add_command(patch)
4 changes: 4 additions & 0 deletions wsinfer/cli/infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from ..modellib import models
from ..modellib.run_inference import run_inference
from ..patchlib import segment_and_patch_directory_of_slides
from ..write_geojson import write_geojsons


def _num_cpus() -> int:
Expand Down Expand Up @@ -374,3 +375,6 @@ def run(
json.dump(run_metadata, f, indent=2)

click.secho("Finished.", fg="green")

csvs = list((results_dir / "model-outputs-csv").glob("*.csv"))
write_geojsons(csvs, results_dir, num_workers)
2 changes: 1 addition & 1 deletion wsinfer/modellib/run_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def run_inference(
# create patch paths if the whole slide image patch exists.
patch_paths = [patch_dir / p.with_suffix(".h5").name for p in wsi_paths]

model_output_dir = results_dir / "model-outputs"
model_output_dir = results_dir / "model-outputs-csv"
model_output_dir.mkdir(exist_ok=True)

model = get_pretrained_torch_module(model=model_info)
Expand Down
75 changes: 31 additions & 44 deletions wsinfer/cli/convert_csv_to_geojson.py → wsinfer/write_geojson.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@

import json
import uuid
from functools import partial
from pathlib import Path

import click
import pandas as pd
import tqdm
from tqdm.contrib.concurrent import process_map


def _box_to_polygon(
Expand Down Expand Up @@ -64,62 +64,49 @@ def _dataframe_to_geojson(df: pd.DataFrame, prob_cols: list[str]) -> dict:
}


def convert(input: str | Path, output: str | Path) -> None:
df = pd.read_csv(input)
def make_geojson(csv: Path, results_dir: Path) -> None:
filename = csv.stem
df = pd.read_csv(csv)
prob_cols = [col for col in df.columns.tolist() if col.startswith("prob_")]
if not prob_cols:
raise click.ClickException("Did not find any columns with prob_ prefix.")
raise KeyError("Did not find any columns with prob_ prefix.")
geojson = _dataframe_to_geojson(df, prob_cols)
with open(output, "w") as f:
with open(results_dir / "model-outputs-geojson" / f"{filename}.json", "w") as f:
json.dump(geojson, f)


@click.command()
@click.argument(
"results_dir",
type=click.Path(
exists=True, file_okay=False, dir_okay=True, path_type=Path, resolve_path=True
),
)
@click.argument(
"output",
type=click.Path(exists=False, path_type=Path, resolve_path=True),
)
def togeojson(*, results_dir: Path, output: Path) -> None:
"""Convert model outputs to GeoJSON format.
def write_geojsons(csvs: list[Path], results_dir: Path, num_workers: int) -> None:
output = results_dir / "model-outputs-geojson"

GeoJSON files can be used with pathology viewers like QuPath.
RESULTS_DIR Path to results directory (containing model-outputs dir).
OUTPUT Path to output directory in which to save GeoJSON files.
"""
if not results_dir.exists():
raise click.ClickException(f"results_dir does not exist: {results_dir}")
if output.exists():
raise click.ClickException("Output directory already exists.")
raise FileExistsError(f"results_dir does not exist: {results_dir}")
if (
not (results_dir / "model-outputs").exists()
not (results_dir / "model-outputs-csv").exists()
and (results_dir / "patches").exists()
):
raise click.ClickException(
raise FileExistsError(
"Model outputs have not been generated yet. Please run model inference."
)
if not (results_dir / "model-outputs").exists():
raise click.ClickException(
"Expected results_dir to contain a 'model-outputs' directory but it does"
" not. Please provide the path to the directory that contains"
" model-outputs, masks, and patches."
if not (results_dir / "model-outputs-csv").exists():
raise FileExistsError(
"Expected results_dir to contain a 'model-outputs-csv' "
"directory but it does not."
"Please provide the path to the directory"
"that contains model-outputs, masks, and patches."
)
if output.exists():
geojsons = list((results_dir / "model-outputs-geojson").glob("*.json"))

csvs = list((results_dir / "model-outputs").glob("*.csv"))
if not csvs:
raise click.ClickException("No CSVs found. Did you generate model outputs?")

output.mkdir(exist_ok=False)
# Makes a list of filenames for both geojsons and csvs
geojson_filenames = [filename.stem for filename in geojsons]
csv_filenames = [filename.stem for filename in csvs]

for input_csv in tqdm.tqdm(csvs):
output_path = output / input_csv.with_suffix(".json").name
convert(input=input_csv, output=output_path)
# Makes a list of new csvs that need to be converted to geojson
csvs_new = [csv for csv in csv_filenames if csv not in geojson_filenames]
csvs = [path for path in csvs if path.stem in csvs_new]
else:
# If output directory doesn't exist, make one and convert all of the given csvs
output.mkdir(parents=True, exist_ok=True)

click.secho(f"Saved outputs to {output}", fg="green")
func = partial(make_geojson, results_dir=results_dir)
process_map(func, csvs, max_workers=num_workers)

0 comments on commit c41a9b8

Please sign in to comment.