From 3592fc0169a9a6e8cd18a7cb3e4bbcc2bde56e0f Mon Sep 17 00:00:00 2001 From: Daniel Al Mouiee Date: Fri, 2 Feb 2024 16:16:29 +1100 Subject: [PATCH 1/3] Added static typing to `pydicer` modules --- pydicer/config.py | 16 +++++--- pydicer/convert/data.py | 16 +++++--- pydicer/convert/headers.py | 7 +++- pydicer/dataset/functions.py | 44 +++++++++++++++------ pydicer/dataset/nnunet.py | 33 +++++++++++----- pydicer/dataset/preparation.py | 4 +- pydicer/dataset/structureset.py | 25 ++++++++---- pydicer/generate/object.py | 66 +++++++++++++++++--------------- pydicer/generate/segmentation.py | 2 +- pydicer/input/base.py | 3 +- pydicer/input/filesystem.py | 5 ++- pydicer/input/orthanc.py | 48 ++++++++++++++++------- pydicer/input/pacs.py | 21 ++++++++-- pydicer/input/tcia.py | 12 +++++- pydicer/input/test.py | 4 +- pydicer/input/web.py | 4 +- pydicer/logger.py | 11 ++++-- pydicer/preprocess/data.py | 7 +++- pydicer/quarantine.py | 6 +-- pydicer/tool.py | 9 +++-- pydicer/utils.py | 39 +++++++++++-------- pydicer/visualise/data.py | 8 +++- 22 files changed, 264 insertions(+), 126 deletions(-) diff --git a/pydicer/config.py b/pydicer/config.py index e140483..0cef9f9 100644 --- a/pydicer/config.py +++ b/pydicer/config.py @@ -15,7 +15,13 @@ "available in the .pydicer directory.", "type": int, "default": 0, - "choices": [logging.NOTSET, logging.DEBUG, logging.INFO, logging.WARNING, logging.ERROR], + "choices": [ + logging.NOTSET, + logging.DEBUG, + logging.INFO, + logging.WARNING, + logging.ERROR, + ], }, "for_fallback_linkage": { "module": "general", @@ -80,7 +86,6 @@ class PyDicerConfig: class __PyDicerConfig: # pylint: disable=invalid-name def __init__(self, working_dir=None): - if working_dir is None: raise ValueError("working_dir must be set on config init") self.working_dir = Path(working_dir) @@ -128,7 +133,7 @@ def get_working_dir(self): """ return self.instance.working_dir - def get_config(self, name): + def get_config(self, name: str) -> object: """Get the value of the config item with the specified name Args: @@ -146,7 +151,7 @@ def get_config(self, name): return self.instance.pydicer_config[name] - def set_config(self, name, value): + def set_config(self, name: str, value: object): """Set the value for the config with the given name Args: @@ -163,7 +168,8 @@ def set_config(self, name, value): if not isinstance(value, PYDICER_CONFIG[name]["type"]) and not value is None: raise ValueError( - f"Config {name} must be of type " f"{type(self.instance.pydicer_config[name])}" + f"Config {name} must be of type " + f"{type(self.instance.pydicer_config[name])}" ) self.instance.pydicer_config[name] = value diff --git a/pydicer/convert/data.py b/pydicer/convert/data.py index 4fb507d..85d130c 100644 --- a/pydicer/convert/data.py +++ b/pydicer/convert/data.py @@ -3,6 +3,8 @@ import copy import shutil from pathlib import Path +from typing import Union + import pandas as pd import numpy as np import SimpleITK as sitk @@ -51,7 +53,7 @@ ] -def get_object_type(sop_class_uid): +def get_object_type(sop_class_uid: str) -> str: """Get the type of the object (used for the output path) Args: @@ -69,7 +71,9 @@ def get_object_type(sop_class_uid): return object_type -def handle_missing_slice(files, ignore_duplicates=False): +def handle_missing_slice( + files: Union[pd.DataFrame, list], ignore_duplicates: bool = False +) -> list: """function to interpolate missing slices in an image Example usage: @@ -98,6 +102,8 @@ def handle_missing_slice(files, ignore_duplicates=False): Args: df_files (pd.DataFrame|list): the 
DataFrame which was produced by PreprocessData or list of filepaths to dicom slices + ignore_duplicates (bool, optional): specifies whether the function is to ignore + duplicate slices when handling missing ones Returns: file_paths(list): a list of the interpolated file paths @@ -231,7 +237,7 @@ def handle_missing_slice(files, ignore_duplicates=False): return df_files.file_path.tolist() -def link_via_frame_of_reference(for_uid, df_preprocess): +def link_via_frame_of_reference(for_uid: str, df_preprocess: pd.DataFrame) -> pd.DataFrame: """Find the image series linked to this FOR Args: @@ -271,7 +277,7 @@ def __init__(self, working_directory="."): self.pydicer_directory = working_directory.joinpath(PYDICER_DIR_NAME) self.output_directory = working_directory.joinpath(CONVERTED_DIR_NAME) - def add_entry(self, entry): + def add_entry(self, entry: dict): """Add an entry of a converted data object to the patient's converted dataframe. Args: @@ -308,7 +314,7 @@ def add_entry(self, entry): df_pat_data = df_pat_data.reset_index(drop=True) df_pat_data.to_csv(converted_df_path) - def convert(self, patient=None, force=True): + def convert(self, patient: Union[str, list] = None, force: bool = True): """Converts the DICOM which was preprocessed into the pydicer output directory. Args: diff --git a/pydicer/convert/headers.py b/pydicer/convert/headers.py index 499643b..bf3144b 100644 --- a/pydicer/convert/headers.py +++ b/pydicer/convert/headers.py @@ -1,11 +1,16 @@ import logging import json +from typing import Union +from pathlib import Path + import pydicom logger = logging.getLogger(__name__) -def convert_dicom_headers(dcm_file, binary_path, json_file): +def convert_dicom_headers( + dcm_file: Union[str, Path], binary_path: str, json_file: Union[str, Path] +): """Save the DICOM Headers as a JSON file Args: diff --git a/pydicer/dataset/functions.py b/pydicer/dataset/functions.py index 7abc957..ec19cb8 100644 --- a/pydicer/dataset/functions.py +++ b/pydicer/dataset/functions.py @@ -7,7 +7,7 @@ logger = logging.getLogger(__name__) -def rt_latest_struct(df, **kwargs): +def rt_latest_struct(df: pd.DataFrame, **kwargs) -> pd.DataFrame: """Select the latest Structure set and the image which it is linked to. You can specify keyword arguments to for a match on any top level DICOM attributes. You may also supply lists of values to these, one of which should match to select that series. @@ -91,18 +91,24 @@ def rt_latest_struct(df, **kwargs): keep_rows.append(struct_row.name) # Track index of row to keep # Find the linked image - df_linked_img = df[df["sop_instance_uid"] == struct_row.referenced_sop_instance_uid] + df_linked_img = df[ + df["sop_instance_uid"] == struct_row.referenced_sop_instance_uid + ] if len(df_linked_img) == 0: - logger.warning("No linked images found for structure: %s", struct_row.hashed_uid) + logger.warning( + "No linked images found for structure: %s", struct_row.hashed_uid + ) continue - keep_rows.append(df_linked_img.iloc[0].name) # Keep the index of the row of the image too + keep_rows.append( + df_linked_img.iloc[0].name + ) # Keep the index of the row of the image too return df.loc[keep_rows] -def rt_latest_dose(df, **kwargs): +def rt_latest_dose(df: pd.DataFrame, **kwargs) -> pd.DataFrame: """Select the latest RTDOSE and the image, structure and plan which it is linked to. You can specify keyword arguments to for a match on any top level DICOM attributes. You may also supply lists of values to these, one of which should match to select that series. 
@@ -191,16 +197,22 @@ def rt_latest_dose(df, **kwargs): keep_rows.append(dose_row.name) # Track index of row of dose to keep # Find the linked plan - df_linked_plan = df[df["sop_instance_uid"] == dose_row.referenced_sop_instance_uid] + df_linked_plan = df[ + df["sop_instance_uid"] == dose_row.referenced_sop_instance_uid + ] if len(df_linked_plan) == 0: - logger.warning("No linked plans found for dose: %s", dose_row.sop_instance_uid) + logger.warning( + "No linked plans found for dose: %s", dose_row.sop_instance_uid + ) continue # Find the linked structure set plan_row = df_linked_plan.iloc[0] keep_rows.append(plan_row.name) # Keep the index of the row of the plan - df_linked_struct = df[df["sop_instance_uid"] == plan_row.referenced_sop_instance_uid] + df_linked_struct = df[ + df["sop_instance_uid"] == plan_row.referenced_sop_instance_uid + ] if len(df_linked_struct) == 0: # Try to link via Frame of Reference instead @@ -209,18 +221,26 @@ def rt_latest_dose(df, **kwargs): ] if len(df_linked_struct) == 0: - logger.warning("No structures found for plan: %s", plan_row.sop_instance_uid) + logger.warning( + "No structures found for plan: %s", plan_row.sop_instance_uid + ) continue # Find the linked image struct_row = df_linked_struct.iloc[0] keep_rows.append(struct_row.name) # Keep the index of the row of the structure - df_linked_img = df[df["sop_instance_uid"] == struct_row.referenced_sop_instance_uid] + df_linked_img = df[ + df["sop_instance_uid"] == struct_row.referenced_sop_instance_uid + ] if len(df_linked_img) == 0: - logger.warning("No linked images found for structure: %s", struct_row.hashed_uid) + logger.warning( + "No linked images found for structure: %s", struct_row.hashed_uid + ) continue - keep_rows.append(df_linked_img.iloc[0].name) # Keep the index of the row of the image too + keep_rows.append( + df_linked_img.iloc[0].name + ) # Keep the index of the row of the image too return df.loc[keep_rows] diff --git a/pydicer/dataset/nnunet.py b/pydicer/dataset/nnunet.py index 2befca5..67606e5 100644 --- a/pydicer/dataset/nnunet.py +++ b/pydicer/dataset/nnunet.py @@ -45,7 +45,7 @@ def __init__( nnunet_description: str = "", dataset_name: str = CONVERTED_DIR_NAME, image_modality: str = "CT", - mapping_id=DEFAULT_MAPPING_ID, + mapping_id: str = DEFAULT_MAPPING_ID, ): """Prepare a dataset to train models using nnUNet. @@ -219,12 +219,16 @@ def check_duplicates_train_test(self): """ if len(self.training_cases) == 0: - raise SystemError("training_cases are empty, run split_dataset function first.") + raise SystemError( + "training_cases are empty, run split_dataset function first." 
+ ) img_stats = [] df = read_converted_data(self.working_directory, dataset_name=self.dataset_name) - df_images = df[(df.modality == "CT") | (df.modality == "MR") | (df.modality == "PT")] + df_images = df[ + (df.modality == "CT") | (df.modality == "MR") | (df.modality == "PT") + ] for case in self.training_cases + self.testing_cases: df_pat = df_images[df_images.patient_id == case] @@ -252,7 +256,9 @@ def check_duplicates_train_test(self): # Check to see if we have any duplicate image spacing and sizes, if so inspect these # further - duplicated_rows = df_img_stats.duplicated(subset=["spacing", "size"], keep=False) + duplicated_rows = df_img_stats.duplicated( + subset=["spacing", "size"], keep=False + ) df_img_stats["voxel_sum"] = df_img_stats.apply( lambda row: sitk.GetArrayFromImage(sitk.ReadImage(row.img_path)).sum() if row.name in duplicated_rows.index @@ -342,7 +348,9 @@ def check_structure_names(self) -> pd.DataFrame: print(f"Structure {s} is missing for patients: {missing_pats}") incomplete_structures.append(s) - incomplete_patients += [p for p in missing_pats if not p in incomplete_patients] + incomplete_patients += [ + p for p in missing_pats if p not in incomplete_patients + ] if incomplete_structures: print( @@ -383,7 +391,8 @@ def check_overlapping_structures(self): structure_name_j = structure_names[sj] structure_sum = ( - structure_set[structure_name_i] + structure_set[structure_name_j] + structure_set[structure_name_i] + + structure_set[structure_name_j] ) arr = sitk.GetArrayFromImage(structure_sum) if arr.max() > 1: @@ -444,7 +453,9 @@ def prepare_dataset(self) -> Path: """ if len(self.training_cases) == 0: - raise SystemError("training_cases are empty, run split_dataset function first.") + raise SystemError( + "training_cases are empty, run split_dataset function first." + ) # First check that all cases (in training set) have the structures which are to be learnt df_structures = self.check_structure_names() @@ -571,7 +582,9 @@ def generate_training_scripts( raise FileNotFoundError( "Ensure that the folder in which to generate the script exists." ) - script_path = script_directory.joinpath(f"train_{self.nnunet_id}_{self.nnunet_name}.sh") + script_path = script_directory.joinpath( + f"train_{self.nnunet_id}_{self.nnunet_name}.sh" + ) if isinstance(folds, str): folds = [folds] @@ -637,7 +650,9 @@ def train(self, script_directory: Union[str, Path] = ".", in_screen: bool = True """ # Make sure the script folder exists script_directory = Path(script_directory) - script_path = script_directory.joinpath(f"train_{self.nnunet_id}_{self.nnunet_name}.sh") + script_path = script_directory.joinpath( + f"train_{self.nnunet_id}_{self.nnunet_name}.sh" + ) if not script_path.exists(): raise FileNotFoundError( diff --git a/pydicer/dataset/preparation.py b/pydicer/dataset/preparation.py index 23d15bc..2d73a50 100644 --- a/pydicer/dataset/preparation.py +++ b/pydicer/dataset/preparation.py @@ -1,7 +1,7 @@ import logging import os from pathlib import Path -from typing import Callable +from typing import Callable, Union import pandas as pd @@ -22,7 +22,7 @@ class PrepareDataset: Defaults to ".". 
""" - def __init__(self, working_directory="."): + def __init__(self, working_directory: Union[str, Path] = "."): self.working_directory = Path(working_directory) def add_object_to_dataset(self, dataset_name: str, data_object_row: pd.Series): diff --git a/pydicer/dataset/structureset.py b/pydicer/dataset/structureset.py index 35038a3..f65fac5 100644 --- a/pydicer/dataset/structureset.py +++ b/pydicer/dataset/structureset.py @@ -10,7 +10,9 @@ logger = logging.getLogger(__name__) -def get_mapping_for_structure_set(structure_set_row: pd.Series, mapping_id: str): +def get_mapping_for_structure_set( + structure_set_row: pd.Series, mapping_id: str +) -> dict: """Searches the folder hierarchy to find a structure name mapping file with the given ID. Args: @@ -52,7 +54,8 @@ def __init__(self, structure_set_row, mapping_id=DEFAULT_MAPPING_ID): self.structure_set_id = structure_set_row.hashed_uid self.structure_names = [ - s.name.replace(".nii.gz", "") for s in self.structure_set_path.glob("*.nii.gz") + s.name.replace(".nii.gz", "") + for s in self.structure_set_path.glob("*.nii.gz") ] self.unmapped_structure_names = self.structure_names @@ -61,7 +64,9 @@ def __init__(self, structure_set_row, mapping_id=DEFAULT_MAPPING_ID): # Check if we can find a mapping for this structure set, if not we'll just used the # unmapped structure names if mapping_id is not None: - self.structure_mapping = get_mapping_for_structure_set(structure_set_row, mapping_id) + self.structure_mapping = get_mapping_for_structure_set( + structure_set_row, mapping_id + ) if self.structure_mapping is None: logger.warning("No mapping file found with id %s", mapping_id) @@ -71,7 +76,7 @@ def __init__(self, structure_set_row, mapping_id=DEFAULT_MAPPING_ID): self.cache = {} - def get_mapped_structure_name(self, item): + def get_mapped_structure_name(self, item: str) -> str: """Get the structure set specific name for a structure that may have been mapped. Args: @@ -86,7 +91,9 @@ def get_mapped_structure_name(self, item): if self.structure_mapping is not None: if item in self.structure_mapping: for variation in self.structure_mapping[item]: - variation_path = self.structure_set_path.joinpath(f"{variation}.nii.gz") + variation_path = self.structure_set_path.joinpath( + f"{variation}.nii.gz" + ) if variation_path.exists(): # Found variation, let's use that file... # TODO an issue would occur if there were multiple files that would match @@ -96,7 +103,7 @@ def get_mapped_structure_name(self, item): return structure_name - def get_standardised_structure_name(self, item): + def get_standardised_structure_name(self, item: str) -> str: """Get the standardised name for a structure that is present in this structure set. Args: @@ -150,7 +157,7 @@ def values(self): def items(self): return [(s, self[s]) for s in self.structure_names] - def get_unmapped_structures(self): + def get_unmapped_structures(self) -> list: """Get a list of structures for which no structure was found based on the mapping. If no mapping is being used this will always be empty. 
@@ -160,7 +167,9 @@ def get_unmapped_structures(self): missing_mappings = [] for k in self.keys(): structure_name = self.get_mapped_structure_name(k) - structure_path = self.structure_set_path.joinpath(f"{structure_name}.nii.gz") + structure_path = self.structure_set_path.joinpath( + f"{structure_name}.nii.gz" + ) if not structure_path.exists(): missing_mappings.append(k) diff --git a/pydicer/generate/object.py b/pydicer/generate/object.py index a22f3c0..58cd9fc 100644 --- a/pydicer/generate/object.py +++ b/pydicer/generate/object.py @@ -1,4 +1,6 @@ import logging +from typing import Union +from pathlib import Path import pandas as pd import SimpleITK as sitk @@ -10,14 +12,14 @@ def add_object( - working_directory, - object_id, - patient_id, - object_type, - modality, - for_uid=None, - referenced_sop_instance_uid=None, - datasets=None, + working_directory: Path, + object_id: str, + patient_id: str, + object_type: str, + modality: str, + for_uid: str = None, + referenced_sop_instance_uid: str = None, + datasets: Union[str, list] = None, ): """Add a generated object to the project. @@ -155,7 +157,9 @@ def add_object( df_converted.to_csv(patient_directory.joinpath("converted.csv")) -def get_linked_for_and_ref_uid(working_directory, patient_id, linked_obj): +def get_linked_for_and_ref_uid( + working_directory: Path, patient_id: str, linked_obj: Union[str, pd.Series] = None +) -> tuple: """Determine the linked frame of reference UID and SOP instance UID Args: @@ -194,14 +198,14 @@ def get_linked_for_and_ref_uid(working_directory, patient_id, linked_obj): def add_image_object( - working_directory, - image, - image_id, - modality, - patient_id, - linked_image=None, - for_uid=None, - datasets=None, + working_directory: Path, + image: sitk.Image, + image_id: str, + modality: str, + patient_id: str, + linked_image: pd.Series = None, + for_uid: str = None, + datasets: Union[list, str] = None, ): """Add a generated image object to the project. @@ -259,13 +263,13 @@ def add_image_object( def add_structure_object( - working_directory, - structures, - structure_id, - patient_id, - linked_image=None, - for_uid=None, - datasets=None, + working_directory: Path, + structures: dict, + structure_id: str, + patient_id: str, + linked_image: Union[str, pd.Series] = None, + for_uid: str = None, + datasets: Union[list, str] = None, ): """Add a generated structure object to the project. @@ -325,13 +329,13 @@ def add_structure_object( def add_dose_object( - working_directory, - dose, - dose_id, - patient_id, - linked_plan=None, - for_uid=None, - datasets=None, + working_directory: Path, + dose: sitk.Image, + dose_id: str, + patient_id: str, + linked_plan: Union[str, pd.Series] = None, + for_uid: str = None, + datasets: Union[list, str] = None, ): """Add a generated dose object to the project. diff --git a/pydicer/generate/segmentation.py b/pydicer/generate/segmentation.py index 49f8cb7..1618ee2 100644 --- a/pydicer/generate/segmentation.py +++ b/pydicer/generate/segmentation.py @@ -117,7 +117,7 @@ def read_all_segmentation_logs( dataset_name: str = CONVERTED_DIR_NAME, segment_id: str = None, modality: str = None, -): +) -> pd.DataFrame: """Read all auto-segmentation logs in a dataset. Args: dataset_name (str): The name of the dataset to read for. 
diff --git a/pydicer/input/base.py b/pydicer/input/base.py index 677b9a8..2735b88 100644 --- a/pydicer/input/base.py +++ b/pydicer/input/base.py @@ -1,5 +1,6 @@ import tempfile import logging +from typing import Union import abc from pathlib import Path @@ -9,7 +10,7 @@ class InputBase(abc.ABC): - def __init__(self, working_directory=None): + def __init__(self, working_directory: Union[str, Path] = None): """ Base class for input modules. diff --git a/pydicer/input/filesystem.py b/pydicer/input/filesystem.py index 00bdc19..8cbef44 100644 --- a/pydicer/input/filesystem.py +++ b/pydicer/input/filesystem.py @@ -1,8 +1,11 @@ +from pathlib import Path +from typing import Union + from pydicer.input.base import InputBase class FileSystemInput(InputBase): - def __init__(self, directory): + def __init__(self, directory: Union[str, Path]): """ Class for inputing files from the file system diff --git a/pydicer/input/orthanc.py b/pydicer/input/orthanc.py index 0ac1926..f0e8646 100644 --- a/pydicer/input/orthanc.py +++ b/pydicer/input/orthanc.py @@ -1,5 +1,7 @@ -import logging from io import BytesIO +import logging +from typing import Union +from Pathlib import Path import pydicom from pyorthanc.deprecated.client import Orthanc @@ -10,7 +12,7 @@ logger = logging.getLogger(__name__) -def adapt_dataset_from_bytes(blob): +def adapt_dataset_from_bytes(blob: bytes) -> pydicom.Dataset: """Convert bytes coming from Orthanc to DICOM dataset Args: @@ -24,7 +26,14 @@ class OrthancInput(InputBase): - def __init__(self, host, port, username=None, password=None, working_directory=None): + def __init__( + self, + host: str, + port: int, + username: str = None, + password: str = None, + working_directory: Union[str, Path] = None, + ): """Class for fetching files from Orthanc. Args: @@ -54,7 +63,9 @@ def __init__(self, host, port, username=None, password=None, working_directory=N # connection error if we can't connect to the Orthanc self.orthanc.c_find({"Level": "Patient", "Query": {"PatientID": "XXX"}}) - def fetch_data(self, patients, modalities=None): + def fetch_data( + self, patients: Union[list, str], modalities: Union[list, str] = None + ): """Download the DICOM data from Orthanc Args: @@ -74,7 +85,6 @@ def fetch_data(self, patients, modalities=None): modalities = [modalities] for patient in get_iterator(patients, unit="patients", name="Orthanc Fetch"): - # Find the Orthanc ID for this patient orthanc_patient_ids = self.orthanc.c_find( {"Level": "Patient", "Query": {"PatientID": patient}} ) @@ -86,34 +96,44 @@ def fetch_data(self, patients, modalities=None): if len(orthanc_patient_ids) > 1: logger.warning( - "Patient returned multple Orthanc IDs: %s. Selecting first only", patient + "Patient returned multiple Orthanc IDs: %s. 
Selecting first only", + patient, ) orthanc_patient_id = orthanc_patient_ids[0] - patient_information = self.orthanc.get_patient_information(orthanc_patient_id) + patient_information = self.orthanc.get_patient_information( + orthanc_patient_id + ) patient_id = patient_information["MainDicomTags"]["PatientID"] # Loop over each study for this patient study_identifiers = patient_information["Studies"] for study_identifier in study_identifiers: - # Loop over each series in this study study_information = self.orthanc.get_study_information(study_identifier) series_identifiers = study_information["Series"] for series_identifier in series_identifiers: - series_information = self.orthanc.get_series_information(series_identifier) + series_information = self.orthanc.get_series_information( + series_identifier + ) # Skip if this isn't one of the modalities we want modality = series_information["MainDicomTags"]["Modality"] if modalities is not None and not modality in modalities: continue - series_information = self.orthanc.get_series_information(series_identifier) - series_instance_uid = series_information["MainDicomTags"]["SeriesInstanceUID"] + series_information = self.orthanc.get_series_information( + series_identifier + ) + series_instance_uid = series_information["MainDicomTags"][ + "SeriesInstanceUID" + ] # Create the output directory for this series - series_path = self.working_directory.joinpath(patient_id, series_instance_uid) + series_path = self.working_directory.joinpath( + patient_id, series_instance_uid + ) series_path.mkdir(exist_ok=True, parents=True) # Loop over each instance in this series @@ -127,7 +147,9 @@ def fetch_data(self, patients, modalities=None): f = self.orthanc.get_instance_file(instance_identifier) ds = adapt_dataset_from_bytes(f) - sop_instance_uid = instance_information["MainDicomTags"]["SOPInstanceUID"] + sop_instance_uid = instance_information["MainDicomTags"][ + "SOPInstanceUID" + ] ds_file_name = f"{modality}.{sop_instance_uid}.dcm" ds_path = series_path.joinpath(ds_file_name) diff --git a/pydicer/input/pacs.py b/pydicer/input/pacs.py index a36d191..58402f5 100644 --- a/pydicer/input/pacs.py +++ b/pydicer/input/pacs.py @@ -1,4 +1,7 @@ import os +from pathlib import Path +from typing import Union + import pydicom from platipy.dicom.communication.connector import DicomConnector @@ -7,7 +10,13 @@ class DICOMPACSInput(InputBase): - def __init__(self, host, port, ae_title=None, working_directory=None): + def __init__( + self, + host: str, + port: str, + ae_title: str = None, + working_directory: Union[str, Path] = None, + ): """Class for fetching files from DICOM PACS. Currently only supports C-GET commands to fetch the data. @@ -26,13 +35,18 @@ def __init__(self, host, port, ae_title=None, working_directory=None): super().__init__(working_directory) self.dicom_connector = DicomConnector( - host=host, port=port, ae_title=ae_title, output_directory=self.working_directory + host=host, + port=port, + ae_title=ae_title, + output_directory=self.working_directory, ) if not self.dicom_connector.verify(): raise ConnectionError("Unable to connect to DICOM PACS.") - def fetch_data(self, patients, modalities=None): + def fetch_data( + self, patients: Union[list, str], modalities: Union[list, str] = None + ): """Download the DICOM data from the PACS. 
Args: @@ -52,7 +66,6 @@ def fetch_data(self, patients, modalities=None): modalities = [modalities] for patient in patients: - dataset = pydicom.Dataset() dataset.PatientID = patient dataset.PatientName = "" diff --git a/pydicer/input/tcia.py b/pydicer/input/tcia.py index ecafe0c..3c7bc8b 100644 --- a/pydicer/input/tcia.py +++ b/pydicer/input/tcia.py @@ -1,10 +1,19 @@ +from pathlib import Path +from typing import Union + from platipy.dicom.download import tcia from pydicer.input.base import InputBase class TCIAInput(InputBase): - def __init__(self, collection, patient_ids, modalities=None, working_directory=None): + def __init__( + self, + collection: str, + patient_ids: list, + modalities: list = None, + working_directory: Union[str, Path] = None, + ): """ Input class that interfaces with the TCIA API @@ -14,7 +23,7 @@ def __init__(self, collection, patient_ids, modalities=None, working_directory=N fetched modalities (list, optional): A list of strings defining the modalites to fetch. Will fetch all modalities available if not specified. - working_directory (str): (str|pathlib.Path, optional): The working directory in which + working_directory (str|pathlib.Path, optional): The working directory in which to store the data fetched. Defaults to a temp directory. """ super().__init__(working_directory) diff --git a/pydicer/input/test.py b/pydicer/input/test.py index aa1fb71..50be0cd 100644 --- a/pydicer/input/test.py +++ b/pydicer/input/test.py @@ -1,10 +1,13 @@ +from pathlib import Path +from typing import Union + from pydicer.input.web import WebInput class TestInput(WebInput): __test__ = False # pytest will try to use this as a test class without this - def __init__(self, working_directory=None): + def __init__(self, working_directory: Union[str, Path] = None): """ A test input class to download example data from zenodo diff --git a/pydicer/input/web.py b/pydicer/input/web.py index d10c74c..08449fa 100644 --- a/pydicer/input/web.py +++ b/pydicer/input/web.py @@ -1,4 +1,6 @@ import logging +from pathlib import Path +from typing import Union from pydicer.input.base import InputBase from pydicer.utils import download_and_extract_zip_file @@ -7,7 +9,7 @@ class WebInput(InputBase): - def __init__(self, data_url, working_directory=None): + def __init__(self, data_url: str, working_directory: Union[str, Path] = None): """ Class for downloading and saving input data off the internet diff --git a/pydicer/logger.py b/pydicer/logger.py index 2220e5c..d0a5580 100644 --- a/pydicer/logger.py +++ b/pydicer/logger.py @@ -23,12 +23,13 @@ def __init__(self, pat_id, data_directory, force=True): df_pat_log = pd.DataFrame(columns=SUMMARY_CSV_COLS) df_pat_log.to_csv(self.summary_csv_path, index=False) - def log_module_error(self, module, hashed_uid, error_log): + def log_module_error(self, module: str, hashed_uid: str, error_log: str): """Function to log errors for a specific pydicer module Args: module (str): pydicer module to log error for in CSV - error (str): error to log in CSV + hashed_uid (str): hashed UID of the patient being logged to the error CSV + error_log (str): error to log in CSV """ end_time = dt.now() df_error = pd.DataFrame( @@ -37,11 +38,12 @@ def log_module_error(self, module, hashed_uid, error_log): ) df_error.to_csv(self.summary_csv_path, header=False, mode="a", index=False) - def eval_module_process(self, module, hashed_uid): + def eval_module_process(self, module: str, hashed_uid: str): """Function to log if any patient had issues for a specific pydicer module Args: module (str): pydicer module to check if no errors were generated for all 
patients + hashed_uid (str): hashed UID of the patient being logged to the error CSV """ end_time = dt.now() @@ -53,7 +55,8 @@ def eval_module_process(self, module, hashed_uid): ] if len(df_summary_mod) == 0: df_final_summary = pd.DataFrame( - [[module, hashed_uid, 0, "", self.start_time, end_time]], columns=SUMMARY_CSV_COLS + [[module, hashed_uid, 0, "", self.start_time, end_time]], + columns=SUMMARY_CSV_COLS, ) df_final_summary.to_csv( self.summary_csv_path, diff --git a/pydicer/preprocess/data.py b/pydicer/preprocess/data.py index c7a18fe..5a75e27 100644 --- a/pydicer/preprocess/data.py +++ b/pydicer/preprocess/data.py @@ -1,4 +1,5 @@ import logging +from typing import Union from pathlib import Path import pandas as pd @@ -36,7 +37,7 @@ def __init__(self, working_directory): self.pydicer_directory = working_directory.joinpath(PYDICER_DIR_NAME) self.pydicer_directory.mkdir(exist_ok=True) - def scan_file(self, file): + def scan_file(self, file: Union[str, Path]) -> dict: """Scan a DICOM file. Args: @@ -147,7 +148,9 @@ def scan_file(self, file): return None - def preprocess(self, input_directory, force=True): + def preprocess( + self, input_directory: Union(Path, list), force: bool = True + ) -> pd.DataFrame: """ Function to preprocess information regarding the data located in an Input working directory diff --git a/pydicer/quarantine.py b/pydicer/quarantine.py index 1123a84..37ae8ec 100644 --- a/pydicer/quarantine.py +++ b/pydicer/quarantine.py @@ -9,11 +9,11 @@ QUARATINE_DICOM_KEYS = ["PatientID", "Modality", "SOPInstanceUID", "SeriesDescription"] -def copy_file_to_quarantine(file, working_directory, error_msg): +def copy_file_to_quarantine(file: Path, working_directory: Path, error_msg: str): """Move a DICOM file that couldn't be processed into the quarantine directory Args: - file (Path): DICOM path to be moved into quarantine + file (pathlib.Path): DICOM path to be moved into quarantine working_directory (pathlib.Path): Main working directory for pydicer error_msg (str): error message associated with the quarantined file """ @@ -61,7 +61,7 @@ def copy_file_to_quarantine(file, working_directory, error_msg): df_summary.to_csv(summary_file) -def read_quarantined_data(working_directory: Path): +def read_quarantined_data(working_directory: Path) -> pd.DataFrame: """A function to read the data from the quarantine summary. Args: diff --git a/pydicer/tool.py b/pydicer/tool.py index e2a0e37..ac7b0bc 100644 --- a/pydicer/tool.py +++ b/pydicer/tool.py @@ -2,6 +2,7 @@ import logging from logging.handlers import RotatingFileHandler from pathlib import Path +from typing import Union import pandas as pd @@ -84,7 +85,7 @@ def __init__(self, working_directory="."): self.dataset = PrepareDataset(self.working_directory) self.analyse = AnalyseData(self.working_directory) - def set_verbosity(self, verbosity): + def set_verbosity(self, verbosity: int): """Set's the verbosity of the tool to the std out (console). When 0 (not set) the tool will display a progress bar. Other values indicate Python's build in logging levels: - DEBUG: 10 @@ -137,7 +138,7 @@ def update_logging(self): console_handler.setLevel(verbosity) logger.addHandler(console_handler) - def add_input(self, input_obj): + def add_input(self, input_obj: Union[str, Path, InputBase]): """Add an input location containing DICOM data. 
Must a str, pathlib.Path or InputBase object, such as: - FileSystemInput @@ -163,7 +164,7 @@ def add_input(self, input_obj): "input_obj must be of type str, pathlib.Path or inherit InputBase" ) - def preprocess(self, force=True): + def preprocess(self, force: bool = True): """Preprocess the DICOM data in preparation for conversion Args: @@ -181,7 +182,7 @@ def preprocess(self, force=True): self.preprocessed_data = read_preprocessed_data(self.working_directory) - def run_pipeline(self, patient=None, force=True): + def run_pipeline(self, patient: Union[str, list] = None, force: bool = True): """Runs the entire conversion pipeline, including computation of DVHs and first-order radiomics. diff --git a/pydicer/utils.py b/pydicer/utils.py index 23c3b0d..f3dce30 100644 --- a/pydicer/utils.py +++ b/pydicer/utils.py @@ -8,6 +8,7 @@ import shutil from datetime import datetime from pathlib import Path +from typing import Union import pandas as pd import SimpleITK as sitk @@ -20,7 +21,7 @@ logger = logging.getLogger(__name__) -def hash_uid(uid, truncate=6): +def hash_uid(uid: str, truncate: int = 6) -> str: """Hash a UID and truncate it Args: @@ -36,7 +37,7 @@ def hash_uid(uid, truncate=6): return hash_sha.hexdigest()[:truncate] -def determine_dcm_datetime(ds, require_time=False): +def determine_dcm_datetime(ds: pydicom.Dataset, require_time: bool = False) -> datetime: """Get a date/time value from a DICOM dataset. Will attempt to pull from SeriesDate/SeriesTime field first. Will fallback to StudyDate/StudyTime or InstanceCreationDate/InstanceCreationTime if not available. @@ -70,7 +71,11 @@ def determine_dcm_datetime(ds, require_time=False): return None -def load_object_metadata(row: pd.Series, keep_tags=None, remove_tags=None): +def load_object_metadata( + row: pd.Series, + keep_tags: Union[str, list] = None, + remove_tags: Union[str, list] = None, +) -> pydicom.Dataset: """Loads the object's metadata Args: @@ -147,7 +152,7 @@ def load_object_metadata(row: pd.Series, keep_tags=None, remove_tags=None): return pydicom.Dataset.from_json(ds_dict, bulk_data_uri_handler=lambda _: None) -def load_dvh(row, struct_hash=None): +def load_dvh(row: pd.Series, struct_hash: Union[list, str] = None) -> pd.DataFrame: """Loads an object's Dose Volume Histogram (DVH) Args: @@ -201,7 +206,7 @@ def load_dvh(row, struct_hash=None): return df_result -def read_preprocessed_data(working_directory: Path): +def read_preprocessed_data(working_directory: Path) -> pd.DataFrame: """Reads the pydicer preprocessed data Args: @@ -230,10 +235,10 @@ def read_preprocessed_data(working_directory: Path): def read_converted_data( working_directory: Path, - dataset_name=CONVERTED_DIR_NAME, - patients=None, - join_working_directory=True, -): + dataset_name: str = CONVERTED_DIR_NAME, + patients: list = None, + join_working_directory: bool = True, +) -> pd.DataFrame: """Read the converted data frame from the supplied data directory. Args: @@ -287,7 +292,7 @@ def read_converted_data( return df.reset_index(drop=True) -def parse_patient_kwarg(patient): +def parse_patient_kwarg(patient: Union[list, str]) -> list: """Helper function to prepare patient list from kwarg used in functions throughout pydicer. Args: @@ -317,7 +322,7 @@ def parse_patient_kwarg(patient): return patient -def read_simple_itk_image(row): +def read_simple_itk_image(row: pd.Series) -> sitk.Image: """Reads the SimpleITK Image object given a converted dataframe row. 
Args: @@ -338,7 +343,9 @@ def read_simple_itk_image(row): return sitk.ReadImage(str(nifti_path)) -def get_iterator(iterable, length=None, unit="it", name=None): +def get_iterator( + iterable, length: int = None, unit: str = "it", name: str = None +): """Get the appropriate iterator based on the level of verbosity configured. Args: @@ -369,7 +376,7 @@ def get_iterator(iterable, length=None, unit="it", name=None): return iterator -def map_structure_name(struct_name, struct_map_dict): +def map_structure_name(struct_name: str, struct_map_dict: dict) -> str: """Function to map a structure's name according to a mapping dictionary Args: @@ -534,7 +541,7 @@ def add_structure_name_mapping( json.dump(mapping_dict, structures_map_file, ensure_ascii=False, indent=4) -def download_and_extract_zip_file(zip_url, output_directory): +def download_and_extract_zip_file(zip_url: str, output_directory: Union[str, Path]): """Downloads a zip file from the URL specified and extracts the contents to the output directory. @@ -555,7 +562,9 @@ def download_and_extract_zip_file(zip_url, output_directory): zip_ref.extractall(output_directory) -def fetch_converted_test_data(working_directory=None, dataset="HNSCC"): +def fetch_converted_test_data( + working_directory: Union[str, Path] = None, dataset: str = "HNSCC" +) -> Path: """Fetch some public data which has already been converted using PyDicer. Useful for unit testing as well as examples. diff --git a/pydicer/visualise/data.py b/pydicer/visualise/data.py index 28f1ee3..664d6c9 100644 --- a/pydicer/visualise/data.py +++ b/pydicer/visualise/data.py @@ -1,4 +1,5 @@ import logging +from typing import Union from pathlib import Path import SimpleITK as sitk import matplotlib @@ -31,7 +32,12 @@ def __init__(self, working_directory="."): self.working_directory = Path(working_directory) self.output_directory = self.working_directory.joinpath(CONVERTED_DIR_NAME) - def visualise(self, dataset_name=CONVERTED_DIR_NAME, patient=None, force=True): + def visualise( + self, + dataset_name: str = CONVERTED_DIR_NAME, + patient: Union[list, str] = None, + force: bool = True, + ): """Visualise the data in the working directory. PNG files are generates providing a snapshot of the various data objects. 
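
Note for reviewers: the annotations in this patch are additive and do not change runtime behaviour. As a quick illustration of the intent, a parameter annotated `Union[str, Path]` (as on `WebInput.working_directory` above) accepts either form at the call site — a minimal sketch, with a placeholder URL and path rather than real project data:

    from pathlib import Path

    from pydicer.input.web import WebInput

    # working_directory is annotated Union[str, Path], so both calls type-check
    WebInput("https://example.com/dicom.zip", "./testdata")
    WebInput("https://example.com/dicom.zip", Path("./testdata"))
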
From 2ad01d6fac0a40551e3a44a22d019f69e3ca6ce1 Mon Sep 17 00:00:00 2001 From: Daniel Al Mouiee Date: Fri, 2 Feb 2024 16:22:40 +1100 Subject: [PATCH 2/3] Fixed `pathlib` import --- pydicer/input/orthanc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydicer/input/orthanc.py b/pydicer/input/orthanc.py index f0e8646..096a715 100644 --- a/pydicer/input/orthanc.py +++ b/pydicer/input/orthanc.py @@ -1,7 +1,7 @@ from io import BytesIO import logging from typing import Union -from Pathlib import Path +from pathlib import Path import pydicom from pyorthanc.deprecated.client import Orthanc From 27209d538765bf4a887341cdc27382da29d1e73d Mon Sep 17 00:00:00 2001 From: Daniel Al Mouiee Date: Fri, 2 Feb 2024 16:31:45 +1100 Subject: [PATCH 3/3] Fixed `Union` usage --- pydicer/preprocess/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydicer/preprocess/data.py b/pydicer/preprocess/data.py index 5a75e27..1a13748 100644 --- a/pydicer/preprocess/data.py +++ b/pydicer/preprocess/data.py @@ -149,7 +149,7 @@ def scan_file(self, file: Union[str, Path]) -> dict: return None def preprocess( - self, input_directory: Union(Path, list), force: bool = True + self, input_directory: Union[Path, list], force: bool = True ) -> pd.DataFrame: """ Function to preprocess information regarding the data located in an Input working directory
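
Note on the two follow-up fixes: Python module names are case-sensitive, so `from Pathlib import Path` raises ModuleNotFoundError, and `typing.Union` is a special form that must be subscripted with square brackets, not called like a function. A minimal standalone sketch of the corrected idioms (illustrative only, not project code):

    from pathlib import Path  # lowercase "pathlib"; "Pathlib" would not be found
    from typing import Union

    def preprocess(input_directory: Union[Path, list]) -> None:
        # Union[...] uses subscription; Union(Path, list) raises
        # TypeError: Cannot instantiate typing.Union
        ...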