Merge pull request #120 from MannLabs/implement_featurization_rename

Implement featurization rename
MannLabs · Nov 30, 2024 · 6e29e4b · 6e29e4b
2 parents e0a00f6 + b051095
commit 6e29e4b
Show file tree

Hide file tree

Showing 19 changed files with 100 additions and 94 deletions.
diff --git a/docs/pages/images/HDF5_data_containers.png b/docs/pages/images/HDF5_data_containers.png
diff --git a/docs/pages/images/WGA_segmentation_cytosol.png b/docs/pages/images/WGA_segmentation_cytosol.png
diff --git a/docs/pages/images/WGA_segmentation_nucleus.png b/docs/pages/images/WGA_segmentation_nucleus.png
diff --git a/docs/pages/images/class_hierarchy.png b/docs/pages/images/class_hierarchy.png
diff --git a/docs/pages/images/extraction_process.png b/docs/pages/images/extraction_process.png
diff --git a/docs/pages/images/graphical_abstract.png b/docs/pages/images/graphical_abstract.png
diff --git a/docs/pages/images/graphical_abstract_without_title.png b/docs/pages/images/graphical_abstract_without_title.png
diff --git a/docs/pages/images/project_structure.png b/docs/pages/images/project_structure.png
diff --git a/docs/pages/images/scportrait_workflow_steps.png b/docs/pages/images/scportrait_workflow_steps.png
diff --git a/docs/pages/images/scportrait_workflow_steps_with_explaination.png b/docs/pages/images/scportrait_workflow_steps_with_explaination.png
diff --git a/docs/pages/images/segmentation_classes.png b/docs/pages/images/segmentation_classes.png
diff --git a/docs/pages/introduction.rst b/docs/pages/introduction.rst
@@ -16,7 +16,7 @@ The scPortrait workflow consists of up to 5 steps:
 
   3. :ref:`Extraction <quickstart_extraction>`: The segmentation masks are applied to extract single-cell images for all cells in the input images. Images of individual cells are rescaled to [0, 1] per channel.
 
-  4. :ref:`Featurization <quickstart_featurization>`: The image-based phenotype of each individual cell in the extracted single-cell dataset is classified using the specified classification method. Multiple classification runs can be performed on the same dataset using different classification methods.
+  4. :ref:`Featurization <quickstart_featurization>`: The image-based phenotype of each individual cell in the extracted single-cell dataset is featurized using the specified featurization method. Multiple featurization runs can be performed on the same dataset using different methods.
 
   5. :ref:`Selection <quickstart_selection>`: Cutting instructions for the isolation of selected individual cells by laser microdissection are generated. The cutting shapes are written to an ``.xml`` file which can be loaded on a leica LMD microscope for automated cell excision.
 
@@ -92,7 +92,7 @@ The size in px of the output images can be set in the ``config.yml`` file and ty
 Featurization
 ==============
 
-During featurization, the extracted single cell images are passed to a phenotype analysis method that either calculates a set of features for each cell or directly assigns a class label to each cell. This analysis method will need to be adapted to each use case. For example, in our `first publication <https://doi.org/10.1101/2023.06.01.542416>`_ we describe a deep learning-based binary image classifier that identifies individual cells defective in a biological process called "autophagy". Multiple classification runs can be performed on the same dataset so that different classification approaches can be used in parallel.
+During featurization, the extracted single-cell images are passed to a phenotype analysis method that either generates a set of features for each cell or directly assigns a class label. This analysis method will need to be adapted to each use case. For example, in our `first publication <https://doi.org/10.1101/2023.06.01.542416>`_ we describe a deep learning-based binary image classifier that identifies individual cells defective in a biological process called "autophagy". Multiple featurization runs can be performed on the same dataset so that different featurization approaches can be used in parallel.
 
 .. _quickstart_selection:
 

diff --git a/docs/pages/module/pipeline.rst b/docs/pages/module/pipeline.rst
@@ -46,19 +46,19 @@ HDF5CellExtraction
     :show-inheritance:
 
 
-classification
-##############
+Featurization
+#############
 
 MLClusterClassifier
 ===================
-.. autoclass:: scportrait.pipeline.classification.MLClusterClassifier
+.. autoclass:: scportrait.pipeline.featurization.MLClusterClassifier
     :members:
 
     .. automethod:: __call__
 
 CellFeaturizer
 ==============
-.. autoclass:: scportrait.pipeline.classification.CellFeaturizer
+.. autoclass:: scportrait.pipeline.featurization.CellFeaturizer
     :members:
 
     .. automethod:: __call__

diff --git a/docs/pages/notebooks/Example_Notebook_TimeCourse_Project.ipynb b/docs/pages/notebooks/Example_Notebook_TimeCourse_Project.ipynb
@@ -101,8 +101,8 @@
     "scPortrait works with a project structure that is kept the same across different projects. Each project contains all of the results from one run. Each Project has the same general structure:\n",
     "\n",
     "    .\n",
-    "    ├── classification\n",
-    "    │   └── classifier_name\n",
+    "    ├── featurization\n",
+    "    │   └── featurizer_name\n",
     "    │       └── processing.log\n",
     "    ├── config.yml\n",
     "    ├── extraction\n",

diff --git a/docs/pages/notebooks/example_scPortrait_project.ipynb b/docs/pages/notebooks/example_scPortrait_project.ipynb
@@ -13,7 +13,7 @@
     "\n",
     "2. **extraction**: The segmentation masks are applied to extract single-cell images for all cells in the input images. Images of individual cells are rescaled to [0, 1] per channel.\n",
     "\n",
-    "3. **classification**: The image-based phenotype of each individual cell in the extracted single-cell dataset is classified using the specified classification method. Multiple classification runs can be performed on the same dataset using different classification methods. Here we utilize the pretrained binary classifier from the original [SPARCS manuscript](https://doi.org/10.1101/2023.06.01.542416) that identifies individual cells defective in a biological process called \"autophagy\". \n",
+    "3. **featurization**: The image-based phenotype of each individual cell in the extracted single-cell dataset is featurized using the specified featurization method. Multiple featurization runs can be performed on the same dataset using different methods. Here we utilize the pretrained binary classifier from the original [SPARCS manuscript](https://doi.org/10.1101/2023.06.01.542416) that identifies individual cells defective in a biological process called \"autophagy\". \n",
     "\n",
     "4. **selection**: Cutting instructions for the isolation of selected individual cells by laser microdissection are generated. The cutting shapes are written to an ``.xml`` file which can be loaded on a leica LMD microscope for automated cell excision.\n",
     "\n",
@@ -1390,10 +1390,9 @@
     "Within the `config.yml` we specify which model should be used for inference and we can give it a name. \n",
     "\n",
     "                MLClusterClassifier:\n",
-    "                    channel_classification: 4\n",
-    "                    threads: 24 #\n",
+    "                    channel_selection: 4\n",
     "                    batch_size: 900\n",
-    "                    dataloader_worker: 0 #needs to be 0 if using cpu\n",
+    "                    dataloader_worker_number: 0 #needs to be 0 if using cpu\n",
     "                    standard_scale: False\n",
     "                    exp_transform: False\n",
     "                    log_transform: False\n",
@@ -1411,14 +1410,14 @@
     "<img src=\"../images/classifying_autophagy.png\" alt=\"autophagy classification with example cells\" width=\"800\"/>\n",
     "\n",
     "\n",
-    "The inference results will be written to a new folder generated under `classification` with this name. \n",
+    "The inference results will be written to a new folder generated under `featurization` with this name. \n",
     "\n",
     "If we want to use a model we trained ourselves that is not yet included within the scPortrait library we can simply replace the network name in the config with the path to the checkpoint file generated by pytorch.\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -1462,7 +1461,7 @@
     }
    ],
    "source": [
-    "project.classify()"
+    "project.featurize()"
    ]
   },
   {

diff --git a/docs/pages/pipeline/config.rst b/docs/pages/pipeline/config.rst
@@ -40,11 +40,11 @@ The configuration file is a ``.yml`` file which specifies all of the parameters
         hdf5_rdcc_w0: 1
         hdf5_rdcc_nslots: 50000
     CellFeaturizer:
-        channel_classification: 4
+        channel_selection: 4
         batch_size: 900
-        dataloader_worker: 0 #needs to be 0 if using cpu
+        dataloader_worker_number: 0 #needs to be 0 if using cpu
         inference_device: "cpu"
-        screen_label: "Ch3_Featurization"
+        label: "Ch3_Featurization"
     LMDSelection:
         processes: 20
         segmentation_channel: 0

diff --git a/src/scportrait/pipeline/classification.py → src/scportrait/pipeline/featurization.py b/src/scportrait/pipeline/classification.py → src/scportrait/pipeline/featurization.py
@@ -18,14 +18,15 @@
 from scportrait.tools.ml.plmodels import MultilabelSupervisedModel
 
 
-class _ClassificationBase(ProcessingStep):
+class _FeaturizationBase(ProcessingStep):
     PRETRAINED_MODEL_NAMES = [
         "autophagy_classifier",
     ]
     MASK_NAMES = ["nucleus", "cytosol"]
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
+        self._check_config()
 
         self.label = self.config["label"]
         self.num_workers = self.config["dataloader_worker_number"]
@@ -36,40 +37,46 @@ def __init__(self, *args, **kwargs):
         self.transforms = None
         self.expected_imagesize = None
 
-        self._setup_channel_classification()
+        self._setup_channel_selection()
 
         # setup deep debugging
         self.deep_debug = False
 
         if "overwrite_run_path" not in self.__dict__.keys():
             self.overwrite_run_path = self.overwrite
 
+    def _check_config(self):
+        """Check if all required parameters are present in the config file."""
+
+        assert "label" in self.config.keys(), "No label specified in config file."
+        assert "dataloader_worker_number" in self.config.keys(), "No dataloader_worker_number specified in config file."
+        assert "batch_size" in self.config.keys(), "No batch_size specified in config file."
+
     def _setup_output(self):
-        """Helper function to generate the output directory for the classification results."""
+        """Helper function to generate the output directory for the featurization results."""
 
-        # Create classification directory
         if not os.path.isdir(self.directory):
             os.makedirs(self.directory)
 
         self.run_path = os.path.join(self.directory, f"{self.data_type}_{self.label}")
 
         if not os.path.isdir(self.run_path):
             os.makedirs(self.run_path)
-            self.log(f"Created new directory for classification results: {self.run_path}")
+            self.log(f"Created new directory for featurization results: {self.run_path}")
         else:
             if self.overwrite:
-                self.log("Overwrite flag is set, deleting existing directory for classification results.")
+                self.log("Overwrite flag is set, deleting existing directory for featurization results.")
                 shutil.rmtree(self.run_path)
                 os.makedirs(self.run_path)
-                self.log(f"Created new directory for classification results: {self.run_path}")
+                self.log(f"Created new directory for featurization results: {self.run_path}")
             elif self.overwrite_run_path:
-                self.log("Overwrite flag is set, deleting existing directory for classification results.")
+                self.log("Overwrite flag is set, deleting existing directory for featurization results.")
                 shutil.rmtree(self.run_path)
                 os.makedirs(self.run_path)
-                self.log(f"Created new directory for classification results: {self.run_path}")
+                self.log(f"Created new directory for featurization results: {self.run_path}")
             else:
                 raise ValueError(
-                    f"Directory for classification results already exists at {self.run_path}. Please set the overwrite flag to True if you wish to overwrite the existing directory."
+                    f"Directory for featurization results already exists at {self.run_path}. Please set the overwrite flag to True if you wish to overwrite the existing directory."
                 )
 
     def _setup_log_transform(self):
@@ -78,11 +85,11 @@ def _setup_log_transform(self):
         else:
             self.log_transform = False  # default value
 
-    def _setup_channel_classification(self):
-        if "channel_classification" in self.config.keys():
-            self.channel_classification = self.config["channel_classification"]
+    def _setup_channel_selection(self):
+        if "channel_selection" in self.config.keys():
+            self.channel_selection = self.config["channel_selection"]
         else:
-            self.channel_classification = None
+            self.channel_selection = None
 
     def _detect_automatic_inference_device(self):
         """Automatically detect the best inference device available on the system."""
@@ -98,7 +105,7 @@ def _detect_automatic_inference_device(self):
 
     def _setup_inference_device(self):
         """
-        Configure the classification run to use the specified inference device.
+        Configure the featurization run to use the specified inference device.
         If no device is specified, the device is automatically detected.
         """
 
@@ -159,7 +166,7 @@ def _setup_inference_device(self):
             self.log(f"Automatically configured inferece device to {self.inference_device}")
 
     def _general_setup(self):
-        """Helper function to execute all setup functions that are common to all classification steps."""
+        """Helper function to execute all setup functions that are common to all featurization steps."""
 
         self._setup_output()
         self._setup_log_transform()
@@ -195,7 +202,7 @@ def _get_model_specs(self):
                 self.define_model_class(self.DEFAULT_MODEL_CLASS)  # default model class
         else:
             self.log(
-                f"Model class already defined as {self.model_class} will not overwrite. If this behaviour was unintended please set the model class to none by executing 'project.classification_f.model_class = None'"
+                f"Model class already defined as {self.model_class} will not overwrite. If this behaviour was unintended please set the model class to none by executing 'project.featurization_f.model_class = None'"
             )
 
         if "model_type" in self.config.keys():
@@ -235,7 +242,7 @@ def _get_gpu_memory_usage(self):
     ### Functions for model loading and setup
 
     def _assign_model(self, model):
-        self.log("Model assigned to classification function.")
+        self.log("Model assigned to featurization function.")
         self.model = model
 
         # check if the hparams specify an expected image size
@@ -247,7 +254,6 @@ def define_model_class(self, model_class, force_load=False):
             model_class = eval(model_class)  # convert string to class by evaluating it
 
         # check that it is a valid model class
-
         if force_load:
             if not issubclass(model_class, pl.LightningModule):
                 Warning(
@@ -427,7 +433,7 @@ def generate_dataloader(
                 dir_labels=[0],
                 transform=t,
                 return_id=True,
-                select_channel=self.channel_classification,
+                select_channel=self.channel_selection,
             )
 
         if size > 0:
@@ -633,7 +639,7 @@ def _post_processing_cleanup(self):
         # reset to init values to ensure that subsequent runs are not affected by previous runs
         self.model_class = None
         self.transforms = None
-        self.channel_classification = None
+        self.channel_selection = None
         self.model = None
 
         self._clear_cache()
@@ -652,7 +658,7 @@ def _post_processing_cleanup(self):
 ###############################################
 
 
-class MLClusterClassifier(_ClassificationBase):
+class MLClusterClassifier(_FeaturizationBase):
     """
     Class for classifying single cells using a pre-trained machine learning model.
 
@@ -761,7 +767,7 @@ def _setup_encoders(self):
     def _setup_transforms(self) -> None:
         if self.transforms is not None:
             self.log(
-                "Transforms already configured manually. Will not overwrite. If this behaviour was unintended please set the transforms to None by executing 'project.classification_f.transforms = None'"
+                "Transforms already configured manually. Will not overwrite. If this behaviour was unintended please set the transforms to None by executing 'project.featurization_f.transforms = None'"
             )
             return
 
@@ -828,7 +834,7 @@ class based on the previous single-cell extraction. Therefore, only the second a
 
             MLClusterClassifier:
                 # Channel number on which the classification should be performed
-                channel_classification: 4
+                channel_selection: 4
 
                 # Number of threads to use for dataloader
                 dataloader_worker_number: 24
@@ -895,7 +901,7 @@ class based on the previous single-cell extraction. Therefore, only the second a
             self._post_processing_cleanup()
 
 
-class EnsembleClassifier(_ClassificationBase):
+class EnsembleClassifier(_FeaturizationBase):
     """
     This class takes a pre-trained ensemble of models and uses it to classify extracted single cell datasets.
     """
@@ -986,7 +992,7 @@ class based on the previous single-cell extraction. Therefore, no parameters nee
 
                 EnsembleClassifier:
                     # channel number on which the classification should be performed
-                    channel_classification: 4
+                    channel_selection: 4
 
                     #number of threads to use for dataloader
                     dataloader_worker_number: 24
@@ -1038,7 +1044,7 @@ class based on the previous single-cell extraction. Therefore, no parameters nee
 
 
 ####### CellFeaturization based on Classic Featurecalculation #######
-class _cellFeaturizerBase(_ClassificationBase):
+class _cellFeaturizerBase(_FeaturizationBase):
     CLEAN_LOG = True
     DEFAULT_DATA_LOADER = HDF5SingleCellDataset
 
@@ -1219,8 +1225,8 @@ def _write_results_sdata(self, results, mask_type="seg_all"):
             # define name to save table under
             self.label.replace("CellFeaturizer_", "")  # remove class name from label to ensure we dont have duplicates
 
-            if self.channel_classification is not None:
-                table_name = f"{self.__class__.__name__ }_{self.config['channel_classification']}_{self.MASK_NAMES[0]}"
+            if self.channel_selection is not None:
+                table_name = f"{self.__class__.__name__ }_{self.config['channel_selection']}_{self.MASK_NAMES[0]}"
             else:
                 table_name = f"{self.__class__.__name__ }_{self.MASK_NAMES[0]}"
 
@@ -1253,8 +1259,8 @@ def _write_results_sdata(self, results, mask_type="seg_all"):
             )
 
             # define name to save table under
-            if self.channel_classification is not None:
-                table_name = f"{self.__class__.__name__ }_{self.config['channel_classification']}_{self.MASK_NAMES[1]}"
+            if self.channel_selection is not None:
+                table_name = f"{self.__class__.__name__ }_{self.config['channel_selection']}_{self.MASK_NAMES[1]}"
             else:
                 table_name = f"{self.__class__.__name__ }_{self.MASK_NAMES[1]}"
 
@@ -1284,7 +1290,7 @@ class CellFeaturizer(_cellFeaturizerBase):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
-        self.channel_classification = None  # ensure that all images are passed to the function
+        self.channel_selection = None  # ensure that all images are passed to the function
 
     def _setup(self):
         self._general_setup()
@@ -1328,7 +1334,7 @@ def process(self, extraction_dir, size=0):
 
             CellFeaturizer:
                 # Channel number on which the featurization should be performed
-                channel_classification: 4
+                channel_selection: 4
 
                 # Number of threads to use for dataloader
                 dataloader_worker_number: 0 # needs to be 0 if using CPU
@@ -1395,9 +1401,9 @@ def __init__(self, *args, **kwargs):
 
     def _setup_channel_selection(self):
         if self.n_masks == 2:
-            self.channel_classification = [0, 1, self.channel_classification]
+            self.channel_selection = [0, 1, self.channel_selection]
         if self.n_masks == 1:
-            self.channel_classification = [0, self.channel_classification]
+            self.channel_selection = [0, self.channel_selection]
         return
 
     def _setup(self):
@@ -1407,7 +1413,7 @@ def _setup(self):
         self._get_channel_specs()
 
     def process(self, extraction_dir, size=0):
-        self.log(f"Started CellFeaturization of selected channel {self.channel_classification}.")
+        self.log(f"Started CellFeaturization of selected channel {self.channel_selection}.")
 
         # perform setup
         self._setup()
@@ -1420,7 +1426,7 @@ def process(self, extraction_dir, size=0):
         )
 
         # generate column names
-        channel_name = self.channel_names[self.channel_classification[-1] - self.n_masks]
+        channel_name = self.channel_names[self.channel_selection[-1] - self.n_masks]
         self._generate_column_names(n_masks=self.n_masks, n_channels=1, channel_names=[channel_name])
 
         # define inference function