
Commit

Partial code update to maintain forward compatibility with numpy/pandas/anndata/scipy
asistradition committed Jul 2, 2024
1 parent d888caa commit 3f58c73
Showing 19 changed files with 738 additions and 248 deletions.
16 changes: 9 additions & 7 deletions inferelator/crossvalidation_workflow.py
@@ -1,10 +1,10 @@
"""
This is a manager which will take an Inferelator workflow and repeatedly run it with different parameters.
This is implemented using deep copies; it is therefore memory-intensive.
This is a manager which will take an Inferelator workflow and repeatedly
run it with different parameters.
This is implemented using deep copies;
it is therefore memory-intensive.
"""

from __future__ import print_function

# I hate py2 now
try:
from builtins import FileExistsError
@@ -20,8 +20,6 @@
import numpy as np
import pandas as pd


from inferelator.distributed.inferelator_mp import MPControl
from inferelator.utils import Validator as check
from inferelator import utils
from inferelator import workflow
@@ -102,7 +100,11 @@ def workflow(self):

@workflow.setter
def workflow(self, wkf):
assert check.argument_is_subclass(wkf, workflow.WorkflowBase, allow_none=True)
assert check.argument_is_subclass(
wkf,
workflow.WorkflowBase,
allow_none=True
)
if self._baseline_workflow is not None:
warnings.warn("Replacing stored workflow with a new workflow")
self._baseline_workflow = wkf
11 changes: 8 additions & 3 deletions inferelator/distributed/joblib_controller.py
@@ -53,9 +53,14 @@ def map(
check.argument_callable(func)
check.argument_list_type(args, collections.abc.Iterable)

return [r for r in joblib.Parallel(n_jobs=cls.processes)(
joblib.delayed(func)(*a, **kwargs) for a in zip(*args)
)]
with joblib.parallel_config(
backend="loky",
inner_max_num_threads=1
):

return [r for r in joblib.Parallel(n_jobs=cls.processes)(
joblib.delayed(func)(*a, **kwargs) for a in zip(*args)
)]

@classmethod
def shutdown(cls):
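The map() change above wraps the Parallel call in joblib.parallel_config so that each loky worker limits its own nested thread pool, keeping numpy/scipy BLAS threads inside the workers from oversubscribing cores. A minimal sketch of the same pattern, assuming joblib >= 1.3 (where parallel_config was added) and using a hypothetical square() helper rather than the inferelator map interface:

import joblib

def square(x):
    # Hypothetical stand-in for the mapped workflow function
    return x * x

# Each loky worker is restricted to one inner thread, so nested
# BLAS/OpenMP pools cannot oversubscribe the machine.
with joblib.parallel_config(backend="loky", inner_max_num_threads=1):
    results = joblib.Parallel(n_jobs=2)(
        joblib.delayed(square)(i) for i in range(8)
    )

print(results)  # [0, 1, 4, 9, 16, 25, 36, 49]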
102 changes: 79 additions & 23 deletions inferelator/postprocessing/f1_score.py
@@ -1,21 +1,34 @@
import numpy as np

from inferelator.postprocessing.precision_recall import RankSummaryPR
from inferelator.postprocessing import (TARGET_COLUMN, REGULATOR_COLUMN, CONFIDENCE_COLUMN,
F1_COLUMN, PRECISION_COLUMN, RECALL_COLUMN)
from inferelator.postprocessing import (
TARGET_COLUMN,
REGULATOR_COLUMN,
CONFIDENCE_COLUMN,
F1_COLUMN,
PRECISION_COLUMN,
RECALL_COLUMN
)

import matplotlib

# If matplotlib is being an idiot and trying to set a tkinter backend, switch to agg
if matplotlib.get_backend() in (i for i in matplotlib.rcsetup.interactive_bk):
# If matplotlib is being an idiot and trying to set a tkinter backend,
# switch to agg
if matplotlib.get_backend() in (
i
for i in matplotlib.backends.backend_registry.list_builtin(
matplotlib.backends.BackendFilter.INTERACTIVE
)
):
matplotlib.use('agg')


import matplotlib.pyplot as plt


class RankSummaryF1(RankSummaryPR):
"""
This class extends RankSumming and calculates Matthews correlation coefficient
This class extends RankSumming and calculates F1 score
"""

name = "F1"
@@ -34,17 +47,37 @@ def optconff1(self):
def maxf1(self):
return self.calculate_opt_f1(self.filtered_data)

def __init__(self, rankable_data, gold_standard, filter_method='keep_all_gold_standard'):
super(RankSummaryPR, self).__init__(rankable_data, gold_standard, filter_method=filter_method)

# Calculate the precision and recall and store them with confidence data
self.filtered_data = self.calculate_precision_recall(self.filtered_data.copy(), transform_ties='mean')
def __init__(
self,
rankable_data,
gold_standard,
filter_method='keep_all_gold_standard'
):
super(RankSummaryPR, self).__init__(
rankable_data,
gold_standard,
filter_method=filter_method
)

# Calculate the precision and recall and store them with confidence
# data
self.filtered_data = self.calculate_precision_recall(
self.filtered_data.copy(),
transform_ties='mean'
)
self.filtered_data = self.calculate_f1(self.filtered_data.copy())

# Join the filtered F1 score onto the full confidences
join_data = self.filtered_data.loc[:, [TARGET_COLUMN, REGULATOR_COLUMN, F1_COLUMN]]
join_data = join_data.set_index([TARGET_COLUMN, REGULATOR_COLUMN])
self.confidence_data = self.confidence_data.join(join_data, on=[TARGET_COLUMN, REGULATOR_COLUMN])
join_data = self.filtered_data.loc[
:,
[TARGET_COLUMN, REGULATOR_COLUMN, F1_COLUMN]
].set_index(
[TARGET_COLUMN, REGULATOR_COLUMN]
)
self.confidence_data = self.confidence_data.join(
join_data,
on=[TARGET_COLUMN, REGULATOR_COLUMN]
)

def score(self):
return self.name, self.maxf1
@@ -59,8 +92,15 @@ def output_curve(self, ax=None, figsize=(6, 4)):

# Extract the recall and precision data
curve = self.curve_dataframe()
self.plot_f1_conf(curve[F1_COLUMN].values, curve[CONFIDENCE_COLUMN].values, self.maxf1, self.optconff1, ax,
num_edges=(self.confidence_data[CONFIDENCE_COLUMN] >= self.optconff1).sum())
self.plot_f1_conf(
curve[F1_COLUMN].values,
curve[CONFIDENCE_COLUMN].values,
self.maxf1,
self.optconff1,
ax,
num_edges=(
self.confidence_data[CONFIDENCE_COLUMN] >= self.optconff1
).sum())

return ax

@@ -75,12 +115,22 @@ def plot_f1_conf(f1, conf, optf1, optconf, ax, num_edges=None):
ax.set_xlim(1, 0)
ax.set_ylim(0, 1)
ax.set_ylabel('F1')
ax.vlines(float(optconf), 0, 1, transform=ax.get_xaxis_transform(), colors='r', linestyles='dashed')

_msg = "max F1 = {optf1:.4f}\noptimal conf = {optconf:.4f}\nnum_edges = {n}".format(optf1=optf1,
optconf=optconf,
n=num_edges)
ax.annotate(_msg, xy=(0.4, 0.075), xycoords='axes fraction')
ax.vlines(
float(optconf),
0,
1,
transform=ax.get_xaxis_transform(),
colors='r',
linestyles='dashed'
)

ax.annotate(
f"max F1 = {optf1:.4f}\n"
f"optimal conf = {optconf:.4f}\n"
f"num_edges = {num_edges}",
xy=(0.4, 0.075),
xycoords='axes fraction'
)

return ax

@@ -92,12 +142,18 @@ def calculate_opt_f1(data):
@staticmethod
def calculate_opt_conf_f1(data):

return data.loc[data[F1_COLUMN] >= np.max(data[F1_COLUMN]), CONFIDENCE_COLUMN].min()
return data.loc[
data[F1_COLUMN] >= np.max(data[F1_COLUMN]),
CONFIDENCE_COLUMN
].min()

@staticmethod
def calculate_f1(data):

data[F1_COLUMN] = RankSummaryF1.pr_to_f1(data[PRECISION_COLUMN], data[RECALL_COLUMN])
data[F1_COLUMN] = RankSummaryF1.pr_to_f1(
data[PRECISION_COLUMN],
data[RECALL_COLUMN]
)
return data

@staticmethod
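In the f1_score.py changes above, the backend check swaps the deprecated matplotlib.rcsetup.interactive_bk list for the backend registry introduced in matplotlib 3.9, which is the forward-compatibility piece of this commit. A minimal sketch of that check with a fallback for older matplotlib releases (the fallback is an assumption, not part of this commit):

import matplotlib

try:
    # matplotlib >= 3.9 exposes a registry of builtin backends
    from matplotlib.backends import BackendFilter, backend_registry
    interactive = backend_registry.list_builtin(BackendFilter.INTERACTIVE)
except ImportError:
    # Older releases only have the (now deprecated) module-level list
    interactive = matplotlib.rcsetup.interactive_bk

# Switch to the non-interactive agg backend if a GUI backend was selected
if matplotlib.get_backend() in interactive:
    matplotlib.use('agg')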
117 changes: 89 additions & 28 deletions inferelator/postprocessing/matthews_correlation.py
@@ -1,23 +1,36 @@
from math import isfinite
import numpy as np
import warnings

from inferelator.postprocessing.model_performance import RankSummingMetric
from inferelator.postprocessing import (TARGET_COLUMN, REGULATOR_COLUMN, CONFIDENCE_COLUMN, GOLD_STANDARD_COLUMN,
MCC_COLUMN, TP, FP, TN, FN)
from inferelator.postprocessing import (
TARGET_COLUMN,
REGULATOR_COLUMN,
CONFIDENCE_COLUMN,
MCC_COLUMN,
TP,
FP,
TN,
FN
)

import matplotlib

# If matplotlib is being an idiot and trying to set a tkinter backend, switch to agg
if matplotlib.get_backend() in (i for i in matplotlib.rcsetup.interactive_bk):
# If matplotlib is being an idiot and trying to set a tkinter backend,
# switch to agg
if matplotlib.get_backend() in (
i
for i in matplotlib.backends.backend_registry.list_builtin(
matplotlib.backends.BackendFilter.INTERACTIVE
)
):
matplotlib.use('agg')

import matplotlib.pyplot as plt


class RankSummaryMCC(RankSummingMetric):
"""
This class extends RankSumming and calculates Matthews correlation coefficient
This class extends RankSumming and calculates Matthews
correlation coefficient
"""

name = "MCC"
@@ -41,17 +54,33 @@ def nnzmmc(self):

# Plotter function

def __init__(self, rankable_data, gold_standard, filter_method='keep_all_gold_standard'):

super(RankSummaryMCC, self).__init__(rankable_data, gold_standard, filter_method=filter_method)

# Calculate the precision and recall and store them with confidence data
def __init__(
self,
rankable_data,
gold_standard,
filter_method='keep_all_gold_standard'
):

super(RankSummaryMCC, self).__init__(
rankable_data,
gold_standard,
filter_method=filter_method
)

# Calculate the precision and recall and store them with confidence
# data
self.filtered_data = self.calculate_mcc(self.filtered_data.copy())

# Join the filtered MCC onto the full confidences
join_data = self.filtered_data.loc[:, [TARGET_COLUMN, REGULATOR_COLUMN, MCC_COLUMN]]
join_data = join_data.set_index([TARGET_COLUMN, REGULATOR_COLUMN])
self.confidence_data = self.confidence_data.join(join_data, on=[TARGET_COLUMN, REGULATOR_COLUMN])
join_data = self.filtered_data.loc[
:,
[TARGET_COLUMN, REGULATOR_COLUMN, MCC_COLUMN]
].set_index([TARGET_COLUMN, REGULATOR_COLUMN])

self.confidence_data = self.confidence_data.join(
join_data,
on=[TARGET_COLUMN, REGULATOR_COLUMN]
)

def score(self):

@@ -69,8 +98,16 @@ def output_curve(self, ax=None, figsize=(6, 4)):

# Extract the recall and precision data
curve = self.curve_dataframe()
self.plot_mcc_conf(curve[MCC_COLUMN].values, curve[CONFIDENCE_COLUMN].values, self.maxmcc, self.optconfmcc, ax,
num_edges=(self.confidence_data[CONFIDENCE_COLUMN] >= self.optconfmcc).sum())
self.plot_mcc_conf(
curve[MCC_COLUMN].values,
curve[CONFIDENCE_COLUMN].values,
self.maxmcc,
self.optconfmcc,
ax,
num_edges=(
self.confidence_data[CONFIDENCE_COLUMN] >= self.optconfmcc
).sum()
)

return ax

@@ -88,12 +125,22 @@ def plot_mcc_conf(mcc, conf, optmcc, optconf, ax, num_edges=None):
ax.set_xlim(1, 0)
ax.set_ylim(y_min, 1)
ax.set_ylabel('MCC')
ax.vlines(float(optconf), 0, 1, transform=ax.get_xaxis_transform(), colors='r', linestyles='dashed')

_msg = "max MCC = {optmcc:.4f}\noptimal conf = {optconf:.4f}\nnum_edges = {n}".format(optmcc=optmcc,
optconf=optconf,
n=num_edges)
ax.annotate(_msg, xy=(0.4, 0.075), xycoords='axes fraction')
ax.vlines(
float(optconf),
0,
1,
transform=ax.get_xaxis_transform(),
colors='r',
linestyles='dashed'
)

ax.annotate(
f"max MCC = {optmcc:.4f}\n"
f"optimal conf = {optconf:.4f}\n"
f"num_edges = {num_edges}",
xy=(0.4, 0.075),
xycoords='axes fraction'
)

return ax

@@ -105,7 +152,10 @@ def calculate_opt_mcc(data):
@staticmethod
def calculate_opt_conf_mcc(data):

return data.loc[data[MCC_COLUMN] >= np.max(data[MCC_COLUMN]), CONFIDENCE_COLUMN].min()
return data.loc[
data[MCC_COLUMN] >= np.max(data[MCC_COLUMN]),
CONFIDENCE_COLUMN
].min()

@staticmethod
def calculate_nnz_mcc(data, conf):
@@ -116,14 +166,25 @@ def calculate_nnz_mcc(data, conf):
def calculate_mcc(data):

df = RankSummingMetric.compute_confusion_matrix(data)
data[MCC_COLUMN] = RankSummaryMCC.confusion_to_mcc(df[TP], df[TN], df[FP], df[FN])
data[MCC_COLUMN] = RankSummaryMCC.confusion_to_mcc(
df[TP],
df[TN],
df[FP],
df[FN]
)
return data

@staticmethod
def confusion_to_mcc(tp, tn, fp, fn):
denominator = np.sqrt(tp + fp) * np.sqrt(tp + fn) * np.sqrt(tn + fp) * np.sqrt(tn + fn)

# If any denominator value is 0, MCC is 0/0 and by convention will be set to 0.0
denominator = (
np.sqrt(tp + fp) *
np.sqrt(tp + fn) *
np.sqrt(tn + fp) *
np.sqrt(tn + fn)
)

# If any denominator value is 0, MCC is 0/0 and by
# convention will be set to 0.0
denominator[denominator == 0] = 1.0

return (tp * tn - fp * fn) / denominator
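The confusion_to_mcc code above is the standard Matthews correlation coefficient, with the denominator written as a product of square roots and the 0/0 case mapped to 0 by convention. A small self-contained check of that formula on made-up confusion counts (a sketch, not the inferelator class itself):

import numpy as np

def confusion_to_mcc(tp, tn, fp, fn):
    # MCC = (TP*TN - FP*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN))
    denominator = (
        np.sqrt(tp + fp) *
        np.sqrt(tp + fn) *
        np.sqrt(tn + fp) *
        np.sqrt(tn + fn)
    )
    # When the denominator is 0 the numerator is also 0, so dividing
    # by 1 instead implements the MCC = 0 convention.
    denominator = np.where(denominator == 0, 1.0, denominator)
    return (tp * tn - fp * fn) / denominator

# A perfect classifier on 10 positives and 90 negatives scores 1.0
print(confusion_to_mcc(np.array([10.0]), np.array([90.0]),
                       np.array([0.0]), np.array([0.0])))  # [1.]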
