
Commit

Partial code update to maintain forward compatibility with numpy/pandas/anndata/scipy
asistradition committed Jul 2, 2024
1 parent d888caa commit 3f58c73
Showing 19 changed files with 738 additions and 248 deletions.
16 changes: 9 additions & 7 deletions inferelator/crossvalidation_workflow.py
@@ -1,10 +1,10 @@
"""
This is a manager which will take an Inferelator workflow and repeatedly run it with different parameters.
This is implemented using deep copies; it is therefore memory-intensive.
This is a manager which will take an Inferelator workflow and repeatedly
run it with different parameters.
This is implemented using deep copies;
it is therefore memory-intensive.
"""

from __future__ import print_function

# I hate py2 now
try:
from builtins import FileExistsError
@@ -20,8 +20,6 @@
import numpy as np
import pandas as pd


from inferelator.distributed.inferelator_mp import MPControl
from inferelator.utils import Validator as check
from inferelator import utils
from inferelator import workflow
@@ -102,7 +100,11 @@ def workflow(self):

@workflow.setter
def workflow(self, wkf):
assert check.argument_is_subclass(wkf, workflow.WorkflowBase, allow_none=True)
assert check.argument_is_subclass(
wkf,
workflow.WorkflowBase,
allow_none=True
)
if self._baseline_workflow is not None:
warnings.warn("Replacing stored workflow with a new workflow")
self._baseline_workflow = wkf
11 changes: 8 additions & 3 deletions inferelator/distributed/joblib_controller.py
@@ -53,9 +53,14 @@ def map(
check.argument_callable(func)
check.argument_list_type(args, collections.abc.Iterable)

return [r for r in joblib.Parallel(n_jobs=cls.processes)(
joblib.delayed(func)(*a, **kwargs) for a in zip(*args)
)]
with joblib.parallel_config(
backend="loky",
inner_max_num_threads=1
):

return [r for r in joblib.Parallel(n_jobs=cls.processes)(
joblib.delayed(func)(*a, **kwargs) for a in zip(*args)
)]

@classmethod
def shutdown(cls):
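The map() change above wraps the Parallel call in joblib.parallel_config so that each loky worker limits its own nested thread pool, keeping numpy/scipy BLAS threads inside the workers from oversubscribing cores. A minimal sketch of the same pattern, assuming joblib >= 1.3 (where parallel_config was added) and using a hypothetical square() helper rather than the inferelator map interface:

import joblib

def square(x):
    # Hypothetical stand-in for the mapped workflow function
    return x * x

# Each loky worker is restricted to one inner thread, so nested
# BLAS/OpenMP pools cannot oversubscribe the machine.
with joblib.parallel_config(backend="loky", inner_max_num_threads=1):
    results = joblib.Parallel(n_jobs=2)(
        joblib.delayed(square)(i) for i in range(8)
    )

print(results)  # [0, 1, 4, 9, 16, 25, 36, 49]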
102 changes: 79 additions & 23 deletions inferelator/postprocessing/f1_score.py
@@ -1,21 +1,34 @@
import numpy as np

from inferelator.postprocessing.precision_recall import RankSummaryPR
from inferelator.postprocessing import (TARGET_COLUMN, REGULATOR_COLUMN, CONFIDENCE_COLUMN,
F1_COLUMN, PRECISION_COLUMN, RECALL_COLUMN)
from inferelator.postprocessing import (
TARGET_COLUMN,
REGULATOR_COLUMN,
CONFIDENCE_COLUMN,
F1_COLUMN,
PRECISION_COLUMN,
RECALL_COLUMN
)

import matplotlib

# If matplotlib is being an idiot and trying to set a tkinter backend, switch to agg
if matplotlib.get_backend() in (i for i in matplotlib.rcsetup.interactive_bk):
# If matplotlib is being an idiot and trying to set a tkinter backend,
# switch to agg
if matplotlib.get_backend() in (
i
for i in matplotlib.backends.backend_registry.list_builtin(
matplotlib.backends.BackendFilter.INTERACTIVE
)
):
matplotlib.use('agg')


import matplotlib.pyplot as plt


class RankSummaryF1(RankSummaryPR):
"""
This class extends RankSumming and calculates Matthews correlation coefficient
This class extends RankSumming and calculates F1 score
"""

name = "F1"
@@ -34,17 +47,37 @@ def optconff1(self):
def maxf1(self):
return self.calculate_opt_f1(self.filtered_data)

def __init__(self, rankable_data, gold_standard, filter_method='keep_all_gold_standard'):
super(RankSummaryPR, self).__init__(rankable_data, gold_standard, filter_method=filter_method)

# Calculate the precision and recall and store them with confidence data
self.filtered_data = self.calculate_precision_recall(self.filtered_data.copy(), transform_ties='mean')
def __init__(
self,
rankable_data,
gold_standard,
filter_method='keep_all_gold_standard'
):
super(RankSummaryPR, self).__init__(
rankable_data,
gold_standard,
filter_method=filter_method
)

# Calculate the precision and recall and store them with confidence
# data
self.filtered_data = self.calculate_precision_recall(
self.filtered_data.copy(),
transform_ties='mean'
)
self.filtered_data = self.calculate_f1(self.filtered_data.copy())

# Join the filtered F1 score onto the full confidences
join_data = self.filtered_data.loc[:, [TARGET_COLUMN, REGULATOR_COLUMN, F1_COLUMN]]
join_data = join_data.set_index([TARGET_COLUMN, REGULATOR_COLUMN])
self.confidence_data = self.confidence_data.join(join_data, on=[TARGET_COLUMN, REGULATOR_COLUMN])
join_data = self.filtered_data.loc[
:,
[TARGET_COLUMN, REGULATOR_COLUMN, F1_COLUMN]
].set_index(
[TARGET_COLUMN, REGULATOR_COLUMN]
)
self.confidence_data = self.confidence_data.join(
join_data,
on=[TARGET_COLUMN, REGULATOR_COLUMN]
)

def score(self):
return self.name, self.maxf1
@@ -59,8 +92,15 @@ def output_curve(self, ax=None, figsize=(6, 4)):

# Extract the recall and precision data
curve = self.curve_dataframe()
self.plot_f1_conf(curve[F1_COLUMN].values, curve[CONFIDENCE_COLUMN].values, self.maxf1, self.optconff1, ax,
num_edges=(self.confidence_data[CONFIDENCE_COLUMN] >= self.optconff1).sum())
self.plot_f1_conf(
curve[F1_COLUMN].values,
curve[CONFIDENCE_COLUMN].values,
self.maxf1,
self.optconff1,
ax,
num_edges=(
self.confidence_data[CONFIDENCE_COLUMN] >= self.optconff1
).sum())

return ax

@@ -75,12 +115,22 @@ def plot_f1_conf(f1, conf, optf1, optconf, ax, num_edges=None):
ax.set_xlim(1, 0)
ax.set_ylim(0, 1)
ax.set_ylabel('F1')
ax.vlines(float(optconf), 0, 1, transform=ax.get_xaxis_transform(), colors='r', linestyles='dashed')

_msg = "max F1 = {optf1:.4f}\noptimal conf = {optconf:.4f}\nnum_edges = {n}".format(optf1=optf1,
optconf=optconf,
n=num_edges)
ax.annotate(_msg, xy=(0.4, 0.075), xycoords='axes fraction')
ax.vlines(
float(optconf),
0,
1,
transform=ax.get_xaxis_transform(),
colors='r',
linestyles='dashed'
)

ax.annotate(
f"max F1 = {optf1:.4f}\n"
f"optimal conf = {optconf:.4f}\n"
f"num_edges = {num_edges}",
xy=(0.4, 0.075),
xycoords='axes fraction'
)

return ax

@@ -92,12 +142,18 @@ def calculate_opt_f1(data):
@staticmethod
def calculate_opt_conf_f1(data):

return data.loc[data[F1_COLUMN] >= np.max(data[F1_COLUMN]), CONFIDENCE_COLUMN].min()
return data.loc[
data[F1_COLUMN] >= np.max(data[F1_COLUMN]),
CONFIDENCE_COLUMN
].min()

@staticmethod
def calculate_f1(data):

data[F1_COLUMN] = RankSummaryF1.pr_to_f1(data[PRECISION_COLUMN], data[RECALL_COLUMN])
data[F1_COLUMN] = RankSummaryF1.pr_to_f1(
data[PRECISION_COLUMN],
data[RECALL_COLUMN]
)
return data

@staticmethod
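In the f1_score.py changes above, the backend check swaps the deprecated matplotlib.rcsetup.interactive_bk list for the backend registry introduced in matplotlib 3.9, which is the forward-compatibility piece of this commit. A minimal sketch of that check with a fallback for older matplotlib releases (the fallback is an assumption, not part of this commit):

import matplotlib

try:
    # matplotlib >= 3.9 exposes a registry of builtin backends
    from matplotlib.backends import BackendFilter, backend_registry
    interactive = backend_registry.list_builtin(BackendFilter.INTERACTIVE)
except ImportError:
    # Older releases only have the (now deprecated) module-level list
    interactive = matplotlib.rcsetup.interactive_bk

# Switch to the non-interactive agg backend if a GUI backend was selected
if matplotlib.get_backend() in interactive:
    matplotlib.use('agg')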
117 changes: 89 additions & 28 deletions inferelator/postprocessing/matthews_correlation.py
@@ -1,23 +1,36 @@
from math import isfinite
import numpy as np
import warnings

from inferelator.postprocessing.model_performance import RankSummingMetric
from inferelator.postprocessing import (TARGET_COLUMN, REGULATOR_COLUMN, CONFIDENCE_COLUMN, GOLD_STANDARD_COLUMN,
MCC_COLUMN, TP, FP, TN, FN)
from inferelator.postprocessing import (
TARGET_COLUMN,
REGULATOR_COLUMN,
CONFIDENCE_COLUMN,
MCC_COLUMN,
TP,
FP,
TN,
FN
)

import matplotlib

# If matplotlib is being an idiot and trying to set a tkinter backend, switch to agg
if matplotlib.get_backend() in (i for i in matplotlib.rcsetup.interactive_bk):
# If matplotlib is being an idiot and trying to set a tkinter backend,
# switch to agg
if matplotlib.get_backend() in (
i
for i in matplotlib.backends.backend_registry.list_builtin(
matplotlib.backends.BackendFilter.INTERACTIVE
)
):
matplotlib.use('agg')

import matplotlib.pyplot as plt


class RankSummaryMCC(RankSummingMetric):
"""
This class extends RankSumming and calculates Matthews correlation coefficient
This class extends RankSumming and calculates Matthews
correlation coefficient
"""

name = "MCC"
@@ -41,17 +54,33 @@ def nnzmmc(self):

# Plotter function

def __init__(self, rankable_data, gold_standard, filter_method='keep_all_gold_standard'):

super(RankSummaryMCC, self).__init__(rankable_data, gold_standard, filter_method=filter_method)

# Calculate the precision and recall and store them with confidence data
def __init__(
self,
rankable_data,
gold_standard,
filter_method='keep_all_gold_standard'
):

super(RankSummaryMCC, self).__init__(
rankable_data,
gold_standard,
filter_method=filter_method
)

# Calculate the precision and recall and store them with confidence
# data
self.filtered_data = self.calculate_mcc(self.filtered_data.copy())

# Join the filtered MCC onto the full confidences
join_data = self.filtered_data.loc[:, [TARGET_COLUMN, REGULATOR_COLUMN, MCC_COLUMN]]
join_data = join_data.set_index([TARGET_COLUMN, REGULATOR_COLUMN])
self.confidence_data = self.confidence_data.join(join_data, on=[TARGET_COLUMN, REGULATOR_COLUMN])
join_data = self.filtered_data.loc[
:,
[TARGET_COLUMN, REGULATOR_COLUMN, MCC_COLUMN]
].set_index([TARGET_COLUMN, REGULATOR_COLUMN])

self.confidence_data = self.confidence_data.join(
join_data,
on=[TARGET_COLUMN, REGULATOR_COLUMN]
)

def score(self):

@@ -69,8 +98,16 @@ def output_curve(self, ax=None, figsize=(6, 4)):

# Extract the recall and precision data
curve = self.curve_dataframe()
self.plot_mcc_conf(curve[MCC_COLUMN].values, curve[CONFIDENCE_COLUMN].values, self.maxmcc, self.optconfmcc, ax,
num_edges=(self.confidence_data[CONFIDENCE_COLUMN] >= self.optconfmcc).sum())
self.plot_mcc_conf(
curve[MCC_COLUMN].values,
curve[CONFIDENCE_COLUMN].values,
self.maxmcc,
self.optconfmcc,
ax,
num_edges=(
self.confidence_data[CONFIDENCE_COLUMN] >= self.optconfmcc
).sum()
)

return ax

@@ -88,12 +125,22 @@ def plot_mcc_conf(mcc, conf, optmcc, optconf, ax, num_edges=None):
ax.set_xlim(1, 0)
ax.set_ylim(y_min, 1)
ax.set_ylabel('MCC')
ax.vlines(float(optconf), 0, 1, transform=ax.get_xaxis_transform(), colors='r', linestyles='dashed')

_msg = "max MCC = {optmcc:.4f}\noptimal conf = {optconf:.4f}\nnum_edges = {n}".format(optmcc=optmcc,
optconf=optconf,
n=num_edges)
ax.annotate(_msg, xy=(0.4, 0.075), xycoords='axes fraction')
ax.vlines(
float(optconf),
0,
1,
transform=ax.get_xaxis_transform(),
colors='r',
linestyles='dashed'
)

ax.annotate(
f"max MCC = {optmcc:.4f}\n"
f"optimal conf = {optconf:.4f}\n"
f"num_edges = {num_edges}",
xy=(0.4, 0.075),
xycoords='axes fraction'
)

return ax

@@ -105,7 +152,10 @@ def calculate_opt_mcc(data):
@staticmethod
def calculate_opt_conf_mcc(data):

return data.loc[data[MCC_COLUMN] >= np.max(data[MCC_COLUMN]), CONFIDENCE_COLUMN].min()
return data.loc[
data[MCC_COLUMN] >= np.max(data[MCC_COLUMN]),
CONFIDENCE_COLUMN
].min()

@staticmethod
def calculate_nnz_mcc(data, conf):
@@ -116,14 +166,25 @@ def calculate_nnz_mcc(data, conf):
def calculate_mcc(data):

df = RankSummingMetric.compute_confusion_matrix(data)
data[MCC_COLUMN] = RankSummaryMCC.confusion_to_mcc(df[TP], df[TN], df[FP], df[FN])
data[MCC_COLUMN] = RankSummaryMCC.confusion_to_mcc(
df[TP],
df[TN],
df[FP],
df[FN]
)
return data

@staticmethod
def confusion_to_mcc(tp, tn, fp, fn):
denominator = np.sqrt(tp + fp) * np.sqrt(tp + fn) * np.sqrt(tn + fp) * np.sqrt(tn + fn)

# If any denominator value is 0, MCC is 0/0 and by convention will be set to 0.0
denominator = (
np.sqrt(tp + fp) *
np.sqrt(tp + fn) *
np.sqrt(tn + fp) *
np.sqrt(tn + fn)
)

# If any denominator value is 0, MCC is 0/0 and by
# convention will be set to 0.0
denominator[denominator == 0] = 1.0

return (tp * tn - fp * fn) / denominator
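The confusion_to_mcc code above is the standard Matthews correlation coefficient, with the denominator written as a product of square roots and the 0/0 case mapped to 0 by convention. A small self-contained check of that formula on made-up confusion counts (a sketch, not the inferelator class itself):

import numpy as np

def confusion_to_mcc(tp, tn, fp, fn):
    # MCC = (TP*TN - FP*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN))
    denominator = (
        np.sqrt(tp + fp) *
        np.sqrt(tp + fn) *
        np.sqrt(tn + fp) *
        np.sqrt(tn + fn)
    )
    # When the denominator is 0 the numerator is also 0, so dividing
    # by 1 instead implements the MCC = 0 convention.
    denominator = np.where(denominator == 0, 1.0, denominator)
    return (tp * tn - fp * fn) / denominator

# A perfect classifier on 10 positives and 90 negatives scores 1.0
print(confusion_to_mcc(np.array([10.0]), np.array([90.0]),
                       np.array([0.0]), np.array([0.0])))  # [1.]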
