From b556c41274a61b5def7069e3972043fa3e7dadb4 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 11 May 2021 16:13:08 -0400 Subject: [PATCH 01/15] move operations and transform util files into a dedicated utils folder --- cytominer_eval/{operations/util.py => utils/operation_utils.py} | 0 cytominer_eval/{transform/util.py => utils/transform_utils.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename cytominer_eval/{operations/util.py => utils/operation_utils.py} (100%) rename cytominer_eval/{transform/util.py => utils/transform_utils.py} (100%) diff --git a/cytominer_eval/operations/util.py b/cytominer_eval/utils/operation_utils.py similarity index 100% rename from cytominer_eval/operations/util.py rename to cytominer_eval/utils/operation_utils.py diff --git a/cytominer_eval/transform/util.py b/cytominer_eval/utils/transform_utils.py similarity index 100% rename from cytominer_eval/transform/util.py rename to cytominer_eval/utils/transform_utils.py From de4a79921de79a57698b8d7209d21e727c6681ce Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 11 May 2021 16:35:21 -0400 Subject: [PATCH 02/15] move testing util files to a new test_utils folder --- .../test_util.py => test_utils/test_operation_utils.py} | 0 .../test_util.py => test_utils/test_transform_utils.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename cytominer_eval/tests/{test_operations/test_util.py => test_utils/test_operation_utils.py} (100%) rename cytominer_eval/tests/{test_transform/test_util.py => test_utils/test_transform_utils.py} (100%) diff --git a/cytominer_eval/tests/test_operations/test_util.py b/cytominer_eval/tests/test_utils/test_operation_utils.py similarity index 100% rename from cytominer_eval/tests/test_operations/test_util.py rename to cytominer_eval/tests/test_utils/test_operation_utils.py diff --git a/cytominer_eval/tests/test_transform/test_util.py b/cytominer_eval/tests/test_utils/test_transform_utils.py similarity index 100% rename from 
cytominer_eval/tests/test_transform/test_util.py rename to cytominer_eval/tests/test_utils/test_transform_utils.py From 2c33899bd7eaaf622cb745fb26e652c73b2794f3 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 11 May 2021 16:36:20 -0400 Subject: [PATCH 03/15] move assert_melt function to test_utils folder --- .../tests/{test_transform => test_utils}/test_assert_melt.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename cytominer_eval/tests/{test_transform => test_utils}/test_assert_melt.py (100%) diff --git a/cytominer_eval/tests/test_transform/test_assert_melt.py b/cytominer_eval/tests/test_utils/test_assert_melt.py similarity index 100% rename from cytominer_eval/tests/test_transform/test_assert_melt.py rename to cytominer_eval/tests/test_utils/test_assert_melt.py From ec476eb45a2ca9110c4e00d2c8e0f897c7f292e4 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 11 May 2021 17:09:21 -0400 Subject: [PATCH 04/15] update util imports in all files --- cytominer_eval/evaluate.py | 2 +- cytominer_eval/operations/enrichment.py | 11 ++++------- cytominer_eval/operations/grit.py | 9 +++++---- cytominer_eval/operations/mp_value.py | 2 +- cytominer_eval/operations/precision_recall.py | 5 +++-- .../operations/replicate_reproducibility.py | 4 ++-- cytominer_eval/transform/transform.py | 2 +- 7 files changed, 17 insertions(+), 18 deletions(-) diff --git a/cytominer_eval/evaluate.py b/cytominer_eval/evaluate.py index 744f094..b7f2e37 100644 --- a/cytominer_eval/evaluate.py +++ b/cytominer_eval/evaluate.py @@ -8,7 +8,7 @@ from typing import List, Union from cytominer_eval.transform import metric_melt -from cytominer_eval.transform.util import check_replicate_groups +from cytominer_eval.utils.transform_utils import check_replicate_groups from cytominer_eval.operations import ( replicate_reproducibility, precision_recall, diff --git a/cytominer_eval/operations/enrichment.py b/cytominer_eval/operations/enrichment.py index 37ab2ff..704c186 100644 --- 
a/cytominer_eval/operations/enrichment.py +++ b/cytominer_eval/operations/enrichment.py @@ -5,12 +5,8 @@ from typing import List, Union import scipy -from .util import assign_replicates, calculate_grit, check_grit_replicate_summary_method -from cytominer_eval.transform.util import ( - set_pair_ids, - set_grit_column_info, - assert_melt, -) +from cytominer_eval.utils.operation_utils import assign_replicates +from cytominer_eval.utils.transform_utils import set_pair_ids, assert_melt def enrichment( @@ -18,7 +14,8 @@ def enrichment( replicate_groups: List[str], percentile: Union[float, List[float]], ) -> pd.DataFrame: - """Calculate the enrichment score. This score is based on the fisher exact odds score. Similar to the other functions, the closest connections are determined and checked with the replicates. + """Calculate the enrichment score. This score is based on the fisher exact odds score. + Similar to the other functions, the closest connections are determined and checked with the replicates. This score effectively calculates how much better the distribution of correct connections is compared to random. 
Parameters diff --git a/cytominer_eval/operations/grit.py b/cytominer_eval/operations/grit.py index 1537139..5b0d1ea 100644 --- a/cytominer_eval/operations/grit.py +++ b/cytominer_eval/operations/grit.py @@ -8,11 +8,12 @@ import pandas as pd from typing import List -from .util import assign_replicates, calculate_grit, check_grit_replicate_summary_method -from cytominer_eval.transform.util import ( - set_pair_ids, +from cytominer_eval.utils.operation_utils import assign_replicates +from cytominer_eval.utils.transform_utils import set_pair_ids, assert_melt +from cytominer_eval.utils.grit_utils import ( + check_grit_replicate_summary_method, set_grit_column_info, - assert_melt, + calculate_grit, ) diff --git a/cytominer_eval/operations/mp_value.py b/cytominer_eval/operations/mp_value.py index 0d2b76d..f051862 100644 --- a/cytominer_eval/operations/mp_value.py +++ b/cytominer_eval/operations/mp_value.py @@ -16,7 +16,7 @@ import pandas as pd from typing import List -from .util import calculate_mp_value +from cytominer_eval.utils.mpvalue_utils import calculate_mp_value def mp_value( diff --git a/cytominer_eval/operations/precision_recall.py b/cytominer_eval/operations/precision_recall.py index 5bd015d..248ee74 100644 --- a/cytominer_eval/operations/precision_recall.py +++ b/cytominer_eval/operations/precision_recall.py @@ -6,8 +6,9 @@ import pandas as pd from typing import List, Union -from .util import assign_replicates, calculate_precision_recall -from cytominer_eval.transform.util import set_pair_ids, assert_melt +from cytominer_eval.utils.precisionrecall_utils import calculate_precision_recall +from cytominer_eval.utils.operation_utils import assign_replicates +from cytominer_eval.utils.transform_utils import set_pair_ids, assert_melt def precision_recall( diff --git a/cytominer_eval/operations/replicate_reproducibility.py b/cytominer_eval/operations/replicate_reproducibility.py index 0c15b2d..c537e45 100644 --- 
a/cytominer_eval/operations/replicate_reproducibility.py +++ b/cytominer_eval/operations/replicate_reproducibility.py @@ -5,8 +5,8 @@ import pandas as pd from typing import List -from .util import assign_replicates, set_pair_ids -from cytominer_eval.transform.util import assert_melt +from cytominer_eval.utils.operation_utils import assign_replicates, set_pair_ids +from cytominer_eval.utils.transform_utils import assert_melt def replicate_reproducibility( diff --git a/cytominer_eval/transform/transform.py b/cytominer_eval/transform/transform.py index 5b54081..a24c10a 100644 --- a/cytominer_eval/transform/transform.py +++ b/cytominer_eval/transform/transform.py @@ -2,7 +2,7 @@ import pandas as pd from typing import List -from .util import ( +from cytominer_eval.utils.transform_utils import ( assert_pandas_dtypes, get_upper_matrix, set_pair_ids, From 6daa7ddd16941ceb8866fa30e89a208ad2559845 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 11 May 2021 17:11:33 -0400 Subject: [PATCH 05/15] update util import statements in tests --- cytominer_eval/tests/test_evaluate.py | 18 ++++++++++++++---- .../tests/test_operations/test_enrichment.py | 4 ++-- .../tests/test_operations/test_grit.py | 11 +++++------ .../tests/test_operations/test_mp_value.py | 7 ++++++- .../tests/test_utils/test_assert_melt.py | 4 ++-- .../tests/test_utils/test_operation_utils.py | 7 ++++--- 6 files changed, 33 insertions(+), 18 deletions(-) diff --git a/cytominer_eval/tests/test_evaluate.py b/cytominer_eval/tests/test_evaluate.py index ed50551..d972398 100644 --- a/cytominer_eval/tests/test_evaluate.py +++ b/cytominer_eval/tests/test_evaluate.py @@ -4,9 +4,11 @@ import tempfile import numpy as np import pandas as pd -from cytominer_eval.transform.util import get_available_similarity_metrics from cytominer_eval import evaluate +from cytominer_eval.utils.availability_utils import get_available_similarity_metrics + + example_gene_file = "SQ00014610_normalized_feature_select.csv.gz" 
example_gene_file = pathlib.Path( "{file}/../example_data/gene/{eg}".format( @@ -111,7 +113,11 @@ def test_evaluate_replicate_reprod_return_cor_true(): assert np.round(med_cor_df.similarity_metric.max(), 3) == 0.949 assert sorted(med_cor_df.columns.tolist()) == sorted( - ["Metadata_gene_name", "Metadata_pert_name", "similarity_metric",] + [ + "Metadata_gene_name", + "Metadata_pert_name", + "similarity_metric", + ] ) @@ -202,7 +208,9 @@ def test_evaluate_grit(): top_result = ( grit_results_df.sort_values(by="grit", ascending=False) .reset_index(drop=True) - .iloc[0,] + .iloc[ + 0, + ] ) assert np.round(top_result.grit, 4) == 2.3352 assert top_result.group == "PTK2" @@ -228,7 +236,9 @@ def test_evaluate_grit(): top_result = ( grit_results_df.sort_values(by="grit", ascending=False) .reset_index(drop=True) - .iloc[0,] + .iloc[ + 0, + ] ) assert np.round(top_result.grit, 4) == 0.9990 diff --git a/cytominer_eval/tests/test_operations/test_enrichment.py b/cytominer_eval/tests/test_operations/test_enrichment.py index b49d70a..5819304 100644 --- a/cytominer_eval/tests/test_operations/test_enrichment.py +++ b/cytominer_eval/tests/test_operations/test_enrichment.py @@ -55,13 +55,13 @@ def test_enrichment(): assert result.enrichment_percentile[1] == 0.995 # check if the higher percentiles are larger than the small one assert result.enrichment_percentile[1] > result.enrichment_percentile.iloc[-1] - + result_int = enrichment( similarity_melted_df=similarity_melted_df, replicate_groups=replicate_groups, percentile=0.97, ) - + assert result_int.enrichment_percentile[0] == result.enrichment_percentile.iloc[-1] diff --git a/cytominer_eval/tests/test_operations/test_grit.py b/cytominer_eval/tests/test_operations/test_grit.py index c21b4fd..7c4c745 100644 --- a/cytominer_eval/tests/test_operations/test_grit.py +++ b/cytominer_eval/tests/test_operations/test_grit.py @@ -11,17 +11,16 @@ from cytominer_eval.operations import grit from cytominer_eval.transform import metric_melt -from 
cytominer_eval.transform.util import ( - assert_melt, - set_pair_ids, + +from cytominer_eval.utils.transform_utils import assert_melt, set_pair_ids +from cytominer_eval.utils.operation_utils import assign_replicates +from cytominer_eval.utils.grit_utils import ( set_grit_column_info, -) -from cytominer_eval.operations.util import ( - assign_replicates, get_grit_entry, calculate_grit, ) + # Load CRISPR dataset example_file = "SQ00014610_normalized_feature_select.csv.gz" example_file = pathlib.Path( diff --git a/cytominer_eval/tests/test_operations/test_mp_value.py b/cytominer_eval/tests/test_operations/test_mp_value.py index 2e34d6c..00fb2bc 100644 --- a/cytominer_eval/tests/test_operations/test_mp_value.py +++ b/cytominer_eval/tests/test_operations/test_mp_value.py @@ -5,8 +5,13 @@ import numpy as np import pandas as pd from math import isclose + from cytominer_eval.operations import mp_value -from cytominer_eval.operations.util import calculate_mp_value, calculate_mahalanobis + +from cytominer_eval.utils.mpvalue_utils import ( + calculate_mp_value, + calculate_mahalanobis, +) # Load CRISPR dataset example_file = "SQ00014610_normalized_feature_select.csv.gz" diff --git a/cytominer_eval/tests/test_utils/test_assert_melt.py b/cytominer_eval/tests/test_utils/test_assert_melt.py index 9104635..cd9d2df 100644 --- a/cytominer_eval/tests/test_utils/test_assert_melt.py +++ b/cytominer_eval/tests/test_utils/test_assert_melt.py @@ -8,8 +8,8 @@ import pandas.api.types as ptypes from cytominer_eval.transform import metric_melt -from cytominer_eval.operations.util import assign_replicates -from cytominer_eval.transform.util import assert_melt +from cytominer_eval.utils.operation_utils import assign_replicates +from cytominer_eval.utils.transform_utils import assert_melt example_file = "SQ00014610_normalized_feature_select.csv.gz" diff --git a/cytominer_eval/tests/test_utils/test_operation_utils.py b/cytominer_eval/tests/test_utils/test_operation_utils.py index 784b451..4d478a7 
100644 --- a/cytominer_eval/tests/test_utils/test_operation_utils.py +++ b/cytominer_eval/tests/test_utils/test_operation_utils.py @@ -8,8 +8,9 @@ from pandas.testing import assert_frame_equal from cytominer_eval.transform import metric_melt -from cytominer_eval.transform.util import set_pair_ids -from cytominer_eval.operations.util import assign_replicates, calculate_precision_recall +from cytominer_eval.utils.transform_utils import set_pair_ids +from cytominer_eval.utils.operation_utils import assign_replicates +from cytominer_eval.utils.precisionrecall_utils import calculate_precision_recall random.seed(123) tmpdir = tempfile.gettempdir() @@ -118,7 +119,7 @@ def test_calculate_precision_recall(): expected_result = {"k": 10, "precision": 0.4, "recall": 0.1333} expected_result = pd.DataFrame(expected_result, index=["result"]).transpose() - assert_frame_equal(result, expected_result, check_less_precise=True) + assert_frame_equal(result, expected_result) # Check that recall is 1 when k is maximized result = pd.DataFrame( From c7a6f6910c2caa474ac7cddab121381efd4e156c Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 11 May 2021 17:12:15 -0400 Subject: [PATCH 06/15] split util files into better named files --- cytominer_eval/utils/availability_utils.py | 23 ++ cytominer_eval/utils/grit_utils.py | 167 ++++++++++ cytominer_eval/utils/mpvalue_utils.py | 169 ++++++++++ cytominer_eval/utils/operation_utils.py | 304 +----------------- cytominer_eval/utils/precisionrecall_utils.py | 43 +++ cytominer_eval/utils/transform_utils.py | 94 +----- 6 files changed, 407 insertions(+), 393 deletions(-) create mode 100644 cytominer_eval/utils/availability_utils.py create mode 100644 cytominer_eval/utils/grit_utils.py create mode 100644 cytominer_eval/utils/mpvalue_utils.py create mode 100644 cytominer_eval/utils/precisionrecall_utils.py diff --git a/cytominer_eval/utils/availability_utils.py b/cytominer_eval/utils/availability_utils.py new file mode 100644 index 0000000..d8f0d7f 
--- /dev/null +++ b/cytominer_eval/utils/availability_utils.py @@ -0,0 +1,23 @@ +def get_available_eval_metrics(): + r"""Output the available eval metrics in the cytominer_eval library""" + return [ + "replicate_reproducibility", + "precision_recall", + "grit", + "mp_value", + "enrichment", + ] + + +def get_available_similarity_metrics(): + r"""Output the available metrics for calculating pairwise similarity in the + cytominer_eval library + """ + return ["pearson", "kendall", "spearman"] + + +def get_available_grit_summary_methods(): + r"""Output the available methods for summarizing replicate z-scores when + calculating grit in the cytominer_eval library + """ + return ["mean", "median"] diff --git a/cytominer_eval/utils/grit_utils.py b/cytominer_eval/utils/grit_utils.py new file mode 100644 index 0000000..33a5256 --- /dev/null +++ b/cytominer_eval/utils/grit_utils.py @@ -0,0 +1,167 @@ +import numpy as np +import pandas as pd +from typing import List, Union +import pandas.api.types as ptypes +from collections import OrderedDict + +from sklearn.preprocessing import StandardScaler + +from cytominer_eval.utils.transform_utils import set_pair_ids +from cytominer_eval.utils.availability_utils import get_available_grit_summary_methods + + +def calculate_grit( + replicate_group_df: pd.DataFrame, + control_perts: List[str], + column_id_info: dict, + replicate_summary_method: str = "mean", +) -> pd.Series: + """Given an elongated pairwise correlation dataframe of replicate groups, + calculate grit. + + Usage: Designed to be called within a pandas.DataFrame().groupby().apply(). See + :py:func:`cytominer_eval.operations.grit.grit`. + + Parameters + ---------- + replicate_group_df : pandas.DataFrame + An elongated dataframe storing pairwise correlations of all profiles to a single + replicate group. + control_perts : list + The profile_ids that should be considered controls (the reference) + column_id_info: dict + A dictionary of column identifiers noting profile and replicate group ids.
This + variable is autogenerated in + :py:func:`cytominer_eval.utils.grit_utils.set_grit_column_info`. + replicate_summary_method : {'mean', 'median'}, optional + how replicate z-scores to control perts are summarized. Defaults to "mean". + + Returns + ------- + pandas.Series + A return bundle of identifiers (perturbation, group) and results (grit score). + The series has index labels ("perturbation", "group", "grit"). "grit" + will be NaN if no other profiles exist in the defined group. + """ + # Confirm that we support the provided summary method + check_grit_replicate_summary_method(replicate_summary_method) + + group_entry = get_grit_entry(replicate_group_df, column_id_info["group"]["id"]) + pert = get_grit_entry(replicate_group_df, column_id_info["profile"]["id"]) + + # Define distributions for control perturbations + control_distrib = replicate_group_df.loc[ + replicate_group_df.loc[:, column_id_info["profile"]["comparison"]].isin( + control_perts + ), + "similarity_metric", + ].values.reshape(-1, 1) + + assert len(control_distrib) > 1, "Error! No control perturbations found."
+ + # Define distributions for same group (but not same perturbation) + same_group_distrib = replicate_group_df.loc[ + ( + replicate_group_df.loc[:, column_id_info["group"]["comparison"]] + == group_entry + ) + & (replicate_group_df.loc[:, column_id_info["profile"]["comparison"]] != pert), + "similarity_metric", + ].values.reshape(-1, 1) + + return_bundle = {"perturbation": pert, "group": group_entry} + if len(same_group_distrib) == 0: + return_bundle["grit"] = np.nan + + else: + scaler = StandardScaler() + scaler.fit(control_distrib) + grit_z_scores = scaler.transform(same_group_distrib) + + if replicate_summary_method == "mean": + grit_score = np.mean(grit_z_scores) + elif replicate_summary_method == "median": + grit_score = np.median(grit_z_scores) + + return_bundle["grit"] = grit_score + + return pd.Series(return_bundle) + + +def get_grit_entry(df: pd.DataFrame, col: str) -> str: + """Helper function to define the perturbation identifier of interest + + Grit must be calculated using unique perturbations. This may or may not mean unique + perturbations. + """ + entries = df.loc[:, col] + assert ( + len(entries.unique()) == 1 + ), "grit is calculated for each perturbation independently" + return str(list(entries)[0]) + + +def set_grit_column_info(profile_col: str, replicate_group_col: str) -> dict: + r"""Transform column names to be used in calculating grit + + In calculating grit, the data must have a metadata feature describing the core + replicate perturbation (profile_col) and a separate metadata feature(s) describing + the larger group (replicate_group_col) that the perturbation belongs to (e.g. gene, + MOA). + + Parameters + ---------- + profile_col : str + the metadata column storing profile ids. The column can have unique or replicate + identifiers. + replicate_group_col : str + the metadata column indicating a higher order structure (group) than the + profile column. E.g. target gene vs. guide in a CRISPR experiment. 
+ + Returns + ------- + dict + A nested dictionary of renamed columns indicating how to determine replicates + """ + # Identify column transform names + pair_ids = set_pair_ids() + + profile_id_with_suffix = [ + "{col}{suf}".format(col=profile_col, suf=pair_ids[x]["suffix"]) + for x in pair_ids + ] + + group_id_with_suffix = [ + "{col}{suf}".format(col=replicate_group_col, suf=pair_ids[x]["suffix"]) + for x in pair_ids + ] + + col_info = ["id", "comparison"] + profile_id_info = dict(zip(col_info, profile_id_with_suffix)) + group_id_info = dict(zip(col_info, group_id_with_suffix)) + + column_id_info = {"profile": profile_id_info, "group": group_id_info} + return column_id_info + + +def check_grit_replicate_summary_method(replicate_summary_method: str) -> None: + r"""Helper function to ensure that we support the user input replicate summary + + Parameters + ---------- + replicate_summary_method : str + The user input replicate summary method + + Returns + ------- + None + Assertion will fail if the user inputs an incorrect replicate summary method + """ + avail_methods = get_available_grit_summary_methods() + + if replicate_summary_method not in avail_methods: + raise ValueError( + "{input} method not supported, use one of: {avail}".format( + input=replicate_summary_method, avail=avail_methods + ) + ) diff --git a/cytominer_eval/utils/mpvalue_utils.py b/cytominer_eval/utils/mpvalue_utils.py new file mode 100644 index 0000000..1aab318 --- /dev/null +++ b/cytominer_eval/utils/mpvalue_utils.py @@ -0,0 +1,169 @@ +import numpy as np +import pandas as pd +from typing import List, Union + +from sklearn.decomposition import PCA +from sklearn.covariance import EmpiricalCovariance + +from cytominer_eval.utils.transform_utils import set_pair_ids + + +class MahalanobisEstimator: + """ + Store location and dispersion estimators of the empirical distribution of data + provided in an array and allow computation of statistical distances. 
+ + Parameters + ---------- + arr : {pandas.DataFrame, np.ndarray} + the matrix used to calculate covariance + + Attributes + ---------- + sigma : sklearn.covariance.EmpiricalCovariance + Covariance estimator fitted to the input array (arr) + + Methods + ------- + mahalanobis(X) + Computes mahalanobis distance between the input array (self.arr) and the X + array as provided + """ + + def __init__(self, arr: Union[pd.DataFrame, np.ndarray]): + self.sigma = EmpiricalCovariance().fit(arr) + + def mahalanobis(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray: + """Compute the mahalanobis distance between the empirical distribution described + by this object and points in an array `X`. + + Parameters + ---------- + X : {pandas.DataFrame, np.ndarray} + A samples by features array-like matrix to compute mahalanobis distance + between self.arr + + Returns + ------- + numpy.array + Mahalanobis distance between the input array and the original sigma + """ + return self.sigma.mahalanobis(X) + + +def calculate_mahalanobis(pert_df: pd.DataFrame, control_df: pd.DataFrame) -> pd.Series: + """Given perturbation and control dataframes, calculate mahalanobis distance per + perturbation + + Usage: Designed to be called within a pandas.DataFrame().groupby().apply(). See + :py:func:`cytominer_eval.utils.mpvalue_utils.calculate_mp_value`. + + Parameters + ---------- + pert_df : pandas.DataFrame + A pandas dataframe of replicate perturbations (samples by features) + control_df : pandas.DataFrame + A pandas dataframe of control perturbations (samples by features). Must have the + same feature measurements as pert_df + + Returns + ------- + float + The mahalanobis distance between perturbation and control + """ + assert len(control_df) > 1, "Error! No control perturbations found."
+ + # Get dispersion and center estimators for the control perturbations + control_estimators = MahalanobisEstimator(control_df) + + # Distance between mean of perturbation and control + maha = control_estimators.mahalanobis(np.array(np.mean(pert_df, 0)).reshape(1, -1))[ + 0 + ] + return maha + + +def default_mp_value_parameters(): + """Set the different default parameters used for mp-values. + + Returns + ------- + dict + A default parameter set with keys: rescale_pca (whether the PCA should be + scaled by variance explained) and nb_permutations (how many permutations to + calculate empirical p-value). Defaults to True and 100, respectively. + """ + params = {"rescale_pca": True, "nb_permutations": 100} + return params + + +def calculate_mp_value( + pert_df: pd.DataFrame, + control_df: pd.DataFrame, + params: dict = {}, +) -> pd.Series: + """Given perturbation and control dataframes, calculate mp-value per perturbation + + Usage: Designed to be called within a pandas.DataFrame().groupby().apply(). See + :py:func:`cytominer_eval.operations.mp_value.mp_value`. + + Parameters + ---------- + pert_df : pandas.DataFrame + A pandas dataframe of replicate perturbations (samples by features) + control_df : pandas.DataFrame + A pandas dataframe of control perturbations (samples by features). Must have the + same feature measurements as pert_df + params : {dict}, optional + the parameters to use when calculating mp value. See + :py:func:`cytominer_eval.utils.mpvalue_utils.default_mp_value_parameters`. + + Returns + ------- + float + The mp value for the given perturbation + + """ + assert len(control_df) > 1, "Error! No control perturbations found." + + # Assign parameters + p = default_mp_value_parameters() + + assert all( + [x in p.keys() for x in params.keys()] + ), "Unknown parameters provided.
Only {e} are supported.".format(e=p.keys()) + for (k, v) in params.items(): + p[k] = v + + merge_df = pd.concat([pert_df, control_df]).reset_index(drop=True) + + # We reduce the dimensionality with PCA + # so that 90% of the variance is conserved + pca = PCA(n_components=0.9, svd_solver="full") + pca_array = pca.fit_transform(merge_df) + # We scale columns by the variance explained + if p["rescale_pca"]: + pca_array = pca_array * pca.explained_variance_ratio_ + # This seems useless, as the point of using the Mahalanobis + # distance instead of the Euclidean distance is to be independent + # of axes scales + + # Distance between mean of perturbation and control + obs = calculate_mahalanobis( + pert_df=pca_array[: pert_df.shape[0]], + control_df=pca_array[-control_df.shape[0] :], + ) + # In the paper's methods section it mentions the covariance used + # might be modified to include variation of the perturbation as well. + + # Permutation test + sim = np.zeros(p["nb_permutations"]) + pert_mask = np.zeros(pca_array.shape[0], dtype=bool) + pert_mask[: pert_df.shape[0]] = 1 + for i in range(p["nb_permutations"]): + pert_mask_perm = np.random.permutation(pert_mask) + pert_perm = pca_array[pert_mask_perm] + control_perm = pca_array[np.logical_not(pert_mask_perm)] + sim[i] = calculate_mahalanobis(pert_df=pert_perm, control_df=control_perm) + + return np.mean([x >= obs for x in sim]) diff --git a/cytominer_eval/utils/operation_utils.py b/cytominer_eval/utils/operation_utils.py index 9110686..549f0c8 100644 --- a/cytominer_eval/utils/operation_utils.py +++ b/cytominer_eval/utils/operation_utils.py @@ -2,15 +2,7 @@ import pandas as pd from typing import List, Union -from sklearn.preprocessing import StandardScaler -from sklearn.decomposition import PCA -from sklearn.covariance import EmpiricalCovariance - -from cytominer_eval.transform import metric_melt -from cytominer_eval.transform.util import ( - set_pair_ids, - check_grit_replicate_summary_method, -) +from 
cytominer_eval.utils.transform_utils import set_pair_ids def assign_replicates( @@ -74,297 +66,3 @@ def assign_replicates( compare_df, left_index=True, right_index=True ) return similarity_melted_df - - -def calculate_precision_recall(replicate_group_df: pd.DataFrame, k: int) -> pd.Series: - """Given an elongated pairwise correlation dataframe of replicate groups, calculate - precision and recall. - - Usage: Designed to be called within a pandas.DataFrame().groupby().apply(). See - :py:func:`cytominer_eval.operations.precision_recall.precision_recall`. - - Parameters - ---------- - replicate_group_df : pandas.DataFrame - An elongated dataframe storing pairwise correlations of all profiles to a single - replicate group. - k : int - an integer indicating how many pairwise comparisons to threshold. - - Returns - ------- - dict - A return bundle of identifiers (k) and results (precision and recall at k). - The dictionary has keys ("k", "precision", "recall"). - """ - assert ( - "group_replicate" in replicate_group_df.columns - ), "'group_replicate' not found in dataframe; remember to call assign_replicates()." - - recall_denom__total_relevant_items = sum(replicate_group_df.group_replicate) - precision_denom__num_recommended_items = k - - num_recommended_items_at_k = sum( - replicate_group_df.iloc[ - :k, - ].group_replicate - ) - - precision_at_k = num_recommended_items_at_k / precision_denom__num_recommended_items - recall_at_k = num_recommended_items_at_k / recall_denom__total_relevant_items - - return_bundle = {"k": k, "precision": precision_at_k, "recall": recall_at_k} - - return pd.Series(return_bundle) - - -def calculate_grit( - replicate_group_df: pd.DataFrame, - control_perts: List[str], - column_id_info: dict, - replicate_summary_method: str = "mean", -) -> pd.Series: - """Given an elongated pairwise correlation dataframe of replicate groups, - calculate grit. - - Usage: Designed to be called within a pandas.DataFrame().groupby().apply(). 
See - :py:func:`cytominer_eval.operations.grit.grit`. - - Parameters - ---------- - replicate_group_df : pandas.DataFrame - An elongated dataframe storing pairwise correlations of all profiles to a single - replicate group. - control_perts : list - The profile_ids that should be considered controls (the reference) - column_id_info: dict - A dictionary of column identifiers noting profile and replicate group ids. This - variable is autogenerated in - :py:func:`cytominer_eval.transform.util.set_grit_column_info`. - replicate_summary_method : {'mean', 'median'}, optional - how replicate z-scores to control perts are summarized. Defaults to "mean". - - Returns - ------- - dict - A return bundle of identifiers (perturbation, group) and results (grit score). - The dictionary has keys ("perturbation", "group", "grit_score"). "grit_score" - will be NaN if no other profiles exist in the defined group. - """ - # Confirm that we support the provided summary method - check_grit_replicate_summary_method(replicate_summary_method) - - group_entry = get_grit_entry(replicate_group_df, column_id_info["group"]["id"]) - pert = get_grit_entry(replicate_group_df, column_id_info["profile"]["id"]) - - # Define distributions for control perturbations - control_distrib = replicate_group_df.loc[ - replicate_group_df.loc[:, column_id_info["profile"]["comparison"]].isin( - control_perts - ), - "similarity_metric", - ].values.reshape(-1, 1) - - assert len(control_distrib) > 1, "Error! No control perturbations found." 
- - # Define distributions for same group (but not same perturbation) - same_group_distrib = replicate_group_df.loc[ - ( - replicate_group_df.loc[:, column_id_info["group"]["comparison"]] - == group_entry - ) - & (replicate_group_df.loc[:, column_id_info["profile"]["comparison"]] != pert), - "similarity_metric", - ].values.reshape(-1, 1) - - return_bundle = {"perturbation": pert, "group": group_entry} - if len(same_group_distrib) == 0: - return_bundle["grit"] = np.nan - - else: - scaler = StandardScaler() - scaler.fit(control_distrib) - grit_z_scores = scaler.transform(same_group_distrib) - - if replicate_summary_method == "mean": - grit_score = np.mean(grit_z_scores) - elif replicate_summary_method == "median": - grit_score = np.median(grit_z_scores) - - return_bundle["grit"] = grit_score - - return pd.Series(return_bundle) - - -def get_grit_entry(df: pd.DataFrame, col: str) -> str: - """Helper function to define the perturbation identifier of interest - - Grit must be calculated using unique perturbations. This may or may not mean unique - perturbations. - """ - entries = df.loc[:, col] - assert ( - len(entries.unique()) == 1 - ), "grit is calculated for each perturbation independently" - return str(list(entries)[0]) - - -class MahalanobisEstimator: - """ - Store location and dispersion estimators of the empirical distribution of data - provided in an array and allow computation of statistical distances. 
- - Parameters - ---------- - arr : {pandas.DataFrame, np.ndarray} - the matrix used to calculate covariance - - Attributes - ---------- - sigma : np.array - Fitted covariance matrix of sklearn.covariance.EmpiricalCovariance() - - Methods - ------- - mahalanobis(X) - Computes mahalanobis distance between the input array (self.arr) and the X - array as provided - """ - - def __init__(self, arr: Union[pd.DataFrame, np.ndarray]): - self.sigma = EmpiricalCovariance().fit(arr) - - def mahalanobis(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray: - """Compute the mahalanobis distance between the empirical distribution described - by this object and points in an array `X`. - - Parameters - ---------- - X : {pandas.DataFrame, np.ndarray} - A samples by features array-like matrix to compute mahalanobis distance - between self.arr - - Returns - ------- - numpy.array - Mahalanobis distance between the input array and the original sigma - """ - return self.sigma.mahalanobis(X) - - -def calculate_mahalanobis(pert_df: pd.DataFrame, control_df: pd.DataFrame) -> pd.Series: - """Given perturbation and control dataframes, calculate mahalanobis distance per - perturbation - - Usage: Designed to be called within a pandas.DataFrame().groupby().apply(). See - :py:func:`cytominer_eval.operations.util.calculate_mp_value`. - - Parameters - ---------- - pert_df : pandas.DataFrame - A pandas dataframe of replicate perturbations (samples by features) - control_df : pandas.DataFrame - A pandas dataframe of control perturbations (samples by features). Must have the - same feature measurements as pert_df - - Returns - ------- - float - The mahalanobis distance between perturbation and control - """ - assert len(control_df) > 1, "Error! No control perturbations found." 
- - # Get dispersion and center estimators for the control perturbations - control_estimators = MahalanobisEstimator(control_df) - - # Distance between mean of perturbation and control - maha = control_estimators.mahalanobis(np.array(np.mean(pert_df, 0)).reshape(1, -1))[ - 0 - ] - return maha - - -def default_mp_value_parameters(): - """Set the different default parameters used for mp-values. - - Returns - ------- - dict - A default parameter set with keys: rescale_pca (whether the PCA should be - scaled by variance explained) and nb_permutations (how many permutations to - calculate empirical p-value). Defaults to True and 100, respectively. - """ - params = {"rescale_pca": True, "nb_permutations": 100} - return params - - -def calculate_mp_value( - pert_df: pd.DataFrame, - control_df: pd.DataFrame, - params: dict = {}, -) -> pd.Series: - """Given perturbation and control dataframes, calculate mp-value per perturbation - - Usage: Designed to be called within a pandas.DataFrame().groupby().apply(). See - :py:func:`cytominer_eval.operations.mp_value.mp_value`. - - Parameters - ---------- - pert_df : pandas.DataFrame - A pandas dataframe of replicate perturbations (samples by features) - control_df : pandas.DataFrame - A pandas dataframe of control perturbations (samples by features). Must have the - same feature measurements as pert_df - params : {dict}, optional - the parameters to use when calculating mp value. See - :py:func:`cytominer_eval.operations.util.default_mp_value_parameters`. - - Returns - ------- - float - The mp value for the given perturbation - - """ - assert len(control_df) > 1, "Error! No control perturbations found." - - # Assign parameters - p = default_mp_value_parameters() - - assert all( - [x in p.keys() for x in params.keys()] - ), "Unknown parameters provided. 
Only {e} are supported.".format(e=p.keys()) - for (k, v) in params.items(): - p[k] = v - - merge_df = pd.concat([pert_df, control_df]).reset_index(drop=True) - - # We reduce the dimensionality with PCA - # so that 90% of the variance is conserved - pca = PCA(n_components=0.9, svd_solver="full") - pca_array = pca.fit_transform(merge_df) - # We scale columns by the variance explained - if p["rescale_pca"]: - pca_array = pca_array * pca.explained_variance_ratio_ - # This seems useless, as the point of using the Mahalanobis - # distance instead of the Euclidean distance is to be independent - # of axes scales - - # Distance between mean of perturbation and control - obs = calculate_mahalanobis( - pert_df=pca_array[: pert_df.shape[0]], - control_df=pca_array[-control_df.shape[0] :], - ) - # In the paper's methods section it mentions the covariance used - # might be modified to include variation of the perturbation as well. - - # Permutation test - sim = np.zeros(p["nb_permutations"]) - pert_mask = np.zeros(pca_array.shape[0], dtype=bool) - pert_mask[: pert_df.shape[0]] = 1 - for i in range(p["nb_permutations"]): - pert_mask_perm = np.random.permutation(pert_mask) - pert_perm = pca_array[pert_mask_perm] - control_perm = pca_array[np.logical_not(pert_mask_perm)] - sim[i] = calculate_mahalanobis(pert_df=pert_perm, control_df=control_perm) - - return np.mean([x >= obs for x in sim]) diff --git a/cytominer_eval/utils/precisionrecall_utils.py b/cytominer_eval/utils/precisionrecall_utils.py new file mode 100644 index 0000000..dfd0917 --- /dev/null +++ b/cytominer_eval/utils/precisionrecall_utils.py @@ -0,0 +1,43 @@ +import pandas as pd + + +def calculate_precision_recall(replicate_group_df: pd.DataFrame, k: int) -> pd.Series: + """Given an elongated pairwise correlation dataframe of replicate groups, calculate + precision and recall. + + Usage: Designed to be called within a pandas.DataFrame().groupby().apply(). 
See + :py:func:`cytominer_eval.operations.precision_recall.precision_recall`. + + Parameters + ---------- + replicate_group_df : pandas.DataFrame + An elongated dataframe storing pairwise correlations of all profiles to a single + replicate group. + k : int + an integer indicating how many pairwise comparisons to threshold. + + Returns + ------- + dict + A return bundle of identifiers (k) and results (precision and recall at k). + The dictionary has keys ("k", "precision", "recall"). + """ + assert ( + "group_replicate" in replicate_group_df.columns + ), "'group_replicate' not found in dataframe; remember to call assign_replicates()." + + recall_denom__total_relevant_items = sum(replicate_group_df.group_replicate) + precision_denom__num_recommended_items = k + + num_recommended_items_at_k = sum( + replicate_group_df.iloc[ + :k, + ].group_replicate + ) + + precision_at_k = num_recommended_items_at_k / precision_denom__num_recommended_items + recall_at_k = num_recommended_items_at_k / recall_denom__total_relevant_items + + return_bundle = {"k": k, "precision": precision_at_k, "recall": recall_at_k} + + return pd.Series(return_bundle) diff --git a/cytominer_eval/utils/transform_utils.py b/cytominer_eval/utils/transform_utils.py index b957248..04cea1a 100644 --- a/cytominer_eval/utils/transform_utils.py +++ b/cytominer_eval/utils/transform_utils.py @@ -4,30 +4,10 @@ import pandas.api.types as ptypes from collections import OrderedDict - -def get_available_eval_metrics(): - r"""Output the available eval metrics in the cytominer_eval library""" - return [ - "replicate_reproducibility", - "precision_recall", - "grit", - "mp_value", - "enrichment", - ] - - -def get_available_similarity_metrics(): - r"""Output the available metrics for calculating pairwise similarity in the - cytominer_eval library - """ - return ["pearson", "kendall", "spearman"] - - -def get_available_grit_summary_methods(): - r"""Output the available metrics for calculating pairwise similarity in the - 
cytominer_eval library - """ - return ["mean", "median"] +from cytominer_eval.utils.availability_utils import ( + get_available_eval_metrics, + get_available_similarity_metrics, +) def get_upper_matrix(df: pd.DataFrame) -> np.array: @@ -233,69 +213,3 @@ def check_replicate_groups( ), "Replicate groups must be a list for the {op} operation".format( op=eval_metric ) - - -def set_grit_column_info(profile_col: str, replicate_group_col: str) -> dict: - r"""Transform column names to be used in calculating grit - - In calculating grit, the data must have a metadata feature describing the core - replicate perturbation (profile_col) and a separate metadata feature(s) describing - the larger group (replicate_group_col) that the perturbation belongs to (e.g. gene, - MOA). - - Parameters - ---------- - profile_col : str - the metadata column storing profile ids. The column can have unique or replicate - identifiers. - replicate_group_col : str - the metadata column indicating a higher order structure (group) than the - profile column. E.g. target gene vs. guide in a CRISPR experiment. 
- - Returns - ------- - dict - A nested dictionary of renamed columns indicating how to determine replicates - """ - # Identify column transform names - pair_ids = set_pair_ids() - - profile_id_with_suffix = [ - "{col}{suf}".format(col=profile_col, suf=pair_ids[x]["suffix"]) - for x in pair_ids - ] - - group_id_with_suffix = [ - "{col}{suf}".format(col=replicate_group_col, suf=pair_ids[x]["suffix"]) - for x in pair_ids - ] - - col_info = ["id", "comparison"] - profile_id_info = dict(zip(col_info, profile_id_with_suffix)) - group_id_info = dict(zip(col_info, group_id_with_suffix)) - - column_id_info = {"profile": profile_id_info, "group": group_id_info} - return column_id_info - - -def check_grit_replicate_summary_method(replicate_summary_method: str) -> None: - r"""Helper function to ensure that we support the user input replicate summary - - Parameters - ---------- - replicate_summary_method : str - The user input replicate summary method - - Returns - ------- - None - Assertion will fail if the user inputs an incorrect replicate summary method - """ - avail_methods = get_available_grit_summary_methods() - - if replicate_summary_method not in avail_methods: - raise ValueError( - "{input} method not supported, use one of: {avail}".format( - input=replicate_summary_method, avail=avail_methods - ) - ) From 0328cebb3b26397fb716520601e9c202fed960d4 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 11 May 2021 17:12:57 -0400 Subject: [PATCH 07/15] split util files into new test files --- .../test_utils/test_availability_utils.py | 34 +++++++++++ .../tests/test_utils/test_grit_utils.py | 54 ++++++++++++++++++ .../tests/test_utils/test_transform_utils.py | 56 +------------------ 3 files changed, 90 insertions(+), 54 deletions(-) create mode 100644 cytominer_eval/tests/test_utils/test_availability_utils.py create mode 100644 cytominer_eval/tests/test_utils/test_grit_utils.py diff --git a/cytominer_eval/tests/test_utils/test_availability_utils.py 
b/cytominer_eval/tests/test_utils/test_availability_utils.py new file mode 100644 index 0000000..b3e0568 --- /dev/null +++ b/cytominer_eval/tests/test_utils/test_availability_utils.py @@ -0,0 +1,34 @@ +import random +import pytest +import pathlib +import tempfile +import numpy as np +import pandas as pd +import pandas.api.types as ptypes + +from cytominer_eval.utils.availability_utils import ( + get_available_eval_metrics, + get_available_similarity_metrics, + get_available_grit_summary_methods, +) + + +def test_get_available_grit_summary_methods(): + expected_result = ["mean", "median"] + assert expected_result == get_available_grit_summary_methods() + + +def test_get_available_eval_metrics(): + expected_result = [ + "replicate_reproducibility", + "precision_recall", + "grit", + "mp_value", + "enrichment", + ] + assert expected_result == get_available_eval_metrics() + + +def test_get_available_similarity_metrics(): + expected_result = ["pearson", "kendall", "spearman"] + assert expected_result == get_available_similarity_metrics() diff --git a/cytominer_eval/tests/test_utils/test_grit_utils.py b/cytominer_eval/tests/test_utils/test_grit_utils.py new file mode 100644 index 0000000..f29b82d --- /dev/null +++ b/cytominer_eval/tests/test_utils/test_grit_utils.py @@ -0,0 +1,54 @@ +import random +import pytest +import pathlib +import tempfile +import numpy as np +import pandas as pd +import pandas.api.types as ptypes + +from cytominer_eval.utils.grit_utils import ( + set_grit_column_info, + check_grit_replicate_summary_method, +) + +from cytominer_eval.utils.availability_utils import get_available_grit_summary_methods + +random.seed(123) +tmpdir = tempfile.gettempdir() + +data_df = pd.DataFrame( + { + "float_a": np.random.normal(1, 1, 4), + "float_b": np.random.normal(1, 1, 4), + "string_a": ["a"] * 4, + "string_b": ["b"] * 4, + } +) +float_cols = ["float_a", "float_b"] + + +def test_check_grit_replicate_summary_method(): + + # Pass + for metric in 
get_available_grit_summary_methods(): + check_grit_replicate_summary_method(metric) + + with pytest.raises(ValueError) as ve: + output = check_grit_replicate_summary_method("fail") + assert "method not supported, use one of:" in str(ve.value) + + +def test_set_grit_column_info(): + profile_col = "test_replicate" + replicate_group_col = "test_group" + + result = set_grit_column_info( + profile_col=profile_col, replicate_group_col=replicate_group_col + ) + + assert result["profile"]["id"] == "{rep}_pair_a".format(rep=profile_col) + assert result["profile"]["comparison"] == "{rep}_pair_b".format(rep=profile_col) + assert result["group"]["id"] == "{group}_pair_a".format(group=replicate_group_col) + assert result["group"]["comparison"] == "{group}_pair_b".format( + group=replicate_group_col + ) diff --git a/cytominer_eval/tests/test_utils/test_transform_utils.py b/cytominer_eval/tests/test_utils/test_transform_utils.py index 53b0705..fa24067 100644 --- a/cytominer_eval/tests/test_utils/test_transform_utils.py +++ b/cytominer_eval/tests/test_utils/test_transform_utils.py @@ -6,20 +6,16 @@ import pandas as pd import pandas.api.types as ptypes -from cytominer_eval.transform.util import ( - get_available_eval_metrics, - get_available_similarity_metrics, - get_available_grit_summary_methods, +from cytominer_eval.utils.transform_utils import ( get_upper_matrix, convert_pandas_dtypes, assert_pandas_dtypes, set_pair_ids, - set_grit_column_info, assert_eval_metric, assert_melt, check_replicate_groups, - check_grit_replicate_summary_method, ) +from cytominer_eval.utils.availability_utils import get_available_eval_metrics random.seed(123) tmpdir = tempfile.gettempdir() @@ -35,27 +31,6 @@ float_cols = ["float_a", "float_b"] -def test_get_available_eval_metrics(): - expected_result = [ - "replicate_reproducibility", - "precision_recall", - "grit", - "mp_value", - "enrichment" - ] - assert expected_result == get_available_eval_metrics() - - -def 
test_get_available_similarity_metrics(): - expected_result = ["pearson", "kendall", "spearman"] - assert expected_result == get_available_similarity_metrics() - - -def test_get_available_grit_summary_methods(): - expected_result = ["mean", "median"] - assert expected_result == get_available_grit_summary_methods() - - def test_assert_eval_metric(): with pytest.raises(AssertionError) as ae: output = assert_eval_metric(eval_metric="NOT SUPPORTED") @@ -111,22 +86,6 @@ def test_set_pair_ids(): assert result[pair_b]["suffix"] == "_{pair_b}".format(pair_b=pair_b) -def test_set_grit_column_info(): - profile_col = "test_replicate" - replicate_group_col = "test_group" - - result = set_grit_column_info( - profile_col=profile_col, replicate_group_col=replicate_group_col - ) - - assert result["profile"]["id"] == "{rep}_pair_a".format(rep=profile_col) - assert result["profile"]["comparison"] == "{rep}_pair_b".format(rep=profile_col) - assert result["group"]["id"] == "{group}_pair_a".format(group=replicate_group_col) - assert result["group"]["comparison"] == "{group}_pair_b".format( - group=replicate_group_col - ) - - def test_check_replicate_groups(): available_metrics = get_available_eval_metrics() @@ -174,14 +133,3 @@ def test_check_replicate_groups(): eval_metric="grit", replicate_groups=wrong_group_dict ) assert "replicate_groups for grit not formed properly." 
in str(ae.value) - - -def test_check_grit_replicate_summary_method(): - - # Pass - for metric in get_available_grit_summary_methods(): - check_grit_replicate_summary_method(metric) - - with pytest.raises(ValueError) as ve: - output = check_grit_replicate_summary_method("fail") - assert "method not supported, use one of:" in str(ve.value) From e7aa8ed998f14c8d4448564c2ea90815da5ef55a Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Wed, 12 May 2021 14:13:16 -0400 Subject: [PATCH 08/15] fix assertion tolerance --- cytominer_eval/tests/test_utils/test_operation_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cytominer_eval/tests/test_utils/test_operation_utils.py b/cytominer_eval/tests/test_utils/test_operation_utils.py index 4d478a7..58ffd6f 100644 --- a/cytominer_eval/tests/test_utils/test_operation_utils.py +++ b/cytominer_eval/tests/test_utils/test_operation_utils.py @@ -119,7 +119,7 @@ def test_calculate_precision_recall(): expected_result = {"k": 10, "precision": 0.4, "recall": 0.1333} expected_result = pd.DataFrame(expected_result, index=["result"]).transpose() - assert_frame_equal(result, expected_result) + assert_frame_equal(result, expected_result, rtol=1e-3) # Check that recall is 1 when k is maximized result = pd.DataFrame( From c730a0b334b89e22d6a15253394a20660c5617b2 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Wed, 12 May 2021 14:37:22 -0400 Subject: [PATCH 09/15] fix np builtin type deprecation in 1.2 --- cytominer_eval/evaluate.py | 2 +- .../operations/replicate_reproducibility.py | 4 ++-- .../tests/test_utils/test_transform_utils.py | 8 ++++---- cytominer_eval/transform/transform.py | 8 ++++---- cytominer_eval/utils/transform_utils.py | 12 ++++++------ 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/cytominer_eval/evaluate.py b/cytominer_eval/evaluate.py index b7f2e37..166cdfb 100644 --- a/cytominer_eval/evaluate.py +++ b/cytominer_eval/evaluate.py @@ -25,7 +25,7 @@ def evaluate( replicate_groups: 
Union[List[str], dict], operation: str = "replicate_reproducibility", similarity_metric: str = "pearson", - replicate_reproducibility_quantile: np.float = 0.95, + replicate_reproducibility_quantile: float = 0.95, replicate_reproducibility_return_median_cor: bool = False, precision_recall_k: Union[int, List[int]] = 10, grit_control_perts: List[str] = ["None"], diff --git a/cytominer_eval/operations/replicate_reproducibility.py b/cytominer_eval/operations/replicate_reproducibility.py index c537e45..098e4c9 100644 --- a/cytominer_eval/operations/replicate_reproducibility.py +++ b/cytominer_eval/operations/replicate_reproducibility.py @@ -12,9 +12,9 @@ def replicate_reproducibility( similarity_melted_df: pd.DataFrame, replicate_groups: List[str], - quantile_over_null: np.float = 0.95, + quantile_over_null: float = 0.95, return_median_correlations: bool = False, -) -> np.float: +) -> float: r"""Summarize pairwise replicate correlations For a given pairwise similarity matrix, replicate information, and specific options, diff --git a/cytominer_eval/tests/test_utils/test_transform_utils.py b/cytominer_eval/tests/test_utils/test_transform_utils.py index fa24067..d0de05d 100644 --- a/cytominer_eval/tests/test_utils/test_transform_utils.py +++ b/cytominer_eval/tests/test_utils/test_transform_utils.py @@ -53,7 +53,7 @@ def test_convert_pandas_dtypes(): data_string_type_df = data_df.astype(str) output_df = convert_pandas_dtypes( - data_string_type_df.loc[:, float_cols], col_fix=np.float64 + data_string_type_df.loc[:, float_cols], col_fix=float ) assert all([ptypes.is_numeric_dtype(output_df[x]) for x in output_df.columns]) @@ -65,12 +65,12 @@ def test_assert_pandas_dtypes(): with pytest.raises(AssertionError) as ve: output = assert_pandas_dtypes(data_df, col_fix="not supported") - assert "Only np.str and np.float64 are supported" in str(ve.value) + assert "Only str and float are supported" in str(ve.value) - output_df = assert_pandas_dtypes(data_df, col_fix=np.str) + output_df 
= assert_pandas_dtypes(data_df, col_fix=str) all([ptypes.is_string_dtype(output_df[x]) for x in output_df.columns]) - output_df = convert_pandas_dtypes(output_df.loc[:, float_cols], col_fix=np.float64) + output_df = convert_pandas_dtypes(output_df.loc[:, float_cols], col_fix=float) assert all([ptypes.is_numeric_dtype(output_df[x]) for x in output_df.columns]) diff --git a/cytominer_eval/transform/transform.py b/cytominer_eval/transform/transform.py index a24c10a..2555ed8 100644 --- a/cytominer_eval/transform/transform.py +++ b/cytominer_eval/transform/transform.py @@ -15,7 +15,7 @@ def get_pairwise_metric(df: pd.DataFrame, similarity_metric: str) -> pd.DataFrame: - df = assert_pandas_dtypes(df=df, col_fix=np.float64) + df = assert_pandas_dtypes(df=df, col_fix=float) assert ( similarity_metric in available_pairwise_similarity_metrics @@ -108,9 +108,9 @@ def metric_melt( meta_df = df.loc[:, metadata_features] df = df.loc[:, features] - # Convert and assert conversion success - meta_df = assert_pandas_dtypes(df=meta_df, col_fix=np.str) - df = assert_pandas_dtypes(df=df, col_fix=np.float64) + # Convert pandas column types and assert conversion success + meta_df = assert_pandas_dtypes(df=meta_df, col_fix=str) + df = assert_pandas_dtypes(df=df, col_fix=float) # Get pairwise metric matrix pair_df = get_pairwise_metric(df=df, similarity_metric=similarity_metric) diff --git a/cytominer_eval/utils/transform_utils.py b/cytominer_eval/utils/transform_utils.py index 04cea1a..66191da 100644 --- a/cytominer_eval/utils/transform_utils.py +++ b/cytominer_eval/utils/transform_utils.py @@ -26,14 +26,14 @@ def get_upper_matrix(df: pd.DataFrame) -> np.array: return np.triu(np.ones(df.shape), k=1).astype(bool) -def convert_pandas_dtypes(df: pd.DataFrame, col_fix: type = np.float64) -> pd.DataFrame: +def convert_pandas_dtypes(df: pd.DataFrame, col_fix: type = float) -> pd.DataFrame: r"""Helper funtion to convert pandas column dtypes Parameters ---------- df : pandas.DataFrame A pandas 
dataframe to convert columns - col_fix : {np.float64, np.str}, optional + col_fix : {float, str}, optional A column type to convert the input dataframe. Returns @@ -53,14 +53,14 @@ def convert_pandas_dtypes(df: pd.DataFrame, col_fix: type = np.float64) -> pd.Da return df -def assert_pandas_dtypes(df: pd.DataFrame, col_fix: type = np.float64) -> pd.DataFrame: +def assert_pandas_dtypes(df: pd.DataFrame, col_fix: type = float) -> pd.DataFrame: r"""Helper funtion to ensure pandas columns have compatible columns Parameters ---------- df : pandas.DataFrame A pandas dataframe to convert columns - col_fix : {np.float64, np.str}, optional + col_fix : {float, str}, optional A column type to convert the input dataframe. Returns @@ -68,7 +68,7 @@ def assert_pandas_dtypes(df: pd.DataFrame, col_fix: type = np.float64) -> pd.Dat pd.DataFrame A dataframe with converted columns """ - assert col_fix in [np.str, np.float64], "Only np.str and np.float64 are supported" + assert col_fix in [str, float], "Only str and float are supported" df = convert_pandas_dtypes(df=df, col_fix=col_fix) @@ -76,7 +76,7 @@ def assert_pandas_dtypes(df: pd.DataFrame, col_fix: type = np.float64) -> pd.Dat if col_fix == np.str: assert all([ptypes.is_string_dtype(df[x]) for x in df.columns]), assert_error - if col_fix == np.float64: + if col_fix == float: assert all([ptypes.is_numeric_dtype(df[x]) for x in df.columns]), assert_error return df From 03c09ee03b48fa846b1cb0edb996324bbbc8fdd1 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Wed, 12 May 2021 14:46:45 -0400 Subject: [PATCH 10/15] revert back to deprecated check_less_precise --- cytominer_eval/tests/test_utils/test_operation_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cytominer_eval/tests/test_utils/test_operation_utils.py b/cytominer_eval/tests/test_utils/test_operation_utils.py index 58ffd6f..276070b 100644 --- a/cytominer_eval/tests/test_utils/test_operation_utils.py +++ 
b/cytominer_eval/tests/test_utils/test_operation_utils.py @@ -119,7 +119,7 @@ def test_calculate_precision_recall(): expected_result = {"k": 10, "precision": 0.4, "recall": 0.1333} expected_result = pd.DataFrame(expected_result, index=["result"]).transpose() - assert_frame_equal(result, expected_result, rtol=1e-3) + assert_frame_equal(result, expected_result, check_less_precise=True) # Check that recall is 1 when k is maximized result = pd.DataFrame( From bdb9c7882f3a603c76e9b101427ddcd1c8758429 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 18 May 2021 13:36:19 -0400 Subject: [PATCH 11/15] add util init file --- cytominer_eval/utils/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 cytominer_eval/utils/__init__.py diff --git a/cytominer_eval/utils/__init__.py b/cytominer_eval/utils/__init__.py new file mode 100644 index 0000000..e69de29 From ec01949bf895df77c32302983204eaa5b4a2f78e Mon Sep 17 00:00:00 2001 From: Greg Way Date: Tue, 18 May 2021 13:48:50 -0400 Subject: [PATCH 12/15] Update cytominer_eval/tests/test_operations/test_mp_value.py Co-authored-by: Michael Bornholdt <56402523+michaelbornholdt@users.noreply.github.com> --- cytominer_eval/tests/test_operations/test_mp_value.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cytominer_eval/tests/test_operations/test_mp_value.py b/cytominer_eval/tests/test_operations/test_mp_value.py index 00fb2bc..0f5b04f 100644 --- a/cytominer_eval/tests/test_operations/test_mp_value.py +++ b/cytominer_eval/tests/test_operations/test_mp_value.py @@ -13,6 +13,7 @@ calculate_mahalanobis, ) + # Load CRISPR dataset example_file = "SQ00014610_normalized_feature_select.csv.gz" example_file = pathlib.Path( From e450cfff73c378c376ead4529a34189531b11b10 Mon Sep 17 00:00:00 2001 From: Greg Way Date: Tue, 18 May 2021 13:49:22 -0400 Subject: [PATCH 13/15] Update cytominer_eval/tests/test_utils/test_availability_utils.py Co-authored-by: Michael Bornholdt 
<56402523+michaelbornholdt@users.noreply.github.com> --- cytominer_eval/tests/test_utils/test_availability_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cytominer_eval/tests/test_utils/test_availability_utils.py b/cytominer_eval/tests/test_utils/test_availability_utils.py index b3e0568..e179b8c 100644 --- a/cytominer_eval/tests/test_utils/test_availability_utils.py +++ b/cytominer_eval/tests/test_utils/test_availability_utils.py @@ -6,6 +6,7 @@ import pandas as pd import pandas.api.types as ptypes + from cytominer_eval.utils.availability_utils import ( get_available_eval_metrics, get_available_similarity_metrics, From 16df0c712068cd407afe44012a69c40fb10dcc76 Mon Sep 17 00:00:00 2001 From: Greg Way Date: Tue, 18 May 2021 13:49:49 -0400 Subject: [PATCH 14/15] Update cytominer_eval/tests/test_utils/test_grit_utils.py Co-authored-by: Michael Bornholdt <56402523+michaelbornholdt@users.noreply.github.com> --- cytominer_eval/tests/test_utils/test_grit_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cytominer_eval/tests/test_utils/test_grit_utils.py b/cytominer_eval/tests/test_utils/test_grit_utils.py index f29b82d..163a92a 100644 --- a/cytominer_eval/tests/test_utils/test_grit_utils.py +++ b/cytominer_eval/tests/test_utils/test_grit_utils.py @@ -13,6 +13,7 @@ from cytominer_eval.utils.availability_utils import get_available_grit_summary_methods + random.seed(123) tmpdir = tempfile.gettempdir() From 5d2215ca165a8fd3191253cd8411c7b8c480d63a Mon Sep 17 00:00:00 2001 From: Greg Way Date: Tue, 18 May 2021 13:54:00 -0400 Subject: [PATCH 15/15] Apply suggestions from code review Fixing spaces after function import, docstring consistency, and a missing numpy builtin update Co-authored-by: Michael Bornholdt <56402523+michaelbornholdt@users.noreply.github.com> --- cytominer_eval/tests/test_utils/test_operation_utils.py | 1 + cytominer_eval/tests/test_utils/test_transform_utils.py | 1 + cytominer_eval/utils/grit_utils.py | 4 ++-- 
cytominer_eval/utils/transform_utils.py | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cytominer_eval/tests/test_utils/test_operation_utils.py b/cytominer_eval/tests/test_utils/test_operation_utils.py index 276070b..87c771a 100644 --- a/cytominer_eval/tests/test_utils/test_operation_utils.py +++ b/cytominer_eval/tests/test_utils/test_operation_utils.py @@ -12,6 +12,7 @@ from cytominer_eval.utils.operation_utils import assign_replicates from cytominer_eval.utils.precisionrecall_utils import calculate_precision_recall + random.seed(123) tmpdir = tempfile.gettempdir() diff --git a/cytominer_eval/tests/test_utils/test_transform_utils.py b/cytominer_eval/tests/test_utils/test_transform_utils.py index d0de05d..4c64f68 100644 --- a/cytominer_eval/tests/test_utils/test_transform_utils.py +++ b/cytominer_eval/tests/test_utils/test_transform_utils.py @@ -17,6 +17,7 @@ ) from cytominer_eval.utils.availability_utils import get_available_eval_metrics + random.seed(123) tmpdir = tempfile.gettempdir() diff --git a/cytominer_eval/utils/grit_utils.py b/cytominer_eval/utils/grit_utils.py index 33a5256..b73b867 100644 --- a/cytominer_eval/utils/grit_utils.py +++ b/cytominer_eval/utils/grit_utils.py @@ -102,7 +102,7 @@ def get_grit_entry(df: pd.DataFrame, col: str) -> str: def set_grit_column_info(profile_col: str, replicate_group_col: str) -> dict: - r"""Transform column names to be used in calculating grit + """Transform column names to be used in calculating grit In calculating grit, the data must have a metadata feature describing the core replicate perturbation (profile_col) and a separate metadata feature(s) describing @@ -145,7 +145,7 @@ def set_grit_column_info(profile_col: str, replicate_group_col: str) -> dict: def check_grit_replicate_summary_method(replicate_summary_method: str) -> None: - r"""Helper function to ensure that we support the user input replicate summary + """Helper function to ensure that we support the user input replicate summary 
Parameters ---------- diff --git a/cytominer_eval/utils/transform_utils.py b/cytominer_eval/utils/transform_utils.py index 66191da..3d7e679 100644 --- a/cytominer_eval/utils/transform_utils.py +++ b/cytominer_eval/utils/transform_utils.py @@ -73,7 +73,7 @@ def assert_pandas_dtypes(df: pd.DataFrame, col_fix: type = float) -> pd.DataFram df = convert_pandas_dtypes(df=df, col_fix=col_fix) assert_error = "Columns not successfully updated, is the dataframe consistent?" - if col_fix == np.str: + if col_fix == str: assert all([ptypes.is_string_dtype(df[x]) for x in df.columns]), assert_error if col_fix == float: