From f713b7b6211c386f8981d6519496ee5a70b99f5c Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Tue, 6 Feb 2024 12:21:28 -0800 Subject: [PATCH] Sort features on sample abundances --- bin/bin_metagenomes.py | 61 +++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/bin/bin_metagenomes.py b/bin/bin_metagenomes.py index 6a6b106..2cd4122 100755 --- a/bin/bin_metagenomes.py +++ b/bin/bin_metagenomes.py @@ -676,17 +676,21 @@ def log_scale(df: pd.DataFrame): lowest = df.apply(lambda c: c[c > 0].min()).min() return df.clip(lower=lowest).apply(np.log10) - @staticmethod - def sort_index(df: pd.DataFrame): - return df.index.values[ - hierarchy.leaves_list( - hierarchy.linkage( - df.values, - metric="cosine", - method="average" + def sort_index(self, df: pd.DataFrame, metric="cosine", method="average"): + try: + return df.index.values[ + hierarchy.leaves_list( + hierarchy.linkage( + df.values, + metric=metric, + method=method + ) ) - ) - ] + ] + except Exception as e: + logger.info("Error encountered while sorting table:") + self.log_df(df) + raise e def write_image( self, @@ -714,9 +718,32 @@ def write_image( row_heights = np.array([0.5, heatmap_size, 1, heatmap_size, 1, 1, 1]) row_heights = list(row_heights / row_heights.sum()) + # Genomes across samples + genomes_df: pd.DataFrame = ( + self.data + .mod["genomes"] + .to_df("prop") + ) + + # Bins across samples + bins_df: pd.DataFrame = ( + self.data + .mod["bins"] + .to_df("prop") + ) + # Sort the bins and genomes - bin_order = self.sort_index(self.data.uns["group_profile"]) - genome_order = self.sort_index(self.data.uns["group_profile"].T) + bin_order = self.sort_index( + bins_df.T, + metric="euclidean", + method="ward" + ) + + genome_order = self.sort_index( + genomes_df.T, + metric="euclidean", + method="ward" + ) cols = 6 rows = 7 @@ -771,11 +798,6 @@ def write_image( ) # Genomes across samples - genomes_df: pd.DataFrame = ( - self.data - .mod["genomes"] - .to_df("prop") - ) sample_order = self.sort_index(genomes_df) genomes_df = genomes_df.reindex( columns=genome_order, @@ -884,11 +906,6 @@ def write_image( ) # Bins across samples - bins_df: pd.DataFrame = ( - self.data - .mod["bins"] - .to_df("prop") - ) sample_order = self.sort_index(bins_df) bins_df = bins_df.reindex( columns=bin_order,