Skip to content

Commit

Permalink
Merge pull request #362 from MannLabs/remove_index_column
Browse files Browse the repository at this point in the history
Remove index column
  • Loading branch information
mschwoer authored Nov 14, 2024
2 parents 0f4694f + 48c8f7e commit c5f9fbb
Show file tree
Hide file tree
Showing 16 changed files with 82 additions and 65 deletions.
16 changes: 7 additions & 9 deletions alphastats/DataSet.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@

from alphastats import BaseLoader
from alphastats.dataset_factory import DataSetFactory
from alphastats.dataset_harmonizer import DataHarmonizer
from alphastats.DataSet_Plot import Plot
from alphastats.DataSet_Preprocess import Preprocess
from alphastats.DataSet_Statistics import Statistics
from alphastats.keys import Cols
from alphastats.plots.ClusterMap import ClusterMap
from alphastats.plots.DimensionalityReduction import DimensionalityReduction
from alphastats.plots.IntensityPlot import IntensityPlot
Expand Down Expand Up @@ -63,9 +65,11 @@ def __init__(
self._check_loader(loader=loader)

# fill data from loader
self.rawinput: pd.DataFrame = loader.rawinput
self.rawinput: pd.DataFrame = DataHarmonizer(loader).get_harmonized_rawinput(
loader.rawinput
)
self.filter_columns: List[str] = loader.filter_columns
self.index_column: str = loader.index_column

self.software: str = loader.software
self._gene_names: str = loader.gene_names

Expand All @@ -81,7 +85,6 @@ def __init__(

self._dataset_factory = DataSetFactory(
rawinput=self.rawinput,
index_column=self.index_column,
intensity_column=self._intensity_column,
metadata_path_or_df=metadata_path_or_df,
sample_column=sample_column,
Expand All @@ -100,7 +103,7 @@ def __init__(
for k, v in dict(
zip(
self.rawinput[self._gene_names].tolist(),
self.rawinput[self.index_column].tolist(),
self.rawinput[Cols.INDEX].tolist(),
)
).items()
if isinstance(k, str) # avoid having NaN as key
Expand Down Expand Up @@ -155,7 +158,6 @@ def _get_preprocess(self) -> Preprocess:
return Preprocess(
self.filter_columns,
self.rawinput,
self.index_column,
self.sample,
self.metadata,
self.preprocessing_info,
Expand Down Expand Up @@ -206,7 +208,6 @@ def _get_statistics(self) -> Statistics:
return Statistics(
mat=self.mat,
metadata=self.metadata,
index_column=self.index_column,
sample=self.sample,
preprocessing_info=self.preprocessing_info,
)
Expand Down Expand Up @@ -239,7 +240,6 @@ def tukey_test(self, protein_id: str, group: str) -> pd.DataFrame:
df,
protein_id,
group,
self.index_column,
)

def anova(self, column: str, protein_ids="all", tukey: bool = True) -> pd.DataFrame:
Expand Down Expand Up @@ -400,7 +400,6 @@ def plot_volcano(
rawinput=self.rawinput,
metadata=self.metadata,
sample=self.sample,
index_column=self.index_column,
gene_names=self._gene_names,
preprocessing_info=self.preprocessing_info,
group1=group1,
Expand Down Expand Up @@ -523,7 +522,6 @@ def plot_clustermap(
mat=self.mat,
metadata=self.metadata,
sample=self.sample,
index_column=self.index_column,
preprocessing_info=self.preprocessing_info,
label_bar=label_bar,
only_significant=only_significant,
Expand Down
5 changes: 2 additions & 3 deletions alphastats/DataSet_Preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import streamlit as st
from sklearn.experimental import enable_iterative_imputer # noqa

from alphastats.keys import Cols
from alphastats.utils import ignore_warning


Expand Down Expand Up @@ -45,7 +46,6 @@ def __init__(
self,
filter_columns: List[str],
rawinput: pd.DataFrame,
index_column: str,
sample: str,
metadata: pd.DataFrame,
preprocessing_info: Dict,
Expand All @@ -54,7 +54,6 @@ def __init__(
self.filter_columns = filter_columns

self.rawinput = rawinput
self.index_column = index_column
self.sample = sample

self.metadata = metadata
Expand Down Expand Up @@ -157,7 +156,7 @@ def _filter(self):
# print column names with contamination
protein_groups_to_remove = self.rawinput[
self.rawinput[self.filter_columns].any(axis=1)
][self.index_column].tolist()
][Cols.INDEX].tolist()

protein_groups_to_remove = list(
set(protein_groups_to_remove) & set(self.mat.columns.to_list())
Expand Down
4 changes: 0 additions & 4 deletions alphastats/DataSet_Statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,11 @@ def __init__(
*,
mat: pd.DataFrame,
metadata: pd.DataFrame,
index_column: str,
sample: str,
preprocessing_info: Dict,
):
self.mat: pd.DataFrame = mat
self.metadata: pd.DataFrame = metadata
self.index_column: str = index_column
self.sample: str = sample
self.preprocessing_info: Dict = preprocessing_info

Expand Down Expand Up @@ -62,7 +60,6 @@ def diff_expression_analysis(
df = DifferentialExpressionAnalysis(
mat=self.mat,
metadata=self.metadata,
index_column=self.index_column,
sample=self.sample,
preprocessing_info=self.preprocessing_info,
group1=group1,
Expand Down Expand Up @@ -93,7 +90,6 @@ def anova(self, column: str, protein_ids="all", tukey: bool = True) -> pd.DataFr
mat=self.mat,
metadata=self.metadata,
sample=self.sample,
index_column=self.index_column,
column=column,
protein_ids=protein_ids,
tukey=tukey,
Expand Down
6 changes: 3 additions & 3 deletions alphastats/dataset_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import numpy as np
import pandas as pd

from alphastats.keys import Cols


class DataSetFactory:
"""Create all 'heavy' data structures of a DataSet."""
Expand All @@ -13,22 +15,20 @@ def __init__(
self,
*,
rawinput: pd.DataFrame,
index_column: str,
intensity_column: Union[List[str], str],
metadata_path_or_df: Union[str, pd.DataFrame],
sample_column: str,
):
self.rawinput: pd.DataFrame = rawinput
self.sample_column: str = sample_column
self.index_column: str = index_column
self.intensity_column: Union[List[str], str] = intensity_column
self.metadata_path_or_df: Union[str, pd.DataFrame] = metadata_path_or_df

def create_matrix_from_rawinput(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""Creates a matrix: features (Proteins) as columns, samples as rows."""

df = self.rawinput
df = df.set_index(self.index_column)
df = df.set_index(Cols.INDEX)

if isinstance(self.intensity_column, str):
regex_find_intensity_columns = self.intensity_column.replace(
Expand Down
26 changes: 26 additions & 0 deletions alphastats/dataset_harmonizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Harmonize the input data to a common format."""

import pandas as pd

from alphastats import BaseLoader
from alphastats.keys import Cols


class DataHarmonizer:
    """Rename loader-specific columns of the raw input to the shared ``Cols`` names."""

    def __init__(self, loader: BaseLoader):
        # Maps "column name as delivered by the loader" -> "internal column name".
        # Only the index column is mapped here.
        self._rename_dict = {loader.index_column: Cols.INDEX}

    def get_harmonized_rawinput(self, rawinput: pd.DataFrame) -> pd.DataFrame:
        """Return ``rawinput`` with its columns renamed according to the rename mapping.

        Raises:
            ValueError: if ``rawinput`` already contains a column carrying one
                of the target names.
            KeyError: raised by ``DataFrame.rename(errors="raise")`` when a
                source column of the mapping is missing from ``rawinput``.
        """
        present_columns = rawinput.columns
        clashing_targets = [
            new_name
            for new_name in self._rename_dict.values()
            if new_name in present_columns
        ]
        if clashing_targets:
            # Report the first clash, in mapping order.
            raise ValueError(
                f"Column name {clashing_targets[0]} already exists in rawinput. Please rename the column."
            )

        # rename() hands back a new DataFrame object.
        harmonized = rawinput.rename(columns=self._rename_dict, errors="raise")
        return harmonized
1 change: 0 additions & 1 deletion alphastats/gui/utils/analysis_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,6 @@ def gui_volcano_plot() -> Tuple[Optional[Any], Optional[Any], Optional[Dict]]:
rawinput=dataset.rawinput,
metadata=dataset.metadata,
sample=dataset.sample,
index_column=dataset.index_column,
gene_names=dataset._gene_names,
preprocessing_info=dataset.preprocessing_info,
**parameters,
Expand Down
5 changes: 4 additions & 1 deletion alphastats/gui/utils/import_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ def _check_softwarefile_df(df: pd.DataFrame, software: str) -> None:
Can be fragile when different settings are used or software is updated.
"""
# TODO this needs to go to the loader

if software == "MaxQuant":
expected_columns = ["Protein IDs", "Reverse", "Potential contaminant"]
Expand Down Expand Up @@ -241,7 +242,9 @@ def get_sample_names_from_software_file(loader: BaseLoader) -> List[str]:
"""
extract sample names from software
"""
if isinstance(loader.intensity_column, str):
if isinstance(
loader.intensity_column, str
): # TODO duplicated logic in MaxQuantLoader
regex_find_intensity_columns = loader.intensity_column.replace("[sample]", ".*")
df = loader.rawinput
df = df.set_index(loader.index_column)
Expand Down
7 changes: 7 additions & 0 deletions alphastats/keys.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""String constants for accessing columns."""


class Cols:
    """Namespace of string constants naming columns of the main dataframe in DataSet.

    Reference these attributes instead of hard-coding the column names.
    """

    # Name of the column holding the index values.
    INDEX = "index_"
2 changes: 0 additions & 2 deletions alphastats/multicova/multicova.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,8 +354,6 @@ def perform_ttest_analysis(
s0=1,
n_perm=2,
fdr=0.01,
id_col="Genes",
plot_fdr_line=False,
parallelize=False,
):
"""
Expand Down
6 changes: 2 additions & 4 deletions alphastats/plots/ClusterMap.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import seaborn as sns

from alphastats.DataSet_Statistics import Statistics
from alphastats.keys import Cols
from alphastats.plots.PlotUtils import PlotUtils


Expand All @@ -15,7 +16,6 @@ def __init__(
mat: pd.DataFrame,
metadata: pd.DataFrame,
sample: str,
index_column: str,
preprocessing_info: Dict,
label_bar,
only_significant,
Expand All @@ -25,13 +25,11 @@ def __init__(
self.mat: pd.DataFrame = mat
self.metadata: pd.DataFrame = metadata
self.sample: str = sample
self.index_column: str = index_column
self.preprocessing_info: Dict = preprocessing_info

self._statistics = Statistics(
mat=self.mat,
metadata=self.metadata,
index_column=self.index_column,
sample=self.sample,
preprocessing_info=self.preprocessing_info,
)
Expand Down Expand Up @@ -61,7 +59,7 @@ def _prepare_df(self):
if self.only_significant and self.group is not None:
anova_df = self._statistics.anova(column=self.group, tukey=False)
significant_proteins = anova_df[anova_df["ANOVA_pvalue"] < 0.05][
self.index_column
Cols.INDEX
].to_list()
df = df[significant_proteins] # TODO bug?

Expand Down
24 changes: 10 additions & 14 deletions alphastats/plots/VolcanoPlot.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from alphastats.DataSet_Preprocess import PreprocessingStateKeys
from alphastats.DataSet_Statistics import Statistics
from alphastats.keys import Cols
from alphastats.multicova import multicova
from alphastats.plots.PlotUtils import PlotUtils, plotly_object
from alphastats.statistics.DifferentialExpressionAnalysis import (
Expand Down Expand Up @@ -49,7 +50,6 @@ def __init__(
rawinput: pd.DataFrame,
metadata: pd.DataFrame,
sample: str,
index_column: str,
gene_names: str,
preprocessing_info: Dict,
group1: Union[List[str], str],
Expand All @@ -70,7 +70,6 @@ def __init__(
self.rawinput = rawinput
self.metadata: pd.DataFrame = metadata
self.sample: str = sample
self.index_column: str = index_column
self.gene_names: str = gene_names
self.preprocessing_info: Dict = preprocessing_info

Expand Down Expand Up @@ -103,7 +102,6 @@ def __init__(
mat=self.mat,
metadata=self.metadata,
sample=self.sample,
index_column=self.index_column,
preprocessing_info=self.preprocessing_info,
)

Expand Down Expand Up @@ -143,7 +141,6 @@ def _perform_differential_expression_analysis(
res, tlim_ttest = DifferentialExpressionAnalysis(
mat=self.mat,
metadata=self.metadata,
index_column=self.index_column,
sample=self.sample,
preprocessing_info=self.preprocessing_info,
group1=self.group1,
Expand Down Expand Up @@ -213,27 +210,29 @@ def _anova(self) -> Tuple[pd.DataFrame, str]:
group2_samples,
self.preprocessing_info[PreprocessingStateKeys.LOG2_TRANSFORMED],
)
fc_df = pd.DataFrame({"log2fc": fc, self.index_column: mat_transpose.index})
fc_df = pd.DataFrame({"log2fc": fc, Cols.INDEX: mat_transpose.index})

# check how column is ordered
pvalue_column = self.group1 + " vs. " + self.group2 + " Tukey Test"

if pvalue_column not in result_df.columns:
pvalue_column = self.group2 + " vs. " + self.group1 + " Tukey Test"

res = result_df.reset_index().merge(fc_df.reset_index(), on=self.index_column)
res = result_df.reset_index().merge(fc_df.reset_index(), on=Cols.INDEX)

return res, pvalue_column

def _add_hover_data_columns(self):
# additional labeling with gene names
self.hover_data = [self.index_column]
self.hover_data = [
Cols.INDEX
] # TODO this now shows the internal column name as description

if self.gene_names is not None:
self.res = pd.merge(
self.res,
self.rawinput[[self.gene_names, self.index_column]],
on=self.index_column,
self.rawinput[[self.gene_names, Cols.INDEX]],
on=Cols.INDEX,
how="left",
)
self.hover_data.append(self.gene_names)
Expand Down Expand Up @@ -269,7 +268,7 @@ def _annotate_result_df(self):

if len(self.color_list) > 0:
self.res["color"] = np.where(
self.res[self.index_column].isin(self.color_list),
self.res[Cols.INDEX].isin(self.color_list),
"color",
"no_color",
)
Expand All @@ -279,10 +278,7 @@ def _add_labels_plot(self):
add gene names as hover data if they are given
"""

if self.gene_names is not None:
label_column = self.gene_names
else:
label_column = self.index_column
label_column = self.gene_names if self.gene_names is not None else Cols.INDEX

self.res["label"] = np.where(
self.res.color != "non_sig", self.res[label_column], ""
Expand Down
Loading

0 comments on commit c5f9fbb

Please sign in to comment.