Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove sample column #364

Merged
merged 16 commits into from
Nov 14, 2024
36 changes: 13 additions & 23 deletions alphastats/DataSet.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,19 +64,20 @@ def __init__(
"""
self._check_loader(loader=loader)

self._data_harmonizer = DataHarmonizer(loader, sample_column)
mschwoer marked this conversation as resolved.
Show resolved Hide resolved

# fill data from loader
self.rawinput: pd.DataFrame = DataHarmonizer(loader).get_harmonized_rawinput(
self.rawinput: pd.DataFrame = self._data_harmonizer.get_harmonized_rawinput(
loader.rawinput
)
self.filter_columns: List[str] = loader.filter_columns

self.software: str = loader.software

self._intensity_column: Union[str, list] = (
loader._extract_sample_names(
metadata=self.metadata, sample_column=self.sample
metadata=self.metadata, sample_column=sample_column
)
if loader == "Generic"
if loader
== "Generic" # TODO is this ever the case? Should this rather be isinstance(loader, GenericLoader)?
else loader.intensity_column
)

Expand All @@ -86,14 +87,13 @@ def __init__(
rawinput=self.rawinput,
intensity_column=self._intensity_column,
metadata_path_or_df=metadata_path_or_df,
sample_column=sample_column,
data_harmonizer=self._data_harmonizer,
)

rawmat, mat, metadata, sample, preprocessing_info = self._get_init_dataset()
rawmat, mat, metadata, preprocessing_info = self._get_init_dataset()
self.rawmat: pd.DataFrame = rawmat
self.mat: pd.DataFrame = mat
self.metadata: pd.DataFrame = metadata
self.sample: str = sample
self.preprocessing_info: Dict = preprocessing_info

self._gene_name_to_protein_id_map = (
Expand All @@ -115,11 +115,11 @@ def __init__(

def _get_init_dataset(
self,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str, Dict]:
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, Dict]:
"""Get the initial data structure for the DataSet."""
rawmat, mat = self._dataset_factory.create_matrix_from_rawinput()

metadata, sample = self._dataset_factory.create_metadata(mat)
metadata = self._dataset_factory.create_metadata(mat)

preprocessing_info = Preprocess.init_preprocessing_info(
num_samples=mat.shape[0],
Expand All @@ -128,7 +128,7 @@ def _get_init_dataset(
filter_columns=self.filter_columns,
)

return rawmat, mat, metadata, sample, preprocessing_info
return rawmat, mat, metadata, preprocessing_info

def _check_loader(self, loader):
"""Checks if the Loader is from class AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader
Expand Down Expand Up @@ -157,7 +157,6 @@ def _get_preprocess(self) -> Preprocess:
return Preprocess(
self.filter_columns,
self.rawinput,
self.sample,
self.metadata,
self.preprocessing_info,
self.mat,
Expand Down Expand Up @@ -194,7 +193,6 @@ def reset_preprocessing(self):
self.rawmat,
self.mat,
self.metadata,
self.sample,
self.preprocessing_info,
) = self._get_init_dataset()

Expand All @@ -207,7 +205,6 @@ def _get_statistics(self) -> Statistics:
return Statistics(
mat=self.mat,
metadata=self.metadata,
sample=self.sample,
preprocessing_info=self.preprocessing_info,
)

Expand All @@ -232,8 +229,8 @@ def diff_expression_analysis(

def tukey_test(self, protein_id: str, group: str) -> pd.DataFrame:
"""A wrapper for tukey_test.tukey_test(), see documentation there."""
df = self.mat[[protein_id]].reset_index().rename(columns={"index": self.sample})
df = df.merge(self.metadata, how="inner", on=[self.sample])
df = self.mat[[protein_id]].reset_index().rename(columns={"index": Cols.SAMPLE})
df = df.merge(self.metadata, how="inner", on=[Cols.SAMPLE])

return tukey_test(
df,
Expand Down Expand Up @@ -265,7 +262,6 @@ def plot_pca(self, group: Optional[str] = None, circle: bool = False):
dimensionality_reduction = DimensionalityReduction(
mat=self.mat,
metadata=self.metadata,
sample=self.sample,
preprocessing_info=self.preprocessing_info,
group=group,
circle=circle,
Expand Down Expand Up @@ -293,7 +289,6 @@ def plot_tsne(
dimensionality_reduction = DimensionalityReduction(
mat=self.mat,
metadata=self.metadata,
sample=self.sample,
preprocessing_info=self.preprocessing_info,
group=group,
method="tsne",
Expand All @@ -317,7 +312,6 @@ def plot_umap(self, group: Optional[str] = None, circle: bool = False):
dimensionality_reduction = DimensionalityReduction(
mat=self.mat,
metadata=self.metadata,
sample=self.sample,
preprocessing_info=self.preprocessing_info,
group=group,
method="umap",
Expand Down Expand Up @@ -398,7 +392,6 @@ def plot_volcano(
mat=self.mat,
rawinput=self.rawinput,
metadata=self.metadata,
sample=self.sample,
preprocessing_info=self.preprocessing_info,
group1=group1,
group2=group2,
Expand Down Expand Up @@ -482,7 +475,6 @@ def plot_intensity(
intensity_plot = IntensityPlot(
mat=self.mat,
metadata=self.metadata,
sample=self.sample,
intensity_column=self._intensity_column,
preprocessing_info=self.preprocessing_info,
protein_id=protein_id,
Expand Down Expand Up @@ -519,7 +511,6 @@ def plot_clustermap(
clustermap = ClusterMap(
mat=self.mat,
metadata=self.metadata,
sample=self.sample,
preprocessing_info=self.preprocessing_info,
label_bar=label_bar,
only_significant=only_significant,
Expand All @@ -542,7 +533,6 @@ def _get_plot(self) -> Plot:
self.mat,
self.rawmat,
self.metadata,
self.sample,
self.preprocessing_info,
)

Expand Down
11 changes: 5 additions & 6 deletions alphastats/DataSet_Plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import scipy
import seaborn as sns

from alphastats.keys import Cols
from alphastats.plots.PlotUtils import PlotUtils
from alphastats.utils import check_for_missing_values

Expand Down Expand Up @@ -50,13 +51,11 @@ def __init__(
mat: pd.DataFrame,
rawmat: pd.DataFrame,
metadata: pd.DataFrame,
sample: str,
preprocessing_info: Dict,
):
self.mat: pd.DataFrame = mat
self.rawmat: pd.DataFrame = rawmat
self.metadata: pd.DataFrame = metadata
self.sample: str = sample
self.preprocessing_info: Dict = preprocessing_info

def plot_correlation_matrix(self, method: str = "pearson"): # TODO unused
Expand Down Expand Up @@ -95,15 +94,15 @@ def plot_sampledistribution(
# create long df
matrix = self.mat if not use_raw else self.rawmat
df = matrix.unstack().reset_index()
df.rename(columns={"level_1": self.sample, 0: "Intensity"}, inplace=True)
df.rename(columns={"level_1": Cols.SAMPLE, 0: "Intensity"}, inplace=True)
mschwoer marked this conversation as resolved.
Show resolved Hide resolved

if color is not None:
df = df.merge(self.metadata, how="inner", on=[self.sample])
df = df.merge(self.metadata, how="inner", on=[Cols.SAMPLE])

if method == "violin":
fig = px.violin(
df,
x=self.sample,
x=Cols.SAMPLE,
y="Intensity",
color=color,
template="simple_white+alphastats_colors",
Expand All @@ -112,7 +111,7 @@ def plot_sampledistribution(
elif method == "box":
fig = px.box(
df,
x=self.sample,
x=Cols.SAMPLE,
y="Intensity",
color=color,
template="simple_white+alphastats_colors",
Expand Down
16 changes: 7 additions & 9 deletions alphastats/DataSet_Preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,13 @@ def __init__(
self,
filter_columns: List[str],
rawinput: pd.DataFrame,
sample: str,
metadata: pd.DataFrame,
preprocessing_info: Dict,
mat: pd.DataFrame,
):
self.filter_columns = filter_columns

self.rawinput = rawinput
self.sample = sample

self.metadata = metadata
self.preprocessing_info = preprocessing_info
Expand Down Expand Up @@ -88,17 +86,17 @@ def init_preprocessing_info(
def _remove_samples(self, sample_list: list):
# exclude samples for analysis
self.mat = self.mat.drop(sample_list)
self.metadata = self.metadata[~self.metadata[self.sample].isin(sample_list)]
self.metadata = self.metadata[~self.metadata[Cols.SAMPLE].isin(sample_list)]

@staticmethod
def subset(
mat: pd.DataFrame, metadata: pd.DataFrame, sample: str, preprocessing_info: Dict
mat: pd.DataFrame, metadata: pd.DataFrame, preprocessing_info: Dict
) -> pd.DataFrame:
"""Filter matrix so only samples that are described in metadata are also found in matrix."""
preprocessing_info.update(
{PreprocessingStateKeys.NUM_SAMPLES: metadata.shape[0]}
)
return mat[mat.index.isin(metadata[sample].tolist())]
return mat[mat.index.isin(metadata[Cols.SAMPLE].tolist())]

def _remove_na_values(self, cut_off):
if (
Expand Down Expand Up @@ -350,7 +348,7 @@ def batch_correction(self, batch: str) -> pd.DataFrame:
from combat.pycombat import pycombat

data = self.mat.transpose()
series_of_batches = self.metadata.set_index(self.sample).reindex(
series_of_batches = self.metadata.set_index(Cols.SAMPLE).reindex(
data.columns.to_list()
)[batch]

Expand Down Expand Up @@ -418,16 +416,16 @@ def preprocess(
]:
raise ValueError(f"Invalid keyword argument: {k}")

# TODO this is a stateful method, as it changes self.mat, self.metadata and self.preprocessing_info;
# refactor it such that it does not change self.mat etc. but only returns the latest result
if remove_contaminations:
self._filter()

if remove_samples is not None:
self._remove_samples(sample_list=remove_samples)

if subset:
self.mat = self.subset(
self.mat, self.metadata, self.sample, self.preprocessing_info
)
self.mat = self.subset(self.mat, self.metadata, self.preprocessing_info)

if data_completeness > 0:
self._remove_na_values(cut_off=data_completeness)
Expand Down
9 changes: 3 additions & 6 deletions alphastats/DataSet_Statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pandas as pd
import pingouin

from alphastats.keys import Cols
from alphastats.statistics.Anova import Anova
from alphastats.statistics.DifferentialExpressionAnalysis import (
DifferentialExpressionAnalysis,
Expand All @@ -17,12 +18,10 @@ def __init__(
*,
mat: pd.DataFrame,
metadata: pd.DataFrame,
sample: str,
preprocessing_info: Dict,
):
self.mat: pd.DataFrame = mat
self.metadata: pd.DataFrame = metadata
self.sample: str = sample
self.preprocessing_info: Dict = preprocessing_info

@ignore_warning(RuntimeWarning)
Expand Down Expand Up @@ -60,7 +59,6 @@ def diff_expression_analysis(
df = DifferentialExpressionAnalysis(
mat=self.mat,
metadata=self.metadata,
sample=self.sample,
preprocessing_info=self.preprocessing_info,
group1=group1,
group2=group2,
Expand Down Expand Up @@ -89,7 +87,6 @@ def anova(self, column: str, protein_ids="all", tukey: bool = True) -> pd.DataFr
return Anova(
mat=self.mat,
metadata=self.metadata,
sample=self.sample,
column=column,
protein_ids=protein_ids,
tukey=tukey,
Expand Down Expand Up @@ -119,8 +116,8 @@ def ancova(
* ``'p-unc'``: Uncorrected p-values
* ``'np2'``: Partial eta-squared
"""
df = self.mat[protein_id].reset_index().rename(columns={"index": self.sample})
df = self.metadata.merge(df, how="inner", on=[self.sample])
df = self.mat[protein_id].reset_index().rename(columns={"index": Cols.SAMPLE})
df = self.metadata.merge(df, how="inner", on=[Cols.SAMPLE])
ancova_df = pingouin.ancova(df, dv=protein_id, covar=covar, between=between)
return ancova_df

Expand Down
27 changes: 11 additions & 16 deletions alphastats/dataset_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np
import pandas as pd

from alphastats.dataset_harmonizer import DataHarmonizer
from alphastats.keys import Cols


Expand All @@ -17,12 +18,12 @@ def __init__(
rawinput: pd.DataFrame,
intensity_column: Union[List[str], str],
metadata_path_or_df: Union[str, pd.DataFrame],
sample_column: str,
data_harmonizer: DataHarmonizer,
):
self.rawinput: pd.DataFrame = rawinput
self.sample_column: str = sample_column
self.intensity_column: Union[List[str], str] = intensity_column
self.metadata_path_or_df: Union[str, pd.DataFrame] = metadata_path_or_df
self._data_harmonizer = data_harmonizer

def create_matrix_from_rawinput(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""Creates a matrix: features (Proteins) as columns, samples as rows."""
Expand Down Expand Up @@ -58,28 +59,27 @@ def _check_matrix_values(mat: pd.DataFrame) -> None:
if np.isinf(mat).values.sum() > 0:
logging.warning("Data contains infinite values.")

def create_metadata(self, mat: pd.DataFrame) -> Tuple[pd.DataFrame, str]:
def create_metadata(self, mat: pd.DataFrame) -> pd.DataFrame:
"""Create metadata DataFrame from metadata file or DataFrame."""

if self.metadata_path_or_df is not None:
sample = self.sample_column
metadata = self._load_metadata(file_path=self.metadata_path_or_df)
metadata = self._remove_missing_samples_from_metadata(mat, metadata, sample)
metadata = self._data_harmonizer.get_harmonized_metadata(metadata)
metadata = self._remove_missing_samples_from_metadata(mat, metadata)
else:
sample = "sample"
metadata = pd.DataFrame({"sample": list(mat.index)})
metadata = pd.DataFrame({Cols.SAMPLE: list(mat.index)})

return metadata, sample
return metadata

def _remove_missing_samples_from_metadata(
self, mat: pd.DataFrame, metadata: pd.DataFrame, sample
self, mat: pd.DataFrame, metadata: pd.DataFrame
) -> pd.DataFrame:
"""Remove samples from metadata that are not in the protein data."""
samples_matrix = mat.index.to_list()
samples_metadata = metadata[sample].to_list()
samples_metadata = metadata[Cols.SAMPLE].to_list()
misc_samples = list(set(samples_metadata) - set(samples_matrix))
if len(misc_samples) > 0:
metadata = metadata[~metadata[sample].isin(misc_samples)]
metadata = metadata[~metadata[Cols.SAMPLE].isin(misc_samples)]
logging.warning(
f"{misc_samples} are not described in the protein data and"
"are removed from the metadata."
Expand Down Expand Up @@ -116,11 +116,6 @@ def _load_metadata(
)
return None

if df is not None and self.sample_column not in df.columns:
logging.error(
f"sample_column: {self.sample_column} not found in {file_path}"
)

# check whether sample labeling matches protein data
# warnings.warn("WARNING: Sample names do not match sample labelling in protein data")
df.columns = df.columns.astype(str)
Expand Down
Loading
Loading