chat and fix #323

Draft: wants to merge 1 commit into base: development
127 changes: 84 additions & 43 deletions alphastats/DataSet_Preprocess.py
@@ -9,6 +9,7 @@
from sklearn.experimental import enable_iterative_imputer
import itertools

import streamlit as st


class Preprocess:
@@ -30,32 +31,45 @@ def preprocess_print_info(self):
print(pd.DataFrame(self.preprocessing_info.items()))

def _remove_na_values(self, cut_off):
if (
self.preprocessing_info.get("Missing values were removed")
and self.preprocessing_info.get("Data completeness cut-off") == cut_off
):
logging.info("Missing values have already been filtered.")
st.warning(
"Missing values have already been filtered. To apply another cutoff, reset preprocessing."
)
return
cut = 1 - cut_off
limit = self.mat.shape[0] * cut


num_samples, num_proteins = self.mat.shape
limit = num_samples * cut

self.mat.replace(0, np.nan, inplace=True)
keep_list = list()
invalid = 0
for column_name in self.mat.columns:
column = self.mat[column_name]
# Get the count of Zeros in column
count = (column == 0).sum()
count = column.isna().sum()
try:
count = count.item()
if isinstance(count, int):
if count < limit:
keep_list += [column_name]

except ValueError:
invalid +=1
invalid += 1
continue

self.mat= self.mat[keep_list]
self.mat = self.mat[keep_list]

self.preprocessing_info.update(
{"Data completeness cut-off": cut_off}
{
"Number of removed ProteinGroups due to data completeness cutoff": num_proteins
- self.mat.shape[1],
"Missing values were removed": True,
"Data completeness cut-off": cut_off,
}
)
percentage = cut_off * 100
print(f"Proteins with a data completeness across all samples of less than {percentage} % have been removed.")


def _filter(self):
if len(self.filter_columns) == 0:
@@ -105,15 +119,18 @@ def _imputation(self, method: str):
logging.info(
f" {len(protein_group_na)} Protein Groups were removed due to missing values."
)

logging.info("Imputing data...")

if method == "mean":
imp = sklearn.impute.SimpleImputer(missing_values=np.nan, strategy="mean", keep_empty_features=True)
imp = sklearn.impute.SimpleImputer(
missing_values=np.nan, strategy="mean", keep_empty_features=True
)
imputation_array = imp.fit_transform(self.mat.values)

elif method == "median":
imp = sklearn.impute.SimpleImputer(missing_values=np.nan, strategy="median", keep_empty_features=True)
imp = sklearn.impute.SimpleImputer(
missing_values=np.nan, strategy="median", keep_empty_features=True
)
imputation_array = imp.fit_transform(self.mat.values)

elif method == "knn":
Expand Down Expand Up @@ -155,6 +172,22 @@ def _imputation(self, method: str):
)
self.preprocessing_info.update({"Imputation": method})

def _linear_normalization(self, array):
"""Normalize data using l2 norm without breaking when encoutering nones
l2 = sqrt(sum(x**2))

Args:
array (pd.Series): array to normalize (1D array)

Returns:
np.array: normalized array
"""
square_sum_per_row = array.pow(2).sum(axis=1, skipna=True)

l2_norms = np.sqrt(square_sum_per_row)
normalized_vals = array.div(l2_norms.replace(0, 1), axis=0)
return normalized_vals.values
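
A minimal sketch (not part of this diff) of what the NaN-tolerant row-wise l2 normalization above does on a toy matrix; sample and protein names are made up:

import numpy as np
import pandas as pd

mat = pd.DataFrame(
    {"P1": [3.0, 1.0], "P2": [4.0, np.nan], "P3": [0.0, 2.0]},
    index=["sample_A", "sample_B"],
)

l2_norms = np.sqrt(mat.pow(2).sum(axis=1, skipna=True))   # sample_A: 5.0, sample_B: sqrt(5)
normalized = mat.div(l2_norms.replace(0, 1), axis=0)      # NaNs are carried through instead of raising
print(normalized)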

@ignore_warning(UserWarning)
@ignore_warning(RuntimeWarning)
def _normalization(self, method: str):
@@ -168,13 +201,13 @@ def _normalization(self, method: str):
normalized_array = qt.fit_transform(self.mat.values)

elif method == "linear":
normalized_array = sklearn.preprocessing.normalize(
self.mat.values, norm="l2"
)
normalized_array = self._linear_normalization(self.mat)

elif method == "vst":
scaler = sklearn.preprocessing.PowerTransformer(standardize=False)
normalized_array = scaler.fit_transform(self.mat.values)
minmax = sklearn.preprocessing.MinMaxScaler()
scaler = sklearn.preprocessing.PowerTransformer()
minmaxed_array = minmax.fit_transform(self.mat.values)
normalized_array = scaler.fit_transform(minmaxed_array)

else:
raise ValueError(
@@ -189,20 +222,19 @@ def _normalization(self, method: str):
self.preprocessing_info.update({"Normalization": method})

def reset_preprocessing(self):
""" Reset all preprocessing steps
"""
# reset all preprocessing steps
"""Reset all preprocessing steps"""
self.create_matrix()
print("All preprocessing steps are reset.")

@ignore_warning(RuntimeWarning)
def _compare_preprocessing_modes(self, func, params_for_func) -> list:
dataset = self
imputation_methods = ["mean", "median", "knn", "randomforest"]
normalization_methods = ["vst","zscore", "quantile" ]

preprocessing_modes = list(itertools.product(normalization_methods, imputation_methods))
normalization_methods = ["vst", "zscore", "quantile"]

preprocessing_modes = list(
itertools.product(normalization_methods, imputation_methods)
)

results_list = []

@@ -212,7 +244,9 @@ def _compare_preprocessing_modes(self, func, params_for_func) -> list:
for preprocessing_mode in preprocessing_modes:
# reset preprocessing
dataset.reset_preprocessing()
print(f"Normalization {preprocessing_mode[0]}, Imputation {str(preprocessing_mode[1])}")
print(
f"Normalization {preprocessing_mode[0]}, Imputation {str(preprocessing_mode[1])}"
)
dataset.mat.replace([np.inf, -np.inf], np.nan, inplace=True)

dataset.preprocess(
Expand All @@ -223,7 +257,7 @@ def _compare_preprocessing_modes(self, func, params_for_func) -> list:

res = func(**params_for_func)
results_list.append(res)

print("\t")

return results_list
@@ -232,29 +266,32 @@ def _log2_transform(self):
self.mat = np.log2(self.mat + 0.1)
self.preprocessing_info.update({"Log2-transformed": True})
print("Data has been log2-transformed.")
def batch_correction(self, batch:str):

def batch_correction(self, batch: str):
"""Correct for technical bias/batch effects
Behdenna A, Haziza J, Azencot CA and Nordor A. (2020) pyComBat, a Python tool for batch effects correction in high-throughput molecular data using empirical Bayes methods. bioRxiv doi: 10.1101/2020.03.17.995431
Args:
batch (str): column name in the metadata describing the different batches
"""
import combat
from combat.pycombat import pycombat

data = self.mat.transpose()
series_of_batches = self.metadata.set_index(self.sample).reindex(data.columns.to_list())[batch]
series_of_batches = self.metadata.set_index(self.sample).reindex(
data.columns.to_list()
)[batch]
self.mat = pycombat(data=data, batch=series_of_batches).transpose()
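
A usage sketch (not part of the diff), assuming an existing DataSet ds and a hypothetical metadata column named "batch" that labels the technical batches:

ds.preprocess(log2_transform=True, imputation="knn")   # impute beforehand so the matrix contains no missing values
ds.batch_correction(batch="batch")                     # overwrites ds.mat with the ComBat-corrected matrix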

@ignore_warning(RuntimeWarning)
def preprocess(
self,
log2_transform: bool=True,
remove_contaminations: bool=False,
subset: bool=False,
data_completeness: float=0,
normalization: str=None,
imputation: str=None,
remove_samples: list=None,
log2_transform: bool = True,
remove_contaminations: bool = False,
subset: bool = False,
data_completeness: float = 0,
normalization: str = None,
imputation: str = None,
remove_samples: list = None,
):
"""Preprocess Protein data

@@ -300,15 +337,14 @@ def preprocess(
"""
if remove_contaminations:
self._filter()

if remove_samples is not None:
self._remove_sampels(sample_list=remove_samples)

if subset:
self.mat = self._subset()


if data_completeness> 0:
if data_completeness > 0:
self._remove_na_values(cut_off=data_completeness)

if log2_transform and self.preprocessing_info.get("Log2-transformed") is False:
@@ -317,9 +353,14 @@ def preprocess(
if normalization is not None:
self._normalization(method=normalization)
self.mat = self.mat.replace([np.inf, -np.inf], np.nan)

if imputation is not None:
self._imputation(method=imputation)

self.mat = self.mat.loc[:, (self.mat != 0).any(axis=0)]
self.preprocessing_info.update(
{
"Matrix: Number of ProteinIDs/ProteinGroups": self.mat.shape[1],
}
)
self.preprocessed = True
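
To see the updated preprocessing options in context, a usage sketch (not part of the diff); the import path and file names are placeholders:

from alphastats import DataSet, MaxQuantLoader   # top-level imports assumed

loader = MaxQuantLoader(file="proteinGroups.txt")   # placeholder path
ds = DataSet(loader=loader, metadata_path="metadata.xlsx", sample_column="sample")

ds.preprocess(
    remove_contaminations=True,
    data_completeness=0.7,    # drop protein groups with completeness below the cut-off
    log2_transform=True,
    normalization="vst",
    imputation="knn",
)
ds.preprocess_print_info()    # prints the preprocessing_info dictionary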
60 changes: 33 additions & 27 deletions alphastats/gui/pages/02_Import Data.py
@@ -163,6 +163,11 @@ def select_sample_column_metadata(df, software):
submitted = st.form_submit_button("Create DataSet")

if submitted:
if len(df[st.session_state.sample_column].to_list()) != len(
df[st.session_state.sample_column].unique()
):
st.error("Sample names have to be unique.")
st.stop()
return True


@@ -209,7 +214,6 @@ def create_metadata_file():
# Write each dataframe to a different worksheet.
metadata.to_excel(writer, sheet_name="Sheet1", index=False)
# Close the Pandas Excel writer and output the Excel file to the buffer
writer.close()

st.download_button(
label="Download metadata template as Excel",
@@ -248,8 +252,6 @@ def upload_metadatafile(software):

load_options()

display_loaded_dataset()

if st.session_state.loader is not None:
create_metadata_file()
st.write(
Expand All @@ -265,8 +267,6 @@ def upload_metadatafile(software):

load_options()

display_loaded_dataset()


def load_sample_data():
_this_file = os.path.abspath(__file__)
@@ -279,9 +279,11 @@ def load_sample_data():

loader = MaxQuantLoader(file=filepath)
ds = DataSet(loader=loader, metadata_path=metadatapath, sample_column="sample")
metadatapath = os.path.join(_this_directory, "sample_data", "metadata.xlsx").replace(
"pages/", ""
).replace("pages\\", "")
metadatapath = (
os.path.join(_this_directory, "sample_data", "metadata.xlsx")
.replace("pages/", "")
.replace("pages\\", "")
)

loader = MaxQuantLoader(file=filepath)
ds = DataSet(loader=loader, metadata_path=metadatapath, sample_column="sample")
@@ -305,17 +307,19 @@ def load_sample_data():
def import_data():
options = ["<select>"] + list(software_options.keys())

software = st.selectbox(
st.selectbox(
"Select your Proteomics Software",
options=options,
key="software",
)
session_state_empty = False

if software != "<select>":
upload_softwarefile(software=software)

if st.session_state.software != "<select>":
upload_softwarefile(software=st.session_state.software)
if "loader" not in st.session_state:
st.session_state["loader"] = None
if st.session_state.loader is not None:
upload_metadatafile(software)
upload_metadatafile(st.session_state.software)


def display_loaded_dataset():
@@ -357,6 +361,7 @@ def empty_session_state():
for key in st.session_state.keys():
del st.session_state[key]
st.empty()
st.session_state["software"] = "<select>"

from streamlit.runtime import get_instance
from streamlit.runtime.scriptrunner.script_run_context import get_script_run_ctx
@@ -367,16 +372,24 @@ def empty_session_state():

sidebar_info()


if "dataset" not in st.session_state:
st.markdown("### Import Proteomics Data")

st.markdown(
"Create a DataSet with the output of your proteomics software package and the corresponding metadata (optional). "
)

import_data()
st.markdown("### Or Load sample Dataset")
import_data()

if "dataset" in st.session_state:
st.info("DataSet has been imported")

if "distribution_plot" not in st.session_state:
save_plot_sampledistribution_rawdata()

display_loaded_dataset()

st.markdown("### Or Load sample Dataset")

if st.button("Load sample DataSet - PXD011839"):
st.write(
@@ -407,16 +420,9 @@ def empty_session_state():

load_sample_data()

if "dataset" in st.session_state:
st.info("DataSet has been imported")

if "distribution_plot" not in st.session_state:
save_plot_sampledistribution_rawdata()

if st.button("New Session: Import new dataset"):
empty_session_state()

import_data()
st.markdown("### To start a new session:")

if "dataset" in st.session_state:
display_loaded_dataset()
if st.button("New Session: Import new dataset"):
empty_session_state()
st.rerun()