ibaqpy update
WangHong007 committed Sep 7, 2023
1 parent 2432672 commit 3f0be3a
Showing 12 changed files with 892 additions and 855 deletions.
70 changes: 70 additions & 0 deletions bin/datasets_merger.py
@@ -0,0 +1,70 @@
#!/usr/bin/env python
import re
import click
from ibaq.combiner import Combiner

CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])


@click.group(context_settings=CONTEXT_SETTINGS)
def cli():
"""
This is the main tool that gives access to all commands.
"""
pass


@click.command("datasets_merge", short_help="Merge ibaq results from compute_ibaq")
@click.option("--data_folder", "-d", help="Data dolfer contains SDRFs and ibaq CSVs.", required=True)
@click.option("--output", "-o", help="Output file after batch effect removal.", required=True)
@click.option(
"--covariate",
"-c",
default=None,
help="Indicators included in covariate consideration when datasets are merged.",
)
@click.option("--organism", help="Organism to keep in input data.", default="HUMAN")
@click.option("--covariate_to_keep", "-k", help="Keep tissue parts from metadata, e.g. 'LV,RV,LA,RA'.", default=None)
@click.option("--non_missing_percent_to_keep", "-m", help="non-missing values in each group.", default=0.3)
@click.option("--n_components", help="Number of principal components to be computed.", default=None)
@click.option("--min_cluster", help="The minimum size of clusters.", default=None)
@click.option(
"--min_sample_num",
help="The minimum number of samples in a neighborhood for a point to be considered as a core point.",
default=None,
)
@click.option("--n_iter", help="Number of iterations to be performed.", default=None)
@click.option("--verbose/--quiet", "-v/-q", help="Output debug information.", default=False, is_flag=True)
@click.pass_context
def datasets_merge(
    ctx,
    data_folder: str,
    output: str,
    covariate: str,
    organism: str,
    covariate_to_keep: list,
    non_missing_percent_to_keep: float,
    n_components: int,
    min_cluster: int,
    min_sample_num: int,
    n_iter: int,
    verbose: bool,
):
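    """Merge iBAQ results produced by compute_ibaq across datasets: impute, remove outliers, and batch-correct."""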
    if covariate_to_keep:
        covariate_to_keep = re.split(r",\s*", covariate_to_keep)
    combiner = Combiner(data_folder=data_folder, covariate=covariate)
    combiner.imputer(covariate_to_keep)
    combiner.outlier_removal(n_components, min_cluster, min_sample_num, n_iter)
    combiner.batch_correction(n_components, covariate_to_keep)
    result = combiner.df_corrected
    result.to_csv(output, sep=",", index=True)

cli.add_command(datasets_merge)


def main():
    cli()


if __name__ == "__main__":
    main()
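The new command wires the Combiner pipeline end to end (note that --organism is parsed but not yet used in the command body above). For reference, a minimal sketch of the equivalent programmatic call; the folder path, covariate value, tissue list, and output name are hypothetical placeholders, and Combiner's behavior is assumed only from this file:

from ibaq.combiner import Combiner

# hypothetical inputs; mirrors the command body above with CLI defaults
combiner = Combiner(data_folder="./ibaq_data", covariate="tissue")
combiner.imputer(["LV", "RV", "LA", "RA"])                 # covariate values to keep
combiner.outlier_removal(None, None, None, None)           # n_components, min_cluster, min_sample_num, n_iter
combiner.batch_correction(None, ["LV", "RV", "LA", "RA"])  # n_components, covariate_to_keep
combiner.df_corrected.to_csv("ibaq_corrected.csv", sep=",", index=True)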
71 changes: 41 additions & 30 deletions bin/peptide_normalization.py
@@ -13,7 +13,7 @@
    PROTEIN_NAME, STUDY_ID, CONDITION, get_canonical_peptide, plot_distributions, plot_box_plot, \
    sum_peptidoform_intensities, CHANNEL, REFERENCE, remove_extension_file, ISOTOPE_LABEL_TYPE, FRAGMENT_ION, TMT16plex, \
    TMT11plex, TMT10plex, TMT6plex, get_reference_name, get_study_accession, ITRAQ4plex, ITRAQ8plex, \
-    parse_uniprot_accession, remove_protein_by_ids
+    parse_uniprot_accession, remove_protein_by_ids, FEATURE_COLUMNS, parquet_map


def print_dataset_size(dataset: DataFrame, message: str, verbose: bool) -> None:
@@ -122,7 +122,7 @@ def average_peptide_intensities(dataset: DataFrame) -> DataFrame:
    return dataset_df


-def remove_low_frequency_peptides(dataset_df: DataFrame, percentage_samples: float = 0.20):
+def remove_low_frequency_peptides_(dataset_df: DataFrame, percentage_samples: float = 0.20):
    """
    Remove peptides that are present in less than 20% of the samples.
    :param dataset_df: dataframe with the data
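The trailing underscore in the new name avoids a clash inside peptide_normalization below, where the boolean remove_low_frequency_peptides CLI flag shadows the old function name.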
@@ -220,7 +220,8 @@ def impute_peptide_intensities(dataset_df, field, class_field):


@click.command()
@click.option("-m", "--msstats", help="MsStats file import generated by quantms", required=True)
@click.option("-m", "--msstats", help="MsStats file import generated by quantms", default=None)
@click.option("-p", "--parquet", help="Parquet file import generated by quantms", default=None)
@click.option("-s", "--sdrf", help="SDRF file import generated by quantms", required=True)
@click.option("--min_aa", help="Minimum number of amino acids to filter peptides", default=7)
@click.option("--min_unique", help="Minimum number of unique peptides to filter proteins", default=2)
@@ -241,57 +242,65 @@ def impute_peptide_intensities(dataset_df, field, class_field):
"after normalization, etc.",
is_flag=True)
@click.option("--qc_report", help="PDF file to store multiple QC images", default="peptideNorm-QCprofile.pdf")
-def peptide_normalization(msstats: str, sdrf: str, min_aa: int, min_unique: int, remove_ids: str,
+def peptide_normalization(msstats: str, parquet: str, sdrf: str, min_aa: int, min_unique: int, remove_ids: str,
                          remove_decoy_contaminants: bool, remove_low_frequency_peptides: bool,
                          output: str, skip_normalization: bool, nmethod: str,
                          pnormalization: bool, compress: bool, log2: bool,
                          violin: bool, verbose: bool, qc_report: str) -> None:


-    if msstats is None or sdrf is None or output is None:
+    if (msstats is None and parquet is None) or sdrf is None or output is None:
        print_help_msg(peptide_normalization)
        exit(1)

    compression_method = 'gzip' if compress else None

-    # Read the msstats file
-    msstats_df = pd.read_csv(msstats, sep=',', compression=compression_method, dtype={CONDITION: 'category', ISOTOPE_LABEL_TYPE: 'category'})
+    if parquet is not None:
+        feature_df = pd.read_parquet(parquet)[FEATURE_COLUMNS]
+        feature_df = feature_df.rename(columns=parquet_map)
+        feature_df[PROTEIN_NAME] = feature_df.apply(lambda x: ",".join(x[PROTEIN_NAME]), axis=1)
+        label_type = feature_df[CHANNEL].unique().tolist()
+        if len(label_type) == 1:
+            feature_df.drop(CHANNEL, inplace=True, axis=1)
+    else:
+        # Read the msstats file
+        feature_df = pd.read_csv(msstats, sep=',', compression=compression_method, dtype={CONDITION: 'category', ISOTOPE_LABEL_TYPE: 'category'})
    # Remove 0 intensity signals from the msstats file
-    msstats_df = msstats_df[msstats_df[INTENSITY] > 0]
-    msstats_df[PEPTIDE_CANONICAL] = msstats_df.apply(lambda x: get_canonical_peptide(x[PEPTIDE_SEQUENCE]), axis=1)
+    print(feature_df)
+    feature_df = feature_df[feature_df[INTENSITY] > 0]
+    feature_df[PEPTIDE_CANONICAL] = feature_df.apply(lambda x: get_canonical_peptide(x[PEPTIDE_SEQUENCE]), axis=1)
    # Only peptides with more than min_aa (default: 7) amino acids are retained
-    msstats_df = msstats_df[msstats_df.apply(lambda x: len(x[PEPTIDE_CANONICAL]) >= min_aa, axis=1)]
+    feature_df = feature_df[feature_df.apply(lambda x: len(x[PEPTIDE_CANONICAL]) >= min_aa, axis=1)]
    # Only proteins with unique peptides number greater than min_unique (default: 2) are retained
-    unique_peptides = set(msstats_df.groupby(PEPTIDE_CANONICAL).filter(lambda x: len(set(x[PROTEIN_NAME])) == 1)[
+    unique_peptides = set(feature_df.groupby(PEPTIDE_CANONICAL).filter(lambda x: len(set(x[PROTEIN_NAME])) == 1)[
        PEPTIDE_CANONICAL].tolist())
-    strong_proteins = set(msstats_df[msstats_df[PEPTIDE_CANONICAL].isin(unique_peptides)].groupby(PROTEIN_NAME).filter(
+    strong_proteins = set(feature_df[feature_df[PEPTIDE_CANONICAL].isin(unique_peptides)].groupby(PROTEIN_NAME).filter(
        lambda x: len(set(x[PEPTIDE_CANONICAL])) >= min_unique)[PROTEIN_NAME].tolist())
-    msstats_df = msstats_df[msstats_df[PROTEIN_NAME].isin(strong_proteins)]
+    feature_df = feature_df[feature_df[PROTEIN_NAME].isin(strong_proteins)]

-    msstats_df.rename(
+    feature_df.rename(
        columns={'ProteinName': PROTEIN_NAME, 'PeptideSequence': PEPTIDE_SEQUENCE, 'PrecursorCharge': PEPTIDE_CHARGE,
                 'Run': RUN,
                 'Condition': CONDITION, 'Intensity': INTENSITY}, inplace=True)

-    print(msstats_df)
-    msstats_df[PROTEIN_NAME] = msstats_df[PROTEIN_NAME].apply(parse_uniprot_accession)
+    feature_df[PROTEIN_NAME] = feature_df[PROTEIN_NAME].apply(parse_uniprot_accession)

    # Read the sdrf file
    sdrf_df = pd.read_csv(sdrf, sep='\t', compression=compression_method)
    sdrf_df[REFERENCE] = sdrf_df['comment[data file]'].apply(remove_extension_file)
+    print(sdrf_df)

-    if FRACTION not in msstats_df.columns:
-        msstats_df[FRACTION] = 1
-    msstats_df = msstats_df[
+    if FRACTION not in feature_df.columns:
+        feature_df[FRACTION] = 1
+    feature_df = feature_df[
        [PROTEIN_NAME, PEPTIDE_SEQUENCE, PEPTIDE_CHARGE, INTENSITY, REFERENCE, CONDITION, RUN,
         BIOREPLICATE, FRACTION, FRAGMENT_ION, ISOTOPE_LABEL_TYPE]]

    # Merge the SDRF with the resulting features
    labels = set(sdrf_df['comment[label]'])
-    if CHANNEL not in msstats_df.columns:
-        msstats_df[REFERENCE] = msstats_df[REFERENCE].apply(remove_extension_file)
-        dataset_df = pd.merge(msstats_df, sdrf_df[['source name', REFERENCE]], how='left', on=[REFERENCE])
+    if CHANNEL not in feature_df.columns:
+        feature_df[REFERENCE] = feature_df[REFERENCE].apply(remove_extension_file)
+        dataset_df = pd.merge(feature_df, sdrf_df[['source name', REFERENCE]], how='left', on=[REFERENCE])
    elif 'TMT' in ','.join(labels) or 'tmt' in ','.join(labels):
        if (len(labels) > 11 or "TMT134N" in labels or "TMT133C" in labels
                or "TMT133N" in labels or "TMT132C" in labels or "TMT132N" in labels):
@@ -305,8 +314,8 @@ def peptide_normalization(msstats: str, sdrf: str, min_aa: int, min_unique: int,
        choice = pd.DataFrame.from_dict(choice, orient='index', columns=[CHANNEL]).reset_index().rename(
            columns={'index': 'comment[label]'})
        sdrf_df = sdrf_df.merge(choice, on='comment[label]', how='left')
-        msstats_df[REFERENCE] = msstats_df[REFERENCE].apply(get_reference_name)
-        dataset_df = pd.merge(msstats_df, sdrf_df[['source name', REFERENCE, CHANNEL]], how='left',
+        feature_df[REFERENCE] = feature_df[REFERENCE].apply(get_reference_name)
+        dataset_df = pd.merge(feature_df, sdrf_df[['source name', REFERENCE, CHANNEL]], how='left',
                              on=[REFERENCE, CHANNEL])
        # result_df.drop(CHANNEL, axis=1, inplace=True)
        dataset_df = dataset_df[dataset_df["Condition"] != "Empty"]
@@ -319,8 +328,8 @@ def peptide_normalization(msstats: str, sdrf: str, min_aa: int, min_unique: int,
        choice = pd.DataFrame.from_dict(choice, orient='index', columns=[CHANNEL]).reset_index().rename(
            columns={'index': 'comment[label]'})
        sdrf_df = sdrf_df.merge(choice, on='comment[label]', how='left')
-        msstats_df[REFERENCE] = msstats_df[REFERENCE].apply(get_reference_name)
-        dataset_df = pd.merge(msstats_df, sdrf_df[['source name', REFERENCE, CHANNEL]], how='left',
+        feature_df[REFERENCE] = feature_df[REFERENCE].apply(get_reference_name)
+        dataset_df = pd.merge(feature_df, sdrf_df[['source name', REFERENCE, CHANNEL]], how='left',
                              on=[REFERENCE, CHANNEL])
        dataset_df = dataset_df[dataset_df["Condition"] != "Empty"]
        dataset_df.rename(columns={'Charge': PEPTIDE_CHARGE}, inplace=True)
@@ -329,11 +338,12 @@ def peptide_normalization(msstats: str, sdrf: str, min_aa: int, min_unique: int,
        exit(1)
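Both labeled branches attach a quantification channel to each SDRF row through the same choice merge. A self-contained illustration, with a hypothetical two-channel map standing in for the TMT*/ITRAQ* dicts imported at the top (the "Channel" column name is likewise a stand-in for the CHANNEL constant):

import pandas as pd

channel_map = {"TMT126": "126", "TMT127N": "127N"}  # hypothetical subset
choice = (pd.DataFrame.from_dict(channel_map, orient="index", columns=["Channel"])
          .reset_index()
          .rename(columns={"index": "comment[label]"}))
sdrf = pd.DataFrame({"comment[label]": ["TMT126", "TMT127N"],
                     "source name": ["sample 1", "sample 2"]})
# each SDRF row gains the channel matching its label annotation
print(sdrf.merge(choice, on="comment[label]", how="left"))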

    # Remove the intermediate variables and free the memory
-    del msstats_df, sdrf_df
+    del feature_df, sdrf_df
    gc.collect()

-    dataset_df.rename(columns={'source name': SAMPLE_ID}, inplace=True)

+    if msstats:
+        dataset_df.rename(columns={'source name': SAMPLE_ID}, inplace=True)
+    print(dataset_df)
    dataset_df[STUDY_ID] = dataset_df[SAMPLE_ID].apply(get_study_accession)
    dataset_df = dataset_df.filter(
        items=[PEPTIDE_SEQUENCE, PEPTIDE_CHARGE, FRACTION, RUN, BIOREPLICATE, PROTEIN_NAME, STUDY_ID, CONDITION,
@@ -426,7 +436,8 @@ def peptide_normalization(msstats: str, sdrf: str, min_aa: int, min_unique: int,

    if remove_low_frequency_peptides:
        print("Peptides before removing low frequency peptides: " + str(len(dataset_df.index)))
-        dataset_df = remove_low_frequency_peptides(dataset_df, 0.20)
+        print(dataset_df)
+        dataset_df = remove_low_frequency_peptides_(dataset_df, 0.20)
        print_dataset_size(dataset_df, "Peptides after remove low frequency peptides: ", verbose)

    # Perform imputation using Random Forest on peptide intensities
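For anyone trying the new input path, a minimal sketch of the parquet branch in isolation. It assumes only the FEATURE_COLUMNS and parquet_map constants imported at the top of this file; the file name is a hypothetical placeholder, and the list-per-row protein layout is inferred from the ",".join call above:

import pandas as pd

feature_df = pd.read_parquet("features.parquet")[FEATURE_COLUMNS]  # needs pyarrow
feature_df = feature_df.rename(columns=parquet_map)
# protein accessions appear to arrive as a list per row; join them to match the MSstats format
feature_df[PROTEIN_NAME] = feature_df[PROTEIN_NAME].apply(",".join)
# label-free runs carry a single channel value, so the column is dropped
if feature_df[CHANNEL].nunique() == 1:
    feature_df = feature_df.drop(columns=[CHANNEL])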
1 change: 1 addition & 0 deletions conda-environment.yaml
@@ -11,6 +11,7 @@ dependencies:
  - pandas
  - scikit-learn
  - numpy
+  - pyarrow
  - click
  - matplotlib
  - pip:
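The new pyarrow entry backs the pd.read_parquet call added in peptide_normalization.py; pandas picks it up as its default parquet engine. A quick sanity check for the created environment:

import pandas as pd
import pyarrow

print(pyarrow.__version__)  # pd.read_parquet uses this engine automatically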