diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 4864cd98..190653ec 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.4.1 +current_version = 1.4.2 commit = True tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? diff --git a/alphabase/__init__.py b/alphabase/__init__.py index 86008ce9..12839ec1 100644 --- a/alphabase/__init__.py +++ b/alphabase/__init__.py @@ -2,7 +2,7 @@ __project__ = "alphabase" -__version__ = "1.4.1" +__version__ = "1.4.2" __license__ = "Apache" __description__ = "An infrastructure Python package of the AlphaX ecosystem" __author__ = "Mann Labs" diff --git a/alphabase/constants/const_files/psm_reader.yaml b/alphabase/constants/const_files/psm_reader.yaml index 03ce74ee..389f5626 100644 --- a/alphabase/constants/const_files/psm_reader.yaml +++ b/alphabase/constants/const_files/psm_reader.yaml @@ -56,7 +56,7 @@ maxquant: 'Dimethyl@Any_N-term': - '(Dimethyl)' 'Acetyl@Protein_N-term': - - '_(Acetyl (Protein_N-term))' + - '_(Acetyl (Protein N-term))' - '_(ac)' 'Acetyl@K': - 'K(ac)' diff --git a/alphabase/psm_reader/alphapept_reader.py b/alphabase/psm_reader/alphapept_reader.py index df59f8fa..48f5b158 100644 --- a/alphabase/psm_reader/alphapept_reader.py +++ b/alphabase/psm_reader/alphapept_reader.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd +from alphabase.psm_reader.keys import PsmDfCols from alphabase.psm_reader.psm_reader import ( PSMReaderBase, psm_reader_provider, @@ -79,31 +80,31 @@ def _load_file(self, filename): with h5py.File(filename, "r") as _hdf: dataset = _hdf[self.hdf_dataset] df = pd.DataFrame({col: dataset[col] for col in dataset}) - df["raw_name"] = os.path.basename(filename)[: -len(".ms_data.hdf")] + df[PsmDfCols.RAW_NAME] = os.path.basename(filename)[: -len(".ms_data.hdf")] df["precursor"] = df["precursor"].str.decode("utf-8") # df['naked_sequence'] = df['naked_sequence'].str.decode('utf-8') if "scan_no" in df.columns: df["scan_no"] = 
df["scan_no"].astype("int") df["raw_idx"] = df["scan_no"] - 1 # if thermo, use scan-1 as spec_idx - df["charge"] = df["charge"].astype(int) + df[PsmDfCols.CHARGE] = df[PsmDfCols.CHARGE].astype(int) return df def _load_modifications(self, df: pd.DataFrame): if len(df) == 0: - self._psm_df["sequence"] = "" - self._psm_df["mods"] = "" - self._psm_df["mod_sites"] = "" - self._psm_df["decoy"] = 0 + self._psm_df[PsmDfCols.SEQUENCE] = "" + self._psm_df[PsmDfCols.MODS] = "" + self._psm_df[PsmDfCols.MOD_SITES] = "" + self._psm_df[PsmDfCols.DECOY] = 0 return ( - self._psm_df["sequence"], - self._psm_df["mods"], - self._psm_df["mod_sites"], + self._psm_df[PsmDfCols.SEQUENCE], + self._psm_df[PsmDfCols.MODS], + self._psm_df[PsmDfCols.MOD_SITES], _charges, - self._psm_df["decoy"], + self._psm_df[PsmDfCols.DECOY], ) = zip(*df["precursor"].apply(parse_ap)) - self._psm_df.decoy = self._psm_df.decoy.astype(np.int8) + self._psm_df[PsmDfCols.DECOY] = self._psm_df[PsmDfCols.DECOY].astype(np.int8) def register_readers(): diff --git a/alphabase/psm_reader/dia_psm_reader.py b/alphabase/psm_reader/dia_psm_reader.py index 310f4485..5db0b97e 100644 --- a/alphabase/psm_reader/dia_psm_reader.py +++ b/alphabase/psm_reader/dia_psm_reader.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd +from alphabase.psm_reader.keys import PsmDfCols from alphabase.psm_reader.maxquant_reader import MaxQuantReader from alphabase.psm_reader.psm_reader import psm_reader_provider, psm_reader_yaml @@ -127,7 +128,9 @@ def _load_file(self, filename): def _post_process(self, origin_df: pd.DataFrame): super()._post_process(origin_df) - self._psm_df.rename(columns={"spec_idx": "diann_spec_idx"}, inplace=True) + self._psm_df.rename( + columns={PsmDfCols.SPEC_IDX: PsmDfCols.DIANN_SPEC_INDEX}, inplace=True + ) class SpectronautReportReader(MaxQuantReader): @@ -174,10 +177,10 @@ def _load_file(self, filename): self.mod_seq_column = "ModifiedSequence" self.csv_sep = self._get_table_delimiter(filename) df = 
pd.read_csv(filename, sep=self.csv_sep, keep_default_na=False) - df[[self.mod_seq_column, "charge"]] = df[self.precursor_column].str.split( - ".", expand=True, n=2 - ) - df["charge"] = df.charge.astype(np.int8) + df[[self.mod_seq_column, PsmDfCols.CHARGE]] = df[ + self.precursor_column + ].str.split(".", expand=True, n=2) + df[PsmDfCols.CHARGE] = df[PsmDfCols.CHARGE].astype(np.int8) return df diff --git a/alphabase/psm_reader/keys.py b/alphabase/psm_reader/keys.py new file mode 100644 index 00000000..4b4449e9 --- /dev/null +++ b/alphabase/psm_reader/keys.py @@ -0,0 +1,71 @@ +class ConstantsClass(type): + """A metaclass for classes that should only contain string constants.""" + + def __setattr__(self, name, value): + raise TypeError("Constants class cannot be modified") + + def get_values(cls): + """Get all user-defined string values of the class.""" + return [ + value + for key, value in cls.__dict__.items() + if not key.startswith("__") and isinstance(value, str) + ] + + +class PsmDfCols(metaclass=ConstantsClass): + """Constants for accessing the columns of a PSM dataframe.""" + + # TODO: these are used only in the psm_reader package and the spectral_library.reader module so far + MOD_SITES = "mod_sites" + MODIFIED_SEQUENCE = "modified_sequence" + SEQUENCE = "sequence" + DECOY = "decoy" + MODS = "mods" + SCORE = "score" + TO_REMOVE = "to_remove" + AA_MASS_DIFFS = "aa_mass_diffs" + AA_MASS_DIFF_SITES = "aa_mass_diff_sites" + RT = "rt" + RT_START = "rt_start" + RT_STOP = "rt_stop" + RT_NORM = "rt_norm" + SPEC_IDX = "spec_idx" + SCANNR = "scannr" + FDR = "fdr" + NAA = "nAA" + CCS = "ccs" + MOBILITY = "mobility" + PEPTIDE_FDR = "peptide_fdr" + PROTEIN_FDR = "protein_fdr" + + RAW_NAME = "raw_name" + CHARGE = "charge" + PROTEINS = "proteins" + + SCAN_NUM = "scan_num" + PRECURSOR_MZ = "precursor_mz" + DIANN_SPEC_INDEX = "diann_spec_idx" + + # part of the output, but not directly referenced + _UNIPROT_IDS = "uniprot_ids" + _GENES = "genes" + _QUERY_ID = "query_id" + + # 
part of psm_reader_yaml, but not directly referenced + _INTENSITY = "intensity" + + +class LibPsmDfCols(metaclass=ConstantsClass): + """Constants for accessing the columns of a Library PSM dataframe.""" + + FRAG_START_IDX = "frag_start_idx" + FRAG_STOP_IDX = "frag_stop_idx" + + # not referenced in reader classes + FRAGMENT_INTENSITY = "fragment_intensity" + FRAGMENT_MZ = "fragment_mz" + FRAGMENT_TYPE = "fragment_type" + FRAGMENT_CHARGE = "fragment_charge" + FRAGMENT_SERIES = "fragment_series" + FRAGMENT_LOSS_TYPE = "fragment_loss_type" diff --git a/alphabase/psm_reader/maxquant_reader.py b/alphabase/psm_reader/maxquant_reader.py index 984ef1cc..9c290d11 100644 --- a/alphabase/psm_reader/maxquant_reader.py +++ b/alphabase/psm_reader/maxquant_reader.py @@ -1,11 +1,13 @@ import copy import warnings +from typing import Optional import numba import numpy as np import pandas as pd from alphabase.constants.modification import MOD_DF +from alphabase.psm_reader.keys import PsmDfCols from alphabase.psm_reader.psm_reader import ( PSMReaderBase, psm_reader_provider, @@ -195,7 +197,7 @@ def _init_modification_mapping(self): psm_reader_yaml["maxquant"]["modification_mapping"] ) - def set_modification_mapping(self, modification_mapping: dict): + def set_modification_mapping(self, modification_mapping: Optional[dict] = None): super().set_modification_mapping(modification_mapping) self._add_all_unimod() self._extend_mod_brackets() @@ -237,8 +239,10 @@ def _extend_mod_brackets(self): self.modification_mapping[key] = list(mod_set) def _translate_decoy(self, origin_df=None): - if "decoy" in self._psm_df.columns: - self._psm_df.decoy = (self._psm_df.decoy == "-").astype(np.int8) + if PsmDfCols.DECOY in self._psm_df.columns: + self._psm_df[PsmDfCols.DECOY] = ( + self._psm_df[PsmDfCols.DECOY] == "-" + ).astype(np.int8) def _init_column_mapping(self): self.column_mapping = psm_reader_yaml["maxquant"]["column_mapping"] @@ -252,16 +256,16 @@ def _load_file(self, filename): # remove MBR PSMs 
as they are currently not supported and will crash import mapped_columns = self._find_mapped_columns(df) - if "scan_num" in mapped_columns: - scan_num_col = mapped_columns["scan_num"] + if PsmDfCols.SCAN_NUM in mapped_columns: + scan_num_col = mapped_columns[PsmDfCols.SCAN_NUM] no_ms2_mask = df[scan_num_col] == "" if (num_no_ms2_mask := np.sum(no_ms2_mask)) > 0: warnings.warn( - f"Maxquant psm file contains {num_no_ms2_mask} MBR PSMs without MS2 scan. This is not yet supported and rows containing MBR PSMs will be removed." + f"MaxQuant PSM file contains {num_no_ms2_mask} MBR PSMs without MS2 scan. This is not yet supported and rows containing MBR PSMs will be removed." ) df = df[~no_ms2_mask] df.reset_index(drop=True, inplace=True) - df[scan_num_col] = df[scan_num_col].astype(int) + df[scan_num_col] = df[scan_num_col].astype(int) # if 'K0' in df.columns: # df['Mobility'] = df['K0'] # Bug in MaxQuant? It should be 1/K0 @@ -278,15 +282,15 @@ def _load_modifications(self, origin_df: pd.DataFrame): else: mod_sep = "()" - (seqs, self._psm_df["mods"], self._psm_df["mod_sites"]) = zip( + (seqs, self._psm_df[PsmDfCols.MODS], self._psm_df[PsmDfCols.MOD_SITES]) = zip( *origin_df[self.mod_seq_column].apply( parse_mod_seq, mod_sep=mod_sep, fixed_C57=self.fixed_C57, ) ) - if "sequence" not in self._psm_df.columns: - self._psm_df["sequence"] = seqs + if PsmDfCols.SEQUENCE not in self._psm_df.columns: + self._psm_df[PsmDfCols.SEQUENCE] = seqs def register_readers(): diff --git a/alphabase/psm_reader/msfragger_reader.py b/alphabase/psm_reader/msfragger_reader.py index d7159572..22301b94 100644 --- a/alphabase/psm_reader/msfragger_reader.py +++ b/alphabase/psm_reader/msfragger_reader.py @@ -5,6 +5,7 @@ from alphabase.constants.aa import AA_ASCII_MASS from alphabase.constants.atom import MASS_H, MASS_O from alphabase.constants.modification import MOD_MASS +from alphabase.psm_reader.keys import PsmDfCols from alphabase.psm_reader.psm_reader import ( PSMReaderBase, 
psm_reader_provider, @@ -115,9 +116,6 @@ def __init__( def _init_column_mapping(self): self.column_mapping = psm_reader_yaml["msfragger_pepxml"]["column_mapping"] - def _init_modification_mapping(self): - self.modification_mapping = {} - def _translate_modifications(self): pass @@ -126,37 +124,41 @@ def _load_file(self, filename): msf_df.fillna("", inplace=True) if "ion_mobility" in msf_df.columns: msf_df["ion_mobility"] = msf_df.ion_mobility.astype(float) - msf_df["raw_name"] = msf_df["spectrum"].str.split(".").apply(lambda x: x[0]) - msf_df["to_remove"] = 0 - self.column_mapping["to_remove"] = "to_remove" + msf_df[PsmDfCols.RAW_NAME] = ( + msf_df["spectrum"].str.split(".").apply(lambda x: x[0]) + ) + msf_df["to_remove"] = 0 # TODO revisit + self.column_mapping[PsmDfCols.TO_REMOVE] = "to_remove" return msf_df def _translate_decoy(self, origin_df=None): - self._psm_df["decoy"] = self._psm_df.proteins.apply(_is_fragger_decoy).astype( - np.int8 + self._psm_df[PsmDfCols.DECOY] = ( + self._psm_df[PsmDfCols.PROTEINS].apply(_is_fragger_decoy).astype(np.int8) ) - self._psm_df.proteins = self._psm_df.proteins.apply(lambda x: ";".join(x)) - if not self.keep_decoy: - self._psm_df["to_remove"] += self._psm_df.decoy > 0 + self._psm_df[PsmDfCols.PROTEINS] = self._psm_df[PsmDfCols.PROTEINS].apply( + lambda x: ";".join(x) + ) + if not self._keep_decoy: + self._psm_df[PsmDfCols.TO_REMOVE] += self._psm_df[PsmDfCols.DECOY] > 0 def _translate_score(self, origin_df=None): # evalue score - self._psm_df["score"] = -np.log(self._psm_df["score"] + 1e-100) + self._psm_df[PsmDfCols.SCORE] = -np.log(self._psm_df[PsmDfCols.SCORE] + 1e-100) def _load_modifications(self, msf_df): if len(msf_df) == 0: - self._psm_df["mods"] = "" - self._psm_df["mod_sites"] = "" - self._psm_df["aa_mass_diffs"] = "" - self._psm_df["aa_mass_diff_sites"] = "" + self._psm_df[PsmDfCols.MODS] = "" + self._psm_df[PsmDfCols.MOD_SITES] = "" + self._psm_df[PsmDfCols.AA_MASS_DIFFS] = "" + 
self._psm_df[PsmDfCols.AA_MASS_DIFF_SITES] = "" return ( - self._psm_df["mods"], - self._psm_df["mod_sites"], - self._psm_df["aa_mass_diffs"], - self._psm_df["aa_mass_diff_sites"], + self._psm_df[PsmDfCols.MODS], + self._psm_df[PsmDfCols.MOD_SITES], + self._psm_df[PsmDfCols.AA_MASS_DIFFS], + self._psm_df[PsmDfCols.AA_MASS_DIFF_SITES], ) = zip( *msf_df[["peptide", "modifications"]].apply( lambda x: _get_mods_from_masses(*x), axis=1 @@ -164,16 +166,19 @@ def _load_modifications(self, msf_df): ) if not self.keep_unknown_aa_mass_diffs: - self._psm_df["to_remove"] += self._psm_df.aa_mass_diffs != "" + self._psm_df[PsmDfCols.TO_REMOVE] += ( + self._psm_df[PsmDfCols.AA_MASS_DIFFS] != "" + ) self._psm_df.drop( - columns=["aa_mass_diffs", "aa_mass_diff_sites"], inplace=True + columns=[PsmDfCols.AA_MASS_DIFFS, PsmDfCols.AA_MASS_DIFF_SITES], + inplace=True, ) def _post_process(self, origin_df: pd.DataFrame): super()._post_process(origin_df) self._psm_df = ( - self._psm_df.query("to_remove==0") - .drop(columns="to_remove") + self._psm_df.query(f"{PsmDfCols.TO_REMOVE}==0") + .drop(columns=PsmDfCols.TO_REMOVE) .reset_index(drop=True) ) diff --git a/alphabase/psm_reader/pfind_reader.py b/alphabase/psm_reader/pfind_reader.py index 69e24549..a182a8c9 100644 --- a/alphabase/psm_reader/pfind_reader.py +++ b/alphabase/psm_reader/pfind_reader.py @@ -2,6 +2,7 @@ import pandas as pd import alphabase.constants.modification as ap_mod +from alphabase.psm_reader.keys import PsmDfCols from alphabase.psm_reader.psm_reader import ( PSMReaderBase, psm_reader_provider, @@ -104,9 +105,6 @@ def __init__( def _init_column_mapping(self): self.column_mapping = psm_reader_yaml["pfind"]["column_mapping"] - def _init_modification_mapping(self): - self.modification_mapping = {} - def _translate_modifications(self): pass @@ -116,29 +114,35 @@ def _load_file(self, filename): ) pfind_df.fillna("", inplace=True) pfind_df = pfind_df[pfind_df.Sequence != ""] - pfind_df["raw_name"] = ( + 
pfind_df[PsmDfCols.RAW_NAME] = ( pfind_df["File_Name"].str.split(".").apply(lambda x: x[0]) ) pfind_df["Proteins"] = pfind_df["Proteins"].apply(parse_pfind_protein) return pfind_df def _translate_decoy(self, origin_df=None): - self._psm_df.decoy = (self._psm_df.decoy == "decoy").astype(np.int8) + self._psm_df[PsmDfCols.DECOY] = ( + self._psm_df[PsmDfCols.DECOY] == "decoy" + ).astype(np.int8) def _translate_score(self, origin_df=None): - self._psm_df.score = -np.log(self._psm_df.score.astype(float) + 1e-100) + self._psm_df[PsmDfCols.SCORE] = -np.log( + self._psm_df[PsmDfCols.SCORE].astype(float) + 1e-100 + ) def _load_modifications(self, pfind_df): if len(pfind_df) == 0: - self._psm_df["mods"] = "" - self._psm_df["mod_sites"] = "" + self._psm_df[PsmDfCols.MODS] = "" + self._psm_df[PsmDfCols.MOD_SITES] = "" return - (self._psm_df["mods"], self._psm_df["mod_sites"]) = zip( + (self._psm_df[PsmDfCols.MODS], self._psm_df[PsmDfCols.MOD_SITES]) = zip( *pfind_df["Modification"].apply(get_pFind_mods) ) - self._psm_df["mods"] = self._psm_df["mods"].apply(translate_pFind_mod) + self._psm_df[PsmDfCols.MODS] = self._psm_df[PsmDfCols.MODS].apply( + translate_pFind_mod + ) def register_readers(): diff --git a/alphabase/psm_reader/psm_reader.py b/alphabase/psm_reader/psm_reader.py index f242564e..381b5b81 100644 --- a/alphabase/psm_reader/psm_reader.py +++ b/alphabase/psm_reader/psm_reader.py @@ -1,6 +1,7 @@ import copy import os import warnings +from typing import Optional import numpy as np import pandas as pd @@ -8,6 +9,7 @@ import alphabase.peptide.mobility as mobility from alphabase.constants._const import CONST_FILE_FOLDER from alphabase.peptide.precursor import reset_precursor_df, update_precursor_mz +from alphabase.psm_reader.keys import PsmDfCols from alphabase.utils import get_delimiter from alphabase.yaml_utils import load_yaml @@ -49,7 +51,7 @@ def translate_other_modification(mod_str: str, mod_dict: dict) -> str: return ";".join(ret_mods), [] -def 
keep_modifications(mod_str: str, mod_set: set) -> str: +def _keep_modifications(mod_str: str, mod_set: set) -> str: """ Check if modifications of `mod_str` are in `mod_set`. @@ -148,12 +150,15 @@ def __init__( If keep decoy PSMs in self.psm_df. _min_max_rt_norm : bool if True, the 'rt_norm' values in self._psm_df - will be normalized by rt_norm = (self.psm_df.rt-rt_min)/(rt_max-rt_min). + will be normalized by rt_norm = (self.psm_df[PsmDfCols.RT]-rt_min)/(rt_max-rt_min). It is useful to normalize iRT values as they contain negative values. Defaults to False. """ - self.set_modification_mapping(None) + self.modification_mapping = None + self.rev_mod_mapping = None + + self.set_modification_mapping() self.add_modification_mapping(modification_mapping) if column_mapping is not None: @@ -162,8 +167,8 @@ def __init__( self._init_column_mapping() self._psm_df = pd.DataFrame() - self.keep_fdr = fdr - self.keep_decoy = keep_decoy + self._keep_fdr = fdr + self._keep_decoy = keep_decoy self._min_max_rt_norm = False self._engine_rt_unit = rt_unit self._min_irt_value = -100 @@ -210,7 +215,7 @@ def add_modification_mapping(self, modification_mapping: dict): self.set_modification_mapping(self.modification_mapping) - def set_modification_mapping(self, modification_mapping: dict): + def set_modification_mapping(self, modification_mapping: Optional[dict] = None): if modification_mapping is None: self._init_modification_mapping() elif isinstance(modification_mapping, str): @@ -224,6 +229,7 @@ def set_modification_mapping(self, modification_mapping: dict): ) else: self.modification_mapping = copy.deepcopy(modification_mapping) + self._mods_as_lists() self._reverse_mod_mapping() @@ -311,18 +317,22 @@ def _translate_score(self, origin_df: pd.DataFrame = None): def _get_table_delimiter(self, _filename): return get_delimiter(_filename) - def normalize_rt(self): - if "rt" in self.psm_df.columns: + def _normalize_rt(self): + if PsmDfCols.RT in self._psm_df.columns: if self._engine_rt_unit 
== "second": # self.psm_df['rt_sec'] = self.psm_df.rt - self.psm_df["rt"] = self.psm_df.rt / 60 - if "rt_start" in self.psm_df.columns: - self.psm_df["rt_start"] = self.psm_df.rt_start / 60 - self.psm_df["rt_stop"] = self.psm_df.rt_stop / 60 + self._psm_df[PsmDfCols.RT] = self._psm_df[PsmDfCols.RT] / 60 + if PsmDfCols.RT_START in self._psm_df.columns: + self._psm_df[PsmDfCols.RT_START] = ( + self._psm_df[PsmDfCols.RT_START] / 60 + ) + self._psm_df[PsmDfCols.RT_STOP] = ( + self._psm_df[PsmDfCols.RT_STOP] / 60 + ) # elif self._engine_rt_unit == 'minute': # self.psm_df['rt_sec'] = self.psm_df.rt*60 - min_rt = self.psm_df.rt.min() - max_rt = self.psm_df.rt.max() + min_rt = self._psm_df[PsmDfCols.RT].min() + max_rt = self._psm_df[PsmDfCols.RT].max() if min_rt < 0: # iRT if min_rt < self._min_irt_value: min_rt = self._min_irt_value @@ -332,23 +342,20 @@ def normalize_rt(self): elif not self._min_max_rt_norm: min_rt = 0 - self.psm_df["rt_norm"] = ( - (self.psm_df.rt - min_rt) / (max_rt - min_rt) + self._psm_df[PsmDfCols.RT_NORM] = ( + (self._psm_df[PsmDfCols.RT] - min_rt) / (max_rt - min_rt) ).clip(0, 1) - def norm_rt(self): - self.normalize_rt() - def normalize_rt_by_raw_name(self): - if "rt" not in self.psm_df.columns: + if PsmDfCols.RT not in self._psm_df.columns: return - if "rt_norm" not in self.psm_df.columns: - self.norm_rt() - if "raw_name" not in self.psm_df.columns: + if PsmDfCols.RT_NORM not in self._psm_df.columns: + self._normalize_rt() + if PsmDfCols.RAW_NAME not in self._psm_df.columns: return - for _, df_group in self.psm_df.groupby("raw_name"): - self.psm_df.loc[df_group.index, "rt_norm"] = ( - df_group.rt_norm / df_group.rt_norm.max() + for _, df_group in self._psm_df.groupby(PsmDfCols.RAW_NAME): + self._psm_df.loc[df_group.index, PsmDfCols.RT_NORM] = ( + df_group[PsmDfCols.RT_NORM] / df_group[PsmDfCols.RT_NORM].max() ) def _load_file(self, filename: str) -> pd.DataFrame: @@ -407,10 +414,10 @@ def _translate_columns(self, origin_df: pd.DataFrame): 
self._psm_df[col] = origin_df[map_col] if ( - "scan_num" in self._psm_df.columns - and "spec_idx" not in self._psm_df.columns + PsmDfCols.SCAN_NUM in self._psm_df.columns + and PsmDfCols.SPEC_IDX not in self._psm_df.columns ): - self._psm_df["spec_idx"] = self._psm_df.scan_num - 1 + self._psm_df[PsmDfCols.SPEC_IDX] = self._psm_df[PsmDfCols.SCAN_NUM] - 1 def _transform_table(self, origin_df: pd.DataFrame): """ @@ -454,8 +461,8 @@ def _translate_modifications(self): not in `self.modification_mapping` """ - self._psm_df.mods, unknown_mods = zip( - *self._psm_df.mods.apply( + self._psm_df[PsmDfCols.MODS], unknown_mods = zip( + *self._psm_df[PsmDfCols.MODS].apply( translate_other_modification, mod_dict=self.rev_mod_mapping ) ) @@ -482,32 +489,38 @@ def _post_process(self, origin_df: pd.DataFrame): origin_df : pd.DataFrame the loaded original df """ - self._psm_df["nAA"] = self._psm_df.sequence.str.len() + self._psm_df[PsmDfCols.NAA] = self._psm_df[PsmDfCols.SEQUENCE].str.len() self.normalize_rt_by_raw_name() - self._psm_df = self._psm_df[~self._psm_df["mods"].isna()] + self._psm_df = self._psm_df[~self._psm_df[PsmDfCols.MODS].isna()] keep_rows = np.ones(len(self._psm_df), dtype=bool) - if "fdr" in self._psm_df.columns: - keep_rows &= self._psm_df.fdr <= self.keep_fdr - if "decoy" in self._psm_df.columns and not self.keep_decoy: - keep_rows &= self._psm_df.decoy == 0 + if PsmDfCols.FDR in self._psm_df.columns: + keep_rows &= self._psm_df[PsmDfCols.FDR] <= self._keep_fdr + if PsmDfCols.DECOY in self._psm_df.columns and not self._keep_decoy: + keep_rows &= self._psm_df[PsmDfCols.DECOY] == 0 self._psm_df = self._psm_df[keep_rows] reset_precursor_df(self._psm_df) - if "precursor_mz" not in self._psm_df: + if PsmDfCols.PRECURSOR_MZ not in self._psm_df: self._psm_df = update_precursor_mz(self._psm_df) - if "ccs" in self._psm_df.columns and "mobility" not in self._psm_df.columns: - self._psm_df["mobility"] = mobility.ccs_to_mobility_for_df( - self._psm_df, "ccs" + if ( + 
PsmDfCols.CCS in self._psm_df.columns + and PsmDfCols.MOBILITY not in self._psm_df.columns + ): + self._psm_df[PsmDfCols.MOBILITY] = mobility.ccs_to_mobility_for_df( + self._psm_df, PsmDfCols.CCS ) - elif "mobility" in self._psm_df.columns and "ccs" not in self._psm_df.columns: - self._psm_df["ccs"] = mobility.mobility_to_ccs_for_df( - self._psm_df, "mobility" + elif ( + PsmDfCols.MOBILITY in self._psm_df.columns + and PsmDfCols.CCS not in self._psm_df.columns + ): + self._psm_df[PsmDfCols.CCS] = mobility.mobility_to_ccs_for_df( + self._psm_df, PsmDfCols.MOBILITY ) def filter_psm_by_modifications( @@ -527,11 +540,11 @@ def filter_psm_by_modifications( "Acetyl@Protein_N-term", ] ) - self._psm_df.mods = self._psm_df.mods.apply( - keep_modifications, mod_set=include_mod_set + self._psm_df[PsmDfCols.MODS] = self._psm_df[PsmDfCols.MODS].apply( + _keep_modifications, mod_set=include_mod_set ) - self._psm_df.dropna(subset=["mods"], inplace=True) + self._psm_df.dropna(subset=[PsmDfCols.MODS], inplace=True) self._psm_df.reset_index(drop=True, inplace=True) diff --git a/alphabase/psm_reader/sage_reader.py b/alphabase/psm_reader/sage_reader.py index 4d237514..c4a8cebc 100644 --- a/alphabase/psm_reader/sage_reader.py +++ b/alphabase/psm_reader/sage_reader.py @@ -9,6 +9,7 @@ from tqdm import tqdm from alphabase.constants.modification import MOD_DF +from alphabase.psm_reader.keys import PsmDfCols from alphabase.psm_reader.psm_reader import ( PSMReaderBase, psm_reader_provider, @@ -94,7 +95,7 @@ def __call__(self, psm_df: pd.DataFrame) -> pd.DataFrame: translated_psm_df = _apply_translate_modifications_mp(psm_df, translation_df) # 5. 
Drop PSMs with missing modifications - is_null = translated_psm_df["mod_sites"].isnull() + is_null = translated_psm_df[PsmDfCols.MOD_SITES].isnull() translated_psm_df = translated_psm_df[~is_null] if np.sum(is_null) > 0: logging.warning( @@ -217,7 +218,10 @@ def _discover_modifications(psm_df: pd.DataFrame) -> pd.DataFrame: """ modifications = ( - psm_df["modified_sequence"].apply(_match_modified_sequence).explode().unique() + psm_df[PsmDfCols.MODIFIED_SEQUENCE] + .apply(_match_modified_sequence) + .explode() + .unique() ) modifications = modifications[~pd.isnull(modifications)] return pd.DataFrame( @@ -414,14 +418,14 @@ def _translate_modifications( def _apply_translate_modifications( - df: pd.DataFrame, mod_translation_df: pd.DataFrame + psm_df: pd.DataFrame, mod_translation_df: pd.DataFrame ) -> pd.DataFrame: """Apply the translation of modifications to the PSMs. Parameters ---------- - df : pd.DataFrame + psm_df : pd.DataFrame The PSM dataframe with column 'modified_sequence'. mod_translation_df : pd.DataFrame @@ -435,12 +439,12 @@ def _apply_translate_modifications( """ - df["mod_sites"], df["mods"] = zip( - *df["modified_sequence"].apply( + psm_df[PsmDfCols.MOD_SITES], psm_df[PsmDfCols.MODS] = zip( + *psm_df[PsmDfCols.MODIFIED_SEQUENCE].apply( lambda x: _translate_modifications(x, mod_translation_df) ) ) - return df + return psm_df def _batchify_df(df: pd.DataFrame, mp_batch_size: int) -> typing.Generator: @@ -466,7 +470,7 @@ def _batchify_df(df: pd.DataFrame, mp_batch_size: int) -> typing.Generator: def _apply_translate_modifications_mp( - df: pd.DataFrame, + psm_df: pd.DataFrame, mod_translation_df: pd.DataFrame, mp_batch_size: int = 50000, mp_process_num: int = 10, @@ -477,7 +481,7 @@ def _apply_translate_modifications_mp( Parameters ---------- - df : pd.DataFrame + psm_df : pd.DataFrame The PSM dataframe. 
mod_translation_df : pd.DataFrame @@ -496,11 +500,11 @@ def _apply_translate_modifications_mp( partial( _apply_translate_modifications, mod_translation_df=mod_translation_df ), - _batchify_df(df, mp_batch_size), + _batchify_df(psm_df, mp_batch_size), ) if progress_bar: df_list = list( - tqdm(processing, total=int(np.ceil(len(df) / mp_batch_size))) + tqdm(processing, total=int(np.ceil(len(psm_df) / mp_batch_size))) ) else: df_list = list(processing) @@ -593,28 +597,31 @@ def __init__( def _init_column_mapping(self): self.column_mapping = psm_reader_yaml["sage"]["column_mapping"] - def _init_modification_mapping(self): - self.modification_mapping = {} - def _load_file(self, filename): raise NotImplementedError def _transform_table(self, origin_df): - self.psm_df["spec_idx"] = self.psm_df["scannr"].apply( + self._psm_df[PsmDfCols.SPEC_IDX] = self._psm_df[PsmDfCols.SCANNR].apply( _sage_spec_idx_from_scan_nr ) - self.psm_df.drop(columns=["scannr"], inplace=True) + self._psm_df.drop(columns=[PsmDfCols.SCANNR], inplace=True) def _translate_decoy(self, origin_df): - if not self.keep_decoy: - self._psm_df = self.psm_df[~self.psm_df["decoy"]] + if not self._keep_decoy: + self._psm_df = self._psm_df[~self._psm_df[PsmDfCols.DECOY]] - self._psm_df = self.psm_df[self.psm_df["fdr"] <= self.keep_fdr] - self._psm_df = self.psm_df[self.psm_df["peptide_fdr"] <= self.keep_fdr] - self._psm_df = self.psm_df[self.psm_df["protein_fdr"] <= self.keep_fdr] + self._psm_df = self._psm_df[self._psm_df[PsmDfCols.FDR] <= self._keep_fdr] + self._psm_df = self._psm_df[ + self._psm_df[PsmDfCols.PEPTIDE_FDR] <= self._keep_fdr + ] + self._psm_df = self._psm_df[ + self._psm_df[PsmDfCols.PROTEIN_FDR] <= self._keep_fdr + ] # drop peptide_fdr, protein_fdr - self._psm_df.drop(columns=["peptide_fdr", "protein_fdr"], inplace=True) + self._psm_df.drop( + columns=[PsmDfCols.PEPTIDE_FDR, PsmDfCols.PROTEIN_FDR], inplace=True + ) def _load_modifications(self, origin_df): pass @@ -627,7 +634,7 @@ def 
_translate_modifications(self): self._psm_df = sage_translation(self._psm_df) # drop modified_sequence - self._psm_df.drop(columns=["modified_sequence"], inplace=True) + self._psm_df.drop(columns=[PsmDfCols.MODIFIED_SEQUENCE], inplace=True) class SageReaderTSV(SageReaderBase): diff --git a/alphabase/spectral_library/reader.py b/alphabase/spectral_library/reader.py index 59f7dcda..ca5fc761 100644 --- a/alphabase/spectral_library/reader.py +++ b/alphabase/spectral_library/reader.py @@ -7,6 +7,7 @@ from alphabase.constants._const import PEAK_INTENSITY_DTYPE from alphabase.peptide.mobility import mobility_to_ccs_for_df from alphabase.psm_reader import psm_reader_provider +from alphabase.psm_reader.keys import LibPsmDfCols, PsmDfCols from alphabase.psm_reader.maxquant_reader import MaxQuantReader from alphabase.psm_reader.psm_reader import psm_reader_yaml from alphabase.spectral_library.base import SpecLibBase @@ -115,19 +116,21 @@ def _find_key_columns(self, lib_df: pd.DataFrame): Dataframe containing the spectral library. 
""" - if "fragment_loss_type" not in lib_df.columns: - lib_df["fragment_loss_type"] = "" + if LibPsmDfCols.FRAGMENT_LOSS_TYPE not in lib_df.columns: + lib_df[LibPsmDfCols.FRAGMENT_LOSS_TYPE] = "" - lib_df.fillna({"fragment_loss_type": ""}, inplace=True) + lib_df.fillna({LibPsmDfCols.FRAGMENT_LOSS_TYPE: ""}, inplace=True) lib_df.replace( - {"fragment_loss_type": "noloss"}, {"fragment_loss_type": ""}, inplace=True + {LibPsmDfCols.FRAGMENT_LOSS_TYPE: "noloss"}, + {LibPsmDfCols.FRAGMENT_LOSS_TYPE: ""}, + inplace=True, ) - if "mods" not in lib_df.columns: - lib_df["mods"] = "" + if PsmDfCols.MODS not in lib_df.columns: + lib_df[PsmDfCols.MODS] = "" - if "mod_sites" not in lib_df.columns: - lib_df["mod_sites"] = "" + if PsmDfCols.MOD_SITES not in lib_df.columns: + lib_df[PsmDfCols.MOD_SITES] = "" def _get_fragment_intensity(self, lib_df: pd.DataFrame): """ @@ -161,21 +164,21 @@ def _get_fragment_intensity(self, lib_df: pd.DataFrame): nAA_list = [] fragment_columns = [ - "fragment_mz", - "fragment_type", - "fragment_charge", - "fragment_series", - "fragment_loss_type", - "fragment_intensity", + LibPsmDfCols.FRAGMENT_MZ, + LibPsmDfCols.FRAGMENT_TYPE, + LibPsmDfCols.FRAGMENT_CHARGE, + LibPsmDfCols.FRAGMENT_SERIES, + LibPsmDfCols.FRAGMENT_LOSS_TYPE, + LibPsmDfCols.FRAGMENT_INTENSITY, ] # by default, all non-fragment columns are used to group the library - non_fragment_columns = list(set(lib_df.columns) - set(fragment_columns)) + non_fragment_columns = sorted(list(set(lib_df.columns) - set(fragment_columns))) for keys, df_group in tqdm(lib_df.groupby(non_fragment_columns)): precursor_columns = dict(zip(non_fragment_columns, keys)) - nAA = len(precursor_columns["sequence"]) + nAA = len(precursor_columns[PsmDfCols.SEQUENCE]) intens = np.zeros( (nAA - 1, len(self.charged_frag_types)), @@ -183,11 +186,11 @@ def _get_fragment_intensity(self, lib_df: pd.DataFrame): ) for frag_type, frag_num, loss_type, frag_charge, inten in df_group[ [ - "fragment_type", - "fragment_series", - 
"fragment_loss_type", - "fragment_charge", - "fragment_intensity", + LibPsmDfCols.FRAGMENT_TYPE, + LibPsmDfCols.FRAGMENT_SERIES, + LibPsmDfCols.FRAGMENT_LOSS_TYPE, + LibPsmDfCols.FRAGMENT_CHARGE, + LibPsmDfCols.FRAGMENT_INTENSITY, ] ].values: if frag_type in "abc": @@ -233,8 +236,8 @@ def _get_fragment_intensity(self, lib_df: pd.DataFrame): indices[1:] = np.array(nAA_list) - 1 indices = np.cumsum(indices) - df["frag_start_idx"] = indices[:-1] - df["frag_stop_idx"] = indices[1:] + df[LibPsmDfCols.FRAG_START_IDX] = indices[:-1] + df[LibPsmDfCols.FRAG_STOP_IDX] = indices[1:] return df @@ -286,7 +289,7 @@ def _post_process( # identify unknown modifications len_before = len(self._psm_df) - self._psm_df = self._psm_df[~self._psm_df["mods"].isna()] + self._psm_df = self._psm_df[~self._psm_df[PsmDfCols.MODS].isna()] len_after = len(self._psm_df) if len_before != len_after: @@ -294,17 +297,19 @@ def _post_process( f"{len_before-len_after} Entries with unknown modifications are removed" ) - if "nAA" not in self._psm_df.columns: - self._psm_df["nAA"] = self._psm_df.sequence.str.len() + if PsmDfCols.NAA not in self._psm_df.columns: + self._psm_df[PsmDfCols.NAA] = self._psm_df[PsmDfCols.SEQUENCE].str.len() self._psm_df = self._get_fragment_intensity(self._psm_df) self.normalize_rt_by_raw_name() - if "mobility" in self._psm_df.columns: - self._psm_df["ccs"] = mobility_to_ccs_for_df(self._psm_df, "mobility") + if PsmDfCols.MOBILITY in self._psm_df.columns: + self._psm_df[PsmDfCols.CCS] = mobility_to_ccs_for_df( + self._psm_df, PsmDfCols.MOBILITY + ) - self._psm_df.drop("modified_sequence", axis=1, inplace=True) + self._psm_df.drop(PsmDfCols.MODIFIED_SEQUENCE, axis=1, inplace=True) self._precursor_df = self._psm_df self.calc_fragment_mz_df() diff --git a/docs/conf.py b/docs/conf.py index 687c35f7..c63a2cef 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,7 +24,7 @@ copyright = "2022, Mann Labs, MPIB" author = "Mann Labs, MPIB" -release = "1.4.1" +release = "1.4.2" # -- General 
configuration --------------------------------------------------- diff --git a/nbs_tests/psm_reader/dia_psm_reader.ipynb b/nbs_tests/psm_reader/dia_psm_reader.ipynb index 2d15c477..a008b7c4 100644 --- a/nbs_tests/psm_reader/dia_psm_reader.ipynb +++ b/nbs_tests/psm_reader/dia_psm_reader.ipynb @@ -438,9 +438,9 @@ "202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_100ug_test_S4-A1_1_25843\t2\t\t_ALVAT[+80]PGK_\t\tTrue\t_ALVAT[Phospho (STY)]PGK_\tALVATPGK\t-5.032703\t0.758\t-5.032703\tP19338\tFalse\t_ALVAT[+80]PGK_\t_ALVAT[Phospho (STY)]PGK_\t418.717511324722\t0\t10352\tH3PO4\t4\ty\t1\t384.224142529733\t26.31595\tFalse\tsp\tP19338\tP19338\tNUCL_HUMAN\tNucleolin\tHomo sapiens\t\tNCL\t1\t3\tMCT_human_UP000005640_9606\n", "202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_100ug_test_S4-A1_1_25843\t2\t\t_TLT[+80]PCPLR_\t\tTrue\t_TLT[Phospho (STY)]PC[Carbamidomethyl (C)]PLR_\tTLTPCPLR\t27.71659\t0.818\t27.71659\tQ5T200\tFalse\t_TLT[+80]PPLR_\t_TLT[Phospho (STY)]PPLR_\t439.230785875227\t0.000138389150379226\t23117\tnoloss\t3\tb\t1\t396.153027901512\t6.3264\tFalse\tsp\tQ5T200\tQ5T200\tZC3HD_HUMAN\tZinc finger CCCH domain-containing protein 13\tHomo sapiens\t\tZC3H13\t1\t1\tMCT_human_UP000005640_9606\n", "202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_100ug_test_S4-A1_1_25843\t2\t\t_TLT[+80]PCPLR_\t\tTrue\t_TLT[Phospho (STY)]PC[Carbamidomethyl (C)]PLR_\tTLTPCPLR\t27.71659\t0.818\t27.71659\tQ5T200\tFalse\t_TLT[+80]PPLR_\t_TLT[Phospho (STY)]PPLR_\t439.230785875227\t0.000138389150379226\t23117\tnoloss\t3\ty\t1\t385.255780000092\t29.70625\tFalse\tsp\tQ5T200\tQ5T200\tZC3HD_HUMAN\tZinc finger CCCH domain-containing protein 13\tHomo sapiens\t\tZC3H13\t1\t1\tMCT_human_UP000005640_9606\n", - "202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_library25_S4-C1_1_25867\t2\t\t_LFVT[+80]PPEGSSR_\t\tTrue\t_[Acetyl (Protein_N-term)]LFVS[Phospho (STY)]PPEGSSR_\tLFVSPPEGSSR\t38.05031\t0.917\t38.05031\tQ14244;Q14244-6;Q14244-7\tFalse\t_LFVT[+80]PPEGSSR_\t_LFVT[Phospho 
(STY)]PPEGSSR_\t635.297385373987\t0\t14164\tH3PO4\t4\tb\t1\t443.265279065723\t12.24525\tFalse\tsp\tQ14244;Q14244-6;Q14244-7\tQ14244;Q14244-6;Q14244-7\tMAP7_HUMAN\tEnsconsin;Isoform of Q14244, Isoform 6 of Ensconsin;Isoform of Q14244, Isoform 7 of Ensconsin\tHomo sapiens\t\tMAP7\t1;;\t1;;\tMCT_human_UP000005640_9606;MCT_human2_UP000005640_9606_additional;MCT_human2_UP000005640_9606_additional\n", - "202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_library25_S4-C1_1_25867\t2\t\t_LFVT[+80]PPEGSSR_\t\tTrue\t_[Acetyl (Protein_N-term)]LFVS[Phospho (STY)]PPEGSSR_\tLFVSPPEGSSR\t38.05031\t0.917\t38.05031\tQ14244;Q14244-6;Q14244-7\tFalse\t_LFVT[+80]PPEGSSR_\t_LFVT[Phospho (STY)]PPEGSSR_\t635.297385373987\t0\t14164\tnoloss\t6\ty\t1\t632.299829640042\t46.07855\tFalse\tsp\tQ14244;Q14244-6;Q14244-7\tQ14244;Q14244-6;Q14244-7\tMAP7_HUMAN\tEnsconsin;Isoform of Q14244, Isoform 6 of Ensconsin;Isoform of Q14244, Isoform 7 of Ensconsin\tHomo sapiens\t\tMAP7\t1;;\t1;;\tMCT_human_UP000005640_9606;MCT_human2_UP000005640_9606_additional;MCT_human2_UP000005640_9606_additional\n", - "202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_library25_S4-C1_1_25867\t2\t\t_LFVT[+80]PPEGSSR_\t\tTrue\t_[Acetyl (Protein_N-term)]LFVS[Phospho (STY)]PPEGSSR_\tLFVSPPEGSSR\t38.05031\t0.917\t38.05031\tQ14244;Q14244-6;Q14244-7\tFalse\t_LFVT[+80]PPEGSSR_\t_LFVT[Phospho (STY)]PPEGSSR_\t635.297385373987\t0\t14164\tnoloss\t7\ty\t1\t729.352593488892\t100\tFalse\tsp\tQ14244;Q14244-6;Q14244-7\tQ14244;Q14244-6;Q14244-7\tMAP7_HUMAN\tEnsconsin;Isoform of Q14244, Isoform 6 of Ensconsin;Isoform of Q14244, Isoform 7 of Ensconsin\tHomo sapiens\t\tMAP7\t1;;\t1;;\tMCT_human_UP000005640_9606;MCT_human2_UP000005640_9606_additional;MCT_human2_UP000005640_9606_additional\n", + "202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_library25_S4-C1_1_25867\t2\t\t_LFVT[+80]PPEGSSR_\t\tTrue\t_[Acetyl (Protein N-term)]LFVS[Phospho 
(STY)]PPEGSSR_\tLFVSPPEGSSR\t38.05031\t0.917\t38.05031\tQ14244;Q14244-6;Q14244-7\tFalse\t_LFVT[+80]PPEGSSR_\t_LFVT[Phospho (STY)]PPEGSSR_\t635.297385373987\t0\t14164\tH3PO4\t4\tb\t1\t443.265279065723\t12.24525\tFalse\tsp\tQ14244;Q14244-6;Q14244-7\tQ14244;Q14244-6;Q14244-7\tMAP7_HUMAN\tEnsconsin;Isoform of Q14244, Isoform 6 of Ensconsin;Isoform of Q14244, Isoform 7 of Ensconsin\tHomo sapiens\t\tMAP7\t1;;\t1;;\tMCT_human_UP000005640_9606;MCT_human2_UP000005640_9606_additional;MCT_human2_UP000005640_9606_additional\n", + "202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_library25_S4-C1_1_25867\t2\t\t_LFVT[+80]PPEGSSR_\t\tTrue\t_[Acetyl (Protein N-term)]LFVS[Phospho (STY)]PPEGSSR_\tLFVSPPEGSSR\t38.05031\t0.917\t38.05031\tQ14244;Q14244-6;Q14244-7\tFalse\t_LFVT[+80]PPEGSSR_\t_LFVT[Phospho (STY)]PPEGSSR_\t635.297385373987\t0\t14164\tnoloss\t6\ty\t1\t632.299829640042\t46.07855\tFalse\tsp\tQ14244;Q14244-6;Q14244-7\tQ14244;Q14244-6;Q14244-7\tMAP7_HUMAN\tEnsconsin;Isoform of Q14244, Isoform 6 of Ensconsin;Isoform of Q14244, Isoform 7 of Ensconsin\tHomo sapiens\t\tMAP7\t1;;\t1;;\tMCT_human_UP000005640_9606;MCT_human2_UP000005640_9606_additional;MCT_human2_UP000005640_9606_additional\n", + "202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_library25_S4-C1_1_25867\t2\t\t_LFVT[+80]PPEGSSR_\t\tTrue\t_[Acetyl (Protein N-term)]LFVS[Phospho (STY)]PPEGSSR_\tLFVSPPEGSSR\t38.05031\t0.917\t38.05031\tQ14244;Q14244-6;Q14244-7\tFalse\t_LFVT[+80]PPEGSSR_\t_LFVT[Phospho (STY)]PPEGSSR_\t635.297385373987\t0\t14164\tnoloss\t7\ty\t1\t729.352593488892\t100\tFalse\tsp\tQ14244;Q14244-6;Q14244-7\tQ14244;Q14244-6;Q14244-7\tMAP7_HUMAN\tEnsconsin;Isoform of Q14244, Isoform 6 of Ensconsin;Isoform of Q14244, Isoform 7 of Ensconsin\tHomo sapiens\t\tMAP7\t1;;\t1;;\tMCT_human_UP000005640_9606;MCT_human2_UP000005640_9606_additional;MCT_human2_UP000005640_9606_additional\n", "''')\n", "\n", "spn_reader = psm_reader_provider.get_reader('spectronaut')\n", diff --git a/nbs_tests/psm_reader/maxquant_reader.ipynb 
b/nbs_tests/psm_reader/maxquant_reader.ipynb index 16b5f745..f1e63a2c 100644 --- a/nbs_tests/psm_reader/maxquant_reader.ipynb +++ b/nbs_tests/psm_reader/maxquant_reader.ipynb @@ -154,10 +154,10 @@ "metadata": {}, "outputs": [], "source": [ - "mq_tsv = io.StringIO('''Raw file\tScan number\tScan index\tSequence\tLength\tMissed cleavages\tModifications\tModified sequence\tOxidation (M) Probabilities\tOxidation (M) Score diffs\tAcetyl (Protein_N-term)\tOxidation (M)\tProteins\tCharge\tFragmentation\tMass analyzer\tType\tScan event number\tIsotope index\tm/z\tMass\tMass error [ppm]\tMass error [Da]\tSimple mass error [ppm]\tRetention time\tPEP\tScore\tDelta score\tScore diff\tLocalization prob\tCombinatorics\tPIF\tFraction of total spectrum\tBase peak fraction\tPrecursor full scan number\tPrecursor Intensity\tPrecursor apex fraction\tPrecursor apex offset\tPrecursor apex offset time\tMatches\tIntensities\tMass deviations [Da]\tMass deviations [ppm]\tMasses\tNumber of matches\tIntensity coverage\tPeak coverage\tNeutral loss level\tETD identification type\tReverse\tAll scores\tAll sequences\tAll modified sequences\tReporter PIF\tReporter fraction\tid\tProtein group IDs\tPeptide ID\tMod. 
peptide ID\tEvidence ID\tOxidation (M) site IDs\n", - "20190402_QX1_SeVW_MA_HeLa_500ng_LC11\t81358\t73979\tAAAAAAAAAPAAAATAPTTAATTAATAAQ\t29\t0\tUnmodified\t_(Acetyl (Protein_N-term))AAAAAAAAM(Oxidation (M))PAAAATAPTTAATTAATAAQ_\t\t\t0\t0\tsp|P37108|SRP14_HUMAN\t3\tHCD\tFTMS\tMULTI-MSMS\t13\t1\t790.07495\t2367.203\t0.35311\t0.00027898\t-0.061634807\t70.261\t0.012774\t41.423\t36.666\tNaN\tNaN\t1\t0\t0\t0\t81345\t10653955\t0.0338597821787898\t-11\t0.139877319335938\ty1;y2;y3;y4;y11;y1-NH3;y2-NH3;a2;b2;b3;b4;b5;b6;b7;b8;b9;b11;b12;b6(2+);b8(2+);b13(2+);b18(2+)\t2000000;2000000;300000;400000;200000;1000000;400000;300000;600000;1000000;2000000;3000000;3000000;3000000;3000000;2000000;600000;500000;1000000;2000000;300000;200000\t5.2861228709844E-06;-6.86980268369553E-05;-0.00238178789771837;0.000624715964988809;-0.0145624692099773;-0.000143471782706683;-0.000609501446461991;-0.000524972720768346;0.00010190530804266;5.8620815195809E-05;0.000229901232955854;-0.000108750048696038;-0.000229593152369034;0.00183148682538103;0.00276641182404092;0.000193118923334623;0.00200988580445483;0.000102216846016745;5.86208151389656E-05;0.000229901232955854;-0.00104559184393338;0.00525030008475369\t0.0359413365445091;-0.314964433555295;-8.23711898839045;1.60102421155213;-14.8975999917227;-1.10320467763838;-3.03102462870716;-4.56152475051625;0.712219104095465;0.273777366204575;0.806231096969562;-0.305312183824154;-0.537399178230218;3.67572664689217;4.85930954169285;0.301587577451224;2.48616190909398;0.116225745519871;0.273777365939099;0.806231096969562;-2.19774169175011;7.53961026980589\t147.076413378177;218.113601150127;289.153028027798;390.197699998035;977.50437775671;130.050013034583;201.087592852046;115.087114392821;143.081402136892;214.118559209185;285.155501716567;356.192954155649;427.230188786552;498.265241494374;569.301420357176;640.341107437877;808.429168310795;879.468189767554;214.118559209185;285.155501716567;475.757386711244;696.362265007215\t22\t0.262893575628735\t0.082644628099
1736\tNone\tUnknown\t\t41.4230894199432;4.75668724862449;3.9515580701967\tAAAAAAAAAPAAAATAPTTAATTAATAAQ;FHRGPPDKDDMVSVTQILQGK;PVTLWITVTHMQADEVSVWR\t_AAAAAAAAAPAAAATAPTTAATTAATAAQ_;_FHRGPPDKDDMVSVTQILQGK_;_PVTLWITVTHMQADEVSVWR_\t\t\t0\t1443\t0\t0\t0\t\n", + "mq_tsv = io.StringIO('''Raw file\tScan number\tScan index\tSequence\tLength\tMissed cleavages\tModifications\tModified sequence\tOxidation (M) Probabilities\tOxidation (M) Score diffs\tAcetyl (Protein N-term)\tOxidation (M)\tProteins\tCharge\tFragmentation\tMass analyzer\tType\tScan event number\tIsotope index\tm/z\tMass\tMass error [ppm]\tMass error [Da]\tSimple mass error [ppm]\tRetention time\tPEP\tScore\tDelta score\tScore diff\tLocalization prob\tCombinatorics\tPIF\tFraction of total spectrum\tBase peak fraction\tPrecursor full scan number\tPrecursor Intensity\tPrecursor apex fraction\tPrecursor apex offset\tPrecursor apex offset time\tMatches\tIntensities\tMass deviations [Da]\tMass deviations [ppm]\tMasses\tNumber of matches\tIntensity coverage\tPeak coverage\tNeutral loss level\tETD identification type\tReverse\tAll scores\tAll sequences\tAll modified sequences\tReporter PIF\tReporter fraction\tid\tProtein group IDs\tPeptide ID\tMod. 
peptide ID\tEvidence ID\tOxidation (M) site IDs\n", + "20190402_QX1_SeVW_MA_HeLa_500ng_LC11\t81358\t73979\tAAAAAAAAAPAAAATAPTTAATTAATAAQ\t29\t0\tUnmodified\t_(Acetyl (Protein N-term))AAAAAAAAM(Oxidation (M))PAAAATAPTTAATTAATAAQ_\t\t\t0\t0\tsp|P37108|SRP14_HUMAN\t3\tHCD\tFTMS\tMULTI-MSMS\t13\t1\t790.07495\t2367.203\t0.35311\t0.00027898\t-0.061634807\t70.261\t0.012774\t41.423\t36.666\tNaN\tNaN\t1\t0\t0\t0\t81345\t10653955\t0.0338597821787898\t-11\t0.139877319335938\ty1;y2;y3;y4;y11;y1-NH3;y2-NH3;a2;b2;b3;b4;b5;b6;b7;b8;b9;b11;b12;b6(2+);b8(2+);b13(2+);b18(2+)\t2000000;2000000;300000;400000;200000;1000000;400000;300000;600000;1000000;2000000;3000000;3000000;3000000;3000000;2000000;600000;500000;1000000;2000000;300000;200000\t5.2861228709844E-06;-6.86980268369553E-05;-0.00238178789771837;0.000624715964988809;-0.0145624692099773;-0.000143471782706683;-0.000609501446461991;-0.000524972720768346;0.00010190530804266;5.8620815195809E-05;0.000229901232955854;-0.000108750048696038;-0.000229593152369034;0.00183148682538103;0.00276641182404092;0.000193118923334623;0.00200988580445483;0.000102216846016745;5.86208151389656E-05;0.000229901232955854;-0.00104559184393338;0.00525030008475369\t0.0359413365445091;-0.314964433555295;-8.23711898839045;1.60102421155213;-14.8975999917227;-1.10320467763838;-3.03102462870716;-4.56152475051625;0.712219104095465;0.273777366204575;0.806231096969562;-0.305312183824154;-0.537399178230218;3.67572664689217;4.85930954169285;0.301587577451224;2.48616190909398;0.116225745519871;0.273777365939099;0.806231096969562;-2.19774169175011;7.53961026980589\t147.076413378177;218.113601150127;289.153028027798;390.197699998035;977.50437775671;130.050013034583;201.087592852046;115.087114392821;143.081402136892;214.118559209185;285.155501716567;356.192954155649;427.230188786552;498.265241494374;569.301420357176;640.341107437877;808.429168310795;879.468189767554;214.118559209185;285.155501716567;475.757386711244;696.362265007215\t22\t0.262893575628735\t0.082644628099
1736\tNone\tUnknown\t\t41.4230894199432;4.75668724862449;3.9515580701967\tAAAAAAAAAPAAAATAPTTAATTAATAAQ;FHRGPPDKDDMVSVTQILQGK;PVTLWITVTHMQADEVSVWR\t_AAAAAAAAAPAAAATAPTTAATTAATAAQ_;_FHRGPPDKDDMVSVTQILQGK_;_PVTLWITVTHMQADEVSVWR_\t\t\t0\t1443\t0\t0\t0\t\n", "20190402_QX1_SeVW_MA_HeLa_500ng_LC11\t81391\t74010\tAAAAAAAAAAPAAAATAPTTAATTAATAAQ\t29\t0\tUnmodified\t_AAAAAAAAAPAAAATAPTTAATTAATAAQ_\t\t\t0\t0\tsp|P37108|SRP14_HUMAN\t2\tHCD\tFTMS\tMULTI-MSMS\t14\t0\t1184.6088\t2367.203\t0.037108\t4.3959E-05\t1.7026696\t70.287\t7.1474E-09\t118.21\t100.52\tNaN\tNaN\t1\t0\t0\t0\t81377\t9347701\t0.166790347889974\t-10\t0.12664794921875\ty1;y2;y3;y4;y5;y9;y12;y13;y14;y20;y13-H2O;y20-H2O;y1-NH3;y20-NH3;b3;b4;b5;b6;b7;b8;b9;b11;b12;b13;b14;b15;b16;b19;b15-H2O;b16-H2O\t500000;600000;200000;400000;200000;100000;200000;1000000;200000;300000;200000;100000;100000;70000;300000;900000;2000000;3000000;5000000;8000000;6000000;600000;800000;600000;200000;300000;200000;300000;300000;1000000\t-0.000194444760495571;0.000149986878682284;0.000774202587820128;-0.0002445094036716;0.000374520568641401;-0.00694293246522193;-0.0109837291331587;-0.0037745820627606;-0.000945546471939451;0.00152326440706929;0.00506054832726477;0.00996886361417637;6.25847393393997E-05;-0.024881067836759;-3.11821549132674E-05;-0.000183099230639527;0.000161332473453513;0.000265434980121881;0.000747070697229901;0.000975534518261156;0.00101513939785036;0.00651913000274362;0.0058584595163893;0.00579536744021425;0.00131097834105276;-0.0131378531671089;0.00472955218901916;-0.00161006322559842;-0.00201443239325272;0.0227149399370319\t-1.32206444236914;0.687655553213019;2.6775131607882;-0.626628140021726;0.811995006209331;-8.6203492854282;-10.1838066275079;-3.21078702288986;-0.758483069159249;0.881072738747222;4.37168212373889;5.82682888353564;0.481236695337485;-14.5343986203644;-0.145630261806375;-0.642102166533079;0.452935954800214;0.621293379181583;1.49934012872483;1.71355878380837;1.58531240493271;8.06399202403175;6.6614096214532;
6.09718023739784;1.28333378040908;-11.7030234519348;3.96235146626144;-1.07856912288932;-1.82370619437775;19.3220953109188\t147.07661310906;218.113382465221;289.149872037312;390.198569223404;461.235063981231;805.411965958065;1078.54847749073;1175.59403219566;1246.62831694787;1728.87474561429;1157.57463237897;1710.85573532879;130.049806978061;1711.87460084504;214.118649012155;285.155914717031;356.192684073126;427.22969375842;498.266325910503;569.303211234482;640.340285417402;808.424659066597;879.462433524883;950.49961040476;1021.54120858166;1122.60333588727;1193.62258226971;1492.77704268533;1104.58164778019;1175.59403219566\t30\t0.474003002083763\t0.167630057803468\tNone\tUnknown\t\t118.209976573419;17.6937689289157;17.2534171481793\tAAAAAAAAAPAAAATAPTTAATTAATAAQ;SELKQEAMQSEQLQSVLYLK;VGSSVPSKASELVVMGDHDAARR\t_AAAAAAAAAPAAAATAPTTAATTAATAAQ_;_SELKQEAM(Oxidation (M))QSEQLQSVLYLK_;_VGSSVPSKASELVVMGDHDAARR_\t\t\t1\t1443\t0\t0\t1\t\n", - "20190402_QX1_SeVW_MA_HeLa_500ng_LC11\t107307\t98306\tAAAAAAAGDSDSWDADAFSVEDPVRK\t26\t1\tAcetyl (Protein_N-term)\t_(Acetyl 
(Protein_N-term))AAAAAAAGDSDSWDADAFSVEDPVRK_\t\t\t1\t0\tsp|O75822|EIF3J_HUMAN\t3\tHCD\tFTMS\tMULTI-MSMS\t10\t2\t879.06841\t2634.1834\t-0.93926\t-0.00082567\t-3.2012471\t90.978\t2.1945E-12\t148.95\t141.24\tNaN\tNaN\t1\t0\t0\t0\t107297\t10193939\t0.267970762043589\t-8\t0.10211181640625\ty1;y2;y4;y5;y6;y7;y8;y9;y10;y11;y12;y13;y14;y15;y17;y18;y19;y20;y21;y23;y21-H2O;y1-NH3;y19-NH3;y14(2+);y16(2+);y22(2+);a2;b2;b3;b4;b5;b6;b7\t300000;200000;3000000;600000;1000000;500000;2000000;1000000;1000000;1000000;90000;1000000;400000;900000;1000000;400000;3000000;2000000;1000000;400000;100000;200000;200000;80000;100000;200000;200000;2000000;5000000;5000000;5000000;2000000;300000\t1.34859050149316E-07;-6.05140996867704E-06;2.27812602133781E-05;0.00128986659160546;-0.00934536073077652;0.000941953783126337;-0.00160424237344614;-0.00239257341399934;-0.00111053968612396;-0.00331340710044969;0.00330702864630439;0.000963683996815234;0.00596290290945944;-0.00662057038289277;-0.0117122701335575;0.00777853472800416;0.0021841542961738;0.000144322111736983;-0.00087403893667215;0.0197121595674616;-0.021204007680808;-0.000308954599830713;-0.026636719419912;-0.0137790992353075;0.00596067266928912;-0.0077053835773313;9.11402199221811E-06;-0.000142539300128419;-0.000251999832926231;1.90791054137662E-05;-0.00236430185879044;-9.54583337602344E-05;-0.000556959493223985\t0.000916705048437201;-0.0199575598103408;0.0456231928690862;2.09952637717462;-12.5708704058425;1.11808305811426;-1.72590731777249;-2.22239181008062;-0.967696370445928;-2.62418809422166;2.47964286628144;0.665205752892023;3.64753748704453;-3.84510115530963;-6.08782672045773;3.81508105974837;1.04209904973991;0.0666012719936656;-0.390545453668809;8.28224925531311;-9.55133250134922;-2.37499239179248;-12.8127653858411;-16.846761946123;6.48662354975264;-6.67117082062383;0.0580151981289049;-0.770098855873447;-0.983876895688683;0.0583162347158579;-5.93738717724506;-0.203431522818505;-1.03087538746314\t147.112804035741;303.21392125011;499.335070
18564;614.360746132308;743.413974455831;842.472101057517;929.506675663573;1076.57587791081;1147.61170966489;1262.6408555643;1333.67134891635;1448.700635293;1634.77494902759;1721.81956091078;1923.88362405243;2038.89107627957;2095.9181343836;2166.95728800359;2237.99542015244;2380.04906152953;2220.00518543488;130.0865640237;2078.92040615582;817.907873297785;918.917619246831;1155.02717356753;157.097144992378;185.0922112678;256.129434516133;327.166277224995;398.205774393759;469.240619338034;540.278194626993\t33\t0.574496146107112\t0.14410480349345\tNone\tUnknown\t\t148.951235201399;7.71201258444522;7.36039532447559\tAAAAAAAGDSDSWDADAFSVEDPVRK;PSRQESELMWQWVDQRSDGER;HTLTSFWNFKAGCEEKCYSNR\t_(Acetyl (Protein_N-term))AAAAAAAGDSDSWDADAFSVEDPVRK_;_PSRQESELM(Oxidation (M))WQWVDQRSDGER_;_HTLTSFWNFKAGCEEKCYSNR_\t\t\t2\t625\t1\t1\t2\t'''\n", + "20190402_QX1_SeVW_MA_HeLa_500ng_LC11\t107307\t98306\tAAAAAAAGDSDSWDADAFSVEDPVRK\t26\t1\tAcetyl (Protein N-term)\t_(Acetyl (Protein N-term))AAAAAAAGDSDSWDADAFSVEDPVRK_\t\t\t1\t0\tsp|O75822|EIF3J_HUMAN\t3\tHCD\tFTMS\tMULTI-MSMS\t10\t2\t879.06841\t2634.1834\t-0.93926\t-0.00082567\t-3.2012471\t90.978\t2.1945E-12\t148.95\t141.24\tNaN\tNaN\t1\t0\t0\t0\t107297\t10193939\t0.267970762043589\t-8\t0.10211181640625\ty1;y2;y4;y5;y6;y7;y8;y9;y10;y11;y12;y13;y14;y15;y17;y18;y19;y20;y21;y23;y21-H2O;y1-NH3;y19-NH3;y14(2+);y16(2+);y22(2+);a2;b2;b3;b4;b5;b6;b7\t300000;200000;3000000;600000;1000000;500000;2000000;1000000;1000000;1000000;90000;1000000;400000;900000;1000000;400000;3000000;2000000;1000000;400000;100000;200000;200000;80000;100000;200000;200000;2000000;5000000;5000000;5000000;2000000;300000\t1.34859050149316E-07;-6.05140996867704E-06;2.27812602133781E-05;0.00128986659160546;-0.00934536073077652;0.000941953783126337;-0.00160424237344614;-0.00239257341399934;-0.00111053968612396;-0.00331340710044969;0.00330702864630439;0.000963683996815234;0.00596290290945944;-0.00662057038289277;-0.0117122701335575;0.00777853472800416;0.0021841542961738;0.00014432211
1736983;-0.00087403893667215;0.0197121595674616;-0.021204007680808;-0.000308954599830713;-0.026636719419912;-0.0137790992353075;0.00596067266928912;-0.0077053835773313;9.11402199221811E-06;-0.000142539300128419;-0.000251999832926231;1.90791054137662E-05;-0.00236430185879044;-9.54583337602344E-05;-0.000556959493223985\t0.000916705048437201;-0.0199575598103408;0.0456231928690862;2.09952637717462;-12.5708704058425;1.11808305811426;-1.72590731777249;-2.22239181008062;-0.967696370445928;-2.62418809422166;2.47964286628144;0.665205752892023;3.64753748704453;-3.84510115530963;-6.08782672045773;3.81508105974837;1.04209904973991;0.0666012719936656;-0.390545453668809;8.28224925531311;-9.55133250134922;-2.37499239179248;-12.8127653858411;-16.846761946123;6.48662354975264;-6.67117082062383;0.0580151981289049;-0.770098855873447;-0.983876895688683;0.0583162347158579;-5.93738717724506;-0.203431522818505;-1.03087538746314\t147.112804035741;303.21392125011;499.33507018564;614.360746132308;743.413974455831;842.472101057517;929.506675663573;1076.57587791081;1147.61170966489;1262.6408555643;1333.67134891635;1448.700635293;1634.77494902759;1721.81956091078;1923.88362405243;2038.89107627957;2095.9181343836;2166.95728800359;2237.99542015244;2380.04906152953;2220.00518543488;130.0865640237;2078.92040615582;817.907873297785;918.917619246831;1155.02717356753;157.097144992378;185.0922112678;256.129434516133;327.166277224995;398.205774393759;469.240619338034;540.278194626993\t33\t0.574496146107112\t0.14410480349345\tNone\tUnknown\t\t148.951235201399;7.71201258444522;7.36039532447559\tAAAAAAAGDSDSWDADAFSVEDPVRK;PSRQESELMWQWVDQRSDGER;HTLTSFWNFKAGCEEKCYSNR\t_(Acetyl (Protein N-term))AAAAAAAGDSDSWDADAFSVEDPVRK_;_PSRQESELM(Oxidation (M))WQWVDQRSDGER_;_HTLTSFWNFKAGCEEKCYSNR_\t\t\t2\t625\t1\t1\t2\t'''\n", ")\n", "\n", "mq_reader = psm_reader_provider.get_reader('maxquant')\n", @@ -200,9 +200,9 @@ "import pytest\n", "\n", "mq_tsv = io.StringIO('''Sequence\tCharge\tRetention time\tModified 
sequence\tScan number\tScan index\tRaw file\tm/z\tScore\tProteins\tReverse\n", - "0\tAAAAAAAAAPAAAATAPTTAATTAATAAQ\t3\t70.261\t_(Acetyl (Protein_N-term))AAAAAAAAM(Oxidation (M))PAAAATAPTTAATTAATAAQ_\t81358\t73979\t20190402_QX1_SeVW_MA_HeLa_500ng_LC11\t790.07495\t41.423\tsp|P37108|SRP14_HUMAN\tNaN\n", + "0\tAAAAAAAAAPAAAATAPTTAATTAATAAQ\t3\t70.261\t_(Acetyl (Protein N-term))AAAAAAAAM(Oxidation (M))PAAAATAPTTAATTAATAAQ_\t81358\t73979\t20190402_QX1_SeVW_MA_HeLa_500ng_LC11\t790.07495\t41.423\tsp|P37108|SRP14_HUMAN\tNaN\n", "1\tAAAAAAAAAAPAAAATAPTTAATTAATAAQ\t2\t70.287\t_AAAAAAAAAPAAAATAPTTAATTAATAAQ_\t81391\t74010\t20190402_QX1_SeVW_MA_HeLa_500ng_LC11\t1184.60880\t118.210\tsp|P37108|SRP14_HUMAN\tNaN\n", - "2\tAAAAAAAGDSDSWDADAFSVEDPVRK\t3\t90.978\t_(Acetyl (Protein_N-term))AAAAAAAGDSDSWDADAFSVEDPVRK_\t107307\t98306\t20190402_QX1_SeVW_MA_HeLa_500ng_LC11\t879.06841\t148.950\tsp|O75822|EIF3J_HUMAN\tNaN\n", + "2\tAAAAAAAGDSDSWDADAFSVEDPVRK\t3\t90.978\t_(Acetyl (Protein N-term))AAAAAAAGDSDSWDADAFSVEDPVRK_\t107307\t98306\t20190402_QX1_SeVW_MA_HeLa_500ng_LC11\t879.06841\t148.950\tsp|O75822|EIF3J_HUMAN\tNaN\n", "3\tAAAAAAAGDSDSWDADAFSVEDPVRK\t3\t90.978\t_(UnkownMod)AAAAAAAGDSDSWDADAFSVEDPVRK_\t107307\t98306\t20190402_QX1_SeVW_MA_HeLa_500ng_LC11\t879.06841\t148.950\tsp|O75822|EIF3J_HUMAN\tNaN\n", "4\tAAAAAAAGDSDSWDADAFSVEDPVRK\t3\t90.978\t_(UniMod:3)AAAAAAAGDSDSWDADAFSVEDPVRK_\t107307\t98306\t20190402_QX1_SeVW_MA_HeLa_500ng_LC11\t879.06841\t148.950\tsp|O75822|EIF3J_HUMAN\tNaN\n", "''')\n", diff --git a/nbs_tests/psm_reader/psm_reader.ipynb b/nbs_tests/psm_reader/psm_reader.ipynb index bb3481c1..f9253f1c 100644 --- a/nbs_tests/psm_reader/psm_reader.ipynb +++ b/nbs_tests/psm_reader/psm_reader.ipynb @@ -32,9 +32,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "from alphabase.psm_reader.psm_reader import *" - ] + "source": "from alphabase.psm_reader.psm_reader import translate_other_modification, PSMReaderBase, psm_reader_yaml" }, { "cell_type": "code", 
@@ -43,11 +41,11 @@ "outputs": [], "source": [ "#| hide\n", - "assert 'a',[] == translate_other_modification('A', {'A':'a','B':'b'})\n", - "assert 'b',[] == translate_other_modification('B', {'A':'a','B':'b'})\n", - "assert 'a;a',[] == translate_other_modification('A;A', {'A':'a','B':'b'})\n", - "assert 'a;b',[] == translate_other_modification('A;B', {'A':'a','B':'b'})\n", - "assert 'a;b',['X'] == translate_other_modification('A;B;X', {'A':'a','B':'b'})" + "assert 'a', [] == translate_other_modification('A', {'A': 'a', 'B': 'b'})\n", + "assert 'b', [] == translate_other_modification('B', {'A': 'a', 'B': 'b'})\n", + "assert 'a;a', [] == translate_other_modification('A;A', {'A': 'a', 'B': 'b'})\n", + "assert 'a;b', [] == translate_other_modification('A;B', {'A': 'a', 'B': 'b'})\n", + "assert 'a;b', ['X'] == translate_other_modification('A;B;X', {'A': 'a', 'B': 'b'})" ] }, { @@ -106,34 +104,34 @@ " 'Acetyl@Protein_N-term': [\n", " '_(Acetyl (Protein_N-term))',\n", " '_(ac)',\n", - " ]\n", + " ],\n", " 'Carbamidomethyl@C': [\n", " 'C(Carbamidomethyl (C))',\n", - " ]\n", + " ],\n", " 'Oxidation@M': [\n", " 'M(Oxidation (M))',\n", " 'M(ox)',\n", - " ]\n", + " ],\n", " 'Phospho@S': [\n", " 'S(Phospho (S))',\n", " 'S(Phospho (ST))',\n", " 'S(Phospho (STY))',\n", " 'S(ph)',\n", " 'pS',\n", - " ]\n", + " ],\n", " 'Phospho@T': [\n", " 'T(Phospho (T))',\n", " 'T(Phospho (ST))',\n", " 'T(Phospho (STY))',\n", " 'T(ph)',\n", " 'pT',\n", - " ]\n", + " ],\n", " 'Phospho@Y': [\n", " 'Y(Phospho (Y))',\n", " 'Y(Phospho (STY))',\n", " 'Y(ph)',\n", " 'pY',\n", - " ]\n", + " ],\n", " 'Deamidated@N': ['N(Deamidation (NQ))','N(de)']\n", " 'Deamidated@Q': ['Q(Deamidation (NQ))','Q(de)']\n", " 'GlyGly@K': ['K(GlyGly (K))', 'K(gl)']\n", diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/tests/integration/reference_data/reference_diann.parquet b/tests/integration/reference_data/reference_diann.parquet new file mode 100644 index 00000000..404a7551 Binary files /dev/null and b/tests/integration/reference_data/reference_diann.parquet differ diff --git a/tests/integration/reference_data/reference_diann_speclib.parquet b/tests/integration/reference_data/reference_diann_speclib.parquet new file mode 100644 index 00000000..885c8a8e Binary files /dev/null and b/tests/integration/reference_data/reference_diann_speclib.parquet differ diff --git a/tests/integration/reference_data/reference_maxquant.parquet b/tests/integration/reference_data/reference_maxquant.parquet new file mode 100644 index 00000000..fa4aed31 Binary files /dev/null and b/tests/integration/reference_data/reference_maxquant.parquet differ diff --git a/tests/integration/reference_data/reference_msfragger_speclib.parquet b/tests/integration/reference_data/reference_msfragger_speclib.parquet new file mode 100644 index 00000000..36b0659f Binary files /dev/null and b/tests/integration/reference_data/reference_msfragger_speclib.parquet differ diff --git a/tests/integration/reference_data/reference_openswath.parquet b/tests/integration/reference_data/reference_openswath.parquet new file mode 100644 index 00000000..73a146b3 Binary files /dev/null and b/tests/integration/reference_data/reference_openswath.parquet differ diff --git a/tests/integration/reference_data/reference_pfind.parquet b/tests/integration/reference_data/reference_pfind.parquet new file mode 100644 index 00000000..d1fb8735 Binary files /dev/null and b/tests/integration/reference_data/reference_pfind.parquet differ diff --git a/tests/integration/reference_data/reference_spectronaut.parquet b/tests/integration/reference_data/reference_spectronaut.parquet new file mode 100644 index 00000000..3845884f Binary files /dev/null and b/tests/integration/reference_data/reference_spectronaut.parquet differ diff --git 
a/tests/integration/reference_data/reference_spectronaut_report.parquet b/tests/integration/reference_data/reference_spectronaut_report.parquet new file mode 100644 index 00000000..3e2ade9d Binary files /dev/null and b/tests/integration/reference_data/reference_spectronaut_report.parquet differ diff --git a/tests/integration/test_psm_readers.py b/tests/integration/test_psm_readers.py new file mode 100644 index 00000000..1f740a58 --- /dev/null +++ b/tests/integration/test_psm_readers.py @@ -0,0 +1,254 @@ +"""Integration tests for the PSM Readers. + +Tests the output of defined inputs against reference data, which are expected in the `reference_data` folder. + +Most of the test data is taken from psm_readers.ipynb +""" + +import io +import logging +import os +from io import StringIO +from pathlib import Path + +import numpy as np +import pandas as pd + +from alphabase.psm_reader import ( + DiannReader, + MaxQuantReader, + SpectronautReader, + SpectronautReportReader, + SwathReader, + pFindReader, + psm_reader_yaml, +) +from alphabase.psm_reader.keys import LibPsmDfCols, PsmDfCols +from alphabase.spectral_library.reader import LibraryReaderBase + +current_file_directory = os.path.dirname(os.path.abspath(__file__)) +test_data_path = Path(f"{current_file_directory}/reference_data") + +# TODO add tests for AlphaPept + + +def _assert_reference_df_equal(psm_df: pd.DataFrame, test_case_name: str) -> None: + """Compare the output of a PSM reader against reference data. + + If reference is not present, save the output as reference data and raise. 
+ """ + out_file_path = test_data_path / f"reference_{test_case_name}.parquet" + # psm_df.to_csv(test_data_path / f"reference_{test_case_name}.csv") + + # check that all columns are available in PsmDfCols + assert ( + set(psm_df.columns) + - set(PsmDfCols.get_values()) + - set(LibPsmDfCols.get_values()) + == set() + ) + + if out_file_path.exists(): + expected_df = pd.read_parquet(out_file_path) + + try: + pd.testing.assert_frame_equal(psm_df, expected_df) + raise AssertionError("Reference data is outdated.") + except AssertionError as e: + # for whatever reason, columns are int32 on windows runners + logging.warning(f"Converting int32 to int64 for comparison: {e}") + + for column in psm_df.columns: + if psm_df[column].dtype == np.int32: + psm_df[column] = psm_df[column].astype(np.int64) + + pd.testing.assert_frame_equal(psm_df, expected_df) + + else: + psm_df.to_parquet(out_file_path) + raise ValueError("No reference data found.") + + +def test_psm_reader_yaml() -> None: + """Test that all column mappings in the psm_reader.yaml are covered by string constant keys.""" + for reader_config in psm_reader_yaml.values(): + ks = [k for k in reader_config["column_mapping"]] + assert ( + set(ks) - set(PsmDfCols.get_values()) - set(LibPsmDfCols.get_values()) + == set() + ) + + +def test_maxquant_reader() -> None: + """Test the MaxQuant reader.""" + + input_data = io.StringIO("""Raw file Scan number Scan index Sequence Length Missed cleavages Modifications Modified sequence Oxidation (M) Probabilities Oxidation (M) Score diffs Acetyl (Protein N-term) Oxidation (M) Proteins Charge Fragmentation Mass analyzer Type Scan event number Isotope index m/z Mass Mass error [ppm] Mass error [Da] Simple mass error [ppm] Retention time PEP Score Delta score Score diff Localization prob Combinatorics PIF Fraction of total spectrum Base peak fraction Precursor full scan number Precursor Intensity Precursor apex fraction Precursor apex offset Precursor apex offset time Matches Intensities 
Mass deviations [Da] Mass deviations [ppm] Masses Number of matches Intensity coverage Peak coverage Neutral loss level ETD identification type Reverse All scores All sequences All modified sequences Reporter PIF Reporter fraction id Protein group IDs Peptide ID Mod. peptide ID Evidence ID Oxidation (M) site IDs + 20190402_QX1_SeVW_MA_HeLa_500ng_LC11 81358 73979 AAAAAAAAAPAAAATAPTTAATTAATAAQ 29 0 Unmodified _(Acetyl (Protein N-term))AAAAAAAAM(Oxidation (M))PAAAATAPTTAATTAATAAQ_ 0 0 sp|P37108|SRP14_HUMAN 3 HCD FTMS MULTI-MSMS 13 1 790.07495 2367.203 0.35311 0.00027898 -0.061634807 70.261 0.012774 41.423 36.666 NaN NaN 1 0 0 0 81345 10653955 0.0338597821787898 -11 0.139877319335938 y1;y2;y3;y4;y11;y1-NH3;y2-NH3;a2;b2;b3;b4;b5;b6;b7;b8;b9;b11;b12;b6(2+);b8(2+);b13(2+);b18(2+) 2000000;2000000;300000;400000;200000;1000000;400000;300000;600000;1000000;2000000;3000000;3000000;3000000;3000000;2000000;600000;500000;1000000;2000000;300000;200000 5.2861228709844E-06;-6.86980268369553E-05;-0.00238178789771837;0.000624715964988809;-0.0145624692099773;-0.000143471782706683;-0.000609501446461991;-0.000524972720768346;0.00010190530804266;5.8620815195809E-05;0.000229901232955854;-0.000108750048696038;-0.000229593152369034;0.00183148682538103;0.00276641182404092;0.000193118923334623;0.00200988580445483;0.000102216846016745;5.86208151389656E-05;0.000229901232955854;-0.00104559184393338;0.00525030008475369 0.0359413365445091;-0.314964433555295;-8.23711898839045;1.60102421155213;-14.8975999917227;-1.10320467763838;-3.03102462870716;-4.56152475051625;0.712219104095465;0.273777366204575;0.806231096969562;-0.305312183824154;-0.537399178230218;3.67572664689217;4.85930954169285;0.301587577451224;2.48616190909398;0.116225745519871;0.273777365939099;0.806231096969562;-2.19774169175011;7.53961026980589 
147.076413378177;218.113601150127;289.153028027798;390.197699998035;977.50437775671;130.050013034583;201.087592852046;115.087114392821;143.081402136892;214.118559209185;285.155501716567;356.192954155649;427.230188786552;498.265241494374;569.301420357176;640.341107437877;808.429168310795;879.468189767554;214.118559209185;285.155501716567;475.757386711244;696.362265007215 22 0.262893575628735 0.0826446280991736 None Unknown 41.4230894199432;4.75668724862449;3.9515580701967 AAAAAAAAAPAAAATAPTTAATTAATAAQ;FHRGPPDKDDMVSVTQILQGK;PVTLWITVTHMQADEVSVWR _AAAAAAAAAPAAAATAPTTAATTAATAAQ_;_FHRGPPDKDDMVSVTQILQGK_;_PVTLWITVTHMQADEVSVWR_ 0 1443 0 0 0 + 20190402_QX1_SeVW_MA_HeLa_500ng_LC11 81391 74010 AAAAAAAAAAPAAAATAPTTAATTAATAAQ 29 0 Unmodified _AAAAAAAAAPAAAATAPTTAATTAATAAQ_ 0 0 sp|P37108|SRP14_HUMAN 2 HCD FTMS MULTI-MSMS 14 0 1184.6088 2367.203 0.037108 4.3959E-05 1.7026696 70.287 7.1474E-09 118.21 100.52 NaN NaN 1 0 0 0 81377 9347701 0.166790347889974 -10 0.12664794921875 y1;y2;y3;y4;y5;y9;y12;y13;y14;y20;y13-H2O;y20-H2O;y1-NH3;y20-NH3;b3;b4;b5;b6;b7;b8;b9;b11;b12;b13;b14;b15;b16;b19;b15-H2O;b16-H2O 500000;600000;200000;400000;200000;100000;200000;1000000;200000;300000;200000;100000;100000;70000;300000;900000;2000000;3000000;5000000;8000000;6000000;600000;800000;600000;200000;300000;200000;300000;300000;1000000 -0.000194444760495571;0.000149986878682284;0.000774202587820128;-0.0002445094036716;0.000374520568641401;-0.00694293246522193;-0.0109837291331587;-0.0037745820627606;-0.000945546471939451;0.00152326440706929;0.00506054832726477;0.00996886361417637;6.25847393393997E-05;-0.024881067836759;-3.11821549132674E-05;-0.000183099230639527;0.000161332473453513;0.000265434980121881;0.000747070697229901;0.000975534518261156;0.00101513939785036;0.00651913000274362;0.0058584595163893;0.00579536744021425;0.00131097834105276;-0.0131378531671089;0.00472955218901916;-0.00161006322559842;-0.00201443239325272;0.0227149399370319 
-1.32206444236914;0.687655553213019;2.6775131607882;-0.626628140021726;0.811995006209331;-8.6203492854282;-10.1838066275079;-3.21078702288986;-0.758483069159249;0.881072738747222;4.37168212373889;5.82682888353564;0.481236695337485;-14.5343986203644;-0.145630261806375;-0.642102166533079;0.452935954800214;0.621293379181583;1.49934012872483;1.71355878380837;1.58531240493271;8.06399202403175;6.6614096214532;6.09718023739784;1.28333378040908;-11.7030234519348;3.96235146626144;-1.07856912288932;-1.82370619437775;19.3220953109188 147.07661310906;218.113382465221;289.149872037312;390.198569223404;461.235063981231;805.411965958065;1078.54847749073;1175.59403219566;1246.62831694787;1728.87474561429;1157.57463237897;1710.85573532879;130.049806978061;1711.87460084504;214.118649012155;285.155914717031;356.192684073126;427.22969375842;498.266325910503;569.303211234482;640.340285417402;808.424659066597;879.462433524883;950.49961040476;1021.54120858166;1122.60333588727;1193.62258226971;1492.77704268533;1104.58164778019;1175.59403219566 30 0.474003002083763 0.167630057803468 None Unknown 118.209976573419;17.6937689289157;17.2534171481793 AAAAAAAAAPAAAATAPTTAATTAATAAQ;SELKQEAMQSEQLQSVLYLK;VGSSVPSKASELVVMGDHDAARR _AAAAAAAAAPAAAATAPTTAATTAATAAQ_;_SELKQEAM(Oxidation (M))QSEQLQSVLYLK_;_VGSSVPSKASELVVMGDHDAARR_ 1 1443 0 0 1 + 20190402_QX1_SeVW_MA_HeLa_500ng_LC11 107307 98306 AAAAAAAGDSDSWDADAFSVEDPVRK 26 1 Acetyl (Protein N-term) _(Acetyl (Protein N-term))AAAAAAAGDSDSWDADAFSVEDPVRK_ 1 0 sp|O75822|EIF3J_HUMAN 3 HCD FTMS MULTI-MSMS 10 2 879.06841 2634.1834 -0.93926 -0.00082567 -3.2012471 90.978 2.1945E-12 148.95 141.24 NaN NaN 1 0 0 0 107297 10193939 0.267970762043589 -8 0.10211181640625 y1;y2;y4;y5;y6;y7;y8;y9;y10;y11;y12;y13;y14;y15;y17;y18;y19;y20;y21;y23;y21-H2O;y1-NH3;y19-NH3;y14(2+);y16(2+);y22(2+);a2;b2;b3;b4;b5;b6;b7 
300000;200000;3000000;600000;1000000;500000;2000000;1000000;1000000;1000000;90000;1000000;400000;900000;1000000;400000;3000000;2000000;1000000;400000;100000;200000;200000;80000;100000;200000;200000;2000000;5000000;5000000;5000000;2000000;300000 1.34859050149316E-07;-6.05140996867704E-06;2.27812602133781E-05;0.00128986659160546;-0.00934536073077652;0.000941953783126337;-0.00160424237344614;-0.00239257341399934;-0.00111053968612396;-0.00331340710044969;0.00330702864630439;0.000963683996815234;0.00596290290945944;-0.00662057038289277;-0.0117122701335575;0.00777853472800416;0.0021841542961738;0.000144322111736983;-0.00087403893667215;0.0197121595674616;-0.021204007680808;-0.000308954599830713;-0.026636719419912;-0.0137790992353075;0.00596067266928912;-0.0077053835773313;9.11402199221811E-06;-0.000142539300128419;-0.000251999832926231;1.90791054137662E-05;-0.00236430185879044;-9.54583337602344E-05;-0.000556959493223985 0.000916705048437201;-0.0199575598103408;0.0456231928690862;2.09952637717462;-12.5708704058425;1.11808305811426;-1.72590731777249;-2.22239181008062;-0.967696370445928;-2.62418809422166;2.47964286628144;0.665205752892023;3.64753748704453;-3.84510115530963;-6.08782672045773;3.81508105974837;1.04209904973991;0.0666012719936656;-0.390545453668809;8.28224925531311;-9.55133250134922;-2.37499239179248;-12.8127653858411;-16.846761946123;6.48662354975264;-6.67117082062383;0.0580151981289049;-0.770098855873447;-0.983876895688683;0.0583162347158579;-5.93738717724506;-0.203431522818505;-1.03087538746314 
147.112804035741;303.21392125011;499.33507018564;614.360746132308;743.413974455831;842.472101057517;929.506675663573;1076.57587791081;1147.61170966489;1262.6408555643;1333.67134891635;1448.700635293;1634.77494902759;1721.81956091078;1923.88362405243;2038.89107627957;2095.9181343836;2166.95728800359;2237.99542015244;2380.04906152953;2220.00518543488;130.0865640237;2078.92040615582;817.907873297785;918.917619246831;1155.02717356753;157.097144992378;185.0922112678;256.129434516133;327.166277224995;398.205774393759;469.240619338034;540.278194626993 33 0.574496146107112 0.14410480349345 None Unknown 148.951235201399;7.71201258444522;7.36039532447559 AAAAAAAGDSDSWDADAFSVEDPVRK;PSRQESELMWQWVDQRSDGER;HTLTSFWNFKAGCEEKCYSNR _(Acetyl (Protein N-term))AAAAAAAGDSDSWDADAFSVEDPVRK_;_PSRQESELM(Oxidation (M))WQWVDQRSDGER_;_HTLTSFWNFKAGCEEKCYSNR_ 2 625 1 1 2 """) + + reader = MaxQuantReader() + reader.import_file(input_data) + + _assert_reference_df_equal(reader.psm_df, "maxquant") + + +def test_pfind_reader() -> None: + """Test the pFind reader.""" + input_data = StringIO("""File_Name Scan_No Exp.MH+ Charge Q-value Sequence Calc.MH+ Mass_Shift(Exp.-Calc.) 
Raw_Score Final_Score Modification Specificity Proteins Positions Label Target/Decoy Miss.Clv.Sites Avg.Frag.Mass.Shift Others + Ecoli-1to1to1-un-C13-N15-10mM-20150823.30507.30507.2.0.dta 30507 2074.030369 2 0 AMIEAGAAAVHFEDQLASVK 2074.027271 0.003098 35.299588 5.15726e-013 2,Oxidation[M]; 3 gi|16131841|ref|NP_418439.1|/ 173,K,K/ 1|0| target 0 0.948977 131070 0 0 0 262143 0 0 0 32 + Ecoli-1to1to1-un-C13-N15-150mM-20150823.41501.41501.3.0.dta 41501 2712.197421 3 0 EGDNYVVLSDILGDEDHLGDMDFK 2712.198013 -0.000592 27.073978 9.82619e-010 21,Unknown[M]; 3 gi|145698316|ref|NP_417633.4|/ 470,K,V/ 1|0| target 0 0.814438 65596 0 0 0 4194288 0 0 0 36 + XXX.25802.25802.4.0.dta 25802 2388.339186 4 0.0032066 SVFLIKGDKVWVYPPEKKEK 2388.332468 0.006718 17.822784 0.100787 21,Didehydro[AnyC-termK]; 0 sp|P02790|HEMO_HUMAN/ 106,N,G/ 1|0| target 0 0.704714 36 + """) + reader = pFindReader() + reader.import_file(input_data) + + _assert_reference_df_equal(reader.psm_df, "pfind") + + +def test_diann_reader() -> None: + """Test the Diann reader.""" + input_data = StringIO("""File.Name Run Protein.Group Protein.Ids Protein.Names Genes PG.Quantity PG.Normalised PG.MaxLFQ Genes.Quantity Genes.Normalised Genes.MaxLFQ Genes.MaxLFQ.Unique Modified.Sequence Stripped.Sequence Precursor.Id Precursor.Charge Q.Value Global.Q.Value Protein.Q.Value PG.Q.Value Global.PG.Q.Value GG.Q.Value Translated.Q.Value Proteotypic Precursor.Quantity Precursor.Normalised Precursor.Translated Quantity.Quality RT RT.Start RT.Stop iRT Predicted.RT Predicted.iRT Lib.Q.Value Ms1.Profile.Corr Ms1.Area Evidence Spectrum.Similarity Mass.Evidence CScore Decoy.Evidence Decoy.CScore Fragment.Quant.Raw Fragment.Quant.Corrected Fragment.Correlations MS2.Scan IM iIM Predicted.IM Predicted.iIM + F:\XXX\20201218_tims03_Evo03_PS_SA_HeLa_200ng_high_speed_21min_8cm_S2-A2_1_22636.d 20201218_tims03_Evo03_PS_SA_HeLa_200ng_high_speed_21min_8cm_S2-A2_1_22636 Q9UH36 Q9UH36 SRRD 3296.49 3428.89 3428.89 3296.49 3428.89 3428.89 3428.89 
(UniMod:1)AAAAAAALESWQAAAPR AAAAAAALESWQAAAPR (UniMod:1)AAAAAAALESWQAAAPR2 2 3.99074e-05 1.96448e-05 0.000159821 0.000159821 0.000146135 0.000161212 0 1 3296.49 3428.89 3296.49 0.852479 19.9208 19.8731 19.9685 123.9 19.8266 128.292 0 0.960106 5308.05 1.96902 0.683134 0.362287 0.999997 1.23691 3.43242e-05 1212.01;2178.03;1390.01;1020.01;714.008;778.008; 1212.01;1351.73;887.591;432.92;216.728;732.751; 0.956668;0.757581;0.670497;0.592489;0.47072;0.855203; 30053 1.19708 1.19328 1.19453 1.19469 + F:\XXX\20201218_tims03_Evo03_PS_SA_HeLa_200ng_high_speed_21min_8cm_S2-A8_1_22642.d 20201218_tims03_Evo03_PS_SA_HeLa_200ng_high_speed_21min_8cm_S2-A8_1_22642 Q9UH36 Q9UH36 SRRD 2365 2334.05 2334.05 2365 2334.05 2334.05 2334.05 (UniMod:1)AAAAAAALESWQAAAPR AAAAAAALESWQAAAPR (UniMod:1)AAAAAAALESWQAAAPR2 2 0.000184434 1.96448e-05 0.000596659 0.000596659 0.000146135 0.000604961 0 1 2365 2334.05 2365 0.922581 19.905 19.8573 19.9527 123.9 19.782 128.535 0 0.940191 4594.04 1.31068 0.758988 0 0.995505 0.28633 2.12584e-06 1209.02;1210.02;1414.02;1051.01;236.003;130.002; 1209.02;1109.89;732.154;735.384;0;46.0967; 0.919244;0.937624;0.436748;0.639369;0.296736;0.647924; 30029 1.195 1.19328 1.19381 1.19339 + F:\XXX\20201218_tims03_Evo03_PS_SA_HeLa_200ng_high_speed_21min_8cm_S2-B2_1_22648.d 20201218_tims03_Evo03_PS_SA_HeLa_200ng_high_speed_21min_8cm_S2-B2_1_22648 Q9UH36 Q9UH36 SRRD 1664.51 1635.46 1635.47 1664.51 1635.46 1635.47 1635.47 (UniMod:1)AAAAAAALESWQAAAPR AAAAAAALESWQAAAPR (UniMod:1)AAAAAAALESWQAAAPR2 2 0.000185123 1.96448e-05 0.000307409 0.000307409 0.000146135 0.000311332 0 1 1664.51 1635.46 1664.51 0.811147 19.8893 19.8416 19.937 123.9 19.7567 128.896 0 0.458773 6614.06 1.7503 0.491071 0.00111683 0.997286 1.92753 2.80543e-05 744.01;1708.02;1630.02;1475.02;0;533.006; 322.907;808.594;577.15;536.033;0;533.006; 0.760181;0.764072;0.542005;0.415779;0;0.913438; 30005 1.19409 1.19328 1.19323 1.19308 + """) + reader = DiannReader() + reader.import_file(input_data) + + 
_assert_reference_df_equal(reader.psm_df, "diann") + + +def test_spectronaut_reader() -> None: + """Test the Spectronaut reader.""" + input_data = StringIO("""ReferenceRun PrecursorCharge Workflow IntModifiedPeptide CV AllowForNormalization ModifiedPeptide StrippedPeptide iRT IonMobility iRTSourceSpecific BGSInferenceId IsProteotypic IntLabeledPeptide LabeledPeptide PrecursorMz ReferenceRunQvalue ReferenceRunMS1Response FragmentLossType FragmentNumber FragmentType FragmentCharge FragmentMz RelativeIntensity ExcludeFromAssay Database ProteinGroups UniProtIds Protein Name ProteinDescription Organisms OrganismId Genes Protein Existence Sequence Version FASTAName + 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_100ug_test_S4-A1_1_25843 2 _ALVAT[+80]PGK_ True _ALVAT[Phospho (STY)]PGK_ ALVATPGK -5.032703 0.758 -5.032703 P19338 False _ALVAT[+80]PGK_ _ALVAT[Phospho (STY)]PGK_ 418.717511324722 0 10352 noloss 3 y 1 301.187031733932 53.1991 False sp P19338 P19338 NUCL_HUMAN Nucleolin Homo sapiens NCL 1 3 MCT_human_UP000005640_9606 + 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_100ug_test_S4-A1_1_25843 2 _ALVAT[+80]PGK_ True _ALVAT[Phospho (STY)]PGK_ ALVATPGK -5.032703 0.758 -5.032703 P19338 False _ALVAT[+80]PGK_ _ALVAT[Phospho (STY)]PGK_ 418.717511324722 0 10352 H3PO4 4 y 1 384.224142529733 26.31595 False sp P19338 P19338 NUCL_HUMAN Nucleolin Homo sapiens NCL 1 3 MCT_human_UP000005640_9606 + 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_100ug_test_S4-A1_1_25843 2 _TLT[+80]PCPLR_ True _TLT[Phospho (STY)]PC[Carbamidomethyl (C)]PLR_ TLTPCPLR 27.71659 0.818 27.71659 Q5T200 False _TLT[+80]PPLR_ _TLT[Phospho (STY)]PPLR_ 439.230785875227 0.000138389150379226 23117 noloss 3 b 1 396.153027901512 6.3264 False sp Q5T200 Q5T200 ZC3HD_HUMAN Zinc finger CCCH domain-containing protein 13 Homo sapiens ZC3H13 1 1 MCT_human_UP000005640_9606 + 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_100ug_test_S4-A1_1_25843 2 _TLT[+80]PCPLR_ True _TLT[Phospho (STY)]PC[Carbamidomethyl (C)]PLR_ 
TLTPCPLR 27.71659 0.818 27.71659 Q5T200 False _TLT[+80]PPLR_ _TLT[Phospho (STY)]PPLR_ 439.230785875227 0.000138389150379226 23117 noloss 3 y 1 385.255780000092 29.70625 False sp Q5T200 Q5T200 ZC3HD_HUMAN Zinc finger CCCH domain-containing protein 13 Homo sapiens ZC3H13 1 1 MCT_human_UP000005640_9606 + 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_library25_S4-C1_1_25867 2 _LFVT[+80]PPEGSSR_ True _[Acetyl (Protein N-term)]LFVS[Phospho (STY)]PPEGSSR_ LFVSPPEGSSR 38.05031 0.917 38.05031 Q14244;Q14244-6;Q14244-7 False _LFVT[+80]PPEGSSR_ _LFVT[Phospho (STY)]PPEGSSR_ 635.297385373987 0 14164 H3PO4 4 b 1 443.265279065723 12.24525 False sp Q14244;Q14244-6;Q14244-7 Q14244;Q14244-6;Q14244-7 MAP7_HUMAN Ensconsin;Isoform of Q14244, Isoform 6 of Ensconsin;Isoform of Q14244, Isoform 7 of Ensconsin Homo sapiens MAP7 1;; 1;; MCT_human_UP000005640_9606;MCT_human2_UP000005640_9606_additional;MCT_human2_UP000005640_9606_additional + 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_library25_S4-C1_1_25867 2 _LFVT[+80]PPEGSSR_ True _[Acetyl (Protein N-term)]LFVS[Phospho (STY)]PPEGSSR_ LFVSPPEGSSR 38.05031 0.917 38.05031 Q14244;Q14244-6;Q14244-7 False _LFVT[+80]PPEGSSR_ _LFVT[Phospho (STY)]PPEGSSR_ 635.297385373987 0 14164 noloss 6 y 1 632.299829640042 46.07855 False sp Q14244;Q14244-6;Q14244-7 Q14244;Q14244-6;Q14244-7 MAP7_HUMAN Ensconsin;Isoform of Q14244, Isoform 6 of Ensconsin;Isoform of Q14244, Isoform 7 of Ensconsin Homo sapiens MAP7 1;; 1;; MCT_human_UP000005640_9606;MCT_human2_UP000005640_9606_additional;MCT_human2_UP000005640_9606_additional + 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_library25_S4-C1_1_25867 2 _LFVT[+80]PPEGSSR_ True _[Acetyl (Protein N-term)]LFVS[Phospho (STY)]PPEGSSR_ LFVSPPEGSSR 38.05031 0.917 38.05031 Q14244;Q14244-6;Q14244-7 False _LFVT[+80]PPEGSSR_ _LFVT[Phospho (STY)]PPEGSSR_ 635.297385373987 0 14164 noloss 7 y 1 729.352593488892 100 False sp Q14244;Q14244-6;Q14244-7 Q14244;Q14244-6;Q14244-7 MAP7_HUMAN Ensconsin;Isoform of Q14244, Isoform 6 of 
Ensconsin;Isoform of Q14244, Isoform 7 of Ensconsin Homo sapiens MAP7 1;; 1;; MCT_human_UP000005640_9606;MCT_human2_UP000005640_9606_additional;MCT_human2_UP000005640_9606_additional + """) + + reader = SpectronautReader() + reader.import_file(input_data) + + _assert_reference_df_equal(reader.psm_df, "spectronaut") + + +def test_spectronaut_report_reader() -> None: + """Test the Spectronaut report reader.""" + input_data = StringIO("""R.FileName,R.Replicate,EG.PrecursorId,EG.ApexRT,FG.CalibratedMassAccuracy (PPM),FG.CalibratedMz + 20211203_EXPL2_SoSt_SA_DIA_HeLa_1000mz_noCB_01,1,_VIETPENDFK_.2,40.826847076416,-0.6350307649846,596.298998773218 + 20211203_EXPL2_SoSt_SA_DIA_HeLa_1000mz_noCB_01,1,_GFSNEVSSK_.2,19.1254806518555,-1.54873822486555,477.730400257423 + 20211203_EXPL2_SoSt_SA_DIA_HeLa_1000mz_noCB_01,1,_HLLNQAVGEEEVPK_.3,42.0593299865723,-0.309173676987587,521.611288926824 + 20211203_EXPL2_SoSt_SA_DIA_HeLa_1000mz_noCB_01,1,_DATM[Oxidation (M)]EVQR_.2,12.8398199081421,-3.31103772642203,483.222124398527 + """) + + reader = SpectronautReportReader() + reader.import_file(input_data) + + _assert_reference_df_equal(reader.psm_df, "spectronaut_report") + + +def test_openswath_reader() -> None: + """Test the OpenSwath reader.""" + + input_data = StringIO("""PrecursorMz ProductMz Tr_recalibrated transition_name CE LibraryIntensity transition_group_id decoy PeptideSequence ProteinName Annotation FullUniModPeptideName PrecursorCharge GroupLabel UniprotID FragmentType FragmentCharge FragmentSeriesNumber + 685.732240417 886.020494795 59.0 255_AAAAAAAAAASGAAIPPLIPPRR_3 -1 5257.9 13_AAAAAAAAAASGAAIPPLIPPRR_3 0 AAAAAAAAAASGAAIPPLIPPRR 1/O14654 y19^2/0.002 AAAAAAAAAASGAAIPPLIPPRR 3 light 1/O14654 y 2 19 + 514.550999438 473.303261576 59.2 268_AAAAAAAAAASGAAIPPLIPPRR_4 -1 10000.0 14_AAAAAAAAAASGAAIPPLIPPRR_4 0 AAAAAAAAAASGAAIPPLIPPRR 1/O14654 y8^2/0.002 AAAAAAAAAASGAAIPPLIPPRR 4 light 1/O14654 y 2 8 + 514.550999438 629.39313922 59.2 276_AAAAAAAAAASGAAIPPLIPPRR_4 -1 5923.1 
14_AAAAAAAAAASGAAIPPLIPPRR_4 0 AAAAAAAAAASGAAIPPLIPPRR 1/O14654 y12^2/0.001 AAAAAAAAAASGAAIPPLIPPRR 4 light 1/O14654 y 2 12 + 514.550999438 672.909153425 59.2 279_AAAAAAAAAASGAAIPPLIPPRR_4 -1 5249.8 14_AAAAAAAAAASGAAIPPLIPPRR_4 0 AAAAAAAAAASGAAIPPLIPPRR 1/O14654 y13^2/0.001 AAAAAAAAAASGAAIPPLIPPRR 4 light 1/O14654 y 2 13 + 514.550999438 356.19284545 59.2 262_AAAAAAAAAASGAAIPPLIPPRR_4 -1 5233.6 14_AAAAAAAAAASGAAIPPLIPPRR_4 0 AAAAAAAAAASGAAIPPLIPPRR 1/O14654 b5/0.001,b10^2/0.001,m6:10/0.001 AAAAAAAAAASGAAIPPLIPPRR 4 light 1/O14654 b 1 5 + 514.550999438 498.26707303 59.2 269_AAAAAAAAAASGAAIPPLIPPRR_4 -1 4976.0 14_AAAAAAAAAASGAAIPPLIPPRR_4 0 AAAAAAAAAASGAAIPPLIPPRR 1/O14654 b7/0.001,m4:10/0.001 AAAAAAAAAASGAAIPPLIPPRR 4 light 1/O14654 b 1 7 + 514.550999438 427.22995924 59.2 265_AAAAAAAAAASGAAIPPLIPPRR_4 -1 4859.4 14_AAAAAAAAAASGAAIPPLIPPRR_4 0 AAAAAAAAAASGAAIPPLIPPRR 1/O14654 b6/0.002,m5:10/0.002 AAAAAAAAAASGAAIPPLIPPRR 4 light 1/O14654 b 1 6 + 728.201724416 356.19284545 101.8 292_AAAAAAAAAASGAAIPPLIPPRRVITLYQC(UniMod:4)FSVSQR_5 -1 10000.0 15_AAAAAAAAAASGAAIPPLIPPRRVITLYQC(UniMod:4)FSVSQR_5 0 AAAAAAAAAASGAAIPPLIPPRRVITLYQCFSVSQR 1/O14654 b5/0.003,b10^2/0.003,m6:10/0.003 AAAAAAAAAASGAAIPPLIPPRRVITLYQC(UniMod:4)FSVSQR 5 light 1/O14654 b 1 5 + 728.201724416 576.310000482 101.8 297_AAAAAAAAAASGAAIPPLIPPRRVITLYQC(UniMod:4)FSVSQR_5 -1 7611.0 15_AAAAAAAAAASGAAIPPLIPPRRVITLYQC(UniMod:4)FSVSQR_5 0 AAAAAAAAAASGAAIPPLIPPRRVITLYQCFSVSQR 1/O14654 y5/0.002 AAAAAAAAAASGAAIPPLIPPRRVITLYQC(UniMod:4)FSVSQR 5 light 1/O14654 y 1 5 + 728.201724416 427.22995924 101.8 293_AAAAAAAAAASGAAIPPLIPPRRVITLYQC(UniMod:4)FSVSQR_5 -1 6805.1 15_AAAAAAAAAASGAAIPPLIPPRRVITLYQC(UniMod:4)FSVSQR_5 0 AAAAAAAAAASGAAIPPLIPPRRVITLYQCFSVSQR 1/O14654 b6/-0.002,m5:10/-0.002 AAAAAAAAAASGAAIPPLIPPRRVITLYQC(UniMod:4)FSVSQR 5 light 1/O14654 b 1 6 + 728.201724416 569.30418682 101.8 296_AAAAAAAAAASGAAIPPLIPPRRVITLYQC(UniMod:4)FSVSQR_5 -1 6312.7 15_AAAAAAAAAASGAAIPPLIPPRRVITLYQC(UniMod:4)FSVSQR_5 0 
AAAAAAAAAASGAAIPPLIPPRRVITLYQCFSVSQR 1/O14654 b8/0.009,m3:10/0.009 AAAAAAAAAASGAAIPPLIPPRRVITLYQC(UniMod:4)FSVSQR 5 light 1/O14654 b 1 8 + """) + + reader = SwathReader() + reader.import_file(input_data) + + _assert_reference_df_equal(reader.psm_df, "openswath") + + +def test_diann_speclib_reader() -> None: + """Test the Diann speclib reader.""" + # this is the head of "https://datashare.biochem.mpg.de/s/DF12ObSdZnBnqUV" ("diann_speclib.tsv") + input_data = StringIO("""FileName PrecursorMz ProductMz Tr_recalibrated IonMobility transition_name LibraryIntensity transition_group_id decoy PeptideSequence Proteotypic QValue PGQValue Ms1ProfileCorr ProteinGroup ProteinName Genes FullUniModPeptideName ModifiedPeptide PrecursorCharge PeptideGroupLabel UniprotID NTerm CTerm FragmentType FragmentCharge FragmentSeriesNumber FragmentLossType ExcludeFromAssay +/fs/pool/pool-mann-ms14/MZML/Astral3/2023_12/20231213_OA3_ViAl_SA_FAIMS40_IO4_A556_MOMI-20231121_APAK_P27_70091_cb_H6_1424.mzML 500.78116 645.36896 -17.011904 0 AAAAAAAAAVSR2_121_1_0_5 1 AAAAAAAAAVSR2 0 AAAAAAAAAVSR 0 1.4398672e-05 0.002044061 0.63356501 Q96JP5;Q96JP5-2 ZFP91-2_HUMAN;ZFP91_HUMAN ZFP91 AAAAAAAAAVSR AAAAAAAAAVSR 2 AAAAAAAAAVSR Q96JP5;Q96JP5-2;A0A0A6YYC7 0 0 y 1 7 noloss False +/fs/pool/pool-mann-ms14/MZML/Astral3/2023_12/20231213_OA3_ViAl_SA_FAIMS40_IO4_A556_MOMI-20231121_APAK_P27_70091_cb_H6_1424.mzML 500.78116 716.40607 -17.011904 0 AAAAAAAAAVSR2_121_1_0_4 0.92588264 AAAAAAAAAVSR2 0 AAAAAAAAAVSR 0 1.4398672e-05 0.002044061 0.63356501 Q96JP5;Q96JP5-2 ZFP91-2_HUMAN;ZFP91_HUMAN ZFP91 AAAAAAAAAVSR AAAAAAAAAVSR 2 AAAAAAAAAVSR Q96JP5;Q96JP5-2;A0A0A6YYC7 0 0 y 1 8 noloss False +/fs/pool/pool-mann-ms14/MZML/Astral3/2023_12/20231213_OA3_ViAl_SA_FAIMS40_IO4_A556_MOMI-20231121_APAK_P27_70091_cb_H6_1424.mzML 500.78116 574.33185 -17.011904 0 AAAAAAAAAVSR2_121_1_0_6 0.73629588 AAAAAAAAAVSR2 0 AAAAAAAAAVSR 0 1.4398672e-05 0.002044061 0.63356501 Q96JP5;Q96JP5-2 ZFP91-2_HUMAN;ZFP91_HUMAN ZFP91 AAAAAAAAAVSR AAAAAAAAAVSR 2 
AAAAAAAAAVSR Q96JP5;Q96JP5-2;A0A0A6YYC7 0 0 y 1 6 noloss True +/fs/pool/pool-mann-ms14/MZML/Astral3/2023_12/20231213_OA3_ViAl_SA_FAIMS40_IO4_A556_MOMI-20231121_APAK_P27_70091_cb_H6_1424.mzML 500.78116 503.29471 -17.011904 0 AAAAAAAAAVSR2_121_1_0_7 0.47699517 AAAAAAAAAVSR2 0 AAAAAAAAAVSR 0 1.4398672e-05 0.002044061 0.63356501 Q96JP5;Q96JP5-2 ZFP91-2_HUMAN;ZFP91_HUMAN ZFP91 AAAAAAAAAVSR AAAAAAAAAVSR 2 AAAAAAAAAVSR Q96JP5;Q96JP5-2;A0A0A6YYC7 0 0 y 1 5 noloss True +/fs/pool/pool-mann-ms14/MZML/Astral3/2023_12/20231213_OA3_ViAl_SA_FAIMS40_IO4_A556_MOMI-20231121_APAK_P27_70091_cb_H6_1424.mzML 500.78116 214.11917 -17.011904 0 AAAAAAAAAVSR2_98_1_0_3 0.47343451 AAAAAAAAAVSR2 0 AAAAAAAAAVSR 0 1.4398672e-05 0.002044061 0.63356501 Q96JP5;Q96JP5-2 ZFP91-2_HUMAN;ZFP91_HUMAN ZFP91 AAAAAAAAAVSR AAAAAAAAAVSR 2 AAAAAAAAAVSR Q96JP5;Q96JP5-2;A0A0A6YYC7 0 0 b 1 3 noloss False +/fs/pool/pool-mann-ms14/MZML/Astral3/2023_12/20231213_OA3_ViAl_SA_FAIMS40_IO4_A556_MOMI-20231121_APAK_P27_70091_cb_H6_1424.mzML 500.78116 787.44318 -17.011904 0 AAAAAAAAAVSR2_121_1_0_3 0.39700398 AAAAAAAAAVSR2 0 AAAAAAAAAVSR 0 1.4398672e-05 0.002044061 0.63356501 Q96JP5;Q96JP5-2 ZFP91-2_HUMAN;ZFP91_HUMAN ZFP91 AAAAAAAAAVSR AAAAAAAAAVSR 2 AAAAAAAAAVSR Q96JP5;Q96JP5-2;A0A0A6YYC7 0 0 y 1 9 noloss True +/fs/pool/pool-mann-ms14/MZML/Astral3/2023_12/20231213_OA3_ViAl_SA_FAIMS40_IO4_A556_MOMI-20231121_APAK_P27_70091_cb_H6_1424.mzML 500.78116 285.15628 -17.011904 0 AAAAAAAAAVSR2_98_1_0_4 0.30815825 AAAAAAAAAVSR2 0 AAAAAAAAAVSR 0 1.4398672e-05 0.002044061 0.63356501 Q96JP5;Q96JP5-2 ZFP91-2_HUMAN;ZFP91_HUMAN ZFP91 AAAAAAAAAVSR AAAAAAAAAVSR 2 AAAAAAAAAVSR Q96JP5;Q96JP5-2;A0A0A6YYC7 0 0 b 1 4 noloss True +/fs/pool/pool-mann-ms14/MZML/Astral3/2023_12/20231213_OA3_ViAl_SA_FAIMS40_IO4_A556_MOMI-20231121_APAK_P27_70091_cb_H6_1424.mzML 500.78116 432.2576 -17.011904 0 AAAAAAAAAVSR2_121_1_0_8 0.26575705 AAAAAAAAAVSR2 0 AAAAAAAAAVSR 0 1.4398672e-05 0.002044061 0.63356501 Q96JP5;Q96JP5-2 ZFP91-2_HUMAN;ZFP91_HUMAN ZFP91 AAAAAAAAAVSR 
AAAAAAAAAVSR 2 AAAAAAAAAVSR Q96JP5;Q96JP5-2;A0A0A6YYC7 0 0 y 1 4 noloss True +/fs/pool/pool-mann-ms14/MZML/Astral3/2023_12/20231213_OA3_ViAl_SA_FAIMS40_IO4_A556_MOMI-20231121_APAK_P27_70091_cb_H6_1424.mzML 500.78116 356.19339 -17.011904 0 AAAAAAAAAVSR2_98_1_0_5 0.23726191 AAAAAAAAAVSR2 0 AAAAAAAAAVSR 0 1.4398672e-05 0.002044061 0.63356501 Q96JP5;Q96JP5-2 ZFP91-2_HUMAN;ZFP91_HUMAN ZFP91 AAAAAAAAAVSR AAAAAAAAAVSR 2 AAAAAAAAAVSR Q96JP5;Q96JP5-2;A0A0A6YYC7 0 0 b 1 5 noloss True +/fs/pool/pool-mann-ms14/MZML/Astral3/2023_12/20231213_OA3_ViAl_SA_FAIMS40_IO4_A556_MOMI-20231121_APAK_P27_70091_cb_H6_1424.mzML 500.78116 858.48029 -17.011904 0 AAAAAAAAAVSR2_121_1_0_2 0.23109815 AAAAAAAAAVSR2 0 AAAAAAAAAVSR 0 1.4398672e-05 0.002044061 0.63356501 Q96JP5;Q96JP5-2 ZFP91-2_HUMAN;ZFP91_HUMAN ZFP91 AAAAAAAAAVSR AAAAAAAAAVSR 2 AAAAAAAAAVSR Q96JP5;Q96JP5-2;A0A0A6YYC7 0 0 y 1 10 noloss True +/fs/pool/pool-mann-ms14/MZML/Astral3/2023_12/20231213_OA3_ViAl_SA_FAIMS40_IO4_A556_MOMI-20231121_APAK_P27_70091_cb_H6_1424.mzML 500.78116 427.2305 -17.011904 0 AAAAAAAAAVSR2_98_1_0_6 0.13046893 AAAAAAAAAVSR2 0 AAAAAAAAAVSR 0 1.4398672e-05 0.002044061 0.63356501 Q96JP5;Q96JP5-2 ZFP91-2_HUMAN;ZFP91_HUMAN ZFP91 AAAAAAAAAVSR AAAAAAAAAVSR 2 AAAAAAAAAVSR Q96JP5;Q96JP5-2;A0A0A6YYC7 0 0 b 1 6 noloss True +/fs/pool/pool-mann-ms14/MZML/Astral3/2023_12/20231213_OA3_ViAl_SA_FAIMS40_IO4_A556_MOMI-20231121_APAK_P27_70091_cb_H6_1424.mzML 500.78116 361.22049 -17.011904 0 AAAAAAAAAVSR2_121_1_0_9 0.11459313 AAAAAAAAAVSR2 0 AAAAAAAAAVSR 0 1.4398672e-05 0.002044061 0.63356501 Q96JP5;Q96JP5-2 ZFP91-2_HUMAN;ZFP91_HUMAN ZFP91 AAAAAAAAAVSR AAAAAAAAAVSR 2 AAAAAAAAAVSR Q96JP5;Q96JP5-2;A0A0A6YYC7 0 0 y 1 3 noloss True +/fs/pool/pool-mann-ms14/MZML/Astral2/2023_12/20231206_OA2_ViAl_SA_FAIMS40_IO4_A556_MOMI-20231121_APAK_P35_72214_cb_A9_926.mzML 478.78064 672.40503 -14.478184 0 AAAAAAALQAK2_121_1_0_4 1 AAAAAAALQAK2 0 AAAAAAALQAK 1 1.5083094e-06 0.00029770765 0.97250301 P36578 RL4_HUMAN RPL4 AAAAAAALQAK AAAAAAALQAK 2 
AAAAAAALQAK P36578;H3BU31;H3BM89 0 0 y 1 7 noloss True +/fs/pool/pool-mann-ms14/MZML/Astral2/2023_12/20231206_OA2_ViAl_SA_FAIMS40_IO4_A556_MOMI-20231121_APAK_P35_72214_cb_A9_926.mzML 478.78064 601.36786 -14.478184 0 AAAAAAALQAK2_121_1_0_5 0.81051117 AAAAAAALQAK2 0 AAAAAAALQAK 1 1.5083094e-06 0.00029770765 0.97250301 P36578 RL4_HUMAN RPL4 AAAAAAALQAK AAAAAAALQAK 2 AAAAAAALQAK P36578;H3BU31;H3BM89 0 0 y 1 6 noloss False +/fs/pool/pool-mann-ms14/MZML/Astral2/2023_12/20231206_OA2_ViAl_SA_FAIMS40_IO4_A556_MOMI-20231121_APAK_P35_72214_cb_A9_926.mzML 478.78064 214.11917 -14.478184 0 AAAAAAALQAK2_98_1_0_3 0.6025809 AAAAAAALQAK2 0 AAAAAAALQAK 1 1.5083094e-06 0.00029770765 0.97250301 P36578 RL4_HUMAN RPL4 AAAAAAALQAK AAAAAAALQAK 2 AAAAAAALQAK P36578;H3BU31;H3BM89 0 0 b 1 3 noloss False +/fs/pool/pool-mann-ms14/MZML/Astral2/2023_12/20231206_OA2_ViAl_SA_FAIMS40_IO4_A556_MOMI-20231121_APAK_P35_72214_cb_A9_926.mzML 478.78064 743.44214 -14.478184 0 AAAAAAALQAK2_121_1_0_3 0.55991524 AAAAAAALQAK2 0 AAAAAAALQAK 1 1.5083094e-06 0.00029770765 0.97250301 P36578 RL4_HUMAN RPL4 AAAAAAALQAK AAAAAAALQAK 2 AAAAAAALQAK P36578;H3BU31;H3BM89 0 0 y 1 8 noloss False +/fs/pool/pool-mann-ms14/MZML/Astral2/2023_12/20231206_OA2_ViAl_SA_FAIMS40_IO4_A556_MOMI-20231121_APAK_P35_72214_cb_A9_926.mzML 478.78064 530.33075 -14.478184 0 AAAAAAALQAK2_121_1_0_6 0.42974085 AAAAAAALQAK2 0 AAAAAAALQAK 1 1.5083094e-06 0.00029770765 0.97250301 P36578 RL4_HUMAN RPL4 AAAAAAALQAK AAAAAAALQAK 2 AAAAAAALQAK P36578;H3BU31;H3BM89 0 0 y 1 5 noloss True +/fs/pool/pool-mann-ms14/MZML/Astral2/2023_12/20231206_OA2_ViAl_SA_FAIMS40_IO4_A556_MOMI-20231121_APAK_P35_72214_cb_A9_926.mzML 478.78064 814.47925 -14.478184 0 AAAAAAALQAK2_121_1_0_2 0.40478998 AAAAAAALQAK2 0 AAAAAAALQAK 1 1.5083094e-06 0.00029770765 0.97250301 P36578 RL4_HUMAN RPL4 AAAAAAALQAK AAAAAAALQAK 2 AAAAAAALQAK P36578;H3BU31;H3BM89 0 0 y 1 9 noloss True 
+/fs/pool/pool-mann-ms14/MZML/Astral2/2023_12/20231206_OA2_ViAl_SA_FAIMS40_IO4_A556_MOMI-20231121_APAK_P35_72214_cb_A9_926.mzML 478.78064 285.15628 -14.478184 0 AAAAAAALQAK2_98_1_0_4 0.27873126 AAAAAAALQAK2 0 AAAAAAALQAK 1 1.5083094e-06 0.00029770765 0.97250301 P36578 RL4_HUMAN RPL4 AAAAAAALQAK AAAAAAALQAK 2 AAAAAAALQAK P36578;H3BU31;H3BM89 0 0 b 1 4 noloss True +/fs/pool/pool-mann-ms14/MZML/Astral2/2023_12/20231206_OA2_ViAl_SA_FAIMS40_IO4_A556_MOMI-20231121_APAK_P35_72214_cb_A9_926.mzML 478.78064 459.29367 -14.478184 0 AAAAAAALQAK2_121_1_0_7 0.23921044 AAAAAAALQAK2 0 AAAAAAALQAK 1 1.5083094e-06 0.00029770765 0.97250301 P36578 RL4_HUMAN RPL4 AAAAAAALQAK AAAAAAALQAK 2 AAAAAAALQAK P36578;H3BU31;H3BM89 0 0 y 1 4 noloss True +/fs/pool/pool-mann-ms14/MZML/Astral2/2023_12/20231206_OA2_ViAl_SA_FAIMS40_IO4_A556_MOMI-20231121_APAK_P35_72214_cb_A9_926.mzML 478.78064 346.20959 -14.478184 0 AAAAAAALQAK2_121_1_0_8 0.17267427 AAAAAAALQAK2 0 AAAAAAALQAK 1 1.5083094e-06 0.00029770765 0.97250301 P36578 RL4_HUMAN RPL4 AAAAAAALQAK AAAAAAALQAK 2 AAAAAAALQAK P36578;H3BU31;H3BM89 0 0 y 1 3 noloss True +/fs/pool/pool-mann-ms14/MZML/Astral2/2023_12/20231206_OA2_ViAl_SA_FAIMS40_IO4_A556_MOMI-20231121_APAK_P35_72214_cb_A9_926.mzML 478.78064 356.19339 -14.478184 0 AAAAAAALQAK2_98_1_0_5 0.11922429 AAAAAAALQAK2 0 AAAAAAALQAK 1 1.5083094e-06 0.00029770765 0.97250301 P36578 RL4_HUMAN RPL4 AAAAAAALQAK AAAAAAALQAK 2 AAAAAAALQAK P36578;H3BU31;H3BM89 0 0 b 1 5 noloss True +/fs/pool/pool-mann-ms14/MZML/Astral2/2023_12/20231206_OA2_ViAl_SA_FAIMS40_IO4_A556_MOMI-20231121_APAK_P35_72214_cb_A9_926.mzML 478.78064 427.2305 -14.478184 0 AAAAAAALQAK2_98_1_0_6 0.042955909 AAAAAAALQAK2 0 AAAAAAALQAK 1 1.5083094e-06 0.00029770765 0.97250301 P36578 RL4_HUMAN RPL4 AAAAAAALQAK AAAAAAALQAK 2 AAAAAAALQAK P36578;H3BU31;H3BM89 0 0 b 1 6 noloss True +/fs/pool/pool-mann-ms14/MZML/Astral2/2023_12/20231206_OA2_ViAl_SA_FAIMS40_IO4_A556_MOMI-20231121_APAK_P35_72214_cb_A9_926.mzML 478.78064 885.51636 -14.478184 0 
AAAAAAALQAK2_121_1_0_1 0.019872207 AAAAAAALQAK2 0 AAAAAAALQAK 1 1.5083094e-06 0.00029770765 0.97250301 P36578 RL4_HUMAN RPL4 AAAAAAALQAK AAAAAAALQAK 2 AAAAAAALQAK P36578;H3BU31;H3BM89 0 0 y 1 10 noloss True + """) + + reader = LibraryReaderBase() + reader.import_file(input_data) + + _assert_reference_df_equal(reader.psm_df, "diann_speclib") + + +def test_msfragger_speclib_reader() -> None: + """Test the MSFragger speclib reader.""" + + # this is the head of https://datashare.biochem.mpg.de/s/Cka1utORt3r5A4a ("msfragger_speclib.tsv") + input_data = StringIO("""ModifiedPeptide PrecursorCharge Tr_recalibrated IonMobility StrippedPeptide PrecursorMz ProteinID Genes FragmentType FragmentMz RelativeIntensity FragmentCharge FragmentNumber FragmentLossType +_VLELTGK_ 2 4.249923 0.75 VLELTGK 380.234178887762 A0A0B4J2D5;P0DPI2 GATD3B;GATD3 y 547.3086 1.0 1 5 noloss +_VLELTGK_ 2 4.249923 0.75 VLELTGK 380.234178887762 A0A0B4J2D5;P0DPI2 GATD3B;GATD3 y 660.39264 0.51416683 1 6 noloss +_VLELTGK_ 2 4.249923 0.75 VLELTGK 380.234178887762 A0A0B4J2D5;P0DPI2 GATD3B;GATD3 b 213.15976 0.2875934 1 2 noloss +_VLELTGK_ 2 4.249923 0.75 VLELTGK 380.234178887762 A0A0B4J2D5;P0DPI2 GATD3B;GATD3 y 305.18195 0.28558257 1 3 noloss +_VLELTGK_ 2 4.249923 0.75 VLELTGK 380.234178887762 A0A0B4J2D5;P0DPI2 GATD3B;GATD3 y 418.26602 0.22692133 1 4 noloss +_VLELTGK_ 2 4.249923 0.75 VLELTGK 380.234178887762 A0A0B4J2D5;P0DPI2 GATD3B;GATD3 y 204.13426 0.14408894 1 2 noloss +_VLELTGK_ 2 4.249923 0.75 VLELTGK 380.234178887762 A0A0B4J2D5;P0DPI2 GATD3B;GATD3 y 330.69998 0.047275875 2 6 noloss +_VLELTGK_ 2 4.249923 0.75 VLELTGK 380.234178887762 A0A0B4J2D5;P0DPI2 GATD3B;GATD3 b 342.20236 0.039677892 1 3 noloss +_VLELTGK_ 1 4.249923 1.284 VLELTGK 759.4610813087119 A0A0B4J2D5;P0DPI2 GATD3B;GATD3 y 305.18195 1.0 1 3 noloss +_VLELTGK_ 1 4.249923 1.284 VLELTGK 759.4610813087119 A0A0B4J2D5;P0DPI2 GATD3B;GATD3 y 418.26602 0.7783308 1 4 noloss +_VLELTGK_ 1 4.249923 1.284 VLELTGK 759.4610813087119 A0A0B4J2D5;P0DPI2 
GATD3B;GATD3 b 342.20236 0.7754817 1 3 noloss +_VLELTGK_ 1 4.249923 1.284 VLELTGK 759.4610813087119 A0A0B4J2D5;P0DPI2 GATD3B;GATD3 y 547.3086 0.5358066 1 5 noloss +_VLELTGK_ 1 4.249923 1.284 VLELTGK 759.4610813087119 A0A0B4J2D5;P0DPI2 GATD3B;GATD3 b 556.3341 0.38568112 1 5 noloss +_VLELTGK_ 1 4.249923 1.284 VLELTGK 759.4610813087119 A0A0B4J2D5;P0DPI2 GATD3B;GATD3 b 455.2864 0.2794164 1 4 noloss +_VLELTGK_ 1 4.249923 1.284 VLELTGK 759.4610813087119 A0A0B4J2D5;P0DPI2 GATD3B;GATD3 b 213.15976 0.25726682 1 2 noloss +_VLELTGK_ 1 4.249923 1.284 VLELTGK 759.4610813087119 A0A0B4J2D5;P0DPI2 GATD3B;GATD3 y 204.13426 0.19917944 1 2 noloss +_VLELTGK_ 1 4.249923 1.284 VLELTGK 759.4610813087119 A0A0B4J2D5;P0DPI2 GATD3B;GATD3 y 660.39264 0.12131426 1 6 noloss +_VLELTGK_ 1 4.249923 1.284 VLELTGK 759.4610813087119 A0A0B4J2D5;P0DPI2 GATD3B;GATD3 b 613.3555 0.11152817 1 6 noloss +_EVLHLLR_ 2 21.775629 0.837 EVLHLLR 440.274169715862 A0AVF1 TTC26 y 269.67664 1.0 2 4 noloss +_EVLHLLR_ 2 21.775629 0.837 EVLHLLR 440.274169715862 A0AVF1 TTC26 y 326.21866 0.9999356 2 5 noloss +_EVLHLLR_ 2 21.775629 0.837 EVLHLLR 440.274169715862 A0AVF1 TTC26 y 538.346 0.90463305 1 4 noloss +_EVLHLLR_ 2 21.775629 0.837 EVLHLLR 440.274169715862 A0AVF1 TTC26 y 401.28708 0.71957177 1 3 noloss +_EVLHLLR_ 2 21.775629 0.837 EVLHLLR 440.274169715862 A0AVF1 TTC26 y 651.43005 0.6489045 1 5 noloss +_EVLHLLR_ 2 21.775629 0.837 EVLHLLR 440.274169715862 A0AVF1 TTC26 b 229.11829 0.4481002 1 2 noloss +_EVLHLLR_ 2 21.775629 0.837 EVLHLLR 440.274169715862 A0AVF1 TTC26 y 288.203 0.28588438 1 2 noloss +_EVLHLLR_ 2 21.775629 0.837 EVLHLLR 440.274169715862 A0AVF1 TTC26 y 375.75287 0.16287889 2 6 noloss +_EVLHLLR_ 2 21.775629 0.837 EVLHLLR 440.274169715862 A0AVF1 TTC26 b 479.26126 0.09841085 1 4 noloss +_EVLHLLR_ 2 21.775629 0.837 EVLHLLR 440.274169715862 A0AVF1 TTC26 b 592.34534 0.06371137 1 5 noloss +_EVLHLLR_ 2 21.775629 0.837 EVLHLLR 440.274169715862 A0AVF1 TTC26 b 342.20236 0.030663786 1 3 noloss +_EVLHLLR_ 2 21.775629 0.837 
EVLHLLR 440.274169715862 A0AVF1 TTC26 b 705.4294 0.02848413 1 6 noloss +""") + + reader = LibraryReaderBase() + reader.import_file(input_data) + + _assert_reference_df_equal(reader.psm_df, "msfragger_speclib") diff --git a/tests/run_tests.sh b/tests/run_tests.sh index 8a159d9b..8b92301b 100755 --- a/tests/run_tests.sh +++ b/tests/run_tests.sh @@ -6,11 +6,10 @@ DOCS_NBS=$(find ../docs/nbs -name "*.ipynb" | grep -v tutorial_dev_spectral_libr # corresponding notebook(s) if this occurs again # INCLUDED_NBS=$(find ../nbs_tests -name "*.ipynb" | grep -v test_isotope_mp.ipynb) -TEST_NBS=$(find ../nbs_tests -name "*.ipynb") +python -m pytest +TEST_NBS=$(find ../nbs_tests -name "*.ipynb") TUTORIAL_NBS=$(find ../docs/tutorials -name "*.ipynb") - ALL_NBS=$(echo $DOCS_NBS$'\n'$TEST_NBS$'\n'$TUTORIAL_NBS) python -m pytest --nbmake $(echo $ALL_NBS) -python -m pytest