Skip to content

Commit

Permalink
Merge pull request #244 from MannLabs/development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
mschwoer authored Nov 19, 2024
2 parents 8cc183f + f2a4154 commit 91029a8
Show file tree
Hide file tree
Showing 28 changed files with 552 additions and 188 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 1.4.1
current_version = 1.4.2
commit = True
tag = False
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
Expand Down
2 changes: 1 addition & 1 deletion alphabase/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@


__project__ = "alphabase"
__version__ = "1.4.1"
__version__ = "1.4.2"
__license__ = "Apache"
__description__ = "An infrastructure Python package of the AlphaX ecosystem"
__author__ = "Mann Labs"
Expand Down
2 changes: 1 addition & 1 deletion alphabase/constants/const_files/psm_reader.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ maxquant:
'Dimethyl@Any_N-term':
- '(Dimethyl)'
'Acetyl@Protein_N-term':
- '_(Acetyl (Protein_N-term))'
- '_(Acetyl (Protein N-term))'
- '_(ac)'
'Acetyl@K':
- 'K(ac)'
Expand Down
23 changes: 12 additions & 11 deletions alphabase/psm_reader/alphapept_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np
import pandas as pd

from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.psm_reader import (
PSMReaderBase,
psm_reader_provider,
Expand Down Expand Up @@ -79,31 +80,31 @@ def _load_file(self, filename):
with h5py.File(filename, "r") as _hdf:
dataset = _hdf[self.hdf_dataset]
df = pd.DataFrame({col: dataset[col] for col in dataset})
df["raw_name"] = os.path.basename(filename)[: -len(".ms_data.hdf")]
df[PsmDfCols.RAW_NAME] = os.path.basename(filename)[: -len(".ms_data.hdf")]
df["precursor"] = df["precursor"].str.decode("utf-8")
# df['naked_sequence'] = df['naked_sequence'].str.decode('utf-8')
if "scan_no" in df.columns:
df["scan_no"] = df["scan_no"].astype("int")
df["raw_idx"] = df["scan_no"] - 1 # if thermo, use scan-1 as spec_idx
df["charge"] = df["charge"].astype(int)
df[PsmDfCols.CHARGE] = df[PsmDfCols.CHARGE].astype(int)
return df

def _load_modifications(self, df: pd.DataFrame):
if len(df) == 0:
self._psm_df["sequence"] = ""
self._psm_df["mods"] = ""
self._psm_df["mod_sites"] = ""
self._psm_df["decoy"] = 0
self._psm_df[PsmDfCols.SEQUENCE] = ""
self._psm_df[PsmDfCols.MODS] = ""
self._psm_df[PsmDfCols.MOD_SITES] = ""
self._psm_df[PsmDfCols.DECOY] = 0
return

(
self._psm_df["sequence"],
self._psm_df["mods"],
self._psm_df["mod_sites"],
self._psm_df[PsmDfCols.SEQUENCE],
self._psm_df[PsmDfCols.MODS],
self._psm_df[PsmDfCols.MOD_SITES],
_charges,
self._psm_df["decoy"],
self._psm_df[PsmDfCols.DECOY],
) = zip(*df["precursor"].apply(parse_ap))
self._psm_df.decoy = self._psm_df.decoy.astype(np.int8)
self._psm_df[PsmDfCols.DECOY] = self._psm_df[PsmDfCols.DECOY].astype(np.int8)


def register_readers():
Expand Down
13 changes: 8 additions & 5 deletions alphabase/psm_reader/dia_psm_reader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np
import pandas as pd

from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.maxquant_reader import MaxQuantReader
from alphabase.psm_reader.psm_reader import psm_reader_provider, psm_reader_yaml

Expand Down Expand Up @@ -127,7 +128,9 @@ def _load_file(self, filename):

def _post_process(self, origin_df: pd.DataFrame):
super()._post_process(origin_df)
self._psm_df.rename(columns={"spec_idx": "diann_spec_idx"}, inplace=True)
self._psm_df.rename(
columns={PsmDfCols.SPEC_IDX: PsmDfCols.DIANN_SPEC_INDEX}, inplace=True
)


class SpectronautReportReader(MaxQuantReader):
Expand Down Expand Up @@ -174,10 +177,10 @@ def _load_file(self, filename):
self.mod_seq_column = "ModifiedSequence"
self.csv_sep = self._get_table_delimiter(filename)
df = pd.read_csv(filename, sep=self.csv_sep, keep_default_na=False)
df[[self.mod_seq_column, "charge"]] = df[self.precursor_column].str.split(
".", expand=True, n=2
)
df["charge"] = df.charge.astype(np.int8)
df[[self.mod_seq_column, PsmDfCols.CHARGE]] = df[
self.precursor_column
].str.split(".", expand=True, n=2)
df[PsmDfCols.CHARGE] = df[PsmDfCols.CHARGE].astype(np.int8)
return df


Expand Down
71 changes: 71 additions & 0 deletions alphabase/psm_reader/keys.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
class ConstantsClass(type):
    """A metaclass for classes that should only contain string constants.

    Classes created with this metaclass are read-only once defined: assigning
    to or deleting a class attribute raises ``TypeError``.
    """

    def __setattr__(cls, name, value):
        """Reject every attribute assignment on the constants class."""
        raise TypeError("Constants class cannot be modified")

    def __delattr__(cls, name):
        """Reject attribute deletion, which would otherwise bypass the write guard."""
        raise TypeError("Constants class cannot be modified")

    def get_values(cls):
        """Get all user-defined string values of the class.

        Only attributes defined directly on the class (``cls.__dict__``) are
        considered; inherited attributes are not included. Names starting with
        a double underscore (e.g. ``__module__``, ``__doc__``) and non-string
        values are skipped, while single-underscore names are kept.

        Returns
        -------
        list of str
            The string values, in class definition order.
        """
        return [
            value
            for key, value in cls.__dict__.items()
            if not key.startswith("__") and isinstance(value, str)
        ]


class PsmDfCols(metaclass=ConstantsClass):
"""Constants for accessing the columns of a PSM dataframe.

Each attribute holds a column name as a string. The ``ConstantsClass``
metaclass rejects attribute assignment, and its ``get_values()`` returns every
string value defined here, including the underscore-prefixed ones.
"""

# TODO: these are used only in the psm_reader package and the spectral_library.reader module so far
MOD_SITES = "mod_sites"
MODIFIED_SEQUENCE = "modified_sequence"
SEQUENCE = "sequence"
DECOY = "decoy"
MODS = "mods"
SCORE = "score"
TO_REMOVE = "to_remove"
AA_MASS_DIFFS = "aa_mass_diffs"
AA_MASS_DIFF_SITES = "aa_mass_diff_sites"
RT = "rt"
RT_START = "rt_start"
RT_STOP = "rt_stop"
RT_NORM = "rt_norm"
SPEC_IDX = "spec_idx"
SCANNR = "scannr"
FDR = "fdr"
# note: this column name is mixed-case ("nAA"), unlike the other keys
NAA = "nAA"
CCS = "ccs"
MOBILITY = "mobility"
PEPTIDE_FDR = "peptide_fdr"
PROTEIN_FDR = "protein_fdr"

RAW_NAME = "raw_name"
CHARGE = "charge"
PROTEINS = "proteins"

SCAN_NUM = "scan_num"
PRECURSOR_MZ = "precursor_mz"
DIANN_SPEC_INDEX = "diann_spec_idx"

# part of the output, but not directly referenced
_UNIPROT_IDS = "uniprot_ids"
_GENES = "genes"
_QUERY_ID = "query_id"

# part of psm_reader_yaml, but not directly referenced
_INTENSITY = "intensity"


class LibPsmDfCols(metaclass=ConstantsClass):
"""Constants for accessing the columns of a Library PSM dataframe.

Each attribute holds a column name as a string; the ``ConstantsClass``
metaclass rejects attribute assignment on this class.
"""

FRAG_START_IDX = "frag_start_idx"
FRAG_STOP_IDX = "frag_stop_idx"

# defined for completeness; not referenced in reader classes
FRAGMENT_INTENSITY = "fragment_intensity"
FRAGMENT_MZ = "fragment_mz"
FRAGMENT_TYPE = "fragment_type"
FRAGMENT_CHARGE = "fragment_charge"
FRAGMENT_SERIES = "fragment_series"
FRAGMENT_LOSS_TYPE = "fragment_loss_type"
24 changes: 14 additions & 10 deletions alphabase/psm_reader/maxquant_reader.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import copy
import warnings
from typing import Optional

import numba
import numpy as np
import pandas as pd

from alphabase.constants.modification import MOD_DF
from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.psm_reader import (
PSMReaderBase,
psm_reader_provider,
Expand Down Expand Up @@ -195,7 +197,7 @@ def _init_modification_mapping(self):
psm_reader_yaml["maxquant"]["modification_mapping"]
)

def set_modification_mapping(self, modification_mapping: dict):
def set_modification_mapping(self, modification_mapping: Optional[dict] = None):
super().set_modification_mapping(modification_mapping)
self._add_all_unimod()
self._extend_mod_brackets()
Expand Down Expand Up @@ -237,8 +239,10 @@ def _extend_mod_brackets(self):
self.modification_mapping[key] = list(mod_set)

def _translate_decoy(self, origin_df=None):
if "decoy" in self._psm_df.columns:
self._psm_df.decoy = (self._psm_df.decoy == "-").astype(np.int8)
if PsmDfCols.DECOY in self._psm_df.columns:
self._psm_df[PsmDfCols.DECOY] = (
self._psm_df[PsmDfCols.DECOY] == "-"
).astype(np.int8)

def _init_column_mapping(self):
self.column_mapping = psm_reader_yaml["maxquant"]["column_mapping"]
Expand All @@ -252,16 +256,16 @@ def _load_file(self, filename):

# remove MBR PSMs as they are currently not supported and will crash import
mapped_columns = self._find_mapped_columns(df)
if "scan_num" in mapped_columns:
scan_num_col = mapped_columns["scan_num"]
if PsmDfCols.SCAN_NUM in mapped_columns:
scan_num_col = mapped_columns[PsmDfCols.SCAN_NUM]
no_ms2_mask = df[scan_num_col] == ""
if (num_no_ms2_mask := np.sum(no_ms2_mask)) > 0:
warnings.warn(
f"Maxquant psm file contains {num_no_ms2_mask} MBR PSMs without MS2 scan. This is not yet supported and rows containing MBR PSMs will be removed."
f"MaxQuant PSM file contains {num_no_ms2_mask} MBR PSMs without MS2 scan. This is not yet supported and rows containing MBR PSMs will be removed."
)
df = df[~no_ms2_mask]
df.reset_index(drop=True, inplace=True)
df[scan_num_col] = df[scan_num_col].astype(int)
df[scan_num_col] = df[scan_num_col].astype(int)

# if 'K0' in df.columns:
# df['Mobility'] = df['K0'] # Bug in MaxQuant? It should be 1/K0
Expand All @@ -278,15 +282,15 @@ def _load_modifications(self, origin_df: pd.DataFrame):
else:
mod_sep = "()"

(seqs, self._psm_df["mods"], self._psm_df["mod_sites"]) = zip(
(seqs, self._psm_df[PsmDfCols.MODS], self._psm_df[PsmDfCols.MOD_SITES]) = zip(
*origin_df[self.mod_seq_column].apply(
parse_mod_seq,
mod_sep=mod_sep,
fixed_C57=self.fixed_C57,
)
)
if "sequence" not in self._psm_df.columns:
self._psm_df["sequence"] = seqs
if PsmDfCols.SEQUENCE not in self._psm_df.columns:
self._psm_df[PsmDfCols.SEQUENCE] = seqs


def register_readers():
Expand Down
53 changes: 29 additions & 24 deletions alphabase/psm_reader/msfragger_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from alphabase.constants.aa import AA_ASCII_MASS
from alphabase.constants.atom import MASS_H, MASS_O
from alphabase.constants.modification import MOD_MASS
from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.psm_reader import (
PSMReaderBase,
psm_reader_provider,
Expand Down Expand Up @@ -115,9 +116,6 @@ def __init__(
def _init_column_mapping(self):
self.column_mapping = psm_reader_yaml["msfragger_pepxml"]["column_mapping"]

def _init_modification_mapping(self):
self.modification_mapping = {}

def _translate_modifications(self):
pass

Expand All @@ -126,54 +124,61 @@ def _load_file(self, filename):
msf_df.fillna("", inplace=True)
if "ion_mobility" in msf_df.columns:
msf_df["ion_mobility"] = msf_df.ion_mobility.astype(float)
msf_df["raw_name"] = msf_df["spectrum"].str.split(".").apply(lambda x: x[0])
msf_df["to_remove"] = 0
self.column_mapping["to_remove"] = "to_remove"
msf_df[PsmDfCols.RAW_NAME] = (
msf_df["spectrum"].str.split(".").apply(lambda x: x[0])
)
msf_df["to_remove"] = 0 # TODO revisit
self.column_mapping[PsmDfCols.TO_REMOVE] = "to_remove"
return msf_df

def _translate_decoy(self, origin_df=None):
self._psm_df["decoy"] = self._psm_df.proteins.apply(_is_fragger_decoy).astype(
np.int8
self._psm_df[PsmDfCols.DECOY] = (
self._psm_df[PsmDfCols.PROTEINS].apply(_is_fragger_decoy).astype(np.int8)
)

self._psm_df.proteins = self._psm_df.proteins.apply(lambda x: ";".join(x))
if not self.keep_decoy:
self._psm_df["to_remove"] += self._psm_df.decoy > 0
self._psm_df[PsmDfCols.PROTEINS] = self._psm_df[PsmDfCols.PROTEINS].apply(
lambda x: ";".join(x)
)
if not self._keep_decoy:
self._psm_df[PsmDfCols.TO_REMOVE] += self._psm_df[PsmDfCols.DECOY] > 0

def _translate_score(self, origin_df=None):
# evalue score
self._psm_df["score"] = -np.log(self._psm_df["score"] + 1e-100)
self._psm_df[PsmDfCols.SCORE] = -np.log(self._psm_df[PsmDfCols.SCORE] + 1e-100)

def _load_modifications(self, msf_df):
if len(msf_df) == 0:
self._psm_df["mods"] = ""
self._psm_df["mod_sites"] = ""
self._psm_df["aa_mass_diffs"] = ""
self._psm_df["aa_mass_diff_sites"] = ""
self._psm_df[PsmDfCols.MODS] = ""
self._psm_df[PsmDfCols.MOD_SITES] = ""
self._psm_df[PsmDfCols.AA_MASS_DIFFS] = ""
self._psm_df[PsmDfCols.AA_MASS_DIFF_SITES] = ""
return

(
self._psm_df["mods"],
self._psm_df["mod_sites"],
self._psm_df["aa_mass_diffs"],
self._psm_df["aa_mass_diff_sites"],
self._psm_df[PsmDfCols.MODS],
self._psm_df[PsmDfCols.MOD_SITES],
self._psm_df[PsmDfCols.AA_MASS_DIFFS],
self._psm_df[PsmDfCols.AA_MASS_DIFF_SITES],
) = zip(
*msf_df[["peptide", "modifications"]].apply(
lambda x: _get_mods_from_masses(*x), axis=1
)
)

if not self.keep_unknown_aa_mass_diffs:
self._psm_df["to_remove"] += self._psm_df.aa_mass_diffs != ""
self._psm_df[PsmDfCols.TO_REMOVE] += (
self._psm_df[PsmDfCols.AA_MASS_DIFFS] != ""
)
self._psm_df.drop(
columns=["aa_mass_diffs", "aa_mass_diff_sites"], inplace=True
columns=[PsmDfCols.AA_MASS_DIFFS, PsmDfCols.AA_MASS_DIFF_SITES],
inplace=True,
)

def _post_process(self, origin_df: pd.DataFrame):
super()._post_process(origin_df)
self._psm_df = (
self._psm_df.query("to_remove==0")
.drop(columns="to_remove")
self._psm_df.query(f"{PsmDfCols.TO_REMOVE}==0")
.drop(columns=PsmDfCols.TO_REMOVE)
.reset_index(drop=True)
)

Expand Down
Loading

0 comments on commit 91029a8

Please sign in to comment.