Merge pull request #126 from MannLabs/development

Release 1.1.2
MannLabs · Dec 25, 2023 · 57e9e51 · 57e9e51
2 parents d63e1f8 + cb797b9
commit 57e9e51
Show file tree

Hide file tree

Showing 27 changed files with 1,395 additions and 500 deletions.
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 1.1.1
+current_version = 1.1.2
 commit = True
 tag = False
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?

diff --git a/README.md b/README.md
@@ -36,6 +36,10 @@ The infrastructure package of AlphaX ecosystem for MS proteomics. It was first p
 
 - [AlphaPeptDeep](https://github.com/MannLabs/alphapeptdeep): deep learning framework for proteomics.
 - [AlphaRaw](https://github.com/MannLabs/alpharaw): raw data reader for different vendors.
+- [AlphaDIA](https://github.com/MannLabs/alphadia): DIA search engine.
+- [PeptDeep-HLA](https://github.com/MannLabs/peptdeep-hla): personalized HLA-binding peptide prediction.
+- [AlphaViz](https://github.com/MannLabs/alphaviz): visualization for MS-based proteomics.
+- [AlphaQuant](https://github.com/MannLabs/alphaquant): quantification for MS-based proteomics.
 
 ------------------------------------------------------------------------
 

diff --git a/alphabase/__init__.py b/alphabase/__init__.py
@@ -2,7 +2,7 @@
 
 
 __project__ = "alphabase"
-__version__ = "1.1.1"
+__version__ = "1.1.2"
 __license__ = "Apache"
 __description__ = "An infrastructure Python package of the AlphaX ecosystem"
 __author__ = "Mann Labs"

diff --git a/alphabase/constants/aa.py b/alphabase/constants/aa.py
@@ -1,14 +1,14 @@
 import os
 import pandas as pd
 import numpy as np
-
-from typing import Union, Tuple
+import typing
 
 from alphabase.yaml_utils import load_yaml
 
 from alphabase.constants.element import (
     calc_mass_from_formula, 
     MASS_H2O, parse_formula,
+    reset_elements
 )
 
 from alphabase.constants._const import CONST_FILE_FOLDER
@@ -19,19 +19,34 @@
 AA_Formula:dict = load_yaml(
     os.path.join(CONST_FILE_FOLDER, 'amino_acid.yaml')
 )
+#: AA mass array with ASCII code, mass of 'A' is AA_ASCII_MASS[ord('A')]
+AA_ASCII_MASS:np.ndarray = np.ones(128)*1e8
+
+#: 128-len AA dataframe
+AA_DF:pd.DataFrame = pd.DataFrame()
+
+# AA to atom-composition dict of dict. For example: {'K': {'C': n, 'O': m, ...}}
+AA_Composition:dict = {}
+
+def replace_atoms(atom_replace_dict:typing.Dict):
+    for aa, formula in list(AA_Formula.items()):
+        atom_comp = dict(parse_formula(formula))
+        for atom_from, atom_to in atom_replace_dict.items():
+            if atom_from in atom_comp:
+                atom_comp[atom_to] = atom_comp[atom_from]
+                del atom_comp[atom_from]
+        AA_Formula[aa] = "".join([f"{atom}({n})" for atom, n in atom_comp.items()])
 
 def reset_AA_mass()->np.ndarray:
     """AA mass in np.array with shape (128,)"""
-    AA_ASCII_MASS = np.ones(128)*1e8
+    global AA_ASCII_MASS
     for aa, chem in AA_Formula.items():
         AA_ASCII_MASS[ord(aa)] = calc_mass_from_formula(chem)
     return AA_ASCII_MASS
-
-#: AA mass array with ASCII code, mass of 'A' is AA_ASCII_MASS[ord('A')]
-AA_ASCII_MASS:np.ndarray = reset_AA_mass()
+reset_AA_mass()
 
 def reset_AA_df():
-    global AA_ASCII_MASS
+    global AA_DF
     AA_DF = pd.DataFrame()
     AA_DF['aa'] = [chr(aa) for aa in range(len(AA_ASCII_MASS))]
     AA_DF['formula'] = ['']*len(AA_ASCII_MASS)
@@ -42,23 +57,31 @@ def reset_AA_df():
         formulas.append(formula)
     AA_DF.loc[aa_idxes, 'formula'] = formulas
     AA_DF['mass'] = AA_ASCII_MASS
-    AA_ASCII_MASS = AA_DF.mass.to_numpy()
     return AA_DF
-
-#: 128-len AA dataframe
-AA_DF:pd.DataFrame = reset_AA_df()
-
-# AA to formula dict of dict. For example: {'K': {'C': n, 'O': m, ...}}
-AA_Composition:dict = {}
-for aa, formula, mass in AA_DF.values:
-    AA_Composition[aa] = dict(
-        parse_formula(formula)
-    )
+reset_AA_df()
+
+def reset_AA_Composition():
+    global AA_Composition
+    AA_Composition = {}
+    for aa, formula, mass in AA_DF.values:
+        AA_Composition[aa] = dict(
+            parse_formula(formula)
+        )
+    return AA_Composition
+reset_AA_Composition()
+
+def reset_AA_atoms(atom_replace_dict:typing.Dict = {}):
+    reset_elements()
+    replace_atoms(atom_replace_dict)
+    reset_AA_mass()
+    reset_AA_df()
+    reset_AA_Composition()
 
 def update_an_AA(aa:str, formula:str):
     aa_idx = ord(aa)
     AA_DF.loc[aa_idx,'formula'] = formula
-    AA_DF.loc[aa_idx,'mass'] = calc_mass_from_formula(formula)
+    AA_ASCII_MASS[aa_idx] = calc_mass_from_formula(formula)
+    AA_DF.loc[aa_idx,'mass'] = AA_ASCII_MASS[aa_idx]
     AA_Formula[aa] = formula
     AA_Composition[aa] = dict(parse_formula(formula))
 

diff --git a/alphabase/constants/atom.py b/alphabase/constants/atom.py
@@ -1,6 +1,7 @@
 import os
 import numpy as np
 import numba
+import typing
 
 from alphabase.yaml_utils import load_yaml
 
@@ -89,7 +90,25 @@ def truncate_isotope(
 MASS_H2O:int = None #raise errors if the value is not reset
 MASS_NH3:int = None
 
+def update_atom_infos(new_atom_info:typing.Dict):
+    """
+    Args:
+        new_atom_info (Dict): Example, replacing N with 15N
+          {"N": {
+            "abundance": [0.01,0.99],
+            "mass": [14.00307400443, 15.00010889888],
+          }}
+    """
+    for atom, info in new_atom_info.items():
+        CHEM_INFO_DICT[atom] = info
+
+    reset_elements()
+
 def reset_elements():
+
+    global MASS_C, MASS_H, MASS_O, MASS_N
+    global MASS_H2O, MASS_NH3
+
     for elem, items in CHEM_INFO_DICT.items():
         isotopes = np.array(items['abundance'])
         masses = np.array(items['mass'])
@@ -120,6 +139,13 @@ def reset_elements():
 
             CHEM_ISOTOPE_DIST[elem] = _isos[start:end]
             CHEM_MONO_IDX[elem] = _mono_idx
+
+    MASS_C = CHEM_MONO_MASS['C']
+    MASS_H = CHEM_MONO_MASS['H']
+    MASS_N = CHEM_MONO_MASS['N']
+    MASS_O = CHEM_MONO_MASS['O']
+    MASS_H2O = CHEM_MONO_MASS['H']*2 + CHEM_MONO_MASS['O']
+    MASS_NH3 = CHEM_MONO_MASS['H']*3 + CHEM_MONO_MASS['N']
 
 def load_elem_yaml(yaml_file:str):
     '''Load built-in or user-defined element yaml file. Default yaml is: 
@@ -129,8 +155,6 @@ def load_elem_yaml(yaml_file:str):
     global CHEM_MONO_MASS
     global CHEM_ISOTOPE_DIST
     global CHEM_MONO_IDX
-    global MASS_C, MASS_H, MASS_O, MASS_N
-    global MASS_H2O, MASS_NH3
 
     CHEM_INFO_DICT = load_yaml(yaml_file)
 
@@ -146,13 +170,6 @@ def load_elem_yaml(yaml_file:str):
     )
 
     reset_elements()
-
-    MASS_C = CHEM_MONO_MASS['C']
-    MASS_H = CHEM_MONO_MASS['H']
-    MASS_N = CHEM_MONO_MASS['N']
-    MASS_O = CHEM_MONO_MASS['O']
-    MASS_H2O = CHEM_MONO_MASS['H']*2 + CHEM_MONO_MASS['O']
-    MASS_NH3 = CHEM_MONO_MASS['H']*3 + CHEM_MONO_MASS['N']
 
 load_elem_yaml(
     os.path.join(CONST_FILE_FOLDER,

diff --git a/alphabase/peptide/fragment.py b/alphabase/peptide/fragment.py
@@ -588,10 +588,12 @@ def flatten_fragments(
         input precursor dataframe which contains the frag_start_idx and frag_stop_idx columns
     
     fragment_mz_df : pd.DataFrame
-        input fragment mz dataframe of shape (N, T) which contains N * T fragment mzs
+        input fragment mz dataframe of shape (N, T) which contains N * T fragment mzs.
+        Fragments with mz==0 will be excluded.
     
     fragment_intensity_df : pd.DataFrame
-        input fragment mz dataframe of shape (N, T) which contains N * T fragment mzs
+        input fragment intensity dataframe of shape (N, T) which contains N * T fragment intensities.
+        Could be empty (len==0) to exclude intensity values.
     
     min_fragment_intensity : float, optional
         minimum intensity which should be retained. Defaults to -1
@@ -758,10 +760,12 @@ def compress_fragment_indices(frag_idx):
 
 def remove_unused_fragments(
         precursor_df: pd.DataFrame, 
-        fragment_df_list: Tuple[pd.DataFrame, ...]
+        fragment_df_list: Tuple[pd.DataFrame, ...],
+        frag_start_col:str = 'frag_start_idx',
+        frag_stop_col:str = 'frag_stop_idx',
     ) -> Tuple[pd.DataFrame, Tuple[pd.DataFrame, ...]]:
     """Removes unused fragments of removed precursors, 
-    reannotates the frag_start_idx and frag_stop_idx
+    reannotates the `frag_start_col` and `frag_stop_col`
     
     Parameters
     ----------
@@ -773,19 +777,27 @@ def remove_unused_fragments(
         Multiple fragment dataframes can be provided which will all be sliced in the same way. 
         This allows to slice both the fragment_mz_df and fragment_intensity_df. 
         At least one fragment dataframe needs to be provided. 
+
+    frag_start_col : str, optional
+        Fragment start idx column in `precursor_df`, such as "frag_start_idx" and "peak_start_idx".
+        Defaults to "frag_start_idx".
+
+    frag_stop_col : str, optional
+        Fragment stop idx column in `precursor_df`, such as "frag_stop_idx" and "peak_stop_idx".
+        Defaults to "frag_stop_idx".
     
     Returns
     -------
     pd.DataFrame, List[pd.DataFrame]
         returns the reindexed precursor DataFrame and the sliced fragment DataFrames
     """
 
-    precursor_df = precursor_df.sort_values(['frag_start_idx'], ascending=True)
-    frag_idx = precursor_df[['frag_start_idx','frag_stop_idx']].values
+    precursor_df = precursor_df.sort_values([frag_start_col], ascending=True)
+    frag_idx = precursor_df[[frag_start_col,frag_stop_col]].values
 
     new_frag_idx, fragment_pointer = compress_fragment_indices(frag_idx)
 
-    precursor_df[['frag_start_idx','frag_stop_idx']] = new_frag_idx
+    precursor_df[[frag_start_col,frag_stop_col]] = new_frag_idx
     precursor_df = precursor_df.sort_index()
 
     output_tuple = []

diff --git a/alphabase/peptide/precursor.py b/alphabase/peptide/precursor.py
@@ -1,6 +1,7 @@
 import pandas as pd
 import numpy as np
 import numba
+import typing
 import multiprocessing as mp
 from tqdm import tqdm
 
@@ -486,10 +487,10 @@ def _count_batchify_df(df_group, mp_batch_size):
 def calc_precursor_isotope_mp(
     precursor_df:pd.DataFrame, 
     processes:int=8,
-    mp_batch_size:int=100000,
+    mp_batch_size:int=10000,
     process_bar=None,
     min_right_most_intensity:float=0.2,
-    min_precursor_num_to_run_mp:int=1000,
+    min_precursor_num_to_run_mp:int=10000,
 )->pd.DataFrame:
     """`calc_precursor_isotope` is not that fast for large dataframes, 
     so here we use multiprocessing for faster isotope pattern calculation. 
@@ -547,8 +548,9 @@ def calc_precursor_isotope_mp(
 def calc_precursor_isotope_intensity(
     precursor_df,
     max_isotope = 6, 
-    min_right_most_intensity = 0.001
-    ):
+    min_right_most_intensity = 0.001,
+    normalize:typing.Literal['mono','sum'] = "sum",
+)->pd.DataFrame:
     """Calculate isotope intensity values for precursor_df inplace.
 
     Parameters
@@ -577,28 +579,57 @@ def calc_precursor_isotope_intensity(
 
     precursor_dist = np.zeros((len(precursor_df), max_isotope), dtype=np.float32)
 
+    mono_idxes = np.zeros(len(precursor_df),dtype=np.int32)
+
     for i in range(len(precursor_df)):
 
         row = precursor_df.iloc[i]
         dist, mono = isotope_dist.calc_formula_distribution(
             get_mod_seq_formula(row['sequence'], row['mods'])
         )
         dist[dist <= min_right_most_intensity] = 0.
-        dist = dist / dist.sum()
-        precursor_dist[i] = dist[:max_isotope]
+
+        # the monoisotopic peak must always be included in the i_x columns
+        # after clipping the distribution to max_isotope isotopes
+        mono_left_half_isotope = max_isotope//2
+        mono_right_half_isotope = (
+            mono_left_half_isotope if max_isotope%2==0 
+            else (mono_left_half_isotope+1)
+        )
+        if mono < mono_left_half_isotope:
+            precursor_dist[i] = dist[:max_isotope]
+            mono_idxes[i] = mono
+        elif mono + mono_right_half_isotope >= len(dist):
+            precursor_dist[i] = dist[-max_isotope:]
+            mono_idxes[i] = max_isotope+mono-len(dist)+1
+        else:
+            precursor_dist[i] = dist[
+                mono-mono_left_half_isotope:
+                mono+mono_right_half_isotope
+            ]
+            mono_idxes[i] = mono-mono_left_half_isotope
+
+    if normalize == "sum":
+        precursor_dist /= np.sum(precursor_dist, axis=1, keepdims=True)
+    else:
+        precursor_dist /= precursor_dist[
+            np.arange(len(precursor_dist)), mono_idxes
+        ].reshape(-1,1)
 
     precursor_df[col_names] = precursor_dist
+    precursor_df["mono_isotope_idx"] = mono_idxes
 
     return precursor_df
 
 def calc_precursor_isotope_intensity_mp(
     precursor_df,
     max_isotope = 6,
     min_right_most_intensity = 0.001,
+    normalize:typing.Literal['mono','sum'] = "sum",
     mp_batch_size = 1000,
     mp_process_num = 8,
-    progress_bar = True
-    ):
+    progress_bar = True,
+)->pd.DataFrame:
 
     """Calculate isotope intensity values for precursor_df using multiprocessing.
 
@@ -639,7 +670,8 @@ def calc_precursor_isotope_intensity_mp(
             partial(
                 calc_precursor_isotope_intensity,
                 max_isotope=max_isotope,
-                min_right_most_intensity=min_right_most_intensity
+                min_right_most_intensity=min_right_most_intensity,
+                normalize=normalize,
             ), _batchify_df(df_group, mp_batch_size)
         )
 

diff --git a/alphabase/spectral_library/base.py b/alphabase/spectral_library/base.py
@@ -326,7 +326,7 @@ def calc_precursor_isotope_intensity(self,
         multiprocessing : bool=True,
         max_isotope = 6,
         min_right_most_intensity = 0.001,
-        mp_batch_size = 1000,
+        mp_batch_size = 10000,
         mp_process_num = 8
         ):
         """