Merge pull request #107 from MannLabs/development

Development
MannLabs · Jul 17, 2023 · 525529a · 525529a
2 parents 5835683 + 4120205
commit 525529a
Show file tree

Hide file tree

Showing 45 changed files with 2,712 additions and 1,535 deletions.
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 1.0.2
+current_version = 1.0.3
 commit = True
 tag = False
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,21 @@
 
 Follow the changelog format from https://keepachangelog.com/en/1.0.0/.
 
+## 1.1.0 - 2023.xx.xx
+
+### Added
+
+- Separate `library_reader_base` in `psm_reader.yaml` config for `LibraryReaderBase`.
+
+### Changed
+
+- `mod@Any N-term` and `mod@Any_N-term` are both supported; `Any_N-term` is preferred because it contains no spaces and is therefore better suited to command-line tools. The same applies to `mod@Protein N-term`, `mod@Any C-term`, and `mod@Protein C-term`.
+- Enable customizing dtypes of peak mz and intensity values.
+- Renamed `SWATHLibraryReader` to `LibraryReaderBase` in `alphabase.spectral_library.reader`.
+- New `LibraryReaderBase._get_fragment_intensity` implementation which is called at the end of the parsing process in `PSMReaderBase._post_process`. This allows it to operate only on the translated column names. By default, all non-fragment columns will be grouped and part of the final output.
+- `SpecLibBase.copy()` for copying spectral libraries including all attributes.
+- `SpecLibBase.append()` for appending spectral libraries while maintaining the fragment index mapping.
+
 ## 1.0.2 - 2023.02.10
 
 ### Changed

diff --git a/alphabase/__init__.py b/alphabase/__init__.py
@@ -2,7 +2,7 @@
 
 
 __project__ = "alphabase"
-__version__ = "1.0.2"
+__version__ = "1.0.3"
 __license__ = "Apache"
 __description__ = "An infrastructure Python package of the AlphaX ecosystem"
 __author__ = "Mann Labs"
@@ -39,5 +39,5 @@
     "PyPi": "https://pypi.org/project/alphabase/",
 }
 __extra_requirements__ = {
-    "development": "requirements_development.txt",
+    "development": "extra_requirements/development.txt",
 }
diff --git a/alphabase/constants/_const.py b/alphabase/constants/_const.py
@@ -1,6 +1,23 @@
 import os
+import numpy as np
+
+from alphabase.yaml_utils import load_yaml
 
 CONST_FILE_FOLDER = os.path.join(
     os.path.dirname(__file__),
-    'const_files'
-)
+    "const_files"
+)
+
+common_const_dict:dict = load_yaml(
+    os.path.join(CONST_FILE_FOLDER, "common_constants.yaml")
+)
+
+# Only applied in peak and fragment dataframes to save RAM. 
+# Using float32 still keeps 0.1 ppm precision in any value range.
+# Default float dtype is "float64" for value calculation and other scenarios.
+PEAK_MZ_DTYPE:np.dtype = np.dtype(
+    common_const_dict["PEAK_MZ_DTYPE"]
+).type
+PEAK_INTENSITY_DTYPE:np.dtype = np.dtype(
+    common_const_dict["PEAK_INTENSITY_DTYPE"]
+).type
diff --git a/alphabase/constants/atom.py b/alphabase/constants/atom.py
@@ -4,10 +4,9 @@
 
 from alphabase.yaml_utils import load_yaml
 
-from alphabase.constants._const import CONST_FILE_FOLDER
-
-common_const_dict:dict = load_yaml(
-    os.path.join(CONST_FILE_FOLDER, 'common_constants.yaml')
+from alphabase.constants._const import (
+    CONST_FILE_FOLDER,
+    common_const_dict
 )
 
 MASS_PROTON:float = common_const_dict['MASS_PROTON']

diff --git a/alphabase/constants/const_files/common_constants.yaml b/alphabase/constants/const_files/common_constants.yaml
@@ -6,4 +6,8 @@ MOBILITY:
   # Mason-Schamp equation of Bruker.
   CCS_IM_COEF: 1059.62245
   # 28 is the mass of N(2), the default gas in IM bruker
-  IM_GAS_MASS: 28.0
+  IM_GAS_MASS: 28.0
+
+# Only applied in peak/fragment dataframes to save RAM
+PEAK_MZ_DTYPE: float32
+PEAK_INTENSITY_DTYPE: float32
diff --git a/alphabase/constants/const_files/psm_reader.yaml b/alphabase/constants/const_files/psm_reader.yaml
@@ -20,6 +20,7 @@ alphapept:
     'Phospho@T': 'pT'
     'Phospho@Y': 'pY'
     'Acetyl@Protein N-term': 'a'
+
 maxquant:
   reader_type: maxquant
   rt_unit: minute
@@ -45,6 +46,7 @@ maxquant:
     'genes': ['Gene Names','Gene names']
     'decoy': 'Reverse'
     'intensity': 'Intensity'
+
   modification_mapping:
     'Acetyl@Protein N-term': 
       - '_(Acetyl (Protein N-term))'
@@ -74,6 +76,7 @@ maxquant:
     'Deamidated@N': ['N(Deamidation (NQ))','N(de)']
     'Deamidated@Q': ['Q(Deamidation (NQ))','Q(de)']
     'GlyGly@K': ['K(GlyGly (K))', 'K(gl)']
+
 pfind:
   reader_type: pfind
   rt_unit: minute
@@ -117,7 +120,9 @@ msfragger_pepxml:
     - 'Glu->pyro-Glu@E^Any N-term'
     - 'Gln->pyro-Glu@Q^Any N-term'
     - 'Dimethyl@K' # Any N-term is not needed here as it will be inferred on-the-fly
+    - 'Methyl@E' #an example of a PTM that can be C-term
   mod_mass_tol: 0.1 # Da
+
 diann:
   reader_type: diann
   rt_unit: minute
@@ -148,6 +153,7 @@ spectronaut_report:
     'uniprot_ids': 'PG.UniProtIds'
     'charge': 'charge'
   modification_mapping: 'maxquant'
+
 spectronaut:
   reader_type: spectronaut
   rt_unit: irt
@@ -171,4 +177,35 @@ spectronaut:
     'uniprot_ids': ['UniProtIds','UniProtID','UniprotId']
     'genes': ['Genes','Gene','GeneName','GeneNames']
   modification_mapping: 'maxquant'
-
+
+library_reader_base:
+  reader_type: library_reader_base
+  rt_unit: irt
+  fixed_C57: False
+  csv_sep: "\t"
+  mod_seq_columns:
+    - 'ModifiedPeptideSequence'
+    - 'ModifiedPeptide'
+    - 'ModifiedSequence'
+    - 'FullUniModPeptideName'
+    - 'LabeledSequence'
+    - 'FullUniModPeptideName'
+  column_mapping:
+    'raw_name': 'ReferenceRun'
+    'sequence': ['PeptideSequence', 'StrippedPeptide']
+    'modified_sequence': ['ModifiedPeptideSequence','ModifiedPeptide']
+    'charge': 'PrecursorCharge'
+    'rt': ['RT','iRT','Tr_recalibrated','RetentionTime','NormalizedRetentionTime']
+    'ccs': 'CCS'
+    'precursor_mz': 'PrecursorMz'
+    'mobility': ['Mobility','IonMobility','PrecursorIonMobility']
+    'proteins': ['ProteinId','ProteinID','ProteinName','Protein Name',]
+    'uniprot_ids': ['UniProtIds','UniProtID','UniprotId']
+    'genes': ['GeneName','Genes','Gene',]
+    'fragment_intensity': ['LibraryIntensity','RelativeIntensity', 'RelativeFragmentIntensity', 'RelativeFragmentIonIntensity']
+    'fragment_mz': ['ProductMz']
+    'fragment_type': ['FragmentType', 'FragmentIonType', 'ProductType', 'ProductIonType']
+    'fragment_charge' : ['FragmentCharge', 'FragmentIonCharge', 'ProductCharge', 'ProductIonCharge']
+    'fragment_series': ['FragmentSeriesNumber','FragmentNumber']
+    'fragment_loss_type': ['FragmentLossType', 'FragmentIonLossType', 'ProductLossType', 'ProductIonLossType']
+  modification_mapping: 'maxquant'
diff --git a/alphabase/constants/modification.py b/alphabase/constants/modification.py
@@ -84,6 +84,11 @@ def load_mod_df(
 ):
     global MOD_DF
     MOD_DF = pd.read_table(tsv)
+    _df = MOD_DF[MOD_DF.mod_name.str.contains(' ', regex=False)].copy()
+    _df["mod_name"] = MOD_DF.mod_name.str.replace(' ', '_', regex=False)
+    MOD_DF = pd.concat(
+        [MOD_DF, _df], ignore_index=True
+    ).drop_duplicates("mod_name")
     MOD_DF.fillna('',inplace=True)
     MOD_DF['unimod_id'] = MOD_DF.unimod_id.astype(np.int32)
     MOD_DF.set_index('mod_name', drop=False, inplace=True)