Merge pull request #218 from MannLabs/development

Development
MannLabs · Sep 12, 2024 · 7795e13 · 7795e13
2 parents ba154aa + 7b20a84
commit 7795e13
Show file tree

Hide file tree

Showing 28 changed files with 2,342 additions and 187 deletions.
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 1.3.0
+current_version = 1.4.0
 commit = True
 tag = False
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?

diff --git a/alphabase/__init__.py b/alphabase/__init__.py
@@ -2,7 +2,7 @@
 
 
 __project__ = "alphabase"
-__version__ = "1.3.0"
+__version__ = "1.4.0"
 __license__ = "Apache"
 __description__ = "An infrastructure Python package of the AlphaX ecosystem"
 __author__ = "Mann Labs"

diff --git a/alphabase/constants/aa.py b/alphabase/constants/aa.py
@@ -11,12 +11,14 @@
     parse_formula,
     reset_elements,
 )
-from alphabase.yaml_utils import load_yaml
 
 # We use all 128 ASCII code to represent amino acids for flexible extensions in the future.
 # The amino acid masses are stored in 128-lengh array :py:data:`AA_ASCII_MASS`.
-# If an ASCII code is not in `AA_Formula`, the mass will be set as a large value to disable MS search.
-AA_Formula: dict = load_yaml(os.path.join(CONST_FILE_FOLDER, "amino_acid.yaml"))
+# If an ASCII code is not in `aa_formula`, the mass will be set as a large value to disable MS search.
+aa_formula: pd.DataFrame = pd.read_csv(
+    os.path.join(CONST_FILE_FOLDER, "amino_acid.tsv"), sep="\t", index_col=0
+)
+
 #: AA mass array with ASCII code, mass of 'A' is AA_ASCII_MASS[ord('A')]
 AA_ASCII_MASS: np.ndarray = np.ones(128) * 1e8
 
@@ -28,20 +30,23 @@
 
 
 def replace_atoms(atom_replace_dict: typing.Dict):
-    for aa, formula in list(AA_Formula.items()):
+    for aa, row in aa_formula.iterrows():
+        formula = row["formula"]
         atom_comp = dict(parse_formula(formula))
         for atom_from, atom_to in atom_replace_dict.items():
             if atom_from in atom_comp:
                 atom_comp[atom_to] = atom_comp[atom_from]
                 del atom_comp[atom_from]
-        AA_Formula[aa] = "".join([f"{atom}({n})" for atom, n in atom_comp.items()])
+        aa_formula.loc[aa, "formula"] = "".join(
+            [f"{atom}({n})" for atom, n in atom_comp.items()]
+        )
 
 
 def reset_AA_mass() -> np.ndarray:
     """AA mass in np.array with shape (128,)"""
     global AA_ASCII_MASS
-    for aa, chem in AA_Formula.items():
-        AA_ASCII_MASS[ord(aa)] = calc_mass_from_formula(chem)
+    for aa, row in aa_formula.iterrows():
+        AA_ASCII_MASS[ord(aa)] = calc_mass_from_formula(row["formula"])
     return AA_ASCII_MASS
 
 
@@ -51,15 +56,14 @@ def reset_AA_mass() -> np.ndarray:
 def reset_AA_df():
     global AA_DF
     AA_DF = pd.DataFrame()
-    AA_DF["aa"] = [chr(aa) for aa in range(len(AA_ASCII_MASS))]
-    AA_DF["formula"] = [""] * len(AA_ASCII_MASS)
-    aa_idxes = []
-    formulas = []
-    for aa, formula in AA_Formula.items():
-        aa_idxes.append(ord(aa))
-        formulas.append(formula)
-    AA_DF.loc[aa_idxes, "formula"] = formulas
+    num_rows = len(AA_ASCII_MASS)
+    AA_DF["aa"] = [chr(aa) for aa in range(num_rows)]
+    AA_DF["formula"] = [""] * num_rows
+    AA_DF["smiles"] = [""] * num_rows
     AA_DF["mass"] = AA_ASCII_MASS
+    for aa, row in aa_formula.iterrows():
+        AA_DF.loc[ord(aa), "formula"] = row["formula"]
+        AA_DF.loc[ord(aa), "smiles"] = row["smiles"]
     return AA_DF
 
 
@@ -69,8 +73,8 @@ def reset_AA_df():
 def reset_AA_Composition():
     global AA_Composition
     AA_Composition = {}
-    for aa, formula, _mass in AA_DF.values:
-        AA_Composition[aa] = dict(parse_formula(formula))
+    for aa, row in aa_formula.iterrows():
+        AA_Composition[aa] = dict(parse_formula(row["formula"]))
     return AA_Composition
 
 
@@ -85,12 +89,14 @@ def reset_AA_atoms(atom_replace_dict: typing.Dict = {}):
     reset_AA_Composition()
 
 
-def update_an_AA(aa: str, formula: str):
+def update_an_AA(aa: str, formula: str, smiles: str = ""):
     aa_idx = ord(aa)
+    aa_formula.loc[aa, "formula"] = formula
+    aa_formula.loc[aa, "smiles"] = smiles
     AA_DF.loc[aa_idx, "formula"] = formula
+    AA_DF.loc[aa_idx, "smiles"] = smiles
     AA_ASCII_MASS[aa_idx] = calc_mass_from_formula(formula)
     AA_DF.loc[aa_idx, "mass"] = AA_ASCII_MASS[aa_idx]
-    AA_Formula[aa] = formula
     AA_Composition[aa] = dict(parse_formula(formula))
 
 

diff --git a/alphabase/constants/atom.py b/alphabase/constants/atom.py
@@ -1,8 +1,12 @@
 import os
+import re
 import typing
+from collections import defaultdict
 
 import numba
 import numpy as np
+from rdkit import Chem
+from rdkit.Chem import rdMolDescriptors
 
 from alphabase.constants._const import CONST_FILE_FOLDER, common_const_dict
 from alphabase.yaml_utils import load_yaml
@@ -200,3 +204,215 @@ def calc_mass_from_formula(formula: str):
         mass of the formula
     """
     return np.sum([CHEM_MONO_MASS[elem] * n for elem, n in parse_formula(formula)])
+
+
+class ChemicalCompositonFormula:
+    """
+    Initialize the ChemicalCompositonFormula with a given formula.
+
+    Parameters
+    ----------
+    formula : str
+        The chemical formula as a string.
+
+    Returns
+    -------
+    None
+    """
+
+    def __init__(self, formula=None):
+        self.elements = (
+            defaultdict(int) if formula is None else self._parse_formula(formula)
+        )
+
+    @classmethod
+    def from_smiles(cls, smiles: str) -> "ChemicalCompositonFormula":
+        """
+        Create a ChemicalCompositonFormula instance from a SMILES string.
+
+        Parameters
+        ----------
+        smiles : str
+            The SMILES representation of the molecule.
+
+        Returns
+        -------
+        ChemicalCompositonFormula
+            An instance of the class based on the SMILES string.
+
+        Raises
+        ------
+        ValueError
+            If the SMILES string is invalid and can't be converted to an RDKit molecule.
+        """
+        mol = Chem.MolFromSmiles(smiles)
+        if not mol:
+            raise ValueError(f"Invalid RDKit molecule: {smiles}")
+        formula = rdMolDescriptors.CalcMolFormula(
+            mol, separateIsotopes=True, abbreviateHIsotopes=False
+        )
+        formula = formula.replace("[1H]", "H")
+        return cls._from_rdkit_formula(formula)
+
+    @classmethod
+    def _from_rdkit_formula(cls, formula: str) -> "ChemicalCompositonFormula":
+        """
+        Create a ChemicalCompositonFormula instance from an RDKit formula.
+
+        Parameters
+        ----------
+        formula : str
+            The chemical formula as generated by RDKit.
+
+        Returns
+        -------
+        ChemicalCompositonFormula
+            An instance of the class based on the RDKit formula.
+        """
+        instance = cls.__new__(cls)
+        instance.elements = instance._parse_rdkit_formula(formula)
+        return instance
+
+    def _parse_formula(self, formula) -> dict:
+        """
+        Parse a chemical formula string into a dictionary of elements and their counts.
+
+        Parameters
+        ----------
+        formula : str
+            The chemical formula to parse.
+
+        Returns
+        -------
+        dict
+            A dictionary with elements as keys and their counts as values.
+        """
+        # Expected pattern: (\d+)?: optional isotope number, ([A-Z][a-z]*): element symbol, (?:\(([-]?\d+)\))?: optional count in parentheses
+        # Example: 13C(2)H(3)O(-1) -> [('13', 'C', '2'), ('', 'H', '3'), ('', 'O', '-1')]
+        pattern = r"(\d+)?([A-Z][a-z]*)(?:\(([-]?\d+)\))?"
+        matches = re.findall(pattern, formula)
+        element_counts = defaultdict(int)
+
+        for isotope, element, count in matches:
+            if isotope:
+                element = f"{isotope}{element}"
+            count = int(count) if count else 1
+            element_counts[element] += count
+
+        self._validate_atoms(element_counts)
+        return element_counts
+
+    def _parse_rdkit_formula(self, formula: str) -> dict:
+        """
+        Parse an RDKit-generated formula string into a dictionary of elements and their counts.
+
+        Parameters
+        ----------
+        formula : str
+            The RDKit-generated chemical formula to parse.
+
+        Returns
+        -------
+        dict
+            A dictionary with elements as keys and their counts as values.
+        """
+        # Expected pattern: (\[(\d+)([A-Z][a-z]*)\]|([A-Z][a-z]*)): isotope in square brackets or element symbol, followed by (\d*): optional count
+        # Example: [13C]C2H5OH -> [('[13C]', '13', 'C', '', ''), ('C', '', '', 'C', '2'), ('H', '', '', 'H', '5'), ('O', '', '', 'O', ''), ('H', '', '', 'H', '')]
+        pattern = r"(\[(\d+)([A-Z][a-z]*)\]|([A-Z][a-z]*))(\d*)"
+        matches = re.findall(pattern, formula)
+        element_counts = defaultdict(int)
+
+        for match in matches:
+            count = int(match[4]) if match[4] else 1
+            if match[1]:  # noqa: SIM108
+                # Isotope, see 0th element in the example above
+                element = f"{match[1]}{match[2]}"
+            else:
+                # Regular element, see the rest in the example above
+                element = match[3]
+            element_counts[element] += count
+
+        self._validate_atoms(element_counts)
+        return element_counts
+
+    def _validate_atoms(self, element_counts):
+        """
+        Validate the elements in the formula.
+
+        Parameters
+        ----------
+        element_counts : dict
+            The elements and their counts in the formula.
+
+        Raises
+        ------
+        ValueError
+            If the formula contains an unknown element.
+        """
+        for element in element_counts:
+            if element not in CHEM_MONO_MASS:
+                raise ValueError(f"Unknown element: {element}")
+
+    def __str__(self):
+        """
+        Return a string representation of the chemical formula.
+
+        Returns
+        -------
+        str
+            The chemical formula as a string.
+        """
+        return "".join(
+            f"{element}({count})"
+            for element, count in sorted(self.elements.items())
+            if count != 0
+        )
+
+    def __repr__(self):
+        """
+        Return a string representation of the ChemicalCompositonFormula instance.
+
+        Returns
+        -------
+        str
+            A string representation of the instance.
+        """
+        return f"ChemicalCompositonFormula('{self.__str__()}')"
+
+    def __add__(self, other):
+        """
+        Add two ChemicalCompositonFormula instances.
+
+        Parameters
+        ----------
+        other : ChemicalCompositonFormula
+            The other instance to add.
+
+        Returns
+        -------
+        ChemicalCompositonFormula
+            A new instance representing the sum of the two formulas.
+        """
+        result = ChemicalCompositonFormula()
+        for element in set(self.elements.keys()) | set(other.elements.keys()):
+            result.elements[element] = self.elements[element] + other.elements[element]
+        return result
+
+    def __sub__(self, other):
+        """
+        Subtract one ChemicalCompositonFormula instance from another.
+
+        Parameters
+        ----------
+        other : ChemicalCompositonFormula
+            The instance to subtract.
+
+        Returns
+        -------
+        ChemicalCompositonFormula
+            A new instance representing the difference of the two formulas.
+        """
+        result = ChemicalCompositonFormula()
+        for element in set(self.elements.keys()) | set(other.elements.keys()):
+            result.elements[element] = self.elements[element] - other.elements[element]
+        return result
diff --git a/alphabase/constants/const_files/amino_acid.tsv b/alphabase/constants/const_files/amino_acid.tsv
@@ -0,0 +1,30 @@
+	formula	smiles
+A	C(3)H(5)N(1)O(1)S(0)	N([Xe])([Xe])[C@@]([H])(C)C(=O)[Rn]
+B	C(1000000)
+C	C(3)H(5)N(1)O(1)S(1)	N([Xe])([Xe])[C@@]([H])(CS)C(=O)[Rn]
+D	C(4)H(5)N(1)O(3)S(0)	N([Xe])([Xe])[C@@]([H])(CC(=O)O)C(=O)[Rn]
+E	C(5)H(7)N(1)O(3)S(0)	N([Xe])([Xe])[C@@]([H])(CCC(=O)O)C(=O)[Rn]
+F	C(9)H(9)N(1)O(1)S(0)	N([Xe])([Xe])[C@@]([H])(Cc1ccccc1)C(=O)[Rn]
+G	C(2)H(3)N(1)O(1)S(0)	N([Xe])([Xe])CC(=O)[Rn]
+H	C(6)H(7)N(3)O(1)S(0)	N([Xe])([Xe])[C@@]([H])(CC1=CN=C-N1)C(=O)[Rn]
+I	C(6)H(11)N(1)O(1)S(0)	N([Xe])([Xe])[C@@]([H])([C@]([H])(CC)C)C(=O)[Rn]
+J	C(6)H(11)N(1)O(1)S(0)
+K	C(6)H(12)N(2)O(1)S(0)	N([Xe])([Xe])[C@@]([H])(CCCCN)C(=O)[Rn]
+L	C(6)H(11)N(1)O(1)S(0)	N([Xe])([Xe])[C@@]([H])(CC(C)C)C(=O)[Rn]
+M	C(5)H(9)N(1)O(1)S(1)	N([Xe])([Xe])[C@@]([H])(CCSC)C(=O)[Rn]
+N	C(4)H(6)N(2)O(2)S(0)	N([Xe])([Xe])[C@@]([H])(CC(=O)N)C(=O)[Rn]
+O	C(12)H(19)N(3)O(2)	C[C@@H]1CC=N[C@H]1C(=O)NCCCC[C@@H](C(=O)[Rn])N([Xe])([Xe])
+P	C(5)H(7)N(1)O(1)S(0)	N1([Xe])[C@@]([H])(CCC1)C(=O)[Rn]
+Q	C(5)H(8)N(2)O(2)S(0)	N([Xe])([Xe])[C@@]([H])(CCC(=O)N)C(=O)[Rn]
+R	C(6)H(12)N(4)O(1)S(0)	N([Xe])([Xe])[C@@]([H])(CCCNC(=N)N)C(=O)[Rn]
+S	C(3)H(5)N(1)O(2)S(0)	N([Xe])([Xe])[C@@]([H])(CO)C(=O)[Rn]
+T	C(4)H(7)N(1)O(2)S(0)	N([Xe])([Xe])[C@@]([H])([C@]([H])(O)C)C(=O)[Rn]
+U	C(3)H(5)N(1)O(1)Se(1)	N([Xe])([Xe])[C@@]([H])(C[Se][H])C(=O)[Rn]
+V	C(5)H(9)N(1)O(1)S(0)	N([Xe])([Xe])[C@@]([H])(C(C)C)C(=O)[Rn]
+W	C(11)H(10)N(2)O(1)S(0)	N([Xe])([Xe])[C@@]([H])(CC(=CN2)C1=C2C=CC=C1)C(=O)[Rn]
+X	C(1000000)
+Y	C(9)H(9)N(1)O(2)S(0)	N([Xe])([Xe])[C@@]([H])(Cc1ccc(O)cc1)C(=O)[Rn]
+Z	C(1000000)
+s	C(3)H(5)N(1)O(2)S(0)
+t	C(4)H(8)N(1)O(5)P(1)
+y	C(9)H(9)N(1)O(2)S(0)