Skip to content

Commit

Permalink
Merge pull request #218 from MannLabs/development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
jalew188 authored Sep 12, 2024
2 parents ba154aa + 7b20a84 commit 7795e13
Show file tree
Hide file tree
Showing 28 changed files with 2,342 additions and 187 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 1.3.0
current_version = 1.4.0
commit = True
tag = False
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
Expand Down
2 changes: 1 addition & 1 deletion alphabase/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@


__project__ = "alphabase"
__version__ = "1.3.0"
__version__ = "1.4.0"
__license__ = "Apache"
__description__ = "An infrastructure Python package of the AlphaX ecosystem"
__author__ = "Mann Labs"
Expand Down
44 changes: 25 additions & 19 deletions alphabase/constants/aa.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,14 @@
parse_formula,
reset_elements,
)
from alphabase.yaml_utils import load_yaml

# We use all 128 ASCII code to represent amino acids for flexible extensions in the future.
# The amino acid masses are stored in 128-lengh array :py:data:`AA_ASCII_MASS`.
# If an ASCII code is not in `AA_Formula`, the mass will be set as a large value to disable MS search.
AA_Formula: dict = load_yaml(os.path.join(CONST_FILE_FOLDER, "amino_acid.yaml"))
# If an ASCII code is not in `aa_formula`, the mass will be set as a large value to disable MS search.
aa_formula: pd.DataFrame = pd.read_csv(
os.path.join(CONST_FILE_FOLDER, "amino_acid.tsv"), sep="\t", index_col=0
)

#: AA mass array with ASCII code, mass of 'A' is AA_ASCII_MASS[ord('A')]
AA_ASCII_MASS: np.ndarray = np.ones(128) * 1e8

Expand All @@ -28,20 +30,23 @@


def replace_atoms(atom_replace_dict: typing.Dict):
for aa, formula in list(AA_Formula.items()):
for aa, row in aa_formula.iterrows():
formula = row["formula"]
atom_comp = dict(parse_formula(formula))
for atom_from, atom_to in atom_replace_dict.items():
if atom_from in atom_comp:
atom_comp[atom_to] = atom_comp[atom_from]
del atom_comp[atom_from]
AA_Formula[aa] = "".join([f"{atom}({n})" for atom, n in atom_comp.items()])
aa_formula.loc[aa, "formula"] = "".join(
[f"{atom}({n})" for atom, n in atom_comp.items()]
)


def reset_AA_mass() -> np.ndarray:
"""AA mass in np.array with shape (128,)"""
global AA_ASCII_MASS
for aa, chem in AA_Formula.items():
AA_ASCII_MASS[ord(aa)] = calc_mass_from_formula(chem)
for aa, row in aa_formula.iterrows():
AA_ASCII_MASS[ord(aa)] = calc_mass_from_formula(row["formula"])
return AA_ASCII_MASS


Expand All @@ -51,15 +56,14 @@ def reset_AA_mass() -> np.ndarray:
def reset_AA_df():
global AA_DF
AA_DF = pd.DataFrame()
AA_DF["aa"] = [chr(aa) for aa in range(len(AA_ASCII_MASS))]
AA_DF["formula"] = [""] * len(AA_ASCII_MASS)
aa_idxes = []
formulas = []
for aa, formula in AA_Formula.items():
aa_idxes.append(ord(aa))
formulas.append(formula)
AA_DF.loc[aa_idxes, "formula"] = formulas
num_rows = len(AA_ASCII_MASS)
AA_DF["aa"] = [chr(aa) for aa in range(num_rows)]
AA_DF["formula"] = [""] * num_rows
AA_DF["smiles"] = [""] * num_rows
AA_DF["mass"] = AA_ASCII_MASS
for aa, row in aa_formula.iterrows():
AA_DF.loc[ord(aa), "formula"] = row["formula"]
AA_DF.loc[ord(aa), "smiles"] = row["smiles"]
return AA_DF


Expand All @@ -69,8 +73,8 @@ def reset_AA_df():
def reset_AA_Composition():
global AA_Composition
AA_Composition = {}
for aa, formula, _mass in AA_DF.values:
AA_Composition[aa] = dict(parse_formula(formula))
for aa, row in aa_formula.iterrows():
AA_Composition[aa] = dict(parse_formula(row["formula"]))
return AA_Composition


Expand All @@ -85,12 +89,14 @@ def reset_AA_atoms(atom_replace_dict: typing.Dict = {}):
reset_AA_Composition()


def update_an_AA(aa: str, formula: str):
def update_an_AA(aa: str, formula: str, smiles: str = ""):
aa_idx = ord(aa)
aa_formula.loc[aa, "formula"] = formula
aa_formula.loc[aa, "smiles"] = smiles
AA_DF.loc[aa_idx, "formula"] = formula
AA_DF.loc[aa_idx, "smiles"] = smiles
AA_ASCII_MASS[aa_idx] = calc_mass_from_formula(formula)
AA_DF.loc[aa_idx, "mass"] = AA_ASCII_MASS[aa_idx]
AA_Formula[aa] = formula
AA_Composition[aa] = dict(parse_formula(formula))


Expand Down
216 changes: 216 additions & 0 deletions alphabase/constants/atom.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
import os
import re
import typing
from collections import defaultdict

import numba
import numpy as np
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

from alphabase.constants._const import CONST_FILE_FOLDER, common_const_dict
from alphabase.yaml_utils import load_yaml
Expand Down Expand Up @@ -200,3 +204,215 @@ def calc_mass_from_formula(formula: str):
mass of the formula
"""
return np.sum([CHEM_MONO_MASS[elem] * n for elem, n in parse_formula(formula)])


class ChemicalCompositonFormula:
"""
Initialize the ChemicalCompositonFormula with a given formula.
Parameters
----------
formula : str
The chemical formula as a string.
Returns
-------
None
"""

def __init__(self, formula=None):
self.elements = (
defaultdict(int) if formula is None else self._parse_formula(formula)
)

@classmethod
def from_smiles(cls, smiles: str) -> "ChemicalCompositonFormula":
"""
Create a ChemicalCompositonFormula instance from a SMILES string.
Parameters
----------
smiles : str
The SMILES representation of the molecule.
Returns
-------
ChemicalCompositonFormula
An instance of the class based on the SMILES string.
Raises
------
ValueError
If the SMILES string is invalid and can't be converted to an RDKit molecule.
"""
mol = Chem.MolFromSmiles(smiles)
if not mol:
raise ValueError(f"Invalid RDKit molecule: {smiles}")
formula = rdMolDescriptors.CalcMolFormula(
mol, separateIsotopes=True, abbreviateHIsotopes=False
)
formula = formula.replace("[1H]", "H")
return cls._from_rdkit_formula(formula)

@classmethod
def _from_rdkit_formula(cls, formula: str) -> "ChemicalCompositonFormula":
"""
Create a ChemicalCompositonFormula instance from an RDKit formula.
Parameters
----------
formula : str
The chemical formula as generated by RDKit.
Returns
-------
ChemicalCompositonFormula
An instance of the class based on the RDKit formula.
"""
instance = cls.__new__(cls)
instance.elements = instance._parse_rdkit_formula(formula)
return instance

def _parse_formula(self, formula) -> dict:
"""
Parse a chemical formula string into a dictionary of elements and their counts.
Parameters
----------
formula : str
The chemical formula to parse.
Returns
-------
dict
A dictionary with elements as keys and their counts as values.
"""
# Expected pattern: (\d+)?: optional isotope number, ([A-Z][a-z]*): element symbol, (?:\(([-]?\d+)\))?: optional count in parentheses
# Example: 13C(2)H(3)O(-1) -> [('13', 'C', '2'), ('', 'H', '3'), ('', 'O', '-1')]
pattern = r"(\d+)?([A-Z][a-z]*)(?:\(([-]?\d+)\))?"
matches = re.findall(pattern, formula)
element_counts = defaultdict(int)

for isotope, element, count in matches:
if isotope:
element = f"{isotope}{element}"
count = int(count) if count else 1
element_counts[element] += count

self._validate_atoms(element_counts)
return element_counts

def _parse_rdkit_formula(self, formula: str) -> dict:
"""
Parse an RDKit-generated formula string into a dictionary of elements and their counts.
Parameters
----------
formula : str
The RDKit-generated chemical formula to parse.
Returns
-------
dict
A dictionary with elements as keys and their counts as values.
"""
# Expected pattern: (\[(\d+)([A-Z][a-z]*)\]|([A-Z][a-z]*)): isotope in square brackets or element symbol, followed by (\d*): optional count
# Example: [13C]C2H5OH -> [('[13C]', '13', 'C', '', ''), ('C', '', '', 'C', '2'), ('H', '', '', 'H', '5'), ('O', '', '', 'O', ''), ('H', '', '', 'H', '')]
pattern = r"(\[(\d+)([A-Z][a-z]*)\]|([A-Z][a-z]*))(\d*)"
matches = re.findall(pattern, formula)
element_counts = defaultdict(int)

for match in matches:
count = int(match[4]) if match[4] else 1
if match[1]: # noqa: SIM108
# Isotope, see 0th element in the example above
element = f"{match[1]}{match[2]}"
else:
# Regular element, see the rest in the example above
element = match[3]
element_counts[element] += count

self._validate_atoms(element_counts)
return element_counts

def _validate_atoms(self, element_counts):
"""
Validate the elements in the formula.
Parameters
----------
element_counts : dict
The elements and their counts in the formula.
Raises
------
ValueError
If the formula contains an unknown element.
"""
for element in element_counts:
if element not in CHEM_MONO_MASS:
raise ValueError(f"Unknown element: {element}")

def __str__(self):
"""
Return a string representation of the chemical formula.
Returns
-------
str
The chemical formula as a string.
"""
return "".join(
f"{element}({count})"
for element, count in sorted(self.elements.items())
if count != 0
)

def __repr__(self):
"""
Return a string representation of the ChemicalCompositonFormula instance.
Returns
-------
str
A string representation of the instance.
"""
return f"ChemicalCompositonFormula('{self.__str__()}')"

def __add__(self, other):
"""
Add two ChemicalCompositonFormula instances.
Parameters
----------
other : ChemicalCompositonFormula
The other instance to add.
Returns
-------
ChemicalCompositonFormula
A new instance representing the sum of the two formulas.
"""
result = ChemicalCompositonFormula()
for element in set(self.elements.keys()) | set(other.elements.keys()):
result.elements[element] = self.elements[element] + other.elements[element]
return result

def __sub__(self, other):
"""
Subtract one ChemicalCompositonFormula instance from another.
Parameters
----------
other : ChemicalCompositonFormula
The instance to subtract.
Returns
-------
ChemicalCompositonFormula
A new instance representing the difference of the two formulas.
"""
result = ChemicalCompositonFormula()
for element in set(self.elements.keys()) | set(other.elements.keys()):
result.elements[element] = self.elements[element] - other.elements[element]
return result
30 changes: 30 additions & 0 deletions alphabase/constants/const_files/amino_acid.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
formula smiles
A C(3)H(5)N(1)O(1)S(0) N([Xe])([Xe])[C@@]([H])(C)C(=O)[Rn]
B C(1000000)
C C(3)H(5)N(1)O(1)S(1) N([Xe])([Xe])[C@@]([H])(CS)C(=O)[Rn]
D C(4)H(5)N(1)O(3)S(0) N([Xe])([Xe])[C@@]([H])(CC(=O)O)C(=O)[Rn]
E C(5)H(7)N(1)O(3)S(0) N([Xe])([Xe])[C@@]([H])(CCC(=O)O)C(=O)[Rn]
F C(9)H(9)N(1)O(1)S(0) N([Xe])([Xe])[C@@]([H])(Cc1ccccc1)C(=O)[Rn]
G C(2)H(3)N(1)O(1)S(0) N([Xe])([Xe])CC(=O)[Rn]
H C(6)H(7)N(3)O(1)S(0) N([Xe])([Xe])[C@@]([H])(CC1=CN=C-N1)C(=O)[Rn]
I C(6)H(11)N(1)O(1)S(0) N([Xe])([Xe])[C@@]([H])([C@]([H])(CC)C)C(=O)[Rn]
J C(6)H(11)N(1)O(1)S(0)
K C(6)H(12)N(2)O(1)S(0) N([Xe])([Xe])[C@@]([H])(CCCCN)C(=O)[Rn]
L C(6)H(11)N(1)O(1)S(0) N([Xe])([Xe])[C@@]([H])(CC(C)C)C(=O)[Rn]
M C(5)H(9)N(1)O(1)S(1) N([Xe])([Xe])[C@@]([H])(CCSC)C(=O)[Rn]
N C(4)H(6)N(2)O(2)S(0) N([Xe])([Xe])[C@@]([H])(CC(=O)N)C(=O)[Rn]
O C(12)H(19)N(3)O(2) C[C@@H]1CC=N[C@H]1C(=O)NCCCC[C@@H](C(=O)[Rn])N([Xe])([Xe])
P C(5)H(7)N(1)O(1)S(0) N1([Xe])[C@@]([H])(CCC1)C(=O)[Rn]
Q C(5)H(8)N(2)O(2)S(0) N([Xe])([Xe])[C@@]([H])(CCC(=O)N)C(=O)[Rn]
R C(6)H(12)N(4)O(1)S(0) N([Xe])([Xe])[C@@]([H])(CCCNC(=N)N)C(=O)[Rn]
S C(3)H(5)N(1)O(2)S(0) N([Xe])([Xe])[C@@]([H])(CO)C(=O)[Rn]
T C(4)H(7)N(1)O(2)S(0) N([Xe])([Xe])[C@@]([H])([C@]([H])(O)C)C(=O)[Rn]
U C(3)H(5)N(1)O(1)Se(1) N([Xe])([Xe])[C@@]([H])(C[Se][H])C(=O)[Rn]
V C(5)H(9)N(1)O(1)S(0) N([Xe])([Xe])[C@@]([H])(C(C)C)C(=O)[Rn]
W C(11)H(10)N(2)O(1)S(0) N([Xe])([Xe])[C@@]([H])(CC(=CN2)C1=C2C=CC=C1)C(=O)[Rn]
X C(1000000)
Y C(9)H(9)N(1)O(2)S(0) N([Xe])([Xe])[C@@]([H])(Cc1ccc(O)cc1)C(=O)[Rn]
Z C(1000000)
s C(3)H(5)N(1)O(2)S(0)
t C(4)H(8)N(1)O(5)P(1)
y C(9)H(9)N(1)O(2)S(0)
Loading

0 comments on commit 7795e13

Please sign in to comment.