Merge pull request #27 from radifar/identify-electrostatic-interactions

Identify electrostatic interactions
radifar · Dec 24, 2023 · e280e1b · e280e1b
2 parents 869d6a1 + 644ecab
commit e280e1b
Show file tree

Hide file tree

Showing 25 changed files with 601 additions and 460 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -8,12 +8,11 @@ readme = "README.md"
 
 [tool.poetry.dependencies]
 python = "^3.10"
-pydantic = "^2.4.2"
 lark = "^1.1.7"
-click = "^8.1.7"
 rdkit = "^2023.3.3"
 pandas = "^2.1.4"
 pyarrow = "^14.0.1"
+scipy = "^1.11.4"
 
 [tool.poetry.group.dev.dependencies]
 rich = "^13.6.0"

diff --git a/src/deemian/chem/interaction_utility.py b/src/deemian/chem/interaction_utility.py
@@ -0,0 +1,67 @@
+from numpy.linalg import norm as dist
+import pandas as pd
+from rdkit.Chem import AllChem as Chem
+
+from deemian.chem.utility import flatten_atom_list
+
+
+def generate_pair_info(query_result, s1_df, s2_df, conformation, interaction_type):
+    pair_info_df = pd.DataFrame()
+    for i, s2_list in enumerate(query_result):
+        s1 = s1_df.iloc[[i]].reset_index()
+        s2 = s2_df.iloc[s2_list].reset_index()
+        s2["atom_id_1"] = s1.iloc[0].atom_id
+        pair = s1.merge(s2, left_on="atom_id", right_on="atom_id_1", suffixes=("_s1", "_s2")).drop(
+            columns=["atom_id_1"]
+        )
+        pair_info_df = pd.concat([pair_info_df, pair])
+
+    pair_info_df.reset_index(drop=True, inplace=True)
+
+    conf_1 = f"conf_{conformation}_s1"
+    conf_2 = f"conf_{conformation}_s2"
+    pair_info_df["distance"] = (pair_info_df[conf_1] - pair_info_df[conf_2]).map(dist)
+    pair_info_df["conformation"] = conformation
+    pair_info_df["interaction_type"] = interaction_type
+
+    return pair_info_df
+
+
+def filter_charge(mol: Chem.rdchem.Mol, mode: str) -> list[int]:
+    apparent_cation = ["[$([*+1,*+2,*+3]);!$([N+]-[O-])]", "[NX3&!$([NX3]-O)]-[C]=[NX3+]"]
+    apparent_anion = ["O=[C,S,N,P]-[O-]", "[*-1,*-2]"]
+    apparent_cation = [Chem.MolFromSmarts(p) for p in apparent_cation]  # p for pattern
+    apparent_anion = [Chem.MolFromSmarts(p) for p in apparent_anion]
+
+    potential_cation = [
+        "[$([N;H2&+0][C;!$(C=*)]);!$(N[a])]",
+        "[$([N;H1&+0]([C;!$(C=*)])[C;!$(C=*)]);!$(N[a])]",
+        "[$([N;H0&+0]([C;!$(C=*)])([C;!$(C=*)])[C;!$(C=*)]);!$(N[a])]",
+        "NC(=N)",
+        "[n;R1]1[c;R1][n;R1][c;R1][c;R1]1",
+    ]
+    potential_anion = ["O=[C,S,N,P]-[OH,O-]"]
+    potential_cation = [Chem.MolFromSmarts(p) for p in potential_cation]
+    potential_anion = [Chem.MolFromSmarts(p) for p in potential_anion]
+
+    all_cation = apparent_cation + potential_cation
+    all_anion = apparent_anion + potential_anion
+
+    filter_dictionary = dict(
+        apparent_cation=apparent_cation,
+        apparent_anion=apparent_anion,
+        potential_cation=potential_cation,
+        potential_anion=potential_anion,
+        all_cation=all_cation,
+        all_anion=all_anion,
+    )
+
+    combined_match = []
+    try:
+        for pattern in filter_dictionary[mode]:
+            matched = flatten_atom_list(mol.GetSubstructMatches(pattern))
+            combined_match.append(matched)
+    except KeyError:
+        raise KeyError(f"Error: filtering mode {mode} is not recognized")
+
+    return flatten_atom_list(combined_match)
diff --git a/src/deemian/chem/interactions.py b/src/deemian/chem/interactions.py
@@ -1,43 +1,63 @@
-from rdkit.Chem import AllChem as Chem
+from dataclasses import dataclass
 
-from deemian.chem.utility import flatten_atom_list
-
-
-def filter_charge(mol: Chem.rdchem.Mol, mode: str) -> list[int]:
-    apparent_cation = ["[$([*+1,*+2,*+3]);!$([N+]-[O-])]", "[NX3&!$([NX3]-O)]-[C]=[NX3+]"]
-    apparent_anion = ["O=[C,S,N,P]-[O-]", "[*-1,*-2]"]
-    apparent_cation = [Chem.MolFromSmarts(p) for p in apparent_cation]  # p for pattern
-    apparent_anion = [Chem.MolFromSmarts(p) for p in apparent_anion]
-
-    potential_cation = [
-        "[$([N;H2&+0][C;!$(C=*)]);!$(N[a])]",
-        "[$([N;H1&+0]([C;!$(C=*)])[C;!$(C=*)]);!$(N[a])]",
-        "[$([N;H0&+0]([C;!$(C=*)])([C;!$(C=*)])[C;!$(C=*)]);!$(N[a])]",
-        "NC(=N)",
-        "[n;R1]1[c;R1][n;R1][c;R1][c;R1]1",
-    ]
-    potential_anion = ["O=[C,S,N,P]-[OH,O-]"]
-    potential_cation = [Chem.MolFromSmarts(p) for p in potential_cation]
-    potential_anion = [Chem.MolFromSmarts(p) for p in potential_anion]
-
-    all_cation = apparent_cation + potential_cation
-    all_anion = apparent_anion + potential_anion
-
-    filter_dictionary = dict(
-        apparent_cation=apparent_cation,
-        apparent_anion=apparent_anion,
-        potential_cation=potential_cation,
-        potential_anion=potential_anion,
-        all_cation=all_cation,
-        all_anion=all_anion,
-    )
-
-    combined_match = []
-    try:
-        for pattern in filter_dictionary[mode]:
-            matched = flatten_atom_list(mol.GetSubstructMatches(pattern))
-            combined_match.append(matched)
-    except KeyError:
-        raise KeyError(f"Error: filtering mode {mode} is not recognized")
-
-    return flatten_atom_list(combined_match)
+import pandas as pd
+from rdkit.Chem import AllChem as Chem
+from scipy.spatial import KDTree
+
+from deemian.chem.interaction_utility import filter_charge, generate_pair_info
+
+
+@dataclass
+class InteractionData:
+    subject_1_mol: Chem.rdchem.Mol
+    subject_2_mol: Chem.rdchem.Mol
+    subject_1_df: pd.DataFrame
+    subject_2_df: pd.DataFrame
+    conformation: list
+    electrostatic_s1_as_cation: pd.DataFrame = None
+    electrostatic_s1_as_anion: pd.DataFrame = None
+
+    def calculate_electrostatic(self, positive: bool, negative: bool):
+        cation_mode = "all_cation" if positive else "apparent_cation"
+        anion_mode = "all_anion" if negative else "apparent_anion"
+
+        cation_1_ids = filter_charge(self.subject_1_mol, cation_mode)
+        anion_1_ids = filter_charge(self.subject_1_mol, anion_mode)
+        cation_2_ids = filter_charge(self.subject_2_mol, cation_mode)
+        anion_2_ids = filter_charge(self.subject_2_mol, anion_mode)
+
+        df1 = self.subject_1_df
+        df2 = self.subject_2_df
+        exclude_atom = ["C", "N", "S", "P"]
+        cation_1_df = df1[df1.index.isin(cation_1_ids)]
+        anion_1_df = df1[df1.index.isin(anion_1_ids) & ~df1["atom_symbol"].isin(exclude_atom)]
+        cation_2_df = df2[df2.index.isin(cation_2_ids)]
+        anion_2_df = df2[df2.index.isin(anion_2_ids) & ~df2["atom_symbol"].isin(exclude_atom)]
+
+        for conf_num in self.conformation:
+            conformation_column = "conf_" + str(conf_num)
+
+            if (not cation_1_df.empty) and (not anion_2_df.empty):
+                print(cation_1_df.empty, anion_2_df.empty)
+                cation_1_tree = KDTree(cation_1_df[conformation_column].to_list())
+                anion_2_tree = KDTree(anion_2_df[conformation_column].to_list())
+
+                s1_as_cation = cation_1_tree.query_ball_tree(anion_2_tree, 4.5)
+
+                self.electrostatic_s1_as_cation = generate_pair_info(
+                    s1_as_cation, cation_1_df, anion_2_df, conf_num, "electrostatic_cation"
+                )
+            else:
+                self.electrostatic_s1_as_cation = pd.DataFrame()
+
+            if (not cation_2_df.empty) and (not anion_1_df.empty):
+                cation_2_tree = KDTree(cation_2_df[conformation_column].to_list())
+                anion_1_tree = KDTree(anion_1_df[conformation_column].to_list())
+
+                s1_as_anion = anion_1_tree.query_ball_tree(cation_2_tree, 4.5)
+
+                self.electrostatic_s1_as_anion = generate_pair_info(
+                    s1_as_anion, anion_1_df, cation_2_df, conf_num, "electrostatic_anion"
+                )
+            else:
+                self.electrostatic_s1_as_anion = pd.DataFrame()
diff --git a/src/deemian/chem/reader.py b/src/deemian/chem/reader.py
@@ -32,7 +32,7 @@ def mol_to_dataframe(mol):
     # note: using pyarrow for list of float making it slower for 'to_list' conversion,
     #       the scipy's KDTree require list of list as the input.
     for index, conformer in enumerate(mol.GetConformers()):
-        coordinate_number = "conf_" + str(index)
+        coordinate_number = "conf_" + str(index + 1)
         mol_record[coordinate_number] = list(conformer.GetPositions())
 
     return pd.DataFrame(mol_record).set_index("atom_id")
diff --git a/src/deemian/chem/utility.py b/src/deemian/chem/utility.py
@@ -11,7 +11,7 @@ def dataframe_to_pdb_block(df: pd.DataFrame) -> str:
     df_noh = df[df["atom_symbol"] != "H"]
     # from https://stackoverflow.com/questions/35491274/split-a-pandas-column-of-lists-into-multiple-columns
 
-    df_pdb = pd.DataFrame(df_noh["conf_0"].to_list(), columns=["x", "y", "z"])
+    df_pdb = pd.DataFrame(df_noh["conf_1"].to_list(), columns=["x", "y", "z"])
     df_pdb.index = df_noh.index
     df_pdb = df_pdb.join(df_noh).reset_index()
     df_pdb["record_type"] = "HETATM"

diff --git a/src/deemian/engine/builder.py b/src/deemian/engine/builder.py
@@ -1,77 +1,106 @@
+from collections import defaultdict
 from dataclasses import dataclass, field
 
+import pandas as pd
 from rdkit.Chem import AllChem as Chem
 
+from deemian.chem.interactions import InteractionData
 from deemian.chem.reader import mol_to_dataframe
 from deemian.chem.selection import mol_dataframe_selection
 from deemian.chem.utility import dataframe_to_pdb_block
 
 
 @dataclass
-class DeemianData:
-    molecule_dataframe: dict = field(default_factory=lambda: {})
-    molecule_pdb_block: dict = field(default_factory=lambda: {})
-    selections: dict = field(default_factory=lambda: {})
+class Molecule:
+    rdkit_mol: Chem.rdchem.Mol
+    mol_dataframe: pd.DataFrame = None
+
+
+@dataclass
+class Selection:
+    mol_parent: str
+    mol_dataframe: pd.DataFrame
+    mol_pdb_block: str = None
+
+
+@dataclass
+class Measurement:
     interactions: list = field(default_factory=lambda: [])
     ionizable: dict = field(default_factory=lambda: {"positive": False, "negative": False})
     interacting_subjects: dict = field(default_factory=lambda: {})
     conformation: list = field(default_factory=lambda: [])
-    interaction_details: dict = field(default_factory=lambda: {})
-    readable_output: dict = field(default_factory=lambda: {})
 
+    def conformation_range(self, start, end):
+        self.conformation = list(range(int(start), int(end) + 1))
 
-class DeemianDataBuilder:
-    def __init__(self, deemian_data: DeemianData) -> None:
-        self.deemian_data = deemian_data
+    def set_ionizable(self, charge: str, boolean: str):
+        if boolean == "true":
+            boolean = True
+        elif boolean == "false":
+            boolean = False
+        self.ionizable[charge] = boolean
 
-    def read_molecule(self, mol_filename: str):
-        mol = Chem.MolFromPDBFile(mol_filename, removeHs=False)
+
+@dataclass
+class DeemianData:
+    molecule: dict[str, Molecule] = field(default_factory=lambda: {})
+    selection: dict[str, Selection] = field(default_factory=lambda: {})
+    measurement: dict[str, Measurement] = field(default_factory=lambda: defaultdict(Measurement))
+    interaction_details: dict = field(default_factory=lambda: {})
+    readable_output: dict = field(default_factory=lambda: {})
+
+    def add_molecule(self, name):
+        mol = Chem.MolFromPDBFile(name, removeHs=False)
         mol_df = mol_to_dataframe(mol)
-        self.deemian_data.molecule_dataframe[mol_filename] = mol_df
+        self.molecule[name] = Molecule(mol, mol_df)
+
+    def add_selection(self, name, selection, mol_parent):
+        parent_df = self.molecule[mol_parent].mol_dataframe
+        selection_df = mol_dataframe_selection(selection, parent_df)
 
-    def assign_selection(self, name: str, selection: list[tuple], mol_filename: str):
-        mol_df = self.deemian_data.molecule_dataframe[mol_filename]
-        self.deemian_data.molecule_dataframe[name] = mol_dataframe_selection(selection, mol_df)
-        selection.insert(0, mol_filename)
-        self.deemian_data.selections[name] = selection
+        self.selection[name] = Selection(mol_parent, selection_df)
 
-    def correct_bond(self, name: str, template: str):
-        mol_df = self.deemian_data.molecule_dataframe[name]
-        mol_pdb_block = dataframe_to_pdb_block(mol_df)
-        self.deemian_data.molecule_pdb_block[name] = mol_pdb_block
+    def correct_bond(self, name, template):
+        selection_df = self.selection[name].mol_dataframe
+        selection_pdb_block = dataframe_to_pdb_block(selection_df)
+        selection_mol = Chem.MolFromPDBBlock(selection_pdb_block)
 
-        mol = Chem.MolFromPDBBlock(mol_pdb_block)
         template_mol = Chem.MolFromSmiles(template)
-        mol = Chem.AssignBondOrdersFromTemplate(template_mol, mol)
-        self.deemian_data.molecule_dataframe[name] = mol_to_dataframe(mol)
+        corrected_mol = Chem.AssignBondOrdersFromTemplate(template_mol, selection_mol)
+        corrected_df = mol_to_dataframe(corrected_mol)
 
-    def set_interactions(self, interactions: list[str]):
-        self.deemian_data.interactions = interactions
+        self.molecule[name] = Molecule(corrected_mol)
+        self.selection[name] = Selection(name, corrected_df, selection_pdb_block)
 
-    def set_ionizable(self, charge: str, boolean: str):
-        if boolean == "true":
-            boolean = True
-        elif boolean == "false":
-            boolean = False
-        self.deemian_data.ionizable[charge] = boolean
+    def add_measurement(self, id):
+        return self.measurement[id]
+
+    def calculate_interactions(self, id):
+        measurement = self.measurement[id]
 
-    def set_interacting_subjects(self, subject_1: str, subject_2: str, name: str):
-        self.deemian_data.interacting_subjects[name] = (subject_1, subject_2)
+        for pair in measurement.interacting_subjects:
+            subject_1, subject_2 = measurement.interacting_subjects[pair]
+            subject_1 = self.selection[subject_1]
+            subject_2 = self.selection[subject_2]
+            subject_1_mol = self.molecule[subject_1.mol_parent]
+            subject_2_mol = self.molecule[subject_2.mol_parent]
+            subject_1_df = subject_1.mol_dataframe
+            subject_2_df = subject_2.mol_dataframe
+            conformation = measurement.conformation
+            if not any(conformation):
+                conformation = [1]
 
-    def set_conformation(self, number: str):
-        self.deemian_data.conformation = [int(number)]
+            interaction_data = InteractionData(subject_1_mol, subject_2_mol, subject_1_df, subject_2_df, conformation)
 
-    def set_conformation_range(self, start: str, end: str):
-        self.deemian_data.conformation = list(range(int(start), int(end) + 1))
+            interaction_type = measurement.interactions
 
-    def calculate_interactions(self, measurement_identifier: str):
-        return measurement_identifier
+            if ("electrostatic" in interaction_type) or ("all" in interaction_type):
+                interaction_data.calculate_electrostatic(**measurement.ionizable)
 
-    def write_readable_output(self, out_file: str, presentation_identifier: str):
-        return (out_file, presentation_identifier)
+            self.interaction_details[pair] = interaction_data
 
-    def write_deemian_data(self, out_file: str, presentation_identifier: str):
-        return (out_file, presentation_identifier)
+    def write_readable_output(self, out_file: str, presentation_id: str):
+        return (out_file, presentation_id)
 
-    def generate_deemian_data(self):
-        return self.deemian_data
+    def write_deemian_data(self, out_file: str, presentation_id: str):
+        return (out_file, presentation_id)