feat: electrostatic interactions identification

radifar · Dec 24, 2023 · 644ecab · 644ecab
1 parent 35b4b2e
commit 644ecab
Show file tree

Hide file tree

Showing 21 changed files with 394 additions and 104 deletions.
diff --git a/src/deemian/chem/interaction_utility.py b/src/deemian/chem/interaction_utility.py
@@ -0,0 +1,67 @@
+from numpy.linalg import norm as dist
+import pandas as pd
+from rdkit.Chem import AllChem as Chem
+
+from deemian.chem.utility import flatten_atom_list
+
+
+def generate_pair_info(query_result, s1_df, s2_df, conformation, interaction_type):
+    pair_info_df = pd.DataFrame()
+    for i, s2_list in enumerate(query_result):
+        s1 = s1_df.iloc[[i]].reset_index()
+        s2 = s2_df.iloc[s2_list].reset_index()
+        s2["atom_id_1"] = s1.iloc[0].atom_id
+        pair = s1.merge(s2, left_on="atom_id", right_on="atom_id_1", suffixes=("_s1", "_s2")).drop(
+            columns=["atom_id_1"]
+        )
+        pair_info_df = pd.concat([pair_info_df, pair])
+
+    pair_info_df.reset_index(drop=True, inplace=True)
+
+    conf_1 = f"conf_{conformation}_s1"
+    conf_2 = f"conf_{conformation}_s2"
+    pair_info_df["distance"] = (pair_info_df[conf_1] - pair_info_df[conf_2]).map(dist)
+    pair_info_df["conformation"] = conformation
+    pair_info_df["interaction_type"] = interaction_type
+
+    return pair_info_df
+
+
+def filter_charge(mol: Chem.rdchem.Mol, mode: str) -> list[int]:
+    apparent_cation = ["[$([*+1,*+2,*+3]);!$([N+]-[O-])]", "[NX3&!$([NX3]-O)]-[C]=[NX3+]"]
+    apparent_anion = ["O=[C,S,N,P]-[O-]", "[*-1,*-2]"]
+    apparent_cation = [Chem.MolFromSmarts(p) for p in apparent_cation]  # p for pattern
+    apparent_anion = [Chem.MolFromSmarts(p) for p in apparent_anion]
+
+    potential_cation = [
+        "[$([N;H2&+0][C;!$(C=*)]);!$(N[a])]",
+        "[$([N;H1&+0]([C;!$(C=*)])[C;!$(C=*)]);!$(N[a])]",
+        "[$([N;H0&+0]([C;!$(C=*)])([C;!$(C=*)])[C;!$(C=*)]);!$(N[a])]",
+        "NC(=N)",
+        "[n;R1]1[c;R1][n;R1][c;R1][c;R1]1",
+    ]
+    potential_anion = ["O=[C,S,N,P]-[OH,O-]"]
+    potential_cation = [Chem.MolFromSmarts(p) for p in potential_cation]
+    potential_anion = [Chem.MolFromSmarts(p) for p in potential_anion]
+
+    all_cation = apparent_cation + potential_cation
+    all_anion = apparent_anion + potential_anion
+
+    filter_dictionary = dict(
+        apparent_cation=apparent_cation,
+        apparent_anion=apparent_anion,
+        potential_cation=potential_cation,
+        potential_anion=potential_anion,
+        all_cation=all_cation,
+        all_anion=all_anion,
+    )
+
+    combined_match = []
+    try:
+        for pattern in filter_dictionary[mode]:
+            matched = flatten_atom_list(mol.GetSubstructMatches(pattern))
+            combined_match.append(matched)
+    except KeyError:
+        raise KeyError(f"Error: filtering mode {mode} is not recognized")
+
+    return flatten_atom_list(combined_match)
diff --git a/src/deemian/chem/interactions.py b/src/deemian/chem/interactions.py
@@ -1,43 +1,63 @@
-from rdkit.Chem import AllChem as Chem
+from dataclasses import dataclass
 
-from deemian.chem.utility import flatten_atom_list
-
-
-def filter_charge(mol: Chem.rdchem.Mol, mode: str) -> list[int]:
-    apparent_cation = ["[$([*+1,*+2,*+3]);!$([N+]-[O-])]", "[NX3&!$([NX3]-O)]-[C]=[NX3+]"]
-    apparent_anion = ["O=[C,S,N,P]-[O-]", "[*-1,*-2]"]
-    apparent_cation = [Chem.MolFromSmarts(p) for p in apparent_cation]  # p for pattern
-    apparent_anion = [Chem.MolFromSmarts(p) for p in apparent_anion]
-
-    potential_cation = [
-        "[$([N;H2&+0][C;!$(C=*)]);!$(N[a])]",
-        "[$([N;H1&+0]([C;!$(C=*)])[C;!$(C=*)]);!$(N[a])]",
-        "[$([N;H0&+0]([C;!$(C=*)])([C;!$(C=*)])[C;!$(C=*)]);!$(N[a])]",
-        "NC(=N)",
-        "[n;R1]1[c;R1][n;R1][c;R1][c;R1]1",
-    ]
-    potential_anion = ["O=[C,S,N,P]-[OH,O-]"]
-    potential_cation = [Chem.MolFromSmarts(p) for p in potential_cation]
-    potential_anion = [Chem.MolFromSmarts(p) for p in potential_anion]
-
-    all_cation = apparent_cation + potential_cation
-    all_anion = apparent_anion + potential_anion
-
-    filter_dictionary = dict(
-        apparent_cation=apparent_cation,
-        apparent_anion=apparent_anion,
-        potential_cation=potential_cation,
-        potential_anion=potential_anion,
-        all_cation=all_cation,
-        all_anion=all_anion,
-    )
-
-    combined_match = []
-    try:
-        for pattern in filter_dictionary[mode]:
-            matched = flatten_atom_list(mol.GetSubstructMatches(pattern))
-            combined_match.append(matched)
-    except KeyError:
-        raise KeyError(f"Error: filtering mode {mode} is not recognized")
-
-    return flatten_atom_list(combined_match)
+import pandas as pd
+from rdkit.Chem import AllChem as Chem
+from scipy.spatial import KDTree
+
+from deemian.chem.interaction_utility import filter_charge, generate_pair_info
+
+
+@dataclass
+class InteractionData:
+    subject_1_mol: Chem.rdchem.Mol
+    subject_2_mol: Chem.rdchem.Mol
+    subject_1_df: pd.DataFrame
+    subject_2_df: pd.DataFrame
+    conformation: list
+    electrostatic_s1_as_cation: pd.DataFrame = None
+    electrostatic_s1_as_anion: pd.DataFrame = None
+
+    def calculate_electrostatic(self, positive: bool, negative: bool):
+        cation_mode = "all_cation" if positive else "apparent_cation"
+        anion_mode = "all_anion" if negative else "apparent_anion"
+
+        cation_1_ids = filter_charge(self.subject_1_mol, cation_mode)
+        anion_1_ids = filter_charge(self.subject_1_mol, anion_mode)
+        cation_2_ids = filter_charge(self.subject_2_mol, cation_mode)
+        anion_2_ids = filter_charge(self.subject_2_mol, anion_mode)
+
+        df1 = self.subject_1_df
+        df2 = self.subject_2_df
+        exclude_atom = ["C", "N", "S", "P"]
+        cation_1_df = df1[df1.index.isin(cation_1_ids)]
+        anion_1_df = df1[df1.index.isin(anion_1_ids) & ~df1["atom_symbol"].isin(exclude_atom)]
+        cation_2_df = df2[df2.index.isin(cation_2_ids)]
+        anion_2_df = df2[df2.index.isin(anion_2_ids) & ~df2["atom_symbol"].isin(exclude_atom)]
+
+        for conf_num in self.conformation:
+            conformation_column = "conf_" + str(conf_num)
+
+            if (not cation_1_df.empty) and (not anion_2_df.empty):
+                print(cation_1_df.empty, anion_2_df.empty)
+                cation_1_tree = KDTree(cation_1_df[conformation_column].to_list())
+                anion_2_tree = KDTree(anion_2_df[conformation_column].to_list())
+
+                s1_as_cation = cation_1_tree.query_ball_tree(anion_2_tree, 4.5)
+
+                self.electrostatic_s1_as_cation = generate_pair_info(
+                    s1_as_cation, cation_1_df, anion_2_df, conf_num, "electrostatic_cation"
+                )
+            else:
+                self.electrostatic_s1_as_cation = pd.DataFrame()
+
+            if (not cation_2_df.empty) and (not anion_1_df.empty):
+                cation_2_tree = KDTree(cation_2_df[conformation_column].to_list())
+                anion_1_tree = KDTree(anion_1_df[conformation_column].to_list())
+
+                s1_as_anion = anion_1_tree.query_ball_tree(cation_2_tree, 4.5)
+
+                self.electrostatic_s1_as_anion = generate_pair_info(
+                    s1_as_anion, anion_1_df, cation_2_df, conf_num, "electrostatic_anion"
+                )
+            else:
+                self.electrostatic_s1_as_anion = pd.DataFrame()
diff --git a/src/deemian/chem/utility.py b/src/deemian/chem/utility.py
@@ -11,7 +11,7 @@ def dataframe_to_pdb_block(df: pd.DataFrame) -> str:
     df_noh = df[df["atom_symbol"] != "H"]
     # from https://stackoverflow.com/questions/35491274/split-a-pandas-column-of-lists-into-multiple-columns
 
-    df_pdb = pd.DataFrame(df_noh["conf_0"].to_list(), columns=["x", "y", "z"])
+    df_pdb = pd.DataFrame(df_noh["conf_1"].to_list(), columns=["x", "y", "z"])
     df_pdb.index = df_noh.index
     df_pdb = df_pdb.join(df_noh).reset_index()
     df_pdb["record_type"] = "HETATM"

diff --git a/src/deemian/engine/builder.py b/src/deemian/engine/builder.py
@@ -4,6 +4,7 @@
 import pandas as pd
 from rdkit.Chem import AllChem as Chem
 
+from deemian.chem.interactions import InteractionData
 from deemian.chem.reader import mol_to_dataframe
 from deemian.chem.selection import mol_dataframe_selection
 from deemian.chem.utility import dataframe_to_pdb_block
@@ -71,11 +72,32 @@ def correct_bond(self, name, template):
         self.molecule[name] = Molecule(corrected_mol)
         self.selection[name] = Selection(name, corrected_df, selection_pdb_block)
 
-    def add_measurement(self, name):
-        return self.measurement[name]
+    def add_measurement(self, id):
+        return self.measurement[id]
 
-    def calculate_interactions(self):
-        return 1
+    def calculate_interactions(self, id):
+        measurement = self.measurement[id]
+
+        for pair in measurement.interacting_subjects:
+            subject_1, subject_2 = measurement.interacting_subjects[pair]
+            subject_1 = self.selection[subject_1]
+            subject_2 = self.selection[subject_2]
+            subject_1_mol = self.molecule[subject_1.mol_parent]
+            subject_2_mol = self.molecule[subject_2.mol_parent]
+            subject_1_df = subject_1.mol_dataframe
+            subject_2_df = subject_2.mol_dataframe
+            conformation = measurement.conformation
+            if not any(conformation):
+                conformation = [1]
+
+            interaction_data = InteractionData(subject_1_mol, subject_2_mol, subject_1_df, subject_2_df, conformation)
+
+            interaction_type = measurement.interactions
+
+            if ("electrostatic" in interaction_type) or ("all" in interaction_type):
+                interaction_data.calculate_electrostatic(**measurement.ionizable)
+
+            self.interaction_details[pair] = interaction_data
 
     def write_readable_output(self, out_file: str, presentation_id: str):
         return (out_file, presentation_id)

diff --git a/src/deemian/engine/director.py b/src/deemian/engine/director.py
@@ -38,7 +38,7 @@ def director(steps: Tree, data: DeemianData):
                 elif inst.type == "conformation_range":
                     measurement.conformation_range(inst.start, inst.end)
 
-            data.calculate_interactions()
+            data.calculate_interactions(measurement_id)
 
         elif step.data == "presentation":
             instructions = step.children

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,63 @@
+import pytest
+import pandas as pd
+
+from rdkit import Chem
+
+from deemian.chem.selection import mol_dataframe_selection
+
+
+@pytest.fixture
+def n1():
+    return Chem.MolFromPDBFile("tests/data/5nzn.pdb", removeHs=False)
+
+
+@pytest.fixture
+def oseltamivir_corrected():
+    return Chem.MolFromPDBFile("tests/data/oseltamivir_corrected.pdb", removeHs=False)
+
+
+@pytest.fixture
+def n1_df():
+    """
+    Parquet file originated from 5nzn.pdb, protonated using Protoss.
+    Loaded with deemian.chem.reader.mol_to_dataframe saved using pyarrow engine
+    """
+
+    return pd.read_parquet("tests/data/5nzn.parquet.gzip")
+
+
+@pytest.fixture
+def spike_df():
+    """
+    Parquet file originated from 7u0n.pdb, protonated using Protoss.
+    Loaded with deemian.chem.reader.mol_to_dataframe saved using pyarrow engine
+    """
+
+    return pd.read_parquet("tests/data/5nzn.parquet.gzip")
+
+
+@pytest.fixture
+def vps4_df():
+    """
+    Parquet file originated from 2k3w.pdb without preparation (it is NMR solution structure)
+    Loaded with deemian.chem.reader.mol_to_dataframe saved using pyarrow engine
+    """
+
+    return pd.read_parquet("tests/data/2k3w.parquet.gzip")
+
+
+@pytest.fixture
+def n1_protein_A(n1_df):
+    protein_A_selection = [("chain", "A"), ("and", "protein")]
+
+    return mol_dataframe_selection(protein_A_selection, n1_df)
+
+
+@pytest.fixture
+def oseltamivir_df():
+    return pd.read_parquet("tests/data/oseltamivir.parquet.gzip")
+
+
+@pytest.fixture
+def oseltamivir_corrected_df():
+    return pd.read_parquet("tests/data/oseltamivir_corrected.parquet.gzip")
diff --git a/tests/data/2k3w.parquet.gzip b/tests/data/2k3w.parquet.gzip
diff --git a/tests/data/5nzn.parquet.gzip b/tests/data/5nzn.parquet.gzip
diff --git a/tests/data/7u0n.parquet.gzip b/tests/data/7u0n.parquet.gzip
diff --git a/tests/data/n1-oseltamivir-5nzn.txt b/tests/data/n1-oseltamivir-5nzn.txt
@@ -8,7 +8,7 @@ measure protein_ligand [
 	interactions all
 	ionizable positive true
 	ionizable negative true
-	between protein_A and oseltamivir
+	between oseltamivir and protein_A
     conformation 1
 ]
 

diff --git a/tests/data/oseltamivir.parquet.gzip b/tests/data/oseltamivir.parquet.gzip
diff --git a/tests/data/oseltamivir_corrected.parquet.gzip b/tests/data/oseltamivir_corrected.parquet.gzip
diff --git a/tests/data/oseltamivir_corrected.pdb b/tests/data/oseltamivir_corrected.pdb
@@ -0,0 +1,37 @@
+HETATM    1 O1A  G39 A 503      42.667  -0.483  16.465  1.00  0.00           O
+HETATM    2 O1B  G39 A 503      42.969  -2.501  17.343  1.00  0.00           O
+HETATM    3 O10  G39 A 503      45.317   1.913  23.170  1.00  0.00           O
+HETATM    4  O7  G39 A 503      44.831  -1.644  21.897  1.00  0.00           O
+HETATM    5  N4  G39 A 503      42.738   2.586  20.613  1.00  0.00           N
+HETATM    6  N5  G39 A 503      43.460   0.767  22.766  1.00  0.00           N
+HETATM    7 C82  G39 A 503      45.681  -1.738  24.703  1.00  0.00           C
+HETATM    8 C91  G39 A 503      45.274  -4.558  21.185  1.00  0.00           C
+HETATM    9 C11  G39 A 503      43.775   1.710  24.951  1.00  0.00           C
+HETATM   10 C81  G39 A 503      45.439  -2.940  23.838  1.00  0.00           C
+HETATM   11  C9  G39 A 503      44.112  -4.068  22.043  1.00  0.00           C
+HETATM   12  C7  G39 A 503      43.656  -1.490  19.779  1.00  0.00           C
+HETATM   13  C3  G39 A 503      43.012   0.776  19.027  1.00  0.00           C
+HETATM   14  C1  G39 A 503      42.951  -1.249  17.452  1.00  0.00           C
+HETATM   15 C10  G39 A 503      44.286   1.453  23.533  1.00  0.00           C
+HETATM   16  C8  G39 A 503      44.399  -2.720  22.765  1.00  0.00           C
+HETATM   17  C6  G39 A 503      43.713  -1.030  21.244  1.00  0.00           C
+HETATM   18  C4  G39 A 503      42.719   1.127  20.490  1.00  0.00           C
+HETATM   19  C5  G39 A 503      43.759   0.470  21.389  1.00  0.00           C
+HETATM   20  C2  G39 A 503      43.243  -0.700  18.788  1.00  0.00           C
+CONECT    1   14
+CONECT    2   14   14
+CONECT    3   15   15
+CONECT    4   16   17
+CONECT    5   18
+CONECT    6   15   19
+CONECT    7   10
+CONECT    8   11
+CONECT    9   15
+CONECT   10   16
+CONECT   11   16
+CONECT   12   17   20   20
+CONECT   13   18   20
+CONECT   14   20
+CONECT   17   19
+CONECT   18   19
+END
diff --git a/tests/test_chem_interaction.py b/tests/test_chem_interaction.py
@@ -0,0 +1,24 @@
+import pytest
+
+from deemian.chem.interactions import InteractionData
+
+
+@pytest.fixture
+def interaction_data_n1(oseltamivir_corrected, n1, oseltamivir_corrected_df, n1_protein_A):
+    interaction_data = InteractionData(oseltamivir_corrected, n1, oseltamivir_corrected_df, n1_protein_A, [1])
+
+    return interaction_data
+
+
+def test_interaction_data_calculate_electrostatic(interaction_data_n1):
+    interaction_data_n1.calculate_electrostatic(positive=True, negative=True)
+
+    assert interaction_data_n1.electrostatic_s1_as_cation.shape == (3, 17)
+    interaction_data_n1.electrostatic_s1_as_anion.shape == (12, 17)
+
+
+def test_interaction_data_calculate_electrostatic_apparent_only(interaction_data_n1):
+    interaction_data_n1.calculate_electrostatic(positive=False, negative=False)
+
+    assert interaction_data_n1.electrostatic_s1_as_cation.shape == (0, 0)
+    assert interaction_data_n1.electrostatic_s1_as_anion.shape == (0, 0)