Skip to content

Commit

Permalink
feat: electrostatic interactions identification
Browse files Browse the repository at this point in the history
  • Loading branch information
radifar committed Dec 24, 2023
1 parent 35b4b2e commit 644ecab
Show file tree
Hide file tree
Showing 21 changed files with 394 additions and 104 deletions.
67 changes: 67 additions & 0 deletions src/deemian/chem/interaction_utility.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from numpy.linalg import norm as dist
import pandas as pd
from rdkit.Chem import AllChem as Chem

from deemian.chem.utility import flatten_atom_list


def generate_pair_info(query_result, s1_df, s2_df, conformation, interaction_type):
pair_info_df = pd.DataFrame()
for i, s2_list in enumerate(query_result):
s1 = s1_df.iloc[[i]].reset_index()
s2 = s2_df.iloc[s2_list].reset_index()
s2["atom_id_1"] = s1.iloc[0].atom_id
pair = s1.merge(s2, left_on="atom_id", right_on="atom_id_1", suffixes=("_s1", "_s2")).drop(
columns=["atom_id_1"]
)
pair_info_df = pd.concat([pair_info_df, pair])

pair_info_df.reset_index(drop=True, inplace=True)

conf_1 = f"conf_{conformation}_s1"
conf_2 = f"conf_{conformation}_s2"
pair_info_df["distance"] = (pair_info_df[conf_1] - pair_info_df[conf_2]).map(dist)
pair_info_df["conformation"] = conformation
pair_info_df["interaction_type"] = interaction_type

return pair_info_df


def filter_charge(mol: Chem.rdchem.Mol, mode: str) -> list[int]:
apparent_cation = ["[$([*+1,*+2,*+3]);!$([N+]-[O-])]", "[NX3&!$([NX3]-O)]-[C]=[NX3+]"]
apparent_anion = ["O=[C,S,N,P]-[O-]", "[*-1,*-2]"]
apparent_cation = [Chem.MolFromSmarts(p) for p in apparent_cation] # p for pattern
apparent_anion = [Chem.MolFromSmarts(p) for p in apparent_anion]

potential_cation = [
"[$([N;H2&+0][C;!$(C=*)]);!$(N[a])]",
"[$([N;H1&+0]([C;!$(C=*)])[C;!$(C=*)]);!$(N[a])]",
"[$([N;H0&+0]([C;!$(C=*)])([C;!$(C=*)])[C;!$(C=*)]);!$(N[a])]",
"NC(=N)",
"[n;R1]1[c;R1][n;R1][c;R1][c;R1]1",
]
potential_anion = ["O=[C,S,N,P]-[OH,O-]"]
potential_cation = [Chem.MolFromSmarts(p) for p in potential_cation]
potential_anion = [Chem.MolFromSmarts(p) for p in potential_anion]

all_cation = apparent_cation + potential_cation
all_anion = apparent_anion + potential_anion

filter_dictionary = dict(
apparent_cation=apparent_cation,
apparent_anion=apparent_anion,
potential_cation=potential_cation,
potential_anion=potential_anion,
all_cation=all_cation,
all_anion=all_anion,
)

combined_match = []
try:
for pattern in filter_dictionary[mode]:
matched = flatten_atom_list(mol.GetSubstructMatches(pattern))
combined_match.append(matched)
except KeyError:
raise KeyError(f"Error: filtering mode {mode} is not recognized")

return flatten_atom_list(combined_match)
104 changes: 62 additions & 42 deletions src/deemian/chem/interactions.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,63 @@
from rdkit.Chem import AllChem as Chem
from dataclasses import dataclass

from deemian.chem.utility import flatten_atom_list


def filter_charge(mol: Chem.rdchem.Mol, mode: str) -> list[int]:
apparent_cation = ["[$([*+1,*+2,*+3]);!$([N+]-[O-])]", "[NX3&!$([NX3]-O)]-[C]=[NX3+]"]
apparent_anion = ["O=[C,S,N,P]-[O-]", "[*-1,*-2]"]
apparent_cation = [Chem.MolFromSmarts(p) for p in apparent_cation] # p for pattern
apparent_anion = [Chem.MolFromSmarts(p) for p in apparent_anion]

potential_cation = [
"[$([N;H2&+0][C;!$(C=*)]);!$(N[a])]",
"[$([N;H1&+0]([C;!$(C=*)])[C;!$(C=*)]);!$(N[a])]",
"[$([N;H0&+0]([C;!$(C=*)])([C;!$(C=*)])[C;!$(C=*)]);!$(N[a])]",
"NC(=N)",
"[n;R1]1[c;R1][n;R1][c;R1][c;R1]1",
]
potential_anion = ["O=[C,S,N,P]-[OH,O-]"]
potential_cation = [Chem.MolFromSmarts(p) for p in potential_cation]
potential_anion = [Chem.MolFromSmarts(p) for p in potential_anion]

all_cation = apparent_cation + potential_cation
all_anion = apparent_anion + potential_anion

filter_dictionary = dict(
apparent_cation=apparent_cation,
apparent_anion=apparent_anion,
potential_cation=potential_cation,
potential_anion=potential_anion,
all_cation=all_cation,
all_anion=all_anion,
)

combined_match = []
try:
for pattern in filter_dictionary[mode]:
matched = flatten_atom_list(mol.GetSubstructMatches(pattern))
combined_match.append(matched)
except KeyError:
raise KeyError(f"Error: filtering mode {mode} is not recognized")

return flatten_atom_list(combined_match)
import pandas as pd
from rdkit.Chem import AllChem as Chem
from scipy.spatial import KDTree

from deemian.chem.interaction_utility import filter_charge, generate_pair_info


@dataclass
class InteractionData:
subject_1_mol: Chem.rdchem.Mol
subject_2_mol: Chem.rdchem.Mol
subject_1_df: pd.DataFrame
subject_2_df: pd.DataFrame
conformation: list
electrostatic_s1_as_cation: pd.DataFrame = None
electrostatic_s1_as_anion: pd.DataFrame = None

def calculate_electrostatic(self, positive: bool, negative: bool):
cation_mode = "all_cation" if positive else "apparent_cation"
anion_mode = "all_anion" if negative else "apparent_anion"

cation_1_ids = filter_charge(self.subject_1_mol, cation_mode)
anion_1_ids = filter_charge(self.subject_1_mol, anion_mode)
cation_2_ids = filter_charge(self.subject_2_mol, cation_mode)
anion_2_ids = filter_charge(self.subject_2_mol, anion_mode)

df1 = self.subject_1_df
df2 = self.subject_2_df
exclude_atom = ["C", "N", "S", "P"]
cation_1_df = df1[df1.index.isin(cation_1_ids)]
anion_1_df = df1[df1.index.isin(anion_1_ids) & ~df1["atom_symbol"].isin(exclude_atom)]
cation_2_df = df2[df2.index.isin(cation_2_ids)]
anion_2_df = df2[df2.index.isin(anion_2_ids) & ~df2["atom_symbol"].isin(exclude_atom)]

for conf_num in self.conformation:
conformation_column = "conf_" + str(conf_num)

if (not cation_1_df.empty) and (not anion_2_df.empty):
print(cation_1_df.empty, anion_2_df.empty)
cation_1_tree = KDTree(cation_1_df[conformation_column].to_list())
anion_2_tree = KDTree(anion_2_df[conformation_column].to_list())

s1_as_cation = cation_1_tree.query_ball_tree(anion_2_tree, 4.5)

self.electrostatic_s1_as_cation = generate_pair_info(
s1_as_cation, cation_1_df, anion_2_df, conf_num, "electrostatic_cation"
)
else:
self.electrostatic_s1_as_cation = pd.DataFrame()

if (not cation_2_df.empty) and (not anion_1_df.empty):
cation_2_tree = KDTree(cation_2_df[conformation_column].to_list())
anion_1_tree = KDTree(anion_1_df[conformation_column].to_list())

s1_as_anion = anion_1_tree.query_ball_tree(cation_2_tree, 4.5)

self.electrostatic_s1_as_anion = generate_pair_info(
s1_as_anion, anion_1_df, cation_2_df, conf_num, "electrostatic_anion"
)
else:
self.electrostatic_s1_as_anion = pd.DataFrame()
2 changes: 1 addition & 1 deletion src/deemian/chem/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def dataframe_to_pdb_block(df: pd.DataFrame) -> str:
df_noh = df[df["atom_symbol"] != "H"]
# from https://stackoverflow.com/questions/35491274/split-a-pandas-column-of-lists-into-multiple-columns

df_pdb = pd.DataFrame(df_noh["conf_0"].to_list(), columns=["x", "y", "z"])
df_pdb = pd.DataFrame(df_noh["conf_1"].to_list(), columns=["x", "y", "z"])
df_pdb.index = df_noh.index
df_pdb = df_pdb.join(df_noh).reset_index()
df_pdb["record_type"] = "HETATM"
Expand Down
30 changes: 26 additions & 4 deletions src/deemian/engine/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pandas as pd
from rdkit.Chem import AllChem as Chem

from deemian.chem.interactions import InteractionData
from deemian.chem.reader import mol_to_dataframe
from deemian.chem.selection import mol_dataframe_selection
from deemian.chem.utility import dataframe_to_pdb_block
Expand Down Expand Up @@ -71,11 +72,32 @@ def correct_bond(self, name, template):
self.molecule[name] = Molecule(corrected_mol)
self.selection[name] = Selection(name, corrected_df, selection_pdb_block)

def add_measurement(self, name):
return self.measurement[name]
def add_measurement(self, id):
return self.measurement[id]

def calculate_interactions(self):
return 1
def calculate_interactions(self, id):
measurement = self.measurement[id]

for pair in measurement.interacting_subjects:
subject_1, subject_2 = measurement.interacting_subjects[pair]
subject_1 = self.selection[subject_1]
subject_2 = self.selection[subject_2]
subject_1_mol = self.molecule[subject_1.mol_parent]
subject_2_mol = self.molecule[subject_2.mol_parent]
subject_1_df = subject_1.mol_dataframe
subject_2_df = subject_2.mol_dataframe
conformation = measurement.conformation
if not any(conformation):
conformation = [1]

interaction_data = InteractionData(subject_1_mol, subject_2_mol, subject_1_df, subject_2_df, conformation)

interaction_type = measurement.interactions

if ("electrostatic" in interaction_type) or ("all" in interaction_type):
interaction_data.calculate_electrostatic(**measurement.ionizable)

self.interaction_details[pair] = interaction_data

def write_readable_output(self, out_file: str, presentation_id: str):
return (out_file, presentation_id)
Expand Down
2 changes: 1 addition & 1 deletion src/deemian/engine/director.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def director(steps: Tree, data: DeemianData):
elif inst.type == "conformation_range":
measurement.conformation_range(inst.start, inst.end)

data.calculate_interactions()
data.calculate_interactions(measurement_id)

elif step.data == "presentation":
instructions = step.children
Expand Down
63 changes: 63 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import pytest
import pandas as pd

from rdkit import Chem

from deemian.chem.selection import mol_dataframe_selection


@pytest.fixture
def n1():
return Chem.MolFromPDBFile("tests/data/5nzn.pdb", removeHs=False)


@pytest.fixture
def oseltamivir_corrected():
return Chem.MolFromPDBFile("tests/data/oseltamivir_corrected.pdb", removeHs=False)


@pytest.fixture
def n1_df():
"""
Parquet file originated from 5nzn.pdb, protonated using Protoss.
Loaded with deemian.chem.reader.mol_to_dataframe saved using pyarrow engine
"""

return pd.read_parquet("tests/data/5nzn.parquet.gzip")


@pytest.fixture
def spike_df():
"""
Parquet file originated from 7u0n.pdb, protonated using Protoss.
Loaded with deemian.chem.reader.mol_to_dataframe saved using pyarrow engine
"""

return pd.read_parquet("tests/data/5nzn.parquet.gzip")


@pytest.fixture
def vps4_df():
"""
Parquet file originated from 2k3w.pdb without preparation (it is NMR solution structure)
Loaded with deemian.chem.reader.mol_to_dataframe saved using pyarrow engine
"""

return pd.read_parquet("tests/data/2k3w.parquet.gzip")


@pytest.fixture
def n1_protein_A(n1_df):
protein_A_selection = [("chain", "A"), ("and", "protein")]

return mol_dataframe_selection(protein_A_selection, n1_df)


@pytest.fixture
def oseltamivir_df():
return pd.read_parquet("tests/data/oseltamivir.parquet.gzip")


@pytest.fixture
def oseltamivir_corrected_df():
return pd.read_parquet("tests/data/oseltamivir_corrected.parquet.gzip")
Binary file modified tests/data/2k3w.parquet.gzip
Binary file not shown.
Binary file modified tests/data/5nzn.parquet.gzip
Binary file not shown.
Binary file modified tests/data/7u0n.parquet.gzip
Binary file not shown.
2 changes: 1 addition & 1 deletion tests/data/n1-oseltamivir-5nzn.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ measure protein_ligand [
interactions all
ionizable positive true
ionizable negative true
between protein_A and oseltamivir
between oseltamivir and protein_A
conformation 1
]

Expand Down
Binary file modified tests/data/oseltamivir.parquet.gzip
Binary file not shown.
Binary file added tests/data/oseltamivir_corrected.parquet.gzip
Binary file not shown.
37 changes: 37 additions & 0 deletions tests/data/oseltamivir_corrected.pdb
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
HETATM 1 O1A G39 A 503 42.667 -0.483 16.465 1.00 0.00 O
HETATM 2 O1B G39 A 503 42.969 -2.501 17.343 1.00 0.00 O
HETATM 3 O10 G39 A 503 45.317 1.913 23.170 1.00 0.00 O
HETATM 4 O7 G39 A 503 44.831 -1.644 21.897 1.00 0.00 O
HETATM 5 N4 G39 A 503 42.738 2.586 20.613 1.00 0.00 N
HETATM 6 N5 G39 A 503 43.460 0.767 22.766 1.00 0.00 N
HETATM 7 C82 G39 A 503 45.681 -1.738 24.703 1.00 0.00 C
HETATM 8 C91 G39 A 503 45.274 -4.558 21.185 1.00 0.00 C
HETATM 9 C11 G39 A 503 43.775 1.710 24.951 1.00 0.00 C
HETATM 10 C81 G39 A 503 45.439 -2.940 23.838 1.00 0.00 C
HETATM 11 C9 G39 A 503 44.112 -4.068 22.043 1.00 0.00 C
HETATM 12 C7 G39 A 503 43.656 -1.490 19.779 1.00 0.00 C
HETATM 13 C3 G39 A 503 43.012 0.776 19.027 1.00 0.00 C
HETATM 14 C1 G39 A 503 42.951 -1.249 17.452 1.00 0.00 C
HETATM 15 C10 G39 A 503 44.286 1.453 23.533 1.00 0.00 C
HETATM 16 C8 G39 A 503 44.399 -2.720 22.765 1.00 0.00 C
HETATM 17 C6 G39 A 503 43.713 -1.030 21.244 1.00 0.00 C
HETATM 18 C4 G39 A 503 42.719 1.127 20.490 1.00 0.00 C
HETATM 19 C5 G39 A 503 43.759 0.470 21.389 1.00 0.00 C
HETATM 20 C2 G39 A 503 43.243 -0.700 18.788 1.00 0.00 C
CONECT 1 14
CONECT 2 14 14
CONECT 3 15 15
CONECT 4 16 17
CONECT 5 18
CONECT 6 15 19
CONECT 7 10
CONECT 8 11
CONECT 9 15
CONECT 10 16
CONECT 11 16
CONECT 12 17 20 20
CONECT 13 18 20
CONECT 14 20
CONECT 17 19
CONECT 18 19
END
24 changes: 24 additions & 0 deletions tests/test_chem_interaction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import pytest

from deemian.chem.interactions import InteractionData


@pytest.fixture
def interaction_data_n1(oseltamivir_corrected, n1, oseltamivir_corrected_df, n1_protein_A):
interaction_data = InteractionData(oseltamivir_corrected, n1, oseltamivir_corrected_df, n1_protein_A, [1])

return interaction_data


def test_interaction_data_calculate_electrostatic(interaction_data_n1):
interaction_data_n1.calculate_electrostatic(positive=True, negative=True)

assert interaction_data_n1.electrostatic_s1_as_cation.shape == (3, 17)
interaction_data_n1.electrostatic_s1_as_anion.shape == (12, 17)


def test_interaction_data_calculate_electrostatic_apparent_only(interaction_data_n1):
interaction_data_n1.calculate_electrostatic(positive=False, negative=False)

assert interaction_data_n1.electrostatic_s1_as_cation.shape == (0, 0)
assert interaction_data_n1.electrostatic_s1_as_anion.shape == (0, 0)
Loading

0 comments on commit 644ecab

Please sign in to comment.