-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: electrostatic interactions identification
- Loading branch information
Showing
21 changed files
with
394 additions
and
104 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
from numpy.linalg import norm as dist | ||
import pandas as pd | ||
from rdkit.Chem import AllChem as Chem | ||
|
||
from deemian.chem.utility import flatten_atom_list | ||
|
||
|
||
def generate_pair_info(query_result, s1_df, s2_df, conformation, interaction_type): | ||
pair_info_df = pd.DataFrame() | ||
for i, s2_list in enumerate(query_result): | ||
s1 = s1_df.iloc[[i]].reset_index() | ||
s2 = s2_df.iloc[s2_list].reset_index() | ||
s2["atom_id_1"] = s1.iloc[0].atom_id | ||
pair = s1.merge(s2, left_on="atom_id", right_on="atom_id_1", suffixes=("_s1", "_s2")).drop( | ||
columns=["atom_id_1"] | ||
) | ||
pair_info_df = pd.concat([pair_info_df, pair]) | ||
|
||
pair_info_df.reset_index(drop=True, inplace=True) | ||
|
||
conf_1 = f"conf_{conformation}_s1" | ||
conf_2 = f"conf_{conformation}_s2" | ||
pair_info_df["distance"] = (pair_info_df[conf_1] - pair_info_df[conf_2]).map(dist) | ||
pair_info_df["conformation"] = conformation | ||
pair_info_df["interaction_type"] = interaction_type | ||
|
||
return pair_info_df | ||
|
||
|
||
def filter_charge(mol: Chem.rdchem.Mol, mode: str) -> list[int]: | ||
apparent_cation = ["[$([*+1,*+2,*+3]);!$([N+]-[O-])]", "[NX3&!$([NX3]-O)]-[C]=[NX3+]"] | ||
apparent_anion = ["O=[C,S,N,P]-[O-]", "[*-1,*-2]"] | ||
apparent_cation = [Chem.MolFromSmarts(p) for p in apparent_cation] # p for pattern | ||
apparent_anion = [Chem.MolFromSmarts(p) for p in apparent_anion] | ||
|
||
potential_cation = [ | ||
"[$([N;H2&+0][C;!$(C=*)]);!$(N[a])]", | ||
"[$([N;H1&+0]([C;!$(C=*)])[C;!$(C=*)]);!$(N[a])]", | ||
"[$([N;H0&+0]([C;!$(C=*)])([C;!$(C=*)])[C;!$(C=*)]);!$(N[a])]", | ||
"NC(=N)", | ||
"[n;R1]1[c;R1][n;R1][c;R1][c;R1]1", | ||
] | ||
potential_anion = ["O=[C,S,N,P]-[OH,O-]"] | ||
potential_cation = [Chem.MolFromSmarts(p) for p in potential_cation] | ||
potential_anion = [Chem.MolFromSmarts(p) for p in potential_anion] | ||
|
||
all_cation = apparent_cation + potential_cation | ||
all_anion = apparent_anion + potential_anion | ||
|
||
filter_dictionary = dict( | ||
apparent_cation=apparent_cation, | ||
apparent_anion=apparent_anion, | ||
potential_cation=potential_cation, | ||
potential_anion=potential_anion, | ||
all_cation=all_cation, | ||
all_anion=all_anion, | ||
) | ||
|
||
combined_match = [] | ||
try: | ||
for pattern in filter_dictionary[mode]: | ||
matched = flatten_atom_list(mol.GetSubstructMatches(pattern)) | ||
combined_match.append(matched) | ||
except KeyError: | ||
raise KeyError(f"Error: filtering mode {mode} is not recognized") | ||
|
||
return flatten_atom_list(combined_match) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,43 +1,63 @@ | ||
from rdkit.Chem import AllChem as Chem | ||
from dataclasses import dataclass | ||
|
||
from deemian.chem.utility import flatten_atom_list | ||
|
||
|
||
def filter_charge(mol: Chem.rdchem.Mol, mode: str) -> list[int]: | ||
apparent_cation = ["[$([*+1,*+2,*+3]);!$([N+]-[O-])]", "[NX3&!$([NX3]-O)]-[C]=[NX3+]"] | ||
apparent_anion = ["O=[C,S,N,P]-[O-]", "[*-1,*-2]"] | ||
apparent_cation = [Chem.MolFromSmarts(p) for p in apparent_cation] # p for pattern | ||
apparent_anion = [Chem.MolFromSmarts(p) for p in apparent_anion] | ||
|
||
potential_cation = [ | ||
"[$([N;H2&+0][C;!$(C=*)]);!$(N[a])]", | ||
"[$([N;H1&+0]([C;!$(C=*)])[C;!$(C=*)]);!$(N[a])]", | ||
"[$([N;H0&+0]([C;!$(C=*)])([C;!$(C=*)])[C;!$(C=*)]);!$(N[a])]", | ||
"NC(=N)", | ||
"[n;R1]1[c;R1][n;R1][c;R1][c;R1]1", | ||
] | ||
potential_anion = ["O=[C,S,N,P]-[OH,O-]"] | ||
potential_cation = [Chem.MolFromSmarts(p) for p in potential_cation] | ||
potential_anion = [Chem.MolFromSmarts(p) for p in potential_anion] | ||
|
||
all_cation = apparent_cation + potential_cation | ||
all_anion = apparent_anion + potential_anion | ||
|
||
filter_dictionary = dict( | ||
apparent_cation=apparent_cation, | ||
apparent_anion=apparent_anion, | ||
potential_cation=potential_cation, | ||
potential_anion=potential_anion, | ||
all_cation=all_cation, | ||
all_anion=all_anion, | ||
) | ||
|
||
combined_match = [] | ||
try: | ||
for pattern in filter_dictionary[mode]: | ||
matched = flatten_atom_list(mol.GetSubstructMatches(pattern)) | ||
combined_match.append(matched) | ||
except KeyError: | ||
raise KeyError(f"Error: filtering mode {mode} is not recognized") | ||
|
||
return flatten_atom_list(combined_match) | ||
import pandas as pd | ||
from rdkit.Chem import AllChem as Chem | ||
from scipy.spatial import KDTree | ||
|
||
from deemian.chem.interaction_utility import filter_charge, generate_pair_info | ||
|
||
|
||
@dataclass | ||
class InteractionData: | ||
subject_1_mol: Chem.rdchem.Mol | ||
subject_2_mol: Chem.rdchem.Mol | ||
subject_1_df: pd.DataFrame | ||
subject_2_df: pd.DataFrame | ||
conformation: list | ||
electrostatic_s1_as_cation: pd.DataFrame = None | ||
electrostatic_s1_as_anion: pd.DataFrame = None | ||
|
||
def calculate_electrostatic(self, positive: bool, negative: bool): | ||
cation_mode = "all_cation" if positive else "apparent_cation" | ||
anion_mode = "all_anion" if negative else "apparent_anion" | ||
|
||
cation_1_ids = filter_charge(self.subject_1_mol, cation_mode) | ||
anion_1_ids = filter_charge(self.subject_1_mol, anion_mode) | ||
cation_2_ids = filter_charge(self.subject_2_mol, cation_mode) | ||
anion_2_ids = filter_charge(self.subject_2_mol, anion_mode) | ||
|
||
df1 = self.subject_1_df | ||
df2 = self.subject_2_df | ||
exclude_atom = ["C", "N", "S", "P"] | ||
cation_1_df = df1[df1.index.isin(cation_1_ids)] | ||
anion_1_df = df1[df1.index.isin(anion_1_ids) & ~df1["atom_symbol"].isin(exclude_atom)] | ||
cation_2_df = df2[df2.index.isin(cation_2_ids)] | ||
anion_2_df = df2[df2.index.isin(anion_2_ids) & ~df2["atom_symbol"].isin(exclude_atom)] | ||
|
||
for conf_num in self.conformation: | ||
conformation_column = "conf_" + str(conf_num) | ||
|
||
if (not cation_1_df.empty) and (not anion_2_df.empty): | ||
print(cation_1_df.empty, anion_2_df.empty) | ||
cation_1_tree = KDTree(cation_1_df[conformation_column].to_list()) | ||
anion_2_tree = KDTree(anion_2_df[conformation_column].to_list()) | ||
|
||
s1_as_cation = cation_1_tree.query_ball_tree(anion_2_tree, 4.5) | ||
|
||
self.electrostatic_s1_as_cation = generate_pair_info( | ||
s1_as_cation, cation_1_df, anion_2_df, conf_num, "electrostatic_cation" | ||
) | ||
else: | ||
self.electrostatic_s1_as_cation = pd.DataFrame() | ||
|
||
if (not cation_2_df.empty) and (not anion_1_df.empty): | ||
cation_2_tree = KDTree(cation_2_df[conformation_column].to_list()) | ||
anion_1_tree = KDTree(anion_1_df[conformation_column].to_list()) | ||
|
||
s1_as_anion = anion_1_tree.query_ball_tree(cation_2_tree, 4.5) | ||
|
||
self.electrostatic_s1_as_anion = generate_pair_info( | ||
s1_as_anion, anion_1_df, cation_2_df, conf_num, "electrostatic_anion" | ||
) | ||
else: | ||
self.electrostatic_s1_as_anion = pd.DataFrame() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
import pytest | ||
import pandas as pd | ||
|
||
from rdkit import Chem | ||
|
||
from deemian.chem.selection import mol_dataframe_selection | ||
|
||
|
||
@pytest.fixture | ||
def n1(): | ||
return Chem.MolFromPDBFile("tests/data/5nzn.pdb", removeHs=False) | ||
|
||
|
||
@pytest.fixture | ||
def oseltamivir_corrected(): | ||
return Chem.MolFromPDBFile("tests/data/oseltamivir_corrected.pdb", removeHs=False) | ||
|
||
|
||
@pytest.fixture | ||
def n1_df(): | ||
""" | ||
Parquet file originated from 5nzn.pdb, protonated using Protoss. | ||
Loaded with deemian.chem.reader.mol_to_dataframe saved using pyarrow engine | ||
""" | ||
|
||
return pd.read_parquet("tests/data/5nzn.parquet.gzip") | ||
|
||
|
||
@pytest.fixture | ||
def spike_df(): | ||
""" | ||
Parquet file originated from 7u0n.pdb, protonated using Protoss. | ||
Loaded with deemian.chem.reader.mol_to_dataframe saved using pyarrow engine | ||
""" | ||
|
||
return pd.read_parquet("tests/data/5nzn.parquet.gzip") | ||
|
||
|
||
@pytest.fixture | ||
def vps4_df(): | ||
""" | ||
Parquet file originated from 2k3w.pdb without preparation (it is NMR solution structure) | ||
Loaded with deemian.chem.reader.mol_to_dataframe saved using pyarrow engine | ||
""" | ||
|
||
return pd.read_parquet("tests/data/2k3w.parquet.gzip") | ||
|
||
|
||
@pytest.fixture | ||
def n1_protein_A(n1_df): | ||
protein_A_selection = [("chain", "A"), ("and", "protein")] | ||
|
||
return mol_dataframe_selection(protein_A_selection, n1_df) | ||
|
||
|
||
@pytest.fixture | ||
def oseltamivir_df(): | ||
return pd.read_parquet("tests/data/oseltamivir.parquet.gzip") | ||
|
||
|
||
@pytest.fixture | ||
def oseltamivir_corrected_df(): | ||
return pd.read_parquet("tests/data/oseltamivir_corrected.parquet.gzip") |
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
HETATM 1 O1A G39 A 503 42.667 -0.483 16.465 1.00 0.00 O | ||
HETATM 2 O1B G39 A 503 42.969 -2.501 17.343 1.00 0.00 O | ||
HETATM 3 O10 G39 A 503 45.317 1.913 23.170 1.00 0.00 O | ||
HETATM 4 O7 G39 A 503 44.831 -1.644 21.897 1.00 0.00 O | ||
HETATM 5 N4 G39 A 503 42.738 2.586 20.613 1.00 0.00 N | ||
HETATM 6 N5 G39 A 503 43.460 0.767 22.766 1.00 0.00 N | ||
HETATM 7 C82 G39 A 503 45.681 -1.738 24.703 1.00 0.00 C | ||
HETATM 8 C91 G39 A 503 45.274 -4.558 21.185 1.00 0.00 C | ||
HETATM 9 C11 G39 A 503 43.775 1.710 24.951 1.00 0.00 C | ||
HETATM 10 C81 G39 A 503 45.439 -2.940 23.838 1.00 0.00 C | ||
HETATM 11 C9 G39 A 503 44.112 -4.068 22.043 1.00 0.00 C | ||
HETATM 12 C7 G39 A 503 43.656 -1.490 19.779 1.00 0.00 C | ||
HETATM 13 C3 G39 A 503 43.012 0.776 19.027 1.00 0.00 C | ||
HETATM 14 C1 G39 A 503 42.951 -1.249 17.452 1.00 0.00 C | ||
HETATM 15 C10 G39 A 503 44.286 1.453 23.533 1.00 0.00 C | ||
HETATM 16 C8 G39 A 503 44.399 -2.720 22.765 1.00 0.00 C | ||
HETATM 17 C6 G39 A 503 43.713 -1.030 21.244 1.00 0.00 C | ||
HETATM 18 C4 G39 A 503 42.719 1.127 20.490 1.00 0.00 C | ||
HETATM 19 C5 G39 A 503 43.759 0.470 21.389 1.00 0.00 C | ||
HETATM 20 C2 G39 A 503 43.243 -0.700 18.788 1.00 0.00 C | ||
CONECT 1 14 | ||
CONECT 2 14 14 | ||
CONECT 3 15 15 | ||
CONECT 4 16 17 | ||
CONECT 5 18 | ||
CONECT 6 15 19 | ||
CONECT 7 10 | ||
CONECT 8 11 | ||
CONECT 9 15 | ||
CONECT 10 16 | ||
CONECT 11 16 | ||
CONECT 12 17 20 20 | ||
CONECT 13 18 20 | ||
CONECT 14 20 | ||
CONECT 17 19 | ||
CONECT 18 19 | ||
END |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
import pytest | ||
|
||
from deemian.chem.interactions import InteractionData | ||
|
||
|
||
@pytest.fixture | ||
def interaction_data_n1(oseltamivir_corrected, n1, oseltamivir_corrected_df, n1_protein_A): | ||
interaction_data = InteractionData(oseltamivir_corrected, n1, oseltamivir_corrected_df, n1_protein_A, [1]) | ||
|
||
return interaction_data | ||
|
||
|
||
def test_interaction_data_calculate_electrostatic(interaction_data_n1): | ||
interaction_data_n1.calculate_electrostatic(positive=True, negative=True) | ||
|
||
assert interaction_data_n1.electrostatic_s1_as_cation.shape == (3, 17) | ||
interaction_data_n1.electrostatic_s1_as_anion.shape == (12, 17) | ||
|
||
|
||
def test_interaction_data_calculate_electrostatic_apparent_only(interaction_data_n1): | ||
interaction_data_n1.calculate_electrostatic(positive=False, negative=False) | ||
|
||
assert interaction_data_n1.electrostatic_s1_as_cation.shape == (0, 0) | ||
assert interaction_data_n1.electrostatic_s1_as_anion.shape == (0, 0) |
Oops, something went wrong.