-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchemtempgen_for_amber.py
1101 lines (902 loc) · 49.2 KB
/
chemtempgen_for_amber.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import gemmi
import parmed
import json
import pathlib
import copy
import urllib.request
import time
import tempfile
import re
import atexit
import os
covalent_radius = { # from wikipedia
1: 0.31,
5: 0.84,
6: 0.76,
7: 0.71,
8: 0.66,
9: 0.57,
12: 0.00, # hack to avoid bonds with metals
14: 1.11,
15: 1.07,
16: 1.05,
17: 1.02,
# 19: 2.03,
20: 0.00,
# 24: 1.39,
25: 0.00, # hack to avoid bonds with metals
26: 0.00,
30: 0.00, # hack to avoid bonds with metals
# 34: 1.20,
35: 1.20,
53: 1.39,
}
from rdkit import Chem
from rdkit.Chem import rdmolops
from rdkit import RDLogger
import sys, logging
logger = logging.getLogger(__name__)
list_of_AD_elements_as_AtomicNum = list(covalent_radius.keys())
metal_AtomicNums = {12, 20, 25, 26, 30} # Mg: 12, Ca: 20, Mn: 25, Fe: 26, Zn: 30
# Utility Functions
def mol_contains_unexpected_element(mol: Chem.Mol, allowed_elements: list[str] = list_of_AD_elements_as_AtomicNum) -> bool:
"""Check if mol contains unexpected elements"""
for atom in mol.GetAtoms():
if atom.GetAtomicNum() not in allowed_elements:
return True
return False
def get_atom_idx_by_names(mol: Chem.Mol, wanted_names: set[str] = set()) -> set[int]:
if not wanted_names:
return set()
wanted_atoms_by_names = {atom for atom in mol.GetAtoms() if atom.GetProp('atom_id') in wanted_names}
names_not_found = wanted_names.difference({atom.GetProp('atom_id') for atom in wanted_atoms_by_names})
if names_not_found:
logger.warning(f"Molecule doesn't contain the requested atoms with names: {names_not_found} -> continue with found names... ")
return {atom.GetIdx() for atom in wanted_atoms_by_names}
def get_atom_idx_by_patterns(mol: Chem.Mol, allowed_smarts: str,
wanted_smarts_loc: dict[str, set[int]] = None,
allow_multiple: bool=False) -> set[int]:
if wanted_smarts_loc is None:
return set()
wanted_atoms_idx = set()
tmol = Chem.MolFromSmarts(allowed_smarts)
match_allowed = mol.GetSubstructMatches(tmol)
if not match_allowed:
logger.warning(f"Molecule doesn't contain allowed_smarts: {allowed_smarts} -> no pattern-based action will be made. ")
return set()
if len(match_allowed) > 1 and not allow_multiple:
logger.warning(f"Molecule contain multiple copies of allowed_smarts: {allowed_smarts} -> no pattern-based action will be made. ")
return set()
if len(match_allowed) > 1 and allow_multiple:
match_allowed = {item for sublist in match_allowed for item in sublist}
else:
match_allowed = match_allowed[0]
atoms_in_mol = [atom for atom in mol.GetAtoms()]
for wanted_smarts in wanted_smarts_loc:
lmol = Chem.MolFromSmarts(wanted_smarts)
match_wanted = mol.GetSubstructMatches(lmol)
if not match_wanted:
logger.warning(f"Molecule doesn't contain wanted_smarts: {wanted_smarts} -> continue with next pattern... ")
continue
for match_copy in match_wanted:
match_in_copy = (idx for idx in match_copy if match_copy.index(idx) in wanted_smarts_loc[wanted_smarts])
match_wanted_atoms = {atoms_in_mol[idx] for idx in match_in_copy if idx in match_allowed}
if match_wanted_atoms:
wanted_atoms_idx.update(atom.GetIdx() for atom in match_wanted_atoms)
return wanted_atoms_idx
# Mol Editing Functions
def embed(mol: Chem.Mol, allowed_smarts: str,
leaving_names: set[str] = None, leaving_smarts_loc: dict[str, set[int]] = None,
alsoHs: bool = True) -> Chem.Mol:
"""
Remove atoms from the molecule based the union of
(a) leaving_names: list of atom IDs (names), and
(b) leaving_smarts_loc: dict to map substructure SMARTS patterns with
tuple of 0-based indicies for atoms to delete (restricted by allowed_smarts)
"""
if leaving_names is None and leaving_smarts_loc is None:
return mol
leaving_atoms_idx = set()
if leaving_names:
leaving_atoms_idx.update(get_atom_idx_by_names(mol, leaving_names))
if leaving_smarts_loc:
leaving_atoms_idx.update(get_atom_idx_by_patterns(mol, allowed_smarts, leaving_smarts_loc))
if leaving_atoms_idx and alsoHs:
atoms_in_mol = [atom for atom in mol.GetAtoms()]
leaving_Hs = (ne for atom_idx in leaving_atoms_idx.copy() for ne in atoms_in_mol[atom_idx].GetNeighbors() if ne.GetAtomicNum() == 1)
leaving_atoms_idx.update(atom.GetIdx() for atom in leaving_Hs)
if not leaving_atoms_idx:
logger.warning(f"No matched atoms to delete -> embed returning original mol...")
return mol
rwmol = Chem.RWMol(mol)
for atom_idx in sorted(leaving_atoms_idx, reverse=True):
rwmol.RemoveAtom(atom_idx)
rwmol.UpdatePropertyCache()
return rwmol.GetMol()
def cap(mol: Chem.Mol, allowed_smarts: str,
capping_names: set[str] = None, capping_smarts_loc: dict[str, set[int]] = None,
protonate: bool = False) -> Chem.Mol:
"""Add hydrogens to atoms with implicit hydrogens based on the union of
(a) capping_names: list of atom IDs (names), and
(b) capping_smarts_loc: dict to map substructure SMARTS patterns with
tuple of 0-based indicies for atoms."""
if capping_names is None and capping_smarts_loc is None:
return mol
capping_atoms_idx = set()
if capping_names:
capping_atoms_idx.update(get_atom_idx_by_names(mol, capping_names))
if capping_smarts_loc:
capping_atoms_idx.update(get_atom_idx_by_patterns(mol, allowed_smarts, capping_smarts_loc, allow_multiple = True))
if not capping_atoms_idx:
logger.warning(f"No matched atoms to cap -> cap returning original mol...")
return mol
def get_max_Hid(mol: Chem.Mol) -> int:
all_Hids = (atom.GetProp('atom_id') for atom in mol.GetAtoms() if atom.GetAtomicNum()==1)
regular_ids = {Hid for Hid in all_Hids if Hid[0]=='H' and Hid[1:].isdigit()}
if len(regular_ids) > 0:
return max(int(x[1:]) for x in regular_ids)
else:
return 0
rwmol = Chem.RWMol(mol)
new_Hid = get_max_Hid(mol) + 1
atoms_in_mol = [atom for atom in mol.GetAtoms()]
for atom_idx in capping_atoms_idx:
parent_atom = atoms_in_mol[atom_idx]
if protonate:
parent_atom.SetFormalCharge(parent_atom.GetFormalCharge() + 1)
needed_Hs = parent_atom.GetNumImplicitHs()
if needed_Hs == 0:
logger.warning(f"Atom # {atom_idx} ({parent_atom.GetProp('atom_id')}) in mol doesn't have implicit Hs -> continue with next atom... ")
else:
new_atom = Chem.Atom("H")
new_atom.SetProp('atom_id', f"H{new_Hid}")
new_Hid += 1
new_idx = rwmol.AddAtom(new_atom)
rwmol.AddBond(atom_idx, new_idx, Chem.BondType.SINGLE)
rwmol.UpdatePropertyCache()
return rwmol.GetMol()
def deprotonate(mol: Chem.Mol, acidic_proton_loc: dict[str, int] = None) -> Chem.Mol:
"""Remove acidic protons from the molecule based on acidic_proton_loc"""
# acidic_proton_loc is a mapping
# keys: smarts pattern of a fragment
# value: the index (order in smarts) of the leaving proton
if acidic_proton_loc is None:
return mol
# deprotonate all matched protons
acidic_protons_idx = set()
for smarts_pattern, idx in acidic_proton_loc.items():
qmol = Chem.MolFromSmarts(smarts_pattern)
acidic_protons_idx.update(match[idx] for match in mol.GetSubstructMatches(qmol))
if not acidic_protons_idx:
logger.warning(f"Molecule doesn't contain matching atoms for acidic_proton_loc:" +
f"{acidic_proton_loc}" +
f"-> deprotonate returning original mol... ")
return mol
rwmol = Chem.RWMol(mol)
for atom_idx in sorted(acidic_protons_idx, reverse=True):
rwmol.RemoveAtom(atom_idx)
neighbors = mol.GetAtomWithIdx(atom_idx).GetNeighbors()
if neighbors:
neighbor_atom = rwmol.GetAtomWithIdx(neighbors[0].GetIdx())
neighbor_atom.SetFormalCharge(neighbor_atom.GetFormalCharge() - 1)
rwmol.UpdatePropertyCache()
return rwmol.GetMol()
def recharge(rwmol: Chem.RWMol) -> Chem.RWMol:
# Recharging metal complexes
metal_atoms = set(atom for atom in rwmol.GetAtoms() if atom.GetAtomicNum() in metal_AtomicNums)
coord_valence = {v: k for k, v_set in {
1: {1, 9, 17, 35, 53}, # monovalent: H, halogens
2: {8, 16}, # divalent: O, S
3: {5, 7, 15}, # trivalent: B, N, P (phosphine)
4: {6, 14}, # tetravalent: C, Si
}.items() for v in v_set}
bond_order_mapping = {
Chem.BondType.SINGLE: 1,
Chem.BondType.DOUBLE: 2,
Chem.BondType.TRIPLE: 3,
Chem.BondType.AROMATIC: 1.5
}
if not metal_atoms:
return rwmol
else:
# Method a: If charge is ignored at metal center
# -> charge up nonmetal coordinated atoms
# (metal ox state will be interpreted from valence of nonmetal coordinated atom)
metal_to_nonmetal_neighbors = {metal: {atom for atom in metal.GetNeighbors()
if atom.GetAtomicNum() not in metal_AtomicNums} for metal in metal_atoms}
nonmetal_coord_atoms = set(atom for v_set in metal_to_nonmetal_neighbors.values() for atom in v_set)
if any(atom.GetFormalCharge() == 0 for atom in metal_atoms):
logger.warning(f"Molecule contains metal with unspecified charge state -> charging nonmetal coordinated atoms...")
logger.warning(f"All metals will be neutralized and the nonmetal coordinated atoms will be charged according to their explicit valence. ")
for atom in metal_atoms:
atom.SetFormalCharge(0)
for atom in nonmetal_coord_atoms:
bond_sum = sum(bond_order_mapping.get(bond.GetBondType(), 0) for bond in atom.GetBonds())
atom.SetFormalCharge(bond_sum - coord_valence[atom.GetAtomicNum()])
# Method b: If charge (ox) state is specified for all metal centers
# -> ionize (break bonds w/) and charge down nonmetal coordinated atoms
# (metal ox state will be from input charge state)
else:
logger.warning(f"Molecule contains metal specified charge state -> ionizing metal-nonmetal coordinate bonds...")
logger.warning(f"All metal-nonmetal coordinate bonds will be polarized. ")
metal_bonds = {bond.GetIdx(): (bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())
for metal in metal_atoms for bond in metal.GetBonds()}
for bond_idx in sorted(metal_bonds, reverse=True):
rwmol.RemoveBond(metal_bonds[bond_idx][0], metal_bonds[bond_idx][1])
for atom in nonmetal_coord_atoms:
bond_sum = sum(bond_order_mapping.get(bond.GetBondType(), 0) for bond in atom.GetBonds())
atom.SetFormalCharge(bond_sum - coord_valence[atom.GetAtomicNum()])
# to avoid fragmentation, rebuild all metal-nonmetal bonds
for metal in metal_atoms:
for nei in metal_to_nonmetal_neighbors[metal]:
rwmol.AddBond(metal.GetIdx(), nei.GetIdx(), Chem.BondType.SINGLE)
metal.SetFormalCharge(metal.GetFormalCharge() - 1)
nei.SetFormalCharge(nei.GetFormalCharge() + 1)
logger.warning(f"Total charge of the molecule after recharging: {sum(atom.GetFormalCharge() for atom in rwmol.GetAtoms())}")
return rwmol
# Attribute Formatters
def get_smiles_with_atom_names(mol: Chem.Mol) -> tuple[str, list[str]]:
"""Generate SMILES with atom names in the order of SMILES output."""
# allHsExplicit may expose the implicit Hs of linker atoms to Smiles; the implicit Hs don't have names
smiles_exh = Chem.MolToSmiles(mol, allHsExplicit=True)
smiles_atom_output_order = mol.GetProp('_smilesAtomOutputOrder')
delimiters = ('[', ']', ',')
for delimiter in delimiters:
smiles_atom_output_order = smiles_atom_output_order.replace(delimiter, ' ')
smiles_output_order = (int(x) for x in smiles_atom_output_order.split())
atom_id_dict = {atom.GetIdx(): atom.GetProp('atom_id') for atom in mol.GetAtoms()}
atom_name = [atom_id_dict[atom_i] for atom_i in smiles_output_order]
return smiles_exh, atom_name
def get_pretty_smiles(smi: str) -> str:
"""Convert Smiles with allHsExplicit to pretty Smiles to be put on chem templates"""
# collect the inside square brackets contents
contents = set(re.findall(r'\[([^\]]+)\]', smi))
def is_nonmetal_element(symbol: str) -> bool:
"""Check if a string represents a valid chemical element."""
try:
return Chem.GetPeriodicTable().GetAtomicNumber(symbol) not in metal_AtomicNums
# rdkit throws RuntimeError if invalid
except RuntimeError:
return False
for content in contents:
# keep [H] for explicit Hs
if content == 'H':
continue
# drop H in the content to hide implicit Hs
H_stripped = content.split('H')[0]
# drop [ ] if the content is an uncharged nonmetal element symbol
if is_nonmetal_element(content) or is_nonmetal_element(H_stripped):
smi = smi.replace(f"[{content}]", f"{H_stripped}" if 'H' in content else f"{content}")
return smi
class ChemicalComponent_LoggingControler:
def __init__(self):
self.original_level = logger.level
self.rdkit_logger = RDLogger.logger()
self.default_rdkit_level = RDLogger.WARNING
self.handler = None
def __enter__(self):
self.rdkit_logger.setLevel(RDLogger.CRITICAL)
logger.setLevel(logging.WARNING)
self.handler = logging.StreamHandler(sys.stdout)
logger.addHandler(self.handler)
return self
def __exit__(self, exc_type, exc_value, traceback):
self.rdkit_logger.setLevel(self.default_rdkit_level)
logger.setLevel(self.original_level)
logger.removeHandler(self.handler)
class ChemicalComponent:
def __init__(self, rdkit_mol: Chem.Mol, resname: str, smiles_exh: str, atom_name: list[str]):
self.rdkit_mol = rdkit_mol
self.resname = resname
self.parent = resname # default parent to itself
self.smiles_exh = smiles_exh
self.atom_name = atom_name
self.link_labels = {} # default to empty dict (free molecular form)
def __eq__(self, other):
if isinstance(other, ChemicalComponent):
return self.smiles_exh == other.smiles_exh and self.atom_name == other.atom_name
return False
@classmethod
def from_lib(cls, source_lib: str, resname: str):
"""Create ChemicalComponent from a off library file and a searchable residue name in file."""
# Parse parmed ResidueTemplate object from an OFF library
residues = parmed.amber.offlib.AmberOFFLibrary.parse(source_lib)
residue = residues[resname]
def num_atom_partners(atom: parmed.Atom) -> int:
return len(atom.bond_partners) + (atom is atom.residue.head) + (atom is atom.residue.tail)
def num_current_valence(atom: parmed.Atom) -> int:
return sum(bond.order for bond in atom.bonds) + sum((atom is atom.residue.head,
atom is atom.residue.tail,
atom.formal_charge == -1)) - (atom.formal_charge == 1)
def guess_formal_charge(residue: parmed.modeller.residue.ResidueTemplate) -> parmed.modeller.residue.ResidueTemplate:
for atom in residue.atoms:
if atom.atomic_number==7:
atom.formal_charge = 0
if num_atom_partners(atom)==4:
atom.formal_charge = 1
elif num_atom_partners(atom)==1:
atom.formal_charge = -1
fp = atom.bond_partners[0]
fp.formal_charge = 1
for bond in atom.bonds:
if fp in (bond.atom1, bond.atom2):
bond.order = 2
elif atom.atomic_number==16 and num_atom_partners(atom)==1:
atom.formal_charge = -1
elif atom.atomic_number==8 and num_atom_partners(atom)==1 and atom.formal_charge is None:
fp = atom.bond_partners[0]
sps = set(sp for sp in fp.bond_partners if sp is not atom)
blunt_sp = set(sp for sp in sps if sp.atomic_number==8 and num_atom_partners(sp)==1)
if len(blunt_sp)==0: # Phenolate
if fp.type=='CA':
atom.formal_charge = -1
else:
atom.formal_charge = 0
for bond in atom.bonds:
if fp in (bond.atom1, bond.atom2):
bond.order = 2
elif len(blunt_sp)==1:
if fp.atomic_number in (6, 15): # R-C(=O)([O-]), R-P(R)(=O)([O-])
atom.formal_charge = 0
for bond in atom.bonds:
if fp in (bond.atom1, bond.atom2):
bond.order = 2
for sp in blunt_sp:
sp.formal_charge = -1
fp.formal_charge = 0
elif len(blunt_sp)==2:
if fp.atomic_number == 15: # R-P(=O)([O-])([O-])
atom.formal_charge = 0
for bond in atom.bonds:
if fp in (bond.atom1, bond.atom2):
bond.order = 2
for sp in blunt_sp:
sp.formal_charge = -1
elif atom.formal_charge is None:
atom.formal_charge = 0
return residue
def guess_bond_order(residue: parmed.modeller.residue.ResidueTemplate) -> parmed.modeller.residue.ResidueTemplate:
ref_valence = {
8: 2, # divalent: O
7: 3, # trivalent: N
6: 4, # tetravalent: C
}
unspec_bonds = set(
bond for bond in residue.bonds
if bond.order == 1 and not any(a.atomic_number == 1 for a in (bond.atom1, bond.atom2))
)
for bond in unspec_bonds.copy():
if set(a.name for a in (bond.atom1, bond.atom2)) == {'NY','CY'}:
bond.order = 3
unspec_bonds.remove(bond)
if any(a.atomic_number not in ref_valence for a in (bond.atom1, bond.atom2)) or \
any(num_current_valence(a) == ref_valence[a.atomic_number] for a in (bond.atom1, bond.atom2)):
unspec_bonds.remove(bond)
import networkx as nx
def find_longest_valid_path(graph):
longest_path = []
# Helper function to perform DFS
def dfs(current_node, current_path, visited_edges, node_visits):
nonlocal longest_path
# Check if the path is valid (even length) and if it's the longest valid path so far
if (len(current_path) == len(node_visits) and # Path length
len(current_path) - 1 > len(longest_path) and # Longer than previous longest
all(v <= (2 if node == current_path[0] else 1) for node, v in node_visits.items()) and # Start node can be visited twice
all(graph.degree[node] == 1 if node_visits[node] == 2 else True for node in current_path) # Degree 1 nodes as start/end
):
# Update longest path, making sure to copy current_path
longest_path = list(current_path)
print(f"{resname}: New Longest Valid Path Found: {longest_path}")
# Traverse neighbors
for neighbor in sorted(graph.neighbors(current_node)): # Sorted for consistent traversal
edge = frozenset([current_node, neighbor]) # Treat edge as unordered set
if edge not in visited_edges:
# Visit edge
visited_edges.add(edge)
current_path.append(neighbor)
node_visits[neighbor] += 1
# Recursive DFS
dfs(neighbor, current_path, visited_edges, node_visits)
# Backtrack
visited_edges.remove(edge)
node_visits[neighbor] -= 1
current_path.pop()
# Special condition: check if returning to the start node with even length
if (current_node == current_path[0] and
len(current_path) % 2 == 0 and
len(current_path) - 1 > len(longest_path)):
longest_path = list(current_path)
print(f"{resname}: Loop Back to Start with Even Length: {longest_path}")
# Initialize node visit counts
node_visits = {node: 0 for node in graph.nodes}
# Start DFS from all nodes (since start/end nodes can be visited differently)
for start_node in graph.nodes:
node_visits[start_node] += 1
dfs(start_node, [start_node], set(), node_visits)
node_visits[start_node] -= 1
return longest_path
def find_isolated_subgraphs(graph):
connected_components = list(nx.connected_components(graph))
isolated_subgraphs = [graph.subgraph(component).copy() for component in connected_components]
return isolated_subgraphs
G = nx.Graph()
atoms_in_residue = residue.atoms
unspec_bonds_by_idx_tuple = {
frozenset([atoms_in_residue.index(bond.atom1), atoms_in_residue.index(bond.atom2)]): bond
for bond in unspec_bonds
}
G.add_edges_from(unspec_bonds_by_idx_tuple.keys())
sorted_isolated_subgraphs = sorted(find_isolated_subgraphs(G), key=lambda subgraph: len(subgraph.nodes()), reverse=True)
for idx, g in enumerate(sorted_isolated_subgraphs):
leaf_nodes = [node for node in g.nodes() if g.degree(node) == 1]
print(resname, "Leaf Nodes", leaf_nodes)
if len(leaf_nodes) > 2:
print("too many leaf nodes")
three_degree_nodes = [node for node in g.nodes() if g.degree(node) == 3]
distance = {}
for i in leaf_nodes:
distance[i] = min(nx.shortest_path_length(g, source=i, target=j) for j in three_degree_nodes)
print(distance)
leaf_nodes_togo = [k for k in distance if distance[k]==1]
if len(leaf_nodes_togo) < len(leaf_nodes) - 2:
raise RuntimeError("cannot sanitize bond path")
else:
for idx in leaf_nodes_togo[:len(leaf_nodes) - 2]:
atom = atoms_in_residue[idx]
for bond in atom.bonds:
if bond.order == 1:
fp = ({bond.atom1, bond.atom2} - {atom}).pop()
if fp.atomic_number == 7 and \
fp.formal_charge == 0 and \
num_atom_partners(fp) == 3 and \
num_current_valence(atom) == 3:
fp.formal_charge = 1
for bond in atom.bonds:
if fp in (bond.atom1, bond.atom2):
bond.order = 2
print("Setting C=[N+]:",
atom.name,
"-",
fp.name,
)
g.remove_node(idx)
bond_path = find_longest_valid_path(g)
print(resname, "path", bond_path)
if len(bond_path)>=2:
print("Found path:",
"-".join([atoms_in_residue[atom_idx].name for atom_idx in bond_path]))
bonds_key_in_chain = [frozenset([i, j]) for i, j in zip(bond_path, bond_path[1:])]
double_bonds_keys = bonds_key_in_chain[0::2]
for bond_idx in double_bonds_keys:
unspec_bonds_by_idx_tuple[bond_idx].order = 2
print("Setting double bond:",
atoms_in_residue[list(bond_idx)[0]].name,
"=",
atoms_in_residue[list(bond_idx)[1]].name,
)
for atom in residue.atoms:
if atom.atomic_number == 6 and num_current_valence(atom) < ref_valence[atom.atomic_number]:
for bond in atom.bonds:
if bond.order == 1:
fp = ({bond.atom1, bond.atom2} - {atom}).pop()
if fp.atomic_number == 7 and \
fp.formal_charge == 0 and \
num_atom_partners(fp) == 3 and \
num_current_valence(atom) == 3:
fp.formal_charge = 1
for bond in atom.bonds:
if fp in (bond.atom1, bond.atom2):
bond.order = 2
print("Setting C=[N+]:",
atom.name,
"-",
fp.name,
)
for atom in residue.atoms:
if atom.atomic_number == 7 and num_current_valence(atom) > ref_valence[atom.atomic_number]:
atom.formal_charge = 1
print("Charging N+:",
atom.name,
)
return residue
residue = guess_formal_charge(residue)
residue = guess_bond_order(residue)
# Summon rdkit atoms into empty RWMol
rwmol = Chem.RWMol()
for atom in residue.atoms:
if atom.atomic_number < 0:
raise RuntimeError(f"Corrupt atomic number for {atom.name} in {resname}")
rdkit_atom = Chem.Atom(atom.atomic_number)
rdkit_atom.SetProp('atom_id', atom.name)
rdkit_atom.SetFormalCharge(atom.formal_charge)
rwmol.AddAtom(rdkit_atom)
last_idx = len(rwmol.GetAtoms())
# cap head and tail atoms
if atom is residue.head or atom is residue.tail:
H_name = 'H_h' if atom is residue.head else 'H_t'
H_atom = Chem.Atom('H')
H_atom.SetProp('atom_id', H_name)
rwmol.AddAtom(H_atom)
rwmol.AddBond(last_idx - 1, last_idx, Chem.BondType.SINGLE)
# pass formal charge
rdkit_atom.SetFormalCharge(atom.formal_charge)
# Check if rwmol contains unexpected elements
if mol_contains_unexpected_element(rwmol):
logger.warning(f"Molecule contains unexpected elements -> template for {resname} will be None. ")
return None
# Map atom_id (atom names) with rdkit idx
name_to_idx_mapping = {atom.GetProp('atom_id'): idx for (idx, atom) in enumerate(rwmol.GetAtoms())}
# Connect atoms by bonds
bond_type_mapping = {
1: Chem.BondType.SINGLE,
2: Chem.BondType.DOUBLE,
3: Chem.BondType.TRIPLE,
}
# Connect atoms by bonds
for bond in residue.bonds:
atom1, atom2 = (bond.atom1, bond.atom2)
rwmol.AddBond(name_to_idx_mapping[atom1.name],
name_to_idx_mapping[atom2.name],
bond_type_mapping.get(bond_type_mapping[bond.order], Chem.BondType.UNSPECIFIED))
# Finish eidting mol
try:
rwmol.UpdatePropertyCache()
except Exception as e:
logger.error(f"Failed to create rdkitmol from lib. Error: {e} -> template for {resname} will be None. ")
return None
# Check implicit Hs
total_implicit_hydrogens = sum(atom.GetNumImplicitHs() for atom in rwmol.GetAtoms())
if total_implicit_hydrogens > 0:
print(resname, Chem.MolToSmiles(rwmol))
logger.error(f"rdkitmol from lib has implicit hydrogens. -> template for {resname} will be None. ")
return None
rdkit_mol = rwmol.GetMol()
# Get Smiles with explicit Hs and ordered atom names
smiles_exh, atom_name = get_smiles_with_atom_names(rdkit_mol)
return cls(rdkit_mol, resname, smiles_exh, atom_name)
@classmethod
def from_cif(cls, source_cif: str, resname: str):
"""Create ChemicalComponent from a chemical component dict file and a searchable residue name in file."""
# Locate block by resname
doc = gemmi.cif.read(source_cif)
block = doc.find_block(resname)
# Populate atom table
atom_category = '_chem_comp_atom.'
atom_attributes = ['atom_id', # atom names
'type_symbol', # atom elements
'charge', # (atomic) formal charges
'pdbx_leaving_atom_flag', # flags for leaving atoms after polymerization
]
atom_table = block.find(atom_category, atom_attributes)
atom_cols = {attr: atom_table.find_column(f"{atom_category}{attr}") for attr in atom_attributes}
# Summon rdkit atoms into empty RWMol
rwmol = Chem.RWMol()
atom_elements = atom_cols['type_symbol']
for idx, element in enumerate(atom_elements):
if len(element)==2:
element = element[0] + element[1].lower()
rdkit_atom = Chem.Atom(element)
for attr in atom_attributes:
rdkit_atom.SetProp(attr, atom_cols[attr][idx])
# strip double quotes in names
raw_name = rdkit_atom.GetProp('atom_id')
rdkit_atom.SetProp('atom_id', raw_name.strip('"'))
target_charge = atom_cols['charge'][idx]
if target_charge!='0':
rdkit_atom.SetFormalCharge(int(target_charge)) # this needs to be int for rdkit
rwmol.AddAtom(rdkit_atom)
# Check if rwmol contains unexpected elements
if mol_contains_unexpected_element(rwmol):
logger.warning(f"Molecule contains unexpected elements -> template for {resname} will be None. ")
return None
# Map atom_id (atom names) with rdkit idx
name_to_idx_mapping = {atom.GetProp('atom_id'): idx for (idx, atom) in enumerate(rwmol.GetAtoms())}
# Populate bond table
bond_category = '_chem_comp_bond.'
bond_attributes = ['atom_id_1', # atom name 1
'atom_id_2', # atom name 2
'value_order', # bond order
]
bond_table = block.find(bond_category, bond_attributes)
bond_cols = {attr: bond_table.find_column(f"{bond_category}{attr}") for attr in bond_attributes}
# Connect atoms by bonds
bond_type_mapping = {
'SING': Chem.BondType.SINGLE,
'DOUB': Chem.BondType.DOUBLE,
'TRIP': Chem.BondType.TRIPLE,
'AROM': Chem.BondType.AROMATIC
}
bond_types = bond_cols['value_order']
for bond_i, bond_type in enumerate(bond_types):
rwmol.AddBond(name_to_idx_mapping[bond_cols['atom_id_1'][bond_i].strip('"')],
name_to_idx_mapping[bond_cols['atom_id_2'][bond_i].strip('"')],
bond_type_mapping.get(bond_type, Chem.BondType.UNSPECIFIED))
# Try recharging mol (for metals)
try:
rwmol = recharge(rwmol)
except Exception as e:
logger.error(f"Failed to recharge rdkitmol. Error: {e} -> template for {resname} will be None. ")
return None
# Finish eidting mol
try:
rwmol.UpdatePropertyCache()
except Exception as e:
logger.error(f"Failed to create rdkitmol from cif. Error: {e} -> template for {resname} will be None. ")
return None
# Check implicit Hs
total_implicit_hydrogens = sum(atom.GetNumImplicitHs() for atom in rwmol.GetAtoms())
if total_implicit_hydrogens > 0:
logger.error(f"rdkitmol from cif has implicit hydrogens. -> template for {resname} will be None. ")
return None
rdkit_mol = rwmol.GetMol()
# Get Smiles with explicit Hs and ordered atom names
smiles_exh, atom_name = get_smiles_with_atom_names(rdkit_mol)
return cls(rdkit_mol, resname, smiles_exh, atom_name)
def make_canonical(self, acidic_proton_loc):
"""Deprotonate acidic groups til the canonical (most deprotonated) state."""
self.rdkit_mol = deprotonate(self.rdkit_mol, acidic_proton_loc = acidic_proton_loc)
return self
def make_embedded(self, allowed_smarts, leaving_names = None, leaving_smarts_loc = None):
"""Remove leaving atoms from the molecule by atom names and/or patterns."""
self.rdkit_mol = embed(self.rdkit_mol, allowed_smarts = allowed_smarts,
leaving_names = leaving_names, leaving_smarts_loc = leaving_smarts_loc)
return self
def make_capped(self, allowed_smarts, capping_names = None, capping_smarts_loc = None, protonate = None):
"""Build and name explicit hydrogens for atoms with implicit Hs by atom names and/or patterns."""
self.rdkit_mol = cap(self.rdkit_mol, allowed_smarts = allowed_smarts,
capping_names = capping_names, capping_smarts_loc = capping_smarts_loc,
protonate = protonate)
return self
def make_pretty_smiles(self):
"""Build and name explicit hydrogens for atoms with implicit Hs by atom names and/or patterns."""
self.smiles_exh, self.atom_name = get_smiles_with_atom_names(self.rdkit_mol)
self.smiles_exh = get_pretty_smiles(self.smiles_exh)
return self
def make_link_labels_from_patterns(self, pattern_to_label_mapping = {}):
"""Map patterns to link labels based on a given mapping."""
if not pattern_to_label_mapping:
return self
for pattern in pattern_to_label_mapping:
atom_idx = get_atom_idx_by_patterns(self.rdkit_mol, allowed_smarts = Chem.MolToSmarts(self.rdkit_mol),
wanted_smarts_loc = {pattern: {0}})
atoms_in_mol = [atom for atom in self.rdkit_mol.GetAtoms()]
if not atom_idx:
logger.warning(f"Molecule doesn't contain pattern: {pattern} -> linker label for {pattern_to_label_mapping[pattern]} will not be made. ")
elif len(atom_idx) > 1:
logger.warning(f"Molecule contain multiple copies of pattern: {pattern} -> linker label for {pattern_to_label_mapping[pattern]} will not be made. ")
else:
atom_idx = next(iter(atom_idx))
name = atoms_in_mol[atom_idx].GetProp('atom_id')
self.link_labels.update({str(self.atom_name.index(name)): pattern_to_label_mapping[pattern]})
return self
def make_link_labels_from_names(self, name_to_label_mapping = {}):
"""Map atom names to link labels based on a given mapping."""
if not name_to_label_mapping:
return self
for atom in self.rdkit_mol.GetAtoms():
if atom.GetProp('atom_id') in name_to_label_mapping:
if atom.GetNumImplicitHs() > 0:
name = atom.GetProp('atom_id')
self.link_labels.update({str(self.atom_name.index(name)): name_to_label_mapping[name]})
return self
def ResidueTemplate_check(self) -> bool:
# ResidueTemplate.check from linked_rdkit_chorizo
ps = Chem.SmilesParserParams()
ps.removeHs = False
mol = Chem.MolFromSmiles(self.smiles_exh, ps)
have_implicit_hs = {atom.GetIdx() for atom in mol.GetAtoms() if atom.GetTotalNumHs() > 0}
if self.link_labels:
int_labels = {int(atom_id) for atom_id in self.link_labels}
else:
int_labels = set()
if int_labels != have_implicit_hs:
raise ValueError(
f"expected any atom with non-real Hs ({have_implicit_hs}) to tagged in link_labels ({int_labels})"
)
if not self.atom_name:
return
if len(self.atom_name) != mol.GetNumAtoms():
raise ValueError(f"{len(self.atom_name)=} differs from {mol.GetNumAtoms()=}")
return
# Export/Writer Function
def export_chem_templates_to_json(cc_list: list[ChemicalComponent], json_fname: str=""):
"""Export list of chem templates to json"""
basenames = [cc.parent for cc in cc_list]
ambiguous_dict = {basename: [] for basename in basenames}
for cc in cc_list:
if cc.resname not in ambiguous_dict[cc.parent]:
ambiguous_dict[cc.parent].append(cc.resname)
data_to_export = {"ambiguous": {basename: basename+".ambiguous" for basename in basenames}}
residue_templates = {}
for cc in cc_list:
residue_templates[cc.resname] = {
"smiles": cc.smiles_exh,
"atom_name": cc.resname+".atom_names",
}
if cc.link_labels:
residue_templates[cc.resname]["link_labels"] = cc.resname+".link_labels"
else:
residue_templates[cc.resname]["link_labels"] = {}
data_to_export.update({"residue_templates": residue_templates})
json_str = json.dumps(data_to_export, indent = 4)
# format ambiguous resnames to one line
for basename in data_to_export["ambiguous"]:
single_line_resnames = json.dumps(ambiguous_dict[basename], separators=(', ', ': '))
json_str = json_str.replace(json.dumps(data_to_export["ambiguous"][basename], indent = 4), single_line_resnames)
# format link_labels and atom_name to one line
for cc in cc_list:
single_line_atom_name = json.dumps(cc.atom_name, separators=(', ', ': '))
json_str = json_str.replace(json.dumps(data_to_export["residue_templates"][cc.resname]["atom_name"], indent = 4), single_line_atom_name)
if cc.link_labels:
single_line_link_labels = json.dumps(cc.link_labels, separators=(', ', ': '))
json_str = json_str.replace(json.dumps(data_to_export["residue_templates"][cc.resname]["link_labels"], indent = 4), single_line_link_labels)
if json_fname:
with open(pathlib.Path(json_fname), 'w') as f:
f.write(json_str)
print(f"{json_fname} <-- Json File for New Chemical Templates")
else:
print(" New Template Built ".center(60, "*"))
print(json_str)
print("*"*60)
return json_str
def fetch_from_pdb(resname: str, max_retries = 5, backoff_factor = 2) -> str:
url = f"https://files.rcsb.org/ligands/download/{resname}.cif"
file_path = f"{resname}.cif"
for retry in range(max_retries):
try:
with urllib.request.urlopen(url) as response:
content = response.read()
logger.info(f"File downloaded successfully: {file_path}")
with tempfile.NamedTemporaryFile(delete=False, suffix=".cif") as temp_file:
temp_file.write(content)
atexit.register(
lambda: os.remove(temp_file.name)
if temp_file.name and os.path.exists(temp_file.name)
else None
)
return temp_file.name
except Exception as e:
if isinstance(e, urllib.error.HTTPError) and e.getcode() == 404:
raise RuntimeError(f"Ligand {resname} not available from rcsb.org") from e
elif retry < max_retries - 1:
wait_time = backoff_factor ** retry
logger.info(f"Download failed for {resname}. Error: {e}. Retrying in {wait_time} seconds...")
time.sleep(wait_time)
else:
err = f"Max retries reached. Could not download CIF file for {resname}. Error: {e}"
raise RuntimeError(err) from e
# Constants for deprotonate
acidic_proton_loc_canonical = {
# any carboxylic acid, sulfuric/sulfonic acid/ester, phosphoric/phosphinic acid/ester
'[H][O]['+atom+'](=O)': 0 for atom in ('CX3', 'SX4', 'SX3', 'PX4', 'PX3')
} | {
# any thio carboxylic/sulfuric acid
'[H][O]['+atom+'](=S)': 0 for atom in ('CX3', 'SX4')
} | {
'[H][SX2][a]': 0, # thiophenol
}
# Make free (noncovalent) CC
def build_noncovalent_CC(basename: str, acidic_proton_loc = acidic_proton_loc_canonical) -> ChemicalComponent:
with ChemicalComponent_LoggingControler():
cc_from_cif = ChemicalComponent.from_cif(fetch_from_pdb(basename), basename)
if cc_from_cif is None:
return None
cc = copy.deepcopy(cc_from_cif)
logger.info(f"*** using CCD ligand {basename} to construct residue {cc.resname} ***")
cc = cc.make_canonical(acidic_proton_loc = acidic_proton_loc)
if len(rdmolops.GetMolFrags(cc.rdkit_mol))>1:
err = f"Template Generation failed for {cc.resname}. Error: Molecule breaks into fragments during the deleterious editing. "
raise RuntimeError(err)
cc = cc.make_pretty_smiles()
# Check
try:
cc.ResidueTemplate_check()
except Exception as e:
err = f"Template {cc.resname} Failed to pass ResidueTemplate check. Error: {e}"
raise RuntimeError(err)
logger.info(f"*** finish making {cc.resname} ***")
return cc
def add_variants(cc_orig: ChemicalComponent, cc_list: list[ChemicalComponent] = [],
embed_allowed_smarts: str = None,
cap_allowed_smarts: str = None, cap_protonate: bool = False,
pattern_to_label_mapping_standard = dict[str, str],
variant_dict = dict[str, tuple],
acidic_proton_loc = acidic_proton_loc_canonical) -> list[ChemicalComponent]:
for suffix in variant_dict:
cc = copy.deepcopy(cc_orig)
cc.resname += suffix
logger.info(f"*** using CCD residue {cc.parent} to construct {cc.resname} ***")
cc = (
cc
.make_canonical(acidic_proton_loc = acidic_proton_loc)
.make_embedded(allowed_smarts = embed_allowed_smarts,
leaving_smarts_loc = variant_dict[suffix][0])
)
if len(rdmolops.GetMolFrags(cc.rdkit_mol))>1:
logger.warning(f"Molecule breaks into fragments during the deleterious editing of {cc.resname} -> skipping the vaiant... ")
continue
cc = (
cc
.make_capped(allowed_smarts = cap_allowed_smarts,
capping_smarts_loc = variant_dict[suffix][1],