Skip to content

Commit

Permalink
Added another alphafold mining example for cysteines in C elegans tra…
Browse files Browse the repository at this point in the history
…nsmembrane proteins.
  • Loading branch information
Tom Goddard committed Jan 23, 2024
1 parent f5ab711 commit ed550d4
Show file tree
Hide file tree
Showing 5 changed files with 225 additions and 2 deletions.
10 changes: 8 additions & 2 deletions alphafold_mining/af_mining.md
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ Here is the ChimeraX Python script that does the search [cyssearch.py](cyssearch

# Modifications

The script can easily be modified to instead find all C elegans proteins with at least 4 transmembrane helices where one of those helices has at least 5 cysteines. And we can load those structures with the transmembrane helices selected. Here is the script [cys3search.py](cys3search.py) that does that, and the 12 structures found are listed and shown below.
The script can easily be modified to instead find all C elegans proteins with at least 4 transmembrane helices where one of those helices has at least 5 cysteines. And we can load those structures with the transmembrane helices selected. Here is the script [cyssearch_v2.py](cyssearch_v2.py) that does that, and the 12 structures found are listed and shown below.

19827 UniProt entries
5756 entries with annotated transmembrane regions
Expand All @@ -216,6 +216,12 @@ The script can easily be modified to instead find all C elegans proteins with at
Q9NEQ0,,4,5
Q9XXR9,SSD domain-containing protein,11,5

<img src="5cys_4helix.png" width="800">
<img src="5cys_4helix.png" width="1000">

# More modifications

Denke Ma suggested filtering to proteins with at least 4 transmembrane helices and looking for pairs of cysteines separated by 0, 1, or 2 residues (CC, CxC, CxxC) in each helix and requiring two helices to have pairs with distance at most 10 Angstroms from a cysteine SG atom in one helix to the other. That gave 53 proteins [results_v3.txt](results_v3.txt) using script [cyssearch_v3.py](cyssearch_v3.py).

<img src="results_v3.jpg" width="1000">

Tom Goddard, January 19, 2024
File renamed without changes.
154 changes: 154 additions & 0 deletions alphafold_mining/cyssearch_v3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
# Dengke Ma wants to find all C elegans proteins with pairs of close cysteines in transmembrane regions.
# Can use UniProt to identify transmembrane residues, then use AlphaFold database predicted structures
# to see if there are close cysteines.

def find_uniprot_transmembrane_cysteines(uniprot_xml_path, namespace = '{http://uniprot.org/uniprot}'):
import xml.etree.ElementTree as ET
tree = ET.parse(uniprot_xml_path)
tm = []
for child in tree.getroot():
if child.tag == namespace + 'entry':
rr = transmembrane_residue_ranges(child, namespace)
uniprot_id = child.find(namespace + 'accession').text
seq = child.find(namespace + 'sequence').text
full_name = child.find(f'./{namespace}protein/{namespace}submittedName/{namespace}fullName')
name = '' if full_name is None else full_name.text
phc = paired_helix_cys(seq, rr)
tm.append((uniprot_id, name, phc, rr))
return tm

def paired_helix_cys(seq, rr):
# Dengke suggests considering only paired cysteines CC, CxC or CxxC in a helix.
phc = []
for b,e in rr:
ci = set([i+1 for i in range(b-1,e) if seq[i] == 'C'])
pci = [i for i in ci
if ((i+1) in ci or (i+2) in ci or (i+3) in ci or
(i-1) in ci or (i-2) in ci or (i-3) in ci)]
if len(pci) >= 2:
phc.append(pci)
return phc

def transmembrane_residue_ranges(protein_xml_entry, namespace):
ranges = []
for feature in protein_xml_entry.iter(namespace + 'feature'):
fattrib = feature.attrib
if 'type' in fattrib and fattrib['type'] == 'transmembrane region':
for loc in feature.iter(namespace + 'location'):
b,e = loc.find(namespace + 'begin'), loc.find(namespace + 'end')
if b is not None and e is not None:
if 'position' in b.attrib and 'position' in e.attrib:
r = (int(b.attrib['position']), int(e.attrib['position']))
ranges.append(r)
return ranges

def close_cysteines(structure, membrane_residue_ranges, max_distance = 5):
cys_res = [r for r in structure.residues if r.name == 'CYS']
cys_xyz = [(r.number, r.find_atom('SG').coord) for r in cys_res]
mb_res_nums = residue_numbers_from_ranges(membrane_residue_ranges)
mb_cys = [r for r in cys_res if r.number in mb_res_nums]
mb_xyz = [(r.number, r.find_atom('SG').coord) for r in mb_cys]

close_pairs = set()
from chimerax.geometry import distance
for rnum, xyz in mb_xyz:
for rnum2, xyz2 in cys_xyz:
if rnum2 != rnum and distance(xyz, xyz2) <= max_distance:
pair = (rnum, rnum2) if rnum < rnum2 else (rnum2, rnum)
close_pairs.add(pair)

return list(close_pairs)

def atoms_by_residue_number(atoms, atom_name):
amap = {}
for a in atoms:
if a.name == atom_name:
amap[a.residue.number] = a
return amap

def residue_numbers_from_ranges(residue_ranges):
res_nums = set()
for b,e in residue_ranges:
for rnum in range(b,e+1):
res_nums.add(rnum)
return res_nums

def check_for_close_cysteines(session, ulist, alphafold_dir, max_distance):
found = []
missing = []
for uniprot_id, name, paired_hel_cys, tm_res_ranges in ulist:
if len(paired_hel_cys) < 2:
continue
m = alphafold_database_model(session, uniprot_id, alphafold_dir)
if m is None:
missing.append((uniprot_id, name))
continue
close_pairs = []
atoms = atoms_by_residue_number(m.atoms, 'SG')
from chimerax.geometry import distance
for i,ph1 in enumerate(paired_hel_cys):
for ph2 in paired_hel_cys[i+1:]:
for rnum1 in ph1:
for rnum2 in ph2:
if distance(atoms[rnum1].coord, atoms[rnum2].coord) <= max_distance:
close_pairs.append((rnum1, rnum2))
if close_pairs:
found.append((uniprot_id, name, close_pairs, tm_res_ranges))
m.delete()
return found, missing

def alphafold_database_model(session, uniprot_id, alphafold_dir):
filename = f'AF-{uniprot_id}-F1-model_v4.cif'
from os.path import join, exists
path = join(alphafold_dir, filename)
if not exists(path):
return None
from chimerax.mmcif import open_mmcif
s, msg = open_mmcif(session, path)
return s[0]

def open_entries(session, entries, alphafold_dir):
models = []
for uniprot_id, name, close_pairs, tm_res_ranges in entries:
m = alphafold_database_model(session, uniprot_id, alphafold_dir)
models.append(m)
# Select transmembrane residues
rnums = residue_numbers_from_ranges(tm_res_ranges)
for r in m.residues:
if r.number in rnums:
r.atoms.selected = True
session.models.add(models)

uniprot_xml_path = 'UP000001940_6239.xml'
alphafold_dir = 'alphafold_models'
max_distance = 10

ulist = find_uniprot_transmembrane_cysteines(uniprot_xml_path)
print(f'{len(ulist)} UniProt entries')
ntm = len([uniprot_id for uniprot_id, name, paired_hel_cys, tm_res_ranges in ulist if tm_res_ranges])
ntm4 = len([uniprot_id for uniprot_id, name, paired_hel_cys, tm_res_ranges in ulist if len(tm_res_ranges)>=4])
print(f'{ntm} entries with annotated transmembrane regions')
print(f'{ntm4} entries with 4 or more transmembrane helices')
ntm4p = len([uniprot_id for uniprot_id, name, paired_hel_cys, tm_res_ranges in ulist
if len(tm_res_ranges)>=4 and len(paired_hel_cys)>=2])
print(f'{ntm4p} entries with 4 or more transmembrane helices and at least two with CC, CxC or CxxC cysteine pairs')

uclose, missing = check_for_close_cysteines(session, ulist, alphafold_dir, max_distance)
print(f'{len(uclose)} with paired cysteines in two helices closer than {max_distance}A')

entries = []
for uniprot_id, name, close_pairs, tm_res_ranges in uclose:
rpairs = ' '.join(f'{r1}:{r2}' for r1,r2 in close_pairs)
tmranges = ' '.join(f'{r1}-{r2}' for r1,r2 in tm_res_ranges)
entries.append(f'{uniprot_id},{name},{rpairs},{tmranges}')

print()
print('# UniProt ID, protein name, residue numbers of close paired cysteines, transmembrane ranges')
print('\n'.join(entries))
print()

if missing:
me = '\n'.join(f'{uniprot_id},{name}' for uniprot_id, name in missing)
print(f'No alphafold model for {len(missing)} entries with cysteine pairs:\n{me}')

open_entries(session, uclose, alphafold_dir)
Binary file added alphafold_mining/results_v3.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
63 changes: 63 additions & 0 deletions alphafold_mining/results_v3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
open cyssearch_v3.py
# Took 8 seconds

19827 UniProt entries
5756 entries with annotated transmembrane regions
3177 entries with 4 or more transmembrane helices
190 entries with 4 or more transmembrane helices and at least two with CC, CxC or CxxC cysteine pairs
53 with paired cysteines in two helices closer than 10A

# UniProt ID, protein name, residue numbers of close paired cysteines, transmembrane ranges
O17230,Seven TM Receptor,92:203,6-27 39-62 82-108 129-149 197-223 243-267 273-292
Q9U1P9,Lysosomal cobalamin transporter,108:371,104-127 139-161 167-185 232-251 366-391 431-454
G5EFS0,Cation/H+ exchanger domain-containing protein,227:320 227:323,25-48 68-85 92-110 122-141 153-178 184-205 217-237 257-278 299-327 347-364 376-395 407-431 443-463
Q93340,Candidate tumor suppressor protein,8:145,7-30 42-59 80-106 118-149
Q5F4U1,Serpentine Receptor, class Z,130:208 130:205 131:208,12-38 58-80 92-114 120-140 152-173 193-214 235-255 261-286
Q19983,,90:274 90:275 91:274,12-35 55-79 86-108 251-276
O16329,Seven TM Receptor,97:146 94:146 94:149,12-33 45-69 89-113 134-154 203-226 246-269 289-307
Q960A0,,428:481,25-45 70-90 120-140 159-179 199-218 253-273 296-316 329-349 377-397 421-441 475-495 513-533
Q9U2H5,Serpentine Receptor, class T,85:277,33-57 69-93 105-127 147-165 203-222 234-256 268-290
Q9XUC8,Serpentine Receptor, class T,87:280,36-59 71-95 107-129 149-169 208-228 240-263 269-292
Q19508,,192:247,9-29 42-62 91-111 129-149 185-205 239-259 269-289
Q9U1V0,CLaudin-like in Caenorhabditis,166:233,16-36 135-157 164-190 210-235
Q9N5L5,G-protein coupled receptors family 1 profile domain-containing protein,34:59 35:59,19-40 47-71 91-118 139-162 196-219 240-261 281-300
A0A0K3AXT4,Serpentine Receptor, class T,90:133 90:134 93:133,51-71 83-111 117-143 163-181 211-233 253-274 280-305
Q9N483,Seven TM Receptor,53:101,12-34 46-68 88-114 134-151 199-224 255-279 285-312
A0A3B1E8T2,Uncharacterized protein,29:67,22-43 55-72 84-110 131-157
Q9XXP5,Serpentine Receptor, class H,81:297,34-54 66-89 109-130 151-171 212-239 260-281 293-314
B1GRL2,UNC93-like protein MFSD11,156:385 156:382,14-33 61-82 89-105 111-129 150-169 202-223 257-277 297-316 328-349 369-391 403-421 427-446
H2L2K4,Serpentine Receptor, class T,122:248,35-59 71-100 106-129 149-171 208-228 240-261 267-292
O61865,UNC93-like protein,390:420,21-44 64-87 94-110 116-134 155-179 207-228 262-283 295-321 333-353 373-396 408-426 432-452
O16471,Serpentine Receptor, class T,64:107 64:108 67:107,59-85 91-116 137-155 175-203 224-244 256-276
Q23073,Serpentine Receptor, class T,106:265,20-42 54-74 86-111 131-154 174-200 221-244 250-273
Q9XVU1,Lipoma HMGIC fusion partner-like 3 protein,26:172,21-44 79-103 110-133 158-182
Q965R8,G_PROTEIN_RECEP_F1_2 domain-containing protein,30:54,12-37 49-73 85-111 123-145 174-193 214-236 248-266
P91003,UNC93-like protein MFSD11,148:375,51-69 81-97 103-120 141-161 194-215 253-272 284-309 321-340 360-384 396-414 420-442
Q8MYL9,UNC93-like protein MFSD11,148:373,53-74 81-100 106-129 141-161 195-214 251-272 284-307 319-337 357-375 396-413 419-440
Q18517,G-protein coupled receptors family 1 profile domain-containing protein,96:131 97:131,46-66 87-107 119-142 163-182 233-256 277-298 318-339
Q9NAN9,Serpentine Receptor, class T,85:116 122:248,35-59 71-95 107-130 151-180 205-224 236-258 270-292
O44130,Major facilitator superfamily (MFS) profile domain-containing protein,77:151,68-95 142-161 167-191 203-225 237-256 305-326 338-355 376-398 404-420 432-453 473-492
Q9N480,Seven TM Receptor,54:102,12-35 47-69 89-115 135-152 211-234 255-280 286-310
Q9NAI0,Serpentine Receptor, class T,122:248,35-59 71-96 108-128 149-170 208-228 240-261 267-292
I2HAH7,Serpentine Receptor, class T,87:280,36-59 71-95 107-129 149-168 208-228 240-263 269-292
Q6EZH2,Gustatory receptor,111:207,108-127 202-224 245-267 273-292 344-367
O17988,Serpentine Receptor, class H,81:297,29-52 64-89 109-130 151-171 216-240 260-281 293-314
O17872,Seven TM Receptor,59:101 60:101,12-33 45-64 95-115 136-156 204-227 255-283 289-309
O45337,Seven TM Receptor,85:127 86:127,38-59 71-90 118-140 160-177 228-251 280-301 313-334
O45740,Serpentine Receptor, class H,214:258 215:258,16-39 59-77 97-119 140-163 200-226 247-274 286-307
O45972,Seven TM Receptor,59:101 60:101,15-33 45-64 90-115 136-155 202-227 255-280 286-309
A0A2K5ATW7,,26:68,12-33 60-77
O16341,G-protein coupled receptors family 1 profile domain-containing protein,82:320 82:323,45-63 75-97 109-129 135-153 165-185 224-244 265-286 306-326
Q23072,Serpentine Receptor, class T,126:285,38-60 72-94 106-131 151-170 214-235 242-264 270-293
Q9N481,Seven TM Receptor,54:102,12-35 47-69 89-115 135-152 211-234 255-280 286-310
Q9GZF9,Serpentine Receptor, class T,41:77 42:82 42:75 42:77 85:208,6-24 31-54 66-90 111-133 153-179 200-224 230-252
A0A1T5HUX0,G-protein coupled receptors family 1 profile domain-containing protein,64:90 64:93 66:90,46-72 84-111 131-152 164-190 210-234 260-280 305-329
G5EE13,SSD domain-containing protein,324:896 326:896,51-71 312-337 346-368 374-397 462-488 546-566 783-801 808-828 875-902 914-937
A0A060Q608,Serpentine Receptor, class T,47:240,31-55 67-89 109-128 168-188 200-223 229-252
P34389,,275:838,268-288 327-347 588-608 630-650 656-676 722-742 745-765 824-844 1079-1101 1106-1128 1138-1160 1172-1194 1204-1226
Q95XW0,Serpentine Receptor, class T,89:125 90:130 90:123 90:125,44-67 79-101 113-138 159-176 210-233 254-273 279-300
Q9NEU8,UNC93-like protein MFSD11,147:375 150:375,7-27 55-76 83-99 105-122 143-168 195-214 253-272 284-306 318-340 360-384 396-414 420-442
Q4PIX1,Major facilitator superfamily (MFS) profile domain-containing protein,25:201,12-35 96-114 126-144 150-172 192-210 216-235 286-304 324-348 360-379 391-414
O44542,Serpentine Receptor, class T,127:286,39-61 73-95 107-132 152-170 206-225 246-265 271-294
G5EBI8,STarGazin (Mammalian calcium channel) homolog,187:270,64-86 177-196 203-227 257-279
G5ECQ2,,244:508 245:508,237-257 269-289 313-333 355-375 399-419 450-470 498-518

0 comments on commit ed550d4

Please sign in to comment.