-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added another alphafold mining example for cysteines in C elegans tra…
…nsmembrane proteins.
- Loading branch information
Tom Goddard
committed
Jan 23, 2024
1 parent
f5ab711
commit ed550d4
Showing
5 changed files
with
225 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,154 @@ | ||
# Dengke Ma wants to find all C elegans proteins with pairs of close cysteines in transmembrane regions. | ||
# Can use UniProt to identify transmembrane residues, then use AlphaFold database predicted structures | ||
# to see if there are close cysteines. | ||
|
||
def find_uniprot_transmembrane_cysteines(uniprot_xml_path, namespace = '{http://uniprot.org/uniprot}'): | ||
import xml.etree.ElementTree as ET | ||
tree = ET.parse(uniprot_xml_path) | ||
tm = [] | ||
for child in tree.getroot(): | ||
if child.tag == namespace + 'entry': | ||
rr = transmembrane_residue_ranges(child, namespace) | ||
uniprot_id = child.find(namespace + 'accession').text | ||
seq = child.find(namespace + 'sequence').text | ||
full_name = child.find(f'./{namespace}protein/{namespace}submittedName/{namespace}fullName') | ||
name = '' if full_name is None else full_name.text | ||
phc = paired_helix_cys(seq, rr) | ||
tm.append((uniprot_id, name, phc, rr)) | ||
return tm | ||
|
||
def paired_helix_cys(seq, rr): | ||
# Dengke suggests considering only paired cysteines CC, CxC or CxxC in a helix. | ||
phc = [] | ||
for b,e in rr: | ||
ci = set([i+1 for i in range(b-1,e) if seq[i] == 'C']) | ||
pci = [i for i in ci | ||
if ((i+1) in ci or (i+2) in ci or (i+3) in ci or | ||
(i-1) in ci or (i-2) in ci or (i-3) in ci)] | ||
if len(pci) >= 2: | ||
phc.append(pci) | ||
return phc | ||
|
||
def transmembrane_residue_ranges(protein_xml_entry, namespace): | ||
ranges = [] | ||
for feature in protein_xml_entry.iter(namespace + 'feature'): | ||
fattrib = feature.attrib | ||
if 'type' in fattrib and fattrib['type'] == 'transmembrane region': | ||
for loc in feature.iter(namespace + 'location'): | ||
b,e = loc.find(namespace + 'begin'), loc.find(namespace + 'end') | ||
if b is not None and e is not None: | ||
if 'position' in b.attrib and 'position' in e.attrib: | ||
r = (int(b.attrib['position']), int(e.attrib['position'])) | ||
ranges.append(r) | ||
return ranges | ||
|
||
def close_cysteines(structure, membrane_residue_ranges, max_distance = 5): | ||
cys_res = [r for r in structure.residues if r.name == 'CYS'] | ||
cys_xyz = [(r.number, r.find_atom('SG').coord) for r in cys_res] | ||
mb_res_nums = residue_numbers_from_ranges(membrane_residue_ranges) | ||
mb_cys = [r for r in cys_res if r.number in mb_res_nums] | ||
mb_xyz = [(r.number, r.find_atom('SG').coord) for r in mb_cys] | ||
|
||
close_pairs = set() | ||
from chimerax.geometry import distance | ||
for rnum, xyz in mb_xyz: | ||
for rnum2, xyz2 in cys_xyz: | ||
if rnum2 != rnum and distance(xyz, xyz2) <= max_distance: | ||
pair = (rnum, rnum2) if rnum < rnum2 else (rnum2, rnum) | ||
close_pairs.add(pair) | ||
|
||
return list(close_pairs) | ||
|
||
def atoms_by_residue_number(atoms, atom_name): | ||
amap = {} | ||
for a in atoms: | ||
if a.name == atom_name: | ||
amap[a.residue.number] = a | ||
return amap | ||
|
||
def residue_numbers_from_ranges(residue_ranges): | ||
res_nums = set() | ||
for b,e in residue_ranges: | ||
for rnum in range(b,e+1): | ||
res_nums.add(rnum) | ||
return res_nums | ||
|
||
def check_for_close_cysteines(session, ulist, alphafold_dir, max_distance): | ||
found = [] | ||
missing = [] | ||
for uniprot_id, name, paired_hel_cys, tm_res_ranges in ulist: | ||
if len(paired_hel_cys) < 2: | ||
continue | ||
m = alphafold_database_model(session, uniprot_id, alphafold_dir) | ||
if m is None: | ||
missing.append((uniprot_id, name)) | ||
continue | ||
close_pairs = [] | ||
atoms = atoms_by_residue_number(m.atoms, 'SG') | ||
from chimerax.geometry import distance | ||
for i,ph1 in enumerate(paired_hel_cys): | ||
for ph2 in paired_hel_cys[i+1:]: | ||
for rnum1 in ph1: | ||
for rnum2 in ph2: | ||
if distance(atoms[rnum1].coord, atoms[rnum2].coord) <= max_distance: | ||
close_pairs.append((rnum1, rnum2)) | ||
if close_pairs: | ||
found.append((uniprot_id, name, close_pairs, tm_res_ranges)) | ||
m.delete() | ||
return found, missing | ||
|
||
def alphafold_database_model(session, uniprot_id, alphafold_dir): | ||
filename = f'AF-{uniprot_id}-F1-model_v4.cif' | ||
from os.path import join, exists | ||
path = join(alphafold_dir, filename) | ||
if not exists(path): | ||
return None | ||
from chimerax.mmcif import open_mmcif | ||
s, msg = open_mmcif(session, path) | ||
return s[0] | ||
|
||
def open_entries(session, entries, alphafold_dir): | ||
models = [] | ||
for uniprot_id, name, close_pairs, tm_res_ranges in entries: | ||
m = alphafold_database_model(session, uniprot_id, alphafold_dir) | ||
models.append(m) | ||
# Select transmembrane residues | ||
rnums = residue_numbers_from_ranges(tm_res_ranges) | ||
for r in m.residues: | ||
if r.number in rnums: | ||
r.atoms.selected = True | ||
session.models.add(models) | ||
|
||
uniprot_xml_path = 'UP000001940_6239.xml' | ||
alphafold_dir = 'alphafold_models' | ||
max_distance = 10 | ||
|
||
ulist = find_uniprot_transmembrane_cysteines(uniprot_xml_path) | ||
print(f'{len(ulist)} UniProt entries') | ||
ntm = len([uniprot_id for uniprot_id, name, paired_hel_cys, tm_res_ranges in ulist if tm_res_ranges]) | ||
ntm4 = len([uniprot_id for uniprot_id, name, paired_hel_cys, tm_res_ranges in ulist if len(tm_res_ranges)>=4]) | ||
print(f'{ntm} entries with annotated transmembrane regions') | ||
print(f'{ntm4} entries with 4 or more transmembrane helices') | ||
ntm4p = len([uniprot_id for uniprot_id, name, paired_hel_cys, tm_res_ranges in ulist | ||
if len(tm_res_ranges)>=4 and len(paired_hel_cys)>=2]) | ||
print(f'{ntm4p} entries with 4 or more transmembrane helices and at least two with CC, CxC or CxxC cysteine pairs') | ||
|
||
uclose, missing = check_for_close_cysteines(session, ulist, alphafold_dir, max_distance) | ||
print(f'{len(uclose)} with paired cysteines in two helices closer than {max_distance}A') | ||
|
||
entries = [] | ||
for uniprot_id, name, close_pairs, tm_res_ranges in uclose: | ||
rpairs = ' '.join(f'{r1}:{r2}' for r1,r2 in close_pairs) | ||
tmranges = ' '.join(f'{r1}-{r2}' for r1,r2 in tm_res_ranges) | ||
entries.append(f'{uniprot_id},{name},{rpairs},{tmranges}') | ||
|
||
print() | ||
print('# UniProt ID, protein name, residue numbers of close paired cysteines, transmembrane ranges') | ||
print('\n'.join(entries)) | ||
print() | ||
|
||
if missing: | ||
me = '\n'.join(f'{uniprot_id},{name}' for uniprot_id, name in missing) | ||
print(f'No alphafold model for {len(missing)} entries with cysteine pairs:\n{me}') | ||
|
||
open_entries(session, uclose, alphafold_dir) |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
open cyssearch_v3.py | ||
# Took 8 seconds | ||
|
||
19827 UniProt entries | ||
5756 entries with annotated transmembrane regions | ||
3177 entries with 4 or more transmembrane helices | ||
190 entries with 4 or more transmembrane helices and at least two with CC, CxC or CxxC cysteine pairs | ||
53 with paired cysteines in two helices closer than 10A | ||
|
||
# UniProt ID, protein name, residue numbers of close paired cysteines, transmembrane ranges | ||
O17230,Seven TM Receptor,92:203,6-27 39-62 82-108 129-149 197-223 243-267 273-292 | ||
Q9U1P9,Lysosomal cobalamin transporter,108:371,104-127 139-161 167-185 232-251 366-391 431-454 | ||
G5EFS0,Cation/H+ exchanger domain-containing protein,227:320 227:323,25-48 68-85 92-110 122-141 153-178 184-205 217-237 257-278 299-327 347-364 376-395 407-431 443-463 | ||
Q93340,Candidate tumor suppressor protein,8:145,7-30 42-59 80-106 118-149 | ||
Q5F4U1,Serpentine Receptor, class Z,130:208 130:205 131:208,12-38 58-80 92-114 120-140 152-173 193-214 235-255 261-286 | ||
Q19983,,90:274 90:275 91:274,12-35 55-79 86-108 251-276 | ||
O16329,Seven TM Receptor,97:146 94:146 94:149,12-33 45-69 89-113 134-154 203-226 246-269 289-307 | ||
Q960A0,,428:481,25-45 70-90 120-140 159-179 199-218 253-273 296-316 329-349 377-397 421-441 475-495 513-533 | ||
Q9U2H5,Serpentine Receptor, class T,85:277,33-57 69-93 105-127 147-165 203-222 234-256 268-290 | ||
Q9XUC8,Serpentine Receptor, class T,87:280,36-59 71-95 107-129 149-169 208-228 240-263 269-292 | ||
Q19508,,192:247,9-29 42-62 91-111 129-149 185-205 239-259 269-289 | ||
Q9U1V0,CLaudin-like in Caenorhabditis,166:233,16-36 135-157 164-190 210-235 | ||
Q9N5L5,G-protein coupled receptors family 1 profile domain-containing protein,34:59 35:59,19-40 47-71 91-118 139-162 196-219 240-261 281-300 | ||
A0A0K3AXT4,Serpentine Receptor, class T,90:133 90:134 93:133,51-71 83-111 117-143 163-181 211-233 253-274 280-305 | ||
Q9N483,Seven TM Receptor,53:101,12-34 46-68 88-114 134-151 199-224 255-279 285-312 | ||
A0A3B1E8T2,Uncharacterized protein,29:67,22-43 55-72 84-110 131-157 | ||
Q9XXP5,Serpentine Receptor, class H,81:297,34-54 66-89 109-130 151-171 212-239 260-281 293-314 | ||
B1GRL2,UNC93-like protein MFSD11,156:385 156:382,14-33 61-82 89-105 111-129 150-169 202-223 257-277 297-316 328-349 369-391 403-421 427-446 | ||
H2L2K4,Serpentine Receptor, class T,122:248,35-59 71-100 106-129 149-171 208-228 240-261 267-292 | ||
O61865,UNC93-like protein,390:420,21-44 64-87 94-110 116-134 155-179 207-228 262-283 295-321 333-353 373-396 408-426 432-452 | ||
O16471,Serpentine Receptor, class T,64:107 64:108 67:107,59-85 91-116 137-155 175-203 224-244 256-276 | ||
Q23073,Serpentine Receptor, class T,106:265,20-42 54-74 86-111 131-154 174-200 221-244 250-273 | ||
Q9XVU1,Lipoma HMGIC fusion partner-like 3 protein,26:172,21-44 79-103 110-133 158-182 | ||
Q965R8,G_PROTEIN_RECEP_F1_2 domain-containing protein,30:54,12-37 49-73 85-111 123-145 174-193 214-236 248-266 | ||
P91003,UNC93-like protein MFSD11,148:375,51-69 81-97 103-120 141-161 194-215 253-272 284-309 321-340 360-384 396-414 420-442 | ||
Q8MYL9,UNC93-like protein MFSD11,148:373,53-74 81-100 106-129 141-161 195-214 251-272 284-307 319-337 357-375 396-413 419-440 | ||
Q18517,G-protein coupled receptors family 1 profile domain-containing protein,96:131 97:131,46-66 87-107 119-142 163-182 233-256 277-298 318-339 | ||
Q9NAN9,Serpentine Receptor, class T,85:116 122:248,35-59 71-95 107-130 151-180 205-224 236-258 270-292 | ||
O44130,Major facilitator superfamily (MFS) profile domain-containing protein,77:151,68-95 142-161 167-191 203-225 237-256 305-326 338-355 376-398 404-420 432-453 473-492 | ||
Q9N480,Seven TM Receptor,54:102,12-35 47-69 89-115 135-152 211-234 255-280 286-310 | ||
Q9NAI0,Serpentine Receptor, class T,122:248,35-59 71-96 108-128 149-170 208-228 240-261 267-292 | ||
I2HAH7,Serpentine Receptor, class T,87:280,36-59 71-95 107-129 149-168 208-228 240-263 269-292 | ||
Q6EZH2,Gustatory receptor,111:207,108-127 202-224 245-267 273-292 344-367 | ||
O17988,Serpentine Receptor, class H,81:297,29-52 64-89 109-130 151-171 216-240 260-281 293-314 | ||
O17872,Seven TM Receptor,59:101 60:101,12-33 45-64 95-115 136-156 204-227 255-283 289-309 | ||
O45337,Seven TM Receptor,85:127 86:127,38-59 71-90 118-140 160-177 228-251 280-301 313-334 | ||
O45740,Serpentine Receptor, class H,214:258 215:258,16-39 59-77 97-119 140-163 200-226 247-274 286-307 | ||
O45972,Seven TM Receptor,59:101 60:101,15-33 45-64 90-115 136-155 202-227 255-280 286-309 | ||
A0A2K5ATW7,,26:68,12-33 60-77 | ||
O16341,G-protein coupled receptors family 1 profile domain-containing protein,82:320 82:323,45-63 75-97 109-129 135-153 165-185 224-244 265-286 306-326 | ||
Q23072,Serpentine Receptor, class T,126:285,38-60 72-94 106-131 151-170 214-235 242-264 270-293 | ||
Q9N481,Seven TM Receptor,54:102,12-35 47-69 89-115 135-152 211-234 255-280 286-310 | ||
Q9GZF9,Serpentine Receptor, class T,41:77 42:82 42:75 42:77 85:208,6-24 31-54 66-90 111-133 153-179 200-224 230-252 | ||
A0A1T5HUX0,G-protein coupled receptors family 1 profile domain-containing protein,64:90 64:93 66:90,46-72 84-111 131-152 164-190 210-234 260-280 305-329 | ||
G5EE13,SSD domain-containing protein,324:896 326:896,51-71 312-337 346-368 374-397 462-488 546-566 783-801 808-828 875-902 914-937 | ||
A0A060Q608,Serpentine Receptor, class T,47:240,31-55 67-89 109-128 168-188 200-223 229-252 | ||
P34389,,275:838,268-288 327-347 588-608 630-650 656-676 722-742 745-765 824-844 1079-1101 1106-1128 1138-1160 1172-1194 1204-1226 | ||
Q95XW0,Serpentine Receptor, class T,89:125 90:130 90:123 90:125,44-67 79-101 113-138 159-176 210-233 254-273 279-300 | ||
Q9NEU8,UNC93-like protein MFSD11,147:375 150:375,7-27 55-76 83-99 105-122 143-168 195-214 253-272 284-306 318-340 360-384 396-414 420-442 | ||
Q4PIX1,Major facilitator superfamily (MFS) profile domain-containing protein,25:201,12-35 96-114 126-144 150-172 192-210 216-235 286-304 324-348 360-379 391-414 | ||
O44542,Serpentine Receptor, class T,127:286,39-61 73-95 107-132 152-170 206-225 246-265 271-294 | ||
G5EBI8,STarGazin (Mammalian calcium channel) homolog,187:270,64-86 177-196 203-227 257-279 | ||
G5ECQ2,,244:508 245:508,237-257 269-289 313-333 355-375 399-419 450-470 498-518 |