diff --git a/alphafold_mining/af_mining.md b/alphafold_mining/af_mining.md index 2b0d54b..4c267a1 100644 --- a/alphafold_mining/af_mining.md +++ b/alphafold_mining/af_mining.md @@ -195,7 +195,7 @@ Here is the ChimeraX Python script that does the search [cyssearch.py](cyssearch # Modifications -The script can easily be modified to instead find all C elegans proteins with at least 4 transmembrane helices where one of those helices has at least 5 cysteines. And we can load those structures with the transmembrane helices selected. Here is the script [cys3search.py](cys3search.py) that does that, and the 12 structures found are listed and shown below. +The script can easily be modified to instead find all C elegans proteins with at least 4 transmembrane helices where one of those helices has at least 5 cysteines. And we can load those structures with the transmembrane helices selected. Here is the script [cyssearch_v2.py](cyssearch_v2.py) that does that, and the 12 structures found are listed and shown below. 19827 UniProt entries 5756 entries with annotated transmembrane regions @@ -216,6 +216,12 @@ The script can easily be modified to instead find all C elegans proteins with at Q9NEQ0,,4,5 Q9XXR9,SSD domain-containing protein,11,5 - + + +# More modifications + +Denke Ma suggested filtering to proteins with at least 4 transmembrane helices and looking for pairs of cysteines separated by 0, 1, or 2 residues (CC, CxC, CxxC) in each helix and requiring two helices to have pairs with distance at most 10 Angstroms from a cysteine SG atom in one helix to the other. That gave 53 proteins [results_v3.txt](results_v3.txt) using script [cyssearch_v3.py](cyssearch_v3.py). + + Tom Goddard, January 19, 2024 diff --git a/alphafold_mining/cys3search.py b/alphafold_mining/cyssearch_v2.py similarity index 100% rename from alphafold_mining/cys3search.py rename to alphafold_mining/cyssearch_v2.py diff --git a/alphafold_mining/cyssearch_v3.py b/alphafold_mining/cyssearch_v3.py new file mode 100644 index 0000000..e07ae88 --- /dev/null +++ b/alphafold_mining/cyssearch_v3.py @@ -0,0 +1,154 @@ +# Dengke Ma wants to find all C elegans proteins with pairs of close cysteines in transmembrane regions. +# Can use UniProt to identify transmembrane residues, then use AlphaFold database predicted structures +# to see if there are close cysteines. + +def find_uniprot_transmembrane_cysteines(uniprot_xml_path, namespace = '{http://uniprot.org/uniprot}'): + import xml.etree.ElementTree as ET + tree = ET.parse(uniprot_xml_path) + tm = [] + for child in tree.getroot(): + if child.tag == namespace + 'entry': + rr = transmembrane_residue_ranges(child, namespace) + uniprot_id = child.find(namespace + 'accession').text + seq = child.find(namespace + 'sequence').text + full_name = child.find(f'./{namespace}protein/{namespace}submittedName/{namespace}fullName') + name = '' if full_name is None else full_name.text + phc = paired_helix_cys(seq, rr) + tm.append((uniprot_id, name, phc, rr)) + return tm + +def paired_helix_cys(seq, rr): + # Dengke suggests considering only paired cysteines CC, CxC or CxxC in a helix. + phc = [] + for b,e in rr: + ci = set([i+1 for i in range(b-1,e) if seq[i] == 'C']) + pci = [i for i in ci + if ((i+1) in ci or (i+2) in ci or (i+3) in ci or + (i-1) in ci or (i-2) in ci or (i-3) in ci)] + if len(pci) >= 2: + phc.append(pci) + return phc + +def transmembrane_residue_ranges(protein_xml_entry, namespace): + ranges = [] + for feature in protein_xml_entry.iter(namespace + 'feature'): + fattrib = feature.attrib + if 'type' in fattrib and fattrib['type'] == 'transmembrane region': + for loc in feature.iter(namespace + 'location'): + b,e = loc.find(namespace + 'begin'), loc.find(namespace + 'end') + if b is not None and e is not None: + if 'position' in b.attrib and 'position' in e.attrib: + r = (int(b.attrib['position']), int(e.attrib['position'])) + ranges.append(r) + return ranges + +def close_cysteines(structure, membrane_residue_ranges, max_distance = 5): + cys_res = [r for r in structure.residues if r.name == 'CYS'] + cys_xyz = [(r.number, r.find_atom('SG').coord) for r in cys_res] + mb_res_nums = residue_numbers_from_ranges(membrane_residue_ranges) + mb_cys = [r for r in cys_res if r.number in mb_res_nums] + mb_xyz = [(r.number, r.find_atom('SG').coord) for r in mb_cys] + + close_pairs = set() + from chimerax.geometry import distance + for rnum, xyz in mb_xyz: + for rnum2, xyz2 in cys_xyz: + if rnum2 != rnum and distance(xyz, xyz2) <= max_distance: + pair = (rnum, rnum2) if rnum < rnum2 else (rnum2, rnum) + close_pairs.add(pair) + + return list(close_pairs) + +def atoms_by_residue_number(atoms, atom_name): + amap = {} + for a in atoms: + if a.name == atom_name: + amap[a.residue.number] = a + return amap + +def residue_numbers_from_ranges(residue_ranges): + res_nums = set() + for b,e in residue_ranges: + for rnum in range(b,e+1): + res_nums.add(rnum) + return res_nums + +def check_for_close_cysteines(session, ulist, alphafold_dir, max_distance): + found = [] + missing = [] + for uniprot_id, name, paired_hel_cys, tm_res_ranges in ulist: + if len(paired_hel_cys) < 2: + continue + m = alphafold_database_model(session, uniprot_id, alphafold_dir) + if m is None: + missing.append((uniprot_id, name)) + continue + close_pairs = [] + atoms = atoms_by_residue_number(m.atoms, 'SG') + from chimerax.geometry import distance + for i,ph1 in enumerate(paired_hel_cys): + for ph2 in paired_hel_cys[i+1:]: + for rnum1 in ph1: + for rnum2 in ph2: + if distance(atoms[rnum1].coord, atoms[rnum2].coord) <= max_distance: + close_pairs.append((rnum1, rnum2)) + if close_pairs: + found.append((uniprot_id, name, close_pairs, tm_res_ranges)) + m.delete() + return found, missing + +def alphafold_database_model(session, uniprot_id, alphafold_dir): + filename = f'AF-{uniprot_id}-F1-model_v4.cif' + from os.path import join, exists + path = join(alphafold_dir, filename) + if not exists(path): + return None + from chimerax.mmcif import open_mmcif + s, msg = open_mmcif(session, path) + return s[0] + +def open_entries(session, entries, alphafold_dir): + models = [] + for uniprot_id, name, close_pairs, tm_res_ranges in entries: + m = alphafold_database_model(session, uniprot_id, alphafold_dir) + models.append(m) + # Select transmembrane residues + rnums = residue_numbers_from_ranges(tm_res_ranges) + for r in m.residues: + if r.number in rnums: + r.atoms.selected = True + session.models.add(models) + +uniprot_xml_path = 'UP000001940_6239.xml' +alphafold_dir = 'alphafold_models' +max_distance = 10 + +ulist = find_uniprot_transmembrane_cysteines(uniprot_xml_path) +print(f'{len(ulist)} UniProt entries') +ntm = len([uniprot_id for uniprot_id, name, paired_hel_cys, tm_res_ranges in ulist if tm_res_ranges]) +ntm4 = len([uniprot_id for uniprot_id, name, paired_hel_cys, tm_res_ranges in ulist if len(tm_res_ranges)>=4]) +print(f'{ntm} entries with annotated transmembrane regions') +print(f'{ntm4} entries with 4 or more transmembrane helices') +ntm4p = len([uniprot_id for uniprot_id, name, paired_hel_cys, tm_res_ranges in ulist + if len(tm_res_ranges)>=4 and len(paired_hel_cys)>=2]) +print(f'{ntm4p} entries with 4 or more transmembrane helices and at least two with CC, CxC or CxxC cysteine pairs') + +uclose, missing = check_for_close_cysteines(session, ulist, alphafold_dir, max_distance) +print(f'{len(uclose)} with paired cysteines in two helices closer than {max_distance}A') + +entries = [] +for uniprot_id, name, close_pairs, tm_res_ranges in uclose: + rpairs = ' '.join(f'{r1}:{r2}' for r1,r2 in close_pairs) + tmranges = ' '.join(f'{r1}-{r2}' for r1,r2 in tm_res_ranges) + entries.append(f'{uniprot_id},{name},{rpairs},{tmranges}') + +print() +print('# UniProt ID, protein name, residue numbers of close paired cysteines, transmembrane ranges') +print('\n'.join(entries)) +print() + +if missing: + me = '\n'.join(f'{uniprot_id},{name}' for uniprot_id, name in missing) + print(f'No alphafold model for {len(missing)} entries with cysteine pairs:\n{me}') + +open_entries(session, uclose, alphafold_dir) diff --git a/alphafold_mining/results_v3.jpg b/alphafold_mining/results_v3.jpg new file mode 100644 index 0000000..711e8ad Binary files /dev/null and b/alphafold_mining/results_v3.jpg differ diff --git a/alphafold_mining/results_v3.txt b/alphafold_mining/results_v3.txt new file mode 100644 index 0000000..1469d25 --- /dev/null +++ b/alphafold_mining/results_v3.txt @@ -0,0 +1,63 @@ +open cyssearch_v3.py +# Took 8 seconds + +19827 UniProt entries +5756 entries with annotated transmembrane regions +3177 entries with 4 or more transmembrane helices +190 entries with 4 or more transmembrane helices and at least two with CC, CxC or CxxC cysteine pairs +53 with paired cysteines in two helices closer than 10A + +# UniProt ID, protein name, residue numbers of close paired cysteines, transmembrane ranges +O17230,Seven TM Receptor,92:203,6-27 39-62 82-108 129-149 197-223 243-267 273-292 +Q9U1P9,Lysosomal cobalamin transporter,108:371,104-127 139-161 167-185 232-251 366-391 431-454 +G5EFS0,Cation/H+ exchanger domain-containing protein,227:320 227:323,25-48 68-85 92-110 122-141 153-178 184-205 217-237 257-278 299-327 347-364 376-395 407-431 443-463 +Q93340,Candidate tumor suppressor protein,8:145,7-30 42-59 80-106 118-149 +Q5F4U1,Serpentine Receptor, class Z,130:208 130:205 131:208,12-38 58-80 92-114 120-140 152-173 193-214 235-255 261-286 +Q19983,,90:274 90:275 91:274,12-35 55-79 86-108 251-276 +O16329,Seven TM Receptor,97:146 94:146 94:149,12-33 45-69 89-113 134-154 203-226 246-269 289-307 +Q960A0,,428:481,25-45 70-90 120-140 159-179 199-218 253-273 296-316 329-349 377-397 421-441 475-495 513-533 +Q9U2H5,Serpentine Receptor, class T,85:277,33-57 69-93 105-127 147-165 203-222 234-256 268-290 +Q9XUC8,Serpentine Receptor, class T,87:280,36-59 71-95 107-129 149-169 208-228 240-263 269-292 +Q19508,,192:247,9-29 42-62 91-111 129-149 185-205 239-259 269-289 +Q9U1V0,CLaudin-like in Caenorhabditis,166:233,16-36 135-157 164-190 210-235 +Q9N5L5,G-protein coupled receptors family 1 profile domain-containing protein,34:59 35:59,19-40 47-71 91-118 139-162 196-219 240-261 281-300 +A0A0K3AXT4,Serpentine Receptor, class T,90:133 90:134 93:133,51-71 83-111 117-143 163-181 211-233 253-274 280-305 +Q9N483,Seven TM Receptor,53:101,12-34 46-68 88-114 134-151 199-224 255-279 285-312 +A0A3B1E8T2,Uncharacterized protein,29:67,22-43 55-72 84-110 131-157 +Q9XXP5,Serpentine Receptor, class H,81:297,34-54 66-89 109-130 151-171 212-239 260-281 293-314 +B1GRL2,UNC93-like protein MFSD11,156:385 156:382,14-33 61-82 89-105 111-129 150-169 202-223 257-277 297-316 328-349 369-391 403-421 427-446 +H2L2K4,Serpentine Receptor, class T,122:248,35-59 71-100 106-129 149-171 208-228 240-261 267-292 +O61865,UNC93-like protein,390:420,21-44 64-87 94-110 116-134 155-179 207-228 262-283 295-321 333-353 373-396 408-426 432-452 +O16471,Serpentine Receptor, class T,64:107 64:108 67:107,59-85 91-116 137-155 175-203 224-244 256-276 +Q23073,Serpentine Receptor, class T,106:265,20-42 54-74 86-111 131-154 174-200 221-244 250-273 +Q9XVU1,Lipoma HMGIC fusion partner-like 3 protein,26:172,21-44 79-103 110-133 158-182 +Q965R8,G_PROTEIN_RECEP_F1_2 domain-containing protein,30:54,12-37 49-73 85-111 123-145 174-193 214-236 248-266 +P91003,UNC93-like protein MFSD11,148:375,51-69 81-97 103-120 141-161 194-215 253-272 284-309 321-340 360-384 396-414 420-442 +Q8MYL9,UNC93-like protein MFSD11,148:373,53-74 81-100 106-129 141-161 195-214 251-272 284-307 319-337 357-375 396-413 419-440 +Q18517,G-protein coupled receptors family 1 profile domain-containing protein,96:131 97:131,46-66 87-107 119-142 163-182 233-256 277-298 318-339 +Q9NAN9,Serpentine Receptor, class T,85:116 122:248,35-59 71-95 107-130 151-180 205-224 236-258 270-292 +O44130,Major facilitator superfamily (MFS) profile domain-containing protein,77:151,68-95 142-161 167-191 203-225 237-256 305-326 338-355 376-398 404-420 432-453 473-492 +Q9N480,Seven TM Receptor,54:102,12-35 47-69 89-115 135-152 211-234 255-280 286-310 +Q9NAI0,Serpentine Receptor, class T,122:248,35-59 71-96 108-128 149-170 208-228 240-261 267-292 +I2HAH7,Serpentine Receptor, class T,87:280,36-59 71-95 107-129 149-168 208-228 240-263 269-292 +Q6EZH2,Gustatory receptor,111:207,108-127 202-224 245-267 273-292 344-367 +O17988,Serpentine Receptor, class H,81:297,29-52 64-89 109-130 151-171 216-240 260-281 293-314 +O17872,Seven TM Receptor,59:101 60:101,12-33 45-64 95-115 136-156 204-227 255-283 289-309 +O45337,Seven TM Receptor,85:127 86:127,38-59 71-90 118-140 160-177 228-251 280-301 313-334 +O45740,Serpentine Receptor, class H,214:258 215:258,16-39 59-77 97-119 140-163 200-226 247-274 286-307 +O45972,Seven TM Receptor,59:101 60:101,15-33 45-64 90-115 136-155 202-227 255-280 286-309 +A0A2K5ATW7,,26:68,12-33 60-77 +O16341,G-protein coupled receptors family 1 profile domain-containing protein,82:320 82:323,45-63 75-97 109-129 135-153 165-185 224-244 265-286 306-326 +Q23072,Serpentine Receptor, class T,126:285,38-60 72-94 106-131 151-170 214-235 242-264 270-293 +Q9N481,Seven TM Receptor,54:102,12-35 47-69 89-115 135-152 211-234 255-280 286-310 +Q9GZF9,Serpentine Receptor, class T,41:77 42:82 42:75 42:77 85:208,6-24 31-54 66-90 111-133 153-179 200-224 230-252 +A0A1T5HUX0,G-protein coupled receptors family 1 profile domain-containing protein,64:90 64:93 66:90,46-72 84-111 131-152 164-190 210-234 260-280 305-329 +G5EE13,SSD domain-containing protein,324:896 326:896,51-71 312-337 346-368 374-397 462-488 546-566 783-801 808-828 875-902 914-937 +A0A060Q608,Serpentine Receptor, class T,47:240,31-55 67-89 109-128 168-188 200-223 229-252 +P34389,,275:838,268-288 327-347 588-608 630-650 656-676 722-742 745-765 824-844 1079-1101 1106-1128 1138-1160 1172-1194 1204-1226 +Q95XW0,Serpentine Receptor, class T,89:125 90:130 90:123 90:125,44-67 79-101 113-138 159-176 210-233 254-273 279-300 +Q9NEU8,UNC93-like protein MFSD11,147:375 150:375,7-27 55-76 83-99 105-122 143-168 195-214 253-272 284-306 318-340 360-384 396-414 420-442 +Q4PIX1,Major facilitator superfamily (MFS) profile domain-containing protein,25:201,12-35 96-114 126-144 150-172 192-210 216-235 286-304 324-348 360-379 391-414 +O44542,Serpentine Receptor, class T,127:286,39-61 73-95 107-132 152-170 206-225 246-265 271-294 +G5EBI8,STarGazin (Mammalian calcium channel) homolog,187:270,64-86 177-196 203-227 257-279 +G5ECQ2,,244:508 245:508,237-257 269-289 313-333 355-375 399-419 450-470 498-518