diff --git a/alphafold_mining/af_mining.md b/alphafold_mining/af_mining.md
index 2b0d54b..4c267a1 100644
--- a/alphafold_mining/af_mining.md
+++ b/alphafold_mining/af_mining.md
@@ -195,7 +195,7 @@ Here is the ChimeraX Python script that does the search [cyssearch.py](cyssearch
# Modifications
-The script can easily be modified to instead find all C elegans proteins with at least 4 transmembrane helices where one of those helices has at least 5 cysteines. And we can load those structures with the transmembrane helices selected. Here is the script [cys3search.py](cys3search.py) that does that, and the 12 structures found are listed and shown below.
+The script can easily be modified to instead find all C elegans proteins with at least 4 transmembrane helices where one of those helices has at least 5 cysteines. And we can load those structures with the transmembrane helices selected. Here is the script [cyssearch_v2.py](cyssearch_v2.py) that does that, and the 12 structures found are listed and shown below.
19827 UniProt entries
5756 entries with annotated transmembrane regions
@@ -216,6 +216,12 @@ The script can easily be modified to instead find all C elegans proteins with at
Q9NEQ0,,4,5
Q9XXR9,SSD domain-containing protein,11,5
-
+
+
+# More modifications
+
+Denke Ma suggested filtering to proteins with at least 4 transmembrane helices and looking for pairs of cysteines separated by 0, 1, or 2 residues (CC, CxC, CxxC) in each helix and requiring two helices to have pairs with distance at most 10 Angstroms from a cysteine SG atom in one helix to the other. That gave 53 proteins [results_v3.txt](results_v3.txt) using script [cyssearch_v3.py](cyssearch_v3.py).
+
+
Tom Goddard, January 19, 2024
diff --git a/alphafold_mining/cys3search.py b/alphafold_mining/cyssearch_v2.py
similarity index 100%
rename from alphafold_mining/cys3search.py
rename to alphafold_mining/cyssearch_v2.py
diff --git a/alphafold_mining/cyssearch_v3.py b/alphafold_mining/cyssearch_v3.py
new file mode 100644
index 0000000..e07ae88
--- /dev/null
+++ b/alphafold_mining/cyssearch_v3.py
@@ -0,0 +1,154 @@
+# Dengke Ma wants to find all C elegans proteins with pairs of close cysteines in transmembrane regions.
+# Can use UniProt to identify transmembrane residues, then use AlphaFold database predicted structures
+# to see if there are close cysteines.
+
+def find_uniprot_transmembrane_cysteines(uniprot_xml_path, namespace = '{http://uniprot.org/uniprot}'):
+ import xml.etree.ElementTree as ET
+ tree = ET.parse(uniprot_xml_path)
+ tm = []
+ for child in tree.getroot():
+ if child.tag == namespace + 'entry':
+ rr = transmembrane_residue_ranges(child, namespace)
+ uniprot_id = child.find(namespace + 'accession').text
+ seq = child.find(namespace + 'sequence').text
+ full_name = child.find(f'./{namespace}protein/{namespace}submittedName/{namespace}fullName')
+ name = '' if full_name is None else full_name.text
+ phc = paired_helix_cys(seq, rr)
+ tm.append((uniprot_id, name, phc, rr))
+ return tm
+
+def paired_helix_cys(seq, rr):
+ # Dengke suggests considering only paired cysteines CC, CxC or CxxC in a helix.
+ phc = []
+ for b,e in rr:
+ ci = set([i+1 for i in range(b-1,e) if seq[i] == 'C'])
+ pci = [i for i in ci
+ if ((i+1) in ci or (i+2) in ci or (i+3) in ci or
+ (i-1) in ci or (i-2) in ci or (i-3) in ci)]
+ if len(pci) >= 2:
+ phc.append(pci)
+ return phc
+
+def transmembrane_residue_ranges(protein_xml_entry, namespace):
+ ranges = []
+ for feature in protein_xml_entry.iter(namespace + 'feature'):
+ fattrib = feature.attrib
+ if 'type' in fattrib and fattrib['type'] == 'transmembrane region':
+ for loc in feature.iter(namespace + 'location'):
+ b,e = loc.find(namespace + 'begin'), loc.find(namespace + 'end')
+ if b is not None and e is not None:
+ if 'position' in b.attrib and 'position' in e.attrib:
+ r = (int(b.attrib['position']), int(e.attrib['position']))
+ ranges.append(r)
+ return ranges
+
+def close_cysteines(structure, membrane_residue_ranges, max_distance = 5):
+ cys_res = [r for r in structure.residues if r.name == 'CYS']
+ cys_xyz = [(r.number, r.find_atom('SG').coord) for r in cys_res]
+ mb_res_nums = residue_numbers_from_ranges(membrane_residue_ranges)
+ mb_cys = [r for r in cys_res if r.number in mb_res_nums]
+ mb_xyz = [(r.number, r.find_atom('SG').coord) for r in mb_cys]
+
+ close_pairs = set()
+ from chimerax.geometry import distance
+ for rnum, xyz in mb_xyz:
+ for rnum2, xyz2 in cys_xyz:
+ if rnum2 != rnum and distance(xyz, xyz2) <= max_distance:
+ pair = (rnum, rnum2) if rnum < rnum2 else (rnum2, rnum)
+ close_pairs.add(pair)
+
+ return list(close_pairs)
+
+def atoms_by_residue_number(atoms, atom_name):
+ amap = {}
+ for a in atoms:
+ if a.name == atom_name:
+ amap[a.residue.number] = a
+ return amap
+
+def residue_numbers_from_ranges(residue_ranges):
+ res_nums = set()
+ for b,e in residue_ranges:
+ for rnum in range(b,e+1):
+ res_nums.add(rnum)
+ return res_nums
+
+def check_for_close_cysteines(session, ulist, alphafold_dir, max_distance):
+ found = []
+ missing = []
+ for uniprot_id, name, paired_hel_cys, tm_res_ranges in ulist:
+ if len(paired_hel_cys) < 2:
+ continue
+ m = alphafold_database_model(session, uniprot_id, alphafold_dir)
+ if m is None:
+ missing.append((uniprot_id, name))
+ continue
+ close_pairs = []
+ atoms = atoms_by_residue_number(m.atoms, 'SG')
+ from chimerax.geometry import distance
+ for i,ph1 in enumerate(paired_hel_cys):
+ for ph2 in paired_hel_cys[i+1:]:
+ for rnum1 in ph1:
+ for rnum2 in ph2:
+ if distance(atoms[rnum1].coord, atoms[rnum2].coord) <= max_distance:
+ close_pairs.append((rnum1, rnum2))
+ if close_pairs:
+ found.append((uniprot_id, name, close_pairs, tm_res_ranges))
+ m.delete()
+ return found, missing
+
+def alphafold_database_model(session, uniprot_id, alphafold_dir):
+ filename = f'AF-{uniprot_id}-F1-model_v4.cif'
+ from os.path import join, exists
+ path = join(alphafold_dir, filename)
+ if not exists(path):
+ return None
+ from chimerax.mmcif import open_mmcif
+ s, msg = open_mmcif(session, path)
+ return s[0]
+
+def open_entries(session, entries, alphafold_dir):
+ models = []
+ for uniprot_id, name, close_pairs, tm_res_ranges in entries:
+ m = alphafold_database_model(session, uniprot_id, alphafold_dir)
+ models.append(m)
+ # Select transmembrane residues
+ rnums = residue_numbers_from_ranges(tm_res_ranges)
+ for r in m.residues:
+ if r.number in rnums:
+ r.atoms.selected = True
+ session.models.add(models)
+
+uniprot_xml_path = 'UP000001940_6239.xml'
+alphafold_dir = 'alphafold_models'
+max_distance = 10
+
+ulist = find_uniprot_transmembrane_cysteines(uniprot_xml_path)
+print(f'{len(ulist)} UniProt entries')
+ntm = len([uniprot_id for uniprot_id, name, paired_hel_cys, tm_res_ranges in ulist if tm_res_ranges])
+ntm4 = len([uniprot_id for uniprot_id, name, paired_hel_cys, tm_res_ranges in ulist if len(tm_res_ranges)>=4])
+print(f'{ntm} entries with annotated transmembrane regions')
+print(f'{ntm4} entries with 4 or more transmembrane helices')
+ntm4p = len([uniprot_id for uniprot_id, name, paired_hel_cys, tm_res_ranges in ulist
+ if len(tm_res_ranges)>=4 and len(paired_hel_cys)>=2])
+print(f'{ntm4p} entries with 4 or more transmembrane helices and at least two with CC, CxC or CxxC cysteine pairs')
+
+uclose, missing = check_for_close_cysteines(session, ulist, alphafold_dir, max_distance)
+print(f'{len(uclose)} with paired cysteines in two helices closer than {max_distance}A')
+
+entries = []
+for uniprot_id, name, close_pairs, tm_res_ranges in uclose:
+ rpairs = ' '.join(f'{r1}:{r2}' for r1,r2 in close_pairs)
+ tmranges = ' '.join(f'{r1}-{r2}' for r1,r2 in tm_res_ranges)
+ entries.append(f'{uniprot_id},{name},{rpairs},{tmranges}')
+
+print()
+print('# UniProt ID, protein name, residue numbers of close paired cysteines, transmembrane ranges')
+print('\n'.join(entries))
+print()
+
+if missing:
+ me = '\n'.join(f'{uniprot_id},{name}' for uniprot_id, name in missing)
+ print(f'No alphafold model for {len(missing)} entries with cysteine pairs:\n{me}')
+
+open_entries(session, uclose, alphafold_dir)
diff --git a/alphafold_mining/results_v3.jpg b/alphafold_mining/results_v3.jpg
new file mode 100644
index 0000000..711e8ad
Binary files /dev/null and b/alphafold_mining/results_v3.jpg differ
diff --git a/alphafold_mining/results_v3.txt b/alphafold_mining/results_v3.txt
new file mode 100644
index 0000000..1469d25
--- /dev/null
+++ b/alphafold_mining/results_v3.txt
@@ -0,0 +1,63 @@
+open cyssearch_v3.py
+# Took 8 seconds
+
+19827 UniProt entries
+5756 entries with annotated transmembrane regions
+3177 entries with 4 or more transmembrane helices
+190 entries with 4 or more transmembrane helices and at least two with CC, CxC or CxxC cysteine pairs
+53 with paired cysteines in two helices closer than 10A
+
+# UniProt ID, protein name, residue numbers of close paired cysteines, transmembrane ranges
+O17230,Seven TM Receptor,92:203,6-27 39-62 82-108 129-149 197-223 243-267 273-292
+Q9U1P9,Lysosomal cobalamin transporter,108:371,104-127 139-161 167-185 232-251 366-391 431-454
+G5EFS0,Cation/H+ exchanger domain-containing protein,227:320 227:323,25-48 68-85 92-110 122-141 153-178 184-205 217-237 257-278 299-327 347-364 376-395 407-431 443-463
+Q93340,Candidate tumor suppressor protein,8:145,7-30 42-59 80-106 118-149
+Q5F4U1,Serpentine Receptor, class Z,130:208 130:205 131:208,12-38 58-80 92-114 120-140 152-173 193-214 235-255 261-286
+Q19983,,90:274 90:275 91:274,12-35 55-79 86-108 251-276
+O16329,Seven TM Receptor,97:146 94:146 94:149,12-33 45-69 89-113 134-154 203-226 246-269 289-307
+Q960A0,,428:481,25-45 70-90 120-140 159-179 199-218 253-273 296-316 329-349 377-397 421-441 475-495 513-533
+Q9U2H5,Serpentine Receptor, class T,85:277,33-57 69-93 105-127 147-165 203-222 234-256 268-290
+Q9XUC8,Serpentine Receptor, class T,87:280,36-59 71-95 107-129 149-169 208-228 240-263 269-292
+Q19508,,192:247,9-29 42-62 91-111 129-149 185-205 239-259 269-289
+Q9U1V0,CLaudin-like in Caenorhabditis,166:233,16-36 135-157 164-190 210-235
+Q9N5L5,G-protein coupled receptors family 1 profile domain-containing protein,34:59 35:59,19-40 47-71 91-118 139-162 196-219 240-261 281-300
+A0A0K3AXT4,Serpentine Receptor, class T,90:133 90:134 93:133,51-71 83-111 117-143 163-181 211-233 253-274 280-305
+Q9N483,Seven TM Receptor,53:101,12-34 46-68 88-114 134-151 199-224 255-279 285-312
+A0A3B1E8T2,Uncharacterized protein,29:67,22-43 55-72 84-110 131-157
+Q9XXP5,Serpentine Receptor, class H,81:297,34-54 66-89 109-130 151-171 212-239 260-281 293-314
+B1GRL2,UNC93-like protein MFSD11,156:385 156:382,14-33 61-82 89-105 111-129 150-169 202-223 257-277 297-316 328-349 369-391 403-421 427-446
+H2L2K4,Serpentine Receptor, class T,122:248,35-59 71-100 106-129 149-171 208-228 240-261 267-292
+O61865,UNC93-like protein,390:420,21-44 64-87 94-110 116-134 155-179 207-228 262-283 295-321 333-353 373-396 408-426 432-452
+O16471,Serpentine Receptor, class T,64:107 64:108 67:107,59-85 91-116 137-155 175-203 224-244 256-276
+Q23073,Serpentine Receptor, class T,106:265,20-42 54-74 86-111 131-154 174-200 221-244 250-273
+Q9XVU1,Lipoma HMGIC fusion partner-like 3 protein,26:172,21-44 79-103 110-133 158-182
+Q965R8,G_PROTEIN_RECEP_F1_2 domain-containing protein,30:54,12-37 49-73 85-111 123-145 174-193 214-236 248-266
+P91003,UNC93-like protein MFSD11,148:375,51-69 81-97 103-120 141-161 194-215 253-272 284-309 321-340 360-384 396-414 420-442
+Q8MYL9,UNC93-like protein MFSD11,148:373,53-74 81-100 106-129 141-161 195-214 251-272 284-307 319-337 357-375 396-413 419-440
+Q18517,G-protein coupled receptors family 1 profile domain-containing protein,96:131 97:131,46-66 87-107 119-142 163-182 233-256 277-298 318-339
+Q9NAN9,Serpentine Receptor, class T,85:116 122:248,35-59 71-95 107-130 151-180 205-224 236-258 270-292
+O44130,Major facilitator superfamily (MFS) profile domain-containing protein,77:151,68-95 142-161 167-191 203-225 237-256 305-326 338-355 376-398 404-420 432-453 473-492
+Q9N480,Seven TM Receptor,54:102,12-35 47-69 89-115 135-152 211-234 255-280 286-310
+Q9NAI0,Serpentine Receptor, class T,122:248,35-59 71-96 108-128 149-170 208-228 240-261 267-292
+I2HAH7,Serpentine Receptor, class T,87:280,36-59 71-95 107-129 149-168 208-228 240-263 269-292
+Q6EZH2,Gustatory receptor,111:207,108-127 202-224 245-267 273-292 344-367
+O17988,Serpentine Receptor, class H,81:297,29-52 64-89 109-130 151-171 216-240 260-281 293-314
+O17872,Seven TM Receptor,59:101 60:101,12-33 45-64 95-115 136-156 204-227 255-283 289-309
+O45337,Seven TM Receptor,85:127 86:127,38-59 71-90 118-140 160-177 228-251 280-301 313-334
+O45740,Serpentine Receptor, class H,214:258 215:258,16-39 59-77 97-119 140-163 200-226 247-274 286-307
+O45972,Seven TM Receptor,59:101 60:101,15-33 45-64 90-115 136-155 202-227 255-280 286-309
+A0A2K5ATW7,,26:68,12-33 60-77
+O16341,G-protein coupled receptors family 1 profile domain-containing protein,82:320 82:323,45-63 75-97 109-129 135-153 165-185 224-244 265-286 306-326
+Q23072,Serpentine Receptor, class T,126:285,38-60 72-94 106-131 151-170 214-235 242-264 270-293
+Q9N481,Seven TM Receptor,54:102,12-35 47-69 89-115 135-152 211-234 255-280 286-310
+Q9GZF9,Serpentine Receptor, class T,41:77 42:82 42:75 42:77 85:208,6-24 31-54 66-90 111-133 153-179 200-224 230-252
+A0A1T5HUX0,G-protein coupled receptors family 1 profile domain-containing protein,64:90 64:93 66:90,46-72 84-111 131-152 164-190 210-234 260-280 305-329
+G5EE13,SSD domain-containing protein,324:896 326:896,51-71 312-337 346-368 374-397 462-488 546-566 783-801 808-828 875-902 914-937
+A0A060Q608,Serpentine Receptor, class T,47:240,31-55 67-89 109-128 168-188 200-223 229-252
+P34389,,275:838,268-288 327-347 588-608 630-650 656-676 722-742 745-765 824-844 1079-1101 1106-1128 1138-1160 1172-1194 1204-1226
+Q95XW0,Serpentine Receptor, class T,89:125 90:130 90:123 90:125,44-67 79-101 113-138 159-176 210-233 254-273 279-300
+Q9NEU8,UNC93-like protein MFSD11,147:375 150:375,7-27 55-76 83-99 105-122 143-168 195-214 253-272 284-306 318-340 360-384 396-414 420-442
+Q4PIX1,Major facilitator superfamily (MFS) profile domain-containing protein,25:201,12-35 96-114 126-144 150-172 192-210 216-235 286-304 324-348 360-379 391-414
+O44542,Serpentine Receptor, class T,127:286,39-61 73-95 107-132 152-170 206-225 246-265 271-294
+G5EBI8,STarGazin (Mammalian calcium channel) homolog,187:270,64-86 177-196 203-227 257-279
+G5ECQ2,,244:508 245:508,237-257 269-289 313-333 355-375 399-419 450-470 498-518