Added a modified search for C elegans transmembrane cysteines.

RBVI · Jan 20, 2024 · 685dacc · 685dacc
1 parent 3aea080
commit 685dacc
Show file tree

Hide file tree

Showing 3 changed files with 155 additions and 0 deletions.
diff --git a/alphafold_mining/5cys_4helix.png b/alphafold_mining/5cys_4helix.png
diff --git a/alphafold_mining/af_mining.md b/alphafold_mining/af_mining.md
@@ -193,4 +193,29 @@ Here is the ChimeraX Python script that does the search [cyssearch.py](cyssearch
     me = '\n'.join(f'{uniprot_id} {seq_length}' for uniprot_id, seq_length in missing)
     print(f'No alphafold model for {len(missing)} entries with transmembrane regions:\n{me}')
 
+# Modifications
+
+The script can easily be modified to find instead all C. elegans proteins that have at least 4 transmembrane helices, at least one of which contains 5 or more cysteines. The modified script can also load the matching structures with their transmembrane helices selected. The script [cys3search.py](cys3search.py) does this; the 12 structures it found are listed and shown below.
+
+    19827 UniProt entries
+    5756 entries with annotated transmembrane regions
+    4518 entries with 1 or more transmembrane cysteines
+    12 UniProt entries with at least 5 cysteines in one transmembrane helix and at least 4 transmembrane helices
+
+    # UniProt ID, protein name, number of transmembrane segments, maximum single segment cysteines
+    Q7YWW4,Conserved plasma membrane protein,4,5
+    Q22937,Serpentine receptor class gamma,7,5
+    Q93340,Candidate tumor suppressor protein,4,7
+    Q19983,,4,5
+    Q4PIV2,Claudin domain-containing protein 1,4,5
+    Q5DX32,,7,6
+    Q9GUE2,Seven TM Receptor,7,6
+    O45337,Seven TM Receptor,7,5
+    Q9GUE1,Seven TM Receptor,7,5
+    G5EFG6,Small integral membrane protein,4,8
+    Q9NEQ0,,4,5
+    Q9XXR9,SSD domain-containing protein,11,5
+
+<img src="5cys_4helix.png" width="800">
+
 Tom Goddard, January 19, 2024
diff --git a/alphafold_mining/cys3search.py b/alphafold_mining/cys3search.py
@@ -0,0 +1,130 @@
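+# Dengke Ma wants to find all C elegans proteins with pairs of close cysteines in transmembrane regions.
+# Can use UniProt to identify transmembrane residues, then use AlphaFold database predicted structures
+# to see if there are close cysteines.
+#
+# This variant of the search instead lists the proteins that have at least 4 transmembrane helices,
+# one of which contains at least 5 cysteines, and opens their AlphaFold models with the
+# transmembrane residues selected.  The close cysteine check is kept below but is not run.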
+
+def find_uniprot_transmembrane_cysteines(uniprot_xml_path, namespace = '{http://uniprot.org/uniprot}'):
+    import xml.etree.ElementTree as ET
+    tree = ET.parse(uniprot_xml_path)
+    tm = []
+    for child in tree.getroot():
+        if child.tag == namespace + 'entry':
+            rr = transmembrane_residue_ranges(child, namespace)
+            uniprot_id = child.find(namespace + 'accession').text
+            seq = child.find(namespace + 'sequence').text
+            full_name = child.find(f'./{namespace}protein/{namespace}submittedName/{namespace}fullName')
+            name = '' if full_name is None else full_name.text
+            helix_cys_count = [seq[b-1:e].count('C') for b,e in rr]
+            hcc = max(helix_cys_count, default = 0)
+            tm.append((uniprot_id, name, hcc, len(seq), rr))
+    return tm
+
+def transmembrane_residue_ranges(protein_xml_entry, namespace):
+    ranges = []
+    for feature in protein_xml_entry.iter(namespace + 'feature'):
+        fattrib = feature.attrib
+        if 'type' in fattrib and fattrib['type'] == 'transmembrane region':
+            for loc in feature.iter(namespace + 'location'):
+                b,e = loc.find(namespace + 'begin'), loc.find(namespace + 'end')
+                if b is not None and e is not None:
+                    if 'position' in b.attrib and 'position' in e.attrib:
+                        r = (int(b.attrib['position']), int(e.attrib['position']))
+                        ranges.append(r)
+    return ranges
+
+def close_cysteines(structure, membrane_residue_ranges, max_distance = 5):
+    cys_res = [r for r in structure.residues if r.name == 'CYS']
+    cys_xyz = [(r.number, r.find_atom('SG').coord) for r in cys_res]
+    mb_res_nums = residue_numbers_from_ranges(membrane_residue_ranges)
+    mb_cys = [r for r in cys_res if r.number in mb_res_nums]
+    mb_xyz = [(r.number, r.find_atom('SG').coord) for r in mb_cys]
+
+    close_pairs = set()
+    from chimerax.geometry import distance
+    for rnum, xyz in mb_xyz:
+        for rnum2, xyz2 in cys_xyz:
+            if rnum2 != rnum and distance(xyz, xyz2) <= max_distance:
+                pair = (rnum, rnum2) if rnum < rnum2 else (rnum2, rnum)
+                close_pairs.add(pair)
+
+    return list(close_pairs)
+
+def residue_numbers_from_ranges(residue_ranges):
+    res_nums = set()
+    for b,e in residue_ranges:
+        for rnum in range(b,e+1):
+            res_nums.add(rnum)
+    return res_nums
+
+def check_for_close_cysteines(session, ulist, alphafold_dir, max_distance):
+    found = []
+    missing = []
+    for uniprot_id, ncys, seq_len, tm_res_ranges in ulist:
+        if ncys == 0:
+            continue
+        m = alphafold_database_model(session, uniprot_id, alphafold_dir)
+        if m is None:
+            missing.append((uniprot_id, seq_len))
+            continue
+        close_pairs = close_cysteines(m, tm_res_ranges, max_distance)
+        if close_pairs:
+            found.append((uniprot_id, close_pairs))
+        m.delete()
+    return found, missing
+
+def alphafold_database_model(session, uniprot_id, alphafold_dir):
+    filename = f'AF-{uniprot_id}-F1-model_v4.cif'
+    from os.path import join, exists
+    path = join(alphafold_dir, filename)
+    if not exists(path):
+        return None
+    from chimerax.mmcif import open_mmcif
+    s, msg = open_mmcif(session, path)
+    return s[0]
+
+def open_entries(session, entries, alphafold_dir):
+    models = []
+    for uniprot_id, name, ncys, seq_len, tm_res_ranges in entries:
+        m = alphafold_database_model(session, uniprot_id, alphafold_dir)
+        models.append(m)
+        # Select transmembrane residues
+        rnums = residue_numbers_from_ranges(tm_res_ranges)
+        for r in m.residues:
+            if r.number in rnums:
+                r.atoms.selected = True
+    session.models.add(models)
+
+uniprot_xml_path = 'UP000001940_6239.xml'
+alphafold_dir = 'alphafold_models'
+max_distance = 5
+
+ulist = find_uniprot_transmembrane_cysteines(uniprot_xml_path)
+#uclose, missing = check_for_close_cysteines(session, ulist, alphafold_dir, max_distance)
+print(f'{len(ulist)} UniProt entries')
+ntm = len([uniprot_id for uniprot_id, name, ncys, seq_len, tm_res_ranges in ulist if tm_res_ranges])
+ntmc = len([uniprot_id for uniprot_id, name, ncys, seq_len, tm_res_ranges in ulist if ncys > 0])
+print(f'{ntm} entries with annotated transmembrane regions')
+print(f'{ntmc} entries with 1 or more transmembrane cysteines')
+min_cys = 5
+min_helix = 4
+u3 = [(uniprot_id, name, ncys, seq_len, tm_res_ranges)
+      for uniprot_id, name, ncys, seq_len, tm_res_ranges in ulist if ncys >= min_cys and len(tm_res_ranges) >= min_helix]
+print(f'{len(u3)} UniProt entries with {min_cys} cysteines in one transmembrane helix and at least {min_helix} transmembrane helices')
+#print(f'{len(uclose)} with two cysteines closer than {max_distance}A, at least 1 being transmembrane')
+
+#entries = []
+#for uniprot_id, res_pairs in uclose:
+#    rpairs = ' '.join(f'{r1},{r2}' for r1,r2 in res_pairs)
+#    entries.append(f'{uniprot_id} {rpairs}')
+entries = [f'{uniprot_id},{name},{len(tm_res_ranges)},{ncys}'
+           for uniprot_id, name, ncys, seq_len, tm_res_ranges in u3]
+
+print()
+print('# UniProt ID, protein name, number of transmembrane segments, maximum single segment cysteines)')
+print('\n'.join(entries))
+print()
+
+open_entries(session, u3, alphafold_dir)
+
+#me = '\n'.join(f'{uniprot_id} {seq_length}' for uniprot_id, seq_length in missing)
+#print(f'No alphafold model for {len(missing)} entries with transmembrane regions:\n{me}')
+