From a8dc918c46a2f3019dd5679e6645b2f9bd6b8f1b Mon Sep 17 00:00:00 2001
From: Sean Aubin <sean@proteinqure.com>
Date: Mon, 12 Sep 2022 08:40:51 -0400
Subject: [PATCH 1/4] parse_PDB_biounits no mutable default args

`atoms` shouldn't use a list as its default argument: a mutable default is
shared across calls and can cause subtle bugs. Also added type annotations.
---
 protein_mpnn_utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/protein_mpnn_utils.py b/protein_mpnn_utils.py
index ef7a72ea..63dde3b6 100644
--- a/protein_mpnn_utils.py
+++ b/protein_mpnn_utils.py
@@ -30,12 +30,14 @@ def _S_to_seq(S, mask):
     seq = ''.join([alphabet[c] for c, m in zip(S.tolist(), mask.tolist()) if m > 0])
     return seq
 
-def parse_PDB_biounits(x, atoms=['N','CA','C'], chain=None):
+def parse_PDB_biounits(x, atoms: list[str] | None = None, chain: str | None = None):
   '''
   input:  x = PDB filename
           atoms = atoms to extract (optional)
   output: (length, atoms, coords=(x,y,z)), sequence
   '''
+  if atoms is None:
+    atoms = ["N", "CA", "C"]
 
   alpha_1 = list("ARNDCQEGHILKMFPSTWYV-")
   states = len(alpha_1)

From b6b4b4c95f16f454af3034720c5aaf037cc4cecb Mon Sep 17 00:00:00 2001
From: Sean Aubin <sean@proteinqure.com>
Date: Mon, 12 Sep 2022 08:44:52 -0400
Subject: [PATCH 2/4] cleanup parse_PDB

- use `string.ascii_uppercase`/`ascii_lowercase` instead of manually defining the alphabet
- remove redundant single-iteration loop over `biounit_names`
- use isinstance instead of type()
- rename my_dict -> pdb_dict
- remove unused variables
- use f-string instead of str concat
- use join instead of iterative string concat
- add type annotation
---
 protein_mpnn_utils.py | 81 ++++++++++++++++++++-----------------------
 1 file changed, 38 insertions(+), 43 deletions(-)

diff --git a/protein_mpnn_utils.py b/protein_mpnn_utils.py
index 63dde3b6..f5e311a5 100644
--- a/protein_mpnn_utils.py
+++ b/protein_mpnn_utils.py
@@ -1,6 +1,8 @@
 from __future__ import print_function
 import json, time, os, sys, glob
 import shutil
+from string import ascii_lowercase, ascii_uppercase
+
 import numpy as np
 import torch
 from torch import optim
@@ -117,54 +119,47 @@ def N_to_AA(x):
   except TypeError:
       return 'no_chain', 'no_chain'
 
-def parse_PDB(path_to_pdb, input_chain_list=None, ca_only=False):
-    c=0
+def parse_PDB(path_to_pdb, input_chain_list=None, ca_only=False) -> list[dict]:
     pdb_dict_list = []
-    init_alphabet = ['A', 'B', 'C', 'D', 'E', 'F', 'G','H', 'I', 'J','K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T','U', 'V','W','X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g','h', 'i', 'j','k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't','u', 'v','w','x', 'y', 'z']
-    extra_alphabet = [str(item) for item in list(np.arange(300))]
-    chain_alphabet = init_alphabet + extra_alphabet
-     
+
     if input_chain_list:
-        chain_alphabet = input_chain_list  
- 
+        chain_alphabet = input_chain_list
+    else:
+        chain_alphabet = list(ascii_uppercase) + list(ascii_lowercase) + [str(item) for item in range(300)]
+
+    if ca_only:
+        sidechain_atoms = ["CA"]
+    else:
+        sidechain_atoms = ["N", "CA", "C", "O"]
+
+    pdb_dict = {}
+    s = 0
+    concat_seq = []
+    for letter in chain_alphabet:
+        xyz, seq = parse_PDB_biounits(path_to_pdb, atoms=sidechain_atoms, chain=letter)
+        if not isinstance(xyz, str):
+            concat_seq.append(seq[0])
+            pdb_dict[f"seq_chain_{letter}"] = seq[0]
 
-    biounit_names = [path_to_pdb]
-    for biounit in biounit_names:
-        my_dict = {}
-        s = 0
-        concat_seq = ''
-        concat_N = []
-        concat_CA = []
-        concat_C = []
-        concat_O = []
-        concat_mask = []
-        coords_dict = {}
-        for letter in chain_alphabet:
             if ca_only:
-                sidechain_atoms = ['CA']
+                coords_dict_chain = {f"CA_chain_{letter}": xyz.tolist()}
             else:
-                sidechain_atoms = ['N', 'CA', 'C', 'O']
-            xyz, seq = parse_PDB_biounits(biounit, atoms=sidechain_atoms, chain=letter)
-            if type(xyz) != str:
-                concat_seq += seq[0]
-                my_dict['seq_chain_'+letter]=seq[0]
-                coords_dict_chain = {}
-                if ca_only:
-                    coords_dict_chain['CA_chain_'+letter]=xyz.tolist()
-                else:
-                    coords_dict_chain['N_chain_' + letter] = xyz[:, 0, :].tolist()
-                    coords_dict_chain['CA_chain_' + letter] = xyz[:, 1, :].tolist()
-                    coords_dict_chain['C_chain_' + letter] = xyz[:, 2, :].tolist()
-                    coords_dict_chain['O_chain_' + letter] = xyz[:, 3, :].tolist()
-                my_dict['coords_chain_'+letter]=coords_dict_chain
-                s += 1
-        fi = biounit.rfind("/")
-        my_dict['name']=biounit[(fi+1):-4]
-        my_dict['num_of_chains'] = s
-        my_dict['seq'] = concat_seq
-        if s <= len(chain_alphabet):
-            pdb_dict_list.append(my_dict)
-            c+=1
+                coords_dict_chain = {
+                    f"N_chain_{letter}": xyz[:, 0, :].tolist(),
+                    f"CA_chain_{letter}": xyz[:, 1, :].tolist(),
+                    f"C_chain_{letter}": xyz[:, 2, :].tolist(),
+                    f"O_chain_{letter}": xyz[:, 3, :].tolist(),
+                }
+            pdb_dict[f"coords_chain_{letter}"] = coords_dict_chain
+            s += 1
+
+    fi = path_to_pdb.rfind("/")
+    pdb_dict["name"] = path_to_pdb[(fi + 1) : -4]
+    pdb_dict["num_of_chains"] = s
+    pdb_dict["seq"] = "".join(concat_seq)
+    if s <= len(chain_alphabet):
+        pdb_dict_list.append(pdb_dict)
+
     return pdb_dict_list
 
 

From e8c006127dc7abc549e2f5bd108bb3e7c52da288 Mon Sep 17 00:00:00 2001
From: Sean Aubin <sean@proteinqure.com>
Date: Mon, 12 Sep 2022 08:49:48 -0400
Subject: [PATCH 3/4] remove unused imports

---
 protein_mpnn_utils.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/protein_mpnn_utils.py b/protein_mpnn_utils.py
index f5e311a5..2d27139d 100644
--- a/protein_mpnn_utils.py
+++ b/protein_mpnn_utils.py
@@ -1,18 +1,12 @@
 from __future__ import print_function
-import json, time, os, sys, glob
-import shutil
+import json, time
 from string import ascii_lowercase, ascii_uppercase
 
 import numpy as np
 import torch
-from torch import optim
-from torch.utils.data import DataLoader
-from torch.utils.data.dataset import random_split, Subset
 
-import copy
 import torch.nn as nn
 import torch.nn.functional as F
-import random
 import itertools
 
 #A number of functions/classes are adopted from: https://github.com/jingraham/neurips19-graph-protein-design

From 504482c0096c99ff7b37f555eaa35f32898f6cab Mon Sep 17 00:00:00 2001
From: Sean Aubin <sean@proteinqure.com>
Date: Wed, 14 Sep 2022 14:18:14 -0400
Subject: [PATCH 4/4] rename s -> num_chains

---
 protein_mpnn_utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/protein_mpnn_utils.py b/protein_mpnn_utils.py
index 2d27139d..15780474 100644
--- a/protein_mpnn_utils.py
+++ b/protein_mpnn_utils.py
@@ -127,7 +127,7 @@ def parse_PDB(path_to_pdb, input_chain_list=None, ca_only=False) -> list[dict]:
         sidechain_atoms = ["N", "CA", "C", "O"]
 
     pdb_dict = {}
-    s = 0
+    num_chains = 0
     concat_seq = []
     for letter in chain_alphabet:
         xyz, seq = parse_PDB_biounits(path_to_pdb, atoms=sidechain_atoms, chain=letter)
@@ -145,13 +145,13 @@ def parse_PDB(path_to_pdb, input_chain_list=None, ca_only=False) -> list[dict]:
                     f"O_chain_{letter}": xyz[:, 3, :].tolist(),
                 }
             pdb_dict[f"coords_chain_{letter}"] = coords_dict_chain
-            s += 1
+            num_chains += 1
 
     fi = path_to_pdb.rfind("/")
     pdb_dict["name"] = path_to_pdb[(fi + 1) : -4]
-    pdb_dict["num_of_chains"] = s
+    pdb_dict["num_of_chains"] = num_chains
     pdb_dict["seq"] = "".join(concat_seq)
-    if s <= len(chain_alphabet):
+    if num_chains <= len(chain_alphabet):
         pdb_dict_list.append(pdb_dict)
 
     return pdb_dict_list