From 3887d40e2e578f556c75252737f7fd5d829c3eab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julio=20C=C3=A9sar?= Date: Sat, 7 Sep 2024 04:51:48 +0000 Subject: [PATCH] fix bug in parse fasta --- openfold/utils/script_utils.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/openfold/utils/script_utils.py b/openfold/utils/script_utils.py index facc103d..2b385cc2 100644 --- a/openfold/utils/script_utils.py +++ b/openfold/utils/script_utils.py @@ -116,15 +116,23 @@ def load_models_from_command_line(config, model_device, openfold_checkpoint_path def parse_fasta(data): + # Strip any ">" character that sits at the end of a line data = re.sub('>$', '', data, flags=re.M) - lines = [ - l.replace('\n', '') - for prot in data.split('>') for l in prot.strip().split('\n', 1) - ][1:] - tags, seqs = lines[::2], lines[1::2] - - tags = [re.split('\W| \|', t)[0] for t in tags] - + + # Split the data into entries based on '>' + entries = data.split('>')[1:] # Drop whatever precedes the first '>' (empty when the data starts with '>') + + tags = [] + seqs = [] + + for entry in entries: + lines = entry.split('\n', 1) + if len(lines) == 2: + tag = lines[0].strip() # Take the full tag (header) + seq = lines[1].replace('\n', '').strip() # Remove newlines and leading/trailing whitespace from the sequence + tags.append(tag) + seqs.append(seq) + return tags, seqs