Skip to content

Commit

Permalink
Changed number of retries for each allele when fetching UniProt annotations (from 5 to 2). Fixed the count of inframe stop codons in SchemaEvaluator.
Browse files Browse the repository at this point in the history
  • Loading branch information
rfm-targa committed Jan 24, 2024
1 parent ea35ab1 commit 63a007b
Show file tree
Hide file tree
Showing 6 changed files with 33 additions and 27 deletions.
5 changes: 2 additions & 3 deletions CHEWBBACA/CHEWBBACA_NS/upload_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,7 @@ def create_uniprot_queries(file, max_queries=ct.MAX_QUERIES):
return queries_file


def get_annotation(queries_file, max_queries=ct.MAX_QUERIES):
def get_annotation(queries_file, max_retries=ct.MAX_RETRIES):
"""Query the UniProt SPARQL endpoint to retrieve protein annotations.
This function uses the SPARQL queries imported from
Expand Down Expand Up @@ -380,7 +380,6 @@ def get_annotation(queries_file, max_queries=ct.MAX_QUERIES):

tries = 0
# define maximum number of tries
max_tries = max_queries
found = False
while found is False:

Expand Down Expand Up @@ -409,7 +408,7 @@ def get_annotation(queries_file, max_queries=ct.MAX_QUERIES):
pass

tries += 1
if tries == max_tries or tries == len(queries):
if tries == max_retries or tries == len(queries):
found = True

annotation_info = [locus, prev_name, prev_label, prev_url]
Expand Down
4 changes: 3 additions & 1 deletion CHEWBBACA/UniprotFinder/annotate_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,9 @@ def main(schema_directory, output_directory, genes_list, protein_table,
no_cleanup, blast_path):

# Create output directory
fo.create_directory(output_directory)
created = fo.create_directory(output_directory)
if created is False:
sys.exit(ct.OUTPUT_DIRECTORY_EXISTS)

# Create temp directory
temp_directory = fo.join_paths(output_directory, ['temp'])
Expand Down
2 changes: 1 addition & 1 deletion CHEWBBACA/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@

__version__ = "3.3.2"
__version__ = "3.3.3"
2 changes: 1 addition & 1 deletion CHEWBBACA/docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
author = 'Rafael Mamede'

# The full version, including alpha/beta/rc tags
release = '3.3.2'
release = '3.3.3'

# -- General configuration ---------------------------------------------------

Expand Down
7 changes: 5 additions & 2 deletions CHEWBBACA/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,10 @@
# UniProt SPARQL endpoint
UNIPROT_SPARQL = 'https://sparql.uniprot.org/sparql'
UNIPROT_SPARQL_THREADS = 4
MAX_QUERIES = 10
# Maximum number of retries if querying the SPARQL endpoint fails
MAX_RETRIES = 2
# Maximum number of sequences used to query the SPARQL endpoint
MAX_QUERIES = 20

# FTP to get UniProt's reference proteomes
UNIPROT_PROTEOMES_FTP = ('https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/')
Expand Down Expand Up @@ -356,7 +359,7 @@
INVALID_ALLELES_COLUMNS = ['Allele ID', 'Exception Category',
'Exception Description']

TRANSLATION_EXCEPTIONS = ['Extra in frame stop codon found',
TRANSLATION_EXCEPTIONS = ['Extra in frame stop codon',
'is not a start codon',
'is not a stop codon',
'sequence length is not a multiple of 3',
Expand Down
40 changes: 21 additions & 19 deletions CHEWBBACA/utils/uniprot_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def get_data(sparql_query):
"""
tries = 0
failed = []
max_tries = 5
max_tries = ct.MAX_RETRIES
success = False
while success is False and tries < max_tries:
try:
Expand Down Expand Up @@ -258,31 +258,33 @@ def get_annotation(gene, translation_table):
"""
selected_url = ''
selected_name = ''
# import locus alleles
# Import locus alleles
sequences = fao.import_sequences(gene)
queried = []
for seqid, sequence in sequences.items():
# translate allele
# Translate allele
protein_sequence = str(sm.translate_sequence(sequence,
table_id=translation_table))

query = uniprot_query(protein_sequence)
result, failed = get_data(query)

if len(result) > 0:
name, url, label = select_name(result)

lowercase_name = name.lower()
if any([term in lowercase_name for term in ct.UNIPROT_UNINFORMATIVE]) is True:
if selected_name == '':
if protein_sequence not in queried:
query = uniprot_query(protein_sequence)
result, failed = get_data(query)
queried.append(protein_sequence)
if len(result) > 0:
name, url, label = select_name(result)

lowercase_name = name.lower()
if any([term in lowercase_name for term in ct.UNIPROT_UNINFORMATIVE]) is True:
if selected_name == '':
selected_name = name
selected_url = url
continue
elif name == '':
continue
else:
selected_name = name
selected_url = url
continue
elif name == '':
continue
else:
selected_name = name
selected_url = url
break
break

return [gene, selected_name, selected_url, failed]

Expand Down

0 comments on commit 63a007b

Please sign in to comment.