Changed number of retries for each allele when fetching UniProt annotations (from 5 to 2). Fixed the count of in-frame stop codons in SchemaEvaluator.
B-UMMI · Jan 24, 2024 · 63a007b · 63a007b
1 parent ea35ab1
commit 63a007b
Show file tree

Hide file tree

Showing 6 changed files with 33 additions and 27 deletions.
diff --git a/CHEWBBACA/CHEWBBACA_NS/upload_schema.py b/CHEWBBACA/CHEWBBACA_NS/upload_schema.py
@@ -336,7 +336,7 @@ def create_uniprot_queries(file, max_queries=ct.MAX_QUERIES):
     return queries_file
 
 
-def get_annotation(queries_file, max_queries=ct.MAX_QUERIES):
+def get_annotation(queries_file, max_retries=ct.MAX_RETRIES):
     """Query the UniProt SPARQL endpoint to retrieve protein annotations.
 
         This function uses the SPARQL queries imported from
@@ -380,7 +380,6 @@ def get_annotation(queries_file, max_queries=ct.MAX_QUERIES):
 
     tries = 0
     # define maximum number of tries
-    max_tries = max_queries
     found = False
     while found is False:
 
@@ -409,7 +408,7 @@ def get_annotation(queries_file, max_queries=ct.MAX_QUERIES):
             pass
 
         tries += 1
-        if tries == max_tries or tries == len(queries):
+        if tries == max_retries or tries == len(queries):
             found = True
 
     annotation_info = [locus, prev_name, prev_label, prev_url]

diff --git a/CHEWBBACA/UniprotFinder/annotate_schema.py b/CHEWBBACA/UniprotFinder/annotate_schema.py
@@ -335,7 +335,9 @@ def main(schema_directory, output_directory, genes_list, protein_table,
          no_cleanup, blast_path):
 
     # Create output directory
-    fo.create_directory(output_directory)
+    created = fo.create_directory(output_directory)
+    if created is False:
+        sys.exit(ct.OUTPUT_DIRECTORY_EXISTS)
 
     # Create temp directory
     temp_directory = fo.join_paths(output_directory, ['temp'])

diff --git a/CHEWBBACA/__init__.py b/CHEWBBACA/__init__.py
@@ -1,2 +1,2 @@
 
-__version__ = "3.3.2"
+__version__ = "3.3.3"
diff --git a/CHEWBBACA/docs/conf.py b/CHEWBBACA/docs/conf.py
@@ -22,7 +22,7 @@
 author = 'Rafael Mamede'
 
 # The full version, including alpha/beta/rc tags
-release = '3.3.2'
+release = '3.3.3'
 
 # -- General configuration ---------------------------------------------------
 

diff --git a/CHEWBBACA/utils/constants.py b/CHEWBBACA/utils/constants.py
@@ -150,7 +150,10 @@
 # UniProt SPARQL endpoint
 UNIPROT_SPARQL = 'https://sparql.uniprot.org/sparql'
 UNIPROT_SPARQL_THREADS = 4
-MAX_QUERIES = 10
+# Maximum number of retries if querying the SPARQL endpoint fails
+MAX_RETRIES = 2
+# Maximum number of sequences used to query the SPARQL endpoint
+MAX_QUERIES = 20
 
 # FTP to get UniProt's reference proteomes
 UNIPROT_PROTEOMES_FTP = ('https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/')
@@ -356,7 +359,7 @@
 INVALID_ALLELES_COLUMNS = ['Allele ID', 'Exception Category',
                            'Exception Description']
 
-TRANSLATION_EXCEPTIONS = ['Extra in frame stop codon found',
+TRANSLATION_EXCEPTIONS = ['Extra in frame stop codon',
                           'is not a start codon',
                           'is not a stop codon',
                           'sequence length is not a multiple of 3',

diff --git a/CHEWBBACA/utils/uniprot_requests.py b/CHEWBBACA/utils/uniprot_requests.py
@@ -176,7 +176,7 @@ def get_data(sparql_query):
     """
     tries = 0
     failed = []
-    max_tries = 5
+    max_tries = ct.MAX_RETRIES
     success = False
     while success is False and tries < max_tries:
         try:
@@ -258,31 +258,33 @@ def get_annotation(gene, translation_table):
     """
     selected_url = ''
     selected_name = ''
-    # import locus alleles
+    # Import locus alleles
     sequences = fao.import_sequences(gene)
+    queried = []
     for seqid, sequence in sequences.items():
-        # translate allele
+        # Translate allele
         protein_sequence = str(sm.translate_sequence(sequence,
                                                      table_id=translation_table))
 
-        query = uniprot_query(protein_sequence)
-        result, failed = get_data(query)
-
-        if len(result) > 0:
-            name, url, label = select_name(result)
-
-            lowercase_name = name.lower()
-            if any([term in lowercase_name for term in ct.UNIPROT_UNINFORMATIVE]) is True:
-                if selected_name == '':
+        if protein_sequence not in queried:
+            query = uniprot_query(protein_sequence)
+            result, failed = get_data(query)
+            queried.append(protein_sequence)
+            if len(result) > 0:
+                name, url, label = select_name(result)
+
+                lowercase_name = name.lower()
+                if any([term in lowercase_name for term in ct.UNIPROT_UNINFORMATIVE]) is True:
+                    if selected_name == '':
+                        selected_name = name
+                        selected_url = url
+                    continue
+                elif name == '':
+                    continue
+                else:
                     selected_name = name
                     selected_url = url
-                continue
-            elif name == '':
-                continue
-            else:
-                selected_name = name
-                selected_url = url
-                break
+                    break
 
     return [gene, selected_name, selected_url, failed]