Use GoaT in addition to the NCBI because GoaT also has the freshest E…

…NA taxon_ids NCBI is still the first database we query
sanger-tol · Aug 22, 2024 · 70f961c · 70f961c
1 parent 8886dba
commit 70f961c
Show file tree

Hide file tree

Showing 4 changed files with 57 additions and 12 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,8 @@ The pipeline has now been validated for draft (unpublished) assemblies.
 
 - The pipeline now queries the NCBI database instead of GoaT to establish the
   taxonomic classification of the species and the relevant Busco lineages.
+  In case the taxon_id is not found, the pipeline falls back to GoaT, which
+  is aware of upcoming taxon_ids in ENA.
 - New `--busco_lineages` parameter to choose specific Busco lineages instead of
   automatically selecting based on the taxonomy.
 - All parameters are now passed the regular Nextflow way. There is no support

diff --git a/CITATIONS.md b/CITATIONS.md
@@ -26,6 +26,10 @@
 
 - [Fasta_windows](https://github.com/tolkit/fasta_windows)
 
+- [GoaT](https://goat.genomehubs.org)
+
+  > Challis, Richard, et al. “Genomes on a Tree (GoaT): A versatile, scalable search engine for genomic and sequencing project metadata across the eukaryotic tree of life.” Wellcome Open Research, vol. 8, no. 24, 2023, https://doi.org/10.12688/wellcomeopenres.18658.1.
+
 - [Minimap2](https://github.com/lh3/minimap2)
 
   > Li, Heng. "Minimap2: pairwise alignment for nucleotide sequences." Bioinformatics, vol. 34, no. 18, Sep. 2018, pp. 3094-100, https://doi.org/10.1093/bioinformatics/bty191.

diff --git a/bin/generate_config.py b/bin/generate_config.py
@@ -10,6 +10,8 @@
 import yaml
 
 NCBI_TAXONOMY_API = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/taxonomy/taxon/%s"
+GOAT_LOOKUP_API = "https://goat.genomehubs.org/api/v2/lookup?searchTerm=%s&result=taxon&size=10&taxonomy=ncbi"
+GOAT_RECORD_API = "https://goat.genomehubs.org/api/v2/record?recordId=%s&result=taxon&size=10&taxonomy=ncbi"
 NCBI_DATASETS_API = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/%s/dataset_report?filters.assembly_version=all_assemblies"
 
 RANKS = [
@@ -51,20 +53,57 @@ class TaxonInfo:
     taxon_id: int
     organism_name: str
     rank: typing.Optional[str]
-    lineage: typing.List[int]
+    lineage: typing.List["TaxonInfo"]
 
 
-def make_taxon_info(taxon_name: typing.Union[str, int]) -> TaxonInfo:
+def make_taxon_info_from_goat(body: dict[str, str]) -> TaxonInfo:
+    rank = body["taxon_rank"]
+    if rank == "no rank":
+        rank = None
+    if "lineage" in body:
+        lineage = [make_taxon_info_from_goat(b) for b in body["lineage"]]
+    else:
+        lineage = []
+    return TaxonInfo(int(body["taxon_id"]), body["scientific_name"], rank, lineage)
+
+
+def fetch_taxon_info_from_goat(taxon_name: typing.Union[str, int]) -> TaxonInfo:
+    if isinstance(taxon_name, int):
+        taxon_id = taxon_name
+        record_id = "taxon-%d" % taxon_name
+    else:
+        # Resolve the taxon_id of the species
+        response = requests.get(GOAT_LOOKUP_API % taxon_name).json()
+        taxon_id = int(response["results"][0]["result"]["taxon_id"])
+        record_id = response["results"][0]["id"]
+
+    # Using API, get the taxon_ids of the species and all parents
+    response = requests.get(GOAT_RECORD_API % record_id).json()
+    body = response["records"][0]["record"]
+    return make_taxon_info_from_goat(body)
+
+
+def fetch_taxon_info_from_ncbi(taxon_name: typing.Union[str, int], with_lineage=True) -> typing.Optional[TaxonInfo]:
     # Using API, get the taxon_ids of the species and all parents
     response = requests.get(NCBI_TAXONOMY_API % taxon_name).json()
-    body = response["taxonomy_nodes"][0]["taxonomy"]
-    return TaxonInfo(body["tax_id"], body["organism_name"], body.get("rank"), body["lineage"])
+    if "taxonomy" in response["taxonomy_nodes"][0]:
+        body = response["taxonomy_nodes"][0]["taxonomy"]
+        if with_lineage:
+            lineage = [
+                fetch_taxon_info_from_ncbi(t, with_lineage=False) for t in reversed(body["lineage"][2:])
+            ]  # skip root and cellular_organisms
+        else:
+            lineage = []
+        return TaxonInfo(body["tax_id"], body["organism_name"], body.get("rank"), lineage)
+
+
+def fetch_taxon_info(taxon_name: typing.Union[str, int]) -> TaxonInfo:
+    return fetch_taxon_info_from_ncbi(taxon_name) or fetch_taxon_info_from_goat(taxon_name)
 
 
 def get_classification(taxon_info: TaxonInfo) -> typing.Dict[str, str]:
     ancestors: typing.Dict[str, str] = {}
-    for anc_taxon_id in taxon_info.lineage[1:]:  # skip the root taxon_id=1
-        anc_taxon_info = make_taxon_info(anc_taxon_id)
+    for anc_taxon_info in taxon_info.lineage:
         if anc_taxon_info.rank:
             ancestors[anc_taxon_info.rank.lower()] = anc_taxon_info.organism_name
     # https://ncbiinsights.ncbi.nlm.nih.gov/2024/06/04/changes-ncbi-taxonomy-classifications/
@@ -92,9 +131,9 @@ def get_odb(taxon_info: TaxonInfo, lineage_tax_ids: str, requested_buscos: typin
     else:
         # Do the intersection to find the ancestors that have a BUSCO lineage
         odb_arr = [
-            lineage_tax_ids_dict[taxon_id]
-            for taxon_id in reversed(taxon_info.lineage)
-            if taxon_id in lineage_tax_ids_dict
+            lineage_tax_ids_dict[anc_taxon_info.taxon_id]
+            for anc_taxon_info in taxon_info.lineage
+            if anc_taxon_info.taxon_id in lineage_tax_ids_dict
         ]
 
     return odb_arr
@@ -129,7 +168,7 @@ def get_assembly_info(accession: str) -> typing.Dict[str, typing.Union[str, int]
 def adjust_taxon_id(blastn: str, taxon_info: TaxonInfo) -> int:
     con = sqlite3.connect(os.path.join(blastn, "taxonomy4blast.sqlite3"))
     cur = con.cursor()
-    for taxon_id in [taxon_info.taxon_id] + list(reversed(taxon_info.lineage)):
+    for taxon_id in [taxon_info.taxon_id] + [anc_taxon_info.taxon_id for anc_taxon_info in taxon_info.lineage]:
         res = cur.execute("SELECT * FROM TaxidInfo WHERE taxid = ?", (taxon_id,))
         if res.fetchone():
             return taxon_id
@@ -214,7 +253,7 @@ def main(args=None):
         assembly_info = {"level": "scaffold"}
     assembly_info["file"] = args.fasta
 
-    taxon_info = make_taxon_info(args.taxon_query)
+    taxon_info = fetch_taxon_info(args.taxon_query)
     classification = get_classification(taxon_info)
     odb_arr = get_odb(taxon_info, args.lineage_tax_ids, args.busco)
     taxon_id = adjust_taxon_id(args.blastn, taxon_info)

diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf
@@ -130,7 +130,7 @@ workflow BLOBTOOLKIT {
     ch_versions = ch_versions.mix ( COVERAGE_STATS.out.versions )
 
     //
-    // SUBWORKFLOW: Run BUSCO using lineages fetched from NCBI, then run diamond_blastp
+    // SUBWORKFLOW: Run BUSCO using lineages fetched from GoaT, then run diamond_blastp
     //
     BUSCO_DIAMOND (
         PREPARE_GENOME.out.genome,