Skip to content

Commit

Permalink
Use GoaT in addition to the NCBI because GoaT also has the freshest E…
Browse files Browse the repository at this point in the history
…NA taxon_ids

NCBI is still the first database we query
  • Loading branch information
muffato committed Aug 22, 2024
1 parent 8886dba commit 70f961c
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 12 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ The pipeline has now been validated for draft (unpublished) assemblies.

- The pipeline now queries the NCBI database instead of GoaT to establish the
taxonomic classification of the species and the relevant Busco lineages.
In case the taxon_id is not found, the pipeline falls back to GoaT, which
is aware of upcoming taxon_ids in ENA.
- New `--busco_lineages` parameter to choose specific Busco lineages instead of
automatically selecting based on the taxonomy.
- All parameters are now passed the regular Nextflow way. There is no support
Expand Down
4 changes: 4 additions & 0 deletions CITATIONS.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@
- [Fasta_windows](https://github.com/tolkit/fasta_windows)

- [GoaT](https://goat.genomehubs.org)

> Challis, Richard, et al. “Genomes on a Tree (GoaT): A versatile, scalable search engine for genomic and sequencing project metadata across the eukaryotic tree of life.” Wellcome Open Research, vol. 8, no. 24, 2023, https://doi.org/10.12688/wellcomeopenres.18658.1.
- [Minimap2](https://github.com/lh3/minimap2)

> Li, Heng. "Minimap2: pairwise alignment for nucleotide sequences." Bioinformatics, vol. 34, no. 18, Sep. 2018, pp. 3094-100, https://doi.org/10.1093/bioinformatics/bty191.
Expand Down
61 changes: 50 additions & 11 deletions bin/generate_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import yaml

NCBI_TAXONOMY_API = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/taxonomy/taxon/%s"
GOAT_LOOKUP_API = "https://goat.genomehubs.org/api/v2/lookup?searchTerm=%s&result=taxon&size=10&taxonomy=ncbi"
GOAT_RECORD_API = "https://goat.genomehubs.org/api/v2/record?recordId=%s&result=taxon&size=10&taxonomy=ncbi"
NCBI_DATASETS_API = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/%s/dataset_report?filters.assembly_version=all_assemblies"

RANKS = [
Expand Down Expand Up @@ -51,20 +53,57 @@ class TaxonInfo:
taxon_id: int
organism_name: str
rank: typing.Optional[str]
lineage: typing.List[int]
lineage: typing.List["TaxonInfo"]


def make_taxon_info(taxon_name: typing.Union[str, int]) -> TaxonInfo:
def make_taxon_info_from_goat(body: dict[str, str]) -> TaxonInfo:
rank = body["taxon_rank"]
if rank == "no rank":
rank = None
if "lineage" in body:
lineage = [make_taxon_info_from_goat(b) for b in body["lineage"]]
else:
lineage = []
return TaxonInfo(int(body["taxon_id"]), body["scientific_name"], rank, lineage)


def fetch_taxon_info_from_goat(taxon_name: typing.Union[str, int]) -> TaxonInfo:
if isinstance(taxon_name, int):
taxon_id = taxon_name
record_id = "taxon-%d" % taxon_name
else:
# Resolve the taxon_id of the species
response = requests.get(GOAT_LOOKUP_API % taxon_name).json()
taxon_id = int(response["results"][0]["result"]["taxon_id"])
record_id = response["results"][0]["id"]

# Using API, get the taxon_ids of the species and all parents
response = requests.get(GOAT_RECORD_API % record_id).json()
body = response["records"][0]["record"]
return make_taxon_info_from_goat(body)


def fetch_taxon_info_from_ncbi(taxon_name: typing.Union[str, int], with_lineage=True) -> typing.Optional[TaxonInfo]:
# Using API, get the taxon_ids of the species and all parents
response = requests.get(NCBI_TAXONOMY_API % taxon_name).json()
body = response["taxonomy_nodes"][0]["taxonomy"]
return TaxonInfo(body["tax_id"], body["organism_name"], body.get("rank"), body["lineage"])
if "taxonomy" in response["taxonomy_nodes"][0]:
body = response["taxonomy_nodes"][0]["taxonomy"]
if with_lineage:
lineage = [
fetch_taxon_info_from_ncbi(t, with_lineage=False) for t in reversed(body["lineage"][2:])
] # skip root and cellular_organisms
else:
lineage = []
return TaxonInfo(body["tax_id"], body["organism_name"], body.get("rank"), lineage)


def fetch_taxon_info(taxon_name: typing.Union[str, int]) -> TaxonInfo:
return fetch_taxon_info_from_ncbi(taxon_name) or fetch_taxon_info_from_goat(taxon_name)


def get_classification(taxon_info: TaxonInfo) -> typing.Dict[str, str]:
ancestors: typing.Dict[str, str] = {}
for anc_taxon_id in taxon_info.lineage[1:]: # skip the root taxon_id=1
anc_taxon_info = make_taxon_info(anc_taxon_id)
for anc_taxon_info in taxon_info.lineage:
if anc_taxon_info.rank:
ancestors[anc_taxon_info.rank.lower()] = anc_taxon_info.organism_name
# https://ncbiinsights.ncbi.nlm.nih.gov/2024/06/04/changes-ncbi-taxonomy-classifications/
Expand Down Expand Up @@ -92,9 +131,9 @@ def get_odb(taxon_info: TaxonInfo, lineage_tax_ids: str, requested_buscos: typin
else:
# Do the intersection to find the ancestors that have a BUSCO lineage
odb_arr = [
lineage_tax_ids_dict[taxon_id]
for taxon_id in reversed(taxon_info.lineage)
if taxon_id in lineage_tax_ids_dict
lineage_tax_ids_dict[anc_taxon_info.taxon_id]
for anc_taxon_info in taxon_info.lineage
if anc_taxon_info.taxon_id in lineage_tax_ids_dict
]

return odb_arr
Expand Down Expand Up @@ -129,7 +168,7 @@ def get_assembly_info(accession: str) -> typing.Dict[str, typing.Union[str, int]
def adjust_taxon_id(blastn: str, taxon_info: TaxonInfo) -> int:
con = sqlite3.connect(os.path.join(blastn, "taxonomy4blast.sqlite3"))
cur = con.cursor()
for taxon_id in [taxon_info.taxon_id] + list(reversed(taxon_info.lineage)):
for taxon_id in [taxon_info.taxon_id] + [anc_taxon_info.taxon_id for anc_taxon_info in taxon_info.lineage]:
res = cur.execute("SELECT * FROM TaxidInfo WHERE taxid = ?", (taxon_id,))
if res.fetchone():
return taxon_id
Expand Down Expand Up @@ -214,7 +253,7 @@ def main(args=None):
assembly_info = {"level": "scaffold"}
assembly_info["file"] = args.fasta

taxon_info = make_taxon_info(args.taxon_query)
taxon_info = fetch_taxon_info(args.taxon_query)
classification = get_classification(taxon_info)
odb_arr = get_odb(taxon_info, args.lineage_tax_ids, args.busco)
taxon_id = adjust_taxon_id(args.blastn, taxon_info)
Expand Down
2 changes: 1 addition & 1 deletion workflows/blobtoolkit.nf
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ workflow BLOBTOOLKIT {
ch_versions = ch_versions.mix ( COVERAGE_STATS.out.versions )

//
// SUBWORKFLOW: Run BUSCO using lineages fetched from NCBI, then run diamond_blastp
// SUBWORKFLOW: Run BUSCO using lineages fetched from GoaT, then run diamond_blastp
//
BUSCO_DIAMOND (
PREPARE_GENOME.out.genome,
Expand Down

0 comments on commit 70f961c

Please sign in to comment.