From b99101bad1fa6e48e1d9667ab9aff61ac531bdbf Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 15 Jan 2024 18:26:24 +0000 Subject: [PATCH] Report all BUSCOs and in the right order --- modules/local/blobtoolkit/createblobdir.nf | 5 ++-- subworkflows/local/busco_diamond_blastp.nf | 27 +++++++++++++++++----- subworkflows/local/collate_stats.nf | 8 ++----- workflows/blobtoolkit.nf | 4 ++-- 4 files changed, 28 insertions(+), 16 deletions(-) diff --git a/modules/local/blobtoolkit/createblobdir.nf b/modules/local/blobtoolkit/createblobdir.nf index 54810650..22399365 100644 --- a/modules/local/blobtoolkit/createblobdir.nf +++ b/modules/local/blobtoolkit/createblobdir.nf @@ -9,7 +9,7 @@ process BLOBTOOLKIT_CREATEBLOBDIR { input: tuple val(meta), path(window, stageAs: 'windowstats/*') - tuple val(meta1), path(busco) + tuple val(meta1), path(busco, stageAs: 'lineage??/*') tuple val(meta2), path(blastp) tuple val(meta3), path(yaml) path(taxdump) @@ -24,6 +24,7 @@ process BLOBTOOLKIT_CREATEBLOBDIR { script: def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" + def busco_args = busco.collect { "--busco " + it } .join(' ') def hits_blastp = blastp ? 
"--hits ${blastp}" : "" """ blobtools replace \\ @@ -31,7 +32,7 @@ process BLOBTOOLKIT_CREATEBLOBDIR { --meta ${yaml} \\ --taxdump ${taxdump} \\ --taxrule buscogenes \\ - --busco ${busco} \\ + ${busco_args} \\ ${hits_blastp} \\ --threads ${task.cpus} \\ $args \\ diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf index 6037de19..709bffaf 100644 --- a/subworkflows/local/busco_diamond_blastp.nf +++ b/subworkflows/local/busco_diamond_blastp.nf @@ -51,7 +51,7 @@ workflow BUSCO_DIAMOND { // Add the basal lineages to the list (excluding duplicates) - basal_lineages = [ "archaea_odb10", "bacteria_odb10", "eukaryota_odb10" ] + basal_lineages = [ "eukaryota_odb10", "bacteria_odb10", "archaea_odb10" ] ch_ancestral_lineages | map { lineages -> (lineages + basal_lineages).unique() } | flatten () @@ -86,11 +86,26 @@ workflow BUSCO_DIAMOND { ch_versions = ch_versions.mix ( DIAMOND_BLASTP.out.versions.first() ) - // Select BUSCO results for taxonomically closest database + // Index the lineages in the taxonomic order + def lineage_index = 0 + ch_lineages + | map { lineage -> [lineage, lineage_index++] } + | set { ch_ordered_lineages } + + + // Order BUSCO results according to ch_lineages BUSCO.out.full_table - | combine ( ch_lineages.toList().map { it[0] } ) - | filter { meta, table, lineage -> table =~ /$lineage/ } - | map { meta, table, lineage -> [ meta, table ] } + | map { meta, table -> [table.parent.baseName.minus("run_"), meta, table] } + | join ( ch_ordered_lineages ) + | map { lineage, meta, table, index -> [meta, table, index] } + | groupTuple() + | map { meta, tables, indexes -> [ meta, tables.withIndex().sort { a, b -> indexes[a[1]] <=> indexes[b[1]] } . 
collect { table, i -> table } ] } + | set { ch_indexed_buscos } + + + // Select BUSCO results for taxonomically closest database + ch_indexed_buscos + | map { meta, tables -> [meta, tables[0]] } | set { ch_first_table } @@ -102,7 +117,7 @@ workflow BUSCO_DIAMOND { emit: first_table = ch_first_table // channel: [ val(meta), path(full_table) ] - full_table = BUSCO.out.full_table // channel: [ val(meta), path(full_tables) ] + all_tables = ch_indexed_buscos // channel: [ val(meta), path(full_tables) ] blastp_txt = DIAMOND_BLASTP.out.txt // channel: [ val(meta), path(txt) ] taxon_id = ch_taxid // channel: taxon_id multiqc // channel: [ meta, summary ] diff --git a/subworkflows/local/collate_stats.nf b/subworkflows/local/collate_stats.nf index 21baf44a..08bc43c9 100644 --- a/subworkflows/local/collate_stats.nf +++ b/subworkflows/local/collate_stats.nf @@ -9,7 +9,7 @@ include { BLOBTOOLKIT_WINDOWSTATS } from '../../modules/local/blobtoolkit/window workflow COLLATE_STATS { take: - busco_table // channel: [ val(meta), path(full_table) ] + busco // channel: [ val(meta), path(full_table) ] bed // channel: [ val(meta), path(bed) ] freq // channel: [ val(meta), path(freq) ] mononuc // channel: [ val(meta), path(mononuc) ] @@ -20,11 +20,7 @@ workflow COLLATE_STATS { // Count BUSCO genes in a region - busco_table - | groupTuple() - | set { ch_busco } - - BLOBTOOLKIT_COUNTBUSCOS ( ch_busco, bed ) + BLOBTOOLKIT_COUNTBUSCOS ( busco, bed ) ch_versions = ch_versions.mix ( BLOBTOOLKIT_COUNTBUSCOS.out.versions.first() ) diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index aeb1bdfc..944ccc4c 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -172,7 +172,7 @@ workflow BLOBTOOLKIT { // SUBWORKFLOW: Collate genome statistics by various window sizes // COLLATE_STATS ( - BUSCO_DIAMOND.out.full_table, + BUSCO_DIAMOND.out.all_tables, COVERAGE_STATS.out.bed, COVERAGE_STATS.out.freq, COVERAGE_STATS.out.mononuc, @@ -186,7 +186,7 @@ workflow BLOBTOOLKIT { 
BLOBTOOLS ( INPUT_CHECK.out.config, COLLATE_STATS.out.window_tsv, - BUSCO_DIAMOND.out.first_table, + BUSCO_DIAMOND.out.all_tables, BUSCO_DIAMOND.out.blastp_txt.ifEmpty([[],[]]), RUN_BLASTX.out.blastx_out.ifEmpty([[],[]]), RUN_BLASTN.out.blastn_out.ifEmpty([[],[]]),