diff --git a/.editorconfig b/.editorconfig index a30ae1e1..84a786d8 100644 --- a/.editorconfig +++ b/.editorconfig @@ -31,3 +31,12 @@ insert_final_newline = unset trim_trailing_whitespace = unset indent_style = unset indent_size = unset + +# To prevent errors for these test blastn databases +[/assets/test*/nt_*/*.{ndb,nhr,nin,nog,nos,not,nsq,ntf,nto}] +charset = unset +end_of_line = unset +insert_final_newline = unset +trim_trailing_whitespace = unset +indent_style = unset +indent_size = unset diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 19adb352..81a8380e 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -110,4 +110,3 @@ To get started: Devcontainer specs: - [DevContainer config](.devcontainer/devcontainer.json) -- [Dockerfile](.devcontainer/Dockerfile) diff --git a/.nf-core.yml b/.nf-core.yml index f2175469..6bed2cad 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -16,3 +16,5 @@ lint: multiqc_config: - report_comment actions_ci: false + template_strings: false + merge_markers: false diff --git a/assets/test/mCerEla1.1.buscogenes.dmnd b/assets/test/mMelMel3.1.buscogenes.dmnd similarity index 99% rename from assets/test/mCerEla1.1.buscogenes.dmnd rename to assets/test/mMelMel3.1.buscogenes.dmnd index bccca41d..391345ba 100644 Binary files a/assets/test/mCerEla1.1.buscogenes.dmnd and b/assets/test/mMelMel3.1.buscogenes.dmnd differ diff --git a/assets/test/mMelMel3.1.buscoregions.dmnd b/assets/test/mMelMel3.1.buscoregions.dmnd new file mode 100644 index 00000000..91fa6042 Binary files /dev/null and b/assets/test/mMelMel3.1.buscoregions.dmnd differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.ndb b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.ndb new file mode 100644 index 00000000..18062436 Binary files /dev/null and b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.ndb differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nhr b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nhr new file mode 100644 index 00000000..0b5d4906 
Binary files /dev/null and b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nhr differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nin b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nin new file mode 100644 index 00000000..bebd568b Binary files /dev/null and b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nin differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nog b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nog new file mode 100644 index 00000000..e6ef79c7 Binary files /dev/null and b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nog differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nos b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nos new file mode 100644 index 00000000..99700566 Binary files /dev/null and b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nos differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.not b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.not new file mode 100644 index 00000000..047e8d38 Binary files /dev/null and b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.not differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nsq b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nsq new file mode 100644 index 00000000..48497573 Binary files /dev/null and b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nsq differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.ntf b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.ntf new file mode 100644 index 00000000..3be5ea5b Binary files /dev/null and b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.ntf differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nto b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nto new file mode 100644 index 00000000..6d4a41c7 Binary files /dev/null and b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nto differ diff --git a/assets/test_full/gfLaeSulp1.1.buscoregions.dmnd b/assets/test_full/gfLaeSulp1.1.buscoregions.dmnd new file mode 100644 index 00000000..3f2a1a54 Binary files /dev/null and b/assets/test_full/gfLaeSulp1.1.buscoregions.dmnd differ diff --git a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.ndb 
b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.ndb new file mode 100644 index 00000000..0905629a Binary files /dev/null and b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.ndb differ diff --git a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nhr b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nhr new file mode 100644 index 00000000..1fa3521a Binary files /dev/null and b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nhr differ diff --git a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nin b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nin new file mode 100644 index 00000000..0503c4c7 Binary files /dev/null and b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nin differ diff --git a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nog b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nog new file mode 100644 index 00000000..7dcd60eb Binary files /dev/null and b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nog differ diff --git a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nos b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nos new file mode 100644 index 00000000..6bd1dcdf Binary files /dev/null and b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nos differ diff --git a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.not b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.not new file mode 100644 index 00000000..8bacddec Binary files /dev/null and b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.not differ diff --git a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nsq b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nsq new file mode 100644 index 00000000..6afe38e9 Binary files /dev/null and b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nsq differ diff --git a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.ntf b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.ntf new file mode 100644 index 00000000..efd34086 Binary files /dev/null and b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.ntf differ diff --git 
a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nto b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nto new file mode 100644 index 00000000..4b140ec3 Binary files /dev/null and b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nto differ
diff --git a/bin/nohitlist.sh b/bin/nohitlist.sh
new file mode 100755
index 00000000..c935cebe
--- /dev/null
+++ b/bin/nohitlist.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# List the ids of the sequences of a fasta file that have no hit below an
+# e-value threshold in a tabular blast output.
+#
+# Usage:  nohitlist.sh <fasta> <blast_table> <prefix> <evalue>
+# Output: <prefix>.nohit.txt, one sequence id per line
+
+# input
+fasta=$1
+blast=$2
+prefix=$3
+E=$4
+
+# find ids of sequences with no hits in the blastx search
+# - awk prints the query id (column 1) of every row whose e-value (column 14 with
+#   the pipeline's default blastx_cols) is below the threshold; "+ 0" forces a numeric comparison
+# - grep -F takes those ids as literal strings, not regexes (accessions usually contain a ".")
+grep '>' "$fasta" | \
+    grep -v -w -F -f <(awk -v evalue="$E" '($14 + 0) < (evalue + 0) {print $1}' "$blast" | sort -u) | \
+    cut -f1 | sed 's/>//' > "$prefix.nohit.txt"
diff --git a/conf/modules.config b/conf/modules.config index ebf62694..e1ac0bfc 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -37,12 +37,16 @@ process { ext.args = "--evalue 1.0e-25 --max-target-seqs 10 --max-hsps 1" } + withName: "DIAMOND_BLASTX" { + ext.args = "--evalue 1.0e-25 --max-target-seqs 10 --max-hsps 1" + } + withName: "BLOBTOOLKIT_WINDOWSTATS" { ext.args = "--window 0.1 --window 0.01 --window 1 --window 100000 --window 1000000" } withName: "BLOBTOOLKIT_BLOBDIR" { - ext.args = "--evalue 1.0e-25 --hit-count 10" + ext.args = "--evalue 1.0e-25 --hit-count 10 --update-plot" publishDir = [ path: { "${params.outdir}/" }, mode: params.publish_dir_mode, @@ -66,6 +70,26 @@ process { ] } + withName: "BLOBTOOLKIT_CHUNK" { + ext.args = "--chunk 100000 --overlap 0 --max-chunks 10 --min-length 1000" + } + + withName: "BLOBTOOLKIT_UNCHUNK" { + ext.args = "--count 10" + } + + withName: "NOHIT_LIST" { + ext.args = "1.0e-25" + } + + withName: "BLAST_BLASTN" { + ext.args = "-outfmt '6 qseqid staxids bitscore std' -max_target_seqs 10 -max_hsps 1 -evalue 1.0e-10 -lcase_masking -dust '20 64 1'" + } + + withName: "BLASTN" { + ext.args = "-outfmt '6 qseqid staxids bitscore std' -max_target_seqs 10 -max_hsps 1 -evalue 1.0e-10 -lcase_masking -dust '20 64 1'" + } + withName: 
"CUSTOM_DUMPSOFTWAREVERSIONS" { publishDir = [ path: { "${params.outdir}/blobtoolkit_info" }, diff --git a/conf/test.config b/conf/test.config index 165bfff6..7d4526c4 100644 --- a/conf/test.config +++ b/conf/test.config @@ -30,7 +30,9 @@ params { taxon = "Meles meles" // Databases - taxdump = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump" - busco = "/lustre/scratch123/tol/resources/nextflow/busco_2021_06_reduced/" - uniprot = "${projectDir}/assets/test/mCerEla1.1.buscogenes.dmnd" + taxdump = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump" + busco = "/lustre/scratch123/tol/resources/nextflow/busco_2021_06_reduced/" + blastp = "${projectDir}/assets/test/mMelMel3.1.buscogenes.dmnd" + blastx = "${projectDir}/assets/test/mMelMel3.1.buscoregions.dmnd" + blastn = "${projectDir}/assets/test/nt_mMelMel3.1" } diff --git a/conf/test_full.config b/conf/test_full.config index ee22dba2..d5f7ed7f 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -27,7 +27,9 @@ params { taxon = "Laetiporus sulphureus" // Databases - taxdump = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump" - busco = "/lustre/scratch123/tol/resources/busco/v5/" - uniprot = "${projectDir}/assets/test_full/gfLaeSulp1.1.buscogenes.dmnd" + taxdump = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump" + busco = "/lustre/scratch123/tol/resources/busco/v5/" + blastp = "${projectDir}/assets/test_full/gfLaeSulp1.1.buscogenes.dmnd" + blastx = "${projectDir}/assets/test_full/gfLaeSulp1.1.buscoregions.dmnd" + blastn = "${projectDir}/assets/test_full/nt_gfLaeSulp1.1" } diff --git a/modules.json b/modules.json index cd615550..d4d4d9d1 100644 --- a/modules.json +++ b/modules.json @@ -5,6 +5,11 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "blast/blastn": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, "busco": { "branch": 
"master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", @@ -21,6 +26,11 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, + "diamond/blastx": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, "fastawindows": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", @@ -50,6 +60,11 @@ "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] + }, + "seqtk/subseq": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] } } }, diff --git a/modules/local/blastn.nf b/modules/local/blastn.nf new file mode 100644 index 00000000..84c4b8ce --- /dev/null +++ b/modules/local/blastn.nf @@ -0,0 +1,40 @@ +process BLASTN { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::blast=2.13.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/blast:2.13.0--hf3cf87c_0' : + 'biocontainers/blast:2.13.0--hf3cf87c_0' }" + + input: + tuple val(meta), path(fasta) + path db + val taxid + + output: + tuple val(meta), file('*.blastn.txt'), emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def exclude_taxon = taxid ? 
"-negative_taxids ${taxid}" : '' + """ + DB=`find -L ./ -name "*.ndb" | sed 's/\\.ndb\$//'` + blastn \\ + -num_threads $task.cpus \\ + -db \$DB \\ + -query $fasta \\ + $exclude_taxon \\ + $args \\ + -out ${prefix}.blastn.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + blast: \$(blastn -version 2>&1 | sed 's/^.*blastn: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/local/blobtoolkit/blobdir.nf b/modules/local/blobtoolkit/blobdir.nf index 3f064bce..f07be0f6 100644 --- a/modules/local/blobtoolkit/blobdir.nf +++ b/modules/local/blobtoolkit/blobdir.nf @@ -11,7 +11,9 @@ process BLOBTOOLKIT_BLOBDIR { tuple val(meta), path(window, stageAs: 'windowstats/*') tuple val(meta1), path(busco) tuple val(meta2), path(blastp) - tuple val(meta3), path(yaml) + tuple val(meta3), path(blastx) + tuple val(meta4), path(blastn) + tuple val(meta5), path(yaml) path(taxdump) output: @@ -24,15 +26,19 @@ process BLOBTOOLKIT_BLOBDIR { script: def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" - def hits = blastp ? "--hits ${blastp}" : "" + def hits_blastp = blastp ? "--hits ${blastp}" : "" + def hits_blastx = blastx ? "--hits ${blastx}" : "" + def hits_blastn = blastn ? "--hits ${blastn}" : "" """ blobtools replace \\ --bedtsvdir windowstats \\ --meta ${yaml} \\ --taxdump ${taxdump} \\ - --taxrule buscogenes \\ + --taxrule bestdistorder=buscoregions \\ --busco ${busco} \\ - ${hits} \\ + ${hits_blastp} \\ + ${hits_blastx} \\ + ${hits_blastn} \\ --threads ${task.cpus} \\ $args \\ ${prefix} diff --git a/modules/local/blobtoolkit/chunk.nf b/modules/local/blobtoolkit/chunk.nf new file mode 100644 index 00000000..dc9a840b --- /dev/null +++ b/modules/local/blobtoolkit/chunk.nf @@ -0,0 +1,37 @@ +process BLOBTOOLKIT_CHUNK { + tag "$meta.id" + label 'process_single' + + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + exit 1, "BLOBTOOLKIT_CHUNK module does not support Conda. 
Please use Docker / Singularity / Podman instead." + } + container "genomehubs/blobtoolkit:4.1.5" + + input: + tuple val(meta) , path(fasta) + tuple val(meta2), path(busco_table) + + output: + tuple val(meta), path("*.chunks.fasta"), emit: chunks + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def busco = busco_table ? "--busco ${busco_table}" : "--busco None" + """ + btk pipeline chunk-fasta \\ + --in ${fasta} \\ + ${busco} \\ + --out ${prefix}.chunks.fasta \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + blobtoolkit: \$(btk --version | cut -d' ' -f2 | sed 's/v//') + END_VERSIONS + """ +} diff --git a/modules/local/blobtoolkit/unchunk.nf b/modules/local/blobtoolkit/unchunk.nf new file mode 100644 index 00000000..1ad7fae0 --- /dev/null +++ b/modules/local/blobtoolkit/unchunk.nf @@ -0,0 +1,34 @@ +process BLOBTOOLKIT_UNCHUNK { + tag "$meta.id" + label 'process_single' + + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + exit 1, "BLOBTOOLKIT_UNCHUNK module does not support Conda. Please use Docker / Singularity / Podman instead." 
+ } + container "genomehubs/blobtoolkit:4.1.5" + + input: + tuple val(meta), path(blast_table) + + output: + tuple val(meta), path("*.out"), emit: blast_out + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${blast_table}" + """ + btk pipeline unchunk-blast \\ + --in ${blast_table} \\ + --out ${prefix}.out \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + blobtoolkit: \$(btk --version | cut -d' ' -f2 | sed 's/v//') + END_VERSIONS + """ +} diff --git a/modules/local/nohit_list.nf b/modules/local/nohit_list.nf new file mode 100644 index 00000000..0df7ee53 --- /dev/null +++ b/modules/local/nohit_list.nf @@ -0,0 +1,32 @@ +process NOHIT_LIST { + tag "$meta.id" + label 'process_single' + + conda "conda-forge::gawk=5.1.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gawk:5.1.0' : + 'quay.io/biocontainers/gawk:5.1.0' }" + + input: + tuple val(meta), path(blast) //path to blast output table in txt format + tuple val(meta), path(fasta) //path to genome fasta file + + output: + tuple val(meta), path ('*.nohit.txt') , emit: nohitlist + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: // This script is bundled with the pipeline, in sanger-tol/blobtoolkit/bin/ + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + nohitlist.sh ${fasta} ${blast} ${prefix} $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + nohit_list: 1.0 + END_VERSIONS + """ +} diff --git a/modules/nf-core/blast/blastn/main.nf b/modules/nf-core/blast/blastn/main.nf new file mode 100644 index 00000000..9a1f3a55 --- /dev/null +++ b/modules/nf-core/blast/blastn/main.nf @@ -0,0 +1,37 @@ +process BLAST_BLASTN { + tag "$meta.id" + label 'process_medium' + + conda 
"bioconda::blast=2.13.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/blast:2.13.0--hf3cf87c_0' : + 'biocontainers/blast:2.13.0--hf3cf87c_0' }" + + input: + tuple val(meta), path(fasta) + path db + + output: + tuple val(meta), path('*.blastn.txt'), emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + DB=`find -L ./ -name "*.ndb" | sed 's/\\.ndb\$//'` + blastn \\ + -num_threads $task.cpus \\ + -db \$DB \\ + -query $fasta \\ + $args \\ + -out ${prefix}.blastn.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + blast: \$(blastn -version 2>&1 | sed 's/^.*blastn: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/blast/blastn/meta.yml b/modules/nf-core/blast/blastn/meta.yml new file mode 100644 index 00000000..2742278d --- /dev/null +++ b/modules/nf-core/blast/blastn/meta.yml @@ -0,0 +1,41 @@ +name: blast_blastn +description: Queries a BLAST DNA database +keywords: + - fasta + - blast + - blastn + - DNA sequence +tools: + - blast: + description: | + BLAST finds regions of similarity between biological sequences. + homepage: https://blast.ncbi.nlm.nih.gov/Blast.cgi + documentation: https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=Blastdocs + doi: 10.1016/S0022-2836(05)80360-2 + licence: ["US-Government-Work"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - fasta: + type: file + description: Input fasta file containing queries sequences + pattern: "*.{fa,fasta}" + - db: + type: directory + description: Directory containing blast database + pattern: "*" +output: + - txt: + type: file + description: File containing blastn hits + pattern: "*.{blastn.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/diamond/blastx/main.nf b/modules/nf-core/diamond/blastx/main.nf new file mode 100644 index 00000000..e08fb0d9 --- /dev/null +++ b/modules/nf-core/diamond/blastx/main.nf @@ -0,0 +1,68 @@ +process DIAMOND_BLASTX { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::diamond=2.0.15" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/diamond:2.0.15--hb97b32f_0' : + 'biocontainers/diamond:2.0.15--hb97b32f_0' }" + + input: + tuple val(meta), path(fasta) + path db + val out_ext + val blast_columns + + output: + tuple val(meta), path('*.blast'), optional: true, emit: blast + tuple val(meta), path('*.xml') , optional: true, emit: xml + tuple val(meta), path('*.txt') , optional: true, emit: txt + tuple val(meta), path('*.daa') , optional: true, emit: daa + tuple val(meta), path('*.sam') , optional: true, emit: sam + tuple val(meta), path('*.tsv') , optional: true, emit: tsv + tuple val(meta), path('*.paf') , optional: true, emit: paf + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def columns = blast_columns ? 
"${blast_columns}" : '' + switch ( out_ext ) { + case "blast": outfmt = 0; break + case "xml": outfmt = 5; break + case "txt": outfmt = 6; break + case "daa": outfmt = 100; break + case "sam": outfmt = 101; break + case "tsv": outfmt = 102; break + case "paf": outfmt = 103; break + default: + outfmt = '6'; + out_ext = 'txt'; + log.warn("Unknown output file format provided (${out_ext}): selecting DIAMOND default of tabular BLAST output (txt)"); + break + } + """ + DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'` + + diamond \\ + blastx \\ + --threads $task.cpus \\ + --db \$DB \\ + --query $fasta \\ + --outfmt ${outfmt} ${columns} \\ + $args \\ + --out ${prefix}.${out_ext} \\ + --log + + mv diamond.log ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + diamond: \$(diamond --version 2>&1 | tail -n 1 | sed 's/^diamond version //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/diamond/blastx/meta.yml b/modules/nf-core/diamond/blastx/meta.yml new file mode 100644 index 00000000..a2a6013d --- /dev/null +++ b/modules/nf-core/diamond/blastx/meta.yml @@ -0,0 +1,81 @@ +name: diamond_blastx +description: Queries a DIAMOND database using blastx mode +keywords: + - fasta + - diamond + - blastx + - DNA sequence +tools: + - diamond: + description: Accelerated BLAST compatible local sequence aligner + homepage: https://github.com/bbuchfink/diamond + documentation: https://github.com/bbuchfink/diamond/wiki + tool_dev_url: https://github.com/bbuchfink/diamond + doi: "10.1038/s41592-021-01101-x" + licence: ["GPL v3.0"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - fasta: + type: file + description: Input fasta file containing query sequences + pattern: "*.{fa,fasta}" + - db: + type: directory + description: Directory containing the nucelotide blast database + pattern: "*" + - out_ext: + type: string + description: | + Specify the type of output file to be generated. `blast` corresponds to + BLAST pairwise format. `xml` corresponds to BLAST xml format. + `txt` corresponds to to BLAST tabular format. `tsv` corresponds to + taxonomic classification format. + pattern: "blast|xml|txt|daa|sam|tsv|paf" + +output: + - blast: + type: file + description: File containing blastp hits + pattern: "*.{blast}" + - xml: + type: file + description: File containing blastp hits + pattern: "*.{xml}" + - txt: + type: file + description: File containing hits in tabular BLAST format. + pattern: "*.{txt}" + - daa: + type: file + description: File containing hits DAA format + pattern: "*.{daa}" + - sam: + type: file + description: File containing aligned reads in SAM format + pattern: "*.{sam}" + - tsv: + type: file + description: Tab separated file containing taxonomic classification of hits + pattern: "*.{tsv}" + - paf: + type: file + description: File containing aligned reads in pairwise mapping format format + pattern: "*.{paf}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - log: + type: file + description: Log file containing stdout information + pattern: "*.{log}" + +authors: + - "@spficklin" + - "@jfy133" + - "@mjamy" diff --git a/modules/nf-core/seqtk/subseq/main.nf b/modules/nf-core/seqtk/subseq/main.nf new file mode 100644 index 00000000..1ba887ee --- /dev/null +++ b/modules/nf-core/seqtk/subseq/main.nf @@ -0,0 +1,41 @@ +process SEQTK_SUBSEQ { + tag "$sequences" + label 'process_single' + + conda "bioconda::seqtk=1.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/seqtk:1.3--h5bf99c6_3' : + 'biocontainers/seqtk:1.3--h5bf99c6_3' }" + + input: + path sequences + path filter_list + + output: + path "*.gz" , emit: sequences + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: '' + def ext = "fa" + if ("$sequences" ==~ /.+\.fq|.+\.fq.gz|.+\.fastq|.+\.fastq.gz/) { + ext = "fq" + } + """ + seqtk \\ + subseq \\ + $args \\ + $sequences \\ + $filter_list | \\ + gzip --no-name > ${sequences}${prefix}.${ext}.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/seqtk/subseq/meta.yml b/modules/nf-core/seqtk/subseq/meta.yml new file mode 100644 index 00000000..d06efb55 --- /dev/null +++ b/modules/nf-core/seqtk/subseq/meta.yml @@ -0,0 +1,34 @@ +name: seqtk_subseq +description: Select only sequences that match the filtering condition +keywords: + - filtering,selection +tools: + - seqtk: + description: Seqtk is a fast and lightweight tool for processing sequences in the FASTA or FASTQ format + homepage: https://github.com/lh3/seqtk + documentation: https://docs.csc.fi/apps/seqtk/ + tool_dev_url: https://github.com/lh3/seqtk + licence: ["MIT"] + +input: + - sequences: + type: file + description: FASTQ/FASTA file + pattern: "*.{fq,fq.gz,fa,fa.gz}" + - filter_list: + type: file + description: BED file or a text file with a list of sequence names + pattern: "*.{bed,lst}" + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - sequences: + type: file + description: FASTQ/FASTA file + pattern: "*.{fq.gz,fa.gz}" + +authors: + - "@sidorov-si" diff --git a/nextflow.config b/nextflow.config index 988b2a1c..2d031482 100644 --- a/nextflow.config +++ b/nextflow.config @@ -23,9 +23,14 @@ params { // 
Databases and related options taxdump = null busco = null - uniprot = null + blastp = null + blastx = null + blastn = null blastp_outext = 'txt' blastp_cols = 'qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore' + blastx_outext = 'txt' + blastx_cols = 'qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore' + // MultiQC options multiqc_config = null @@ -47,7 +52,7 @@ params { version = false validate_params = true show_hidden_params = false - schema_ignore_params = 'genomes' + // Config options custom_config_version = 'master' @@ -63,6 +68,10 @@ params { max_cpus = 16 max_time = '240.h' + // Schema validation default options + validationShowHiddenParams = false + validationSchemaIgnoreParams = '' + } // Load base.config by default for all pipelines diff --git a/nextflow_schema.json b/nextflow_schema.json index a960bee2..2f8c25d9 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -79,7 +79,7 @@ "type": "object", "fa_icon": "fas fa-database", "description": "Define the location and parameters to work with databases.", - "required": ["uniprot", "taxdump"], + "required": ["blastp", "blastx", "blastn", "taxdump"], "properties": { "taxa_file": { "type": "string", @@ -106,13 +106,38 @@ "fa_icon": "fas fa-file-circle-question", "default": "txt" }, - "uniprot": { + "blastx_cols": { + "type": "string", + "description": "When blastx_outext is 'txt', this is the list of columns that Diamond BLAST should print.", + "default": "qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore" + }, + "blastx_outext": { + "type": "string", + "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"], + "description": "Extension (file format) of the output file from Diamond BLAST.", + "fa_icon": "fas fa-file-circle-question", + "default": "txt" + }, + "blastp": { "type": "string", "format": "file-path", "pattern": 
"^\\S+\\.dmnd$", "description": "Path to the Diamond species-specific buscogenes database", "fa_icon": "fas fa-file-archive" }, + "blastx": { + "type": "string", + "format": "file-path", + "pattern": "^\\S+\\.dmnd$", + "description": "Path to the Diamond species-specific buscoregions database", + "fa_icon": "fas fa-file-archive" + }, + "blastn": { + "type": "string", + "format": "file-path", + "description": "Path to the nucleotide BLAST database", + "fa_icon": "fas fa-file-archive" + }, "taxdump": { "type": "string", "format": "directory-path", @@ -304,6 +329,14 @@ "description": "Show all params when using `--help`", "hidden": true, "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." + }, + "validationShowHiddenParams": { + "type": "boolean", + "fa_icon": "far fa-eye-slash", + "description": "Show all params when using `--help`", + "default": false, + "hidden": true, + "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." 
} } } diff --git a/subworkflows/local/blobtools.nf b/subworkflows/local/blobtools.nf index 730e1334..593147a9 100644 --- a/subworkflows/local/blobtools.nf +++ b/subworkflows/local/blobtools.nf @@ -11,6 +11,8 @@ workflow BLOBTOOLS { windowstats // channel: [ val(meta), path(window_stats_tsvs) ] busco // channel: [ val(meta), path(full_table) ] blastp // channel: [ val(meta), path(txt) ] + blastx // channel: [ val(meta), path(txt) ] + blastn // channel: [ val(meta), path(txt) ] taxdump // channel: path(taxdump_db) @@ -28,7 +30,7 @@ workflow BLOBTOOLS { // // Create Blobtools dataset files // - BLOBTOOLKIT_BLOBDIR ( windowstats, busco, blastp, BLOBTOOLKIT_METADATA.out.yaml, taxdump ) + BLOBTOOLKIT_BLOBDIR ( windowstats, busco, blastp, blastx, blastn, BLOBTOOLKIT_METADATA.out.yaml, taxdump ) ch_versions = ch_versions.mix ( BLOBTOOLKIT_BLOBDIR.out.versions.first() ) diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf index 44fe8b6c..78554f5e 100644 --- a/subworkflows/local/busco_diamond_blastp.nf +++ b/subworkflows/local/busco_diamond_blastp.nf @@ -27,6 +27,18 @@ workflow BUSCO_DIAMOND { // GOAT_TAXONSEARCH ( taxon_taxa ) ch_versions = ch_versions.mix ( GOAT_TAXONSEARCH.out.versions.first() ) + + + // + // Get NCBI species ID + // + GOAT_TAXONSEARCH.out.taxonsearch + | map { meta, csv -> csv.splitCsv(header:true, sep:'\t', strip:true) } + | map { row -> [ row.taxon_rank, row.taxon_id ] } + | transpose() + | filter { rank,id -> rank =~ /species/ } + | map { rank, id -> id} + | set { ch_taxid } // @@ -91,6 +103,7 @@ workflow BUSCO_DIAMOND { first_table = ch_first_table // channel: [ val(meta), path(full_table) ] full_table = BUSCO.out.full_table // channel: [ val(meta), path(full_tables) ] blastp_txt = DIAMOND_BLASTP.out.txt // channel: [ val(meta), path(txt) ] + taxon_id = ch_taxid // channel: taxon_id multiqc // channel: [ meta, summary ] versions = ch_versions // channel: [ versions.yml ] } diff --git 
a/subworkflows/local/run_blastn.nf b/subworkflows/local/run_blastn.nf
new file mode 100644
index 00000000..fb8d0f9d
--- /dev/null
+++ b/subworkflows/local/run_blastn.nf
@@ -0,0 +1,83 @@
+//
+// BLASTN search of assembly contigs with no diamond blastx match against the nucleotide database
+//
+
+
+include { NOHIT_LIST             } from '../../modules/local/nohit_list'
+include { SEQTK_SUBSEQ           } from '../../modules/nf-core/seqtk/subseq/main'
+include { GUNZIP                 } from '../../modules/nf-core/gunzip/main'
+include { BLOBTOOLKIT_CHUNK      } from '../../modules/local/blobtoolkit/chunk'
+include { BLASTN as BLASTN_TAXON } from '../../modules/local/blastn'
+include { BLASTN                 } from '../../modules/local/blastn'
+include { BLOBTOOLKIT_UNCHUNK    } from '../../modules/local/blobtoolkit/unchunk'
+
+
+workflow RUN_BLASTN {
+    take:
+    blast_table  // channel: [ val(meta), path(blast_table) ]
+    fasta        // channel: [ val(meta), path(fasta) ]
+    blastn       // channel: path(blastn_db)
+    taxon_id     // channel: val(taxon_id)
+
+
+    main:
+    ch_versions = Channel.empty()
+
+
+    // Extract no hits fasta
+    // Get list of sequence ids with no hits in diamond blastx search
+    NOHIT_LIST ( blast_table, fasta )
+    ch_versions = ch_versions.mix ( NOHIT_LIST.out.versions.first() )
+    // Subset of sequences with no hits (meta is not propagated in this step)
+    SEQTK_SUBSEQ (
+        fasta.map { meta, genome -> genome },
+        NOHIT_LIST.out.nohitlist.map { meta, nohit -> nohit }
+    )
+    ch_versions = ch_versions.mix ( SEQTK_SUBSEQ.out.versions.first() )
+
+
+    // Split long contigs into chunks
+    // add meta to fasta subset channel: [ val(meta), path(compressed_fasta) ]
+    ch_gz = fasta.combine(SEQTK_SUBSEQ.out.sequences).map { meta, genome, seq -> [ meta, seq ] }
+    // uncompress fasta
+    GUNZIP ( ch_gz )
+    // create chunks
+    BLOBTOOLKIT_CHUNK ( GUNZIP.out.gunzip, [[],[]] )
+    ch_versions = ch_versions.mix ( BLOBTOOLKIT_CHUNK.out.versions.first() )
+
+
+    // Run blastn search
+    // run blastn excluding taxon_id
+    BLASTN_TAXON ( BLOBTOOLKIT_CHUNK.out.chunks, blastn, taxon_id )
+    ch_versions = ch_versions.mix ( BLASTN_TAXON.out.versions.first() )
+    // check if blastn output table is empty
+    // (a channel object is always "true" in a Groovy `if`, so the test has to be
+    // done on the channel items with an operator)
+    BLASTN_TAXON.out.txt
+    | branch { meta, txt ->
+        empty: txt.isEmpty()
+        not_empty: true
+    }
+    | set { ch_blastn_taxon_out }
+    // repeat the blastn search without excluding taxon_id, only for the inputs that got an empty table
+    BLOBTOOLKIT_CHUNK.out.chunks
+    | join ( ch_blastn_taxon_out.empty )
+    | map { meta, chunks, txt -> [ meta, chunks ] }
+    | set { ch_blastn_retry }
+    BLASTN ( ch_blastn_retry, blastn, [] )
+    ch_versions = ch_versions.mix ( BLASTN.out.versions.first() )
+    // keep the non-empty tables of the first search, plus the tables of the repeated search
+    ch_blastn_taxon_out.not_empty
+    | mix ( BLASTN.out.txt )
+    | set { ch_blastn_txt }
+
+
+    // Unchunk chunked blastn results
+    BLOBTOOLKIT_UNCHUNK ( ch_blastn_txt )
+    ch_versions = ch_versions.mix ( BLOBTOOLKIT_UNCHUNK.out.versions.first() )
+
+
+    emit:
+    blastn_out = BLOBTOOLKIT_UNCHUNK.out.blast_out  // channel: [ val(meta), path(blastn_out) ]
+    versions   = ch_versions                        // channel: [ versions.yml ]
+}
diff --git a/subworkflows/local/run_blastx.nf b/subworkflows/local/run_blastx.nf new file mode 100644 index 00000000..29bd365b --- /dev/null +++ b/subworkflows/local/run_blastx.nf @@ -0,0 +1,42 @@ +// +// Create BlobTools dataset +// + +include { BLOBTOOLKIT_CHUNK } from '../../modules/local/blobtoolkit/chunk' +include { BLOBTOOLKIT_UNCHUNK } from '../../modules/local/blobtoolkit/unchunk' +include { DIAMOND_BLASTX } from '../../modules/nf-core/diamond/blastx/main' + +workflow RUN_BLASTX { + take: + fasta // channel: [ val(meta), path(fasta) ] + table // channel: [ val(meta), path(busco_table) ] + blastx // channel: path(blastx_db) + outext // channel: val(out_format) + cols // channel: val(column_names) + + + main: + ch_versions = Channel.empty() + + + // + // Create metadata summary file + // + BLOBTOOLKIT_CHUNK ( fasta, table ) + ch_versions = ch_versions.mix ( BLOBTOOLKIT_CHUNK.out.versions.first() ) + + // + // Run diamond_blastx + // + DIAMOND_BLASTX ( BLOBTOOLKIT_CHUNK.out.chunks, blastx, outext, cols) + + // + // Unchunk chunked blastx results + // + BLOBTOOLKIT_UNCHUNK ( DIAMOND_BLASTX.out.txt ) + + + emit: + blastx_out = BLOBTOOLKIT_UNCHUNK.out.blast_out // channel: [ val(meta), 
path(blastx_out) ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index c8dad117..68439f01 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -11,14 +11,16 @@ WorkflowBlobtoolkit.initialise(params, log) // Add all file path parameters for the pipeline to the list below // Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.multiqc_config, params.fasta, params.taxa_file, params.taxdump, params.busco, params.uniprot ] +def checkPathParamList = [ params.input, params.multiqc_config, params.fasta, params.taxa_file, params.taxdump, params.busco, params.blastp, params.blastx ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } // Check mandatory parameters if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } if (params.fasta && params.accession) { ch_fasta = Channel.of([ [ 'id': params.accession ], params.fasta ]).collect() } else { exit 1, 'Genome fasta file and accession must be specified!' } if (params.taxon) { ch_taxon = Channel.of(params.taxon) } else { exit 1, 'NCBI Taxon ID not specified!' } -if (params.uniprot) { ch_uniprot = file(params.uniprot) } else { exit 1, 'Diamond BLASTp database not specified!' } +if (params.blastp) { ch_blastp = file(params.blastp) } else { exit 1, 'Diamond BLASTp database not specified!' } +if (params.blastx) { ch_blastx = file(params.blastx) } else { exit 1, 'Diamond BLASTx database not specified!' } +if (params.blastn) { ch_blastn = file(params.blastn) } else { exit 1, 'BLASTn database not specified!' } if (params.taxdump) { ch_taxdump = file(params.taxdump) } else { exit 1, 'NCBI Taxonomy database not specified!' 
} // Create channel for optional parameters @@ -50,12 +52,14 @@ include { BLOBTOOLKIT_CONFIG } from '../modules/local/blobtoolkit/config' // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK } from '../subworkflows/local/input_check' -include { COVERAGE_STATS } from '../subworkflows/local/coverage_stats' -include { BUSCO_DIAMOND } from '../subworkflows/local/busco_diamond_blastp' -include { COLLATE_STATS } from '../subworkflows/local/collate_stats' -include { BLOBTOOLS } from '../subworkflows/local/blobtools' -include { VIEW } from '../subworkflows/local/view' +include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { COVERAGE_STATS } from '../subworkflows/local/coverage_stats' +include { BUSCO_DIAMOND } from '../subworkflows/local/busco_diamond_blastp' +include { RUN_BLASTX } from '../subworkflows/local/run_blastx' +include { RUN_BLASTN } from '../subworkflows/local/run_blastn' +include { COLLATE_STATS } from '../subworkflows/local/collate_stats' +include { BLOBTOOLS } from '../subworkflows/local/blobtools' +include { VIEW } from '../subworkflows/local/view' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -115,8 +119,27 @@ workflow BLOBTOOLKIT { ch_taxon_taxa = ch_fasta.combine(ch_taxon).map { meta, fasta, taxon -> [ meta, taxon, [] ] } } - BUSCO_DIAMOND ( ch_genome, ch_taxon_taxa, ch_busco_db, ch_uniprot, params.blastp_outext, params.blastp_cols ) + BUSCO_DIAMOND ( ch_genome, ch_taxon_taxa, ch_busco_db, ch_blastp, params.blastp_outext, params.blastp_cols ) ch_versions = ch_versions.mix ( BUSCO_DIAMOND.out.versions ) + + + // + // SUBWORKFLOW: Run Diamond blastx to search protein database with assembly query + // + RUN_BLASTX ( + ch_genome, + BUSCO_DIAMOND.out.first_table, + ch_blastx, + params.blastx_outext, + params.blastx_cols + ) + + + // + // SUBWORKFLOW: Run blastn search on sequences that had no blastx hits + // + RUN_BLASTN ( RUN_BLASTX.out.blastx_out, 
ch_genome, ch_blastn, BUSCO_DIAMOND.out.taxon_id ) + // // SUBWORKFLOW: Collate genome statistics by various window sizes @@ -135,7 +158,15 @@ workflow BLOBTOOLKIT { ch_config = ch_yaml } - BLOBTOOLS ( ch_config, COLLATE_STATS.out.window_tsv, BUSCO_DIAMOND.out.first_table, BUSCO_DIAMOND.out.blastp_txt.ifEmpty([[],[]]), ch_taxdump ) + BLOBTOOLS ( + ch_config, + COLLATE_STATS.out.window_tsv, + BUSCO_DIAMOND.out.first_table, + BUSCO_DIAMOND.out.blastp_txt.ifEmpty([[],[]]), + RUN_BLASTX.out.blastx_out.ifEmpty([[],[]]), + RUN_BLASTN.out.blastn_out.ifEmpty([[],[]]), + ch_taxdump + ) ch_versions = ch_versions.mix ( BLOBTOOLS.out.versions ) //