diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml index b30ada17..b26623a3 100644 --- a/.github/workflows/branch.yml +++ b/.github/workflows/branch.yml @@ -1,15 +1,15 @@ name: nf-core branch protection -# This workflow is triggered on PRs to master branch on the repository -# It fails when someone tries to make a PR against the nf-core `master` branch instead of `dev` +# This workflow is triggered on PRs to the main branch on the repository +# It fails when someone tries to make a PR against the nf-core `main` branch instead of `dev` on: pull_request_target: - branches: [master] + branches: [main] jobs: test: runs-on: ubuntu-latest steps: - # PRs to the nf-core repo master branch are only ok if coming from the nf-core repo `dev` or any `patch` branches + # PRs to the nf-core repo main branch are only OK if coming from the nf-core repo `dev` or any `patch` branches - name: Check PRs if: github.repository == 'sanger-tol/blobtoolkit' run: | @@ -22,7 +22,7 @@ jobs: uses: mshick/add-pr-comment@v1 with: message: | - ## This PR is against the `master` branch :x: + ## This PR is against the `main` branch :x: * Do not close this PR * Click _Edit_ and change the `base` to `dev` @@ -32,9 +32,9 @@ jobs: Hi @${{ github.event.pull_request.user.login }}, - It looks like this pull-request is has been made against the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `master` branch. - The `master` branch on nf-core repositories should always contain code from the latest release. - Because of this, PRs to `master` are only allowed if they come from the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `dev` branch. + It looks like this pull-request has been made against the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `main` branch. + The `main` branch on nf-core repositories should always contain code from the latest release. + Because of this, PRs to `main` are only allowed if they come from the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `dev` branch. You do not need to close this PR, you can change the target branch to `dev` by clicking the _"Edit"_ button at the top of this page. Note that even after this, the test will continue to show as failing until you push a new commit.
diff --git a/.github/workflows/sanger_test.yml b/.github/workflows/sanger_test.yml index 406a6280..32849b2e 100644 --- a/.github/workflows/sanger_test.yml +++ b/.github/workflows/sanger_test.yml @@ -17,7 +17,7 @@ jobs: with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} - compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} + compute_env: ${{ secrets.TOWER_COMPUTE_ENV_LARGE }} revision: ${{ env.REVISION }} workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ env.REVISION }} parameters: | diff --git a/.github/workflows/sanger_test_full.yml b/.github/workflows/sanger_test_full.yml index e3a25f7b..b44c29f4 100644 --- a/.github/workflows/sanger_test_full.yml +++ b/.github/workflows/sanger_test_full.yml @@ -1,6 +1,10 @@ name: sanger-tol LSF full size tests on: + push: + branches: + - main + - dev workflow_dispatch: jobs: run-tower: @@ -22,7 +26,7 @@ jobs: with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} - compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} + compute_env: ${{ secrets.TOWER_COMPUTE_ENV_LARGE }} revision: ${{ env.REVISION }} workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ env.REVISION }} parameters: | diff --git a/.nf-core.yml b/.nf-core.yml index 2a47982a..85e18745 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -18,6 +18,7 @@ lint: - .github/ISSUE_TEMPLATE/bug_report.yml - .github/PULL_REQUEST_TEMPLATE.md - .github/workflows/linting.yml + - .github/workflows/branch.yml multiqc_config: - report_comment nextflow_config: diff --git a/CHANGELOG.md b/CHANGELOG.md index 240bcd13..b1f08975 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,30 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [[0.5.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.5.0)] – Snorlax – [2024-07-31] + +General tidy up of the configuration and the pipeline + +### Enhancements & fixes + +- Increased the resources for blastn +- Removed some options that were not used or not needed +- All relevant outputs are now copied to the output directory +- Fixed some blast parameters to match the behaviour of the Snakemake pipeline +- Fixed parsing of samplesheets from fetchngs to capture correct data type + +### Parameters + +| Old parameter | New parameter | +| --------------- | ------------- | +| --taxa_file | | +| --blastp_outext | | +| --blastp_cols | | +| --blastx_outext | | +| --blastx_cols | | + +> **NB:** Parameter has been **updated** if both old and new parameter information is present.
+> **NB:** Parameter has been **added** if just the new parameter information is present.
+> **NB:** Parameter has been **removed** if new parameter information isn't present. + ## [[0.4.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.4.0)] – Buneary – [2024-04-17] The pipeline has now been validated on dozens of genomes, up to 11 Gbp. diff --git a/README.md b/README.md index 31512bc0..c7b92970 100644 --- a/README.md +++ b/README.md @@ -20,8 +20,8 @@ It takes a samplesheet of BAM/CRAM/FASTQ/FASTA files as input, calculates genome 4. Run BUSCO ([`busco`](https://busco.ezlab.org/)) 5. Extract BUSCO genes ([`blobtoolkit/extractbuscos`](https://github.com/blobtoolkit/blobtoolkit)) 6. Run Diamond BLASTp against extracted BUSCO genes ([`diamond/blastp`](https://github.com/bbuchfink/diamond)) -7. Run BLASTn against extracted BUSCO genes ([`blast/blastn`](https://www.ncbi.nlm.nih.gov/books/NBK131777/)) -8. Run BLASTx against extracted BUSCO genes ([`blast/blastx`](https://www.ncbi.nlm.nih.gov/books/NBK131777/)) +7. Run Diamond BLASTx against sequences with no hit ([`diamond/blastx`](https://github.com/bbuchfink/diamond)) +8. Run BLASTn against sequences still with no hit ([`blast/blastn`](https://www.ncbi.nlm.nih.gov/books/NBK131777/)) 9. Count BUSCO genes ([`blobtoolkit/countbuscos`](https://github.com/blobtoolkit/blobtoolkit)) 10. Generate combined sequence stats across various window sizes ([`blobtoolkit/windowstats`](https://github.com/blobtoolkit/blobtoolkit)) 11. Imports analysis results into a BlobDir dataset ([`blobtoolkit/blobdir`](https://github.com/blobtoolkit/blobtoolkit)) diff --git a/conf/base.config b/conf/base.config index 8f51f7f8..75fa0d06 100644 --- a/conf/base.config +++ b/conf/base.config @@ -104,6 +104,18 @@ process { time = { check_max( 3.h * Math.ceil(meta.genome_size/1000000000) * task.attempt, 'time') } } + withName: "BLAST_BLASTN" { + + // There are blast failures we don't know how to fix. Just ignore them for now + errorStrategy = { task.exitStatus in ((130..145) + 104) ? (task.attempt == process.maxRetries ? 'ignore' : 'retry') : 'finish' } + + // Most jobs complete quickly, but some need a lot longer. For those outliers, + // CPU usage usually remains low, often close to a single CPU + cpus = { check_max( 6 - (task.attempt-1), 'cpus' ) } + memory = { check_max( 1.GB * Math.pow(4, task.attempt-1), 'memory' ) } + time = { check_max( 10.h * Math.pow(4, task.attempt-1), 'time' ) } + } + withName:CUSTOM_DUMPSOFTWAREVERSIONS { cache = false } diff --git a/conf/modules.config b/conf/modules.config index 439a77b3..ac597dc4 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -48,6 +48,14 @@ process { ext.args = { "-ax map-ont -I" + Math.ceil(meta2.genome_size/1e9) + 'G' } } + withName: "MINIMAP2_.*" { + publishDir = [ + path: { "${params.outdir}/read_mapping/${meta.datatype}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? null : filename } + ] + } + withName: "SAMTOOLS_VIEW" { ext.args = "--output-fmt bam --write-index" } @@ -60,6 +68,22 @@ process { ext.args = "--lineage --busco" } + withName: "PIGZ_COMPRESS" { + publishDir = [ + path: { "${params.outdir}/base_content" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? null : filename.minus("fw_out/") } + ] + } + + withName: "BLOBTK_DEPTH" { + publishDir = [ + path: { "${params.outdir}/read_mapping/${meta.datatype}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ?
null : "${meta.id}.coverage.1k.bed.gz" } + ] + } + withName: "BUSCO" { scratch = true ext.args = { 'test' in workflow.profile.tokenize(',') ? @@ -114,7 +138,7 @@ } withName: "BLAST_BLASTN" { - ext.args = "-outfmt '6 qseqid staxids bitscore std' -max_target_seqs 10 -max_hsps 1 -evalue 1.0e-10 -lcase_masking -dust '20 64 1'" + ext.args = "-task megablast -outfmt '6 qseqid staxids bitscore std' -max_target_seqs 10 -max_hsps 1 -evalue 1.0e-10 -lcase_masking -dust '20 64 1'" } withName: "CUSTOM_DUMPSOFTWAREVERSIONS" { diff --git a/docs/output.md b/docs/output.md index 18fe2b6d..e3204a1d 100644 --- a/docs/output.md +++ b/docs/output.md @@ -15,6 +15,9 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [BlobDir](#blobdir) - Output files viewable on a [BlobToolKit viewer](https://github.com/blobtoolkit/blobtoolkit) - [Static plots](#static-plots) - Static versions of the BlobToolKit plots - [BUSCO](#busco) - BUSCO results +- [Read alignments](#read-alignments) - Aligned reads (optional) +- [Read coverage](#read-coverage) - Read coverage tracks +- [Base content](#base-content) - _k_-mer statistics (for k ≤ 4) - [MultiQC](#multiqc) - Aggregate report describing results from the whole pipeline - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution @@ -26,8 +29,8 @@ The files in the BlobDir dataset which is used to create the online interactive Output files - `blobtoolkit/` - - `<accession>/` - - `*.json.gz`: files generated from genome and alignment coverage statistics + - `<accession>/` + - `*.json.gz`: files generated from genome and alignment coverage statistics. More information about visualising the data can be found in the [BlobToolKit repository](https://github.com/blobtoolkit/blobtoolkit/tree/main/src/viewer) @@ -53,12 +56,56 @@ BUSCO results generated by the pipeline (all BUSCO lineages that match the claas
Output files -- `blobtoolkit/` - - `busco/` - - `*.batch_summary.txt`: BUSCO scores as tab-separated files (1 file per lineage). - - `*.fasta.txt`: BUSCO scores as formatted text (1 file per lineage). - - `*.json`: BUSCO scores as JSON (1 file per lineage). - - `*/`: all output BUSCO files, including the coordinate and sequence files of the annotated genes. +- `busco/` + - `<lineage>/` + - `short_summary.json`: BUSCO scores for that lineage as JSON. + - `short_summary.tsv`: BUSCO scores for that lineage as a tab-separated file. + - `short_summary.txt`: BUSCO scores for that lineage as formatted text. + - `full_table.tsv`: Coordinates of the annotated BUSCO genes as a tab-separated file. + - `missing_busco_list.tsv`: List of the BUSCO genes that could not be found. + - `*_busco_sequences.tar.gz`: Sequences of the annotated BUSCO genes. 1 _tar_ archive for each of the three annotation levels (`single_copy`, `multi_copy`, `fragmented`), with 1 file per gene. + - `hmmer_output.tar.gz`: Archive of the HMMER alignment scores. + +
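Editor's note: the `full_table.tsv` above is the same table the `blobtoolkit/countbuscos` step consumes, so it is handy for sanity-checking a run by hand. A throwaway Groovy sketch, assuming the standard BUSCO v5 layout (`#`-prefixed header lines, status in the second column) — verify that assumption against your own files:

```groovy
// Tally BUSCO gene statuses (Complete / Duplicated / Fragmented / Missing)
// from a full_table.tsv. Assumes BUSCO v5 columns: busco id, status, sequence, ...
def counts = [:].withDefault { 0 }
new File('full_table.tsv').eachLine { line ->
    if (line.startsWith('#')) return          // skip the comment header
    def cols = line.split('\t')
    if (cols.size() > 1) counts[cols[1]]++    // column 2 holds the status
}
counts.each { status, n -> println "${status}\t${n}" }
```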
+ +### Read alignments + +Read alignments in BAM format -- only if the pipeline is run with `--align`. + +
+Output files + +- `read_mapping/` + - `<datatype>/` + - `<sample>.bam`: alignments of that sample's reads in BAM format. + +
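Editor's note: this `read_mapping/<datatype>/` layout is produced by the `publishDir` rules added to `conf/modules.config` earlier in this diff. The detail worth spelling out is the `saveAs` closure: returning `null` tells Nextflow not to publish that file at all, which is how `versions.yml` is kept out of the results tree. A minimal sketch of the same pattern, with a hypothetical process selector:

```nextflow
// conf/modules.config-style sketch: publish BAMs under read_mapping/<datatype>/,
// but drop versions.yml by returning null from saveAs.
process {
    withName: "MY_ALIGNER" {    // hypothetical process name, not from the pipeline
        publishDir = [
            path: { "${params.outdir}/read_mapping/${meta.datatype}" },
            mode: params.publish_dir_mode,
            saveAs: { filename -> filename.equals("versions.yml") ? null : filename }
        ]
    }
}
```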
+ +### Read coverage + +Read coverage statistics as computed by the pipeline. +These files are the raw data used to build the BlobDir. + +
+Output files + +- `read_mapping/` + - `<datatype>/` + - `<sample>.coverage.1k.bed.gz`: Bedgraph file with the coverage of the alignments of that sample in 1 kbp windows. + +
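Editor's note: because the coverage track is a plain bedgraph (one window per row), it is easy to inspect outside the pipeline. A hedged Groovy one-off for the mean window coverage; the four-column layout (sequence, start, end, coverage) is inferred from the description above, so check it against a real file first:

```groovy
// Mean per-window coverage from a <sample>.coverage.1k.bed.gz bedgraph.
// Assumes tab-separated columns: sequence, start, end, coverage.
import java.util.zip.GZIPInputStream

def covs = []
new File('sample.coverage.1k.bed.gz').withInputStream { ins ->
    new GZIPInputStream(ins).withReader('UTF-8') { reader ->
        reader.eachLine { line ->
            def cols = line.split('\t')
            if (cols.size() >= 4) covs << (cols[3] as BigDecimal)
        }
    }
}
println(covs ? "mean coverage over ${covs.size()} windows: ${covs.sum() / covs.size()}" : "no windows")
```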
+ +### Base content + +_k_-mer statistics. +These files are the raw data used to build the BlobDir. + +
+Output files + +- `base_content/` + - `<assembly>_*nuc_windows.tsv.gz`: Tab-separated files with the counts of every _k_-mer for k ≤ 4 in 1 kbp windows. The first three columns correspond to the coordinates (sequence name, start, end), followed by one column per _k_-mer. + - `<assembly>_freq_windows.tsv.gz`: Tab-separated files with frequencies derived from the _k_-mer counts.
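Editor's note: looking back at the `BLAST_BLASTN` block added to `conf/base.config` above — the resource directives are closures, so Nextflow re-evaluates them with the new `task.attempt` on every retry, while the `errorStrategy` closure retries the listed exit codes and only `ignore`s a job once `maxRetries` is exhausted. Evaluating those formulas by hand (without the `check_max` clamping) gives the schedule below; the three-attempt range mirrors a typical `maxRetries` setting, not a value stated in this diff:

```groovy
// Evaluate the BLAST_BLASTN retry schedule from conf/base.config,
// ignoring the check_max() caps applied in the real pipeline.
(1..3).each { attempt ->
    def cpus  = 6 - (attempt - 1)
    def memGB = (1 * Math.pow(4, attempt - 1)) as int
    def hours = (10 * Math.pow(4, attempt - 1)) as int
    println "attempt ${attempt}: ${cpus} cpus, ${memGB} GB, ${hours} h"
}
// attempt 1: 6 cpus, 1 GB, 10 h
// attempt 2: 5 cpus, 4 GB, 40 h
// attempt 3: 4 cpus, 16 GB, 160 h
```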
diff --git a/modules.json b/modules.json index 667a4482..ebb45a6c 100644 --- a/modules.json +++ b/modules.json @@ -30,12 +30,14 @@ "diamond/blastp": { "branch": "master", "git_sha": "b29f6beb86d1d24d680277fb1a3f4de7b8b8a92c", - "installed_by": ["modules"] + "installed_by": ["modules"], + "patch": "modules/nf-core/diamond/blastp/diamond-blastp.diff" }, "diamond/blastx": { "branch": "master", "git_sha": "b29f6beb86d1d24d680277fb1a3f4de7b8b8a92c", - "installed_by": ["modules"] + "installed_by": ["modules"], + "patch": "modules/nf-core/diamond/blastx/diamond-blastx.diff" }, "fastawindows": { "branch": "master", @@ -64,6 +66,11 @@ "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a", "installed_by": ["modules"] }, + "pigz/compress": { + "branch": "master", + "git_sha": "0eab94fc1e48703c1b0a8704bd665f554905c39d", + "installed_by": ["modules"] + }, "samtools/fasta": { "branch": "master", "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62", diff --git a/modules/local/blobtoolkit/updatemeta.nf b/modules/local/blobtoolkit/updatemeta.nf index de1313d5..a5556348 100644 --- a/modules/local/blobtoolkit/updatemeta.nf +++ b/modules/local/blobtoolkit/updatemeta.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_UPDATEMETA { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_UPDATEMETA module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "docker.io/pacificbiosciences/pyyaml:5.3.1" + container "docker.io/genomehubs/blobtoolkit:4.3.9" input: tuple val(meta), path(input) diff --git a/modules/nf-core/blast/blastn/blast-blastn.diff b/modules/nf-core/blast/blastn/blast-blastn.diff index 1695c793..e01e07cb 100644 --- a/modules/nf-core/blast/blastn/blast-blastn.diff +++ b/modules/nf-core/blast/blastn/blast-blastn.diff @@ -1,7 +1,14 @@ Changes in module 'nf-core/blast/blastn' --- modules/nf-core/blast/blastn/main.nf +++ modules/nf-core/blast/blastn/main.nf -@@ -10,6 +10,7 @@ +@@ -1,6 +1,5 @@ + process BLAST_BLASTN { + tag "$meta.id" +- label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? +@@ -10,6 +9,7 @@ input: tuple val(meta) , path(fasta) tuple val(meta2), path(db) @@ -9,21 +16,31 @@ Changes in module 'nf-core/blast/blastn' output: tuple val(meta), path('*.txt'), emit: txt -@@ -23,6 +24,7 @@ +@@ -23,6 +23,8 @@ def prefix = task.ext.prefix ?: "${meta.id}" def is_compressed = fasta.getExtension() == "gz" ? true : false def fasta_name = is_compressed ? fasta.getBaseName() : fasta + def exclude_taxon = taxid ? "-negative_taxids ${taxid}" : '' ++ def command_epilog = taxid ? 
"|| true" : '' """ if [ "${is_compressed}" == "true" ]; then -@@ -39,6 +41,7 @@ +@@ -39,8 +41,15 @@ -num_threads ${task.cpus} \\ -db \$DB \\ -query ${fasta_name} \\ + ${exclude_taxon} \\ ${args} \\ - -out ${prefix}.txt +- -out ${prefix}.txt ++ -out ${prefix}.txt \\ ++ 2> >( tee "${prefix}.error.log" >&2 ) $command_epilog ++ ++ if [[ -s "${prefix}.error.log" ]] ++ then ++ grep -qF 'BLAST Database error: Taxonomy ID(s) not found.Taxonomy ID(s) not found' "${prefix}.error.log" ++ fi + cat <<-END_VERSIONS > versions.yml + "${task.process}": ************************************************************ diff --git a/modules/nf-core/blast/blastn/main.nf b/modules/nf-core/blast/blastn/main.nf index 065ad7cd..d674989a 100644 --- a/modules/nf-core/blast/blastn/main.nf +++ b/modules/nf-core/blast/blastn/main.nf @@ -1,6 +1,5 @@ process BLAST_BLASTN { tag "$meta.id" - label 'process_medium' conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? @@ -25,6 +24,7 @@ process BLAST_BLASTN { def is_compressed = fasta.getExtension() == "gz" ? true : false def fasta_name = is_compressed ? fasta.getBaseName() : fasta def exclude_taxon = taxid ? "-negative_taxids ${taxid}" : '' + def command_epilog = taxid ? "|| true" : '' """ if [ "${is_compressed}" == "true" ]; then @@ -43,7 +43,13 @@ process BLAST_BLASTN { -query ${fasta_name} \\ ${exclude_taxon} \\ ${args} \\ - -out ${prefix}.txt + -out ${prefix}.txt \\ + 2> >( tee "${prefix}.error.log" >&2 ) $command_epilog + + if [[ -s "${prefix}.error.log" ]] + then + grep -qF 'BLAST Database error: Taxonomy ID(s) not found.Taxonomy ID(s) not found' "${prefix}.error.log" + fi cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/diamond/blastp/diamond-blastp.diff b/modules/nf-core/diamond/blastp/diamond-blastp.diff new file mode 100644 index 00000000..12608ea0 --- /dev/null +++ b/modules/nf-core/diamond/blastp/diamond-blastp.diff @@ -0,0 +1,29 @@ +Changes in module 'nf-core/diamond/blastp' +--- modules/nf-core/diamond/blastp/main.nf ++++ modules/nf-core/diamond/blastp/main.nf +@@ -12,6 +12,7 @@ + tuple val(meta2), path(db) + val out_ext + val blast_columns ++ val taxid + + output: + tuple val(meta), path('*.blast'), optional: true, emit: blast +@@ -32,6 +33,7 @@ + def is_compressed = fasta.getExtension() == "gz" ? true : false + def fasta_name = is_compressed ? fasta.getBaseName() : fasta + def columns = blast_columns ? "${blast_columns}" : '' ++ def exclude_taxon = taxid ? "--taxon-exclude ${taxid}" : '' + switch ( out_ext ) { + case "blast": outfmt = 0; break + case "xml": outfmt = 5; break +@@ -59,6 +61,7 @@ + --db \$DB \\ + --query ${fasta_name} \\ + --outfmt ${outfmt} ${columns} \\ ++ ${exclude_taxon} \\ + ${args} \\ + --out ${prefix}.${out_ext} + + +************************************************************ diff --git a/modules/nf-core/diamond/blastp/main.nf b/modules/nf-core/diamond/blastp/main.nf index dc01cdcc..ae5a1248 100644 --- a/modules/nf-core/diamond/blastp/main.nf +++ b/modules/nf-core/diamond/blastp/main.nf @@ -12,6 +12,7 @@ process DIAMOND_BLASTP { tuple val(meta2), path(db) val out_ext val blast_columns + val taxid output: tuple val(meta), path('*.blast'), optional: true, emit: blast @@ -32,6 +33,7 @@ process DIAMOND_BLASTP { def is_compressed = fasta.getExtension() == "gz" ? true : false def fasta_name = is_compressed ? fasta.getBaseName() : fasta def columns = blast_columns ? "${blast_columns}" : '' + def exclude_taxon = taxid ? 
"--taxon-exclude ${taxid}" : '' switch ( out_ext ) { case "blast": outfmt = 0; break case "xml": outfmt = 5; break @@ -59,6 +61,7 @@ process DIAMOND_BLASTP { --db \$DB \\ --query ${fasta_name} \\ --outfmt ${outfmt} ${columns} \\ + ${exclude_taxon} \\ ${args} \\ --out ${prefix}.${out_ext} diff --git a/modules/nf-core/diamond/blastx/diamond-blastx.diff b/modules/nf-core/diamond/blastx/diamond-blastx.diff new file mode 100644 index 00000000..eff4326a --- /dev/null +++ b/modules/nf-core/diamond/blastx/diamond-blastx.diff @@ -0,0 +1,29 @@ +Changes in module 'nf-core/diamond/blastx' +--- modules/nf-core/diamond/blastx/main.nf ++++ modules/nf-core/diamond/blastx/main.nf +@@ -12,6 +12,7 @@ + tuple val(meta2), path(db) + val out_ext + val blast_columns ++ val taxid + + output: + tuple val(meta), path('*.blast'), optional: true, emit: blast +@@ -33,6 +34,7 @@ + def is_compressed = fasta.getExtension() == "gz" ? true : false + def fasta_name = is_compressed ? fasta.getBaseName() : fasta + def columns = blast_columns ? "${blast_columns}" : '' ++ def exclude_taxon = taxid ? "--taxon-exclude ${taxid}" : '' + switch ( out_ext ) { + case "blast": outfmt = 0; break + case "xml": outfmt = 5; break +@@ -60,6 +62,7 @@ + --db \$DB \\ + --query ${fasta_name} \\ + --outfmt ${outfmt} ${columns} \\ ++ ${exclude_taxon} \\ + ${args} \\ + --out ${prefix}.${out_ext} \\ + --log + +************************************************************ diff --git a/modules/nf-core/diamond/blastx/main.nf b/modules/nf-core/diamond/blastx/main.nf index bf3f623c..dfa82e24 100644 --- a/modules/nf-core/diamond/blastx/main.nf +++ b/modules/nf-core/diamond/blastx/main.nf @@ -12,6 +12,7 @@ process DIAMOND_BLASTX { tuple val(meta2), path(db) val out_ext val blast_columns + val taxid output: tuple val(meta), path('*.blast'), optional: true, emit: blast @@ -33,6 +34,7 @@ process DIAMOND_BLASTX { def is_compressed = fasta.getExtension() == "gz" ? true : false def fasta_name = is_compressed ? fasta.getBaseName() : fasta def columns = blast_columns ? "${blast_columns}" : '' + def exclude_taxon = taxid ? "--taxon-exclude ${taxid}" : '' switch ( out_ext ) { case "blast": outfmt = 0; break case "xml": outfmt = 5; break @@ -60,6 +62,7 @@ process DIAMOND_BLASTX { --db \$DB \\ --query ${fasta_name} \\ --outfmt ${outfmt} ${columns} \\ + ${exclude_taxon} \\ ${args} \\ --out ${prefix}.${out_ext} \\ --log diff --git a/modules/nf-core/pigz/compress/environment.yml b/modules/nf-core/pigz/compress/environment.yml new file mode 100644 index 00000000..7551d187 --- /dev/null +++ b/modules/nf-core/pigz/compress/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "pigz_compress" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "pigz=2.8" diff --git a/modules/nf-core/pigz/compress/main.nf b/modules/nf-core/pigz/compress/main.nf new file mode 100644 index 00000000..152e7006 --- /dev/null +++ b/modules/nf-core/pigz/compress/main.nf @@ -0,0 +1,45 @@ +process PIGZ_COMPRESS { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pigz:2.8': + 'biocontainers/pigz:2.8' }" + + input: + tuple val(meta), path(raw_file) + + output: + tuple val(meta), path("$archive"), emit: archive + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + archive = raw_file.toString() + ".gz" + """ + # Note: needs --stdout for pigz to avoid the following issue: + # pigz: skipping: ${raw_file} is a symbolic link + pigz --processes $task.cpus --stdout --force ${args} ${raw_file} > ${archive} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz:\$(echo \$(pigz --version 2>&1) | sed 's/^.*pigz\\w*//' ) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + archive = raw_file.toString() + ".gz" + """ + touch ${archive} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz:\$(echo \$(pigz --version 2>&1) | sed 's/^.*pigz\\w*//' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/pigz/compress/meta.yml b/modules/nf-core/pigz/compress/meta.yml new file mode 100644 index 00000000..42efd735 --- /dev/null +++ b/modules/nf-core/pigz/compress/meta.yml @@ -0,0 +1,47 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "pigz_compress" +description: Compresses files with pigz. +keywords: + - compress + - gzip + - parallelized +tools: + - "pigz": + description: "Parallel implementation of the gzip algorithm." + homepage: "https://zlib.net/pigz/" + documentation: "https://zlib.net/pigz/pigz.pdf" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + + - raw_file: + type: file + description: File to be compressed + pattern: "*.*" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + + - archive: + type: file + description: The compressed file + pattern: "*.gz" + + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@leoisl" +maintainers: + - "@leoisl" diff --git a/modules/nf-core/pigz/compress/tests/main.nf.test b/modules/nf-core/pigz/compress/tests/main.nf.test new file mode 100644 index 00000000..248d40fb --- /dev/null +++ b/modules/nf-core/pigz/compress/tests/main.nf.test @@ -0,0 +1,49 @@ +nextflow_process { + name "Test Process PIGZ_COMPRESS" + script "../main.nf" + process "PIGZ_COMPRESS" + + tag "modules" + tag "modules_nfcore" + tag "pigz" + tag "pigz/compress" + + test("sarscov2 - genome - fasta") { + when { + process { + """ + input[0] = [ + [ id:'test'], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("sarscov2 - genome - fasta - stub") { + options "-stub-run" + when { + process { + """ + input[0] = [ + [ id:'test'], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.archive[0][1]).name).match() } + ) + } + } +} diff --git a/modules/nf-core/pigz/compress/tests/main.nf.test.snap b/modules/nf-core/pigz/compress/tests/main.nf.test.snap new file mode 100644 index 00000000..6e50456f --- /dev/null +++ b/modules/nf-core/pigz/compress/tests/main.nf.test.snap @@ -0,0 +1,37 @@ +{ + "sarscov2 - genome - fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "genome.fasta.gz:md5,6e9fe4042a72f2345f644f239272b7e6" + ] + ], + "1": [ + "versions.yml:md5,ca30e9e1ffa1394ba7eefdac8cf3a3ad" + ], + "archive": [ + [ + { + "id": "test" + }, + "genome.fasta.gz:md5,6e9fe4042a72f2345f644f239272b7e6" + ] + ], + "versions": [ + "versions.yml:md5,ca30e9e1ffa1394ba7eefdac8cf3a3ad" + ] + } + ], + "timestamp": "2023-12-11T22:39:53.350546" + }, + "sarscov2 - genome - fasta - stub": { + "content": [ + "genome.fasta.gz" + ], + "timestamp": "2023-12-11T22:52:24.309192" + } +} \ No newline at end of file diff --git a/modules/nf-core/pigz/compress/tests/tags.yml b/modules/nf-core/pigz/compress/tests/tags.yml new file mode 100644 index 00000000..42c46bfa --- /dev/null +++ b/modules/nf-core/pigz/compress/tests/tags.yml @@ -0,0 +1,2 @@ +pigz/compress: + - "modules/nf-core/pigz/compress/**" diff --git a/nextflow.config b/nextflow.config index 83aaaafc..db5ef388 100644 --- a/nextflow.config +++ b/nextflow.config @@ -17,11 +17,10 @@ params { mask = false fetchngs_samplesheet = false - // Reference options + // Reference options fasta = null accession = null taxon = null - taxa_file = null // Output options image_format = 'png' @@ -32,10 +31,6 @@ params { blastp = null blastx = null blastn = null - blastp_outext = 'txt' - blastp_cols = 'qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore' - blastx_outext = 'txt' - blastx_cols = 'qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore' // MultiQC options multiqc_config = null @@ -248,7 +243,7 @@ manifest { description = """Quality assessment of genome assemblies""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '0.4.0' + version = '0.5.0' doi = '10.5281/zenodo.7949058' } diff --git 
a/nextflow_schema.json b/nextflow_schema.json index 97c84534..b392e2a5 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -75,7 +75,7 @@ "type": "object", "fa_icon": "fas fa-dna", "description": "Reference genome related files and options required for the workflow.", - "required": ["taxon", "accession", "fasta"], + "required": ["taxon", "fasta"], "properties": { "taxon": { "type": ["string", "integer"], @@ -102,43 +102,12 @@ "description": "Define the location and parameters to work with databases.", "required": ["blastp", "blastx", "blastn", "taxdump"], "properties": { - "taxa_file": { - "type": "string", - "format": "file-path", - "description": "Path to file containing the BUSCO lineages for the genome species", - "help_text": "If this file is not included, the relevant BUSCO lineages are automatically calculated using the taxon parameter.", - "fa_icon": "fas fa-file-alt" - }, "busco": { "type": "string", "format": "directory-path", "description": "Local directory where clade-specific BUSCO lineage datasets are stored", "fa_icon": "fas fa-folder-open" }, - "blastp_cols": { - "type": "string", - "description": "When blastp_outext is 'txt', this is the list of columns that Diamond BLAST should print.", - "default": "qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore" - }, - "blastp_outext": { - "type": "string", - "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"], - "description": "Extension (file format) of the output file from Diamond BLAST.", - "fa_icon": "fas fa-file-circle-question", - "default": "txt" - }, - "blastx_cols": { - "type": "string", - "description": "When blastx_outext is 'txt', this is the list of columns that Diamond BLAST should print.", - "default": "qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore" - }, - "blastx_outext": { - "type": "string", - "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"], - "description": "Extension (file format) of the output file from Diamond BLAST.", - "fa_icon": "fas fa-file-circle-question", - "default": "txt" - }, "blastp": { "type": "string", "format": "file-path", diff --git a/subworkflows/local/blobtools.nf b/subworkflows/local/blobtools.nf index 8411ad24..747bc9fa 100644 --- a/subworkflows/local/blobtools.nf +++ b/subworkflows/local/blobtools.nf @@ -28,14 +28,14 @@ workflow BLOBTOOLS { ch_versions = ch_versions.mix ( BLOBTOOLKIT_METADATA.out.versions.first() ) - // + // // Create Blobtools dataset files // BLOBTOOLKIT_CREATEBLOBDIR ( windowstats, busco, blastp, BLOBTOOLKIT_METADATA.out.yaml, taxdump ) ch_versions = ch_versions.mix ( BLOBTOOLKIT_CREATEBLOBDIR.out.versions.first() ) - // + // // Update Blobtools dataset files // BLOBTOOLKIT_UPDATEBLOBDIR ( BLOBTOOLKIT_CREATEBLOBDIR.out.blobdir, blastx, blastn, taxdump ) diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf index c3ebe104..2a89471f 100644 --- a/subworkflows/local/busco_diamond_blastp.nf +++ b/subworkflows/local/busco_diamond_blastp.nf @@ -12,11 +12,9 @@ include { RESTRUCTUREBUSCODIR } from '../../modules/local/restructurebusco workflow BUSCO_DIAMOND { take: fasta // channel: [ val(meta), path(fasta) ] - taxon_taxa // channel: [ val(meta, val(taxon), path(taxa) ] + taxon // channel: val(taxon) busco_db // channel: path(busco_db) blastp // channel: path(blastp_db) - outext // channel: val(out_format) - cols // channel: val(column_names) main: @@ -24,11 +22,13 @@ workflow 
BUSCO_DIAMOND { // - // Fetch BUSCO lineages for taxon (or taxa) + // Fetch BUSCO lineages for taxon // - GOAT_TAXONSEARCH ( taxon_taxa ) + GOAT_TAXONSEARCH ( + fasta.combine(taxon).map { meta, fasta, taxon -> [ meta, taxon, [] ] } + ) ch_versions = ch_versions.mix ( GOAT_TAXONSEARCH.out.versions.first() ) - + // // Get NCBI species ID @@ -39,6 +39,7 @@ workflow BUSCO_DIAMOND { | transpose() | filter { rank,id -> rank =~ /species/ } | map { rank, id -> id} + | first | set { ch_taxid } @@ -70,7 +71,7 @@ workflow BUSCO_DIAMOND { ch_fasta_with_lineage, "genome", ch_fasta_with_lineage.map { it[0].lineage_name }, - busco_db.collect().ifEmpty([]), + busco_db, [], ) ch_versions = ch_versions.mix ( BUSCO.out.versions.first() ) @@ -108,12 +109,15 @@ workflow BUSCO_DIAMOND { // // Align BUSCO genes against the BLASTp database - // + // BLOBTOOLKIT_EXTRACTBUSCOS.out.genes | filter { it[1].size() > 140 } | set { ch_busco_genes } - DIAMOND_BLASTP ( ch_busco_genes, blastp, outext, cols ) + // Hardcoded to match the format expected by blobtools + def outext = 'txt' + def cols = 'qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore' + DIAMOND_BLASTP ( ch_busco_genes, blastp, outext, cols, ch_taxid ) ch_versions = ch_versions.mix ( DIAMOND_BLASTP.out.versions.first() ) @@ -141,7 +145,7 @@ workflow BUSCO_DIAMOND { emit: - first_table = ch_first_table // channel: [ val(meta), path(full_table) ] + first_table = ch_first_table // channel: [ val(meta), path(full_table) ] all_tables = ch_indexed_buscos // channel: [ val(meta), path(full_tables) ] blastp_txt = DIAMOND_BLASTP.out.txt // channel: [ val(meta), path(txt) ] taxon_id = ch_taxid // channel: taxon_id diff --git a/subworkflows/local/collate_stats.nf b/subworkflows/local/collate_stats.nf index 08bc43c9..b986188d 100644 --- a/subworkflows/local/collate_stats.nf +++ b/subworkflows/local/collate_stats.nf @@ -8,7 +8,7 @@ include { BLOBTOOLKIT_WINDOWSTATS } from '../../modules/local/blobtoolkit/window workflow COLLATE_STATS { - take: + take: busco // channel: [ val(meta), path(full_table) ] bed // channel: [ val(meta), path(bed) ] freq // channel: [ val(meta), path(freq) ] diff --git a/subworkflows/local/coverage_stats.nf b/subworkflows/local/coverage_stats.nf index 79b39a8a..86703851 100644 --- a/subworkflows/local/coverage_stats.nf +++ b/subworkflows/local/coverage_stats.nf @@ -6,12 +6,13 @@ include { SAMTOOLS_VIEW } from '../../modules/nf-core/samtools/view/main' include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main' include { BLOBTK_DEPTH } from '../../modules/local/blobtk/depth' include { FASTAWINDOWS } from '../../modules/nf-core/fastawindows/main' +include { PIGZ_COMPRESS } from '../../modules/nf-core/pigz/compress/main' include { CREATE_BED } from '../../modules/local/create_bed' workflow COVERAGE_STATS { - take: - input // channel: [ val(meta), path(aln) ] + take: + input // channel: [ val(meta), path(aln) ] fasta // channel: [ val(meta), path(fasta) ] @@ -53,11 +54,22 @@ workflow COVERAGE_STATS { ch_versions = ch_versions.mix ( FASTAWINDOWS.out.versions.first() ) + // Compress the TSV files + PIGZ_COMPRESS ( + FASTAWINDOWS.out.mononuc + | mix ( FASTAWINDOWS.out.dinuc ) + | mix ( FASTAWINDOWS.out.trinuc ) + | mix ( FASTAWINDOWS.out.tetranuc ) + | mix ( FASTAWINDOWS.out.freq ) + ) + ch_versions = ch_versions.mix ( PIGZ_COMPRESS.out.versions.first() ) + + // Create genome windows file in BED format CREATE_BED ( FASTAWINDOWS.out.mononuc ) ch_versions = ch_versions.mix ( 
CREATE_BED.out.versions.first() ) - + // Calculate coverage BLOBTK_DEPTH ( ch_bam_csi ) ch_versions = ch_versions.mix ( BLOBTK_DEPTH.out.versions.first() ) diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index da522ca8..d498269f 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -20,7 +20,7 @@ workflow INPUT_CHECK { if ( params.fetchngs_samplesheet ) { FETCHNGSSAMPLESHEET_CHECK ( samplesheet ) .csv - .splitCsv ( header:true, sep:',' ) + .splitCsv ( header:true, sep:',', quote:'"' ) .branch { row -> paired: row.fastq_2 [[id: row.run_accession, row:row], [row.fastq_1, row.fastq_2]] diff --git a/subworkflows/local/minimap_alignment.nf b/subworkflows/local/minimap_alignment.nf index 1d6263b3..0c25f4c7 100644 --- a/subworkflows/local/minimap_alignment.nf +++ b/subworkflows/local/minimap_alignment.nf @@ -1,4 +1,4 @@ -// +// // Optional alignment subworkflow using Minimap2 // @@ -52,7 +52,7 @@ workflow MINIMAP2_ALIGNMENT { // Align with Minimap2 MINIMAP2_HIC ( ch_input.hic, fasta, true, false, false ) ch_versions = ch_versions.mix(MINIMAP2_HIC.out.versions.first()) - + MINIMAP2_ILMN ( ch_input.illumina, fasta, true, false, false ) ch_versions = ch_versions.mix(MINIMAP2_ILMN.out.versions.first()) diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf index 0b426fae..a1f03980 100644 --- a/subworkflows/local/prepare_genome.nf +++ b/subworkflows/local/prepare_genome.nf @@ -48,7 +48,7 @@ workflow PREPARE_GENOME { ch_fasta = ch_genome } - + emit: genome = ch_fasta // channel: [ meta, path(genome) ] versions = ch_versions // channel: [ versions.yml ] diff --git a/subworkflows/local/run_blastn.nf b/subworkflows/local/run_blastn.nf index cc1fa6c5..1ea64b82 100644 --- a/subworkflows/local/run_blastn.nf +++ b/subworkflows/local/run_blastn.nf @@ -12,8 +12,8 @@ include { BLOBTOOLKIT_UNCHUNK } from '../../modules/local/blobtoolkit/u workflow RUN_BLASTN { - take: - blast_table // channel: [ val(meta), path(blast_table) ] + take: + blast_table // channel: [ val(meta), path(blast_table) ] fasta // channel: [ val(meta), path(fasta) ] blastn // channel: [ val(meta), path(blastn_db) ] taxon_id // channel: val(taxon_id) @@ -27,16 +27,16 @@ workflow RUN_BLASTN { // Get list of sequence ids with no hits in diamond blastx search NOHIT_LIST ( blast_table, fasta ) ch_versions = ch_versions.mix ( NOHIT_LIST.out.versions.first() ) - + // Subset of sequences with no hits SEQTK_SUBSEQ ( fasta, - NOHIT_LIST.out.nohitlist.map { meta, nohit -> nohit } + NOHIT_LIST.out.nohitlist.map { meta, nohit -> nohit } . 
filter { it.size() > 0 } ) ch_versions = ch_versions.mix ( SEQTK_SUBSEQ.out.versions.first() ) - - - // Split long contigs into chunks + + + // Split long contigs into chunks // create chunks BLOBTOOLKIT_CHUNK ( SEQTK_SUBSEQ.out.sequences, [[],[]] ) ch_versions = ch_versions.mix ( BLOBTOOLKIT_CHUNK.out.versions.first() ) diff --git a/subworkflows/local/run_blastx.nf b/subworkflows/local/run_blastx.nf index 1bad6f6d..715e5ae2 100644 --- a/subworkflows/local/run_blastx.nf +++ b/subworkflows/local/run_blastx.nf @@ -11,8 +11,7 @@ workflow RUN_BLASTX { fasta // channel: [ val(meta), path(fasta) ] table // channel: [ val(meta), path(busco_table) ] blastx // channel: [ val(meta), path(blastx_db) ] - outext // channel: val(out_format) - cols // channel: val(column_names) + taxon_id // channel: val(taxon_id) main: @@ -29,9 +28,12 @@ workflow RUN_BLASTX { // // Run diamond_blastx // - DIAMOND_BLASTX ( BLOBTOOLKIT_CHUNK.out.chunks, blastx, outext, cols) + // Hardcoded to match the format expected by blobtools + def outext = 'txt' + def cols = 'qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore' + DIAMOND_BLASTX ( BLOBTOOLKIT_CHUNK.out.chunks, blastx, outext, cols, taxon_id ) ch_versions = ch_versions.mix ( DIAMOND_BLASTX.out.versions.first() ) - + // // Unchunk chunked blastx results diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index f25da1eb..3610cdde 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -17,22 +17,24 @@ WorkflowBlobtoolkit.initialise(params, log) // Add all file path parameters for the pipeline to the list below // Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.multiqc_config, params.fasta, params.taxa_file, params.taxdump, params.busco, params.blastp, params.blastx ] +def checkPathParamList = [ params.input, params.multiqc_config, params.fasta, params.taxdump, params.busco, params.blastp, params.blastx ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } // Check mandatory parameters if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } -if (params.fasta && params.accession) { ch_fasta = Channel.of([ [ 'id': params.accession ], params.fasta ]).first() } else { exit 1, 'Genome fasta file and accession must be specified!' } -if (params.taxon) { ch_taxon = Channel.of(params.taxon) } else { exit 1, 'NCBI Taxon ID not specified!' } -if (params.blastp && params.accession) { ch_blastp = Channel.of([ [ 'id': params.accession ], params.blastp ]).first() } else { exit 1, 'Diamond BLASTp database and accession must be specified!' } -if (params.blastx && params.accession) { ch_blastx = Channel.of([ [ 'id': params.accession ], params.blastx ]).first() } else { exit 1, 'Diamond BLASTx database and accession must be specified!' } -if (params.blastn && params.accession) { ch_blastn = Channel.of([ [ 'id': params.accession ], params.blastn ]).first() } else { exit 1, 'BLASTn database not specified!' } +if (params.fasta) { ch_fasta = Channel.value([ [ 'id': params.accession ?: file(params.fasta.replace(".gz", "")).baseName ], file(params.fasta) ]) } else { exit 1, 'Genome fasta file must be specified!' } +if (params.taxon) { ch_taxon = Channel.value(params.taxon) } else { exit 1, 'NCBI Taxon ID not specified!' 
} +if (params.blastp) { ch_blastp = Channel.value([ [ 'id': file(params.blastp).baseName ], params.blastp ]) } else { exit 1, 'Diamond BLASTp database must be specified!' } +if (params.blastx) { ch_blastx = Channel.value([ [ 'id': file(params.blastx).baseName ], params.blastx ]) } else { exit 1, 'Diamond BLASTx database must be specified!' } +if (params.blastn) { ch_blastn = Channel.value([ [ 'id': file(params.blastn).baseName ], params.blastn ]) } else { exit 1, 'BLASTn database not specified!' } if (params.taxdump) { ch_taxdump = file(params.taxdump) } else { exit 1, 'NCBI Taxonomy database not specified!' } if (params.fetchngs_samplesheet && !params.align) { exit 1, '--align not specified, even though the input samplesheet is a nf-core/fetchngs one - i.e has fastq files!' } // Create channel for optional parameters -if (params.busco) { ch_busco_db = Channel.fromPath(params.busco) } else { ch_busco_db = Channel.empty() } -if (params.yaml && params.accession) { ch_yaml = Channel.of([ [ 'id': params.accession ], params.yaml ]) } else { ch_yaml = Channel.empty() } +if (params.busco) { ch_busco_db = Channel.fromPath(params.busco).first() } else { ch_busco_db = Channel.value([]) } +if (params.yaml) { ch_yaml = Channel.fromPath(params.yaml) } else { ch_yaml = Channel.empty() } +if (params.yaml && params.accession) { exit 1, '--yaml cannot be provided at the same time as --accession!' } +if (!params.yaml && !params.accession) { exit 1, '--yaml and --accession are both missing. Pick one!' } /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -51,11 +53,6 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// -// MODULE: Loaded from modules/local/ -// -include { BLOBTOOLKIT_CONFIG } from '../modules/local/blobtoolkit/config' - // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // @@ -108,7 +105,7 @@ workflow BLOBTOOLKIT { INPUT_CHECK ( ch_input, PREPARE_GENOME.out.genome, ch_yaml ) ch_versions = ch_versions.mix ( INPUT_CHECK.out.versions ) - // + // // SUBWORKFLOW: Optional read alignment // if ( params.align ) { @@ -120,7 +117,7 @@ } // - // SUBWORKFLOW: Calculate genome coverage and statistics + // SUBWORKFLOW: Calculate genome coverage and statistics // COVERAGE_STATS ( ch_aligned, PREPARE_GENOME.out.genome ) ch_versions = ch_versions.mix ( COVERAGE_STATS.out.versions ) @@ -128,32 +125,22 @@ // // SUBWORKFLOW: Run BUSCO using lineages fetched from GOAT, then run diamond_blastp // - if (params.taxa_file) { - ch_taxa = Channel.from(params.taxa_file) - ch_taxon_taxa = PREPARE_GENOME.out.genome.combine(ch_taxon).combine(ch_taxa).map { meta, fasta, taxon, taxa -> [ meta, taxon, taxa ] } - } else { - ch_taxon_taxa = PREPARE_GENOME.out.genome.combine(ch_taxon).map { meta, fasta, taxon -> [ meta, taxon, [] ] } - } - - BUSCO_DIAMOND ( - PREPARE_GENOME.out.genome, - ch_taxon_taxa, - ch_busco_db, - ch_blastp, - params.blastp_outext, - params.blastp_cols + BUSCO_DIAMOND ( + PREPARE_GENOME.out.genome, + ch_taxon, + ch_busco_db, + ch_blastp, ) ch_versions = ch_versions.mix ( BUSCO_DIAMOND.out.versions ) - + // // SUBWORKFLOW: Diamond blastx search of assembly contigs against the UniProt reference proteomes // - RUN_BLASTX ( + RUN_BLASTX ( PREPARE_GENOME.out.genome, BUSCO_DIAMOND.out.first_table, ch_blastx, - params.blastx_outext, - params.blastx_cols + BUSCO_DIAMOND.out.taxon_id, 
) ch_versions = ch_versions.mix ( RUN_BLASTX.out.versions ) @@ -161,29 +148,29 @@ workflow BLOBTOOLKIT { // // SUBWORKFLOW: Run blastn search on sequences that had no blastx hits // - RUN_BLASTN ( - RUN_BLASTX.out.blastx_out, - PREPARE_GENOME.out.genome, - ch_blastn, - BUSCO_DIAMOND.out.taxon_id + RUN_BLASTN ( + RUN_BLASTX.out.blastx_out, + PREPARE_GENOME.out.genome, + ch_blastn, + BUSCO_DIAMOND.out.taxon_id, ) - + // // SUBWORKFLOW: Collate genome statistics by various window sizes // - COLLATE_STATS ( + COLLATE_STATS ( BUSCO_DIAMOND.out.all_tables, - COVERAGE_STATS.out.bed, - COVERAGE_STATS.out.freq, - COVERAGE_STATS.out.mononuc, - COVERAGE_STATS.out.cov + COVERAGE_STATS.out.bed, + COVERAGE_STATS.out.freq, + COVERAGE_STATS.out.mononuc, + COVERAGE_STATS.out.cov ) ch_versions = ch_versions.mix ( COLLATE_STATS.out.versions ) // // SUBWORKFLOW: Create BlobTools dataset // - BLOBTOOLS ( + BLOBTOOLS ( INPUT_CHECK.out.config, COLLATE_STATS.out.window_tsv, BUSCO_DIAMOND.out.all_tables, @@ -193,7 +180,7 @@ workflow BLOBTOOLKIT { ch_taxdump ) ch_versions = ch_versions.mix ( BLOBTOOLS.out.versions ) - + // // SUBWORKFLOW: Generate summary and static images //
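Editor's note on the channel rework in `workflows/blobtoolkit.nf` above: the database channels now use `Channel.value(...)`, and the optional BUSCO directory falls back to `Channel.value([])` instead of `Channel.empty()`. The difference matters because a value channel can be read any number of times by downstream processes, whereas an empty queue channel emits nothing and leaves its consumers waiting forever. A minimal sketch of the optional-input idiom; the `DEMO` process and its `--db` flag are made up for illustration:

```nextflow
// Sketch: Channel.value([]) means "no optional file given, run anyway",
// while Channel.empty() would emit nothing and DEMO would never start.
process DEMO {                                 // hypothetical consumer process
    input:
    path busco_db                              // stages [] as "nothing" when absent

    script:
    def db_arg = busco_db ? "--db ${busco_db}" : ''   // hypothetical flag
    """
    echo demo ${db_arg}
    """
}

workflow {
    ch_busco_db = params.busco
        ? Channel.fromPath(params.busco).first()   // .first() yields a reusable value channel
        : Channel.value([])
    DEMO ( ch_busco_db )
}
```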