diff --git a/CHANGELOG.md b/CHANGELOG.md
index 86425753..5b4d60e7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@ General tidy up of the configuration and the pipeline
 
 - Increased the resources for blastn
 - Removed some options that were not used or not needed
+- All relevant outputs are now copied to the output directory
 
 ### Parameters
 
diff --git a/conf/modules.config b/conf/modules.config
index 439a77b3..3e54b96a 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -48,6 +48,14 @@ process {
         ext.args = { "-ax map-ont -I" + Math.ceil(meta2.genome_size/1e9) + 'G' }
     }
 
+    withName: "MINIMAP2_.*" {
+        publishDir = [
+            path: { "${params.outdir}/read_mapping/${meta.datatype}" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : filename }
+        ]
+    }
+
     withName: "SAMTOOLS_VIEW" {
         ext.args = "--output-fmt bam --write-index"
     }
@@ -60,6 +68,22 @@ process {
         ext.args = "--lineage --busco"
     }
 
+    withName: "PIGZ_COMPRESS" {
+        publishDir = [
+            path: { "${params.outdir}/base_content" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : filename.minus("fw_out/") }
+        ]
+    }
+
+    withName: "BLOBTK_DEPTH" {
+        publishDir = [
+            path: { "${params.outdir}/read_mapping/${meta.datatype}" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : "${meta.id}.coverage.1k.bed.gz" }
+        ]
+    }
+
     withName: "BUSCO" {
         scratch = true
         ext.args = { 'test' in workflow.profile.tokenize(',') ?
diff --git a/docs/output.md b/docs/output.md
index 18fe2b6d..e3204a1d 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -15,6 +15,9 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 - [BlobDir](#blobdir) - Output files viewable on a [BlobToolKit viewer](https://github.com/blobtoolkit/blobtoolkit)
 - [Static plots](#static-plots) - Static versions of the BlobToolKit plots
 - [BUSCO](#busco) - BUSCO results
+- [Read alignments](#read-alignments) - Aligned reads (optional)
+- [Read coverage](#read-coverage) - Read coverage tracks
+- [Base content](#base-content) - _k_-mer statistics (for k ≤ 4)
 - [MultiQC](#multiqc) - Aggregate report describing results from the whole pipeline
 - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
 
@@ -26,8 +29,8 @@ The files in the BlobDir dataset which is used to create the online interactive
 <details markdown="1">
 <summary>Output files</summary>
 
 - `blobtoolkit/`
-  - `<sample>/`
-    - `*.json.gz`: files generated from genome and alignment coverage statistics
+  - `<accession>/`
+    - `*.json.gz`: files generated from genome and alignment coverage statistics. More information about visualising the data in the [BlobToolKit repository](https://github.com/blobtoolkit/blobtoolkit/tree/main/src/viewer)
 
@@ -53,12 +56,56 @@ BUSCO results generated by the pipeline (all BUSCO lineages that match the claas
 <details markdown="1">
 <summary>Output files</summary>
 
-- `blobtoolkit/`
-  - `busco/`
-    - `*.batch_summary.txt`: BUSCO scores as tab-separated files (1 file per lineage).
-    - `*.fasta.txt`: BUSCO scores as formatted text (1 file per lineage).
-    - `*.json`: BUSCO scores as JSON (1 file per lineage).
-    - `*/`: all output BUSCO files, including the coordinate and sequence files of the annotated genes.
+- `busco/`
+  - `<lineage>/`
+    - `short_summary.json`: BUSCO scores for that lineage as JSON.
+    - `short_summary.tsv`: BUSCO scores for that lineage as a tab-separated file.
+    - `short_summary.txt`: BUSCO scores for that lineage as formatted text.
+    - `full_table.tsv`: Coordinates of the annotated BUSCO genes as a tab-separated file.
+    - `missing_busco_list.tsv`: List of the BUSCO genes that could not be found.
+    - `*_busco_sequences.tar.gz`: Sequences of the annotated BUSCO genes. 1 _tar_ archive for each of the three annotation levels (`single_copy`, `multi_copy`, `fragmented`), with 1 file per gene.
+    - `hmmer_output.tar.gz`: Archive of the HMMER alignment scores.
+
 </details>
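+
+For a quick look at the BUSCO results listed above, the plain-text summary can be printed directly and the contents of a sequence archive listed with `tar`. This is only an illustration: `<lineage>` is a placeholder, and the archive name assumes the `single_copy` annotation level.
+
+```bash
+# Print the human-readable BUSCO summary for one lineage (placeholder path)
+cat busco/<lineage>/short_summary.txt
+
+# List the genes captured in the single-copy sequence archive (assumed name)
+tar -tzf busco/<lineage>/single_copy_busco_sequences.tar.gz | head
+```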
+
+### Read alignments
+
+Read alignments in BAM format, generated only if the pipeline is run with `--align`.
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `read_mapping/`
+  - `<datatype>/`
+    - `<sample>.bam`: alignments of that sample's reads in BAM format.
+
+</details>
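+
+For example, basic mapping statistics can be pulled from these files with samtools (assuming it is installed; `<datatype>` and `<sample>` are placeholders as above):
+
+```bash
+# Summary of mapped / unmapped read counts (placeholder path)
+samtools flagstat read_mapping/<datatype>/<sample>.bam
+```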
+
+### Read coverage
+
+Read coverage statistics as computed by the pipeline.
+These files are the raw data used to build the BlobDir.
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `read_mapping/`
+  - `<datatype>/`
+    - `<sample>.coverage.1k.bed.gz`: Bedgraph file with the coverage of the alignments of that sample in 1 kbp windows.
+
+</details>
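+
+As a bedgraph file, each record has four columns: sequence name, window start, window end, and the coverage value for that window. For example (placeholders as above):
+
+```bash
+# Show the first three 1 kbp windows and their coverage (placeholder path)
+zcat read_mapping/<datatype>/<sample>.coverage.1k.bed.gz | head -n 3
+```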
+
+### Base content
+
+_k_-mer statistics.
+These files are the raw data used to build the BlobDir.
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `base_content/`
+  - `<assembly>_*nuc_windows.tsv.gz`: Tab-separated files with the counts of every _k_-mer for k ≤ 4 in 1 kbp windows. The first three columns correspond to the coordinates (sequence name, start, end), followed by one column per _k_-mer.
+  - `<assembly>_freq_windows.tsv.gz`: Tab-separated files with frequencies derived from the _k_-mer counts.
+
+</details>
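+
+For example, the first windows of the mononucleotide counts can be inspected as follows (`<assembly>` is a placeholder; for k = 1 the three coordinate columns are followed by one count column per base):
+
+```bash
+# Peek at the first three 1 kbp windows of the k = 1 table (placeholder path)
+zcat base_content/<assembly>_mononuc_windows.tsv.gz | head -n 3
+```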
diff --git a/modules.json b/modules.json
index 667a4482..d80a794d 100644
--- a/modules.json
+++ b/modules.json
@@ -64,6 +64,11 @@
                 "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a",
                 "installed_by": ["modules"]
             },
+            "pigz/compress": {
+                "branch": "master",
+                "git_sha": "0eab94fc1e48703c1b0a8704bd665f554905c39d",
+                "installed_by": ["modules"]
+            },
             "samtools/fasta": {
                 "branch": "master",
                 "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62",
diff --git a/modules/local/blobtoolkit/updatemeta.nf b/modules/local/blobtoolkit/updatemeta.nf
index de1313d5..a5556348 100644
--- a/modules/local/blobtoolkit/updatemeta.nf
+++ b/modules/local/blobtoolkit/updatemeta.nf
@@ -5,7 +5,7 @@ process BLOBTOOLKIT_UPDATEMETA {
     if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
         exit 1, "BLOBTOOLKIT_UPDATEMETA module does not support Conda. Please use Docker / Singularity / Podman instead."
     }
-    container "docker.io/pacificbiosciences/pyyaml:5.3.1"
+    container "docker.io/genomehubs/blobtoolkit:4.3.9"
 
     input:
     tuple val(meta), path(input)
diff --git a/modules/nf-core/pigz/compress/environment.yml b/modules/nf-core/pigz/compress/environment.yml
new file mode 100644
index 00000000..7551d187
--- /dev/null
+++ b/modules/nf-core/pigz/compress/environment.yml
@@ -0,0 +1,9 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+name: "pigz_compress"
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - "pigz=2.8"
diff --git a/modules/nf-core/pigz/compress/main.nf b/modules/nf-core/pigz/compress/main.nf
new file mode 100644
index 00000000..152e7006
--- /dev/null
+++ b/modules/nf-core/pigz/compress/main.nf
@@ -0,0 +1,45 @@
+process PIGZ_COMPRESS {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/pigz:2.8':
+        'biocontainers/pigz:2.8' }"
+
+    input:
+    tuple val(meta), path(raw_file)
+
+    output:
+    tuple val(meta), path("$archive"), emit: archive
+    path "versions.yml"              , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    archive = raw_file.toString() + ".gz"
+    """
+    # Note: needs --stdout for pigz to avoid the following issue:
+    # pigz: skipping: ${raw_file} is a symbolic link
+    pigz --processes $task.cpus --stdout --force ${args} ${raw_file} > ${archive}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        pigz:\$(echo \$(pigz --version 2>&1) | sed 's/^.*pigz\\w*//' )
+    END_VERSIONS
+    """
+
+    stub:
+    def args = task.ext.args ?: ''
+    archive = raw_file.toString() + ".gz"
+    """
+    touch ${archive}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        pigz:\$(echo \$(pigz --version 2>&1) | sed 's/^.*pigz\\w*//' )
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/pigz/compress/meta.yml b/modules/nf-core/pigz/compress/meta.yml
new file mode 100644
index 00000000..42efd735
--- /dev/null
+++ b/modules/nf-core/pigz/compress/meta.yml
@@ -0,0 +1,47 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
+name: "pigz_compress"
+description: Compresses files with pigz.
+keywords:
+  - compress
+  - gzip
+  - parallelized
+tools:
+  - "pigz":
+      description: "Parallel implementation of the gzip algorithm."
+      homepage: "https://zlib.net/pigz/"
+      documentation: "https://zlib.net/pigz/pigz.pdf"
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. `[ id:'sample1', single_end:false ]`
+
+  - raw_file:
+      type: file
+      description: File to be compressed
+      pattern: "*.*"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. `[ id:'sample1', single_end:false ]`
+
+  - archive:
+      type: file
+      description: The compressed file
+      pattern: "*.gz"
+
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+
+authors:
+  - "@leoisl"
+maintainers:
+  - "@leoisl"
diff --git a/modules/nf-core/pigz/compress/tests/main.nf.test b/modules/nf-core/pigz/compress/tests/main.nf.test
new file mode 100644
index 00000000..248d40fb
--- /dev/null
+++ b/modules/nf-core/pigz/compress/tests/main.nf.test
@@ -0,0 +1,49 @@
+nextflow_process {
+    name "Test Process PIGZ_COMPRESS"
+    script "../main.nf"
+    process "PIGZ_COMPRESS"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "pigz"
+    tag "pigz/compress"
+
+    test("sarscov2 - genome - fasta") {
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test'], // meta map
+                    file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
+                ]
+                """
+            }
+        }
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+    }
+
+    test("sarscov2 - genome - fasta - stub") {
+        options "-stub-run"
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test'], // meta map
+                    file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
+                ]
+                """
+            }
+        }
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(file(process.out.archive[0][1]).name).match() }
+            )
+        }
+    }
+}
diff --git a/modules/nf-core/pigz/compress/tests/main.nf.test.snap b/modules/nf-core/pigz/compress/tests/main.nf.test.snap
new file mode 100644
index 00000000..6e50456f
--- /dev/null
+++ b/modules/nf-core/pigz/compress/tests/main.nf.test.snap
@@ -0,0 +1,37 @@
+{
+    "sarscov2 - genome - fasta": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "genome.fasta.gz:md5,6e9fe4042a72f2345f644f239272b7e6"
+                    ]
+                ],
+                "1": [
+                    "versions.yml:md5,ca30e9e1ffa1394ba7eefdac8cf3a3ad"
+                ],
+                "archive": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "genome.fasta.gz:md5,6e9fe4042a72f2345f644f239272b7e6"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,ca30e9e1ffa1394ba7eefdac8cf3a3ad"
+                ]
+            }
+        ],
+        "timestamp": "2023-12-11T22:39:53.350546"
+    },
+    "sarscov2 - genome - fasta - stub": {
+        "content": [
+            "genome.fasta.gz"
+        ],
+        "timestamp": "2023-12-11T22:52:24.309192"
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/pigz/compress/tests/tags.yml b/modules/nf-core/pigz/compress/tests/tags.yml
new file mode 100644
index 00000000..42c46bfa
--- /dev/null
+++ b/modules/nf-core/pigz/compress/tests/tags.yml
@@ -0,0 +1,2 @@
+pigz/compress:
+  - "modules/nf-core/pigz/compress/**"
diff --git a/subworkflows/local/coverage_stats.nf b/subworkflows/local/coverage_stats.nf
index 78be4620..86703851 100644
--- a/subworkflows/local/coverage_stats.nf
+++ b/subworkflows/local/coverage_stats.nf
@@ -6,6 +6,7 @@ include { SAMTOOLS_VIEW } from '../../modules/nf-core/samtools/view/main'
 include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main'
 include { BLOBTK_DEPTH } from '../../modules/local/blobtk/depth'
 include { FASTAWINDOWS } from '../../modules/nf-core/fastawindows/main'
+include { PIGZ_COMPRESS } from '../../modules/nf-core/pigz/compress/main'
 include { CREATE_BED } from '../../modules/local/create_bed'
@@ -53,6 +54,17 @@ workflow COVERAGE_STATS {
     ch_versions = ch_versions.mix ( FASTAWINDOWS.out.versions.first() )
 
+    // Compress the TSV files
+    PIGZ_COMPRESS (
+        FASTAWINDOWS.out.mononuc
+        | mix ( FASTAWINDOWS.out.dinuc )
+        | mix ( FASTAWINDOWS.out.trinuc )
+        | mix ( FASTAWINDOWS.out.tetranuc )
+        | mix ( FASTAWINDOWS.out.freq )
+    )
+    ch_versions = ch_versions.mix ( PIGZ_COMPRESS.out.versions.first() )
+
+
     // Create genome windows file in BED format
     CREATE_BED ( FASTAWINDOWS.out.mononuc )
     ch_versions = ch_versions.mix ( CREATE_BED.out.versions.first() )
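
As context for the `--stdout --force` flags in the `PIGZ_COMPRESS` script block above: Nextflow typically stages process inputs as symbolic links, and pigz declines to compress a symlink in place, which is the behaviour the module's inline comment quotes. A minimal sketch of the difference, using hypothetical file names:

```bash
# Mimic a Nextflow-staged input with a symlink (hypothetical paths)
echo "some data" > /tmp/source.tsv
ln -s /tmp/source.tsv data.tsv

# In-place compression skips the symlink:
#   pigz: skipping: data.tsv is a symbolic link
pigz data.tsv || true

# Streaming to stdout compresses the link target and keeps the symlink intact:
pigz --stdout --force data.tsv > data.tsv.gz
```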