diff --git a/CHANGELOG.md b/CHANGELOG.md
index 86425753..5b4d60e7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@ General tidy up of the configuration and the pipeline
- Increased the resources for blastn
- Removed some options that were not used or not needed
+- All relevant outputs are now copied to the output directory
### Parameters
diff --git a/conf/modules.config b/conf/modules.config
index 439a77b3..3e54b96a 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -48,6 +48,14 @@ process {
ext.args = { "-ax map-ont -I" + Math.ceil(meta2.genome_size/1e9) + 'G' }
}
+ withName: "MINIMAP2_.*" {
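+        // publish the alignments from every minimap2 process, grouped by read data type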
+ publishDir = [
+ path: { "${params.outdir}/read_mapping/${meta.datatype}" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals("versions.yml") ? null : filename }
+ ]
+ }
+
withName: "SAMTOOLS_VIEW" {
ext.args = "--output-fmt bam --write-index"
}
@@ -60,6 +68,22 @@ process {
ext.args = "--lineage --busco"
}
+ withName: "PIGZ_COMPRESS" {
+ publishDir = [
+ path: { "${params.outdir}/base_content" },
+ mode: params.publish_dir_mode,
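+            // fasta_windows nests its outputs in a fw_out/ directory; strip that prefix so the files land directly in base_content/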
+ saveAs: { filename -> filename.equals("versions.yml") ? null : filename.minus("fw_out/") }
+ ]
+ }
+
+ withName: "BLOBTK_DEPTH" {
+ publishDir = [
+ path: { "${params.outdir}/read_mapping/${meta.datatype}" },
+ mode: params.publish_dir_mode,
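+            // rename the published file to a stable per-sample name; versions.yml is not published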
+ saveAs: { filename -> filename.equals("versions.yml") ? null : "${meta.id}.coverage.1k.bed.gz" }
+ ]
+ }
+
withName: "BUSCO" {
scratch = true
ext.args = { 'test' in workflow.profile.tokenize(',') ?
diff --git a/docs/output.md b/docs/output.md
index 18fe2b6d..e3204a1d 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -15,6 +15,9 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
- [BlobDir](#blobdir) - Output files viewable on a [BlobToolKit viewer](https://github.com/blobtoolkit/blobtoolkit)
- [Static plots](#static-plots) - Static versions of the BlobToolKit plots
- [BUSCO](#busco) - BUSCO results
+- [Read alignments](#read-alignments) - Aligned reads (optional)
+- [Read coverage](#read-coverage) - Read coverage tracks
+- [Base content](#base-content) - _k_-mer statistics (for _k_ ≤ 4)
- [MultiQC](#multiqc) - Aggregate report describing results from the whole pipeline
- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
@@ -26,8 +29,8 @@ The files in the BlobDir dataset which is used to create the online interactive
Output files
- `blobtoolkit/`
-  - `<accession_number>/`
-    - `*.json.gz`: files generated from genome and alignment coverage statistics
+  - `<accession_number>/`
+    - `*.json.gz`: files generated from genome and alignment coverage statistics.
More information about visualising the data in the [BlobToolKit repository](https://github.com/blobtoolkit/blobtoolkit/tree/main/src/viewer)
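+
+For a quick local look at these files, the output directory can be served with the BlobToolKit viewer. A minimal sketch, assuming a local BlobToolKit installation (`blobtools host` is its documented way of serving a directory of BlobDirs):
+
+```bash
+# serve every BlobDir under the pipeline's blobtoolkit/ output directory
+blobtools host blobtoolkit/
+```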
@@ -53,12 +56,56 @@ BUSCO results generated by the pipeline (all BUSCO lineages that match the classification of the species)
Output files
-- `blobtoolkit/`
- - `busco/`
- - `*.batch_summary.txt`: BUSCO scores as tab-separated files (1 file per lineage).
- - `*.fasta.txt`: BUSCO scores as formatted text (1 file per lineage).
- - `*.json`: BUSCO scores as JSON (1 file per lineage).
- - `*/`: all output BUSCO files, including the coordinate and sequence files of the annotated genes.
+- `busco/`
+  - `<lineage>/`
+    - `short_summary.json`: BUSCO scores for that lineage as JSON.
+    - `short_summary.tsv`: BUSCO scores for that lineage as a tab-separated file.
+ - `short_summary.txt`: BUSCO scores for that lineage as formatted text.
+ - `full_table.tsv`: Coordinates of the annotated BUSCO genes as a tab-separated file.
+ - `missing_busco_list.tsv`: List of the BUSCO genes that could not be found.
+    - `*_busco_sequences.tar.gz`: Sequences of the annotated BUSCO genes, as one _tar_ archive per category (`single_copy`, `multi_copy`, `fragmented`) with one file per gene.
+ - `hmmer_output.tar.gz`: Archive of the HMMER alignment scores.
+
+
+
+### Read alignments
+
+Read alignments in BAM format, generated only if the pipeline is run with `--align`.
+
+
+Output files
+
+- `read_mapping/`
+  - `<datatype>/`
+    - `<sample>.bam`: alignments of that sample's reads in BAM format.
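+
+A published alignment can be sanity-checked with standard tools. A minimal sketch, assuming `samtools` is available (`<datatype>` and `<sample>` are placeholders):
+
+```bash
+# summarise how many reads mapped in a published BAM
+samtools flagstat read_mapping/<datatype>/<sample>.bam
+```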
+
+
+
+### Read coverage
+
+Read coverage statistics as computed by the pipeline.
+These files are the raw data used to build the BlobDir.
+
+
+Output files
+
+- `read_mapping/`
+  - `<datatype>/`
+    - `<sample>.coverage.1k.bed.gz`: Bedgraph file with the coverage of that sample's alignments per 1 kbp window.
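+
+To eyeball the first few windows (a minimal sketch, assuming standard gzip tooling; `<datatype>` and `<sample>` are placeholders):
+
+```bash
+# print the first five 1 kbp windows with their coverage values
+zcat read_mapping/<datatype>/<sample>.coverage.1k.bed.gz | head -n 5
+```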
+
+
+
+### Base content
+
+_k_-mer statistics, for all _k_ ≤ 4.
+These files are the raw data used to build the BlobDir.
+
+
+Output files
+
+- `base_content/`
+  - `<assembly>_*nuc_windows.tsv.gz`: Tab-separated files with the counts of every _k_-mer for _k_ ≤ 4 in 1 kbp windows. The first three columns correspond to the window coordinates (sequence name, start, end), followed by one column per _k_-mer.
+  - `<assembly>_freq_windows.tsv.gz`: Tab-separated files with frequencies derived from the _k_-mer counts.
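+
+Since the first column of each table is the sequence name, the windows can be summarised per sequence. A minimal sketch, assuming standard gzip and awk tooling and that the tables carry a header row (`<assembly>` is a placeholder):
+
+```bash
+# count the 1 kbp windows recorded for each sequence (NR > 1 skips the header row)
+zcat <assembly>_mononuc_windows.tsv.gz | awk 'NR > 1 { n[$1]++ } END { for (s in n) print s, n[s] }'
+```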
diff --git a/modules.json b/modules.json
index 667a4482..d80a794d 100644
--- a/modules.json
+++ b/modules.json
@@ -64,6 +64,11 @@
"git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a",
"installed_by": ["modules"]
},
+ "pigz/compress": {
+ "branch": "master",
+ "git_sha": "0eab94fc1e48703c1b0a8704bd665f554905c39d",
+ "installed_by": ["modules"]
+ },
"samtools/fasta": {
"branch": "master",
"git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62",
diff --git a/modules/local/blobtoolkit/updatemeta.nf b/modules/local/blobtoolkit/updatemeta.nf
index de1313d5..a5556348 100644
--- a/modules/local/blobtoolkit/updatemeta.nf
+++ b/modules/local/blobtoolkit/updatemeta.nf
@@ -5,7 +5,7 @@ process BLOBTOOLKIT_UPDATEMETA {
if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
exit 1, "BLOBTOOLKIT_UPDATEMETA module does not support Conda. Please use Docker / Singularity / Podman instead."
}
- container "docker.io/pacificbiosciences/pyyaml:5.3.1"
+ container "docker.io/genomehubs/blobtoolkit:4.3.9"
input:
tuple val(meta), path(input)
diff --git a/modules/nf-core/pigz/compress/environment.yml b/modules/nf-core/pigz/compress/environment.yml
new file mode 100644
index 00000000..7551d187
--- /dev/null
+++ b/modules/nf-core/pigz/compress/environment.yml
@@ -0,0 +1,9 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+name: "pigz_compress"
+channels:
+ - conda-forge
+ - bioconda
+ - defaults
+dependencies:
+ - "pigz=2.8"
diff --git a/modules/nf-core/pigz/compress/main.nf b/modules/nf-core/pigz/compress/main.nf
new file mode 100644
index 00000000..152e7006
--- /dev/null
+++ b/modules/nf-core/pigz/compress/main.nf
@@ -0,0 +1,45 @@
+process PIGZ_COMPRESS {
+ tag "$meta.id"
+ label 'process_low'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/pigz:2.8':
+ 'biocontainers/pigz:2.8' }"
+
+ input:
+ tuple val(meta), path(raw_file)
+
+ output:
+ tuple val(meta), path("$archive"), emit: archive
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def args = task.ext.args ?: ''
+ archive = raw_file.toString() + ".gz"
+ """
+ # Note: needs --stdout for pigz to avoid the following issue:
+ # pigz: skipping: ${raw_file} is a symbolic link
+ pigz --processes $task.cpus --stdout --force ${args} ${raw_file} > ${archive}
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ pigz:\$(echo \$(pigz --version 2>&1) | sed 's/^.*pigz\\w*//' )
+ END_VERSIONS
+ """
+
+ stub:
+ def args = task.ext.args ?: ''
+ archive = raw_file.toString() + ".gz"
+ """
+ touch ${archive}
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ pigz:\$(echo \$(pigz --version 2>&1) | sed 's/^.*pigz\\w*//' )
+ END_VERSIONS
+ """
+}
diff --git a/modules/nf-core/pigz/compress/meta.yml b/modules/nf-core/pigz/compress/meta.yml
new file mode 100644
index 00000000..42efd735
--- /dev/null
+++ b/modules/nf-core/pigz/compress/meta.yml
@@ -0,0 +1,47 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
+name: "pigz_compress"
+description: Compresses files with pigz.
+keywords:
+ - compress
+ - gzip
+ - parallelized
+tools:
+ - "pigz":
+ description: "Parallel implementation of the gzip algorithm."
+ homepage: "https://zlib.net/pigz/"
+ documentation: "https://zlib.net/pigz/pigz.pdf"
+
+input:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. `[ id:'sample1', single_end:false ]`
+
+ - raw_file:
+ type: file
+ description: File to be compressed
+ pattern: "*.*"
+
+output:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. `[ id:'sample1', single_end:false ]`
+
+ - archive:
+ type: file
+ description: The compressed file
+ pattern: "*.gz"
+
+ - versions:
+ type: file
+ description: File containing software versions
+ pattern: "versions.yml"
+
+authors:
+ - "@leoisl"
+maintainers:
+ - "@leoisl"
diff --git a/modules/nf-core/pigz/compress/tests/main.nf.test b/modules/nf-core/pigz/compress/tests/main.nf.test
new file mode 100644
index 00000000..248d40fb
--- /dev/null
+++ b/modules/nf-core/pigz/compress/tests/main.nf.test
@@ -0,0 +1,49 @@
+nextflow_process {
+ name "Test Process PIGZ_COMPRESS"
+ script "../main.nf"
+ process "PIGZ_COMPRESS"
+
+ tag "modules"
+ tag "modules_nfcore"
+ tag "pigz"
+ tag "pigz/compress"
+
+ test("sarscov2 - genome - fasta") {
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test'], // meta map
+ file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
+ ]
+ """
+ }
+ }
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() }
+ )
+ }
+ }
+
+ test("sarscov2 - genome - fasta - stub") {
+ options "-stub-run"
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test'], // meta map
+ file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
+ ]
+ """
+ }
+ }
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(file(process.out.archive[0][1]).name).match() }
+ )
+ }
+ }
+}
diff --git a/modules/nf-core/pigz/compress/tests/main.nf.test.snap b/modules/nf-core/pigz/compress/tests/main.nf.test.snap
new file mode 100644
index 00000000..6e50456f
--- /dev/null
+++ b/modules/nf-core/pigz/compress/tests/main.nf.test.snap
@@ -0,0 +1,37 @@
+{
+ "sarscov2 - genome - fasta": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test"
+ },
+ "genome.fasta.gz:md5,6e9fe4042a72f2345f644f239272b7e6"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,ca30e9e1ffa1394ba7eefdac8cf3a3ad"
+ ],
+ "archive": [
+ [
+ {
+ "id": "test"
+ },
+ "genome.fasta.gz:md5,6e9fe4042a72f2345f644f239272b7e6"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,ca30e9e1ffa1394ba7eefdac8cf3a3ad"
+ ]
+ }
+ ],
+ "timestamp": "2023-12-11T22:39:53.350546"
+ },
+ "sarscov2 - genome - fasta - stub": {
+ "content": [
+ "genome.fasta.gz"
+ ],
+ "timestamp": "2023-12-11T22:52:24.309192"
+ }
+}
\ No newline at end of file
diff --git a/modules/nf-core/pigz/compress/tests/tags.yml b/modules/nf-core/pigz/compress/tests/tags.yml
new file mode 100644
index 00000000..42c46bfa
--- /dev/null
+++ b/modules/nf-core/pigz/compress/tests/tags.yml
@@ -0,0 +1,2 @@
+pigz/compress:
+ - "modules/nf-core/pigz/compress/**"
diff --git a/subworkflows/local/coverage_stats.nf b/subworkflows/local/coverage_stats.nf
index 78be4620..86703851 100644
--- a/subworkflows/local/coverage_stats.nf
+++ b/subworkflows/local/coverage_stats.nf
@@ -6,6 +6,7 @@ include { SAMTOOLS_VIEW } from '../../modules/nf-core/samtools/view/main'
include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main'
include { BLOBTK_DEPTH } from '../../modules/local/blobtk/depth'
include { FASTAWINDOWS } from '../../modules/nf-core/fastawindows/main'
+include { PIGZ_COMPRESS } from '../../modules/nf-core/pigz/compress/main'
include { CREATE_BED } from '../../modules/local/create_bed'
@@ -53,6 +54,17 @@ workflow COVERAGE_STATS {
ch_versions = ch_versions.mix ( FASTAWINDOWS.out.versions.first() )
+    // Compress the TSV files produced by FASTAWINDOWS so the published copies are gzipped
+ PIGZ_COMPRESS (
+ FASTAWINDOWS.out.mononuc
+ | mix ( FASTAWINDOWS.out.dinuc )
+ | mix ( FASTAWINDOWS.out.trinuc )
+ | mix ( FASTAWINDOWS.out.tetranuc )
+ | mix ( FASTAWINDOWS.out.freq )
+ )
+ ch_versions = ch_versions.mix ( PIGZ_COMPRESS.out.versions.first() )
+
// Create genome windows file in BED format
CREATE_BED ( FASTAWINDOWS.out.mononuc )
ch_versions = ch_versions.mix ( CREATE_BED.out.versions.first() )