Skip to content

Commit

Permalink
add provenance (#20)
Browse files Browse the repository at this point in the history
* add provenance

* add mafft

* update null value

* remove reference to unused script

* update

* Update README.md

* update provenance formatting for subcommands

* update readme

* update formatting

* remove whitespace
  • Loading branch information
taranewman authored Oct 30, 2024
1 parent 7c754c3 commit 6a49cf4
Show file tree
Hide file tree
Showing 6 changed files with 264 additions and 10 deletions.
84 changes: 84 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,87 @@ A script to do some basic QC is provided in `bin/qc.py`. It measures the % of re

#### Output
A subdirectory for each process in the workflow is created in `--outdir`. A `nml_upload` subdirectory containing dehosted fastq files and consensus sequences is included.

#### Provenance
In the output directory for each sample, a provenance file will be written with the following format:
```
- process_name: performHostFilter
tools:
- tool_name: bwa
tool_version: 0.7.17-r1188
subcommand: mem
parameters:
- parameter: -t
value: 8
- tool_name: samtools
tool_version: 1.10
subcommand: sort
- process_name: readTrimming
tools:
- tool_name: trim_galore
tool_version: 0.6.4_dev
- process_name: filterResidualAdapters
tools:
- tool_name: filter_residual_adapters.py
sha256: c3d062687abf2bbec48721a562ec609742101eec82887b1f31b9997361da901e
- process_name: trimPrimerSequences
tools:
- tool_name: samtools
tool_version: 1.10
subcommand: view
parameters:
- parameter: -F
value: 4
- tool_name: samtools
tool_version: 1.10
subcommand: index
- tool_name: samtools
tool_version: 1.10
subcommand: sort
- tool_name: ivar
tool_version: 1.3.1
subcommand: trim
parameters:
- parameter: -e
value: null
- parameter: -m
value: 50
- parameter: -q
value: 20
- process_name: callConsensusFreebayes
tools:
- tool_name: freebayes
tool_version: v1.3.2-dirty
parameters:
- parameter: -p
value: 1
- parameter: -f
value: ref.fa
- parameter: -F
value: 0.2
- parameter: -C
value: 1
- parameter: --pooled-continuous
value: null
- parameter: min-coverage
value: 10
- parameter: --gvcf
value: null
- parameter: --gvcf-dont-use-chunk
value: true
- tool_name: bcftools
tool_version: 1.10.2
parameters:
- parameter: norm
- parameter: consensus
- input_filename: sample_R1.fastq.gz
file_type: fastq-input
sha256: 1411f944271b07918cf08393ab102d7760a811fb5b0df12ace104c95bb6ab341
- input_filename: sample_R2.fastq.gz
file_type: fastq-input
sha256: 0693d7e519b2e2a294d9d4a79ddfc3830137363b2c8bf9990fc392800a1ca11f
- pipeline_name: BCCDC-PHL/mpxv-artic-nf
pipeline_version: 0.1.2
timestamp_analysis_start: 2024-10-28T15:44:15.656920-07:00
```
22 changes: 22 additions & 0 deletions modules/hash_files.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
process hash_files {

tag { sampleName + " / " + file_type }

input:
tuple val(sampleName), path(files_to_hash), val(file_type)

output:
tuple val(sampleName), path("${sampleName}_${file_type}.sha256.csv"), emit: csv
tuple val(sampleName), path("${sampleName}_${file_type}_provenance.yml"), emit: provenance

script:
"""
shasum -a 256 ${files_to_hash} | tr -s ' ' ',' > ${sampleName}_${file_type}.sha256.csv
while IFS=',' read -r hash filename; do
printf -- "- input_filename: \$filename\\n" >> ${sampleName}_${file_type}_provenance.yml;
printf -- " file_type: ${file_type}\\n" >> ${sampleName}_${file_type}_provenance.yml;
printf -- " sha256: \$hash\\n" >> ${sampleName}_${file_type}_provenance.yml;
done < ${sampleName}_${file_type}.sha256.csv
"""

}
89 changes: 85 additions & 4 deletions modules/illumina.nf
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,23 @@ process performHostFilter {

output:
tuple val(sampleName), path("${sampleName}_hostfiltered_R1.fastq.gz"), path("${sampleName}_hostfiltered_R2.fastq.gz"), emit: fastqPairs
tuple val(sampleName), path("${sampleName}_performHostFilter_provenance.yml"), emit: provenance

script:
"""
printf -- "- process_name: performHostFilter\\n" >> ${sampleName}_performHostFilter_provenance.yml
printf -- " tools:\\n" >> ${sampleName}_performHostFilter_provenance.yml
printf -- " - tool_name: bwa\\n" >> ${sampleName}_performHostFilter_provenance.yml
printf -- " tool_version: \$(bwa 2>&1 | grep "Version: " | cut -d ' ' -f 2)\\n" >> ${sampleName}_performHostFilter_provenance.yml
printf -- " subcommand: mem\\n" >> ${sampleName}_performHostFilter_provenance.yml
printf -- " parameters:\\n" >> ${sampleName}_performHostFilter_provenance.yml
printf -- " - parameter: -t\\n" >> ${sampleName}_performHostFilter_provenance.yml
printf -- " value: ${task.cpus}\\n" >> ${sampleName}_performHostFilter_provenance.yml
printf -- " - tool_name: samtools\\n" >> ${sampleName}_performHostFilter_provenance.yml
printf -- " tool_version: \$(samtools 2>&1 | grep "Version: " | cut -d ' ' -f 2)\\n" >> ${sampleName}_performHostFilter_provenance.yml
printf -- " subcommand: sort\\n" >> ${sampleName}_performHostFilter_provenance.yml
bwa mem -t ${task.cpus} ${params.composite_ref} ${forward} ${reverse} | \
filter_non_human_reads.py -c ${params.viral_contig_name} > ${sampleName}.viral_and_nonmapping_reads.bam
samtools sort -@ ${task.cpus} -n ${sampleName}.viral_and_nonmapping_reads.bam | \
Expand Down Expand Up @@ -72,10 +86,16 @@ process readTrimming {
tuple val(sampleName), path(forward), path(reverse)

output:
tuple val(sampleName), path("*_val_1.fq.gz"), path("*_val_2.fq.gz"), optional: true
tuple val(sampleName), path("*_val_1.fq.gz"), path("*_val_2.fq.gz"), optional: true , emit: trim_out
tuple val(sampleName), path("${sampleName}_readTrimming_provenance.yml"), emit: provenance

script:
"""
printf -- "- process_name: readTrimming\\n" >> ${sampleName}_readTrimming_provenance.yml
printf -- " tools:\\n" >> ${sampleName}_readTrimming_provenance.yml
printf -- " - tool_name: trim_galore\\n" >> ${sampleName}_readTrimming_provenance.yml
printf -- " tool_version: \$(trim_galore --version | sed -n '4p' | sed 's/version //' | tr -d '[:space:]')\\n" >> ${sampleName}_readTrimming_provenance.yml
if [[ \$(gunzip -c ${forward} | head -n4 | wc -l) -eq 0 ]]; then
cp ${forward} ${sampleName}_hostfiltered_val_1.fq.gz
cp ${reverse} ${sampleName}_hostfiltered_val_2.fq.gz
Expand All @@ -102,10 +122,16 @@ process filterResidualAdapters {
tuple val(sampleName), path(forward), path(reverse)

output:
tuple val(sampleName), path("*1_posttrim_filter.fq.gz"), path("*2_posttrim_filter.fq.gz")
tuple val(sampleName), path("*1_posttrim_filter.fq.gz"), path("*2_posttrim_filter.fq.gz"), emit: filter_out
tuple val(sampleName), path("${sampleName}_filterResidualAdapters_provenance.yml"), emit: provenance

script:
"""
printf -- "- process_name: filterResidualAdapters\\n" >> ${sampleName}_filterResidualAdapters_provenance.yml
printf -- " tools:\\n" >> ${sampleName}_filterResidualAdapters_provenance.yml
printf -- " - tool_name: filter_residual_adapters.py\\n" >> ${sampleName}_filterResidualAdapters_provenance.yml
printf -- " sha256: \$(shasum -a 256 ${projectDir}/bin/filter_residual_adapters.py | cut -d ' ' -f 1)\\n" >> ${sampleName}_filterResidualAdapters_provenance.yml
filter_residual_adapters.py --input_R1 $forward --input_R2 $reverse
"""
}
Expand Down Expand Up @@ -148,10 +174,10 @@ process readMapping {
tuple val(sampleName), path(forward), path(reverse), path(ref), path("*")

output:
tuple val(sampleName), path("${sampleName}.sorted.bam"), path("${sampleName}.sorted.bam.bai")
tuple val(sampleName), path("${sampleName}.sorted.bam"), path("${sampleName}.sorted.bam.bai"), emit: sorted_bam

script:
"""
"""
bwa mem -t ${task.cpus} ${ref} ${forward} ${reverse} | \
samtools sort -o ${sampleName}.sorted.bam
samtools index ${sampleName}.sorted.bam
Expand All @@ -171,9 +197,36 @@ process trimPrimerSequences {
output:
tuple val(sampleName), path("${sampleName}.mapped.bam"), path("${sampleName}.mapped.bam.bai"), emit: mapped
tuple val(sampleName), path("${sampleName}.mapped.primertrimmed.sorted.bam"), path("${sampleName}.mapped.primertrimmed.sorted.bam.bai" ), emit: ptrim
tuple val(sampleName), path("${sampleName}_trimPrimerSequences_provenance.yml"), emit: provenance

script:
"""
printf -- "- process_name: trimPrimerSequences\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
printf -- " tools:\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
printf -- " - tool_name: samtools\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
printf -- " tool_version: \$(samtools 2>&1 | grep "Version: " | cut -d ' ' -f 2)\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
printf -- " subcommand: view\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
printf -- " parameters:\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
printf -- " - parameter: -F\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
printf -- " value: 4\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
printf -- " - tool_name: samtools\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
printf -- " tool_version: \$(samtools 2>&1 | grep "Version: " | cut -d ' ' -f 2)\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
printf -- " subcommand: index\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
printf -- " - tool_name: samtools\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
printf -- " tool_version: \$(samtools 2>&1 | grep "Version: " | cut -d ' ' -f 2)\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
printf -- " subcommand: sort\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
printf -- " - tool_name: ivar\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
printf -- " tool_version: \$(ivar version | sed -n '1p' | cut -d ' ' -f 3)\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
printf -- " subcommand: trim\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
printf -- " parameters:\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
printf -- " - parameter: -e\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
printf -- " value: null\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
printf -- " - parameter: -m\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
printf -- " value: ${params.keepLen}\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
printf -- " - parameter: -q\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
printf -- " value: ${params.qualThreshold}\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
samtools view -F4 -o ${sampleName}.mapped.bam ${bam}
samtools index ${sampleName}.mapped.bam
ivar trim -e -i ${sampleName}.mapped.bam -b ${bedfile} -m ${params.keepLen} -q ${params.qualThreshold} -f ${params.primer_pairs_tsv} -p ivar.out
Expand All @@ -195,9 +248,37 @@ process callConsensusFreebayes {
output:
tuple val(sampleName), path("${sampleName}.consensus.fa"), emit: consensus
tuple val(sampleName), path("${sampleName}.variants.norm.vcf"), emit: variants
tuple val(sampleName), path("${sampleName}_callConsensusFreebayes_provenance.yml"), emit: provenance

script:
"""
printf -- "- process_name: callConsensusFreebayes\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
printf -- " tools:\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
printf -- " - tool_name: freebayes\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
printf -- " tool_version: \$(freebayes --version | cut -d ' ' -f 3)\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
printf -- " parameters:\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
printf -- " - parameter: -p\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
printf -- " value: 1\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
printf -- " - parameter: -f\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
printf -- " value: ${ref}\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
printf -- " - parameter: -F\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
printf -- " value: 0.2\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
printf -- " - parameter: -C\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
printf -- " value: 1\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
printf -- " - parameter: --pooled-continuous\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
printf -- " value: null\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
printf -- " - parameter: min-coverage\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
printf -- " value: ${params.varMinDepth}\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
printf -- " - parameter: --gvcf\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
printf -- " value: null\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
printf -- " - parameter: --gvcf-dont-use-chunk\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
printf -- " value: true\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
printf -- " - tool_name: bcftools\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
printf -- " tool_version: \$(bcftools 2>&1 | sed -n '4p' | cut -d ' ' -f 2)\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
printf -- " parameters:\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
printf -- " - parameter: norm\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
printf -- " - parameter: consensus\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
# the sed is to fix the header until a release is made with this fix
# https://github.com/freebayes/freebayes/pull/549
freebayes -p 1 \
Expand Down
40 changes: 40 additions & 0 deletions modules/provenance.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
process collect_provenance {

tag { sampleName }

executor 'local'

publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleName}_*_provenance.yml", mode: 'copy'

input:
tuple val(sampleName), path(provenance_files)

output:
tuple val(sampleName), file("${sampleName}_*_provenance.yml")

script:
"""
cat ${provenance_files} > ${sampleName}_\$(date +%Y%m%d%H%M%S)_provenance.yml
"""
}


process pipeline_provenance {

tag { pipeline_name + " / " + pipeline_version }

executor 'local'

input:
tuple val(pipeline_name), val(pipeline_version), val(analysis_start)

output:
file("pipeline_provenance.yml")

script:
"""
printf -- "- pipeline_name: ${pipeline_name}\\n" >> pipeline_provenance.yml
printf -- " pipeline_version: ${pipeline_version}\\n" >> pipeline_provenance.yml
printf -- " timestamp_analysis_start: ${analysis_start}\\n" >> pipeline_provenance.yml
"""
}
3 changes: 2 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@ manifest {
author = 'connor-lab: Matt Bull, Sendu Bala. BCCDC-PHL: Dan Fornika. oicr-gsi: Jared Simpson, Michael Laszloffy'
description = 'Nextflow for running the Artic mpxv pipeline'
mainScript = 'main.nf'
name = 'BCCDC-PHL/mpxv-artic-nf'
nextflowVersion = '>=20.01.0'
version = '0.1.0'
version = '0.1.2'
}

def makeFastqSearchPath ( illuminaSuffixes, fastq_exts ) {
Expand Down
Loading

0 comments on commit 6a49cf4

Please sign in to comment.