diff --git a/README.md b/README.md
index 7ffbc6e..b653597 100644
--- a/README.md
+++ b/README.md
@@ -83,3 +83,87 @@ A script to do some basic QC is provided in `bin/qc.py`. It measures the % of re
 
 #### Output
 A subdirectory for each process in the workflow is created in `--outdir`. A `nml_upload` subdirectory containing dehosted fastq files and consensus sequences is included.
+
+#### Provenance
+For each sample, a provenance file is written to the output directory in the following format:
+```
+- process_name: performHostFilter
+  tools:
+    - tool_name: bwa
+      tool_version: 0.7.17-r1188
+      subcommand: mem
+      parameters:
+        - parameter: -t
+          value: 8
+    - tool_name: samtools
+      tool_version: 1.10
+      subcommand: sort
+- process_name: readTrimming
+  tools:
+    - tool_name: trim_galore
+      tool_version: 0.6.4_dev
+- process_name: filterResidualAdapters
+  tools:
+    - tool_name: filter_residual_adapters.py
+      sha256: c3d062687abf2bbec48721a562ec609742101eec82887b1f31b9997361da901e
+- process_name: trimPrimerSequences
+  tools:
+    - tool_name: samtools
+      tool_version: 1.10
+      subcommand: view
+      parameters:
+        - parameter: -F
+          value: 4
+    - tool_name: samtools
+      tool_version: 1.10
+      subcommand: index
+    - tool_name: samtools
+      tool_version: 1.10
+      subcommand: sort
+    - tool_name: ivar
+      tool_version: 1.3.1
+      subcommand: trim
+      parameters:
+        - parameter: -e
+          value: null
+        - parameter: -m
+          value: 50
+        - parameter: -q
+          value: 20
+- process_name: callConsensusFreebayes
+  tools:
+    - tool_name: freebayes
+      tool_version: v1.3.2-dirty
+      parameters:
+        - parameter: -p
+          value: 1
+        - parameter: -f
+          value: ref.fa
+        - parameter: -F
+          value: 0.2
+        - parameter: -C
+          value: 1
+        - parameter: --pooled-continuous
+          value: null
+        - parameter: --min-coverage
+          value: 10
+        - parameter: --gvcf
+          value: null
+        - parameter: --gvcf-dont-use-chunk
+          value: true
+    - tool_name: bcftools
+      tool_version: 1.10.2
+      parameters:
+        - parameter: norm
+        - parameter: consensus
+- input_filename: sample_R1.fastq.gz
+  file_type: fastq-input
+  sha256: 1411f944271b07918cf08393ab102d7760a811fb5b0df12ace104c95bb6ab341
+- input_filename: sample_R2.fastq.gz
+  file_type: fastq-input
+  sha256: 0693d7e519b2e2a294d9d4a79ddfc3830137363b2c8bf9990fc392800a1ca11f
+- pipeline_name: BCCDC-PHL/mpxv-artic-nf
+  pipeline_version: 0.1.2
+  timestamp_analysis_start: 2024-10-28T15:44:15.656920-07:00
+
+```
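+
+A minimal sketch of reading this file back, assuming PyYAML is available (the filename below is illustrative; the real file is named `<sample>_<timestamp>_provenance.yml` by the `collect_provenance` process):
+
+```python
+import yaml
+
+with open("sample01_20241028154415_provenance.yml") as f:
+    provenance = yaml.safe_load(f)
+
+# Each list entry is a dict; process records carry a 'tools' list,
+# while input-file and pipeline records do not.
+for record in provenance:
+    for tool in record.get("tools", []):
+        print(record["process_name"], tool["tool_name"], tool.get("tool_version"))
+```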
diff --git a/modules/hash_files.nf b/modules/hash_files.nf
new file mode 100644
index 0000000..93e9ed0
--- /dev/null
+++ b/modules/hash_files.nf
@@ -0,0 +1,22 @@
+process hash_files {
+
+    tag { sampleName + " / " + file_type }
+
+    input:
+    tuple val(sampleName), path(files_to_hash), val(file_type)
+
+    output:
+    tuple val(sampleName), path("${sampleName}_${file_type}.sha256.csv"), emit: csv
+    tuple val(sampleName), path("${sampleName}_${file_type}_provenance.yml"), emit: provenance
+
+    script:
+    """
+    shasum -a 256 ${files_to_hash} | tr -s ' ' ',' > ${sampleName}_${file_type}.sha256.csv
+    while IFS=',' read -r hash filename; do
+        printf -- "- input_filename: \$filename\\n" >> ${sampleName}_${file_type}_provenance.yml;
+        printf -- "  file_type: ${file_type}\\n" >> ${sampleName}_${file_type}_provenance.yml;
+        printf -- "  sha256: \$hash\\n" >> ${sampleName}_${file_type}_provenance.yml;
+    done < ${sampleName}_${file_type}.sha256.csv
+    """
+
+}
\ No newline at end of file
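For a hypothetical sample `sample01` hashed with `file_type` set to `fastq-input`, the two outputs of this process would look like this (hashes borrowed from the README example above; `shasum` prints `<hash>  <filename>`, and `tr -s ' ' ','` squeezes the separator down to a single comma):

```
$ cat sample01_fastq-input.sha256.csv
1411f944271b07918cf08393ab102d7760a811fb5b0df12ace104c95bb6ab341,sample_R1.fastq.gz
0693d7e519b2e2a294d9d4a79ddfc3830137363b2c8bf9990fc392800a1ca11f,sample_R2.fastq.gz

$ cat sample01_fastq-input_provenance.yml
- input_filename: sample_R1.fastq.gz
  file_type: fastq-input
  sha256: 1411f944271b07918cf08393ab102d7760a811fb5b0df12ace104c95bb6ab341
- input_filename: sample_R2.fastq.gz
  file_type: fastq-input
  sha256: 0693d7e519b2e2a294d9d4a79ddfc3830137363b2c8bf9990fc392800a1ca11f
```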
diff --git a/modules/illumina.nf b/modules/illumina.nf
index 2bcfeee..9e2e75e 100644
--- a/modules/illumina.nf
+++ b/modules/illumina.nf
@@ -11,9 +11,23 @@ process performHostFilter {
     output:
     tuple val(sampleName), path("${sampleName}_hostfiltered_R1.fastq.gz"), path("${sampleName}_hostfiltered_R2.fastq.gz"), emit: fastqPairs
+    tuple val(sampleName), path("${sampleName}_performHostFilter_provenance.yml"), emit: provenance
 
     script:
     """
+    printf -- "- process_name: performHostFilter\\n" >> ${sampleName}_performHostFilter_provenance.yml
+    printf -- "  tools:\\n" >> ${sampleName}_performHostFilter_provenance.yml
+    printf -- "    - tool_name: bwa\\n" >> ${sampleName}_performHostFilter_provenance.yml
+    printf -- "      tool_version: \$(bwa 2>&1 | grep "Version: " | cut -d ' ' -f 2)\\n" >> ${sampleName}_performHostFilter_provenance.yml
+    printf -- "      subcommand: mem\\n" >> ${sampleName}_performHostFilter_provenance.yml
+    printf -- "      parameters:\\n" >> ${sampleName}_performHostFilter_provenance.yml
+    printf -- "        - parameter: -t\\n" >> ${sampleName}_performHostFilter_provenance.yml
+    printf -- "          value: ${task.cpus}\\n" >> ${sampleName}_performHostFilter_provenance.yml
+    printf -- "    - tool_name: samtools\\n" >> ${sampleName}_performHostFilter_provenance.yml
+    printf -- "      tool_version: \$(samtools 2>&1 | grep "Version: " | cut -d ' ' -f 2)\\n" >> ${sampleName}_performHostFilter_provenance.yml
+    printf -- "      subcommand: sort\\n" >> ${sampleName}_performHostFilter_provenance.yml
+
+
     bwa mem -t ${task.cpus} ${params.composite_ref} ${forward} ${reverse} | \
         filter_non_human_reads.py -c ${params.viral_contig_name} > ${sampleName}.viral_and_nonmapping_reads.bam
     samtools sort -@ ${task.cpus} -n ${sampleName}.viral_and_nonmapping_reads.bam | \
@@ -72,10 +86,16 @@ process readTrimming {
     tuple val(sampleName), path(forward), path(reverse)
 
     output:
-    tuple val(sampleName), path("*_val_1.fq.gz"), path("*_val_2.fq.gz"), optional: true
+    tuple val(sampleName), path("*_val_1.fq.gz"), path("*_val_2.fq.gz"), optional: true, emit: trim_out
+    tuple val(sampleName), path("${sampleName}_readTrimming_provenance.yml"), emit: provenance
 
     script:
     """
+    printf -- "- process_name: readTrimming\\n" >> ${sampleName}_readTrimming_provenance.yml
+    printf -- "  tools:\\n" >> ${sampleName}_readTrimming_provenance.yml
+    printf -- "    - tool_name: trim_galore\\n" >> ${sampleName}_readTrimming_provenance.yml
+    printf -- "      tool_version: \$(trim_galore --version | sed -n '4p' | sed 's/version //' | tr -d '[:space:]')\\n" >> ${sampleName}_readTrimming_provenance.yml
+
     if [[ \$(gunzip -c ${forward} | head -n4 | wc -l) -eq 0 ]]; then
         cp ${forward} ${sampleName}_hostfiltered_val_1.fq.gz
         cp ${reverse} ${sampleName}_hostfiltered_val_2.fq.gz
@@ -102,10 +122,16 @@ process filterResidualAdapters {
     tuple val(sampleName), path(forward), path(reverse)
 
     output:
-    tuple val(sampleName), path("*1_posttrim_filter.fq.gz"), path("*2_posttrim_filter.fq.gz")
+    tuple val(sampleName), path("*1_posttrim_filter.fq.gz"), path("*2_posttrim_filter.fq.gz"), emit: filter_out
+    tuple val(sampleName), path("${sampleName}_filterResidualAdapters_provenance.yml"), emit: provenance
 
     script:
     """
+    printf -- "- process_name: filterResidualAdapters\\n" >> ${sampleName}_filterResidualAdapters_provenance.yml
+    printf -- "  tools:\\n" >> ${sampleName}_filterResidualAdapters_provenance.yml
+    printf -- "    - tool_name: filter_residual_adapters.py\\n" >> ${sampleName}_filterResidualAdapters_provenance.yml
+    printf -- "      sha256: \$(shasum -a 256 ${projectDir}/bin/filter_residual_adapters.py | cut -d ' ' -f 1)\\n" >> ${sampleName}_filterResidualAdapters_provenance.yml
+
     filter_residual_adapters.py --input_R1 $forward --input_R2 $reverse
     """
 }
@@ -148,10 +174,10 @@ process readMapping {
     tuple val(sampleName), path(forward), path(reverse), path(ref), path("*")
 
     output:
-    tuple val(sampleName), path("${sampleName}.sorted.bam"), path("${sampleName}.sorted.bam.bai")
+    tuple val(sampleName), path("${sampleName}.sorted.bam"), path("${sampleName}.sorted.bam.bai"), emit: sorted_bam
 
     script:
     """
     bwa mem -t ${task.cpus} ${ref} ${forward} ${reverse} | \
         samtools sort -o ${sampleName}.sorted.bam
     samtools index ${sampleName}.sorted.bam
@@ -171,9 +197,36 @@ process trimPrimerSequences {
     output:
     tuple val(sampleName), path("${sampleName}.mapped.bam"), path("${sampleName}.mapped.bam.bai"), emit: mapped
     tuple val(sampleName), path("${sampleName}.mapped.primertrimmed.sorted.bam"), path("${sampleName}.mapped.primertrimmed.sorted.bam.bai"), emit: ptrim
+    tuple val(sampleName), path("${sampleName}_trimPrimerSequences_provenance.yml"), emit: provenance
 
     script:
     """
+    printf -- "- process_name: trimPrimerSequences\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
+    printf -- "  tools:\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
+    printf -- "    - tool_name: samtools\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
+    printf -- "      tool_version: \$(samtools 2>&1 | grep "Version: " | cut -d ' ' -f 2)\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
+    printf -- "      subcommand: view\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
+    printf -- "      parameters:\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
+    printf -- "        - parameter: -F\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
+    printf -- "          value: 4\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
+    printf -- "    - tool_name: samtools\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
+    printf -- "      tool_version: \$(samtools 2>&1 | grep "Version: " | cut -d ' ' -f 2)\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
+    printf -- "      subcommand: index\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
+    printf -- "    - tool_name: samtools\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
+    printf -- "      tool_version: \$(samtools 2>&1 | grep "Version: " | cut -d ' ' -f 2)\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
+    printf -- "      subcommand: sort\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
+    printf -- "    - tool_name: ivar\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
+    printf -- "      tool_version: \$(ivar version | sed -n '1p' | cut -d ' ' -f 3)\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
+    printf -- "      subcommand: trim\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
+    printf -- "      parameters:\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
+    printf -- "        - parameter: -e\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
+    printf -- "          value: null\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
+    printf -- "        - parameter: -m\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
+    printf -- "          value: ${params.keepLen}\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
+    printf -- "        - parameter: -q\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
+    printf -- "          value: ${params.qualThreshold}\\n" >> ${sampleName}_trimPrimerSequences_provenance.yml
+
+
     samtools view -F4 -o ${sampleName}.mapped.bam ${bam}
     samtools index ${sampleName}.mapped.bam
     ivar trim -e -i ${sampleName}.mapped.bam -b ${bedfile} -m ${params.keepLen} -q ${params.qualThreshold} -f ${params.primer_pairs_tsv} -p ivar.out
@@ -195,9 +248,37 @@ process callConsensusFreebayes {
     output:
     tuple val(sampleName), path("${sampleName}.consensus.fa"), emit: consensus
     tuple val(sampleName), path("${sampleName}.variants.norm.vcf"), emit: variants
+    tuple val(sampleName), path("${sampleName}_callConsensusFreebayes_provenance.yml"), emit: provenance
 
     script:
     """
+    printf -- "- process_name: callConsensusFreebayes\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
+    printf -- "  tools:\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
+    printf -- "    - tool_name: freebayes\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
+    printf -- "      tool_version: \$(freebayes --version | cut -d ' ' -f 3)\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
+    printf -- "      parameters:\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
+    printf -- "        - parameter: -p\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
+    printf -- "          value: 1\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
+    printf -- "        - parameter: -f\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
+    printf -- "          value: ${ref}\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
+    printf -- "        - parameter: -F\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
+    printf -- "          value: 0.2\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
+    printf -- "        - parameter: -C\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
+    printf -- "          value: 1\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
+    printf -- "        - parameter: --pooled-continuous\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
+    printf -- "          value: null\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
+    printf -- "        - parameter: --min-coverage\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
+    printf -- "          value: ${params.varMinDepth}\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
+    printf -- "        - parameter: --gvcf\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
+    printf -- "          value: null\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
+    printf -- "        - parameter: --gvcf-dont-use-chunk\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
+    printf -- "          value: true\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
+    printf -- "    - tool_name: bcftools\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
+    printf -- "      tool_version: \$(bcftools 2>&1 | sed -n '4p' | cut -d ' ' -f 2)\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
+    printf -- "      parameters:\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
+    printf -- "        - parameter: norm\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
+    printf -- "        - parameter: consensus\\n" >> ${sampleName}_callConsensusFreebayes_provenance.yml
+
     # the sed is to fix the header until a release is made with this fix
     # https://github.com/freebayes/freebayes/pull/549
     freebayes -p 1 \
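A note on the version-probe command substitutions used throughout these `printf` blocks: each one scrapes the tool's own version banner, so the recorded value reflects whatever is on `$PATH` inside the task's container at run time. With the versions shown in the README example, they resolve roughly as follows (output illustrative):

```
$ samtools 2>&1 | grep "Version: " | cut -d ' ' -f 2
1.10
$ ivar version | sed -n '1p' | cut -d ' ' -f 3
1.3.1
$ freebayes --version | cut -d ' ' -f 3
v1.3.2-dirty
```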
diff --git a/modules/provenance.nf b/modules/provenance.nf
new file mode 100644
index 0000000..2b8de56
--- /dev/null
+++ b/modules/provenance.nf
@@ -0,0 +1,40 @@
+process collect_provenance {
+
+    tag { sampleName }
+
+    executor 'local'
+
+    publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleName}_*_provenance.yml", mode: 'copy'
+
+    input:
+    tuple val(sampleName), path(provenance_files)
+
+    output:
+    tuple val(sampleName), file("${sampleName}_*_provenance.yml")
+
+    script:
+    """
+    cat ${provenance_files} > ${sampleName}_\$(date +%Y%m%d%H%M%S)_provenance.yml
+    """
+}
+
+
+process pipeline_provenance {
+
+    tag { pipeline_name + " / " + pipeline_version }
+
+    executor 'local'
+
+    input:
+    tuple val(pipeline_name), val(pipeline_version), val(analysis_start)
+
+    output:
+    file("pipeline_provenance.yml")
+
+    script:
+    """
+    printf -- "- pipeline_name: ${pipeline_name}\\n" >> pipeline_provenance.yml
+    printf -- "  pipeline_version: ${pipeline_version}\\n" >> pipeline_provenance.yml
+    printf -- "  timestamp_analysis_start: ${analysis_start}\\n" >> pipeline_provenance.yml
+    """
+}
\ No newline at end of file
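`collect_provenance` publishes the concatenated file under a directory derived from `task.process` (with `:` replaced by `_`), so with `--outdir output` the README-described provenance file for a sample lands at a path along these lines (the directory prefix depends on how the workflow is invoked, and the timestamp is illustrative):

```
output/sequenceAnalysis_collect_provenance/sample01_20241028154415_provenance.yml
```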
diff --git a/nextflow.config b/nextflow.config
index 4363fa1..f3455e7 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -2,8 +2,9 @@ manifest {
     author = 'connor-lab: Matt Bull, Sendu Bala. BCCDC-PHL: Dan Fornika. oicr-gsi: Jared Simpson, Michael Laszloffy'
     description = 'Nextflow for running the Artic mpxv pipeline'
     mainScript = 'main.nf'
+    name = 'BCCDC-PHL/mpxv-artic-nf'
     nextflowVersion = '>=20.01.0'
-    version = '0.1.0'
+    version = '0.1.2'
 }
 
 def makeFastqSearchPath ( illuminaSuffixes, fastq_exts ) {
diff --git a/workflows/illuminaMpxv.nf b/workflows/illuminaMpxv.nf
index 44875c6..ed91e38 100644
--- a/workflows/illuminaMpxv.nf
+++ b/workflows/illuminaMpxv.nf
@@ -2,6 +2,7 @@
 
 nextflow.enable.dsl = 2
 
+include { hash_files } from '../modules/hash_files.nf'
 include { articDownloadScheme } from '../modules/utils.nf'
 include { normalizeDepth } from '../modules/illumina.nf'
 include { performHostFilter } from '../modules/illumina.nf'
@@ -19,6 +20,9 @@ include { writeQCSummaryCSV } from '../modules/qc.nf'
 
 include { collateSamples } from '../modules/upload.nf'
 
+include { pipeline_provenance } from '../modules/provenance.nf'
+include { collect_provenance } from '../modules/provenance.nf'
+
 workflow prepareReferenceFiles {
     // Get reference fasta
     if (params.ref) {
@@ -98,11 +102,11 @@ workflow sequenceAnalysis {
 
     readTrimming(performHostFilter.out.fastqPairs)
 
-    filterResidualAdapters(readTrimming.out)
+    filterResidualAdapters(readTrimming.out.trim_out)
 
-    readMapping(filterResidualAdapters.out.combine(ch_preparedRef))
+    readMapping(filterResidualAdapters.out.filter_out.combine(ch_preparedRef))
 
-    trimPrimerSequences(readMapping.out.combine(ch_bedFile))
+    trimPrimerSequences(readMapping.out.sorted_bam.combine(ch_bedFile))
 
     callConsensusFreebayes(trimPrimerSequences.out.ptrim.combine(ch_preparedRef.map{ it[0] }))
 
@@ -114,8 +118,8 @@ workflow sequenceAnalysis {
 
     makeQCCSV(trimPrimerSequences.out.ptrim.join(callConsensusFreebayes.out.consensus, by: 0)
                    .combine(ch_preparedRef.map{ it[0] })
-        .combine(ch_bedFile)
-        .combine(ch_primerPairs))
+                   .combine(ch_bedFile)
+                   .combine(ch_primerPairs))
 
     makeQCCSV.out.csv.splitCsv()
         .unique()
@@ -123,8 +127,30 @@ workflow sequenceAnalysis {
 
     writeQCSummaryCSV(qc.toList())
+
     collateSamples(callConsensusFreebayes.out.consensus.join(performHostFilter.out.fastqPairs))
 
+    // Provenance collection processes
+    // The idea is to build up a channel with the following structure for each sample:
+    // [sample_id, [provenance_file_1.yml, provenance_file_2.yml, provenance_file_3.yml...]]
+    // ...and then concatenate them all together in the 'collect_provenance' process.
+    ch_start_time = Channel.of(workflow.start)
+    ch_pipeline_name = Channel.of(workflow.manifest.name)
+    ch_pipeline_version = Channel.of(workflow.manifest.version)
+    ch_pipeline_provenance = pipeline_provenance(ch_pipeline_name.combine(ch_pipeline_version).combine(ch_start_time))
+
+    // FASTQ provenance
+    hash_files(ch_filePairs.map{ it -> [it[0], [it[1], it[2]]] }.combine(Channel.of("fastq-input")))
+
+    // illumina.nf provenance
+    ch_provenance = performHostFilter.out.provenance
+    ch_provenance = ch_provenance.join(readTrimming.out.provenance).map{ it -> [it[0], [it[1]] << it[2]] }
+    ch_provenance = ch_provenance.join(filterResidualAdapters.out.provenance).map{ it -> [it[0], it[1] << it[2]] }
+    ch_provenance = ch_provenance.join(trimPrimerSequences.out.provenance).map{ it -> [it[0], it[1] << it[2]] }
+    ch_provenance = ch_provenance.join(callConsensusFreebayes.out.provenance).map{ it -> [it[0], it[1] << it[2]] }
+    ch_provenance = ch_provenance.join(hash_files.out.provenance).map{ it -> [it[0], it[1] << it[2]] }
+    ch_provenance = ch_provenance.join(ch_provenance.map{ it -> it[0] }.combine(ch_pipeline_provenance)).map{ it -> [it[0], it[1] << it[2]] }
+
+    collect_provenance(ch_provenance)
+
     emit:
     qc_pass = collateSamples.out
 }
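The `join`/`<<` chain above accumulates one flat list of provenance files per sample: the first `map` wraps the initial file in a list (`[it[1]]`), and each subsequent `join` appends one more file with Groovy's `<<` operator. A minimal standalone sketch of the same idiom, runnable on its own with `nextflow run` (channel contents are illustrative):

```groovy
nextflow.enable.dsl = 2

workflow {
    // Three per-sample channels keyed by sample_id, as emitted by upstream processes
    ch_a = Channel.of(['sample01', 'a_provenance.yml'])
    ch_b = Channel.of(['sample01', 'b_provenance.yml'])
    ch_c = Channel.of(['sample01', 'c_provenance.yml'])

    // Start from [sample_id, file], wrap the file in a list on the first join,
    // then append (<<) one file per subsequent join
    ch = ch_a
    ch = ch.join(ch_b).map{ it -> [it[0], [it[1]] << it[2]] }
    ch = ch.join(ch_c).map{ it -> [it[0], it[1] << it[2]] }

    ch.view()  // [sample01, [a_provenance.yml, b_provenance.yml, c_provenance.yml]]
}
```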