From fb407b4d698fbf2f7dc9b3f3600c0bcc89fbeed3 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 11:23:42 +0100 Subject: [PATCH 01/77] cloudy refactor --- bin/combine_kreports.py | 87 ++++-- conf/base.config | 66 +++++ dockerfiles/general_container/Dockerfile | 9 + dockerfiles/general_container/environment.yml | 12 + modules/assemble_taxa.nf | 66 +++++ modules/extract_taxa.nf | 9 + modules/fastp.nf | 70 +++++ modules/generate_report.nf | 11 +- modules/kraken_client.nf | 34 ++- modules/qc_checks.nf | 15 +- nextflow.config | 251 ++++++++++++------ subworkflows/ingest.nf | 28 +- 12 files changed, 538 insertions(+), 120 deletions(-) create mode 100644 conf/base.config create mode 100644 dockerfiles/general_container/Dockerfile create mode 100644 dockerfiles/general_container/environment.yml create mode 100644 modules/fastp.nf diff --git a/bin/combine_kreports.py b/bin/combine_kreports.py index 541e6be..b7c76cf 100755 --- a/bin/combine_kreports.py +++ b/bin/combine_kreports.py @@ -1,28 +1,36 @@ #!/usr/bin/env python import sys -import gzip from Bio import SeqIO import argparse from datetime import datetime + def parse_depth(name): parse_name = name.split(" ") depth = 0 for i in parse_name: - if i!="": + if i != "": break depth += 1 - depth = int(depth/2) + depth = int(depth / 2) return depth + def get_kraken_hierarchy(kraken_file, entries={}, total=0): with open(kraken_file, "r") as f: hierarchy = [] for line in f: if line.startswith("% of Seqs"): continue - percentage, num_clade_root, num_direct, raw_rank, ncbi, name = line.strip().split("\t") + ( + percentage, + num_clade_root, + num_direct, + raw_rank, + ncbi, + name, + ) = line.strip().split("\t") percentage = float(percentage) num_clade_root = int(num_clade_root) num_direct = int(num_direct) @@ -37,8 +45,18 @@ def get_kraken_hierarchy(kraken_file, entries={}, total=0): entries[ncbi]["count"] += num_direct entries[ncbi]["count_descendants"] += num_clade_root else: - entries[ncbi] = {"percentage": percentage, "count": num_direct, "count_descendants": num_clade_root, - "raw_rank": raw_rank, "rank": rank, "depth": depth, "ncbi": ncbi, "name": name, "parents":[], "children":set()} + entries[ncbi] = { + "percentage": percentage, + "count": num_direct, + "count_descendants": num_clade_root, + "raw_rank": raw_rank, + "rank": rank, + "depth": depth, + "ncbi": ncbi, + "name": name, + "parents": [], + "children": set(), + } if len(hierarchy) > 1: parent = hierarchy[-2] @@ -46,7 +64,8 @@ def get_kraken_hierarchy(kraken_file, entries={}, total=0): entries[ncbi]["parents"] = hierarchy[:-1] entries[parent]["children"].add(ncbi) - return entries,total + return entries, total + def combine_kraken_reports(kreports): entries = {} @@ -55,12 +74,27 @@ def combine_kraken_reports(kreports): entries, total = get_kraken_hierarchy(kreport, entries, total) print(kreport, len(entries), total) for ncbi in entries: - entries[ncbi]["percentage"] = entries[ncbi]["count_descendants"] / float(total) *100 + entries[ncbi]["percentage"] = ( + entries[ncbi]["count_descendants"] / float(total) * 100 + ) return entries + def write_entry(out_handle, entry): offset = entry["depth"] - out_handle.write("%f\t%i\t%i\t%s\t%s\t%s%s\n" %(entry["percentage"], entry["count"], entry["count_descendants"], entry["raw_rank"], entry["ncbi"], 2*offset*" ", entry["name"] )) + out_handle.write( + "%f\t%i\t%i\t%s\t%s\t%s%s\n" + % ( + entry["percentage"], + entry["count"], + entry["count_descendants"], + entry["raw_rank"], + entry["ncbi"], + 2 * offset * " ", + entry["name"], + ) + ) + 
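write_entry() above emits the same tab-separated layout the parser consumes: percentage, counts, rank, taxid, then the scientific name indented two spaces per taxonomy level, which is exactly what parse_depth() decodes. One thing worth noting: it writes entry["count"] before entry["count_descendants"], while get_kraken_hierarchy() and the header written in write_kraken_report() treat column two as the clade total, so the two orderings disagree if a combined report is ever fed back in. Within the pipeline this script is invoked by the combine_kraken_outputs process in modules/kraken_client.nf, shown later in this patch; a trimmed sketch of that call site (directives and the single-report shortcut omitted):

    process combine_kraken_outputs {
        input:
        val unique_id
        path kraken_reports

        output:
        path "${params.database_set}.kraken_report.txt"

        script:
        """
        combine_kreports.py \\
            -r ${kraken_reports} \\
            -o "${params.database_set}.kraken_report.txt"
        """
    }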
def choose_next(entries, taxa, ncbi="0"): if len(taxa) == 0: @@ -86,38 +120,49 @@ def choose_next(entries, taxa, ncbi="0"): return next, taxa + def write_kraken_report(outfile, entries): taxa = list(entries.keys()) with open(outfile, "w") as out_report: - out_report.write("% of Seqs\tClades\tTaxonomies\tRank\tTaxonomy ID\tScientific Name\n") + out_report.write( + "% of Seqs\tClades\tTaxonomies\tRank\tTaxonomy ID\tScientific Name\n" + ) current = "0" while len(taxa) > 0: write_entry(out_report, entries[current]) current, taxa = choose_next(entries, taxa, ncbi=current) + def main(): - #Parse arguments + # Parse arguments parser = argparse.ArgumentParser() - parser.add_argument('-r', dest='kraken_report_files', required=True, nargs='+', - help='Kraken report files to parse') - parser.add_argument('-o', "--outfile",dest='outfile', required=True, - help='Output name') - - args=parser.parse_args() - - #Start Program + parser.add_argument( + "-r", + dest="kraken_report_files", + required=True, + nargs="+", + help="Kraken report files to parse", + ) + parser.add_argument( + "-o", "--outfile", dest="outfile", required=True, help="Output name" + ) + + args = parser.parse_args() + + # Start Program now = datetime.now() time = now.strftime("%m/%d/%Y, %H:%M:%S") - sys.stdout.write("PROGRAM START TIME: " + time + '\n') + sys.stdout.write("PROGRAM START TIME: " + time + "\n") entries = combine_kraken_reports(args.kraken_report_files) write_kraken_report(args.outfile, entries) now = datetime.now() time = now.strftime("%m/%d/%Y, %H:%M:%S") - sys.stdout.write("PROGRAM END TIME: " + time + '\n') + sys.stdout.write("PROGRAM END TIME: " + time + "\n") sys.exit(0) + if __name__ == "__main__": main() diff --git a/conf/base.config b/conf/base.config new file mode 100644 index 0000000..da4b559 --- /dev/null +++ b/conf/base.config @@ -0,0 +1,66 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + nf-core/tmp Nextflow base config file +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + A 'blank slate' config file, appropriate for general use on most high performance + compute environments. Assumes that all software is installed and available on + the PATH. Runs in `local` mode - all jobs will be run on the logged in environment. +---------------------------------------------------------------------------------------- +*/ + +process { + + // TODO nf-core: Check the defaults for all processes + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } + maxRetries = 1 + maxErrors = '-1' + + // Process-specific resource requirements + // NOTE - Please try and re-use the labels below as much as possible. + // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. + // If possible, it would be nice to keep the same label naming convention when + // adding in your local modules too. + // TODO nf-core: Customise requirements for specific processes. 
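These labels only bind to processes that opt in with a label directive; since each closure multiplies by task.attempt and failed tasks come back under errorStrategy 'retry', a task asks for double the resources on its second attempt, clamped by the check_max() helper defined at the end of nextflow.config in this same patch. A minimal sketch of a module opting in to the selectors spelled out just below (the process and tool names here are placeholders, not modules from this repo):

    process example_heavy_step {
        // matched by withLabel:process_high below:
        // 12 CPUs, 72.GB and 16.h on attempt 1, doubled on retry
        label 'process_high'

        input:
        path reads

        output:
        path "contigs.fa"

        script:
        """
        some_assembler --threads ${task.cpus} ${reads} > contigs.fa
        """
    }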
+ // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors + withLabel:process_single { + cpus = { check_max( 1 , 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + } + withLabel:process_low { + cpus = { check_max( 2 * task.attempt, 'cpus' ) } + memory = { check_max( 12.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + } + withLabel:process_medium { + cpus = { check_max( 6 * task.attempt, 'cpus' ) } + memory = { check_max( 36.GB * task.attempt, 'memory' ) } + time = { check_max( 8.h * task.attempt, 'time' ) } + } + withLabel:process_high { + cpus = { check_max( 12 * task.attempt, 'cpus' ) } + memory = { check_max( 72.GB * task.attempt, 'memory' ) } + time = { check_max( 16.h * task.attempt, 'time' ) } + } + withLabel:process_long { + time = { check_max( 20.h * task.attempt, 'time' ) } + } + withLabel:process_high_memory { + memory = { check_max( 200.GB * task.attempt, 'memory' ) } + } + withLabel:error_ignore { + errorStrategy = 'ignore' + } + withLabel:error_retry { + errorStrategy = 'retry' + maxRetries = 2 + + } + withName:CUSTOM_DUMPSOFTWAREVERSIONS { + cache = false + } +} diff --git a/dockerfiles/general_container/Dockerfile b/dockerfiles/general_container/Dockerfile new file mode 100644 index 0000000..bcec54f --- /dev/null +++ b/dockerfiles/general_container/Dockerfile @@ -0,0 +1,9 @@ +FROM condaforge/mambaforge:latest AS conda + +COPY environment.yml . + +RUN /opt/conda/bin/mamba env create -f /environment.yml + +ENV PATH=/opt/conda/envs/scylla_general/bin:$PATH + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/dockerfiles/general_container/environment.yml b/dockerfiles/general_container/environment.yml new file mode 100644 index 0000000..97c010f --- /dev/null +++ b/dockerfiles/general_container/environment.yml @@ -0,0 +1,12 @@ +name: scylla_general +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - python<=3.10 + - pip + - biopython + - Mako + - pandas + - numpy \ No newline at end of file diff --git a/modules/assemble_taxa.nf b/modules/assemble_taxa.nf index da7134a..bbbc92b 100644 --- a/modules/assemble_taxa.nf +++ b/modules/assemble_taxa.nf @@ -8,15 +8,26 @@ // ALSO this step will currently "fail" with exitcode 2 if the number of human reads found exceeds the number specified // in config so could be good dehuman sanity check process extract_reads { + tag "$meta.id" + label 'process_medium' + publishDir path: "${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy' + + conda 'bioconda::biopython=1.78' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+        'https://depot.galaxyproject.org/singularity/biopython%3A1.76' :
+        'biocontainers/biopython@sha256:b0204cf662a3d858f6c28627124b83ed6f564e2b156b8788092f2dd9256c9290' }"
+
     input:
     val unique_id
     path fastq
     path kraken_assignments
     path kraken_report
     path bracken_report
+
     output:
     path "reads.*.f*.gz"
+
     script:
     """
     $projectDir/../bin/extract_kraken_reads.py \
@@ -42,7 +53,62 @@ process extract_reads {
 // either be as a result of the pipeline failing for one of many reasons, or because the file generated at the end
 // of a successful pipeline run is empty, in which case it would be meaningless to copy
 // I've not had a successful test case with the very small numbers of reads/database on my laptop
+
+process flye_assemble {
+    tag "$meta.id"
+    label 'process_high'
+
+    conda "bioconda::flye=2.9"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/flye:2.9--py39h6935b12_1' :
+        'biocontainers/flye:2.9--py39h6935b12_1' }"
+
+    input:
+    val unique_id
+    path fastq
+    output:
+    path "assembly.*.f*.gz"
+
+    script:
+    outfile = "${fastq.name}".replaceAll("reads","assembly")
+
+    """
+    flye \
+        --nano-raw ${fastq} \
+        -o "assembly"
+    if [ -s assembly/assembly.fasta ]
+    then
+        mv assembly/assembly.fasta ${outfile}
+        gzip ${outfile}
+    fi
+    """
+
+}
+
+process megahit_assemble {
+    tag "$meta.id"
+    label 'process_high'
+
+    conda "bioconda::megahit=1.2.9"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-0f92c152b180c7cd39d9b0e6822f8c89ccb59c99:8ec213d21e5d03f9db54898a2baeaf8ec729b447-0' :
+        'biocontainers/mulled-v2-0f92c152b180c7cd39d9b0e6822f8c89ccb59c99:8ec213d21e5d03f9db54898a2baeaf8ec729b447-0' }"
+
+    input:
+    val unique_id
+    path fastq
+    output:
+    path "assembly.*.f*.gz"
+
+    script:
+    outfile = "${fastq.name}".replaceAll("reads","assembly")
+
+    """
+    megahit \
+        -r ${fastq} \
+        -o "assembly"
+    if [ -s assembly/final.contigs.fa ]
+    then
+        mv assembly/final.contigs.fa ${outfile}
+        gzip ${outfile}
+    fi
+    """
+}
+
 process denovo_assemble {
+    errorStrategy 'ignore'
     publishDir path: "${params.out_dir}/${unique_id}/assemblies_by_taxa/", mode: 'copy'
     input:
diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf
index b563700..7794fda 100644
--- a/modules/extract_taxa.nf
+++ b/modules/extract_taxa.nf
@@ -8,7 +8,16 @@
 // ALSO this step will currently "fail" with exitcode 2 if the number of human reads found exceeds the number specified
 // in config so could be good dehuman sanity check
 process extract_reads {
+    tag "$meta.id"
+    label 'process_medium'
+
     publishDir path: "${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy'
+
+    conda 'bioconda::biopython=1.78'
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/biopython%3A1.76' :
+        'biocontainers/biopython@sha256:b0204cf662a3d858f6c28627124b83ed6f564e2b156b8788092f2dd9256c9290' }"
+
     input:
     val unique_id
     path fastq
diff --git a/modules/fastp.nf b/modules/fastp.nf
new file mode 100644
index 0000000..8ddd44d
--- /dev/null
+++ b/modules/fastp.nf
@@ -0,0 +1,70 @@
+process fastp_paired {
+    tag "$meta.id"
+    label 'process_medium'
+
+    publishDir "${params.out_dir}/preprocess/", mode: 'copy'
+
+    conda "bioconda::fastp=0.23.4"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/fastp:0.23.4--h5f740d0_0' :
+        'biocontainers/fastp:0.23.4--h5f740d0_0' }"
+
+    input:
+    val(prefix)
+    path(fastq_1)
+    path(fastq_2)
+
+    output:
+    val(prefix)
+    path("${prefix}_1.fastp.fastq.gz")
+    path("${prefix}_2.fastp.fastq.gz")
+    path("${prefix}.fastp.fastq.gz"), emit: processed_fastq
+    path("${prefix}.fastp.json")
+
+    script:
+    """
+    fastp \\
+        --in1 ${fastq_1} \\
+        --in2 ${fastq_2} \\
+        --merge \\
+        --out1 ${prefix}_1.fastp.fastq.gz \\
+        --out2 ${prefix}_2.fastp.fastq.gz \\
+        --merged_out ${prefix}.fastp.fastq.gz \\
+        --json ${prefix}.fastp.json \\
+        --thread $task.cpus \\
+        --detect_adapter_for_pe \\
+        2> ${prefix}.fastp.log
+    """
+
+}
+
+process fastp_single {
+    tag "$meta.id"
+    label 'process_medium'
+
+    publishDir "${params.out_dir}/preprocess/", mode: 'copy'
+
+    conda "bioconda::fastp=0.23.4"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/fastp:0.23.4--h5f740d0_0' :
+        'biocontainers/fastp:0.23.4--h5f740d0_0' }"
+
+    input:
+    val(prefix)
+    path(fastq)
+
+    output:
+    val(prefix)
+    path("${prefix}.fastp.fastq.gz"), emit: processed_fastq
+    path("${prefix}.fastp.json")
+
+    script:
+
+    """
+    fastp \\
+        --in1 ${fastq} \\
+        --out1 ${prefix}.fastp.fastq.gz \\
+        --json ${prefix}.fastp.json \\
+        --thread $task.cpus \\
+        2> ${prefix}.fastp.log
+    """
+}
\ No newline at end of file
diff --git a/modules/generate_report.nf b/modules/generate_report.nf
index 4a4e171..ffdf0d5 100644
--- a/modules/generate_report.nf
+++ b/modules/generate_report.nf
@@ -1,8 +1,13 @@
 // module to make a single sample report during ingest containing kraken summary and qc stats
 process make_report {
+    tag "$meta.id"
+    label "process_low"
+
     publishDir path: "${params.out_dir}/${unique_id}", mode: 'copy'
-    maxForks 1
-    cpus 1
+
+    conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3"
+    container "biowilko/scylla_general:0.0.1"
+
     input:
     val unique_id
     path stats
@@ -29,7 +34,7 @@ workflow generate_report {
     bracken_jsons
     main:
     // Acquire report template
-    template = file("$projectDir/../bin/scylla.mako.html")
+    template = file("$baseDir/bin/scylla.mako.html")
 
     make_report(unique_id, stats, bracken_jsons, template)
     emit:
diff --git a/modules/kraken_client.nf b/modules/kraken_client.nf
index 7302e04..756f290 100644
--- a/modules/kraken_client.nf
+++ b/modules/kraken_client.nf
@@ -1,18 +1,14 @@
 include { qc_checks } from '../modules/qc_checks'
 include { generate_report } from '../modules/generate_report'
-
-kraken_compute = params.threads == 1 ? 1 : params.threads - 1
-
 process kraken2_client {
-    errorStrategy { sleep(Math.pow(2, task.attempt) * 200 as long); return 'retry' }
-    maxRetries 5
-    label "scylla"
+    tag "$meta.id"
+    label 'process_low'
+    label 'error_retry'
+
+    conda "epi2melabs::kraken2-server=0.1.3"
+    container 'ontresearch/wf-metagenomics:shac290da60032a3a6c9c01808d58a71a0f17957681'
 
-    containerOptions {workflow.profile != "singularity" ? "--network host" : ""}
-    // retry if server responds out of resource
-    errorStrategy = { task.exitStatus in [8] ?
'retry' : 'finish' } - maxForks kraken_compute input: path fastq output: @@ -28,6 +24,8 @@ process kraken2_client { } process combine_kraken_outputs { + + publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' input: val unique_id @@ -72,7 +70,16 @@ process determine_bracken_length { // this fails if the kraken file input is empty - currently have no check that it is populated process bracken { + tag "$meta.id" + label 'process_low' + publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' + + conda "bioconda::bracken=2.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bracken:2.7--py39hc16433a_0': + 'biocontainers/bracken:2.7--py39hc16433a_0' }" + input: val unique_id path kraken_report @@ -93,7 +100,14 @@ process bracken { } process bracken_to_json { + tag "$meta.id" + label "process_low" + publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' + + conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3" + container "biowilko/scylla_general:0.0.1" + input: val unique_id path kraken_report diff --git a/modules/qc_checks.nf b/modules/qc_checks.nf index aa139ca..a6777a4 100644 --- a/modules/qc_checks.nf +++ b/modules/qc_checks.nf @@ -1,6 +1,12 @@ // module for other ingest qc checks process read_stats { + tag "$meta.id" + label "process_medium" + + conda "epi2melabs::kraken2-server=0.1.3" + container 'ontresearch/wf-metagenomics:shac290da60032a3a6c9c01808d58a71a0f17957681' + input: path fastq output: @@ -14,7 +20,14 @@ process read_stats { } process combine_stats { + tag "$meta.id" + label "process_low" + publishDir "${params.out_dir}/${unique_id}/qc", mode: 'copy' + + conda "conda-forge::pandas=1.2.4 conda-forge::numpy=1.20.3" + container "biowilko/scylla_general:0.0.1" + input: val unique_id path stats @@ -24,7 +37,7 @@ process combine_stats { script: """ cp ${stats} "raw_reads.stats" - $projectDir/../bin/fastcat_histogram.py \ + fastcat_histogram.py \ --sample_id "raw_reads" \ "raw_reads.stats" "raw_reads.json" """ diff --git a/nextflow.config b/nextflow.config index e09769a..393b000 100644 --- a/nextflow.config +++ b/nextflow.config @@ -11,6 +11,31 @@ params { + // TODO nf-core: Specify your pipeline's command line flags + // Input options + input = null + + // Boilerplate options + outdir = null + tracedir = "${params.outdir}/pipeline_info" + publish_dir_mode = 'copy' + email = null + email_on_fail = null + plaintext_email = false + monochrome_logs = false + hook_url = null + help = false + version = false + validate_params = true + show_hidden_params = false + schema_ignore_params = 'genomes' + + // Max resource options + // Defaults only, expecting to be overwritten + max_memory = '128.GB' + max_cpus = 16 + max_time = '240.h' + help = false version = false wfversion = "v0.0.1" @@ -29,29 +54,6 @@ params { bracken_length = null bracken_level = 'S' database_set = "Viral" - database_sets = [ - 'Viral': [ - 'database': 'https://genome-idx.s3.amazonaws.com/kraken/k2_viral_20221209.tar.gz', - 'taxonomy': 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz' - ], - 'EuPath': [ - 'database': 'https://genome-idx.s3.amazonaws.com/kraken/k2_eupathdb48_20201113.tar.gz', - 'taxonomy': 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz' - ], - 'PlusPF-8': [ - 'database': 'https://genome-idx.s3.amazonaws.com/kraken/k2_pluspf_8gb_20210517.tar.gz', - 'taxonomy': 
'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz' - ], - 'ncbi_16s_18s': [ - 'reference': 'https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-metagenomics/ncbi_16s_18s/ncbi_targeted_loci_16s_18s.fna', - 'refindex': 'https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-metagenomics/ncbi_16s_18s/ncbi_targeted_loci_16s_18s.fna.fai', - 'database': 'https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-metagenomics/ncbi_16s_18s/ncbi_targeted_loci_kraken2.tar.gz', - 'kmer_dist': 'https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-metagenomics/ncbi_16s_18s/database1000mers.kmer_distrib', - 'ref2taxid': 'https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-metagenomics/ncbi_16s_18s/ref2taxid.targloci.tsv', - 'taxonomy': 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz' - ] - ] - kraken_report = null bracken_report = null @@ -62,16 +64,6 @@ params { max_human_reads_before_rejection = 1000 read_type = "illumina" - - - - - - - - - - disable_ping = false threads = 2 k2_port = 8080 @@ -95,6 +87,8 @@ params { } } +// Load base.config by default for all pipelines +includeConfig 'conf/base.config' manifest { name = 'snowy-leopard/scylla' @@ -106,57 +100,164 @@ manifest { version = 'v0.0.1' } - -executor { - $local { - cpus = 8 - memory = "8 GB" +profiles { + debug { + dumpHashes = true + process.beforeScript = 'echo $HOSTNAME' + cleanup = false } -} - -// used by default for "standard" (docker) and singularity profiles, -// other profiles may override. -process { - withLabel:scylla { - container = "scylla:latest" + conda { + conda.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false } - shell = ['/bin/bash', '-euo', 'pipefail'] -} - - -profiles { - // the "standard" profile is used implicitely by nextflow - // if no other profile is given on the CLI - standard { - docker { - enabled = true - // this ensures container is run as host user and group, but - // also adds host user to the within-container group - runOptions = "--user \$(id -u):\$(id -g) --group-add 100" - } + mamba { + conda.enabled = true + conda.useMamba = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false + } + docker { + docker.enabled = true + docker.registry = 'quay.io' + docker.userEmulation = true + conda.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false + } + arm { + docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' } - - // using singularity instead of docker singularity { - singularity { - enabled = true - autoMounts = true - } + singularity.enabled = true + singularity.autoMounts = true + conda.enabled = false + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false + } + podman { + podman.enabled = true + podman.registry = 'quay.io' + conda.enabled = false + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false + } + shifter { + shifter.enabled = true + conda.enabled = false + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + apptainer.enabled = false + } + charliecloud { + 
charliecloud.enabled = true + conda.enabled = false + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + apptainer.enabled = false } + apptainer { + apptainer.enabled = true + conda.enabled = false + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + gitpod { + executor.name = 'local' + executor.cpus = 16 + executor.memory = 60.GB + } + test { includeConfig 'conf/test.config' } + test_full { includeConfig 'conf/test_full.config' } +} - conda { - conda.enabled = true - } +// Export these variables to prevent local Python/R libraries from conflicting with those in the container +// The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container. +// See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable. - // local profile for simplified development testing - local { - process.executor = 'local' - } +env { + PYTHONNOUSERSITE = 1 + R_PROFILE_USER = "/.Rprofile" + R_ENVIRON_USER = "/.Renviron" + JULIA_DEPOT_PATH = "/usr/local/share/julia" } +// Capture exit codes from upstream processes when piping +process.shell = ['/bin/bash', '-euo', 'pipefail'] +def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') +timeline { + enabled = true + file = "${params.tracedir}/execution_timeline_${trace_timestamp}.html" +} report { - enabled = true - file = "${params.out_dir}/execution/report.html" -} \ No newline at end of file + enabled = true + file = "${params.tracedir}/execution_report_${trace_timestamp}.html" +} +trace { + enabled = true + file = "${params.tracedir}/execution_trace_${trace_timestamp}.txt" +} +dag { + enabled = true + file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.html" +} + + +// Function to ensure that resource requirements don't go beyond +// a maximum limit +def check_max(obj, type) { + if (type == 'memory') { + try { + if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) + return params.max_memory as nextflow.util.MemoryUnit + else + return obj + } catch (all) { + println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj" + return obj + } + } else if (type == 'time') { + try { + if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1) + return params.max_time as nextflow.util.Duration + else + return obj + } catch (all) { + println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" + return obj + } + } else if (type == 'cpus') { + try { + return Math.min( obj, params.max_cpus as int ) + } catch (all) { + println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! 
Using default value: $obj" + return obj + } + } +} diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf index eed53a1..5e0d885 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -1,7 +1,8 @@ // workflow to run kraken, check for human, run qc checks and generate html report for a single sample fastq -include { get_params_and_versions } from '../modules/get_params_and_versions' -include { kraken_pipeline } from '../subworkflows/kraken_pipeline' -include { extract_taxa } from '../modules/extract_taxa' +include { get_params_and_versions } from '$baseDir/modules/get_params_and_versions' +include { kraken_pipeline } from '$baseDir/subworkflows/kraken_pipeline' +include { extract_taxa } from '$baseDir/modules/extract_taxa' +include { fastp_single, fastp_paired} from '$baseDir/modules/fastp' workflow ingest { take: @@ -19,13 +20,20 @@ workflow ingest { } workflow { - // check input fastq exists - input_fastq = file("${params.fastq}", type: "file", checkIfExists:true) - - unique_id = "${params.unique_id}" - if (unique_id == "null") { - unique_id = "${input_fastq.simpleName}" + // check input fastq exists and run fastp + + if ($params.paired) { + input_fastq_1 = file("${params.fastq1}", type: "file", checkIfExists:true) + input_fastq_2 = file("${params.fastq2}", type: "file", checkIfExists:true) + fastp_paired("${params.unique_id}", input_fastq_1, input_fastq_2) + fastp_paired.out.processed_fastq + .set {processed_fastq} + } else { + input_fastq = file("${params.fastq}", type: "file", checkIfExists:true) + fastp_single("${params.unique_id}", input_fastq) + fastp_single.out.processed_fastq + .set {processed_fastq} } - ingest(unique_id, input_fastq) + ingest(unique_id, processed_fastq) } \ No newline at end of file From d21eebd6eba519dada01ed91dff02305aff170ba Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 11:38:11 +0100 Subject: [PATCH 02/77] better db handling --- subworkflows/kraken_pipeline.nf | 48 +++++++++++++++++---------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/subworkflows/kraken_pipeline.nf b/subworkflows/kraken_pipeline.nf index e176c40..4ac86ee 100644 --- a/subworkflows/kraken_pipeline.nf +++ b/subworkflows/kraken_pipeline.nf @@ -9,32 +9,34 @@ workflow kraken_pipeline { main: // Check source param is valid - sources = params.database_sets - source_name = params.database_set - source_data = sources.get(source_name, false) - if (!sources.containsKey(source_name) || !source_data) { - keys = sources.keySet() - throw new Exception("Source $params.source is invalid, must be one of $keys") - } + // sources = params.database_sets + // source_name = params.database_set + // source_data = sources.get(source_name, false) + // if (!sources.containsKey(source_name) || !source_data) { + // keys = sources.keySet() + // throw new Exception("Source $params.source is invalid, must be one of $keys") + // } // Grab taxonomy files - taxonomy = file(sources[source_name]["taxonomy"], type: "file") - if (params.taxonomy) { - log.info("Checking custom taxonomy mapping exists") - taxonomy = file(params.taxonomy, type: "dir", checkIfExists:true) - } + // taxonomy = file(sources[source_name]["taxonomy"], type: "file") + // if (params.taxonomy) { + // log.info("Checking custom taxonomy mapping exists") + + // } + + taxonomy = file("https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz", type: "dir", checkIfExists:true) - if (database) { - source_database = database - } else { - source_database = source_data.get("database", false) - if 
(!source_database) { - throw new Exception( - "Error: Source $source_name does not include a database for " - + "use with kraken, please choose another source, " - + "provide a custom database or disable kraken.") - } - database = file(source_database, type: "file") + // if (database) { + // source_database = database + // } else { + // source_database = source_data.get("database", false) + // if (!source_database) { + // throw new Exception( + // "Error: Source $source_name does not include a database for " + // + "use with kraken, please choose another source, " + // + "provide a custom database or disable kraken.") + // } + database = file($params.db, type: "dir", checkIfExists:true) // start_server(database, taxonomy) run_kraken_and_bracken(unique_id, fastq, database, taxonomy) From ad91dd3234cd065838a6f801c801ba38a015b5c8 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 11:45:00 +0100 Subject: [PATCH 03/77] it needs a main.nf.... --- main.nf | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 main.nf diff --git a/main.nf b/main.nf new file mode 100644 index 0000000..5e0d885 --- /dev/null +++ b/main.nf @@ -0,0 +1,39 @@ +// workflow to run kraken, check for human, run qc checks and generate html report for a single sample fastq +include { get_params_and_versions } from '$baseDir/modules/get_params_and_versions' +include { kraken_pipeline } from '$baseDir/subworkflows/kraken_pipeline' +include { extract_taxa } from '$baseDir/modules/extract_taxa' +include { fastp_single, fastp_paired} from '$baseDir/modules/fastp' + +workflow ingest { + take: + unique_id + fastq + //metadata + main: + get_params_and_versions() + //clean_metadata(metadata) + kraken_pipeline(unique_id, fastq) + extract_taxa(unique_id, fastq, kraken_pipeline.out.kraken_assignments, kraken_pipeline.out.kraken_report, kraken_pipeline.out.bracken_report) + emit: + html_report = kraken_pipeline.out.report + +} + +workflow { + // check input fastq exists and run fastp + + if ($params.paired) { + input_fastq_1 = file("${params.fastq1}", type: "file", checkIfExists:true) + input_fastq_2 = file("${params.fastq2}", type: "file", checkIfExists:true) + fastp_paired("${params.unique_id}", input_fastq_1, input_fastq_2) + fastp_paired.out.processed_fastq + .set {processed_fastq} + } else { + input_fastq = file("${params.fastq}", type: "file", checkIfExists:true) + fastp_single("${params.unique_id}", input_fastq) + fastp_single.out.processed_fastq + .set {processed_fastq} + } + + ingest(unique_id, processed_fastq) +} \ No newline at end of file From ad9ed8677bbed3af957815ff2ae4634784023948 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 11:48:10 +0100 Subject: [PATCH 04/77] properly define mainscript --- main.nf | 39 --------------------------------------- nextflow.config | 2 +- 2 files changed, 1 insertion(+), 40 deletions(-) delete mode 100644 main.nf diff --git a/main.nf b/main.nf deleted file mode 100644 index 5e0d885..0000000 --- a/main.nf +++ /dev/null @@ -1,39 +0,0 @@ -// workflow to run kraken, check for human, run qc checks and generate html report for a single sample fastq -include { get_params_and_versions } from '$baseDir/modules/get_params_and_versions' -include { kraken_pipeline } from '$baseDir/subworkflows/kraken_pipeline' -include { extract_taxa } from '$baseDir/modules/extract_taxa' -include { fastp_single, fastp_paired} from '$baseDir/modules/fastp' - -workflow ingest { - take: - unique_id - fastq - //metadata - main: - get_params_and_versions() - 
//clean_metadata(metadata) - kraken_pipeline(unique_id, fastq) - extract_taxa(unique_id, fastq, kraken_pipeline.out.kraken_assignments, kraken_pipeline.out.kraken_report, kraken_pipeline.out.bracken_report) - emit: - html_report = kraken_pipeline.out.report - -} - -workflow { - // check input fastq exists and run fastp - - if ($params.paired) { - input_fastq_1 = file("${params.fastq1}", type: "file", checkIfExists:true) - input_fastq_2 = file("${params.fastq2}", type: "file", checkIfExists:true) - fastp_paired("${params.unique_id}", input_fastq_1, input_fastq_2) - fastp_paired.out.processed_fastq - .set {processed_fastq} - } else { - input_fastq = file("${params.fastq}", type: "file", checkIfExists:true) - fastp_single("${params.unique_id}", input_fastq) - fastp_single.out.processed_fastq - .set {processed_fastq} - } - - ingest(unique_id, processed_fastq) -} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 393b000..9646fcf 100644 --- a/nextflow.config +++ b/nextflow.config @@ -95,7 +95,7 @@ manifest { author = 'Snowy Leopard' homePage = 'https://github.com/snowy-leopard/scylla' description = 'Classify metagenomic sequence data from human respiratory infections.' - mainScript = 'ingest.nf' + mainScript = 'subworkflows/ingest.nf' nextflowVersion = '>=20.10.0' version = 'v0.0.1' } From d5ae2f3e22d5b8f3c33984be44097038eb1ea76a Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 11:51:53 +0100 Subject: [PATCH 05/77] why use commas like everybody else? --- subworkflows/ingest.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf index 5e0d885..9f23394 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -2,7 +2,7 @@ include { get_params_and_versions } from '$baseDir/modules/get_params_and_versions' include { kraken_pipeline } from '$baseDir/subworkflows/kraken_pipeline' include { extract_taxa } from '$baseDir/modules/extract_taxa' -include { fastp_single, fastp_paired} from '$baseDir/modules/fastp' +include { fastp_single; fastp_paired} from '$baseDir/modules/fastp' workflow ingest { take: From 59fc786709364edf71ab9a6acd31091e8ed36d75 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 11:54:29 +0100 Subject: [PATCH 06/77] fix includes again --- subworkflows/ingest.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf index 9f23394..0daf513 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -1,8 +1,8 @@ // workflow to run kraken, check for human, run qc checks and generate html report for a single sample fastq -include { get_params_and_versions } from '$baseDir/modules/get_params_and_versions' -include { kraken_pipeline } from '$baseDir/subworkflows/kraken_pipeline' -include { extract_taxa } from '$baseDir/modules/extract_taxa' -include { fastp_single; fastp_paired} from '$baseDir/modules/fastp' +include { get_params_and_versions } from '../modules/get_params_and_versions' +include { kraken_pipeline } from '../subworkflows/kraken_pipeline' +include { extract_taxa } from '../modules/extract_taxa' +include { fastp_single; fastp_paired} from '../modules/fastp' workflow ingest { take: From 504ca03fd85f4fc9ce07faeaf8b95d5f880b1018 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 11:56:04 +0100 Subject: [PATCH 07/77] not $params --- subworkflows/ingest.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf 
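The hunk just below is the whole fix: in Nextflow the dollar sign only marks interpolation inside double-quoted strings, so plain code has to reference the params map directly, and patches 08 and 20 chase the same pattern elsewhere in the tree. Roughly:

    // inside a double-quoted string, interpolation is correct:
    log.info "processing sample ${params.unique_id}"

    // in plain Groovy code there is no dollar sign:
    if (params.paired) {
        fastq_1 = file(params.fastq1, type: "file", checkIfExists: true)
    }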
index 0daf513..9517f5b 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -22,7 +22,7 @@ workflow ingest { workflow { // check input fastq exists and run fastp - if ($params.paired) { + if (params.paired) { input_fastq_1 = file("${params.fastq1}", type: "file", checkIfExists:true) input_fastq_2 = file("${params.fastq2}", type: "file", checkIfExists:true) fastp_paired("${params.unique_id}", input_fastq_1, input_fastq_2) From e98551d307163eb325fbba179d9b66f77ecb0ba1 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 11:59:59 +0100 Subject: [PATCH 08/77] params not strings? --- subworkflows/ingest.nf | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf index 9517f5b..98b9096 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -23,14 +23,14 @@ workflow { // check input fastq exists and run fastp if (params.paired) { - input_fastq_1 = file("${params.fastq1}", type: "file", checkIfExists:true) - input_fastq_2 = file("${params.fastq2}", type: "file", checkIfExists:true) - fastp_paired("${params.unique_id}", input_fastq_1, input_fastq_2) + input_fastq_1 = file(params.fastq1, type: "file", checkIfExists:true) + input_fastq_2 = file(params.fastq2, type: "file", checkIfExists:true) + fastp_paired(params.unique_id, input_fastq_1, input_fastq_2) fastp_paired.out.processed_fastq .set {processed_fastq} } else { - input_fastq = file("${params.fastq}", type: "file", checkIfExists:true) - fastp_single("${params.unique_id}", input_fastq) + input_fastq = file(params.fastq, type: "file", checkIfExists:true) + fastp_single(params.unique_id, input_fastq) fastp_single.out.processed_fastq .set {processed_fastq} } From 2fd603b0094b691246792d60f705f23589296b05 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 12:14:06 +0100 Subject: [PATCH 09/77] remove $meta.id --- modules/assemble_taxa.nf | 6 +++--- modules/extract_taxa.nf | 2 +- modules/fastp.nf | 4 ++-- modules/generate_report.nf | 2 +- modules/get_params_and_versions.nf | 3 +++ modules/kraken_client.nf | 6 +++--- modules/qc_checks.nf | 4 ++-- subworkflows/ingest.nf | 2 +- subworkflows/kraken_pipeline.nf | 2 +- 9 files changed, 17 insertions(+), 14 deletions(-) diff --git a/modules/assemble_taxa.nf b/modules/assemble_taxa.nf index bbbc92b..01d741f 100644 --- a/modules/assemble_taxa.nf +++ b/modules/assemble_taxa.nf @@ -8,7 +8,7 @@ // ALSO this step will currently "fail" with exitcode 2 if the number of human reads found exceeds the number specified // in config so could be good dehuman sanity check process extract_reads { - tag "$meta.id" + label 'process_medium' publishDir path: "${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy' @@ -55,7 +55,7 @@ process extract_reads { // I've not had a successful test case with the very small numbers of reads/database on my laptop process flye_assemble { - tag "$meta.id" + label 'process_high' conda "bioconda::flye=2.9" @@ -86,7 +86,7 @@ process flye_assemble { } process megahit_assemble { - tag "$meta.id" + label 'process_high' conda "bioconda::megahit=1.2.9" diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf index 7794fda..66a3cbc 100644 --- a/modules/extract_taxa.nf +++ b/modules/extract_taxa.nf @@ -8,7 +8,7 @@ // ALSO this step will currently "fail" with exitcode 2 if the number of human reads found exceeds the number specified // in config so could be good dehuman sanity check process extract_reads { - tag "$meta.id" + label 'process_medium' publishDir path: 
"${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy' diff --git a/modules/fastp.nf b/modules/fastp.nf index 8ddd44d..bc8c8f5 100644 --- a/modules/fastp.nf +++ b/modules/fastp.nf @@ -1,5 +1,5 @@ process fastp_paired { - tag "$meta.id" + label 'process_medium' publishDir "${params.out_dir}/preprocess/", mode: 'copy' @@ -38,7 +38,7 @@ process fastp_paired { } process fastp_single { - tag "$meta.id" + label 'process_medium' publishDir "${params.out_dir}/preprocess/", mode: 'copy' diff --git a/modules/generate_report.nf b/modules/generate_report.nf index ffdf0d5..0f8afff 100644 --- a/modules/generate_report.nf +++ b/modules/generate_report.nf @@ -1,6 +1,6 @@ // module to make a single sample report during ingest containing kraken summary and qc stats process make_report { - tag "$meta.id" + label "process_low" publishDir path: "${params.out_dir}/${unique_id}", mode: 'copy' diff --git a/modules/get_params_and_versions.nf b/modules/get_params_and_versions.nf index 629c54e..cea7a8b 100644 --- a/modules/get_params_and_versions.nf +++ b/modules/get_params_and_versions.nf @@ -3,6 +3,9 @@ import groovy.json.JsonBuilder process get_versions { + + + label "scylla" publishDir "${params.out_dir}/execution", mode: 'copy' cpus 1 diff --git a/modules/kraken_client.nf b/modules/kraken_client.nf index 756f290..9850087 100644 --- a/modules/kraken_client.nf +++ b/modules/kraken_client.nf @@ -2,7 +2,7 @@ include { qc_checks } from '../modules/qc_checks' include { generate_report } from '../modules/generate_report' process kraken2_client { - tag "$meta.id" + label 'process_low' label 'error_retry' @@ -70,7 +70,7 @@ process determine_bracken_length { // this fails if the kraken file input is empty - currently have no check that it is populated process bracken { - tag "$meta.id" + label 'process_low' publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' @@ -100,7 +100,7 @@ process bracken { } process bracken_to_json { - tag "$meta.id" + label "process_low" publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' diff --git a/modules/qc_checks.nf b/modules/qc_checks.nf index a6777a4..ef32b98 100644 --- a/modules/qc_checks.nf +++ b/modules/qc_checks.nf @@ -1,7 +1,7 @@ // module for other ingest qc checks process read_stats { - tag "$meta.id" + label "process_medium" conda "epi2melabs::kraken2-server=0.1.3" @@ -20,7 +20,7 @@ process read_stats { } process combine_stats { - tag "$meta.id" + label "process_low" publishDir "${params.out_dir}/${unique_id}/qc", mode: 'copy' diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf index 98b9096..7a9c194 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -35,5 +35,5 @@ workflow { .set {processed_fastq} } - ingest(unique_id, processed_fastq) + ingest(params.unique_id, processed_fastq) } \ No newline at end of file diff --git a/subworkflows/kraken_pipeline.nf b/subworkflows/kraken_pipeline.nf index 4ac86ee..ee80101 100644 --- a/subworkflows/kraken_pipeline.nf +++ b/subworkflows/kraken_pipeline.nf @@ -36,7 +36,7 @@ workflow kraken_pipeline { // + "use with kraken, please choose another source, " // + "provide a custom database or disable kraken.") // } - database = file($params.db, type: "dir", checkIfExists:true) + database = file(params.db, type: "dir", checkIfExists:true) // start_server(database, taxonomy) run_kraken_and_bracken(unique_id, fastq, database, taxonomy) From 414cb12f3d76dd384b40c97095638cebca24d6a6 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 16:28:25 +0100 Subject: 
[PATCH 10/77] Make main.nf --- dockerfiles/general_container/environment.yml | 12 ----- .../{general_container => scylla}/Dockerfile | 4 +- dockerfiles/scylla/environment.yml | 16 +++++++ main.nf | 27 ++++++++++++ modules/kraken_client.nf | 15 +++++-- modules/qc_checks.nf | 4 +- nextflow.config | 2 +- subworkflows/ingest.nf | 44 +++++++++++-------- 8 files changed, 86 insertions(+), 38 deletions(-) delete mode 100644 dockerfiles/general_container/environment.yml rename dockerfiles/{general_container => scylla}/Dockerfile (65%) create mode 100644 dockerfiles/scylla/environment.yml create mode 100644 main.nf diff --git a/dockerfiles/general_container/environment.yml b/dockerfiles/general_container/environment.yml deleted file mode 100644 index 97c010f..0000000 --- a/dockerfiles/general_container/environment.yml +++ /dev/null @@ -1,12 +0,0 @@ -name: scylla_general -channels: - - bioconda - - conda-forge - - defaults -dependencies: - - python<=3.10 - - pip - - biopython - - Mako - - pandas - - numpy \ No newline at end of file diff --git a/dockerfiles/general_container/Dockerfile b/dockerfiles/scylla/Dockerfile similarity index 65% rename from dockerfiles/general_container/Dockerfile rename to dockerfiles/scylla/Dockerfile index bcec54f..20980ca 100644 --- a/dockerfiles/general_container/Dockerfile +++ b/dockerfiles/scylla/Dockerfile @@ -4,6 +4,6 @@ COPY environment.yml . RUN /opt/conda/bin/mamba env create -f /environment.yml -ENV PATH=/opt/conda/envs/scylla_general/bin:$PATH +ENV PATH=/opt/conda/envs/scylla/bin:$PATH -CMD ["/bin/bash"] \ No newline at end of file +CMD ["/bin/bash"] diff --git a/dockerfiles/scylla/environment.yml b/dockerfiles/scylla/environment.yml new file mode 100644 index 0000000..667eba1 --- /dev/null +++ b/dockerfiles/scylla/environment.yml @@ -0,0 +1,16 @@ +name: scylla +channels: + - bioconda + - conda-forge + - defaults + - epi2melabs +dependencies: + - python<=3.10 + - pysam + - pip + - kraken2-server # not on mac + - bracken # needs to be manually installed from git and softlinked if Mac M1 + - fastcat + - biopython + - pandas + - numpy \ No newline at end of file diff --git a/main.nf b/main.nf new file mode 100644 index 0000000..0c936b6 --- /dev/null +++ b/main.nf @@ -0,0 +1,27 @@ +include { ingest } from './submodules/ingest' +include { fastp_single; fastp_paired} from '../modules/fastp' + +workflow { + // check input fastq exists and run fastp + + if (params.paired) { + Channel.of(file(params.fastq1, type: "file", checkIfExists:true)) + .set {input_fastq_1_ch} + + Channel.of(file(params.fastq2, type: "file", checkIfExists:true)) + .set {input_fastq_2_ch} + + fastp_paired(params.unique_id, input_fastq_1_ch, input_fastq_2_ch) + fastp_paired.out.processed_fastq + .set {processed_fastq} + } else { + Channel.of(file(params.fastq, type: "file", checkIfExists:true)) + .set {input_fastq_ch} + + fastp_single(params.unique_id, input_fastq_ch) + fastp_single.out.processed_fastq + .set {processed_fastq} + } + + ingest(params.unique_id, processed_fastq) +} \ No newline at end of file diff --git a/modules/kraken_client.nf b/modules/kraken_client.nf index 9850087..8b25fda 100644 --- a/modules/kraken_client.nf +++ b/modules/kraken_client.nf @@ -7,7 +7,7 @@ process kraken2_client { label 'error_retry' conda "epi2melabs::kraken2-server=0.1.3" - container 'ontresearch/wf-metagenomics:shac290da60032a3a6c9c01808d58a71a0f17957681' + container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" input: path fastq @@ -25,8 +25,12 @@ process 
kraken2_client { process combine_kraken_outputs { + label 'process_single' + + container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' + input: val unique_id path kraken_reports @@ -52,7 +56,11 @@ process combine_kraken_outputs { } process determine_bracken_length { - label "scylla" + label "process_low" + + conda "anaconda::sed=4.8" + container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" + input: path database output: @@ -106,7 +114,8 @@ process bracken_to_json { publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3" - container "biowilko/scylla_general:0.0.1" + container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" + input: val unique_id diff --git a/modules/qc_checks.nf b/modules/qc_checks.nf index ef32b98..c825041 100644 --- a/modules/qc_checks.nf +++ b/modules/qc_checks.nf @@ -5,7 +5,7 @@ process read_stats { label "process_medium" conda "epi2melabs::kraken2-server=0.1.3" - container 'ontresearch/wf-metagenomics:shac290da60032a3a6c9c01808d58a71a0f17957681' + container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" input: path fastq @@ -26,7 +26,7 @@ process combine_stats { publishDir "${params.out_dir}/${unique_id}/qc", mode: 'copy' conda "conda-forge::pandas=1.2.4 conda-forge::numpy=1.20.3" - container "biowilko/scylla_general:0.0.1" + container "biowilko/scylla@sha256:362e51bf7f4fe4222aa674b7c75f514fe90d875263016b033ff69258f2eeae43" input: val unique_id diff --git a/nextflow.config b/nextflow.config index 9646fcf..0ec4253 100644 --- a/nextflow.config +++ b/nextflow.config @@ -95,7 +95,7 @@ manifest { author = 'Snowy Leopard' homePage = 'https://github.com/snowy-leopard/scylla' description = 'Classify metagenomic sequence data from human respiratory infections.' 
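The mainScript switch just below is what makes the main.nf created earlier in this patch the real entry point: when the pipeline is launched by name (nextflow run snowy-leopard/scylla), Nextflow resolves the script to run from the manifest, falling back to main.nf in the project root when the field is omitted:

    manifest {
        // resolved relative to the project root on a remote launch;
        // 'main.nf' is also the default when this field is absent
        mainScript = 'main.nf'
    }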
- mainScript = 'subworkflows/ingest.nf' + mainScript = 'main.nf' nextflowVersion = '>=20.10.0' version = 'v0.0.1' } diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf index 7a9c194..1e21a35 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -2,7 +2,6 @@ include { get_params_and_versions } from '../modules/get_params_and_versions' include { kraken_pipeline } from '../subworkflows/kraken_pipeline' include { extract_taxa } from '../modules/extract_taxa' -include { fastp_single; fastp_paired} from '../modules/fastp' workflow ingest { take: @@ -10,7 +9,7 @@ workflow ingest { fastq //metadata main: - get_params_and_versions() + // get_params_and_versions() //clean_metadata(metadata) kraken_pipeline(unique_id, fastq) extract_taxa(unique_id, fastq, kraken_pipeline.out.kraken_assignments, kraken_pipeline.out.kraken_report, kraken_pipeline.out.bracken_report) @@ -19,21 +18,30 @@ workflow ingest { } -workflow { - // check input fastq exists and run fastp +// workflow { +// // check input fastq exists and run fastp - if (params.paired) { - input_fastq_1 = file(params.fastq1, type: "file", checkIfExists:true) - input_fastq_2 = file(params.fastq2, type: "file", checkIfExists:true) - fastp_paired(params.unique_id, input_fastq_1, input_fastq_2) - fastp_paired.out.processed_fastq - .set {processed_fastq} - } else { - input_fastq = file(params.fastq, type: "file", checkIfExists:true) - fastp_single(params.unique_id, input_fastq) - fastp_single.out.processed_fastq - .set {processed_fastq} - } +// if (params.paired) { +// Channel.of(file(file(params.fastq1, type: "file", checkIfExists:true))) +// .set {input_fastq_1_ch} - ingest(params.unique_id, processed_fastq) -} \ No newline at end of file +// Channel.of(file(file(params.fastq2, type: "file", checkIfExists:true))) +// .set {input_fastq_2_ch} + +// // input_fastq_1 = file(params.fastq1, type: "file", checkIfExists:true) +// // input_fastq_2 = file(params.fastq2, type: "file", checkIfExists:true) +// fastp_paired(params.unique_id, input_fastq_1_ch, input_fastq_2_ch) +// fastp_paired.out.processed_fastq +// .set {processed_fastq} +// } else { +// Channel.of(file(file(params.fastq, type: "file", checkIfExists:true))) +// .set {input_fastq_ch} + +// // input_fastq = file(params.fastq, type: "file", checkIfExists:true) +// fastp_single(params.unique_id, input_fastq_ch) +// fastp_single.out.processed_fastq +// .set {processed_fastq} +// } + +// ingest(params.unique_id, processed_fastq) +// } \ No newline at end of file From 40a4d1cc7051776e107d56abdb7191dbff9336d5 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 16:30:31 +0100 Subject: [PATCH 11/77] quote localhost --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 0ec4253..7bdf98f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -67,7 +67,7 @@ params { disable_ping = false threads = 2 k2_port = 8080 - k2_host = localhost + k2_host = 'localhost' port = 8080 process_label = "scylla" monochrome_logs = false From 2b5df2530deed0cfce5b42de7f3f049f2bbe9ddc Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 16:31:51 +0100 Subject: [PATCH 12/77] properly include ingest --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 0c936b6..7daaee3 100644 --- a/main.nf +++ b/main.nf @@ -1,4 +1,4 @@ -include { ingest } from './submodules/ingest' +include { ingest } from './subworkflows/ingest' include { fastp_single; fastp_paired} from 
'../modules/fastp' workflow { From 0385711753662ecd0bdaf95029b9d986beda8a91 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 16:32:26 +0100 Subject: [PATCH 13/77] properly include fastp --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 7daaee3..0d67642 100644 --- a/main.nf +++ b/main.nf @@ -1,5 +1,5 @@ include { ingest } from './subworkflows/ingest' -include { fastp_single; fastp_paired} from '../modules/fastp' +include { fastp_single; fastp_paired} from './modules/fastp' workflow { // check input fastq exists and run fastp From 1f137a7a687ca0ec56249fbfc2abb4b6aa3847cb Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 16:40:07 +0100 Subject: [PATCH 14/77] Fix combine stats container --- modules/generate_report.nf | 2 +- modules/qc_checks.nf | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/generate_report.nf b/modules/generate_report.nf index 0f8afff..e4baf1e 100644 --- a/modules/generate_report.nf +++ b/modules/generate_report.nf @@ -6,7 +6,7 @@ process make_report { publishDir path: "${params.out_dir}/${unique_id}", mode: 'copy' conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3" - container "biowilko/scylla_general:0.0.1" + container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" input: val unique_id diff --git a/modules/qc_checks.nf b/modules/qc_checks.nf index c825041..57735e9 100644 --- a/modules/qc_checks.nf +++ b/modules/qc_checks.nf @@ -26,7 +26,8 @@ process combine_stats { publishDir "${params.out_dir}/${unique_id}/qc", mode: 'copy' conda "conda-forge::pandas=1.2.4 conda-forge::numpy=1.20.3" - container "biowilko/scylla@sha256:362e51bf7f4fe4222aa674b7c75f514fe90d875263016b033ff69258f2eeae43" + container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" + input: val unique_id From 8fa35dc745d52420c26e41272238cc716eac8bff Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 16:44:03 +0100 Subject: [PATCH 15/77] no need to use paths to bin --- modules/assemble_taxa.nf | 2 +- modules/extract_taxa.nf | 2 +- modules/generate_report.nf | 2 +- modules/kraken_client.nf | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/assemble_taxa.nf b/modules/assemble_taxa.nf index 01d741f..a2811ab 100644 --- a/modules/assemble_taxa.nf +++ b/modules/assemble_taxa.nf @@ -30,7 +30,7 @@ process extract_reads { script: """ - $projectDir/../bin/extract_kraken_reads.py \ + extract_kraken_reads.py \ -s ${fastq} \ -k ${kraken_assignments} \ -r ${kraken_report} \ diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf index 66a3cbc..996be81 100644 --- a/modules/extract_taxa.nf +++ b/modules/extract_taxa.nf @@ -29,7 +29,7 @@ process extract_reads { path "reads_summary.json", emit: summary script: """ - $projectDir/../bin/extract_kraken_reads.py \ + extract_kraken_reads.py \ -s ${fastq} \ -k ${kraken_assignments} \ -r ${kraken_report} \ diff --git a/modules/generate_report.nf b/modules/generate_report.nf index e4baf1e..bc8dadb 100644 --- a/modules/generate_report.nf +++ b/modules/generate_report.nf @@ -18,7 +18,7 @@ process make_report { script: report_name = "${unique_id}" """ - $projectDir/../bin/make_report.py \ + make_report.py \ --prefix "${report_name}" \ --read_counts ${stats} \ --assignments ${lineages} \ diff --git a/modules/kraken_client.nf b/modules/kraken_client.nf index 8b25fda..3ba91aa 100644 --- a/modules/kraken_client.nf +++ b/modules/kraken_client.nf @@ -46,7 
+46,7 @@ process combine_kraken_outputs { """ } else { """ - $projectDir/../bin/combine_kreports.py \\ + combine_kreports.py \\ -r ${kraken_reports} \\ -o "${params.database_set}.kraken_report.txt" @@ -128,7 +128,7 @@ process bracken_to_json { cat "${bracken_summary}" | cut -f2,6 | tail -n+2 > taxacounts.txt cat "${bracken_summary}" | cut -f2 | tail -n+2 > taxa.txt taxonkit lineage --data-dir ${taxonomy_dir} -R taxa.txt > lineages.txt - $projectDir/../bin/aggregate_lineages_bracken.py \\ + aggregate_lineages_bracken.py \\ -i "lineages.txt" -b "taxacounts.txt" \\ -u "${kraken_report}" \\ -p "temp_bracken" From b1349ac62f26b53e94db57ce0504cd1c642fba56 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 16:45:13 +0100 Subject: [PATCH 16/77] init paired to prevent error msg --- conf/base.config | 6 +----- nextflow.config | 1 + 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/conf/base.config b/conf/base.config index da4b559..2bfc384 100644 --- a/conf/base.config +++ b/conf/base.config @@ -58,9 +58,5 @@ process { withLabel:error_retry { errorStrategy = 'retry' maxRetries = 2 - - } - withName:CUSTOM_DUMPSOFTWAREVERSIONS { - cache = false - } + } diff --git a/nextflow.config b/nextflow.config index 7bdf98f..088fb81 100644 --- a/nextflow.config +++ b/nextflow.config @@ -71,6 +71,7 @@ params { port = 8080 process_label = "scylla" monochrome_logs = false + paired = false validate_params = true show_hidden_params = false From ca5785baf68a079251029007915ff3527a244b70 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 16:46:04 +0100 Subject: [PATCH 17/77] fix base.config formatting --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index 2bfc384..ca1ea86 100644 --- a/conf/base.config +++ b/conf/base.config @@ -58,5 +58,5 @@ process { withLabel:error_retry { errorStrategy = 'retry' maxRetries = 2 - + } } From 6579a0fe00200e6ddea91113b636c66c247de4f2 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 16:56:11 +0100 Subject: [PATCH 18/77] parameterise container checksum --- dockerfiles/scylla/environment.yml | 3 ++- modules/generate_report.nf | 2 +- modules/kraken_client.nf | 8 ++++---- modules/qc_checks.nf | 4 ++-- nextflow.config | 2 +- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/dockerfiles/scylla/environment.yml b/dockerfiles/scylla/environment.yml index 667eba1..b3ac398 100644 --- a/dockerfiles/scylla/environment.yml +++ b/dockerfiles/scylla/environment.yml @@ -13,4 +13,5 @@ dependencies: - fastcat - biopython - pandas - - numpy \ No newline at end of file + - numpy + - taxonkit \ No newline at end of file diff --git a/modules/generate_report.nf b/modules/generate_report.nf index bc8dadb..928c146 100644 --- a/modules/generate_report.nf +++ b/modules/generate_report.nf @@ -6,7 +6,7 @@ process make_report { publishDir path: "${params.out_dir}/${unique_id}", mode: 'copy' conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3" - container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" + container "biowilko/scylla@${params.wf.container_sha}" input: val unique_id diff --git a/modules/kraken_client.nf b/modules/kraken_client.nf index 3ba91aa..dd89ea8 100644 --- a/modules/kraken_client.nf +++ b/modules/kraken_client.nf @@ -7,7 +7,7 @@ process kraken2_client { label 'error_retry' conda "epi2melabs::kraken2-server=0.1.3" - container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" + container 
"biowilko/scylla@${params.wf.container_sha}" input: path fastq @@ -27,7 +27,7 @@ process combine_kraken_outputs { label 'process_single' - container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" + container "biowilko/scylla@${params.wf.container_sha}" publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' @@ -59,7 +59,7 @@ process determine_bracken_length { label "process_low" conda "anaconda::sed=4.8" - container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" + container "biowilko/scylla@${params.wf.container_sha}" input: path database @@ -114,7 +114,7 @@ process bracken_to_json { publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3" - container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" + container "biowilko/scylla@${params.wf.container_sha}" input: diff --git a/modules/qc_checks.nf b/modules/qc_checks.nf index 57735e9..45b23e8 100644 --- a/modules/qc_checks.nf +++ b/modules/qc_checks.nf @@ -5,7 +5,7 @@ process read_stats { label "process_medium" conda "epi2melabs::kraken2-server=0.1.3" - container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" + container "biowilko/scylla@${params.wf.container_sha}" input: path fastq @@ -26,7 +26,7 @@ process combine_stats { publishDir "${params.out_dir}/${unique_id}/qc", mode: 'copy' conda "conda-forge::pandas=1.2.4 conda-forge::numpy=1.20.3" - container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" + container "biowilko/scylla@${params.wf.container_sha}" input: diff --git a/nextflow.config b/nextflow.config index 088fb81..0832834 100644 --- a/nextflow.config +++ b/nextflow.config @@ -84,7 +84,7 @@ params { "--fastq test_data/barcode01/reads.fastq.gz", ] agent = null - container_sha = "sha156f44a25dd22e39eb10e82d12e0584d41dde909" + container_sha = "sha256:aeaf567b0e05bea87081a8d0f86d92a06664707abd0e61d84146fad26597cb1f" } } From e20bca44856b9f0a19ffa88d611f14ce35ca8a2a Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 17:03:31 +0100 Subject: [PATCH 19/77] use existing taxonomy dir --- nextflow.config | 1 + subworkflows/kraken_pipeline.nf | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 0832834..2629838 100644 --- a/nextflow.config +++ b/nextflow.config @@ -72,6 +72,7 @@ params { process_label = "scylla" monochrome_logs = false paired = false + taxonomy_dir = "/shared/public/db/taxonomy/" validate_params = true show_hidden_params = false diff --git a/subworkflows/kraken_pipeline.nf b/subworkflows/kraken_pipeline.nf index ee80101..d4b92ed 100644 --- a/subworkflows/kraken_pipeline.nf +++ b/subworkflows/kraken_pipeline.nf @@ -24,7 +24,7 @@ workflow kraken_pipeline { // } - taxonomy = file("https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz", type: "dir", checkIfExists:true) + taxonomy = file($params.taxonomy_dir, type: "dir", checkIfExists:true) // if (database) { // source_database = database From e5e12383ccc988783eb1353ccfa257e7e051d5e6 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 17:04:26 +0100 Subject: [PATCH 20/77] Holla holla no $ --- subworkflows/kraken_pipeline.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/kraken_pipeline.nf b/subworkflows/kraken_pipeline.nf index d4b92ed..a05ef0d 100644 --- 
a/subworkflows/kraken_pipeline.nf +++ b/subworkflows/kraken_pipeline.nf @@ -24,7 +24,7 @@ workflow kraken_pipeline { // } - taxonomy = file($params.taxonomy_dir, type: "dir", checkIfExists:true) + taxonomy = file(params.taxonomy_dir, type: "dir", checkIfExists:true) // if (database) { // source_database = database From 19d649e1249ae4e52bf7cacb67f912d911135149 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 17:20:35 +0100 Subject: [PATCH 21/77] bgzip fastp out --- modules/fastp.nf | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/modules/fastp.nf b/modules/fastp.nf index bc8c8f5..e75dc0a 100644 --- a/modules/fastp.nf +++ b/modules/fastp.nf @@ -16,8 +16,8 @@ process fastp_paired { output: val(prefix) - path("${prefix}_1.fastp.fastq.gz") - path("${prefix}_2.fastp.fastq.gz") + path("${prefix}_1.fastp.fastq.gz"), optional: true + path("${prefix}_2.fastp.fastq.gz"), optional: true path("${prefix}.fastp.fastq.gz"), emit: processed_fastq path("${prefix}.fastp.json") @@ -26,13 +26,26 @@ process fastp_paired { fastp \\ --in1 ${fastq_1} \\ --in2 ${fastq_2} \\ - --out1 ${prefix}_1.fastp.fastq.gz \\ - --out2 ${prefix}_2.fastp.fastq.gz \\ - --merged-out ${prefix}.fastp.fastq.gz \\ + --out1 ${prefix}_1.fastp.fastq \\ + --out2 ${prefix}_2.fastp.fastq \\ + --merged-out ${prefix}.fastp.fastq \\ --json ${prefix}.fastp.json \\ --thread $task.cpus \\ --detect_adapter_for_pe \\ 2> ${prefix}.fastp.log + + if [ -s ${prefix}_1.fastp.fastq ]; then + bgzip --threads -c $task.cpus > ${prefix}_1.fastp.fastq.gz + fi + + if [ -s ${prefix}_2.fastp.fastq ]; then + bgzip --threads -c $task.cpus > ${prefix}_2.fastp.fastq.gz + fi + + if [ -s ${prefix}.fastp.fastq ]; then + bgzip --threads -c $task.cpus > ${prefix}.fastp.fastq.gz + fi + """ } @@ -62,9 +75,13 @@ process fastp_single { """ fastp \\ --in1 ${fastq} \\ - --out1 ${prefix}.fastp.fastq.gz \\ + --out1 ${prefix}.fastp.fastq \\ --json ${prefix}.fastp.json \\ --thread $task.cpus \\ 2> ${prefix}.fastp.log + + if [ -s ${prefix}.fastp.fastq ]; then + bgzip --threads -c $task.cpus > ${prefix}.fastp.fastq.gz + fi """ } \ No newline at end of file From 88a79dd3a4bf7f475fe0c91ed926a5e27c4aae21 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 17:42:19 +0100 Subject: [PATCH 22/77] move fastp to scylla container --- modules/fastp.nf | 18 ++++++------------ nextflow.config | 2 +- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/modules/fastp.nf b/modules/fastp.nf index e75dc0a..c9a6dc9 100644 --- a/modules/fastp.nf +++ b/modules/fastp.nf @@ -4,10 +4,7 @@ process fastp_paired { publishDir "${params.out_dir}/preprocess/", mode: 'copy' - conda "bioconda::fastp=0.23.4" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/fastp:0.23.4--h5f740d0_0' : - 'biocontainers/fastp:0.23.4--h5f740d0_0' }" + container "biowilko/scylla@${params.wf.container_sha}" input: val(prefix) @@ -35,15 +32,15 @@ process fastp_paired { 2> ${prefix}.fastp.log if [ -s ${prefix}_1.fastp.fastq ]; then - bgzip --threads -c $task.cpus > ${prefix}_1.fastp.fastq.gz + bgzip --threads $task.cpus -c > ${prefix}_1.fastp.fastq.gz fi if [ -s ${prefix}_2.fastp.fastq ]; then - bgzip --threads -c $task.cpus > ${prefix}_2.fastp.fastq.gz + bgzip --threads $task.cpus -c > ${prefix}_2.fastp.fastq.gz fi if [ -s ${prefix}.fastp.fastq ]; then - bgzip --threads -c $task.cpus > ${prefix}.fastp.fastq.gz + bgzip --threads $task.cpus -c > ${prefix}.fastp.fastq.gz fi """ @@ -56,10 +53,7 @@ process fastp_single { publishDir "${params.out_dir}/preprocess/", mode: 'copy' - conda "bioconda::fastp=0.23.4" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/fastp:0.23.4--h5f740d0_0' : - 'biocontainers/fastp:0.23.4--h5f740d0_0' }" + container "biowilko/scylla@${params.wf.container_sha}" input: val(prefix) @@ -81,7 +75,7 @@ process fastp_single { 2> ${prefix}.fastp.log if [ -s ${prefix}.fastp.fastq ]; then - bgzip --threads -c $task.cpus > ${prefix}.fastp.fastq.gz + bgzip --threads $task.cpus -c > ${prefix}.fastp.fastq.gz fi """ } \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 2629838..cfb36b2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -85,7 +85,7 @@ params { "--fastq test_data/barcode01/reads.fastq.gz", ] agent = null - container_sha = "sha256:aeaf567b0e05bea87081a8d0f86d92a06664707abd0e61d84146fad26597cb1f" + container_sha = "sha256:6cfdca7d4a731f0af221f0cfc842e3dbf4524db0503e71198121cba96b7e7700" } } From 4e913d7407e5102c567000f283c0b3a910053d1c Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 17:45:24 +0100 Subject: [PATCH 23/77] ooops wrong hash --- dockerfiles/scylla/environment.yml | 4 +++- nextflow.config | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dockerfiles/scylla/environment.yml b/dockerfiles/scylla/environment.yml index b3ac398..96884ce 100644 --- a/dockerfiles/scylla/environment.yml +++ b/dockerfiles/scylla/environment.yml @@ -14,4 +14,6 @@ dependencies: - biopython - pandas - numpy - - taxonkit \ No newline at end of file + - taxonkit + - fastp + - tabix \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index cfb36b2..7a93445 100644 --- a/nextflow.config +++ b/nextflow.config @@ -85,7 +85,7 @@ params { "--fastq test_data/barcode01/reads.fastq.gz", ] agent = null - container_sha = "sha256:6cfdca7d4a731f0af221f0cfc842e3dbf4524db0503e71198121cba96b7e7700" + container_sha = "sha256:f0a53699c2c367dc98c4dc37e9a683972219ca6242032666a76cf2b081ff5825" } } From 48882d985876a2c8d8151e0293c5576c3043ce94 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 17:52:01 +0100 Subject: [PATCH 24/77] actually compress the fastqs dummy --- modules/fastp.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/fastp.nf b/modules/fastp.nf index c9a6dc9..82fd0bb 100644 --- a/modules/fastp.nf +++ b/modules/fastp.nf @@ -32,15 +32,15 @@ process fastp_paired { 2> ${prefix}.fastp.log if [ -s ${prefix}_1.fastp.fastq ]; then - bgzip --threads $task.cpus -c > ${prefix}_1.fastp.fastq.gz + bgzip --threads $task.cpus -c ${prefix}_1.fastp.fastq > ${prefix}_1.fastp.fastq.gz fi if [ -s 
${prefix}_2.fastp.fastq ]; then - bgzip --threads $task.cpus -c > ${prefix}_2.fastp.fastq.gz + bgzip --threads $task.cpus -c ${prefix}_2.fastp.fastq > ${prefix}_2.fastp.fastq.gz fi if [ -s ${prefix}.fastp.fastq ]; then - bgzip --threads $task.cpus -c > ${prefix}.fastp.fastq.gz + bgzip --threads $task.cpus -c ${prefix}.fastp.fastq > ${prefix}.fastp.fastq.gz fi """ @@ -75,7 +75,7 @@ process fastp_single { 2> ${prefix}.fastp.log if [ -s ${prefix}.fastp.fastq ]; then - bgzip --threads $task.cpus -c > ${prefix}.fastp.fastq.gz + bgzip --threads $task.cpus -c ${prefix}.fastp.fastq > ${prefix}.fastp.fastq.gz fi """ } \ No newline at end of file From 415f191d3159d9c68b2846bc5d0d56961cba7eab Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 18:03:39 +0100 Subject: [PATCH 25/77] add mako to container --- dockerfiles/scylla/environment.yml | 3 ++- nextflow.config | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/dockerfiles/scylla/environment.yml b/dockerfiles/scylla/environment.yml index 96884ce..a41a800 100644 --- a/dockerfiles/scylla/environment.yml +++ b/dockerfiles/scylla/environment.yml @@ -16,4 +16,5 @@ dependencies: - numpy - taxonkit - fastp - - tabix \ No newline at end of file + - tabix + - mako \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 7a93445..d38bceb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -85,7 +85,7 @@ params { "--fastq test_data/barcode01/reads.fastq.gz", ] agent = null - container_sha = "sha256:f0a53699c2c367dc98c4dc37e9a683972219ca6242032666a76cf2b081ff5825" + container_sha = "sha256:4ff33113bd8f0b87ca971ec1e313d8e022b6131829fdd33694d490b399254f9f" } } From b23736a35950ed19009f38bf620f1afa2722c0d6 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 18:15:04 +0100 Subject: [PATCH 26/77] use pathlib in report --- bin/make_report.py | 55 ++++++++++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/bin/make_report.py b/bin/make_report.py index b7680b3..2aea734 100755 --- a/bin/make_report.py +++ b/bin/make_report.py @@ -16,9 +16,11 @@ from Bio import SeqIO -def make_output_report(report_to_generate, template, version, sample, data_for_report={}): +def make_output_report( + report_to_generate, template, version, sample, data_for_report={} +): template_dir = os.path.abspath(os.path.dirname(__file__)) - mylookup = TemplateLookup(directories=[template_dir]) #absolute or relative works + mylookup = TemplateLookup(directories=[template_dir]) # absolute or relative works mytemplate = mylookup.get_template(template) buf = StringIO() @@ -26,26 +28,28 @@ def make_output_report(report_to_generate, template, version, sample, data_for_r print(template_dir) js_path = os.path.join(template_dir, "report_utils", "sankey.js") print(js_path) - with open(js_path,'r') as f: + with open(js_path, "r") as f: data_for_report["sankey_js"] = f.read() - ctx = Context(buf, - date = date.today(), - version = version, - sample=sample, - data_for_report = data_for_report) + ctx = Context( + buf, + date=date.today(), + version=version, + sample=sample, + data_for_report=data_for_report, + ) try: mytemplate.render_context(ctx) except: traceback = RichTraceback() - for (filename, lineno, function, line) in traceback.traceback: + for filename, lineno, function, line in traceback.traceback: print("File %s, line %s, in %s" % (filename, lineno, function)) print(line, "\n") print("%s: %s" % (str(traceback.error.__class__.__name__), traceback.error)) - with open(report_to_generate, 
'w') as fw: + with open(report_to_generate, "w") as fw: print("Generating: " + f"{report_to_generate}") fw.write(buf.getvalue()) @@ -53,34 +57,43 @@ def make_output_report(report_to_generate, template, version, sample, data_for_r def main(): parser = argparse.ArgumentParser() - parser.add_argument("--assignments", help="JSON file of kraken/bracken assignments", required=True) - parser.add_argument("--read_counts", help="JSON file of read_counts", required=False) - parser.add_argument("--sample_id", help="Unique ID of sample", required=False, default="sample") + parser.add_argument( + "--assignments", + help="JSON file of kraken/bracken assignments", + required=True, + type=Path, + ) + parser.add_argument( + "--read_counts", help="JSON file of read_counts", required=False, type=Path + ) + parser.add_argument( + "--sample_id", help="Unique ID of sample", required=False, default="sample" + ) parser.add_argument("--prefix", help="HTML output prefix ", default="scylla") - parser.add_argument("--template", help="HTML template for report", default="scylla.mako.html") + parser.add_argument( + "--template", help="HTML template for report", default="scylla.mako.html" + ) parser.add_argument("--version", help="Scylla version", default="unknown") args = parser.parse_args() sample = args.sample_id - with open(args.assignments, 'r') as bracken_file: + with open(args.assignments.resolve(), "r") as bracken_file: assignments = bracken_file.read().strip().replace('"', '\\"') if args.read_counts: - with open(args.read_counts, 'r') as qc_file: - read_counts = qc_file.read().strip().replace('"', '\\"') + with open(args.read_counts.resolve(), "r") as qc_file: + read_counts = qc_file.read().strip().replace('"', '\\"') else: read_counts = {} - data_for_report = {"sankey_data":assignments, "read_count_data": read_counts} - - + data_for_report = {"sankey_data": assignments, "read_count_data": read_counts} outfile = args.prefix + "_report.html" make_output_report(outfile, args.template, args.version, sample, data_for_report) if __name__ == "__main__": - main() \ No newline at end of file + main() From 135311533d72369b41801d2cf570533895cabaaf Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 18:18:15 +0100 Subject: [PATCH 27/77] specify json is a text file? 
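
In Python 3 the "t" is implicit: open(path, "r") and open(path, "rt") both open
the file in text mode, so this change only spells the intent out. A minimal
sketch (hypothetical filename):

    with open("assignments.json", "rt") as f:  # identical to mode="r"
        assignments = f.read()
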
--- bin/make_report.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/make_report.py b/bin/make_report.py index 2aea734..55f7c1b 100755 --- a/bin/make_report.py +++ b/bin/make_report.py @@ -81,11 +81,11 @@ def main(): sample = args.sample_id - with open(args.assignments.resolve(), "r") as bracken_file: + with open(args.assignments.resolve(), "rt") as bracken_file: assignments = bracken_file.read().strip().replace('"', '\\"') if args.read_counts: - with open(args.read_counts.resolve(), "r") as qc_file: + with open(args.read_counts.resolve(), "rt") as qc_file: read_counts = qc_file.read().strip().replace('"', '\\"') else: read_counts = {} From 26ed5ef87301cf9dd705b35df5ec426bf57dbcad Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 18:23:34 +0100 Subject: [PATCH 28/77] treat bracken jsons in a sane manner --- modules/kraken_client.nf | 1 + subworkflows/kraken_pipeline.nf | 15 ++++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/modules/kraken_client.nf b/modules/kraken_client.nf index dd89ea8..dcc2265 100644 --- a/modules/kraken_client.nf +++ b/modules/kraken_client.nf @@ -124,6 +124,7 @@ process bracken_to_json { path bracken_summary output: path "${params.database_set}.bracken.json" + """ cat "${bracken_summary}" | cut -f2,6 | tail -n+2 > taxacounts.txt cat "${bracken_summary}" | cut -f2 | tail -n+2 > taxa.txt diff --git a/subworkflows/kraken_pipeline.nf b/subworkflows/kraken_pipeline.nf index a05ef0d..8b076b8 100644 --- a/subworkflows/kraken_pipeline.nf +++ b/subworkflows/kraken_pipeline.nf @@ -40,13 +40,18 @@ workflow kraken_pipeline { // start_server(database, taxonomy) run_kraken_and_bracken(unique_id, fastq, database, taxonomy) + + run_kraken_and_bracken.out.json + .collect() + .set { kraken_jsons } + // stop_server(run_kraken_and_bracken.out.bracken_report.collect()) qc_checks(unique_id, fastq) - all_bracken_jsons = Channel.fromPath("${params.out_dir}/${unique_id}/classifications/*.bracken.json") - .concat(run_kraken_and_bracken.out.json) - .unique {it.getName()} - .collect() - generate_report(unique_id, qc_checks.out, all_bracken_jsons ) + // all_bracken_jsons = Channel.fromPath("${params.out_dir}/${unique_id}/classifications/*.bracken.json") + // .concat(run_kraken_and_bracken.out.json) + // .unique {it.getName()} + // .collect() + generate_report(unique_id, qc_checks.out, bracken_jsons ) emit: bracken_report = run_kraken_and_bracken.out.bracken_report kraken_report = run_kraken_and_bracken.out.kraken_report From c1dd4ed844c25b07a3a1afa598d8eaae436bd7c7 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 18:24:36 +0100 Subject: [PATCH 29/77] bracken not kraken dumdum --- subworkflows/kraken_pipeline.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/kraken_pipeline.nf b/subworkflows/kraken_pipeline.nf index 8b076b8..bb44191 100644 --- a/subworkflows/kraken_pipeline.nf +++ b/subworkflows/kraken_pipeline.nf @@ -43,7 +43,7 @@ workflow kraken_pipeline { run_kraken_and_bracken.out.json .collect() - .set { kraken_jsons } + .set { bracken_jsons } // stop_server(run_kraken_and_bracken.out.bracken_report.collect()) qc_checks(unique_id, fastq) From cd6f267befb324b3b07f0515f394cc0bb51b6d1e Mon Sep 17 00:00:00 2001 From: Biowilko Date: Fri, 16 Jun 2023 09:00:57 +0100 Subject: [PATCH 30/77] properly merge paired reads --- modules/fastp.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/fastp.nf b/modules/fastp.nf index 82fd0bb..23616f8 100644 --- 
a/modules/fastp.nf +++ b/modules/fastp.nf @@ -26,9 +26,9 @@ process fastp_paired { --out1 ${prefix}_1.fastp.fastq \\ --out2 ${prefix}_2.fastp.fastq \\ --merged-out ${prefix}.fastp.fastq \\ + -m \\ --json ${prefix}.fastp.json \\ --thread $task.cpus \\ - --detect_adapter_for_pe \\ 2> ${prefix}.fastp.log if [ -s ${prefix}_1.fastp.fastq ]; then From 041fddc0351d11111d3c165bfbdfbaf6ba920cdc Mon Sep 17 00:00:00 2001 From: Biowilko Date: Fri, 16 Jun 2023 09:06:29 +0100 Subject: [PATCH 31/77] fix fastp paired params --- modules/fastp.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/fastp.nf b/modules/fastp.nf index 23616f8..cdf146c 100644 --- a/modules/fastp.nf +++ b/modules/fastp.nf @@ -25,7 +25,7 @@ process fastp_paired { --in2 ${fastq_2} \\ --out1 ${prefix}_1.fastp.fastq \\ --out2 ${prefix}_2.fastp.fastq \\ - --merged-out ${prefix}.fastp.fastq \\ + --merged_out ${prefix}.fastp.fastq \\ -m \\ --json ${prefix}.fastp.json \\ --thread $task.cpus \\ From 05f047efc5244c0cd3ecc5d2158454e21c406bd7 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Mon, 19 Jun 2023 09:31:58 +0100 Subject: [PATCH 32/77] Add k2_server dockerfile --- dockerfiles/kraken2-server/Dockerfile | 9 +++++++++ dockerfiles/kraken2-server/environment.yml | 8 ++++++++ 2 files changed, 17 insertions(+) create mode 100644 dockerfiles/kraken2-server/Dockerfile create mode 100644 dockerfiles/kraken2-server/environment.yml diff --git a/dockerfiles/kraken2-server/Dockerfile b/dockerfiles/kraken2-server/Dockerfile new file mode 100644 index 0000000..b2df307 --- /dev/null +++ b/dockerfiles/kraken2-server/Dockerfile @@ -0,0 +1,9 @@ +FROM condaforge/mambaforge:latest AS conda + +COPY environment.yml . + +RUN /opt/conda/bin/mamba env create -f /environment.yml + +ENV PATH=/opt/conda/envs/kraken2-server/bin:$PATH + +CMD ["/bin/bash"] diff --git a/dockerfiles/kraken2-server/environment.yml b/dockerfiles/kraken2-server/environment.yml new file mode 100644 index 0000000..ade0536 --- /dev/null +++ b/dockerfiles/kraken2-server/environment.yml @@ -0,0 +1,8 @@ +name: kraken2-server +channels: + - bioconda + - conda-forge + - defaults + - epi2melabs +dependencies: + - kraken2-server # not on mac \ No newline at end of file From 26d5e30516dc1569a885c9d884b9dda0152b191c Mon Sep 17 00:00:00 2001 From: Biowilko Date: Mon, 19 Jun 2023 10:23:20 +0100 Subject: [PATCH 33/77] Make fastp output into correct dir --- modules/fastp.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/fastp.nf b/modules/fastp.nf index cdf146c..8720534 100644 --- a/modules/fastp.nf +++ b/modules/fastp.nf @@ -2,7 +2,7 @@ process fastp_paired { label 'process_medium' - publishDir "${params.out_dir}/preprocess/", mode: 'copy' + publishDir "${params.out_dir}/${unique_id}/preprocess/", mode: 'copy' container "biowilko/scylla@${params.wf.container_sha}" From bf7cb65525f457454df219609abf7049915df866 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Mon, 19 Jun 2023 16:59:05 +0100 Subject: [PATCH 34/77] fix publish dir --- modules/assemble_taxa.nf | 2 +- modules/extract_taxa.nf | 2 +- modules/fastp.nf | 4 ++-- modules/generate_report.nf | 2 +- modules/get_params_and_versions.nf | 6 ++---- modules/kraken_client.nf | 6 +++--- modules/qc_checks.nf | 2 +- 7 files changed, 11 insertions(+), 13 deletions(-) diff --git a/modules/assemble_taxa.nf b/modules/assemble_taxa.nf index a2811ab..be71157 100644 --- a/modules/assemble_taxa.nf +++ b/modules/assemble_taxa.nf @@ -11,7 +11,7 @@ process extract_reads { label 'process_medium' - publishDir path: 
"${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy' + publishDir path: "${params.out_dir}/${params.unique_id}/reads_by_taxa", mode: 'copy' conda 'bioconda::biopython=1.78' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf index 996be81..b4c792c 100644 --- a/modules/extract_taxa.nf +++ b/modules/extract_taxa.nf @@ -11,7 +11,7 @@ process extract_reads { label 'process_medium' - publishDir path: "${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy' + publishDir path: "${params.out_dir}/${params.unique_id}/reads_by_taxa", mode: 'copy' conda 'bioconda::biopython=1.78' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/fastp.nf b/modules/fastp.nf index 8720534..e451a70 100644 --- a/modules/fastp.nf +++ b/modules/fastp.nf @@ -2,7 +2,7 @@ process fastp_paired { label 'process_medium' - publishDir "${params.out_dir}/${unique_id}/preprocess/", mode: 'copy' + publishDir "${params.out_dir}/${params.unique_id}/preprocess/", mode: 'copy' container "biowilko/scylla@${params.wf.container_sha}" @@ -51,7 +51,7 @@ process fastp_single { label 'process_medium' - publishDir "${params.out_dir}/preprocess/", mode: 'copy' + publishDir "${params.out_dir}/${params.unique_id}/preprocess/", mode: 'copy' container "biowilko/scylla@${params.wf.container_sha}" diff --git a/modules/generate_report.nf b/modules/generate_report.nf index 928c146..232fa7d 100644 --- a/modules/generate_report.nf +++ b/modules/generate_report.nf @@ -3,7 +3,7 @@ process make_report { label "process_low" - publishDir path: "${params.out_dir}/${unique_id}", mode: 'copy' + publishDir path: "${params.out_dir}/${params.unique_id}/", mode: 'copy' conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3" container "biowilko/scylla@${params.wf.container_sha}" diff --git a/modules/get_params_and_versions.nf b/modules/get_params_and_versions.nf index cea7a8b..7822757 100644 --- a/modules/get_params_and_versions.nf +++ b/modules/get_params_and_versions.nf @@ -5,9 +5,7 @@ import groovy.json.JsonBuilder process get_versions { - - label "scylla" - publishDir "${params.out_dir}/execution", mode: 'copy' + publishDir "${params.out_dir}/${params.unique_id}/execution", mode: 'copy' cpus 1 output: path "versions.txt" @@ -21,7 +19,7 @@ process get_versions { process get_params { - publishDir "${params.out_dir}/execution", mode: 'copy' + publishDir "${params.out_dir}/${params.unique_id}/execution", mode: 'copy' cpus 1 output: path "params.json" diff --git a/modules/kraken_client.nf b/modules/kraken_client.nf index dcc2265..a4979db 100644 --- a/modules/kraken_client.nf +++ b/modules/kraken_client.nf @@ -29,7 +29,7 @@ process combine_kraken_outputs { container "biowilko/scylla@${params.wf.container_sha}" - publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' + publishDir path: "${params.out_dir}/${params.unique_id}/classifications", mode: 'copy' input: val unique_id @@ -81,7 +81,7 @@ process bracken { label 'process_low' - publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' + publishDir path: "${params.out_dir}/${params.unique_id}/classifications", mode: 'copy' conda "bioconda::bracken=2.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
@@ -111,7 +111,7 @@ process bracken_to_json { label "process_low" - publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' + publishDir path: "${params.out_dir}/${params.unique_id}/classifications", mode: 'copy' conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3" container "biowilko/scylla@${params.wf.container_sha}" diff --git a/modules/qc_checks.nf b/modules/qc_checks.nf index 45b23e8..2d384bd 100644 --- a/modules/qc_checks.nf +++ b/modules/qc_checks.nf @@ -23,7 +23,7 @@ process combine_stats { label "process_low" - publishDir "${params.out_dir}/${unique_id}/qc", mode: 'copy' + publishDir "${params.out_dir}/${params.unique_id}/qc", mode: 'copy' conda "conda-forge::pandas=1.2.4 conda-forge::numpy=1.20.3" container "biowilko/scylla@${params.wf.container_sha}" From 4a534768f4568af942628c8304514c2780d71c45 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Tue, 20 Jun 2023 10:27:43 +0100 Subject: [PATCH 35/77] Don't exit 2 if too much human --- bin/extract_kraken_reads.py | 382 +++++++++++++++++++++++++++--------- 1 file changed, 285 insertions(+), 97 deletions(-) diff --git a/bin/extract_kraken_reads.py b/bin/extract_kraken_reads.py index 569098e..33bc107 100755 --- a/bin/extract_kraken_reads.py +++ b/bin/extract_kraken_reads.py @@ -14,30 +14,34 @@ import json from datetime import datetime + def mean(l): if len(l) == 0: return 0 - return sum(l)/len(l) + return sum(l) / len(l) + def median(l): - if len(l)%2 == 0: - i = (len(l))/2 + if len(l) % 2 == 0: + i = (len(l)) / 2 else: - i = (len(l) + 1)/2 + i = (len(l) + 1) / 2 i = int(i) l = sorted(l) return l[i] + def parse_depth(name): parse_name = name.split(" ") depth = 0 for i in parse_name: - if i!="": + if i != "": break depth += 1 - depth = int(depth/2) + depth = int(depth / 2) return depth + def get_bracken_hierarchy(kraken_file, bracken_file, max_human=None): first = kraken_file if not kraken_file: @@ -49,24 +53,43 @@ def get_bracken_hierarchy(kraken_file, bracken_file, max_human=None): for line in f: if line.startswith("% of Seqs"): continue - percentage, num_clade_root, num_direct, raw_rank, ncbi, name = line.strip().split("\t") + ( + percentage, + num_clade_root, + num_direct, + raw_rank, + ncbi, + name, + ) = line.strip().split("\t") percentage = float(percentage) num_clade_root = int(num_clade_root) num_direct = int(num_direct) depth = parse_depth(name) name = name.strip() rank = raw_rank[0] - hierarchy = hierarchy[:depth - 1] + hierarchy = hierarchy[: depth - 1] hierarchy.append(ncbi) - if name in ['Homo sapiens', 'unclassified', 'root']: - if max_human and name == 'Homo sapiens' and num_direct > max_human: - sys.stderr.write("ERROR: found %i human reads, max allowed is %i\n" %(num_direct, max_human)) - sys.exit(2) + if name in ["Homo sapiens", "unclassified", "root"]: + if max_human and name == "Homo sapiens" and num_direct > max_human: + sys.stderr.write( + "ERROR: found %i human reads, max allowed is %i\n" + % (num_direct, max_human) + ) + # sys.exit(2) continue - entries[ncbi] = {"percentage": percentage, "count": num_direct, "count_descendants": num_clade_root, - "raw_rank": raw_rank, "rank": rank, "depth": depth, "name": name, "parents":[], "children":[]} + entries[ncbi] = { + "percentage": percentage, + "count": num_direct, + "count_descendants": num_clade_root, + "raw_rank": raw_rank, + "rank": rank, + "depth": depth, + "name": name, + "parents": [], + "children": [], + } if len(hierarchy) > 1: parent = hierarchy[-2] @@ -77,22 +100,45 @@ def get_bracken_hierarchy(kraken_file, bracken_file, 
max_human=None): if kraken_file and bracken_file: with open(bracken_file, "r") as f: for line in f: - percentage, num_clade_root, num_direct, raw_rank, ncbi, name = line.strip().split("\t") + ( + percentage, + num_clade_root, + num_direct, + raw_rank, + ncbi, + name, + ) = line.strip().split("\t") percentage = float(percentage) num_clade_root = int(num_clade_root) num_direct = int(num_direct) name = name.strip() - if name in ['Homo sapiens', 'unclassified', 'root']: - continue + if name in ["Homo sapiens", "unclassified", "root"]: + continue - entries[ncbi].update({"percentage": percentage, "count": num_direct, "count_descendants": num_clade_root}) + entries[ncbi].update( + { + "percentage": percentage, + "count": num_direct, + "count_descendants": num_clade_root, + } + ) - sys.stdout.write("FOUND %i TAXA IN BRACKEN REPORT\n" %len(entries)) + sys.stdout.write("FOUND %i TAXA IN BRACKEN REPORT\n" % len(entries)) return entries -def get_taxon_id_lists(bracken_hierarchy, names=[], target_ranks=[], min_count=None,min_count_descendants=None, -min_percent=None, top_n=None, include_parents=False, include_children=False): + +def get_taxon_id_lists( + bracken_hierarchy, + names=[], + target_ranks=[], + min_count=None, + min_count_descendants=None, + min_percent=None, + top_n=None, + include_parents=False, + include_children=False, +): lists_to_extract = {} for taxon in bracken_hierarchy: entry = bracken_hierarchy[taxon] @@ -116,8 +162,7 @@ def get_taxon_id_lists(bracken_hierarchy, names=[], target_ranks=[], min_count=N child = children.pop() lists_to_extract[taxon].append(child) children.extend(bracken_hierarchy[child]["children"]) - sys.stdout.write("SELECTED %i TAXA TO EXTRACT\n" %len(lists_to_extract)) - + sys.stdout.write("SELECTED %i TAXA TO EXTRACT\n" % len(lists_to_extract)) if top_n and len(lists_to_extract) > top_n: X = list(lists_to_extract.keys()) @@ -126,18 +171,18 @@ def get_taxon_id_lists(bracken_hierarchy, names=[], target_ranks=[], min_count=N to_delete = ordered[top_n:] for taxon in to_delete: del lists_to_extract[taxon] - sys.stdout.write("REDUCED TO %i TAXA TO EXTRACT\n" %len(lists_to_extract)) - + sys.stdout.write("REDUCED TO %i TAXA TO EXTRACT\n" % len(lists_to_extract)) return lists_to_extract + def check_read_files(reads): - if(reads[-3:] == '.gz'): - read_file = gzip.open(reads,'rt') - zipped=True + if reads[-3:] == ".gz": + read_file = gzip.open(reads, "rt") + zipped = True else: - read_file = open(reads,'rt') - zipped=False + read_file = open(reads, "rt") + zipped = False first = read_file.readline() if len(first) == 0: sys.stderr.write("ERROR: sequence file's first line is blank\n") @@ -151,10 +196,11 @@ def check_read_files(reads): sys.exit(1) return filetype, zipped + def parse_kraken_assignment_line(line): - line_vals = line.strip().split('\t') + line_vals = line.strip().split("\t") if len(line_vals) < 5: - return -1, '' + return -1, "" if "taxid" in line_vals[2]: temp = line_vals[2].split("taxid ")[-1] tax_id = temp[:-1] @@ -162,15 +208,30 @@ def parse_kraken_assignment_line(line): tax_id = line_vals[2] read_id = line_vals[1] - if (tax_id == 'A'): + if tax_id == "A": tax_id = 81077 else: tax_id = tax_id return tax_id, read_id -def extract_taxa(kraken_file, bracken_file, kraken_assignment_file, reads1, reads2, prefix, max_human=None, names=[], target_ranks=[],min_count=None, -min_count_descendants=None, min_percent=None, top_n=None, include_parents=False, include_children=False): +def extract_taxa( + kraken_file, + bracken_file, + kraken_assignment_file, + reads1, + 
reads2, + prefix, + max_human=None, + names=[], + target_ranks=[], + min_count=None, + min_count_descendants=None, + min_percent=None, + top_n=None, + include_parents=False, + include_children=False, +): # open read files filetype, zipped = check_read_files(reads1) s_file1 = SeqIO.index(reads1, filetype) @@ -179,8 +240,17 @@ def extract_taxa(kraken_file, bracken_file, kraken_assignment_file, reads1, read # get taxids to extract bracken_hierarchy = get_bracken_hierarchy(kraken_file, bracken_file, max_human) - lists_to_extract = get_taxon_id_lists(bracken_hierarchy, names, target_ranks,min_count,min_count_descendants, - min_percent, top_n, include_parents, include_children) + lists_to_extract = get_taxon_id_lists( + bracken_hierarchy, + names, + target_ranks, + min_count, + min_count_descendants, + min_percent, + top_n, + include_parents, + include_children, + ) # open output files outfile_handles = {} @@ -194,16 +264,18 @@ def extract_taxa(kraken_file, bracken_file, kraken_assignment_file, reads1, read if key not in keys: keys[key] = [] keys[key].append(taxon) - outfile_handles[taxon] = open("%s.%s.%s" %(prefix, taxon, filetype), "w") - print("opening %s.%s.%s" %(prefix, taxon, filetype)) + outfile_handles[taxon] = open("%s.%s.%s" % (prefix, taxon, filetype), "w") + print("opening %s.%s.%s" % (prefix, taxon, filetype)) out_counts[taxon] = 0 quals[taxon] = [] lens[taxon] = [] - sys.stdout.write("INCLUDING PARENTS/CHILDREN, HAVE %i TAXA TO INCLUDE IN READ FILES\n" %len(keys)) - sys.stdout.write("[%s]\n" %','.join(keys)) - + sys.stdout.write( + "INCLUDING PARENTS/CHILDREN, HAVE %i TAXA TO INCLUDE IN READ FILES\n" + % len(keys) + ) + sys.stdout.write("[%s]\n" % ",".join(keys)) - with open(kraken_assignment_file, 'r') as kfile: + with open(kraken_assignment_file, "r") as kfile: for line in kfile: tax_id, read_id = parse_kraken_assignment_line(line) if tax_id in keys: @@ -212,13 +284,17 @@ def extract_taxa(kraken_file, bracken_file, kraken_assignment_file, reads1, read elif reads2 and read_id in s_file2: read = s_file2[read_id] else: - sys.stderr.write("ERROR: read id %s not found in read files\n" %read_id) + sys.stderr.write( + "ERROR: read id %s not found in read files\n" % read_id + ) sys.exit(1) for taxon in keys[tax_id]: SeqIO.write(read, outfile_handles[taxon], filetype) out_counts[taxon] += 1 - quals[taxon].append(median(read.letter_annotations["phred_quality"])) + quals[taxon].append( + median(read.letter_annotations["phred_quality"]) + ) lens[taxon].append(len(read)) for handle in outfile_handles: @@ -227,81 +303,192 @@ def extract_taxa(kraken_file, bracken_file, kraken_assignment_file, reads1, read summary = [] for taxon in lists_to_extract: - summary.append({"human_readable": bracken_hierarchy[taxon]["name"], "taxon": taxon, "tax_level": bracken_hierarchy[taxon]["rank"], "filename":"%s.%s.%s" %(prefix, taxon, filetype), "qc_metrics":{"num_reads":out_counts[taxon], "avg_qual":mean(quals[taxon]), "mean_len":mean(lens[taxon])}}) - with open("%s_summary.json" %prefix, 'w') as f: + summary.append( + { + "human_readable": bracken_hierarchy[taxon]["name"], + "taxon": taxon, + "tax_level": bracken_hierarchy[taxon]["rank"], + "filename": "%s.%s.%s" % (prefix, taxon, filetype), + "qc_metrics": { + "num_reads": out_counts[taxon], + "avg_qual": mean(quals[taxon]), + "mean_len": mean(lens[taxon]), + }, + } + ) + with open("%s_summary.json" % prefix, "w") as f: print(summary) json.dump(summary, f) return out_counts -#Main method + +# Main method def main(): - #Parse arguments + # Parse arguments 
parser = argparse.ArgumentParser() - parser.add_argument('-k', dest='kraken_assignment_file', required=True, - help='Kraken assignment file to parse') - parser.add_argument('-r', dest='kraken_report_file', required=False, - help='Kraken file of taxon relationships and quantities') - parser.add_argument('-b', dest='bracken_report_file', required=False, - help='Bracken file of taxon relationships and quantities. Quantities used in preference over kraken') - parser.add_argument('-s','-s1', '-1', dest='reads1', required=True, - help='FASTA/FASTQ File containing the raw reads.') - parser.add_argument('-s2', '-2', dest='reads2', default= "", - help='2nd FASTA/FASTQ File containing the raw reads (paired).') - - parser.add_argument('-p', "--prefix",dest='prefix', required=True, default="taxid", - help='Prefix for output files') - - parser.add_argument("--taxid",dest='taxid', required=False, nargs='*', - help='List of taxonomy ID[s] or names to extract (space-delimited) - each to their own file') - parser.add_argument("--rank",dest='rank', required=False, nargs='*', - help='Rank(s) to extract') - parser.add_argument("--max_human",dest='max_human', required=False, type=int, - help='Maximum human reads to allow') - parser.add_argument("--min_count",dest='min_count', required=False, type=int, - help='Minimum direct read count') - parser.add_argument("--min_count_descendants",dest='min_count_descendants', required=False, type=int, - help='Minimum read count at taxon level or descendants') - parser.add_argument("--min_percent",dest='min_percent', required=False, type=float, - help='Minimum percentage of reads e.g 4') - parser.add_argument("--n",dest='top_n', required=False, type=int, - help='Maximum number of taxa to extract (top n)') - parser.add_argument('--include_parents', dest="include_parents", required=False, - action='store_true',default=False, - help='Include reads classified at parent levels of the specified taxids') - parser.add_argument('--include_children', dest='include_children', required=False, - action='store_true',default=False, - help='Include reads classified more specifically than the specified taxids') + parser.add_argument( + "-k", + dest="kraken_assignment_file", + required=True, + help="Kraken assignment file to parse", + ) + parser.add_argument( + "-r", + dest="kraken_report_file", + required=False, + help="Kraken file of taxon relationships and quantities", + ) + parser.add_argument( + "-b", + dest="bracken_report_file", + required=False, + help="Bracken file of taxon relationships and quantities. 
Quantities used in preference over kraken", + ) + parser.add_argument( + "-s", + "-s1", + "-1", + dest="reads1", + required=True, + help="FASTA/FASTQ File containing the raw reads.", + ) + parser.add_argument( + "-s2", + "-2", + dest="reads2", + default="", + help="2nd FASTA/FASTQ File containing the raw reads (paired).", + ) + + parser.add_argument( + "-p", + "--prefix", + dest="prefix", + required=True, + default="taxid", + help="Prefix for output files", + ) + + parser.add_argument( + "--taxid", + dest="taxid", + required=False, + nargs="*", + help="List of taxonomy ID[s] or names to extract (space-delimited) - each to their own file", + ) + parser.add_argument( + "--rank", dest="rank", required=False, nargs="*", help="Rank(s) to extract" + ) + parser.add_argument( + "--max_human", + dest="max_human", + required=False, + type=int, + help="Maximum human reads to allow", + ) + parser.add_argument( + "--min_count", + dest="min_count", + required=False, + type=int, + help="Minimum direct read count", + ) + parser.add_argument( + "--min_count_descendants", + dest="min_count_descendants", + required=False, + type=int, + help="Minimum read count at taxon level or descendants", + ) + parser.add_argument( + "--min_percent", + dest="min_percent", + required=False, + type=float, + help="Minimum percentage of reads e.g 4", + ) + parser.add_argument( + "--n", + dest="top_n", + required=False, + type=int, + help="Maximum number of taxa to extract (top n)", + ) + parser.add_argument( + "--include_parents", + dest="include_parents", + required=False, + action="store_true", + default=False, + help="Include reads classified at parent levels of the specified taxids", + ) + parser.add_argument( + "--include_children", + dest="include_children", + required=False, + action="store_true", + default=False, + help="Include reads classified more specifically than the specified taxids", + ) parser.set_defaults(append=False) - args=parser.parse_args() + args = parser.parse_args() if not args.kraken_report_file and not args.bracken_report_file: - sys.stderr.write("ERROR: require at least one report file from bracken or kraken\n") + sys.stderr.write( + "ERROR: require at least one report file from bracken or kraken\n" + ) sys.exit(1) - #Start Program + # Start Program now = datetime.now() time = now.strftime("%m/%d/%Y, %H:%M:%S") - sys.stdout.write("PROGRAM START TIME: " + time + '\n') - - rank_dict = {'kingdom':'K','domain':'D','phylum':'P','class':'C','order':'O','family':'F','genus':'G','species':'S', - 'K':'K', 'D':'D', 'P':'P', 'C':'C', 'O':'O', 'F':'F', 'G':'G', 'S':'S'} + sys.stdout.write("PROGRAM START TIME: " + time + "\n") + + rank_dict = { + "kingdom": "K", + "domain": "D", + "phylum": "P", + "class": "C", + "order": "O", + "family": "F", + "genus": "G", + "species": "S", + "K": "K", + "D": "D", + "P": "P", + "C": "C", + "O": "O", + "F": "F", + "G": "G", + "S": "S", + } if args.rank: target_ranks = [rank_dict[r] for r in args.rank] else: target_ranks = [] print(target_ranks) - out_counts = extract_taxa(args.kraken_report_file, args.bracken_report_file, args.kraken_assignment_file, - args.reads1, args.reads2, args.prefix, - max_human = args.max_human, target_ranks=target_ranks,min_count=args.min_count, - min_count_descendants=args.min_count_descendants, - min_percent=args.min_percent, top_n=args.top_n, include_parents=args.include_parents, - include_children=args.include_children) + out_counts = extract_taxa( + args.kraken_report_file, + args.bracken_report_file, + args.kraken_assignment_file, + 
args.reads1, + args.reads2, + args.prefix, + max_human=args.max_human, + target_ranks=target_ranks, + min_count=args.min_count, + min_count_descendants=args.min_count_descendants, + min_percent=args.min_percent, + top_n=args.top_n, + include_parents=args.include_parents, + include_children=args.include_children, + ) now = datetime.now() time = now.strftime("%m/%d/%Y, %H:%M:%S") - sys.stdout.write("PROGRAM END TIME: " + time + '\n') + sys.stdout.write("PROGRAM END TIME: " + time + "\n") sys.stdout.write("READ COUNTS: \n") @@ -310,5 +497,6 @@ def main(): sys.exit(0) + if __name__ == "__main__": main() From a6e87101ea47cdb82a8e0148c9e1d5f571e88321 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Tue, 20 Jun 2023 11:35:02 +0100 Subject: [PATCH 36/77] move params to own file --- conf/params.config | 80 ++++++++++++++++++++++++++++++++++++++++++++++ nextflow.config | 78 +------------------------------------------- 2 files changed, 81 insertions(+), 77 deletions(-) create mode 100644 conf/params.config diff --git a/conf/params.config b/conf/params.config new file mode 100644 index 0000000..f935a9b --- /dev/null +++ b/conf/params.config @@ -0,0 +1,80 @@ +params { + // TODO nf-core: Specify your pipeline's command line flags + // Input options + input = null + + // Boilerplate options + outdir = null + tracedir = "${params.outdir}/pipeline_info" + publish_dir_mode = 'copy' + email = null + email_on_fail = null + plaintext_email = false + monochrome_logs = false + hook_url = null + help = false + version = false + validate_params = true + show_hidden_params = false + schema_ignore_params = 'genomes' + + // Max resource options + // Defaults only, expecting to be overwritten + max_memory = '128.GB' + max_cpus = 16 + max_time = '240.h' + + help = false + version = false + wfversion = "v0.0.1" + + out_dir = "output" + store_dir = "store_dir" + + unique_id = null + fastq = null + fastq1 = null + fastq2 = null + fastq_dir = null + metadata = null + + database = null + taxonomy = null + bracken_dist = null + bracken_length = null + bracken_level = 'S' + database_set = "Viral" + + kraken_report = null + bracken_report = null + kraken_assignments = null + assembly_rank="S G" + assembly_min_reads = 10 + assembly_min_percent = 1 + max_human_reads_before_rejection = 1000 + read_type = "illumina" + + disable_ping = false + threads = 2 + k2_port = 8080 + k2_host = 'localhost' + port = 8080 + process_label = "scylla" + monochrome_logs = false + paired = false + taxonomy_dir = "/shared/public/db/taxonomy/" + + validate_params = true + show_hidden_params = false + + analyse_unclassified = false + schema_ignore_params = 'show_hidden_params,validate_params,monochrome_logs,aws_queue,aws_image_prefix,pangolin_version,wfversion,wf,process_label' + + wf { + example_cmd = [ + "--fastq test_data/barcode01/reads.fastq.gz", + ] + agent = null + container_sha = "sha256:4ff33113bd8f0b87ca971ec1e313d8e022b6131829fdd33694d490b399254f9f" + } +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index d38bceb..bb304ab 100644 --- a/nextflow.config +++ b/nextflow.config @@ -10,87 +10,11 @@ // for further help editing this file. 
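// Nextflow applies config files in order and later assignments override
// earlier ones, so a default pulled in via includeConfig can still be
// overridden afterwards. A hypothetical sketch (value not from this patch):
//
//   includeConfig 'conf/params.config'
//   params.database_set = "PlusPF"   // overrides the included "Viral" default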
-params { - // TODO nf-core: Specify your pipeline's command line flags - // Input options - input = null - // Boilerplate options - outdir = null - tracedir = "${params.outdir}/pipeline_info" - publish_dir_mode = 'copy' - email = null - email_on_fail = null - plaintext_email = false - monochrome_logs = false - hook_url = null - help = false - version = false - validate_params = true - show_hidden_params = false - schema_ignore_params = 'genomes' - - // Max resource options - // Defaults only, expecting to be overwritten - max_memory = '128.GB' - max_cpus = 16 - max_time = '240.h' - - help = false - version = false - wfversion = "v0.0.1" - - out_dir = "output" - store_dir = "store_dir" - - unique_id = null - fastq = null - fastq_dir = null - metadata = null - - database = null - taxonomy = null - bracken_dist = null - bracken_length = null - bracken_level = 'S' - database_set = "Viral" - - kraken_report = null - bracken_report = null - kraken_assignments = null - assembly_rank="S G" - assembly_min_reads = 10 - assembly_min_percent = 1 - max_human_reads_before_rejection = 1000 - read_type = "illumina" - - disable_ping = false - threads = 2 - k2_port = 8080 - k2_host = 'localhost' - port = 8080 - process_label = "scylla" - monochrome_logs = false - paired = false - taxonomy_dir = "/shared/public/db/taxonomy/" - - validate_params = true - show_hidden_params = false - - analyse_unclassified = false - schema_ignore_params = 'show_hidden_params,validate_params,monochrome_logs,aws_queue,aws_image_prefix,pangolin_version,wfversion,wf,process_label' - - wf { - example_cmd = [ - "--fastq test_data/barcode01/reads.fastq.gz", - ] - agent = null - container_sha = "sha256:4ff33113bd8f0b87ca971ec1e313d8e022b6131829fdd33694d490b399254f9f" - } -} // Load base.config by default for all pipelines includeConfig 'conf/base.config' +includeConfig 'conf/params.config' manifest { name = 'snowy-leopard/scylla' From 823c75ccc671049d4b5bda934bb4cad5ee813a73 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Tue, 20 Jun 2023 11:54:26 +0100 Subject: [PATCH 37/77] add additional_bracken_jsons parameter: to test on CLIMB --- conf/params.config | 1 + subworkflows/kraken_pipeline.nf | 20 ++++++++++++-------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/conf/params.config b/conf/params.config index f935a9b..68c3a6a 100644 --- a/conf/params.config +++ b/conf/params.config @@ -48,6 +48,7 @@ params { kraken_report = null bracken_report = null kraken_assignments = null + additional_bracken_jsons = null assembly_rank="S G" assembly_min_reads = 10 assembly_min_percent = 1 diff --git a/subworkflows/kraken_pipeline.nf b/subworkflows/kraken_pipeline.nf index bb44191..a3ba436 100644 --- a/subworkflows/kraken_pipeline.nf +++ b/subworkflows/kraken_pipeline.nf @@ -41,16 +41,20 @@ workflow kraken_pipeline { // start_server(database, taxonomy) run_kraken_and_bracken(unique_id, fastq, database, taxonomy) - run_kraken_and_bracken.out.json - .collect() - .set { bracken_jsons } - // stop_server(run_kraken_and_bracken.out.bracken_report.collect()) qc_checks(unique_id, fastq) - // all_bracken_jsons = Channel.fromPath("${params.out_dir}/${unique_id}/classifications/*.bracken.json") - // .concat(run_kraken_and_bracken.out.json) - // .unique {it.getName()} - // .collect() + if (params.additional_bracken_jsons) { + Channel.of(file(params.additional_bracken_jsons, type: "file", checkIfExists:true)) + .concat(run_kraken_and_bracken.out.json) + .unique {it.getName()} + .collect() + .set { bracken_jsons } + } else { + 
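        // no additional jsons supplied: report only on this run's bracken output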
run_kraken_and_bracken.out.json + .collect() + .set { bracken_jsons } + } + generate_report(unique_id, qc_checks.out, bracken_jsons ) emit: bracken_report = run_kraken_and_bracken.out.bracken_report From 1d0f445950989606918c0911043d3dfcc67e30f7 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Tue, 20 Jun 2023 12:08:26 +0100 Subject: [PATCH 38/77] allow carry through of self discovered unique_id --- main.nf | 6 ++++-- modules/assemble_taxa.nf | 2 +- modules/extract_taxa.nf | 2 +- modules/fastp.nf | 6 ++++-- modules/generate_report.nf | 2 +- modules/get_params_and_versions.nf | 14 ++++++++++---- modules/kraken_client.nf | 6 +++--- modules/qc_checks.nf | 2 +- 8 files changed, 25 insertions(+), 15 deletions(-) diff --git a/main.nf b/main.nf index 0d67642..e819e83 100644 --- a/main.nf +++ b/main.nf @@ -2,6 +2,8 @@ include { ingest } from './subworkflows/ingest' include { fastp_single; fastp_paired} from './modules/fastp' workflow { + unique_id = "${params.unique_id}" + // check input fastq exists and run fastp if (params.paired) { @@ -11,14 +13,14 @@ workflow { Channel.of(file(params.fastq2, type: "file", checkIfExists:true)) .set {input_fastq_2_ch} - fastp_paired(params.unique_id, input_fastq_1_ch, input_fastq_2_ch) + fastp_paired(unique_id, unique_id, input_fastq_1_ch, input_fastq_2_ch) fastp_paired.out.processed_fastq .set {processed_fastq} } else { Channel.of(file(params.fastq, type: "file", checkIfExists:true)) .set {input_fastq_ch} - fastp_single(params.unique_id, input_fastq_ch) + fastp_single(unique_id, unique_id input_fastq_ch) fastp_single.out.processed_fastq .set {processed_fastq} } diff --git a/modules/assemble_taxa.nf b/modules/assemble_taxa.nf index be71157..a2811ab 100644 --- a/modules/assemble_taxa.nf +++ b/modules/assemble_taxa.nf @@ -11,7 +11,7 @@ process extract_reads { label 'process_medium' - publishDir path: "${params.out_dir}/${params.unique_id}/reads_by_taxa", mode: 'copy' + publishDir path: "${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy' conda 'bioconda::biopython=1.78' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf index b4c792c..996be81 100644 --- a/modules/extract_taxa.nf +++ b/modules/extract_taxa.nf @@ -11,7 +11,7 @@ process extract_reads { label 'process_medium' - publishDir path: "${params.out_dir}/${params.unique_id}/reads_by_taxa", mode: 'copy' + publishDir path: "${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy' conda 'bioconda::biopython=1.78' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
diff --git a/modules/fastp.nf b/modules/fastp.nf index e451a70..747e431 100644 --- a/modules/fastp.nf +++ b/modules/fastp.nf @@ -2,11 +2,12 @@ process fastp_paired { label 'process_medium' - publishDir "${params.out_dir}/${params.unique_id}/preprocess/", mode: 'copy' + publishDir "${params.out_dir}/${unique_id}/preprocess/", mode: 'copy' container "biowilko/scylla@${params.wf.container_sha}" input: + val(unique_id) val(prefix) path(fastq_1) path(fastq_2) @@ -51,11 +52,12 @@ process fastp_single { label 'process_medium' - publishDir "${params.out_dir}/${params.unique_id}/preprocess/", mode: 'copy' + publishDir "${params.out_dir}/${unique_id}/preprocess/", mode: 'copy' container "biowilko/scylla@${params.wf.container_sha}" input: + val(unique_id) val(prefix) path(fastq) diff --git a/modules/generate_report.nf b/modules/generate_report.nf index 232fa7d..0fccc76 100644 --- a/modules/generate_report.nf +++ b/modules/generate_report.nf @@ -3,7 +3,7 @@ process make_report { label "process_low" - publishDir path: "${params.out_dir}/${params.unique_id}/", mode: 'copy' + publishDir path: "${params.out_dir}/${unique_id}/", mode: 'copy' conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3" container "biowilko/scylla@${params.wf.container_sha}" diff --git a/modules/get_params_and_versions.nf b/modules/get_params_and_versions.nf index 7822757..90f9359 100644 --- a/modules/get_params_and_versions.nf +++ b/modules/get_params_and_versions.nf @@ -5,8 +5,10 @@ import groovy.json.JsonBuilder process get_versions { - publishDir "${params.out_dir}/${params.unique_id}/execution", mode: 'copy' + publishDir "${params.out_dir}/${unique_id}/execution", mode: 'copy' cpus 1 + input: + val unique_id output: path "versions.txt" script: @@ -19,8 +21,10 @@ process get_versions { process get_params { - publishDir "${params.out_dir}/${params.unique_id}/execution", mode: 'copy' + publishDir "${params.out_dir}/${unique_id}/execution", mode: 'copy' cpus 1 + input: + val unique_id output: path "params.json" script: @@ -32,9 +36,11 @@ process get_params { } workflow get_params_and_versions { + take: + unique_ud main: - software_versions = get_versions() - workflow_params = get_params() + software_versions = get_versions(unique_id) + workflow_params = get_params(unique_id) emit: software_versions workflow_params diff --git a/modules/kraken_client.nf b/modules/kraken_client.nf index a4979db..dcc2265 100644 --- a/modules/kraken_client.nf +++ b/modules/kraken_client.nf @@ -29,7 +29,7 @@ process combine_kraken_outputs { container "biowilko/scylla@${params.wf.container_sha}" - publishDir path: "${params.out_dir}/${params.unique_id}/classifications", mode: 'copy' + publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' input: val unique_id @@ -81,7 +81,7 @@ process bracken { label 'process_low' - publishDir path: "${params.out_dir}/${params.unique_id}/classifications", mode: 'copy' + publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' conda "bioconda::bracken=2.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
@@ -111,7 +111,7 @@ process bracken_to_json { label "process_low" - publishDir path: "${params.out_dir}/${params.unique_id}/classifications", mode: 'copy' + publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3" container "biowilko/scylla@${params.wf.container_sha}" diff --git a/modules/qc_checks.nf b/modules/qc_checks.nf index 2d384bd..45b23e8 100644 --- a/modules/qc_checks.nf +++ b/modules/qc_checks.nf @@ -23,7 +23,7 @@ process combine_stats { label "process_low" - publishDir "${params.out_dir}/${params.unique_id}/qc", mode: 'copy' + publishDir "${params.out_dir}/${unique_id}/qc", mode: 'copy' conda "conda-forge::pandas=1.2.4 conda-forge::numpy=1.20.3" container "biowilko/scylla@${params.wf.container_sha}" From b1f67dd50ee60c70ba8808a14305e2dfe83c8ea6 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Tue, 20 Jun 2023 12:11:56 +0100 Subject: [PATCH 39/77] typo --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index e819e83..c05d18a 100644 --- a/main.nf +++ b/main.nf @@ -20,7 +20,7 @@ workflow { Channel.of(file(params.fastq, type: "file", checkIfExists:true)) .set {input_fastq_ch} - fastp_single(unique_id, unique_id input_fastq_ch) + fastp_single(unique_id, unique_id, input_fastq_ch) fastp_single.out.processed_fastq .set {processed_fastq} } From d6509ac028ed058c9694d6d732f148ee7a3f2e19 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Tue, 20 Jun 2023 12:14:18 +0100 Subject: [PATCH 40/77] unify input declaration style in fastp --- modules/fastp.nf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/fastp.nf b/modules/fastp.nf index 747e431..22f2620 100644 --- a/modules/fastp.nf +++ b/modules/fastp.nf @@ -57,14 +57,14 @@ process fastp_single { container "biowilko/scylla@${params.wf.container_sha}" input: - val(unique_id) - val(prefix) - path(fastq) + val unique_id + val prefix + path fastq output: - val(prefix) - path("${prefix}.fastp.fastq.gz"), emit: processed_fastq - path("${prefix}.fastp.json") + val prefix + path "${prefix}.fastp.fastq.gz", emit: processed_fastq + path "${prefix}.fastp.json" script: From 9dc4e87fff634e929cb7c8f0e3824ef8c432c55a Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Tue, 20 Jun 2023 12:17:26 +0100 Subject: [PATCH 41/77] unify input declaration style in fastp --- modules/fastp.nf | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/modules/fastp.nf b/modules/fastp.nf index 22f2620..4faf898 100644 --- a/modules/fastp.nf +++ b/modules/fastp.nf @@ -7,17 +7,17 @@ process fastp_paired { container "biowilko/scylla@${params.wf.container_sha}" input: - val(unique_id) - val(prefix) - path(fastq_1) - path(fastq_2) + val unique_id + val prefix + path fastq_1 + path fastq_2 output: - val(prefix) - path("${prefix}_1.fastp.fastq.gz"), optional: true - path("${prefix}_2.fastp.fastq.gz"), optional: true - path("${prefix}.fastp.fastq.gz"), emit: processed_fastq - path("${prefix}.fastp.json") + val prefix + path "${prefix}_1.fastp.fastq.gz", optional: true + path "${prefix}_2.fastp.fastq.gz", optional: true + path "${prefix}.fastp.fastq.gz", emit: processed_fastq + path "${prefix}.fastp.json" script: """ From 90e629e17fae07ab5d52ff3d8d0c605604b10723 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Tue, 20 Jun 2023 12:20:00 +0100 Subject: [PATCH 42/77] typo --- modules/get_params_and_versions.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/modules/get_params_and_versions.nf b/modules/get_params_and_versions.nf
index 90f9359..3b12fc5 100644
--- a/modules/get_params_and_versions.nf
+++ b/modules/get_params_and_versions.nf
@@ -37,7 +37,7 @@ process get_params {
 workflow get_params_and_versions {
     take:
-        unique_ud
+        unique_id
     main:
         software_versions = get_versions(unique_id)
         workflow_params = get_params(unique_id)

From 9726dadc1948b147ceefff041878f9aff8c549d5 Mon Sep 17 00:00:00 2001
From: Rachel Colquhoun
Date: Tue, 20 Jun 2023 12:23:09 +0100
Subject: [PATCH 43/77] remove workflow - it's duplicated exactly in main.nf

---
 subworkflows/ingest.nf | 28 ----------------------------
 1 file changed, 28 deletions(-)

diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf
index 1e21a35..ebcc2f7 100644
--- a/subworkflows/ingest.nf
+++ b/subworkflows/ingest.nf
@@ -17,31 +17,3 @@ workflow ingest {
         html_report = kraken_pipeline.out.report
 }
-
-// workflow {
-//     // check input fastq exists and run fastp
-
-//     if (params.paired) {
-//         Channel.of(file(file(params.fastq1, type: "file", checkIfExists:true)))
-//                 .set {input_fastq_1_ch}
-
-//         Channel.of(file(file(params.fastq2, type: "file", checkIfExists:true)))
-//                 .set {input_fastq_2_ch}
-
-//         // input_fastq_1 = file(params.fastq1, type: "file", checkIfExists:true)
-//         // input_fastq_2 = file(params.fastq2, type: "file", checkIfExists:true)
-//         fastp_paired(params.unique_id, input_fastq_1_ch, input_fastq_2_ch)
-//         fastp_paired.out.processed_fastq
-//                 .set {processed_fastq}
-//     } else {
-//         Channel.of(file(file(params.fastq, type: "file", checkIfExists:true)))
-//                 .set {input_fastq_ch}
-
-//         // input_fastq = file(params.fastq, type: "file", checkIfExists:true)
-//         fastp_single(params.unique_id, input_fastq_ch)
-//         fastp_single.out.processed_fastq
-//                 .set {processed_fastq}
-//     }
-
-//     ingest(params.unique_id, processed_fastq)
-// }
\ No newline at end of file

From 770d30b7fcc4f4040ac2b533e348898d696657d6 Mon Sep 17 00:00:00 2001
From: Biowilko
Date: Wed, 21 Jun 2023 09:53:53 +0100
Subject: [PATCH 44/77] fastp -> preprocess, new pair concat method

---
 bin/concatenate_reads.py | 167 +++++++++++++++++++++++++++++++++++++++
 main.nf                  |   9 ++-
 modules/extract_taxa.nf  |  78 ++++++++++++------
 modules/fastp.nf         |  83 -------------------
 modules/preprocess.nf    |  94 ++++++++++++++++++++++
 5 files changed, 322 insertions(+), 109 deletions(-)
 create mode 100644 bin/concatenate_reads.py
 delete mode 100644 modules/fastp.nf
 create mode 100644 modules/preprocess.nf

diff --git a/bin/concatenate_reads.py b/bin/concatenate_reads.py
new file mode 100644
index 0000000..4d91af7
--- /dev/null
+++ b/bin/concatenate_reads.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python
+"""
+No dependency Python script for joining two paired end FASTQ files.
+Interleaves reads by default, or concatenates each read pair with a separator ("|") via the
+--no-interleave option. Auto-detects gzip'd files, offers header checking via a --strict flag,
+and supports output to STDOUT, or to a gzip'd (--gzip flag) or uncompressed FASTQ file.
+"""
+import argparse
+import gzip
+import sys
+
+
+def enforce_headers(f1_header, f2_header):
+    if f1_header[0] != "@" or f2_header[0] != "@":
+        raise Exception("Invalid input FASTQ files.")
+    if f1_header.strip().split(" ")[0] != f2_header.strip().split(" ")[0]:
+        raise Exception(
+            "Input FASTQ files do not share headers. " "Check and re-run w/o --strict."
+        )
+
+
+def join_w_separator(f1, f2, f_out, strict=False, sep="|"):
+    readlines = True
+    ix = 0
+
+    # Use '|', which is a valid character in Phred quality strings.
+    # http://en.wikipedia.org/wiki/FASTQ_format
+    phred_sep = "|" * len(sep)
+
+    while readlines:
+        f1_line = f1.readline()
+        f2_line = f2.readline()
+
+        if f1_line == "":
+            readlines = False
+            f1.close()
+            f2.close()
+            f_out.close()
+            if f2_line != "":
+                raise Exception("Input FASTQ files do not match in length.")
+            continue
+
+        if ix % 4 == 0:  # Header
+            if strict:  # Fail if they don't match up to the first whitespace
+                enforce_headers(f1_line, f2_line)
+
+            # Write the header out
+            f_out.write(f1_line)
+
+        elif (ix - 1) % 4 == 0:  # Sequence
+            f_out.write(f1_line.strip() + sep + f2_line)
+        elif (ix - 2) % 4 == 0:  # Separator
+            f_out.write("+\n")
+        else:
+            f_out.write(f1_line.strip() + phred_sep + f2_line)
+
+        ix += 1
+
+
+def join_interleaved(f1, f2, f_out, strict=False):
+    readlines = True
+    ix = 0
+    f1_lines = []
+    f2_lines = []
+
+    def flush_buffer(f_out, f1_lines, f2_lines):
+        if f1_lines and f2_lines:
+            assert len(f1_lines) == 4
+            assert len(f2_lines) == 4
+            f_out.write(f1_lines[0])
+            f_out.write(f1_lines[1])
+            f_out.write(f1_lines[2])
+            f_out.write(f1_lines[3])
+            f_out.write(f2_lines[0])
+            f_out.write(f2_lines[1])
+            f_out.write(f2_lines[2])
+            f_out.write(f2_lines[3])
+
+            f1_lines = []
+            f2_lines = []
+
+        return f1_lines, f2_lines
+
+    while readlines:
+        f1_line = f1.readline()
+        f2_line = f2.readline()
+
+        if f1_line == "":
+            readlines = False
+            if f2_line != "":
+                raise Exception("Input FASTQ files do not match in length.")
+            break
+
+        if ix % 4 == 0:  # Header #1
+            if strict:
+                enforce_headers(f1_line, f2_line)
+
+            f1_lines, f2_lines = flush_buffer(f_out, f1_lines, f2_lines)
+
+        # Fill buffer up to 4 lines
+        f1_lines.append(f1_line)
+        f2_lines.append(f2_line)
+        ix += 1
+
+    _, _ = flush_buffer(f_out, f1_lines, f2_lines)
+    f1.close()
+    f2.close()
+    f_out.close()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description=(
+            "Join two FASTQ paired end read files. Defaults to interleaving reads. "
+            "Note: Requires input files to be sorted."
+        )
+    )
+    parser.add_argument("fastq1", help="First input FASTQ.")
+    parser.add_argument("fastq2", help="Second input FASTQ.")
+    parser.add_argument(
+        "output_fastq",
+        nargs="?",
+        help=("Output FASTQ file name (optional; " "streams to STDOUT if missing)."),
+    )
+    parser.add_argument(
+        "--sep",
+        default="|",
+        help=("Optional separator to override default (|)."),
+    )
+    parser.add_argument(
+        "--no-interleave",
+        action="store_true",
+        help="Concatenate the two reads using a separator (--sep) rather than "
+        "interleaving them (default).",
+    )
+    parser.add_argument(
+        "--strict",
+        action="store_true",
+        help=(
+            "Enforce that the headers of the input "
+            "FASTQ files match until the first whitespace "
+            "delimiter (e.g., a space)." 
+ ), + ) + parser.add_argument("--gzip", action="store_true", help=("Gzip output_fastq.")) + args = parser.parse_args() + + if ".gz" in args.fastq1 or ".gzip" in args.fastq1: + f1 = gzip.open(args.fastq1, mode="r") + else: + f1 = open(args.fastq1, mode="r") + if ".gz" in args.fastq2 or ".gzip" in args.fastq2: + f2 = gzip.open(args.fastq2, mode="r") + else: + f2 = open(args.fastq2, mode="r") + + if args.output_fastq is None: + f_out = sys.stdout + elif args.gzip: + f_out = gzip.open(args.output_fastq, mode="w") + else: + f_out = open(args.output_fastq, mode="w") + + if not args.no_interleave: + join_interleaved(f1, f2, f_out, strict=args.strict) + else: + join_w_separator(f1, f2, f_out, strict=args.strict, sep=args.sep) diff --git a/main.nf b/main.nf index c05d18a..a132eb8 100644 --- a/main.nf +++ b/main.nf @@ -1,5 +1,5 @@ include { ingest } from './subworkflows/ingest' -include { fastp_single; fastp_paired} from './modules/fastp' +include { fastp_single; fastp_paired; paired_concatenate } from './modules/preprocess' workflow { unique_id = "${params.unique_id}" @@ -13,14 +13,17 @@ workflow { Channel.of(file(params.fastq2, type: "file", checkIfExists:true)) .set {input_fastq_2_ch} - fastp_paired(unique_id, unique_id, input_fastq_1_ch, input_fastq_2_ch) + fastp_paired(unique_id, input_fastq_1_ch, input_fastq_2_ch) + + + fastp_paired.out.processed_fastq .set {processed_fastq} } else { Channel.of(file(params.fastq, type: "file", checkIfExists:true)) .set {input_fastq_ch} - fastp_single(unique_id, unique_id, input_fastq_ch) + fastp_single(unique_id, input_fastq_ch) fastp_single.out.processed_fastq .set {processed_fastq} } diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf index 996be81..26019f5 100644 --- a/modules/extract_taxa.nf +++ b/modules/extract_taxa.nf @@ -20,7 +20,8 @@ process extract_reads { input: val unique_id - path fastq + path fastq1 + path fastq2 path kraken_assignments path kraken_report path bracken_report @@ -28,24 +29,46 @@ process extract_reads { path "reads.*.f*.gz", emit: reads path "reads_summary.json", emit: summary script: - """ - extract_kraken_reads.py \ - -s ${fastq} \ - -k ${kraken_assignments} \ - -r ${kraken_report} \ - -b ${bracken_report} \ - -p reads \ - --include_children \ - --max_human ${params.max_human_reads_before_rejection} \ - --min_count_descendants ${params.assembly_min_reads} \ - --rank ${params.assembly_rank} \ - --min_percent ${params.assembly_min_percent} - - for f in \$(ls reads.*.f*) - do - gzip \$f - done - """ + if (params.paired) { + """ + extract_kraken_reads.py \ + -s1 ${fastq1} \ + -s2 ${fastq2} \ + -k ${kraken_assignments} \ + -r ${kraken_report} \ + -b ${bracken_report} \ + -p reads \ + --include_children \ + --max_human ${params.max_human_reads_before_rejection} \ + --min_count_descendants ${params.assembly_min_reads} \ + --rank ${params.assembly_rank} \ + --min_percent ${params.assembly_min_percent} + + for f in \$(ls reads.*.f*) + do + gzip \$f + done + """ + } else { + """ + extract_kraken_reads.py \ + -s ${fastq1} \ + -k ${kraken_assignments} \ + -r ${kraken_report} \ + -b ${bracken_report} \ + -p reads \ + --include_children \ + --max_human ${params.max_human_reads_before_rejection} \ + --min_count_descendants ${params.assembly_min_reads} \ + --rank ${params.assembly_rank} \ + --min_percent ${params.assembly_min_percent} + + for f in \$(ls reads.*.f*) + do + gzip \$f + done + """ + } } @@ -53,16 +76,25 @@ process extract_reads { workflow extract_taxa { take: unique_id - fastq + fastq_1 + fastq_2 kraken_assignments 
kraken_report bracken_report main: - extract_reads(unique_id, fastq, kraken_assignments, kraken_report, bracken_report) + extract_reads(unique_id, fastq_1, fastq_2, kraken_assignments, kraken_report, bracken_report) } workflow { - fastq = file("${params.fastq}", type: "file", checkIfExists:true) + + if (params.paired) { + fastq_1 = file(params.fastq1, type: "file", checkIfExists:true) + + fastq_2 = file(params.fastq2, type: "file", checkIfExists:true) + } else { + fastq_1 = file(params.fastq, type: "file", checkIfExists:true) + fastq_2 = None + } unique_id = "${params.unique_id}" if ("${params.unique_id}" == "null") { @@ -73,5 +105,5 @@ workflow { bracken_report = file("${params.bracken_report}") kraken_report = file("${params.kraken_report}") - extract_taxa(unique_id, fastq, kraken_assignments, kraken_report, bracken_report) + extract_taxa(unique_id, fastq_1, fastq_2, kraken_assignments, kraken_report, bracken_report) } \ No newline at end of file diff --git a/modules/fastp.nf b/modules/fastp.nf deleted file mode 100644 index 4faf898..0000000 --- a/modules/fastp.nf +++ /dev/null @@ -1,83 +0,0 @@ -process fastp_paired { - - label 'process_medium' - - publishDir "${params.out_dir}/${unique_id}/preprocess/", mode: 'copy' - - container "biowilko/scylla@${params.wf.container_sha}" - - input: - val unique_id - val prefix - path fastq_1 - path fastq_2 - - output: - val prefix - path "${prefix}_1.fastp.fastq.gz", optional: true - path "${prefix}_2.fastp.fastq.gz", optional: true - path "${prefix}.fastp.fastq.gz", emit: processed_fastq - path "${prefix}.fastp.json" - - script: - """ - fastp \\ - --in1 ${fastq_1} \\ - --in2 ${fastq_2} \\ - --out1 ${prefix}_1.fastp.fastq \\ - --out2 ${prefix}_2.fastp.fastq \\ - --merged_out ${prefix}.fastp.fastq \\ - -m \\ - --json ${prefix}.fastp.json \\ - --thread $task.cpus \\ - 2> ${prefix}.fastp.log - - if [ -s ${prefix}_1.fastp.fastq ]; then - bgzip --threads $task.cpus -c ${prefix}_1.fastp.fastq > ${prefix}_1.fastp.fastq.gz - fi - - if [ -s ${prefix}_2.fastp.fastq ]; then - bgzip --threads $task.cpus -c ${prefix}_2.fastp.fastq > ${prefix}_2.fastp.fastq.gz - fi - - if [ -s ${prefix}.fastp.fastq ]; then - bgzip --threads $task.cpus -c ${prefix}.fastp.fastq > ${prefix}.fastp.fastq.gz - fi - - """ - -} - -process fastp_single { - - label 'process_medium' - - publishDir "${params.out_dir}/${unique_id}/preprocess/", mode: 'copy' - - container "biowilko/scylla@${params.wf.container_sha}" - - input: - val unique_id - val prefix - path fastq - - output: - val prefix - path "${prefix}.fastp.fastq.gz", emit: processed_fastq - path "${prefix}.fastp.json" - - script: - - """ - fastp \\ - --in1 ${fastq} \\ - --out1 ${prefix}.fastp.fastq \\ - --json ${prefix}.fastp.json \\ - --thread $task.cpus \\ - 2> ${prefix}.fastp.log - - if [ -s ${prefix}.fastp.fastq ]; then - bgzip --threads $task.cpus -c ${prefix}.fastp.fastq > ${prefix}.fastp.fastq.gz - fi - """ -} \ No newline at end of file diff --git a/modules/preprocess.nf b/modules/preprocess.nf new file mode 100644 index 0000000..9e68dba --- /dev/null +++ b/modules/preprocess.nf @@ -0,0 +1,94 @@ +process fastp_paired { + + label 'process_medium' + + publishDir "${params.out_dir}/${unique_id}/preprocess/", mode: 'copy' + + container "biowilko/scylla@${params.wf.container_sha}" + + input: + val unique_id + path fastq_1 + path fastq_2 + + output: + val unique_id + path "${unique_id}_1.fastp.fastq", emit: processed_fastq_1 + path "${unique_id}_2.fastp.fastq", emit: processed_fastq_2 + path "${unique_id}.fastp.json" + + script: + 
""" + fastp \\ + --in1 ${fastq_1} \\ + --in2 ${fastq_2} \\ + --out1 ${unique_id}_1.fastp.fastq \\ + --out2 ${unique_id}_2.fastp.fastq \\ + --json ${unique_id}.fastp.json \\ + --thread $task.cpus \\ + 2> ${unique_id}.fastp.log + + """ + +} + +process fastp_single { + + label 'process_medium' + + publishDir "${params.out_dir}/${unique_id}/preprocess/", mode: 'copy' + + container "biowilko/scylla@${params.wf.container_sha}" + + input: + val unique_id + path fastq + + output: + val unique_id + path "${unique_id}.fastp.fastq.gz", emit: processed_fastq + path "${unique_id}.fastp.json" + + script: + + """ + fastp \\ + --in1 ${fastq} \\ + --out1 ${unique_id}.fastp.fastq \\ + --json ${unique_id}.fastp.json \\ + --thread $task.cpus \\ + 2> ${unique_id}.fastp.log + + if [ -s ${unique_id}.fastp.fastq ]; then + bgzip --threads $task.cpus -c ${unique_id}.fastp.fastq > ${unique_id}.fastp.fastq.gz + fi + """ +} + +process paired_concatenate { + + label 'process_low' + + publishDir "${params.out_dir}/${unique_id}/preprocess/", mode: 'copy' + + container "biowilko/scylla@${params.wf.container_sha}" + + input: + val unique_id + path processed_fastq_1 + path processed_fastq_2 + + output: + val unique_id + path "${unique_id}.fastp.fastq.gz", emit: concatented_fastq + + script: + """ + concatenate_reads.py --no-interleave --strict \\ + ${processed_fastq_1} ${processed_fastq_2} \\ + > ${unique_id}.concatenated.fastq + + + bgzip --threads $task.cpus -c ${unique_id}.concatenated.fastq > ${unique_id}.fastp.fastq.gz + """ +} \ No newline at end of file From a059471346a768e3caf4467f5390f0119e50a37a Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 10:04:09 +0100 Subject: [PATCH 45/77] Fix paired concatenate --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index a132eb8..f36503b 100644 --- a/main.nf +++ b/main.nf @@ -15,9 +15,9 @@ workflow { fastp_paired(unique_id, input_fastq_1_ch, input_fastq_2_ch) + paired_concatenate(unique_id, fastp_paired.out.processed_fastq_1, fastp_paired.out.processed_fastq_2) - - fastp_paired.out.processed_fastq + paired_concatenate.out.concatenated_fastq .set {processed_fastq} } else { Channel.of(file(params.fastq, type: "file", checkIfExists:true)) From 71f8a2067fda939056f856c1753dcb0fb3af54b8 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 10:05:59 +0100 Subject: [PATCH 46/77] fix typo --- modules/preprocess.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/preprocess.nf b/modules/preprocess.nf index 9e68dba..da3eacf 100644 --- a/modules/preprocess.nf +++ b/modules/preprocess.nf @@ -80,7 +80,7 @@ process paired_concatenate { output: val unique_id - path "${unique_id}.fastp.fastq.gz", emit: concatented_fastq + path "${unique_id}.fastp.fastq.gz", emit: concatenated_fastq script: """ From 6debe4c7f5f9b276e46873239501801aae646e5a Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 10:12:36 +0100 Subject: [PATCH 47/77] extract taxa workflow rejigger --- subworkflows/ingest.nf | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf index ebcc2f7..3ef74ac 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -12,7 +12,17 @@ workflow ingest { // get_params_and_versions() //clean_metadata(metadata) kraken_pipeline(unique_id, fastq) - extract_taxa(unique_id, fastq, kraken_pipeline.out.kraken_assignments, kraken_pipeline.out.kraken_report, kraken_pipeline.out.bracken_report) + + if (params.paired) { + 
fastq_1 = file(params.fastq1, type: "file", checkIfExists:true) + + fastq_2 = file(params.fastq2, type: "file", checkIfExists:true) + } else { + fastq_1 = file(params.fastq, type: "file", checkIfExists:true) + fastq_2 = None + } + + extract_taxa(unique_id, fastq_1, fastq_2, kraken_pipeline.out.kraken_assignments, kraken_pipeline.out.kraken_report, kraken_pipeline.out.bracken_report) emit: html_report = kraken_pipeline.out.report From 8922f55e0f3fb20d34b336ed6897368115117482 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 10:18:19 +0100 Subject: [PATCH 48/77] make concat executable --- bin/concatenate_reads.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 bin/concatenate_reads.py diff --git a/bin/concatenate_reads.py b/bin/concatenate_reads.py old mode 100644 new mode 100755 From 494ddb1f415379fdac75bdbcc43471f347ab149b Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 10:24:07 +0100 Subject: [PATCH 49/77] proper concatenated.fastq bgzip --- modules/preprocess.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/preprocess.nf b/modules/preprocess.nf index da3eacf..9bdc480 100644 --- a/modules/preprocess.nf +++ b/modules/preprocess.nf @@ -80,7 +80,7 @@ process paired_concatenate { output: val unique_id - path "${unique_id}.fastp.fastq.gz", emit: concatenated_fastq + path "${unique_id}.concatenated.fastq.gz", emit: concatenated_fastq script: """ @@ -89,6 +89,6 @@ process paired_concatenate { > ${unique_id}.concatenated.fastq - bgzip --threads $task.cpus -c ${unique_id}.concatenated.fastq > ${unique_id}.fastp.fastq.gz + bgzip --threads $task.cpus -c ${unique_id}.concatenated.fastq > ${unique_id}.concatenated.fastq.gz """ } \ No newline at end of file From dbaa0ea968349fc0c396e872834078a3ba82e166 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 10:35:20 +0100 Subject: [PATCH 50/77] Move around extract reads (sorry) --- main.nf | 24 +----------------------- modules/preprocess.nf | 13 ++++++++++--- subworkflows/ingest.nf | 36 +++++++++++++++++++++++++++++------- 3 files changed, 40 insertions(+), 33 deletions(-) diff --git a/main.nf b/main.nf index f36503b..271d457 100644 --- a/main.nf +++ b/main.nf @@ -5,28 +5,6 @@ workflow { unique_id = "${params.unique_id}" // check input fastq exists and run fastp - - if (params.paired) { - Channel.of(file(params.fastq1, type: "file", checkIfExists:true)) - .set {input_fastq_1_ch} - Channel.of(file(params.fastq2, type: "file", checkIfExists:true)) - .set {input_fastq_2_ch} - - fastp_paired(unique_id, input_fastq_1_ch, input_fastq_2_ch) - - paired_concatenate(unique_id, fastp_paired.out.processed_fastq_1, fastp_paired.out.processed_fastq_2) - - paired_concatenate.out.concatenated_fastq - .set {processed_fastq} - } else { - Channel.of(file(params.fastq, type: "file", checkIfExists:true)) - .set {input_fastq_ch} - - fastp_single(unique_id, input_fastq_ch) - fastp_single.out.processed_fastq - .set {processed_fastq} - } - - ingest(params.unique_id, processed_fastq) + ingest(params.unique_id) } \ No newline at end of file diff --git a/modules/preprocess.nf b/modules/preprocess.nf index 9bdc480..3349da9 100644 --- a/modules/preprocess.nf +++ b/modules/preprocess.nf @@ -13,8 +13,8 @@ process fastp_paired { output: val unique_id - path "${unique_id}_1.fastp.fastq", emit: processed_fastq_1 - path "${unique_id}_2.fastp.fastq", emit: processed_fastq_2 + path "${unique_id}_1.fastp.fastq.gz", emit: processed_fastq_1 + path "${unique_id}_2.fastp.fastq.gz", emit: 
processed_fastq_2 path "${unique_id}.fastp.json" script: @@ -27,7 +27,14 @@ process fastp_paired { --json ${unique_id}.fastp.json \\ --thread $task.cpus \\ 2> ${unique_id}.fastp.log - + + if [ -s ${unique_id}_1.fastp.fastq ]; then + bgzip --threads $task.cpus -c ${unique_id}_1.fastp.fastq > ${unique_id}_1.fastp.fastq.gz + fi + + if [ -s ${unique_id}_2.fastp.fastq ]; then + bgzip --threads $task.cpus -c ${unique_id}_2.fastp.fastq > ${unique_id}_2.fastp.fastq.gz + fi """ } diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf index 3ef74ac..6f9d181 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -1,28 +1,50 @@ // workflow to run kraken, check for human, run qc checks and generate html report for a single sample fastq include { get_params_and_versions } from '../modules/get_params_and_versions' include { kraken_pipeline } from '../subworkflows/kraken_pipeline' -include { extract_taxa } from '../modules/extract_taxa' +include { extract_reads } from '../modules/extract_taxa' workflow ingest { take: unique_id - fastq //metadata main: + + if (params.paired) { + Channel.of(file(params.fastq1, type: "file", checkIfExists:true)) + .set {input_fastq_1_ch} + + Channel.of(file(params.fastq2, type: "file", checkIfExists:true)) + .set {input_fastq_2_ch} + + fastp_paired(unique_id, input_fastq_1_ch, input_fastq_2_ch) + + paired_concatenate(unique_id, fastp_paired.out.processed_fastq_1, fastp_paired.out.processed_fastq_2) + + paired_concatenate.out.concatenated_fastq + .set {processed_fastq} + } else { + Channel.of(file(params.fastq, type: "file", checkIfExists:true)) + .set {input_fastq_ch} + + fastp_single(unique_id, input_fastq_ch) + fastp_single.out.processed_fastq + .set {processed_fastq} + } + // get_params_and_versions() //clean_metadata(metadata) - kraken_pipeline(unique_id, fastq) + kraken_pipeline(unique_id, processed_fastq) if (params.paired) { - fastq_1 = file(params.fastq1, type: "file", checkIfExists:true) + fastq_1 = fastp_paired.out.processed_fastq_1 - fastq_2 = file(params.fastq2, type: "file", checkIfExists:true) + fastq_2 = fastp_paired.out.processed_fastq_2 } else { - fastq_1 = file(params.fastq, type: "file", checkIfExists:true) + fastq_1 = proccessed_fastq fastq_2 = None } - extract_taxa(unique_id, fastq_1, fastq_2, kraken_pipeline.out.kraken_assignments, kraken_pipeline.out.kraken_report, kraken_pipeline.out.bracken_report) + extract_reads(unique_id, fastq_1, fastq_2, kraken_pipeline.out.kraken_assignments, kraken_pipeline.out.kraken_report, kraken_pipeline.out.bracken_report) emit: html_report = kraken_pipeline.out.report From 4ee6c148e60f58f5e8ed8fe1f0ba839941ce6b9d Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 10:40:32 +0100 Subject: [PATCH 51/77] import preprocess in ingest --- subworkflows/ingest.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf index 6f9d181..d2eccad 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -2,6 +2,7 @@ include { get_params_and_versions } from '../modules/get_params_and_versions' include { kraken_pipeline } from '../subworkflows/kraken_pipeline' include { extract_reads } from '../modules/extract_taxa' +include { fastp_single; fastp_paired; paired_concatenate } from './modules/preprocess' workflow ingest { take: From 593e82c7fcedb919d2cda5e5321c2ae60def184c Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 10:44:07 +0100 Subject: [PATCH 52/77] proper preprocess include statement --- main.nf | 1 - 1 file changed, 1 deletion(-) 
diff --git a/main.nf b/main.nf index 271d457..37c4bdd 100644 --- a/main.nf +++ b/main.nf @@ -1,5 +1,4 @@ include { ingest } from './subworkflows/ingest' -include { fastp_single; fastp_paired; paired_concatenate } from './modules/preprocess' workflow { unique_id = "${params.unique_id}" From f67b115adb8a580f70cf4e25bc5c26033c3bcc4f Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 10:45:03 +0100 Subject: [PATCH 53/77] proper preprocess include statement --- subworkflows/ingest.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf index d2eccad..1027004 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -2,7 +2,7 @@ include { get_params_and_versions } from '../modules/get_params_and_versions' include { kraken_pipeline } from '../subworkflows/kraken_pipeline' include { extract_reads } from '../modules/extract_taxa' -include { fastp_single; fastp_paired; paired_concatenate } from './modules/preprocess' +include { fastp_single; fastp_paired; paired_concatenate } from '../modules/preprocess' workflow ingest { take: From 272c6ced26be2035e1f5062348590cb69f231653 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 11:04:01 +0100 Subject: [PATCH 54/77] use bio.bgzf not gzip --- bin/concatenate_reads.py | 8 ++++---- modules/preprocess.nf | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/bin/concatenate_reads.py b/bin/concatenate_reads.py index 4d91af7..56890c9 100755 --- a/bin/concatenate_reads.py +++ b/bin/concatenate_reads.py @@ -6,7 +6,7 @@ and supports output to STDOUT a gzip'd FASTQ or an uncompressed FASTQ (--uncompressed flag). """ import argparse -import gzip +from Bio import bgzf import sys @@ -146,18 +146,18 @@ def flush_buffer(f_out, f1_lines, f2_lines): args = parser.parse_args() if ".gz" in args.fastq1 or ".gzip" in args.fastq1: - f1 = gzip.open(args.fastq1, mode="r") + f1 = bgzf.open(args.fastq1, mode="r") else: f1 = open(args.fastq1, mode="r") if ".gz" in args.fastq2 or ".gzip" in args.fastq2: - f2 = gzip.open(args.fastq2, mode="r") + f2 = bgzf.open(args.fastq2, mode="r") else: f2 = open(args.fastq2, mode="r") if args.output_fastq is None: f_out = sys.stdout elif args.gzip: - f_out = gzip.open(args.output_fastq, mode="w") + f_out = bgzf.open(args.output_fastq, mode="w") else: f_out = open(args.output_fastq, mode="w") diff --git a/modules/preprocess.nf b/modules/preprocess.nf index 3349da9..6fa67e5 100644 --- a/modules/preprocess.nf +++ b/modules/preprocess.nf @@ -34,7 +34,7 @@ process fastp_paired { if [ -s ${unique_id}_2.fastp.fastq ]; then bgzip --threads $task.cpus -c ${unique_id}_2.fastp.fastq > ${unique_id}_2.fastp.fastq.gz - fi + fi """ } From 465334de1a12aebc59d4aa789a60cd7739661a3c Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Wed, 21 Jun 2023 11:18:04 +0100 Subject: [PATCH 55/77] typo --- subworkflows/ingest.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf index 1027004..49c1040 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -41,7 +41,7 @@ workflow ingest { fastq_2 = fastp_paired.out.processed_fastq_2 } else { - fastq_1 = proccessed_fastq + fastq_1 = processed_fastq fastq_2 = None } From 12e36d24d3597eaf21df86eff9eb0ecb19206561 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Wed, 21 Jun 2023 11:18:53 +0100 Subject: [PATCH 56/77] try null --- subworkflows/ingest.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/ingest.nf 
b/subworkflows/ingest.nf index 49c1040..7957350 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -42,7 +42,7 @@ workflow ingest { fastq_2 = fastp_paired.out.processed_fastq_2 } else { fastq_1 = processed_fastq - fastq_2 = None + fastq_2 = null } extract_reads(unique_id, fastq_1, fastq_2, kraken_pipeline.out.kraken_assignments, kraken_pipeline.out.kraken_report, kraken_pipeline.out.bracken_report) From 41f96d896316db78a42bfde35fb28215afc4e36f Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Wed, 21 Jun 2023 11:25:14 +0100 Subject: [PATCH 57/77] split if else for paired --- modules/extract_taxa.nf | 47 ++++++++++++++++++++++++++++------------- subworkflows/ingest.nf | 8 +++---- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf index 26019f5..5e9e8f5 100644 --- a/modules/extract_taxa.nf +++ b/modules/extract_taxa.nf @@ -7,7 +7,7 @@ // probably want to count up how many have been found here for run log // ALSO this step will currently "fail" with exitcode 2 if the number of human reads found exceeds the number specified // in config so could be good dehuman sanity check -process extract_reads { +process extract_paired_reads { label 'process_medium' @@ -29,7 +29,6 @@ process extract_reads { path "reads.*.f*.gz", emit: reads path "reads_summary.json", emit: summary script: - if (params.paired) { """ extract_kraken_reads.py \ -s1 ${fastq1} \ @@ -49,7 +48,29 @@ process extract_reads { gzip \$f done """ - } else { +} + +process extract_reads { + + label 'process_medium' + + publishDir path: "${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy' + + conda 'bioconda::biopython=1.78' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/biopython%3A1.76' : + 'biocontainers/biopython@sha256:b0204cf662a3d858f6c28627124b83ed6f564e2b156b8788092f2dd9256c9290' }" + + input: + val unique_id + path fastq + path kraken_assignments + path kraken_report + path bracken_report + output: + path "reads.*.f*.gz", emit: reads + path "reads_summary.json", emit: summary + script: """ extract_kraken_reads.py \ -s ${fastq1} \ @@ -68,7 +89,6 @@ process extract_reads { gzip \$f done """ - } } @@ -86,16 +106,6 @@ workflow extract_taxa { } workflow { - - if (params.paired) { - fastq_1 = file(params.fastq1, type: "file", checkIfExists:true) - - fastq_2 = file(params.fastq2, type: "file", checkIfExists:true) - } else { - fastq_1 = file(params.fastq, type: "file", checkIfExists:true) - fastq_2 = None - } - unique_id = "${params.unique_id}" if ("${params.unique_id}" == "null") { unique_id = "${fastq.simpleName}" @@ -105,5 +115,12 @@ workflow { bracken_report = file("${params.bracken_report}") kraken_report = file("${params.kraken_report}") - extract_taxa(unique_id, fastq_1, fastq_2, kraken_assignments, kraken_report, bracken_report) + if (params.paired) { + fastq_1 = file(params.fastq1, type: "file", checkIfExists:true) + fastq_2 = file(params.fastq2, type: "file", checkIfExists:true) + extract_paired_reads(unique_id, fastq_1, fastq_2, kraken_assignments, kraken_report, bracken_report) + } else { + fastq = file(params.fastq, type: "file", checkIfExists:true) + extract_reads(unique_id, fastq, kraken_assignments, kraken_report, bracken_report) + } } \ No newline at end of file diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf index 7957350..6a95af7 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -38,14 +38,14 @@ workflow ingest { if (params.paired) { fastq_1 = fastp_paired.out.processed_fastq_1 - fastq_2 = fastp_paired.out.processed_fastq_2 + extract_paired_reads(unique_id, fastq_1, fastq_2, kraken_pipeline.out.kraken_assignments, kraken_pipeline.out.kraken_report, kraken_pipeline.out.bracken_report) + } else { - fastq_1 = processed_fastq - fastq_2 = null + fastq = processed_fastq + extract_reads(unique_id, fastq, kraken_pipeline.out.kraken_assignments, kraken_pipeline.out.kraken_report, kraken_pipeline.out.bracken_report) } - extract_reads(unique_id, fastq_1, fastq_2, kraken_pipeline.out.kraken_assignments, kraken_pipeline.out.kraken_report, kraken_pipeline.out.bracken_report) emit: html_report = kraken_pipeline.out.report From d515aabbe88ad2c6a4efcbf1db8105a162c520c0 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Wed, 21 Jun 2023 11:25:52 +0100 Subject: [PATCH 58/77] add import --- subworkflows/ingest.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf index 6a95af7..3a2a67c 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -1,7 +1,7 @@ // workflow to run kraken, check for human, run qc checks and generate html report for a single sample fastq include { get_params_and_versions } from '../modules/get_params_and_versions' include { kraken_pipeline } from '../subworkflows/kraken_pipeline' -include { extract_reads } from '../modules/extract_taxa' +include { extract_reads; extract_paired_reads } from '../modules/extract_taxa' include { fastp_single; fastp_paired; paired_concatenate } from '../modules/preprocess' workflow ingest { From 46c545c7b81ade3066516fbb934d53d6d4981d49 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Wed, 21 Jun 2023 11:29:27 +0100 Subject: [PATCH 59/77] 
EXPERIMENTAL: overwrite some params with climb specifics --- conf/climb.config | 14 ++++++++ conf/params.config | 20 +++++++++++ nextflow.config | 4 +++ subworkflows/kraken_pipeline.nf | 62 +++++++++++++++++++-------------- 4 files changed, 74 insertions(+), 26 deletions(-) create mode 100644 conf/climb.config diff --git a/conf/climb.config b/conf/climb.config new file mode 100644 index 0000000..745cafa --- /dev/null +++ b/conf/climb.config @@ -0,0 +1,14 @@ +params { + // Max resource options + // Defaults only, expecting to be overwritten + max_memory = '128.GB' + max_cpus = 16 + max_time = '240.h' + + k2_port = 8080 + k2_host = "10.1.185.58" + + taxonomy = "/shared/public/db/taxonomy/" + database = "/shared/public/db/kraken2/k2_pluspf/" + database_set = "PlusPF" +} \ No newline at end of file diff --git a/conf/params.config b/conf/params.config index 68c3a6a..9ed6464 100644 --- a/conf/params.config +++ b/conf/params.config @@ -31,6 +31,8 @@ params { out_dir = "output" store_dir = "store_dir" + climb = false + unique_id = null fastq = null fastq1 = null @@ -44,6 +46,24 @@ params { bracken_length = null bracken_level = 'S' database_set = "Viral" + database_sets = [ + 'Viral': [ + 'database': 'https://genome-idx.s3.amazonaws.com/kraken/k2_viral_20221209.tar.gz', + 'taxonomy': 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz' + ], + 'EuPath': [ + 'database': 'https://genome-idx.s3.amazonaws.com/kraken/k2_eupathdb48_20201113.tar.gz', + 'taxonomy': 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz' + ], + 'PlusPF-8': [ + 'database': 'https://genome-idx.s3.amazonaws.com/kraken/k2_pluspf_8gb_20210517.tar.gz', + 'taxonomy': 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz' + ], + 'ncbi_16s_18s': [ + 'database': 'https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-metagenomics/ncbi_16s_18s/ncbi_targeted_loci_kraken2.tar.gz', + 'taxonomy': 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz' + ] + ] kraken_report = null bracken_report = null diff --git a/nextflow.config b/nextflow.config index bb304ab..2766db6 100644 --- a/nextflow.config +++ b/nextflow.config @@ -16,6 +16,10 @@ includeConfig 'conf/base.config' includeConfig 'conf/params.config' +if (params.climb) { + includeConfig 'conf/climb.config' +} + manifest { name = 'snowy-leopard/scylla' author = 'Snowy Leopard' diff --git a/subworkflows/kraken_pipeline.nf b/subworkflows/kraken_pipeline.nf index a3ba436..268fa80 100644 --- a/subworkflows/kraken_pipeline.nf +++ b/subworkflows/kraken_pipeline.nf @@ -8,35 +8,45 @@ workflow kraken_pipeline { fastq main: - // Check source param is valid - // sources = params.database_sets - // source_name = params.database_set - // source_data = sources.get(source_name, false) - // if (!sources.containsKey(source_name) || !source_data) { - // keys = sources.keySet() - // throw new Exception("Source $params.source is invalid, must be one of $keys") - // } - // Grab taxonomy files - // taxonomy = file(sources[source_name]["taxonomy"], type: "file") - // if (params.taxonomy) { - // log.info("Checking custom taxonomy mapping exists") - - // } + // Check source param is valid if applicable + if not (params.database and params.taxonomy_dir) { + sources = params.database_sets + source_name = params.database_set + source_data = sources.get(source_name, false) + if (!sources.containsKey(source_name) || !source_data) { + keys = sources.keySet() + throw new Exception("Source $params.source is invalid, must be one of $keys") + } + } - taxonomy = file(params.taxonomy_dir, type: 
"dir", checkIfExists:true) + // Grab taxonomy files + if (params.taxonomy) { + taxonomy = file(params.taxonomy, type: "dir", checkIfExists:true) + } else { + default_taxonomy = source_data.get("taxonomy", false) + if (!default_taxonomy) { + throw new Exception( + "Error: Source $source_name does not include a taxonomy for " + + "use with kraken, please choose another source or" + + "provide a custom taxonomy.") + } + taxonomy = unpackTaxonomy(default_taxonomy) + } - // if (database) { - // source_database = database - // } else { - // source_database = source_data.get("database", false) - // if (!source_database) { - // throw new Exception( - // "Error: Source $source_name does not include a database for " - // + "use with kraken, please choose another source, " - // + "provide a custom database or disable kraken.") - // } - database = file(params.db, type: "dir", checkIfExists:true) + // Grab database files + if (params.database) { + database = file(params.database, type: "dir", checkIfExists:true) + } else { + default_database = source_data.get("database", false) + if (!default_database) { + throw new Exception( + "Error: Source $source_name does not include a database for " + + "use with kraken, please choose another source or " + + "provide a custom database.") + } + database = unpackDatabase(default_database) + } // start_server(database, taxonomy) run_kraken_and_bracken(unique_id, fastq, database, taxonomy) From ebe6083d2920a60e789353fe37a2b68241c8920b Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Wed, 21 Jun 2023 11:32:00 +0100 Subject: [PATCH 60/77] debug --- subworkflows/kraken_pipeline.nf | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/subworkflows/kraken_pipeline.nf b/subworkflows/kraken_pipeline.nf index 268fa80..e6976fb 100644 --- a/subworkflows/kraken_pipeline.nf +++ b/subworkflows/kraken_pipeline.nf @@ -7,10 +7,8 @@ workflow kraken_pipeline { unique_id fastq main: - - // Check source param is valid if applicable - if not (params.database and params.taxonomy_dir) { + if (!params.database and !params.taxonomy_dir) { sources = params.database_sets source_name = params.database_set source_data = sources.get(source_name, false) @@ -34,7 +32,7 @@ workflow kraken_pipeline { taxonomy = unpackTaxonomy(default_taxonomy) } - // Grab database files + // Grab database files if (params.database) { database = file(params.database, type: "dir", checkIfExists:true) } else { From 3ae3b2654ee74d5ca66a3b5e7a98d218b51bd4ab Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Wed, 21 Jun 2023 11:33:54 +0100 Subject: [PATCH 61/77] debug --- subworkflows/kraken_pipeline.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/kraken_pipeline.nf b/subworkflows/kraken_pipeline.nf index e6976fb..28cbf3f 100644 --- a/subworkflows/kraken_pipeline.nf +++ b/subworkflows/kraken_pipeline.nf @@ -8,7 +8,7 @@ workflow kraken_pipeline { fastq main: // Check source param is valid if applicable - if (!params.database and !params.taxonomy_dir) { + if (!params.database and !params.taxonomy) { sources = params.database_sets source_name = params.database_set source_data = sources.get(source_name, false) From efda31a3be77de5ca84fdd5ee4ed04fcca5370cc Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Wed, 21 Jun 2023 11:35:39 +0100 Subject: [PATCH 62/77] debug --- subworkflows/kraken_pipeline.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/subworkflows/kraken_pipeline.nf b/subworkflows/kraken_pipeline.nf index 28cbf3f..aa97ece 100644 --- 
a/subworkflows/kraken_pipeline.nf +++ b/subworkflows/kraken_pipeline.nf @@ -1,4 +1,6 @@ include { run_kraken_and_bracken } from '../modules/kraken_client' +include { unpackDatabase } from '../modules/kraken_server' +include { unpackTaxonomy } from '../modules/kraken_server' include { qc_checks } from '../modules/qc_checks' include { generate_report } from '../modules/generate_report' From afe025253270b6b6d6a522b317851279f311f1bb Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Wed, 21 Jun 2023 11:38:45 +0100 Subject: [PATCH 63/77] typo --- modules/extract_taxa.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf index 5e9e8f5..67187a1 100644 --- a/modules/extract_taxa.nf +++ b/modules/extract_taxa.nf @@ -73,7 +73,7 @@ process extract_reads { script: """ extract_kraken_reads.py \ - -s ${fastq1} \ + -s ${fastq} \ -k ${kraken_assignments} \ -r ${kraken_report} \ -b ${bracken_report} \ From 716191e8c9546148d520bbceb7e24dc341a32d85 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 11:41:17 +0100 Subject: [PATCH 64/77] Properly output paired reads --- bin/extract_kraken_reads.py | 80 +++++++++++++++++++++++++++---------- 1 file changed, 58 insertions(+), 22 deletions(-) diff --git a/bin/extract_kraken_reads.py b/bin/extract_kraken_reads.py index 33bc107..fb14acb 100755 --- a/bin/extract_kraken_reads.py +++ b/bin/extract_kraken_reads.py @@ -264,8 +264,18 @@ def extract_taxa( if key not in keys: keys[key] = [] keys[key].append(taxon) - outfile_handles[taxon] = open("%s.%s.%s" % (prefix, taxon, filetype), "w") - print("opening %s.%s.%s" % (prefix, taxon, filetype)) + if reads2: + outfile_handles[taxon] = { + 1: open("%s.%s_1.%s" % (prefix, taxon, filetype), "w"), + 2: open("%s.%s_2.%s" % (prefix, taxon, filetype), "w"), + } + print( + "opening %s.%s_1.%s and %s.%s_2.%s" + % (prefix, taxon, filetype, prefix, taxon, filetype) + ) + else: + outfile_handles[taxon] = open("%s.%s.%s" % (prefix, taxon, filetype), "w") + print("opening %s.%s.%s" % (prefix, taxon, filetype)) out_counts[taxon] = 0 quals[taxon] = [] lens[taxon] = [] @@ -279,27 +289,53 @@ def extract_taxa( for line in kfile: tax_id, read_id = parse_kraken_assignment_line(line) if tax_id in keys: - if read_id in s_file1: - read = s_file1[read_id] - elif reads2 and read_id in s_file2: - read = s_file2[read_id] - else: - sys.stderr.write( - "ERROR: read id %s not found in read files\n" % read_id - ) - sys.exit(1) + if reads2: + if read_id in s_file1 and read_id in s_file2: + read1 = s_file1[read_id] + read2 = s_file2[read_id] + else: + sys.stderr.write( + "ERROR: read id %s not found in read files\n" % read_id + ) + sys.exit(1) + + for taxon in keys[tax_id]: + SeqIO.write(read1, outfile_handles[taxon][1], filetype) + SeqIO.write(read2, outfile_handles[taxon][2], filetype) + out_counts[taxon] += 2 + quals[taxon].append( + median(read1.letter_annotations["phred_quality"]) + ) + quals[taxon].append( + median(read2.letter_annotations["phred_quality"]) + ) + lens[taxon].append(len(read1)) + lens[taxon].append(len(read2)) - for taxon in keys[tax_id]: - SeqIO.write(read, outfile_handles[taxon], filetype) - out_counts[taxon] += 1 - quals[taxon].append( - median(read.letter_annotations["phred_quality"]) - ) - lens[taxon].append(len(read)) - - for handle in outfile_handles: - if outfile_handles[handle]: - outfile_handles[handle].close() + else: + if read_id in s_file1: + read = s_file1[read_id] + else: + sys.stderr.write( + "ERROR: read id %s not found in read file\n" % 
read_id + ) + sys.exit(1) + + for taxon in keys[tax_id]: + SeqIO.write(read, outfile_handles[taxon], filetype) + out_counts[taxon] += 1 + quals[taxon].append( + median(read.letter_annotations["phred_quality"]) + ) + lens[taxon].append(len(read)) + if reads2: + for handle_dict in outfile_handles: + outfile_handles[handle_dict][1].close() + outfile_handles[handle_dict][2].close() + else: + for handle in outfile_handles: + if outfile_handles[handle]: + outfile_handles[handle].close() summary = [] for taxon in lists_to_extract: From 983becbff49ca5e7a778b1231c72415c031f38f7 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 12:14:05 +0100 Subject: [PATCH 65/77] Comment out extract taxa workflow - unused --- modules/extract_taxa.nf | 62 ++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf index 67187a1..03204c0 100644 --- a/modules/extract_taxa.nf +++ b/modules/extract_taxa.nf @@ -93,34 +93,34 @@ process extract_reads { -workflow extract_taxa { - take: - unique_id - fastq_1 - fastq_2 - kraken_assignments - kraken_report - bracken_report - main: - extract_reads(unique_id, fastq_1, fastq_2, kraken_assignments, kraken_report, bracken_report) -} - -workflow { - unique_id = "${params.unique_id}" - if ("${params.unique_id}" == "null") { - unique_id = "${fastq.simpleName}" - } - - kraken_assignments = file("${params.kraken_assignments}") - bracken_report = file("${params.bracken_report}") - kraken_report = file("${params.kraken_report}") - - if (params.paired) { - fastq_1 = file(params.fastq1, type: "file", checkIfExists:true) - fastq_2 = file(params.fastq2, type: "file", checkIfExists:true) - extract_paired_reads(unique_id, fastq_1, fastq_2, kraken_assignments, kraken_report, bracken_report) - } else { - fastq = file(params.fastq, type: "file", checkIfExists:true) - extract_reads(unique_id, fastq, kraken_assignments, kraken_report, bracken_report) - } -} \ No newline at end of file +// workflow extract_taxa { +// take: +// unique_id +// fastq_1 +// fastq_2 +// kraken_assignments +// kraken_report +// bracken_report +// main: +// extract_reads(unique_id, fastq_1, fastq_2, kraken_assignments, kraken_report, bracken_report) +// } + +// workflow { +// unique_id = "${params.unique_id}" +// if ("${params.unique_id}" == "null") { +// unique_id = "${fastq.simpleName}" +// } + +// kraken_assignments = file("${params.kraken_assignments}") +// bracken_report = file("${params.bracken_report}") +// kraken_report = file("${params.kraken_report}") + +// if (params.paired) { +// fastq_1 = file(params.fastq1, type: "file", checkIfExists:true) +// fastq_2 = file(params.fastq2, type: "file", checkIfExists:true) +// extract_paired_reads(unique_id, fastq_1, fastq_2, kraken_assignments, kraken_report, bracken_report) +// } else { +// fastq = file(params.fastq, type: "file", checkIfExists:true) +// extract_reads(unique_id, fastq, kraken_assignments, kraken_report, bracken_report) +// } +// } \ No newline at end of file From 63582b5fbfa02cb65699e19fed13ceb0af1d84eb Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 12:16:21 +0100 Subject: [PATCH 66/77] bgzip extracted taxa fastqs --- modules/extract_taxa.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf index 03204c0..734f00e 100644 --- a/modules/extract_taxa.nf +++ b/modules/extract_taxa.nf @@ -45,7 +45,7 @@ process extract_paired_reads { for f in \$(ls reads.*.f*) do - gzip 
\$f + bgzip --threads $task.cpus \$f done """ } @@ -85,9 +85,9 @@ process extract_reads { --min_percent ${params.assembly_min_percent} for f in \$(ls reads.*.f*) - do - gzip \$f - done + do + bgzip --threads $task.cpus \$f + done """ } From a709b5f279a8c78c3600505442a2cdce571592b3 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 12:31:29 +0100 Subject: [PATCH 67/77] run extract taxa in scylla container --- modules/extract_taxa.nf | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf index 734f00e..337b541 100644 --- a/modules/extract_taxa.nf +++ b/modules/extract_taxa.nf @@ -13,10 +13,8 @@ process extract_paired_reads { publishDir path: "${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy' - conda 'bioconda::biopython=1.78' - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/biopython%3A1.76' : - 'biocontainers/biopython@sha256:b0204cf662a3d858f6c28627124b83ed6f564e2b156b8788092f2dd9256c9290' }" + conda 'bioconda::biopython=1.78 bioconda::tabix=1.11' + container "biowilko/scylla@${params.wf.container_sha}" input: val unique_id @@ -56,11 +54,9 @@ process extract_reads { publishDir path: "${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy' - conda 'bioconda::biopython=1.78' - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/biopython%3A1.76' : - 'biocontainers/biopython@sha256:b0204cf662a3d858f6c28627124b83ed6f564e2b156b8788092f2dd9256c9290' }" - + conda 'bioconda::biopython=1.78 bioconda::tabix=1.11' + container "biowilko/scylla@${params.wf.container_sha}" + input: val unique_id path fastq From c189f717257530afef3c1ab06d699d1bbe5f2dc6 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 12:49:18 +0100 Subject: [PATCH 68/77] Properly output per-run execution reports --- conf/params.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/params.config b/conf/params.config index 9ed6464..7443080 100644 --- a/conf/params.config +++ b/conf/params.config @@ -5,7 +5,7 @@ params { // Boilerplate options outdir = null - tracedir = "${params.outdir}/pipeline_info" + tracedir = "${params.out_dir}/${params.unique_id}/pipeline_info" publish_dir_mode = 'copy' email = null email_on_fail = null From 519523036f212579a30455f0f72170f72bfc5a74 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 12:56:29 +0100 Subject: [PATCH 69/77] extract taxa summary, handle paired fnames --- bin/extract_kraken_reads.py | 46 ++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/bin/extract_kraken_reads.py b/bin/extract_kraken_reads.py index fb14acb..5f7c9d8 100755 --- a/bin/extract_kraken_reads.py +++ b/bin/extract_kraken_reads.py @@ -339,19 +339,39 @@ def extract_taxa( summary = [] for taxon in lists_to_extract: - summary.append( - { - "human_readable": bracken_hierarchy[taxon]["name"], - "taxon": taxon, - "tax_level": bracken_hierarchy[taxon]["rank"], - "filename": "%s.%s.%s" % (prefix, taxon, filetype), - "qc_metrics": { - "num_reads": out_counts[taxon], - "avg_qual": mean(quals[taxon]), - "mean_len": mean(lens[taxon]), - }, - } - ) + if reads2: + summary.append( + { + "human_readable": bracken_hierarchy[taxon]["name"], + "taxon": taxon, + "tax_level": bracken_hierarchy[taxon]["rank"], + "filenames": [ + "%s.%s_1.%s" % (prefix, 
taxon, filetype), + "%s.%s_2.%s" % (prefix, taxon, filetype), + ], + "qc_metrics": { + "num_reads": out_counts[taxon], + "avg_qual": mean(quals[taxon]), + "mean_len": mean(lens[taxon]), + }, + } + ) + else: + summary.append( + { + "human_readable": bracken_hierarchy[taxon]["name"], + "taxon": taxon, + "tax_level": bracken_hierarchy[taxon]["rank"], + "filenames": [ + "%s.%s.%s" % (prefix, taxon, filetype), + ], + "qc_metrics": { + "num_reads": out_counts[taxon], + "avg_qual": mean(quals[taxon]), + "mean_len": mean(lens[taxon]), + }, + } + ) with open("%s_summary.json" % prefix, "w") as f: print(summary) json.dump(summary, f) From 61bf897112ffb9431d4d40c10168527e76311b87 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Wed, 21 Jun 2023 13:05:02 +0100 Subject: [PATCH 70/77] pass through command line parameter --- bin/extract_kraken_reads.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/extract_kraken_reads.py b/bin/extract_kraken_reads.py index 5f7c9d8..7abbac6 100755 --- a/bin/extract_kraken_reads.py +++ b/bin/extract_kraken_reads.py @@ -533,6 +533,7 @@ def main(): args.reads2, args.prefix, max_human=args.max_human, + names = args.taxid, target_ranks=target_ranks, min_count=args.min_count, min_count_descendants=args.min_count_descendants, From 57ab71d85de012fbe935f68ace2450e11261b445 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Wed, 21 Jun 2023 13:07:31 +0100 Subject: [PATCH 71/77] allow multi jsons as per another scylla branch and issue #11 --- bin/make_report.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/bin/make_report.py b/bin/make_report.py index 55f7c1b..586c6df 100755 --- a/bin/make_report.py +++ b/bin/make_report.py @@ -59,7 +59,8 @@ def main(): parser.add_argument( "--assignments", - help="JSON file of kraken/bracken assignments", + help="JSON file(s) of kraken/bracken assignments", + nargs='+', required=True, type=Path, ) @@ -81,8 +82,14 @@ def main(): sample = args.sample_id - with open(args.assignments.resolve(), "rt") as bracken_file: - assignments = bracken_file.read().strip().replace('"', '\\"') + assignments = None + for bracken_file in args.assignments: + with open(bracken_file.resolve(), "rt") as bracken_handle: + contents = bracken_handle.read().strip().replace('"', '\\"') + if not assignments: + assignments = contents + else: + assignments = assignments[:-1] + ", " + contents if args.read_counts: with open(args.read_counts.resolve(), "rt") as qc_file: From 1869c2feac02aeea374852c421df793fd69e290c Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 13:22:26 +0100 Subject: [PATCH 72/77] Up resources available to bgzip taxon fastqs --- modules/extract_taxa.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf index 337b541..e3e8198 100644 --- a/modules/extract_taxa.nf +++ b/modules/extract_taxa.nf @@ -9,7 +9,7 @@ // in config so could be good dehuman sanity check process extract_paired_reads { - label 'process_medium' + label 'process_high' publishDir path: "${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy' @@ -56,7 +56,7 @@ process extract_reads { conda 'bioconda::biopython=1.78 bioconda::tabix=1.11' container "biowilko/scylla@${params.wf.container_sha}" - + input: val unique_id path fastq From 3e017d2fbf064f9a74b98193b079785d04022e5d Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 13:28:19 +0100 Subject: [PATCH 73/77] Re-enable human threshold fail --- bin/extract_kraken_reads.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-)

diff --git a/bin/extract_kraken_reads.py b/bin/extract_kraken_reads.py
index 7abbac6..c8e4b10 100755
--- a/bin/extract_kraken_reads.py
+++ b/bin/extract_kraken_reads.py
@@ -76,7 +76,7 @@ def get_bracken_hierarchy(kraken_file, bracken_file, max_human=None):
                 "ERROR: found %i human reads, max allowed is %i\n"
                 % (num_direct, max_human)
             )
-            # sys.exit(2)
+            sys.exit(2)
             continue
 
         entries[ncbi] = {
@@ -533,7 +533,7 @@ def main():
         args.reads2,
         args.prefix,
         max_human=args.max_human,
-        names = args.taxid,
+        names=args.taxid,
         target_ranks=target_ranks,
         min_count=args.min_count,
         min_count_descendants=args.min_count_descendants,

From 6ac4a84f9c567b020ba25ca32e3863f3739939f1 Mon Sep 17 00:00:00 2001
From: Biowilko
Date: Wed, 21 Jun 2023 13:52:10 +0100
Subject: [PATCH 74/77] increase extract_reads resource again

---
 modules/extract_taxa.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf
index e3e8198..dd6b434 100644
--- a/modules/extract_taxa.nf
+++ b/modules/extract_taxa.nf
@@ -50,7 +50,7 @@ process extract_paired_reads {
 
 process extract_reads {
 
-    label 'process_medium'
+    label 'process_high'
 
     publishDir path: "${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy'

From ae6ef527d4da431444f330a11fe0082d87144eed Mon Sep 17 00:00:00 2001
From: Rachel Colquhoun
Date: Wed, 21 Jun 2023 14:27:26 +0100
Subject: [PATCH 75/77] fix bug in extract_kraken_reads

---
 bin/extract_kraken_reads.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bin/extract_kraken_reads.py b/bin/extract_kraken_reads.py
index 7abbac6..325bdd5 100755
--- a/bin/extract_kraken_reads.py
+++ b/bin/extract_kraken_reads.py
@@ -430,6 +430,7 @@ def main():
         dest="taxid",
         required=False,
         nargs="*",
+        default=[],
         help="List of taxonomy ID[s] or names to extract (space-delimited) - each to their own file",
     )
     parser.add_argument(

From 194a8522efb3d3885945bc9b8f422ebbb284b46c Mon Sep 17 00:00:00 2001
From: Rachel Colquhoun
Date: Wed, 21 Jun 2023 14:27:40 +0100
Subject: [PATCH 76/77] add more dependencies

---
 environment.yml | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/environment.yml b/environment.yml
index 2bc591f..77b0d01 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,4 +1,4 @@
-name: scylla
+name: scylla_dev
 channels:
   - bioconda
   - conda-forge
@@ -15,4 +15,10 @@ dependencies:
   - fastcat
   - biopython
   - flye
-  - megahit
\ No newline at end of file
+  - megahit
+  - pandas
+  - numpy
+  - taxonkit
+  - fastp
+  - tabix
+  - mako

From 0503fd0236b04d043748a6f27ccad087f309b458 Mon Sep 17 00:00:00 2001
From: Biowilko
Date: Wed, 21 Jun 2023 14:44:24 +0100
Subject: [PATCH 77/77] ignore extract taxa fails with exit code 2

---
 modules/extract_taxa.nf | 4 ++++
 nextflow.config         | 8 ++++----
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf
index dd6b434..3484984 100644
--- a/modules/extract_taxa.nf
+++ b/modules/extract_taxa.nf
@@ -11,6 +11,8 @@ process extract_paired_reads {
 
     label 'process_high'
 
+    errorStrategy {task.exitStatus == 2 ? 
'ignore' : 'terminate'} + publishDir path: "${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy' conda 'bioconda::biopython=1.78 bioconda::tabix=1.11' diff --git a/nextflow.config b/nextflow.config index 2766db6..b6980eb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -143,19 +143,19 @@ process.shell = ['/bin/bash', '-euo', 'pipefail'] def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { enabled = true - file = "${params.tracedir}/execution_timeline_${trace_timestamp}.html" + file = "${params.tracedir}/execution_timeline_${params.unique_id}.html" } report { enabled = true - file = "${params.tracedir}/execution_report_${trace_timestamp}.html" + file = "${params.tracedir}/execution_report_${params.unique_id}.html" } trace { enabled = true - file = "${params.tracedir}/execution_trace_${trace_timestamp}.txt" + file = "${params.tracedir}/execution_trace_${params.unique_id}.txt" } dag { enabled = true - file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.html" + file = "${params.tracedir}/pipeline_dag_${params.unique_id}.html" }
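
For reference, the single-record format produced by bin/concatenate_reads.py in --no-interleave mode (introduced in PATCH 44) can be demonstrated with a short, self-contained sketch. This is an editorial illustration, not part of the patch series: the read contents are invented and concat_pair is a hypothetical helper that mirrors the ix % 4 line-cycling logic of join_w_separator.

#!/usr/bin/env python
# Illustrative sketch only -- not from the repository.
import io

R1 = "@read1/1\nACGTACGT\n+\nIIIIIIII\n"  # invented forward read
R2 = "@read1/2\nTTTTGGGG\n+\nJJJJJJJJ\n"  # invented reverse read


def concat_pair(fastq1, fastq2, sep="|"):
    # Hypothetical helper mirroring join_w_separator's ix % 4 line cycle.
    f1, f2 = io.StringIO(fastq1), io.StringIO(fastq2)
    phred_sep = "|" * len(sep)  # quality placeholder for the separator bases
    out = []
    for ix, (l1, l2) in enumerate(zip(f1, f2)):
        if ix % 4 == 0:                    # header: keep R1's
            out.append(l1)
        elif ix % 4 == 1:                  # sequence: R1 + sep + R2
            out.append(l1.strip() + sep + l2)
        elif ix % 4 == 2:                  # '+' separator line
            out.append("+\n")
        else:                              # quality: padded to match sequence length
            out.append(l1.strip() + phred_sep + l2)
    return "".join(out)


print(concat_pair(R1, R2), end="")
# @read1/1
# ACGTACGT|TTTTGGGG
# +
# IIIIIIII|JJJJJJJJ

Padding the quality line with '|' (a valid Phred character) keeps sequence and quality the same length, so the fused pair remains a well-formed FASTQ record for the downstream kraken step.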
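A note on the exit-code contract established by the last two patches: PATCH 73 re-enables sys.exit(2) in extract_kraken_reads.py when the human-read count exceeds params.max_human_reads_before_rejection, and PATCH 77's errorStrategy maps exactly that status to 'ignore', so a sample failing the human-read sanity check is skipped without terminating the whole run. The sketch below is an editorial illustration of that contract, not project code; the threshold value and function name are invented.

#!/usr/bin/env python
# Illustrative sketch only -- not from the repository.
# Exceeding the human-read threshold exits with status 2, which the
# errorStrategy {task.exitStatus == 2 ? 'ignore' : 'terminate'} added in
# PATCH 77 treats as a soft failure; any other non-zero status terminates.
import sys

MAX_HUMAN = 1000  # assumed stand-in for params.max_human_reads_before_rejection


def check_human_threshold(num_direct, max_human):
    # Mirrors the guard re-enabled in PATCH 73.
    if max_human is not None and num_direct > max_human:
        sys.stderr.write(
            "ERROR: found %i human reads, max allowed is %i\n"
            % (num_direct, max_human)
        )
        sys.exit(2)  # soft fail: ignored by the pipeline


if __name__ == "__main__":
    check_human_threshold(50, MAX_HUMAN)    # under threshold: continues
    check_human_threshold(5000, MAX_HUMAN)  # over threshold: exits with code 2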