From 18c2996ee2a33089025bed05f9c054bb52151979 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Wed, 17 Jan 2024 13:33:39 +0000 Subject: [PATCH] Add viral (#46) * add virus without unclassified * change name viruses to viral and remove empty unclassified output * fix logo * Update some out of date conda defs, minor formatting tweaks * Update mappy container to 3.10 version --------- Co-authored-by: Biowilko --- bin/extract_fraction_from_reads.py | 19 +++-- bin/scylla.mako.html | 2 +- dockerfiles/scylla/environment.yml | 5 +- modules/check_hcid_status.nf | 6 +- modules/extract_all.nf | 130 +++++++++++++++++++++-------- modules/generate_report.nf | 2 +- modules/kraken_client.nf | 22 ++--- modules/kraken_server.nf | 2 + modules/preprocess.nf | 12 +-- modules/qc_checks.nf | 3 +- 10 files changed, 134 insertions(+), 69 deletions(-) diff --git a/bin/extract_fraction_from_reads.py b/bin/extract_fraction_from_reads.py index f18d79e..c933d2a 100755 --- a/bin/extract_fraction_from_reads.py +++ b/bin/extract_fraction_from_reads.py @@ -29,12 +29,12 @@ def median(l): def load_from_taxonomy(taxonomy_dir, taxids, include_unclassified): sys.stderr.write("Loading hierarchy\n") - entries = {"0":{"name": "unclassified", - "taxon": "0", - "rank": None - } - } - + entries = {} + if include_unclassified: + entries["0"] = {"name": "unclassified", + "taxon": "0", + "rank": None + } taxid_map = defaultdict(set) for key in taxids: @@ -159,6 +159,7 @@ def trim_read_id(read_id): def fastq_iterator( prefix: str, filetype: str, + include_unclassified: bool, entries: dict, read_map: dict, fastq_1: Path, @@ -169,6 +170,7 @@ def fastq_iterator( Args: prefix (str): Outfile prefix filetype (str): output filetype (only affects naming) + include_unclassified (bool): true if includes unclassified reads read_map (dict): dict of read_id: taxon_list (from kraken report) subtaxa_map (dict): dict of subtaxa: output taxon (from load_from_taxonomy) exclude (bool): if true, inverts the logic for including reads @@ -192,6 +194,9 @@ def fastq_iterator( print(entries) for taxon, entry in entries.items(): taxon_name = entry["name"].lower() + if include_unclassified and taxon_name != "unclassified": + taxon_name += "_and_unclassified" + taxon_name = taxon_name.replace("viruses", "viral") if fastq_2: out_handles_1[taxon] = open(f"{taxon_name}_1.{filetype}", "w") out_handles_2[taxon] = open(f"{taxon_name}_2.{filetype}", "w") @@ -332,7 +337,7 @@ def extract_reads( ) else: out_counts, quals, lens, names = fastq_iterator( - prefix, filetype, entries, read_map, reads1, reads2 + prefix, filetype, include_unclassified, entries, read_map, reads1, reads2 ) sys.stderr.write("Write summary\n") diff --git a/bin/scylla.mako.html b/bin/scylla.mako.html index 52cba67..c2cf573 100644 --- a/bin/scylla.mako.html +++ b/bin/scylla.mako.html @@ -1172,7 +1172,7 @@

Read statistics

Developed by the ARTIC.network, funded by Wellcome
- +
diff --git a/dockerfiles/scylla/environment.yml b/dockerfiles/scylla/environment.yml index 40801b8..9cec028 100644 --- a/dockerfiles/scylla/environment.yml +++ b/dockerfiles/scylla/environment.yml @@ -3,7 +3,7 @@ channels: - bioconda - conda-forge - defaults - - epi2melabs + - nanoporetech dependencies: - python<=3.10 - pysam @@ -15,5 +15,4 @@ dependencies: - fastp - tabix - mako - - jq - - mappy \ No newline at end of file + - jq \ No newline at end of file diff --git a/modules/check_hcid_status.nf b/modules/check_hcid_status.nf index ab4ad61..f55fb07 100644 --- a/modules/check_hcid_status.nf +++ b/modules/check_hcid_status.nf @@ -2,10 +2,10 @@ process check_hcid { - label 'process_single' + label "process_single" - conda 'bioconda::biopython=1.78 bioconda::mappy=2.26' - container "${params.wf.container}:${params.wf.container_version}" + conda "bioconda::mappy=2.26" + container "biocontainers/mappy:2.26--py310h83093d7_1" publishDir path: "${params.outdir}/${unique_id}/qc/", mode: 'copy' diff --git a/modules/extract_all.nf b/modules/extract_all.nf index 0103192..0f3cbd6 100644 --- a/modules/extract_all.nf +++ b/modules/extract_all.nf @@ -3,19 +3,19 @@ // it is possible that no files would be extracted if there were no subsets of reads which matched the criteria // also note that the reads extracted don't match up with bracken abundance reestimates, although we do use those -// as more accurate numbers when deciding what to pull out (bracken doesn't provide read break down) +// as more accurate numbers when deciding what to pull out (bracken doesn"t provide read break down) // probably want to count up how many have been found here for run log // ALSO this step will currently "fail" with exitcode 2 if the number of human reads found exceeds the number specified // in config so could be good dehuman sanity check process split_kreport { - label 'process_single' + label "process_single" - conda 'bioconda::biopython=1.78' - container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0" + conda "python=3.10" + container "biocontainers/python:3.10" - publishDir path: "${params.outdir}/${unique_id}/classifications", mode: 'copy', pattern: "*.json" + publishDir path: "${params.outdir}/${unique_id}/classifications", mode: "copy", pattern: "*.json" input: tuple val(unique_id), path(kreport) @@ -33,12 +33,12 @@ process split_kreport { process extract_taxa_paired_reads { - label 'process_single' - label 'process_high_memory' + label "process_single" + label "process_high_memory" - errorStrategy {task.exitStatus in 2..3 ? 'ignore' : 'terminate'} + errorStrategy {task.exitStatus in 2..3 ? "ignore" : "terminate"} - conda 'bioconda::biopython=1.78 bioconda::tabix=1.11' + conda "bioconda::pyfastx=2.01" container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0" input: @@ -75,12 +75,12 @@ process extract_taxa_paired_reads { process extract_taxa_reads { - label 'process_single' - label 'process_high_memory' + label "process_single" + label "process_high_memory" - errorStrategy {task.exitStatus in 2..3 ? 'ignore' : 'terminate'} + errorStrategy {task.exitStatus in 2..3 ? "ignore" : "terminate"} - conda 'bioconda::biopython=1.78 bioconda::tabix=1.11' + conda "bioconda::pyfastx=2.01" container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0" input: @@ -116,12 +116,12 @@ process extract_taxa_reads { process extract_paired_virus_and_unclassified { - label 'process_single' - label 'process_high_memory' + label "process_single" + label "process_high_memory" - errorStrategy {task.exitStatus in 2..3 ? 'ignore' : 'terminate'} + errorStrategy {task.exitStatus in 2..3 ? "ignore" : "terminate"} - conda 'bioconda::biopython=1.78 bioconda::tabix=1.11' + conda "bioconda::pyfastx=2.01" container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0" input: @@ -145,12 +145,12 @@ process extract_paired_virus_and_unclassified { process extract_virus_and_unclassified { - label 'process_single' - label 'process_high_memory' + label "process_single" + label "process_high_memory" - errorStrategy {task.exitStatus in 2..3 ? 'ignore' : 'terminate'} + errorStrategy {task.exitStatus in 2..3 ? "ignore" : "terminate"} - conda 'bioconda::biopython=1.78 bioconda::tabix=1.11' + conda "bioconda::pyfastx=2.01" container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0" input: @@ -172,7 +172,7 @@ process extract_virus_and_unclassified { } -process extract_paired_dehumanized { +process extract_paired_virus { label 'process_single' label 'process_high_memory' @@ -195,13 +195,12 @@ process extract_paired_dehumanized { -s2 ${fastq2} \ -k ${kraken_assignments} \ -t ${taxonomy_dir} \ - -p "dehumanized" \ - --exclude \ - --taxid ${params.taxid_human} + -p "virus" \ + --taxid 10239 """ } -process extract_dehumanized { +process extract_virus { label 'process_single' label 'process_high_memory' @@ -211,6 +210,63 @@ process extract_dehumanized { conda 'bioconda::biopython=1.78 bioconda::tabix=1.11' container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0" + input: + tuple val(unique_id), path(fastq), path(kraken_assignments), path(kreport) + path taxonomy_dir + output: + tuple val(unique_id), path("*.fastq"), emit: reads + tuple val(unique_id), path("*_summary.json"), emit: summary + script: + """ + extract_fraction_from_reads.py \ + -s ${fastq} \ + -k ${kraken_assignments} \ + -t ${taxonomy_dir} \ + -p "virus" \ + --taxid 10239 + """ +} + + +process extract_paired_dehumanized { + + label "process_single" + label "process_high_memory" + + errorStrategy {task.exitStatus in 2..3 ? "ignore" : "terminate"} + + conda "bioconda::pyfastx=2.01" + container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0" + + input: + tuple val(unique_id), path(fastq1), path(fastq2), path(kraken_assignments), path(kreport) + path taxonomy_dir + output: + tuple val(unique_id), path("*.fastq"), emit: reads + tuple val(unique_id), path("*_summary.json"), emit: summary + script: + """ + extract_fraction_from_reads.py \ + -s1 ${fastq1} \ + -s2 ${fastq2} \ + -k ${kraken_assignments} \ + -t ${taxonomy_dir} \ + -p "dehumanized" \ + --exclude \ + --taxid ${params.taxid_human} + """ +} + +process extract_dehumanized { + + label "process_single" + label "process_high_memory" + + errorStrategy {task.exitStatus in 2..3 ? "ignore" : "terminate"} + + conda "bioconda::pyfastx=2.01" + container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0" + input: tuple val(unique_id), path(fastq), path(kraken_assignments), path(kreport) path taxonomy_dir @@ -231,11 +287,11 @@ process extract_dehumanized { process bgzip_extracted_taxa { - label 'process_medium' + label "process_medium" - publishDir path: "${params.outdir}/${unique_id}/${prefix}", mode: 'copy' + publishDir path: "${params.outdir}/${unique_id}/${prefix}", mode: "copy" - conda 'bioconda::biopython=1.78 bioconda::tabix=1.11' + conda "bioconda::tabix=1.11" container "${params.wf.container}:${params.wf.container_version}" input: @@ -243,7 +299,7 @@ process bgzip_extracted_taxa { val(prefix) output: tuple val(unique_id), path("*.f*q.gz") - tuple val(unique_id), path("virus*.f*q.gz"), emit: virus, optional:true + tuple val(unique_id), path("viral*_and_unclassified*.f*q.gz"), emit: virus, optional:true script: """ for f in \$(ls *.f*q) @@ -255,9 +311,9 @@ process bgzip_extracted_taxa { process merge_read_summary { - label 'process_single' + label "process_single" - publishDir path: "${params.outdir}/${unique_id}/${prefix}", pattern: "reads_summary_combined.json", mode: 'copy' + publishDir path: "${params.outdir}/${unique_id}/${prefix}", pattern: "reads_summary_combined.json", mode: "copy" container "${params.wf.container}:${params.wf.container_version}" @@ -335,21 +391,23 @@ workflow extract_fractions { if ( params.paired ){ extract_paired_dehumanized(full_extract_ch, taxonomy_dir) extract_paired_virus_and_unclassified(full_extract_ch, taxonomy_dir) + extract_paired_virus(full_extract_ch, taxonomy_dir) extract_paired_dehumanized.out.reads - .concat(extract_paired_virus_and_unclassified.out.reads) + .concat(extract_paired_virus_and_unclassified.out.reads, extract_paired_virus.out.reads) .set {extracted_fractions} extract_paired_dehumanized.out.summary - .concat(extract_paired_virus_and_unclassified.out.summary) + .concat(extract_paired_virus_and_unclassified.out.summary, extract_paired_virus.out.summary) .groupTuple() .set {fractions_summary_ch} } else { extract_dehumanized(full_extract_ch, taxonomy_dir) extract_virus_and_unclassified(full_extract_ch, taxonomy_dir) + extract_virus(full_extract_ch, taxonomy_dir) extract_dehumanized.out.reads - .concat(extract_virus_and_unclassified.out.reads) + .concat(extract_virus_and_unclassified.out.reads, extract_virus.out.reads) .set {extracted_fractions} extract_dehumanized.out.summary - .concat(extract_virus_and_unclassified.out.summary) + .concat(extract_virus_and_unclassified.out.summary, extract_virus.out.summary) .groupTuple() .set {fractions_summary_ch} } diff --git a/modules/generate_report.nf b/modules/generate_report.nf index e564587..4136ebc 100644 --- a/modules/generate_report.nf +++ b/modules/generate_report.nf @@ -6,7 +6,7 @@ process make_report { publishDir path: "${params.outdir}/${unique_id}/", mode: 'copy' - conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3" + conda "anaconda::Mako=1.2.3" container "${params.wf.container}:${params.wf.container_version}" input: diff --git a/modules/kraken_client.nf b/modules/kraken_client.nf index d6e53eb..dd3295d 100644 --- a/modules/kraken_client.nf +++ b/modules/kraken_client.nf @@ -3,21 +3,23 @@ kraken_compute = params.kraken_clients == 1 ? 1 : params.kraken_clients - 1 process kraken2_client { - label 'process_low' - label 'error_retry' + label "process_low" + label "error_retry" maxForks kraken_compute - conda "epi2melabs::kraken2-server=0.1.3" + conda "nanoporetech::kraken2-server=0.1.7" container "${params.wf.container}:${params.wf.container_version}" containerOptions {workflow.profile != "singularity" ? "--network host" : ""} - publishDir path: "${params.outdir}/${unique_id}/classifications", mode: 'copy' + publishDir path: "${params.outdir}/${unique_id}/classifications", mode: "copy" input: tuple val(unique_id), path(fastq) + output: tuple val(unique_id), path("${params.database_set}.kraken_assignments.tsv"), emit: assignments tuple val(unique_id), path("${params.database_set}.kraken_report.txt"), emit: report + script: """ kraken2_client \ @@ -52,9 +54,9 @@ process determine_bracken_length { // this fails if the kraken file input is empty - currently have no check that it is populated process bracken { - label 'process_low' + label "process_low" - publishDir path: "${params.outdir}/${unique_id}/classifications", mode: 'copy' + publishDir path: "${params.outdir}/${unique_id}/classifications", mode: "copy" conda "bioconda::bracken=2.7" container "biocontainers/bracken:2.9--py39h1f90b4d_0" @@ -81,9 +83,9 @@ process bracken_to_json { label "process_low" - publishDir path: "${params.outdir}/${unique_id}/classifications", mode: 'copy' + publishDir path: "${params.outdir}/${unique_id}/classifications", mode: "copy" - conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3" + conda "bioconda::taxonkit=0.15.1 python=3.10" container "${params.wf.container}:${params.wf.container_version}" input: @@ -109,9 +111,9 @@ process kraken_to_json { label "process_low" - publishDir path: "${params.outdir}/${unique_id}/classifications", mode: 'copy' + publishDir path: "${params.outdir}/${unique_id}/classifications", mode: "copy" - conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3" + conda "bioconda::taxonkit=0.15.1 python=3.10" container "${params.wf.container}:${params.wf.container_version}" diff --git a/modules/kraken_server.nf b/modules/kraken_server.nf index 9b6f651..20b3022 100644 --- a/modules/kraken_server.nf +++ b/modules/kraken_server.nf @@ -71,6 +71,7 @@ process kraken_server { label "process_long" memory { 8.GB * task.attempt } cpus params.threads + conda "nanoporetech::kraken2-server=0.1.7" container "${params.wf.container}:${params.wf.container_version}" containerOptions {workflow.profile != "singularity" ? "--network host" : ""} input: @@ -91,6 +92,7 @@ process kraken_server { process stop_kraken_server { label "process_single" + conda "nanoporetech::kraken2-server=0.1.7" container "${params.wf.container}:${params.wf.container_version}" containerOptions {workflow.profile != "singularity" ? "--network host" : ""} // this shouldn't happen, but we'll keep retrying diff --git a/modules/preprocess.nf b/modules/preprocess.nf index c785944..daae63b 100644 --- a/modules/preprocess.nf +++ b/modules/preprocess.nf @@ -1,8 +1,8 @@ process fastp_paired { - label 'process_medium' + label "process_medium" - publishDir "${params.outdir}/${unique_id}/preprocess/", mode: 'copy' + publishDir "${params.outdir}/${unique_id}/preprocess/", mode: "copy" container "${params.wf.container}:${params.wf.container_version}" @@ -39,9 +39,9 @@ process fastp_paired { process fastp_single { - label 'process_medium' + label "process_medium" - publishDir "${params.outdir}/${unique_id}/preprocess/", mode: 'copy' + publishDir "${params.outdir}/${unique_id}/preprocess/", mode: "copy" container "${params.wf.container}:${params.wf.container_version}" @@ -71,9 +71,9 @@ process fastp_single { process paired_concatenate { - label 'process_low' + label "process_low" - publishDir "${params.outdir}/${unique_id}/preprocess/", mode: 'copy' + publishDir "${params.outdir}/${unique_id}/preprocess/", mode: "copy" container "${params.wf.container}:${params.wf.container_version}" diff --git a/modules/qc_checks.nf b/modules/qc_checks.nf index 7834a4c..e1c138a 100644 --- a/modules/qc_checks.nf +++ b/modules/qc_checks.nf @@ -4,7 +4,7 @@ process read_stats { label "process_medium" - conda "epi2melabs::kraken2-server=0.1.3" + conda "nanoporetech::fastcat=0.15.1" container "${params.wf.container}:${params.wf.container_version}" input: @@ -25,7 +25,6 @@ process publish_stats { publishDir "${params.outdir}/${unique_id}/qc", mode: 'copy' - conda "conda-forge::pandas=1.2.4 conda-forge::numpy=1.20.3" container "${params.wf.container}:${params.wf.container_version}" input: