diff --git a/bin/extract_fraction_from_reads.py b/bin/extract_fraction_from_reads.py
index f18d79e..c933d2a 100755
--- a/bin/extract_fraction_from_reads.py
+++ b/bin/extract_fraction_from_reads.py
@@ -29,12 +29,12 @@ def median(l):
def load_from_taxonomy(taxonomy_dir, taxids, include_unclassified):
sys.stderr.write("Loading hierarchy\n")
- entries = {"0":{"name": "unclassified",
- "taxon": "0",
- "rank": None
- }
- }
-
+ entries = {}
+ if include_unclassified:
+ entries["0"] = {"name": "unclassified",
+ "taxon": "0",
+ "rank": None
+ }
taxid_map = defaultdict(set)
for key in taxids:
@@ -159,6 +159,7 @@ def trim_read_id(read_id):
def fastq_iterator(
prefix: str,
filetype: str,
+ include_unclassified: bool,
entries: dict,
read_map: dict,
fastq_1: Path,
@@ -169,6 +170,7 @@ def fastq_iterator(
Args:
prefix (str): Outfile prefix
filetype (str): output filetype (only affects naming)
+ include_unclassified (bool): true if includes unclassified reads
read_map (dict): dict of read_id: taxon_list (from kraken report)
subtaxa_map (dict): dict of subtaxa: output taxon (from load_from_taxonomy)
exclude (bool): if true, inverts the logic for including reads
@@ -192,6 +194,9 @@ def fastq_iterator(
print(entries)
for taxon, entry in entries.items():
taxon_name = entry["name"].lower()
+ if include_unclassified and taxon_name != "unclassified":
+ taxon_name += "_and_unclassified"
+ taxon_name = taxon_name.replace("viruses", "viral")
if fastq_2:
out_handles_1[taxon] = open(f"{taxon_name}_1.{filetype}", "w")
out_handles_2[taxon] = open(f"{taxon_name}_2.{filetype}", "w")
@@ -332,7 +337,7 @@ def extract_reads(
)
else:
out_counts, quals, lens, names = fastq_iterator(
- prefix, filetype, entries, read_map, reads1, reads2
+ prefix, filetype, include_unclassified, entries, read_map, reads1, reads2
)
sys.stderr.write("Write summary\n")
diff --git a/bin/scylla.mako.html b/bin/scylla.mako.html
index 52cba67..c2cf573 100644
--- a/bin/scylla.mako.html
+++ b/bin/scylla.mako.html
@@ -1172,7 +1172,7 @@
Read statistics
Developed by the ARTIC.network, funded by Wellcome
-
+
diff --git a/dockerfiles/scylla/environment.yml b/dockerfiles/scylla/environment.yml
index 40801b8..9cec028 100644
--- a/dockerfiles/scylla/environment.yml
+++ b/dockerfiles/scylla/environment.yml
@@ -3,7 +3,7 @@ channels:
- bioconda
- conda-forge
- defaults
- - epi2melabs
+ - nanoporetech
dependencies:
- python<=3.10
- pysam
@@ -15,5 +15,4 @@ dependencies:
- fastp
- tabix
- mako
- - jq
- - mappy
\ No newline at end of file
+ - jq
\ No newline at end of file
diff --git a/modules/check_hcid_status.nf b/modules/check_hcid_status.nf
index ab4ad61..f55fb07 100644
--- a/modules/check_hcid_status.nf
+++ b/modules/check_hcid_status.nf
@@ -2,10 +2,10 @@
process check_hcid {
- label 'process_single'
+ label "process_single"
- conda 'bioconda::biopython=1.78 bioconda::mappy=2.26'
- container "${params.wf.container}:${params.wf.container_version}"
+ conda "bioconda::mappy=2.26"
+ container "biocontainers/mappy:2.26--py310h83093d7_1"
publishDir path: "${params.outdir}/${unique_id}/qc/", mode: 'copy'
diff --git a/modules/extract_all.nf b/modules/extract_all.nf
index 0103192..0f3cbd6 100644
--- a/modules/extract_all.nf
+++ b/modules/extract_all.nf
@@ -3,19 +3,19 @@
// it is possible that no files would be extracted if there were no subsets of reads which matched the criteria
// also note that the reads extracted don't match up with bracken abundance reestimates, although we do use those
-// as more accurate numbers when deciding what to pull out (bracken doesn't provide read break down)
+// as more accurate numbers when deciding what to pull out (bracken doesn"t provide read break down)
// probably want to count up how many have been found here for run log
// ALSO this step will currently "fail" with exitcode 2 if the number of human reads found exceeds the number specified
// in config so could be good dehuman sanity check
process split_kreport {
- label 'process_single'
+ label "process_single"
- conda 'bioconda::biopython=1.78'
- container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0"
+ conda "python=3.10"
+ container "biocontainers/python:3.10"
- publishDir path: "${params.outdir}/${unique_id}/classifications", mode: 'copy', pattern: "*.json"
+ publishDir path: "${params.outdir}/${unique_id}/classifications", mode: "copy", pattern: "*.json"
input:
tuple val(unique_id), path(kreport)
@@ -33,12 +33,12 @@ process split_kreport {
process extract_taxa_paired_reads {
- label 'process_single'
- label 'process_high_memory'
+ label "process_single"
+ label "process_high_memory"
- errorStrategy {task.exitStatus in 2..3 ? 'ignore' : 'terminate'}
+ errorStrategy {task.exitStatus in 2..3 ? "ignore" : "terminate"}
- conda 'bioconda::biopython=1.78 bioconda::tabix=1.11'
+ conda "bioconda::pyfastx=2.01"
container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0"
input:
@@ -75,12 +75,12 @@ process extract_taxa_paired_reads {
process extract_taxa_reads {
- label 'process_single'
- label 'process_high_memory'
+ label "process_single"
+ label "process_high_memory"
- errorStrategy {task.exitStatus in 2..3 ? 'ignore' : 'terminate'}
+ errorStrategy {task.exitStatus in 2..3 ? "ignore" : "terminate"}
- conda 'bioconda::biopython=1.78 bioconda::tabix=1.11'
+ conda "bioconda::pyfastx=2.01"
container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0"
input:
@@ -116,12 +116,12 @@ process extract_taxa_reads {
process extract_paired_virus_and_unclassified {
- label 'process_single'
- label 'process_high_memory'
+ label "process_single"
+ label "process_high_memory"
- errorStrategy {task.exitStatus in 2..3 ? 'ignore' : 'terminate'}
+ errorStrategy {task.exitStatus in 2..3 ? "ignore" : "terminate"}
- conda 'bioconda::biopython=1.78 bioconda::tabix=1.11'
+ conda "bioconda::pyfastx=2.01"
container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0"
input:
@@ -145,12 +145,12 @@ process extract_paired_virus_and_unclassified {
process extract_virus_and_unclassified {
- label 'process_single'
- label 'process_high_memory'
+ label "process_single"
+ label "process_high_memory"
- errorStrategy {task.exitStatus in 2..3 ? 'ignore' : 'terminate'}
+ errorStrategy {task.exitStatus in 2..3 ? "ignore" : "terminate"}
- conda 'bioconda::biopython=1.78 bioconda::tabix=1.11'
+ conda "bioconda::pyfastx=2.01"
container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0"
input:
@@ -172,7 +172,7 @@ process extract_virus_and_unclassified {
}
-process extract_paired_dehumanized {
+process extract_paired_virus {
label 'process_single'
label 'process_high_memory'
@@ -195,13 +195,12 @@ process extract_paired_dehumanized {
-s2 ${fastq2} \
-k ${kraken_assignments} \
-t ${taxonomy_dir} \
- -p "dehumanized" \
- --exclude \
- --taxid ${params.taxid_human}
+ -p "virus" \
+ --taxid 10239
"""
}
-process extract_dehumanized {
+process extract_virus {
label 'process_single'
label 'process_high_memory'
@@ -211,6 +210,63 @@ process extract_dehumanized {
conda 'bioconda::biopython=1.78 bioconda::tabix=1.11'
container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0"
+ input:
+ tuple val(unique_id), path(fastq), path(kraken_assignments), path(kreport)
+ path taxonomy_dir
+ output:
+ tuple val(unique_id), path("*.fastq"), emit: reads
+ tuple val(unique_id), path("*_summary.json"), emit: summary
+ script:
+ """
+ extract_fraction_from_reads.py \
+ -s ${fastq} \
+ -k ${kraken_assignments} \
+ -t ${taxonomy_dir} \
+ -p "virus" \
+ --taxid 10239
+ """
+}
+
+
+process extract_paired_dehumanized {
+
+ label "process_single"
+ label "process_high_memory"
+
+ errorStrategy {task.exitStatus in 2..3 ? "ignore" : "terminate"}
+
+ conda "bioconda::pyfastx=2.01"
+ container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0"
+
+ input:
+ tuple val(unique_id), path(fastq1), path(fastq2), path(kraken_assignments), path(kreport)
+ path taxonomy_dir
+ output:
+ tuple val(unique_id), path("*.fastq"), emit: reads
+ tuple val(unique_id), path("*_summary.json"), emit: summary
+ script:
+ """
+ extract_fraction_from_reads.py \
+ -s1 ${fastq1} \
+ -s2 ${fastq2} \
+ -k ${kraken_assignments} \
+ -t ${taxonomy_dir} \
+ -p "dehumanized" \
+ --exclude \
+ --taxid ${params.taxid_human}
+ """
+}
+
+process extract_dehumanized {
+
+ label "process_single"
+ label "process_high_memory"
+
+ errorStrategy {task.exitStatus in 2..3 ? "ignore" : "terminate"}
+
+ conda "bioconda::pyfastx=2.01"
+ container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0"
+
input:
tuple val(unique_id), path(fastq), path(kraken_assignments), path(kreport)
path taxonomy_dir
@@ -231,11 +287,11 @@ process extract_dehumanized {
process bgzip_extracted_taxa {
- label 'process_medium'
+ label "process_medium"
- publishDir path: "${params.outdir}/${unique_id}/${prefix}", mode: 'copy'
+ publishDir path: "${params.outdir}/${unique_id}/${prefix}", mode: "copy"
- conda 'bioconda::biopython=1.78 bioconda::tabix=1.11'
+ conda "bioconda::tabix=1.11"
container "${params.wf.container}:${params.wf.container_version}"
input:
@@ -243,7 +299,7 @@ process bgzip_extracted_taxa {
val(prefix)
output:
tuple val(unique_id), path("*.f*q.gz")
- tuple val(unique_id), path("virus*.f*q.gz"), emit: virus, optional:true
+ tuple val(unique_id), path("viral*_and_unclassified*.f*q.gz"), emit: virus, optional:true
script:
"""
for f in \$(ls *.f*q)
@@ -255,9 +311,9 @@ process bgzip_extracted_taxa {
process merge_read_summary {
- label 'process_single'
+ label "process_single"
- publishDir path: "${params.outdir}/${unique_id}/${prefix}", pattern: "reads_summary_combined.json", mode: 'copy'
+ publishDir path: "${params.outdir}/${unique_id}/${prefix}", pattern: "reads_summary_combined.json", mode: "copy"
container "${params.wf.container}:${params.wf.container_version}"
@@ -335,21 +391,23 @@ workflow extract_fractions {
if ( params.paired ){
extract_paired_dehumanized(full_extract_ch, taxonomy_dir)
extract_paired_virus_and_unclassified(full_extract_ch, taxonomy_dir)
+ extract_paired_virus(full_extract_ch, taxonomy_dir)
extract_paired_dehumanized.out.reads
- .concat(extract_paired_virus_and_unclassified.out.reads)
+ .concat(extract_paired_virus_and_unclassified.out.reads, extract_paired_virus.out.reads)
.set {extracted_fractions}
extract_paired_dehumanized.out.summary
- .concat(extract_paired_virus_and_unclassified.out.summary)
+ .concat(extract_paired_virus_and_unclassified.out.summary, extract_paired_virus.out.summary)
.groupTuple()
.set {fractions_summary_ch}
} else {
extract_dehumanized(full_extract_ch, taxonomy_dir)
extract_virus_and_unclassified(full_extract_ch, taxonomy_dir)
+ extract_virus(full_extract_ch, taxonomy_dir)
extract_dehumanized.out.reads
- .concat(extract_virus_and_unclassified.out.reads)
+ .concat(extract_virus_and_unclassified.out.reads, extract_virus.out.reads)
.set {extracted_fractions}
extract_dehumanized.out.summary
- .concat(extract_virus_and_unclassified.out.summary)
+ .concat(extract_virus_and_unclassified.out.summary, extract_virus.out.summary)
.groupTuple()
.set {fractions_summary_ch}
}
diff --git a/modules/generate_report.nf b/modules/generate_report.nf
index e564587..4136ebc 100644
--- a/modules/generate_report.nf
+++ b/modules/generate_report.nf
@@ -6,7 +6,7 @@ process make_report {
publishDir path: "${params.outdir}/${unique_id}/", mode: 'copy'
- conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3"
+ conda "anaconda::Mako=1.2.3"
container "${params.wf.container}:${params.wf.container_version}"
input:
diff --git a/modules/kraken_client.nf b/modules/kraken_client.nf
index d6e53eb..dd3295d 100644
--- a/modules/kraken_client.nf
+++ b/modules/kraken_client.nf
@@ -3,21 +3,23 @@ kraken_compute = params.kraken_clients == 1 ? 1 : params.kraken_clients - 1
process kraken2_client {
- label 'process_low'
- label 'error_retry'
+ label "process_low"
+ label "error_retry"
maxForks kraken_compute
- conda "epi2melabs::kraken2-server=0.1.3"
+ conda "nanoporetech::kraken2-server=0.1.7"
container "${params.wf.container}:${params.wf.container_version}"
containerOptions {workflow.profile != "singularity" ? "--network host" : ""}
- publishDir path: "${params.outdir}/${unique_id}/classifications", mode: 'copy'
+ publishDir path: "${params.outdir}/${unique_id}/classifications", mode: "copy"
input:
tuple val(unique_id), path(fastq)
+
output:
tuple val(unique_id), path("${params.database_set}.kraken_assignments.tsv"), emit: assignments
tuple val(unique_id), path("${params.database_set}.kraken_report.txt"), emit: report
+
script:
"""
kraken2_client \
@@ -52,9 +54,9 @@ process determine_bracken_length {
// this fails if the kraken file input is empty - currently have no check that it is populated
process bracken {
- label 'process_low'
+ label "process_low"
- publishDir path: "${params.outdir}/${unique_id}/classifications", mode: 'copy'
+ publishDir path: "${params.outdir}/${unique_id}/classifications", mode: "copy"
conda "bioconda::bracken=2.7"
container "biocontainers/bracken:2.9--py39h1f90b4d_0"
@@ -81,9 +83,9 @@ process bracken_to_json {
label "process_low"
- publishDir path: "${params.outdir}/${unique_id}/classifications", mode: 'copy'
+ publishDir path: "${params.outdir}/${unique_id}/classifications", mode: "copy"
- conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3"
+ conda "bioconda::taxonkit=0.15.1 python=3.10"
container "${params.wf.container}:${params.wf.container_version}"
input:
@@ -109,9 +111,9 @@ process kraken_to_json {
label "process_low"
- publishDir path: "${params.outdir}/${unique_id}/classifications", mode: 'copy'
+ publishDir path: "${params.outdir}/${unique_id}/classifications", mode: "copy"
- conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3"
+ conda "bioconda::taxonkit=0.15.1 python=3.10"
container "${params.wf.container}:${params.wf.container_version}"
diff --git a/modules/kraken_server.nf b/modules/kraken_server.nf
index 9b6f651..20b3022 100644
--- a/modules/kraken_server.nf
+++ b/modules/kraken_server.nf
@@ -71,6 +71,7 @@ process kraken_server {
label "process_long"
memory { 8.GB * task.attempt }
cpus params.threads
+ conda "nanoporetech::kraken2-server=0.1.7"
container "${params.wf.container}:${params.wf.container_version}"
containerOptions {workflow.profile != "singularity" ? "--network host" : ""}
input:
@@ -91,6 +92,7 @@ process kraken_server {
process stop_kraken_server {
label "process_single"
+ conda "nanoporetech::kraken2-server=0.1.7"
container "${params.wf.container}:${params.wf.container_version}"
containerOptions {workflow.profile != "singularity" ? "--network host" : ""}
// this shouldn't happen, but we'll keep retrying
diff --git a/modules/preprocess.nf b/modules/preprocess.nf
index c785944..daae63b 100644
--- a/modules/preprocess.nf
+++ b/modules/preprocess.nf
@@ -1,8 +1,8 @@
process fastp_paired {
- label 'process_medium'
+ label "process_medium"
- publishDir "${params.outdir}/${unique_id}/preprocess/", mode: 'copy'
+ publishDir "${params.outdir}/${unique_id}/preprocess/", mode: "copy"
container "${params.wf.container}:${params.wf.container_version}"
@@ -39,9 +39,9 @@ process fastp_paired {
process fastp_single {
- label 'process_medium'
+ label "process_medium"
- publishDir "${params.outdir}/${unique_id}/preprocess/", mode: 'copy'
+ publishDir "${params.outdir}/${unique_id}/preprocess/", mode: "copy"
container "${params.wf.container}:${params.wf.container_version}"
@@ -71,9 +71,9 @@ process fastp_single {
process paired_concatenate {
- label 'process_low'
+ label "process_low"
- publishDir "${params.outdir}/${unique_id}/preprocess/", mode: 'copy'
+ publishDir "${params.outdir}/${unique_id}/preprocess/", mode: "copy"
container "${params.wf.container}:${params.wf.container_version}"
diff --git a/modules/qc_checks.nf b/modules/qc_checks.nf
index 7834a4c..e1c138a 100644
--- a/modules/qc_checks.nf
+++ b/modules/qc_checks.nf
@@ -4,7 +4,7 @@ process read_stats {
label "process_medium"
- conda "epi2melabs::kraken2-server=0.1.3"
+ conda "nanoporetech::fastcat=0.15.1"
container "${params.wf.container}:${params.wf.container_version}"
input:
@@ -25,7 +25,6 @@ process publish_stats {
publishDir "${params.outdir}/${unique_id}/qc", mode: 'copy'
- conda "conda-forge::pandas=1.2.4 conda-forge::numpy=1.20.3"
container "${params.wf.container}:${params.wf.container_version}"
input: