From fb407b4d698fbf2f7dc9b3f3600c0bcc89fbeed3 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 11:23:42 +0100 Subject: [PATCH 01/77] cloudy refactor --- bin/combine_kreports.py | 87 ++++-- conf/base.config | 66 +++++ dockerfiles/general_container/Dockerfile | 9 + dockerfiles/general_container/environment.yml | 12 + modules/assemble_taxa.nf | 66 +++++ modules/extract_taxa.nf | 9 + modules/fastp.nf | 70 +++++ modules/generate_report.nf | 11 +- modules/kraken_client.nf | 34 ++- modules/qc_checks.nf | 15 +- nextflow.config | 251 ++++++++++++------ subworkflows/ingest.nf | 28 +- 12 files changed, 538 insertions(+), 120 deletions(-) create mode 100644 conf/base.config create mode 100644 dockerfiles/general_container/Dockerfile create mode 100644 dockerfiles/general_container/environment.yml create mode 100644 modules/fastp.nf diff --git a/bin/combine_kreports.py b/bin/combine_kreports.py index 541e6be..b7c76cf 100755 --- a/bin/combine_kreports.py +++ b/bin/combine_kreports.py @@ -1,28 +1,36 @@ #!/usr/bin/env python import sys -import gzip from Bio import SeqIO import argparse from datetime import datetime + def parse_depth(name): parse_name = name.split(" ") depth = 0 for i in parse_name: - if i!="": + if i != "": break depth += 1 - depth = int(depth/2) + depth = int(depth / 2) return depth + def get_kraken_hierarchy(kraken_file, entries={}, total=0): with open(kraken_file, "r") as f: hierarchy = [] for line in f: if line.startswith("% of Seqs"): continue - percentage, num_clade_root, num_direct, raw_rank, ncbi, name = line.strip().split("\t") + ( + percentage, + num_clade_root, + num_direct, + raw_rank, + ncbi, + name, + ) = line.strip().split("\t") percentage = float(percentage) num_clade_root = int(num_clade_root) num_direct = int(num_direct) @@ -37,8 +45,18 @@ def get_kraken_hierarchy(kraken_file, entries={}, total=0): entries[ncbi]["count"] += num_direct entries[ncbi]["count_descendants"] += num_clade_root else: - entries[ncbi] = {"percentage": percentage, "count": num_direct, "count_descendants": num_clade_root, - "raw_rank": raw_rank, "rank": rank, "depth": depth, "ncbi": ncbi, "name": name, "parents":[], "children":set()} + entries[ncbi] = { + "percentage": percentage, + "count": num_direct, + "count_descendants": num_clade_root, + "raw_rank": raw_rank, + "rank": rank, + "depth": depth, + "ncbi": ncbi, + "name": name, + "parents": [], + "children": set(), + } if len(hierarchy) > 1: parent = hierarchy[-2] @@ -46,7 +64,8 @@ def get_kraken_hierarchy(kraken_file, entries={}, total=0): entries[ncbi]["parents"] = hierarchy[:-1] entries[parent]["children"].add(ncbi) - return entries,total + return entries, total + def combine_kraken_reports(kreports): entries = {} @@ -55,12 +74,27 @@ def combine_kraken_reports(kreports): entries, total = get_kraken_hierarchy(kreport, entries, total) print(kreport, len(entries), total) for ncbi in entries: - entries[ncbi]["percentage"] = entries[ncbi]["count_descendants"] / float(total) *100 + entries[ncbi]["percentage"] = ( + entries[ncbi]["count_descendants"] / float(total) * 100 + ) return entries + def write_entry(out_handle, entry): offset = entry["depth"] - out_handle.write("%f\t%i\t%i\t%s\t%s\t%s%s\n" %(entry["percentage"], entry["count"], entry["count_descendants"], entry["raw_rank"], entry["ncbi"], 2*offset*" ", entry["name"] )) + out_handle.write( + "%f\t%i\t%i\t%s\t%s\t%s%s\n" + % ( + entry["percentage"], + entry["count"], + entry["count_descendants"], + entry["raw_rank"], + entry["ncbi"], + 2 * offset * " ", + entry["name"], + ) + ) + 
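write_entry() above emits the same tab-separated layout the parser consumes: percentage, counts, rank, taxid, then the scientific name indented two spaces per taxonomy level, which is exactly what parse_depth() decodes. One thing worth noting: it writes entry["count"] before entry["count_descendants"], while get_kraken_hierarchy() and the header written in write_kraken_report() treat column two as the clade total, so the two orderings disagree if a combined report is ever fed back in. Within the pipeline this script is invoked by the combine_kraken_outputs process in modules/kraken_client.nf, shown later in this patch; a trimmed sketch of that call site (directives and the single-report shortcut omitted):

    process combine_kraken_outputs {
        input:
        val unique_id
        path kraken_reports

        output:
        path "${params.database_set}.kraken_report.txt"

        script:
        """
        combine_kreports.py \\
            -r ${kraken_reports} \\
            -o "${params.database_set}.kraken_report.txt"
        """
    }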
def choose_next(entries, taxa, ncbi="0"): if len(taxa) == 0: @@ -86,38 +120,49 @@ def choose_next(entries, taxa, ncbi="0"): return next, taxa + def write_kraken_report(outfile, entries): taxa = list(entries.keys()) with open(outfile, "w") as out_report: - out_report.write("% of Seqs\tClades\tTaxonomies\tRank\tTaxonomy ID\tScientific Name\n") + out_report.write( + "% of Seqs\tClades\tTaxonomies\tRank\tTaxonomy ID\tScientific Name\n" + ) current = "0" while len(taxa) > 0: write_entry(out_report, entries[current]) current, taxa = choose_next(entries, taxa, ncbi=current) + def main(): - #Parse arguments + # Parse arguments parser = argparse.ArgumentParser() - parser.add_argument('-r', dest='kraken_report_files', required=True, nargs='+', - help='Kraken report files to parse') - parser.add_argument('-o', "--outfile",dest='outfile', required=True, - help='Output name') - - args=parser.parse_args() - - #Start Program + parser.add_argument( + "-r", + dest="kraken_report_files", + required=True, + nargs="+", + help="Kraken report files to parse", + ) + parser.add_argument( + "-o", "--outfile", dest="outfile", required=True, help="Output name" + ) + + args = parser.parse_args() + + # Start Program now = datetime.now() time = now.strftime("%m/%d/%Y, %H:%M:%S") - sys.stdout.write("PROGRAM START TIME: " + time + '\n') + sys.stdout.write("PROGRAM START TIME: " + time + "\n") entries = combine_kraken_reports(args.kraken_report_files) write_kraken_report(args.outfile, entries) now = datetime.now() time = now.strftime("%m/%d/%Y, %H:%M:%S") - sys.stdout.write("PROGRAM END TIME: " + time + '\n') + sys.stdout.write("PROGRAM END TIME: " + time + "\n") sys.exit(0) + if __name__ == "__main__": main() diff --git a/conf/base.config b/conf/base.config new file mode 100644 index 0000000..da4b559 --- /dev/null +++ b/conf/base.config @@ -0,0 +1,66 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + nf-core/tmp Nextflow base config file +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + A 'blank slate' config file, appropriate for general use on most high performance + compute environments. Assumes that all software is installed and available on + the PATH. Runs in `local` mode - all jobs will be run on the logged in environment. +---------------------------------------------------------------------------------------- +*/ + +process { + + // TODO nf-core: Check the defaults for all processes + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } + maxRetries = 1 + maxErrors = '-1' + + // Process-specific resource requirements + // NOTE - Please try and re-use the labels below as much as possible. + // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. + // If possible, it would be nice to keep the same label naming convention when + // adding in your local modules too. + // TODO nf-core: Customise requirements for specific processes. 
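These labels only bind to processes that opt in with a label directive; since each closure multiplies by task.attempt and failed tasks come back under errorStrategy 'retry', a task asks for double the resources on its second attempt, clamped by the check_max() helper defined at the end of nextflow.config in this same patch. A minimal sketch of a module opting in to the selectors spelled out just below (the process and tool names here are placeholders, not modules from this repo):

    process example_heavy_step {
        // matched by withLabel:process_high below:
        // 12 CPUs, 72.GB and 16.h on attempt 1, doubled on retry
        label 'process_high'

        input:
        path reads

        output:
        path "contigs.fa"

        script:
        """
        some_assembler --threads ${task.cpus} ${reads} > contigs.fa
        """
    }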
+ // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors + withLabel:process_single { + cpus = { check_max( 1 , 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + } + withLabel:process_low { + cpus = { check_max( 2 * task.attempt, 'cpus' ) } + memory = { check_max( 12.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + } + withLabel:process_medium { + cpus = { check_max( 6 * task.attempt, 'cpus' ) } + memory = { check_max( 36.GB * task.attempt, 'memory' ) } + time = { check_max( 8.h * task.attempt, 'time' ) } + } + withLabel:process_high { + cpus = { check_max( 12 * task.attempt, 'cpus' ) } + memory = { check_max( 72.GB * task.attempt, 'memory' ) } + time = { check_max( 16.h * task.attempt, 'time' ) } + } + withLabel:process_long { + time = { check_max( 20.h * task.attempt, 'time' ) } + } + withLabel:process_high_memory { + memory = { check_max( 200.GB * task.attempt, 'memory' ) } + } + withLabel:error_ignore { + errorStrategy = 'ignore' + } + withLabel:error_retry { + errorStrategy = 'retry' + maxRetries = 2 + + } + withName:CUSTOM_DUMPSOFTWAREVERSIONS { + cache = false + } +} diff --git a/dockerfiles/general_container/Dockerfile b/dockerfiles/general_container/Dockerfile new file mode 100644 index 0000000..bcec54f --- /dev/null +++ b/dockerfiles/general_container/Dockerfile @@ -0,0 +1,9 @@ +FROM condaforge/mambaforge:latest AS conda + +COPY environment.yml . + +RUN /opt/conda/bin/mamba env create -f /environment.yml + +ENV PATH=/opt/conda/envs/scylla_general/bin:$PATH + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/dockerfiles/general_container/environment.yml b/dockerfiles/general_container/environment.yml new file mode 100644 index 0000000..97c010f --- /dev/null +++ b/dockerfiles/general_container/environment.yml @@ -0,0 +1,12 @@ +name: scylla_general +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - python<=3.10 + - pip + - biopython + - Mako + - pandas + - numpy \ No newline at end of file diff --git a/modules/assemble_taxa.nf b/modules/assemble_taxa.nf index da7134a..bbbc92b 100644 --- a/modules/assemble_taxa.nf +++ b/modules/assemble_taxa.nf @@ -8,15 +8,26 @@ // ALSO this step will currently "fail" with exitcode 2 if the number of human reads found exceeds the number specified // in config so could be good dehuman sanity check process extract_reads { + tag "$meta.id" + label 'process_medium' + publishDir path: "${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy' + + conda 'bioconda::biopython=1.78' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+        'https://depot.galaxyproject.org/singularity/biopython%3A1.76' :
+        'biocontainers/biopython@sha256:b0204cf662a3d858f6c28627124b83ed6f564e2b156b8788092f2dd9256c9290' }"
+
     input:
     val unique_id
     path fastq
     path kraken_assignments
     path kraken_report
     path bracken_report
+
     output:
     path "reads.*.f*.gz"
+
     script:
     """
     $projectDir/../bin/extract_kraken_reads.py \
@@ -42,7 +53,62 @@ process extract_reads {
 // either be as a result of the pipeline failing for one of many reasons, or because the file generated at the end
 // of a successful pipeline run is empty, in which case it would be meaningless to copy
 // I've not had a successful test case with the very small numbers of reads/database on my laptop
+
+process flye_assemble {
+    tag "$meta.id"
+    label 'process_high'
+
+    conda "bioconda::flye=2.9"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/flye:2.9--py39h6935b12_1' :
+        'biocontainers/flye:2.9--py39h6935b12_1' }"
+
+    input:
+    val unique_id
+    path fastq
+    output:
+    path "assembly.*.f*.gz"
+
+    script:
+    outfile = "${fastq.name}".replaceAll("reads","assembly")
+
+    """
+    flye \
+        --nano-raw ${fastq} \
+        -o "assembly"
+    if [ -s assembly/assembly.fasta ]
+    then
+        mv assembly/assembly.fasta ${outfile}
+        gzip ${outfile}
+    fi
+    """
+
+}
+
+process megahit_assemble {
+    tag "$meta.id"
+    label 'process_high'
+
+    conda "bioconda::megahit=1.2.9"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-0f92c152b180c7cd39d9b0e6822f8c89ccb59c99:8ec213d21e5d03f9db54898a2baeaf8ec729b447-0' :
+        'biocontainers/mulled-v2-0f92c152b180c7cd39d9b0e6822f8c89ccb59c99:8ec213d21e5d03f9db54898a2baeaf8ec729b447-0' }"
+
+    input:
+    val unique_id
+    path fastq
+    output:
+    path "assembly.*.f*.gz"
+
+    script:
+    outfile = "${fastq.name}".replaceAll("reads","assembly")
+
+    """
+    megahit \
+        -r ${fastq} \
+        -o "assembly"
+    if [ -s assembly/final.contigs.fa ]
+    then
+        mv assembly/final.contigs.fa ${outfile}
+        gzip ${outfile}
+    fi
+    """
+}
+
 process denovo_assemble {
+    errorStrategy 'ignore'
     publishDir path: "${params.out_dir}/${unique_id}/assemblies_by_taxa/", mode: 'copy'
     input:
diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf
index b563700..7794fda 100644
--- a/modules/extract_taxa.nf
+++ b/modules/extract_taxa.nf
@@ -8,7 +8,16 @@
 // ALSO this step will currently "fail" with exitcode 2 if the number of human reads found exceeds the number specified
 // in config so could be good dehuman sanity check
 process extract_reads {
+    tag "$meta.id"
+    label 'process_medium'
+
     publishDir path: "${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy'
+
+    conda 'bioconda::biopython=1.78'
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/biopython%3A1.76' :
+        'biocontainers/biopython@sha256:b0204cf662a3d858f6c28627124b83ed6f564e2b156b8788092f2dd9256c9290' }"
+
     input:
     val unique_id
     path fastq
diff --git a/modules/fastp.nf b/modules/fastp.nf
new file mode 100644
index 0000000..8ddd44d
--- /dev/null
+++ b/modules/fastp.nf
@@ -0,0 +1,70 @@
+process fastp_paired {
+    tag "$meta.id"
+    label 'process_medium'
+
+    publishDir "${params.out_dir}/preprocess/", mode: 'copy'
+
+    conda "bioconda::fastp=0.23.4"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/fastp:0.23.4--h5f740d0_0' :
+        'biocontainers/fastp:0.23.4--h5f740d0_0' }"
+
+    input:
+    val(prefix)
+    path(fastq_1)
+    path(fastq_2)
+
+    output:
+    val(prefix)
+    path("${prefix}_1.fastp.fastq.gz")
+    path("${prefix}_2.fastp.fastq.gz")
+    path("${prefix}.fastp.fastq.gz"), emit: processed_fastq
+    path("${prefix}.fastp.json")
+
+    script:
+    """
+    fastp \\
+        --in1 ${fastq_1} \\
+        --in2 ${fastq_2} \\
+        --merge \\
+        --out1 ${prefix}_1.fastp.fastq.gz \\
+        --out2 ${prefix}_2.fastp.fastq.gz \\
+        --merged_out ${prefix}.fastp.fastq.gz \\
+        --json ${prefix}.fastp.json \\
+        --thread $task.cpus \\
+        --detect_adapter_for_pe \\
+        2> ${prefix}.fastp.log
+    """
+
+}
+
+process fastp_single {
+    tag "$meta.id"
+    label 'process_medium'
+
+    publishDir "${params.out_dir}/preprocess/", mode: 'copy'
+
+    conda "bioconda::fastp=0.23.4"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/fastp:0.23.4--h5f740d0_0' :
+        'biocontainers/fastp:0.23.4--h5f740d0_0' }"
+
+    input:
+    val(prefix)
+    path(fastq)
+
+    output:
+    val(prefix)
+    path("${prefix}.fastp.fastq.gz"), emit: processed_fastq
+    path("${prefix}.fastp.json")
+
+    script:
+
+    """
+    fastp \\
+        --in1 ${fastq} \\
+        --out1 ${prefix}.fastp.fastq.gz \\
+        --json ${prefix}.fastp.json \\
+        --thread $task.cpus \\
+        2> ${prefix}.fastp.log
+    """
+}
\ No newline at end of file
diff --git a/modules/generate_report.nf b/modules/generate_report.nf
index 4a4e171..ffdf0d5 100644
--- a/modules/generate_report.nf
+++ b/modules/generate_report.nf
@@ -1,8 +1,13 @@
 // module to make a single sample report during ingest containing kraken summary and qc stats
 process make_report {
+    tag "$meta.id"
+    label "process_low"
+
     publishDir path: "${params.out_dir}/${unique_id}", mode: 'copy'
-    maxForks 1
-    cpus 1
+
+    conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3"
+    container "biowilko/scylla_general:0.0.1"
+
     input:
     val unique_id
     path stats
@@ -29,7 +34,7 @@ workflow generate_report {
     bracken_jsons
     main:
     // Acquire report template
-    template = file("$projectDir/../bin/scylla.mako.html")
+    template = file("$baseDir/bin/scylla.mako.html")
 
     make_report(unique_id, stats, bracken_jsons, template)
     emit:
diff --git a/modules/kraken_client.nf b/modules/kraken_client.nf
index 7302e04..756f290 100644
--- a/modules/kraken_client.nf
+++ b/modules/kraken_client.nf
@@ -1,18 +1,14 @@
 include { qc_checks } from '../modules/qc_checks'
 include { generate_report } from '../modules/generate_report'
-
-kraken_compute = params.threads == 1 ? 1 : params.threads - 1
-
 process kraken2_client {
-    errorStrategy { sleep(Math.pow(2, task.attempt) * 200 as long); return 'retry' }
-    maxRetries 5
-    label "scylla"
+    tag "$meta.id"
+    label 'process_low'
+    label 'error_retry'
+
+    conda "epi2melabs::kraken2-server=0.1.3"
+    container 'ontresearch/wf-metagenomics:shac290da60032a3a6c9c01808d58a71a0f17957681'
 
-    containerOptions {workflow.profile != "singularity" ? "--network host" : ""}
-    // retry if server responds out of resource
-    errorStrategy = { task.exitStatus in [8] ?
'retry' : 'finish' } - maxForks kraken_compute input: path fastq output: @@ -28,6 +24,8 @@ process kraken2_client { } process combine_kraken_outputs { + + publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' input: val unique_id @@ -72,7 +70,16 @@ process determine_bracken_length { // this fails if the kraken file input is empty - currently have no check that it is populated process bracken { + tag "$meta.id" + label 'process_low' + publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' + + conda "bioconda::bracken=2.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bracken:2.7--py39hc16433a_0': + 'biocontainers/bracken:2.7--py39hc16433a_0' }" + input: val unique_id path kraken_report @@ -93,7 +100,14 @@ process bracken { } process bracken_to_json { + tag "$meta.id" + label "process_low" + publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' + + conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3" + container "biowilko/scylla_general:0.0.1" + input: val unique_id path kraken_report diff --git a/modules/qc_checks.nf b/modules/qc_checks.nf index aa139ca..a6777a4 100644 --- a/modules/qc_checks.nf +++ b/modules/qc_checks.nf @@ -1,6 +1,12 @@ // module for other ingest qc checks process read_stats { + tag "$meta.id" + label "process_medium" + + conda "epi2melabs::kraken2-server=0.1.3" + container 'ontresearch/wf-metagenomics:shac290da60032a3a6c9c01808d58a71a0f17957681' + input: path fastq output: @@ -14,7 +20,14 @@ process read_stats { } process combine_stats { + tag "$meta.id" + label "process_low" + publishDir "${params.out_dir}/${unique_id}/qc", mode: 'copy' + + conda "conda-forge::pandas=1.2.4 conda-forge::numpy=1.20.3" + container "biowilko/scylla_general:0.0.1" + input: val unique_id path stats @@ -24,7 +37,7 @@ process combine_stats { script: """ cp ${stats} "raw_reads.stats" - $projectDir/../bin/fastcat_histogram.py \ + fastcat_histogram.py \ --sample_id "raw_reads" \ "raw_reads.stats" "raw_reads.json" """ diff --git a/nextflow.config b/nextflow.config index e09769a..393b000 100644 --- a/nextflow.config +++ b/nextflow.config @@ -11,6 +11,31 @@ params { + // TODO nf-core: Specify your pipeline's command line flags + // Input options + input = null + + // Boilerplate options + outdir = null + tracedir = "${params.outdir}/pipeline_info" + publish_dir_mode = 'copy' + email = null + email_on_fail = null + plaintext_email = false + monochrome_logs = false + hook_url = null + help = false + version = false + validate_params = true + show_hidden_params = false + schema_ignore_params = 'genomes' + + // Max resource options + // Defaults only, expecting to be overwritten + max_memory = '128.GB' + max_cpus = 16 + max_time = '240.h' + help = false version = false wfversion = "v0.0.1" @@ -29,29 +54,6 @@ params { bracken_length = null bracken_level = 'S' database_set = "Viral" - database_sets = [ - 'Viral': [ - 'database': 'https://genome-idx.s3.amazonaws.com/kraken/k2_viral_20221209.tar.gz', - 'taxonomy': 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz' - ], - 'EuPath': [ - 'database': 'https://genome-idx.s3.amazonaws.com/kraken/k2_eupathdb48_20201113.tar.gz', - 'taxonomy': 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz' - ], - 'PlusPF-8': [ - 'database': 'https://genome-idx.s3.amazonaws.com/kraken/k2_pluspf_8gb_20210517.tar.gz', - 'taxonomy': 
'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz' - ], - 'ncbi_16s_18s': [ - 'reference': 'https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-metagenomics/ncbi_16s_18s/ncbi_targeted_loci_16s_18s.fna', - 'refindex': 'https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-metagenomics/ncbi_16s_18s/ncbi_targeted_loci_16s_18s.fna.fai', - 'database': 'https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-metagenomics/ncbi_16s_18s/ncbi_targeted_loci_kraken2.tar.gz', - 'kmer_dist': 'https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-metagenomics/ncbi_16s_18s/database1000mers.kmer_distrib', - 'ref2taxid': 'https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-metagenomics/ncbi_16s_18s/ref2taxid.targloci.tsv', - 'taxonomy': 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz' - ] - ] - kraken_report = null bracken_report = null @@ -62,16 +64,6 @@ params { max_human_reads_before_rejection = 1000 read_type = "illumina" - - - - - - - - - - disable_ping = false threads = 2 k2_port = 8080 @@ -95,6 +87,8 @@ params { } } +// Load base.config by default for all pipelines +includeConfig 'conf/base.config' manifest { name = 'snowy-leopard/scylla' @@ -106,57 +100,164 @@ manifest { version = 'v0.0.1' } - -executor { - $local { - cpus = 8 - memory = "8 GB" +profiles { + debug { + dumpHashes = true + process.beforeScript = 'echo $HOSTNAME' + cleanup = false } -} - -// used by default for "standard" (docker) and singularity profiles, -// other profiles may override. -process { - withLabel:scylla { - container = "scylla:latest" + conda { + conda.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false } - shell = ['/bin/bash', '-euo', 'pipefail'] -} - - -profiles { - // the "standard" profile is used implicitely by nextflow - // if no other profile is given on the CLI - standard { - docker { - enabled = true - // this ensures container is run as host user and group, but - // also adds host user to the within-container group - runOptions = "--user \$(id -u):\$(id -g) --group-add 100" - } + mamba { + conda.enabled = true + conda.useMamba = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false + } + docker { + docker.enabled = true + docker.registry = 'quay.io' + docker.userEmulation = true + conda.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false + } + arm { + docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' } - - // using singularity instead of docker singularity { - singularity { - enabled = true - autoMounts = true - } + singularity.enabled = true + singularity.autoMounts = true + conda.enabled = false + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false + } + podman { + podman.enabled = true + podman.registry = 'quay.io' + conda.enabled = false + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false + } + shifter { + shifter.enabled = true + conda.enabled = false + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + apptainer.enabled = false + } + charliecloud { + 
charliecloud.enabled = true + conda.enabled = false + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + apptainer.enabled = false } + apptainer { + apptainer.enabled = true + conda.enabled = false + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + gitpod { + executor.name = 'local' + executor.cpus = 16 + executor.memory = 60.GB + } + test { includeConfig 'conf/test.config' } + test_full { includeConfig 'conf/test_full.config' } +} - conda { - conda.enabled = true - } +// Export these variables to prevent local Python/R libraries from conflicting with those in the container +// The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container. +// See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable. - // local profile for simplified development testing - local { - process.executor = 'local' - } +env { + PYTHONNOUSERSITE = 1 + R_PROFILE_USER = "/.Rprofile" + R_ENVIRON_USER = "/.Renviron" + JULIA_DEPOT_PATH = "/usr/local/share/julia" } +// Capture exit codes from upstream processes when piping +process.shell = ['/bin/bash', '-euo', 'pipefail'] +def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') +timeline { + enabled = true + file = "${params.tracedir}/execution_timeline_${trace_timestamp}.html" +} report { - enabled = true - file = "${params.out_dir}/execution/report.html" -} \ No newline at end of file + enabled = true + file = "${params.tracedir}/execution_report_${trace_timestamp}.html" +} +trace { + enabled = true + file = "${params.tracedir}/execution_trace_${trace_timestamp}.txt" +} +dag { + enabled = true + file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.html" +} + + +// Function to ensure that resource requirements don't go beyond +// a maximum limit +def check_max(obj, type) { + if (type == 'memory') { + try { + if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) + return params.max_memory as nextflow.util.MemoryUnit + else + return obj + } catch (all) { + println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj" + return obj + } + } else if (type == 'time') { + try { + if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1) + return params.max_time as nextflow.util.Duration + else + return obj + } catch (all) { + println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" + return obj + } + } else if (type == 'cpus') { + try { + return Math.min( obj, params.max_cpus as int ) + } catch (all) { + println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! 
Using default value: $obj" + return obj + } + } +} diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf index eed53a1..5e0d885 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -1,7 +1,8 @@ // workflow to run kraken, check for human, run qc checks and generate html report for a single sample fastq -include { get_params_and_versions } from '../modules/get_params_and_versions' -include { kraken_pipeline } from '../subworkflows/kraken_pipeline' -include { extract_taxa } from '../modules/extract_taxa' +include { get_params_and_versions } from '$baseDir/modules/get_params_and_versions' +include { kraken_pipeline } from '$baseDir/subworkflows/kraken_pipeline' +include { extract_taxa } from '$baseDir/modules/extract_taxa' +include { fastp_single, fastp_paired} from '$baseDir/modules/fastp' workflow ingest { take: @@ -19,13 +20,20 @@ workflow ingest { } workflow { - // check input fastq exists - input_fastq = file("${params.fastq}", type: "file", checkIfExists:true) - - unique_id = "${params.unique_id}" - if (unique_id == "null") { - unique_id = "${input_fastq.simpleName}" + // check input fastq exists and run fastp + + if ($params.paired) { + input_fastq_1 = file("${params.fastq1}", type: "file", checkIfExists:true) + input_fastq_2 = file("${params.fastq2}", type: "file", checkIfExists:true) + fastp_paired("${params.unique_id}", input_fastq_1, input_fastq_2) + fastp_paired.out.processed_fastq + .set {processed_fastq} + } else { + input_fastq = file("${params.fastq}", type: "file", checkIfExists:true) + fastp_single("${params.unique_id}", input_fastq) + fastp_single.out.processed_fastq + .set {processed_fastq} } - ingest(unique_id, input_fastq) + ingest(unique_id, processed_fastq) } \ No newline at end of file From d21eebd6eba519dada01ed91dff02305aff170ba Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 11:38:11 +0100 Subject: [PATCH 02/77] better db handling --- subworkflows/kraken_pipeline.nf | 48 +++++++++++++++++---------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/subworkflows/kraken_pipeline.nf b/subworkflows/kraken_pipeline.nf index e176c40..4ac86ee 100644 --- a/subworkflows/kraken_pipeline.nf +++ b/subworkflows/kraken_pipeline.nf @@ -9,32 +9,34 @@ workflow kraken_pipeline { main: // Check source param is valid - sources = params.database_sets - source_name = params.database_set - source_data = sources.get(source_name, false) - if (!sources.containsKey(source_name) || !source_data) { - keys = sources.keySet() - throw new Exception("Source $params.source is invalid, must be one of $keys") - } + // sources = params.database_sets + // source_name = params.database_set + // source_data = sources.get(source_name, false) + // if (!sources.containsKey(source_name) || !source_data) { + // keys = sources.keySet() + // throw new Exception("Source $params.source is invalid, must be one of $keys") + // } // Grab taxonomy files - taxonomy = file(sources[source_name]["taxonomy"], type: "file") - if (params.taxonomy) { - log.info("Checking custom taxonomy mapping exists") - taxonomy = file(params.taxonomy, type: "dir", checkIfExists:true) - } + // taxonomy = file(sources[source_name]["taxonomy"], type: "file") + // if (params.taxonomy) { + // log.info("Checking custom taxonomy mapping exists") + + // } + + taxonomy = file("https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz", type: "dir", checkIfExists:true) - if (database) { - source_database = database - } else { - source_database = source_data.get("database", false) - if 
(!source_database) { - throw new Exception( - "Error: Source $source_name does not include a database for " - + "use with kraken, please choose another source, " - + "provide a custom database or disable kraken.") - } - database = file(source_database, type: "file") + // if (database) { + // source_database = database + // } else { + // source_database = source_data.get("database", false) + // if (!source_database) { + // throw new Exception( + // "Error: Source $source_name does not include a database for " + // + "use with kraken, please choose another source, " + // + "provide a custom database or disable kraken.") + // } + database = file($params.db, type: "dir", checkIfExists:true) // start_server(database, taxonomy) run_kraken_and_bracken(unique_id, fastq, database, taxonomy) From ad91dd3234cd065838a6f801c801ba38a015b5c8 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 11:45:00 +0100 Subject: [PATCH 03/77] it needs a main.nf.... --- main.nf | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 main.nf diff --git a/main.nf b/main.nf new file mode 100644 index 0000000..5e0d885 --- /dev/null +++ b/main.nf @@ -0,0 +1,39 @@ +// workflow to run kraken, check for human, run qc checks and generate html report for a single sample fastq +include { get_params_and_versions } from '$baseDir/modules/get_params_and_versions' +include { kraken_pipeline } from '$baseDir/subworkflows/kraken_pipeline' +include { extract_taxa } from '$baseDir/modules/extract_taxa' +include { fastp_single, fastp_paired} from '$baseDir/modules/fastp' + +workflow ingest { + take: + unique_id + fastq + //metadata + main: + get_params_and_versions() + //clean_metadata(metadata) + kraken_pipeline(unique_id, fastq) + extract_taxa(unique_id, fastq, kraken_pipeline.out.kraken_assignments, kraken_pipeline.out.kraken_report, kraken_pipeline.out.bracken_report) + emit: + html_report = kraken_pipeline.out.report + +} + +workflow { + // check input fastq exists and run fastp + + if ($params.paired) { + input_fastq_1 = file("${params.fastq1}", type: "file", checkIfExists:true) + input_fastq_2 = file("${params.fastq2}", type: "file", checkIfExists:true) + fastp_paired("${params.unique_id}", input_fastq_1, input_fastq_2) + fastp_paired.out.processed_fastq + .set {processed_fastq} + } else { + input_fastq = file("${params.fastq}", type: "file", checkIfExists:true) + fastp_single("${params.unique_id}", input_fastq) + fastp_single.out.processed_fastq + .set {processed_fastq} + } + + ingest(unique_id, processed_fastq) +} \ No newline at end of file From ad9ed8677bbed3af957815ff2ae4634784023948 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 11:48:10 +0100 Subject: [PATCH 04/77] properly define mainscript --- main.nf | 39 --------------------------------------- nextflow.config | 2 +- 2 files changed, 1 insertion(+), 40 deletions(-) delete mode 100644 main.nf diff --git a/main.nf b/main.nf deleted file mode 100644 index 5e0d885..0000000 --- a/main.nf +++ /dev/null @@ -1,39 +0,0 @@ -// workflow to run kraken, check for human, run qc checks and generate html report for a single sample fastq -include { get_params_and_versions } from '$baseDir/modules/get_params_and_versions' -include { kraken_pipeline } from '$baseDir/subworkflows/kraken_pipeline' -include { extract_taxa } from '$baseDir/modules/extract_taxa' -include { fastp_single, fastp_paired} from '$baseDir/modules/fastp' - -workflow ingest { - take: - unique_id - fastq - //metadata - main: - get_params_and_versions() - 
//clean_metadata(metadata) - kraken_pipeline(unique_id, fastq) - extract_taxa(unique_id, fastq, kraken_pipeline.out.kraken_assignments, kraken_pipeline.out.kraken_report, kraken_pipeline.out.bracken_report) - emit: - html_report = kraken_pipeline.out.report - -} - -workflow { - // check input fastq exists and run fastp - - if ($params.paired) { - input_fastq_1 = file("${params.fastq1}", type: "file", checkIfExists:true) - input_fastq_2 = file("${params.fastq2}", type: "file", checkIfExists:true) - fastp_paired("${params.unique_id}", input_fastq_1, input_fastq_2) - fastp_paired.out.processed_fastq - .set {processed_fastq} - } else { - input_fastq = file("${params.fastq}", type: "file", checkIfExists:true) - fastp_single("${params.unique_id}", input_fastq) - fastp_single.out.processed_fastq - .set {processed_fastq} - } - - ingest(unique_id, processed_fastq) -} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 393b000..9646fcf 100644 --- a/nextflow.config +++ b/nextflow.config @@ -95,7 +95,7 @@ manifest { author = 'Snowy Leopard' homePage = 'https://github.com/snowy-leopard/scylla' description = 'Classify metagenomic sequence data from human respiratory infections.' - mainScript = 'ingest.nf' + mainScript = 'subworkflows/ingest.nf' nextflowVersion = '>=20.10.0' version = 'v0.0.1' } From d5ae2f3e22d5b8f3c33984be44097038eb1ea76a Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 11:51:53 +0100 Subject: [PATCH 05/77] why use commas like everybody else? --- subworkflows/ingest.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf index 5e0d885..9f23394 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -2,7 +2,7 @@ include { get_params_and_versions } from '$baseDir/modules/get_params_and_versions' include { kraken_pipeline } from '$baseDir/subworkflows/kraken_pipeline' include { extract_taxa } from '$baseDir/modules/extract_taxa' -include { fastp_single, fastp_paired} from '$baseDir/modules/fastp' +include { fastp_single; fastp_paired} from '$baseDir/modules/fastp' workflow ingest { take: From 59fc786709364edf71ab9a6acd31091e8ed36d75 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 11:54:29 +0100 Subject: [PATCH 06/77] fix includes again --- subworkflows/ingest.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf index 9f23394..0daf513 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -1,8 +1,8 @@ // workflow to run kraken, check for human, run qc checks and generate html report for a single sample fastq -include { get_params_and_versions } from '$baseDir/modules/get_params_and_versions' -include { kraken_pipeline } from '$baseDir/subworkflows/kraken_pipeline' -include { extract_taxa } from '$baseDir/modules/extract_taxa' -include { fastp_single; fastp_paired} from '$baseDir/modules/fastp' +include { get_params_and_versions } from '../modules/get_params_and_versions' +include { kraken_pipeline } from '../subworkflows/kraken_pipeline' +include { extract_taxa } from '../modules/extract_taxa' +include { fastp_single; fastp_paired} from '../modules/fastp' workflow ingest { take: From 504ca03fd85f4fc9ce07faeaf8b95d5f880b1018 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 11:56:04 +0100 Subject: [PATCH 07/77] not $params --- subworkflows/ingest.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf 
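The hunk just below is the whole fix: in Nextflow the dollar sign only marks interpolation inside double-quoted strings, so plain code has to reference the params map directly, and patches 08 and 20 chase the same pattern elsewhere in the tree. Roughly:

    // inside a double-quoted string, interpolation is correct:
    log.info "processing sample ${params.unique_id}"

    // in plain Groovy code there is no dollar sign:
    if (params.paired) {
        fastq_1 = file(params.fastq1, type: "file", checkIfExists: true)
    }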
index 0daf513..9517f5b 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -22,7 +22,7 @@ workflow ingest { workflow { // check input fastq exists and run fastp - if ($params.paired) { + if (params.paired) { input_fastq_1 = file("${params.fastq1}", type: "file", checkIfExists:true) input_fastq_2 = file("${params.fastq2}", type: "file", checkIfExists:true) fastp_paired("${params.unique_id}", input_fastq_1, input_fastq_2) From e98551d307163eb325fbba179d9b66f77ecb0ba1 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 11:59:59 +0100 Subject: [PATCH 08/77] params not strings? --- subworkflows/ingest.nf | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf index 9517f5b..98b9096 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -23,14 +23,14 @@ workflow { // check input fastq exists and run fastp if (params.paired) { - input_fastq_1 = file("${params.fastq1}", type: "file", checkIfExists:true) - input_fastq_2 = file("${params.fastq2}", type: "file", checkIfExists:true) - fastp_paired("${params.unique_id}", input_fastq_1, input_fastq_2) + input_fastq_1 = file(params.fastq1, type: "file", checkIfExists:true) + input_fastq_2 = file(params.fastq2, type: "file", checkIfExists:true) + fastp_paired(params.unique_id, input_fastq_1, input_fastq_2) fastp_paired.out.processed_fastq .set {processed_fastq} } else { - input_fastq = file("${params.fastq}", type: "file", checkIfExists:true) - fastp_single("${params.unique_id}", input_fastq) + input_fastq = file(params.fastq, type: "file", checkIfExists:true) + fastp_single(params.unique_id, input_fastq) fastp_single.out.processed_fastq .set {processed_fastq} } From 2fd603b0094b691246792d60f705f23589296b05 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 12:14:06 +0100 Subject: [PATCH 09/77] remove $meta.id --- modules/assemble_taxa.nf | 6 +++--- modules/extract_taxa.nf | 2 +- modules/fastp.nf | 4 ++-- modules/generate_report.nf | 2 +- modules/get_params_and_versions.nf | 3 +++ modules/kraken_client.nf | 6 +++--- modules/qc_checks.nf | 4 ++-- subworkflows/ingest.nf | 2 +- subworkflows/kraken_pipeline.nf | 2 +- 9 files changed, 17 insertions(+), 14 deletions(-) diff --git a/modules/assemble_taxa.nf b/modules/assemble_taxa.nf index bbbc92b..01d741f 100644 --- a/modules/assemble_taxa.nf +++ b/modules/assemble_taxa.nf @@ -8,7 +8,7 @@ // ALSO this step will currently "fail" with exitcode 2 if the number of human reads found exceeds the number specified // in config so could be good dehuman sanity check process extract_reads { - tag "$meta.id" + label 'process_medium' publishDir path: "${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy' @@ -55,7 +55,7 @@ process extract_reads { // I've not had a successful test case with the very small numbers of reads/database on my laptop process flye_assemble { - tag "$meta.id" + label 'process_high' conda "bioconda::flye=2.9" @@ -86,7 +86,7 @@ process flye_assemble { } process megahit_assemble { - tag "$meta.id" + label 'process_high' conda "bioconda::megahit=1.2.9" diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf index 7794fda..66a3cbc 100644 --- a/modules/extract_taxa.nf +++ b/modules/extract_taxa.nf @@ -8,7 +8,7 @@ // ALSO this step will currently "fail" with exitcode 2 if the number of human reads found exceeds the number specified // in config so could be good dehuman sanity check process extract_reads { - tag "$meta.id" + label 'process_medium' publishDir path: 
"${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy' diff --git a/modules/fastp.nf b/modules/fastp.nf index 8ddd44d..bc8c8f5 100644 --- a/modules/fastp.nf +++ b/modules/fastp.nf @@ -1,5 +1,5 @@ process fastp_paired { - tag "$meta.id" + label 'process_medium' publishDir "${params.out_dir}/preprocess/", mode: 'copy' @@ -38,7 +38,7 @@ process fastp_paired { } process fastp_single { - tag "$meta.id" + label 'process_medium' publishDir "${params.out_dir}/preprocess/", mode: 'copy' diff --git a/modules/generate_report.nf b/modules/generate_report.nf index ffdf0d5..0f8afff 100644 --- a/modules/generate_report.nf +++ b/modules/generate_report.nf @@ -1,6 +1,6 @@ // module to make a single sample report during ingest containing kraken summary and qc stats process make_report { - tag "$meta.id" + label "process_low" publishDir path: "${params.out_dir}/${unique_id}", mode: 'copy' diff --git a/modules/get_params_and_versions.nf b/modules/get_params_and_versions.nf index 629c54e..cea7a8b 100644 --- a/modules/get_params_and_versions.nf +++ b/modules/get_params_and_versions.nf @@ -3,6 +3,9 @@ import groovy.json.JsonBuilder process get_versions { + + + label "scylla" publishDir "${params.out_dir}/execution", mode: 'copy' cpus 1 diff --git a/modules/kraken_client.nf b/modules/kraken_client.nf index 756f290..9850087 100644 --- a/modules/kraken_client.nf +++ b/modules/kraken_client.nf @@ -2,7 +2,7 @@ include { qc_checks } from '../modules/qc_checks' include { generate_report } from '../modules/generate_report' process kraken2_client { - tag "$meta.id" + label 'process_low' label 'error_retry' @@ -70,7 +70,7 @@ process determine_bracken_length { // this fails if the kraken file input is empty - currently have no check that it is populated process bracken { - tag "$meta.id" + label 'process_low' publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' @@ -100,7 +100,7 @@ process bracken { } process bracken_to_json { - tag "$meta.id" + label "process_low" publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' diff --git a/modules/qc_checks.nf b/modules/qc_checks.nf index a6777a4..ef32b98 100644 --- a/modules/qc_checks.nf +++ b/modules/qc_checks.nf @@ -1,7 +1,7 @@ // module for other ingest qc checks process read_stats { - tag "$meta.id" + label "process_medium" conda "epi2melabs::kraken2-server=0.1.3" @@ -20,7 +20,7 @@ process read_stats { } process combine_stats { - tag "$meta.id" + label "process_low" publishDir "${params.out_dir}/${unique_id}/qc", mode: 'copy' diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf index 98b9096..7a9c194 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -35,5 +35,5 @@ workflow { .set {processed_fastq} } - ingest(unique_id, processed_fastq) + ingest(params.unique_id, processed_fastq) } \ No newline at end of file diff --git a/subworkflows/kraken_pipeline.nf b/subworkflows/kraken_pipeline.nf index 4ac86ee..ee80101 100644 --- a/subworkflows/kraken_pipeline.nf +++ b/subworkflows/kraken_pipeline.nf @@ -36,7 +36,7 @@ workflow kraken_pipeline { // + "use with kraken, please choose another source, " // + "provide a custom database or disable kraken.") // } - database = file($params.db, type: "dir", checkIfExists:true) + database = file(params.db, type: "dir", checkIfExists:true) // start_server(database, taxonomy) run_kraken_and_bracken(unique_id, fastq, database, taxonomy) From 414cb12f3d76dd384b40c97095638cebca24d6a6 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 16:28:25 +0100 Subject: 
[PATCH 10/77] Make main.nf --- dockerfiles/general_container/environment.yml | 12 ----- .../{general_container => scylla}/Dockerfile | 4 +- dockerfiles/scylla/environment.yml | 16 +++++++ main.nf | 27 ++++++++++++ modules/kraken_client.nf | 15 +++++-- modules/qc_checks.nf | 4 +- nextflow.config | 2 +- subworkflows/ingest.nf | 44 +++++++++++-------- 8 files changed, 86 insertions(+), 38 deletions(-) delete mode 100644 dockerfiles/general_container/environment.yml rename dockerfiles/{general_container => scylla}/Dockerfile (65%) create mode 100644 dockerfiles/scylla/environment.yml create mode 100644 main.nf diff --git a/dockerfiles/general_container/environment.yml b/dockerfiles/general_container/environment.yml deleted file mode 100644 index 97c010f..0000000 --- a/dockerfiles/general_container/environment.yml +++ /dev/null @@ -1,12 +0,0 @@ -name: scylla_general -channels: - - bioconda - - conda-forge - - defaults -dependencies: - - python<=3.10 - - pip - - biopython - - Mako - - pandas - - numpy \ No newline at end of file diff --git a/dockerfiles/general_container/Dockerfile b/dockerfiles/scylla/Dockerfile similarity index 65% rename from dockerfiles/general_container/Dockerfile rename to dockerfiles/scylla/Dockerfile index bcec54f..20980ca 100644 --- a/dockerfiles/general_container/Dockerfile +++ b/dockerfiles/scylla/Dockerfile @@ -4,6 +4,6 @@ COPY environment.yml . RUN /opt/conda/bin/mamba env create -f /environment.yml -ENV PATH=/opt/conda/envs/scylla_general/bin:$PATH +ENV PATH=/opt/conda/envs/scylla/bin:$PATH -CMD ["/bin/bash"] \ No newline at end of file +CMD ["/bin/bash"] diff --git a/dockerfiles/scylla/environment.yml b/dockerfiles/scylla/environment.yml new file mode 100644 index 0000000..667eba1 --- /dev/null +++ b/dockerfiles/scylla/environment.yml @@ -0,0 +1,16 @@ +name: scylla +channels: + - bioconda + - conda-forge + - defaults + - epi2melabs +dependencies: + - python<=3.10 + - pysam + - pip + - kraken2-server # not on mac + - bracken # needs to be manually installed from git and softlinked if Mac M1 + - fastcat + - biopython + - pandas + - numpy \ No newline at end of file diff --git a/main.nf b/main.nf new file mode 100644 index 0000000..0c936b6 --- /dev/null +++ b/main.nf @@ -0,0 +1,27 @@ +include { ingest } from './submodules/ingest' +include { fastp_single; fastp_paired} from '../modules/fastp' + +workflow { + // check input fastq exists and run fastp + + if (params.paired) { + Channel.of(file(params.fastq1, type: "file", checkIfExists:true)) + .set {input_fastq_1_ch} + + Channel.of(file(params.fastq2, type: "file", checkIfExists:true)) + .set {input_fastq_2_ch} + + fastp_paired(params.unique_id, input_fastq_1_ch, input_fastq_2_ch) + fastp_paired.out.processed_fastq + .set {processed_fastq} + } else { + Channel.of(file(params.fastq, type: "file", checkIfExists:true)) + .set {input_fastq_ch} + + fastp_single(params.unique_id, input_fastq_ch) + fastp_single.out.processed_fastq + .set {processed_fastq} + } + + ingest(params.unique_id, processed_fastq) +} \ No newline at end of file diff --git a/modules/kraken_client.nf b/modules/kraken_client.nf index 9850087..8b25fda 100644 --- a/modules/kraken_client.nf +++ b/modules/kraken_client.nf @@ -7,7 +7,7 @@ process kraken2_client { label 'error_retry' conda "epi2melabs::kraken2-server=0.1.3" - container 'ontresearch/wf-metagenomics:shac290da60032a3a6c9c01808d58a71a0f17957681' + container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" input: path fastq @@ -25,8 +25,12 @@ process 
kraken2_client { process combine_kraken_outputs { + label 'process_single' + + container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' + input: val unique_id path kraken_reports @@ -52,7 +56,11 @@ process combine_kraken_outputs { } process determine_bracken_length { - label "scylla" + label "process_low" + + conda "anaconda::sed=4.8" + container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" + input: path database output: @@ -106,7 +114,8 @@ process bracken_to_json { publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3" - container "biowilko/scylla_general:0.0.1" + container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" + input: val unique_id diff --git a/modules/qc_checks.nf b/modules/qc_checks.nf index ef32b98..c825041 100644 --- a/modules/qc_checks.nf +++ b/modules/qc_checks.nf @@ -5,7 +5,7 @@ process read_stats { label "process_medium" conda "epi2melabs::kraken2-server=0.1.3" - container 'ontresearch/wf-metagenomics:shac290da60032a3a6c9c01808d58a71a0f17957681' + container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" input: path fastq @@ -26,7 +26,7 @@ process combine_stats { publishDir "${params.out_dir}/${unique_id}/qc", mode: 'copy' conda "conda-forge::pandas=1.2.4 conda-forge::numpy=1.20.3" - container "biowilko/scylla_general:0.0.1" + container "biowilko/scylla@sha256:362e51bf7f4fe4222aa674b7c75f514fe90d875263016b033ff69258f2eeae43" input: val unique_id diff --git a/nextflow.config b/nextflow.config index 9646fcf..0ec4253 100644 --- a/nextflow.config +++ b/nextflow.config @@ -95,7 +95,7 @@ manifest { author = 'Snowy Leopard' homePage = 'https://github.com/snowy-leopard/scylla' description = 'Classify metagenomic sequence data from human respiratory infections.' 
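The mainScript switch just below is what makes the main.nf created earlier in this patch the real entry point: when the pipeline is launched by name (nextflow run snowy-leopard/scylla), Nextflow resolves the script to run from the manifest, falling back to main.nf in the project root when the field is omitted:

    manifest {
        // resolved relative to the project root on a remote launch;
        // 'main.nf' is also the default when this field is absent
        mainScript = 'main.nf'
    }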
- mainScript = 'subworkflows/ingest.nf' + mainScript = 'main.nf' nextflowVersion = '>=20.10.0' version = 'v0.0.1' } diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf index 7a9c194..1e21a35 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -2,7 +2,6 @@ include { get_params_and_versions } from '../modules/get_params_and_versions' include { kraken_pipeline } from '../subworkflows/kraken_pipeline' include { extract_taxa } from '../modules/extract_taxa' -include { fastp_single; fastp_paired} from '../modules/fastp' workflow ingest { take: @@ -10,7 +9,7 @@ workflow ingest { fastq //metadata main: - get_params_and_versions() + // get_params_and_versions() //clean_metadata(metadata) kraken_pipeline(unique_id, fastq) extract_taxa(unique_id, fastq, kraken_pipeline.out.kraken_assignments, kraken_pipeline.out.kraken_report, kraken_pipeline.out.bracken_report) @@ -19,21 +18,30 @@ workflow ingest { } -workflow { - // check input fastq exists and run fastp +// workflow { +// // check input fastq exists and run fastp - if (params.paired) { - input_fastq_1 = file(params.fastq1, type: "file", checkIfExists:true) - input_fastq_2 = file(params.fastq2, type: "file", checkIfExists:true) - fastp_paired(params.unique_id, input_fastq_1, input_fastq_2) - fastp_paired.out.processed_fastq - .set {processed_fastq} - } else { - input_fastq = file(params.fastq, type: "file", checkIfExists:true) - fastp_single(params.unique_id, input_fastq) - fastp_single.out.processed_fastq - .set {processed_fastq} - } +// if (params.paired) { +// Channel.of(file(file(params.fastq1, type: "file", checkIfExists:true))) +// .set {input_fastq_1_ch} - ingest(params.unique_id, processed_fastq) -} \ No newline at end of file +// Channel.of(file(file(params.fastq2, type: "file", checkIfExists:true))) +// .set {input_fastq_2_ch} + +// // input_fastq_1 = file(params.fastq1, type: "file", checkIfExists:true) +// // input_fastq_2 = file(params.fastq2, type: "file", checkIfExists:true) +// fastp_paired(params.unique_id, input_fastq_1_ch, input_fastq_2_ch) +// fastp_paired.out.processed_fastq +// .set {processed_fastq} +// } else { +// Channel.of(file(file(params.fastq, type: "file", checkIfExists:true))) +// .set {input_fastq_ch} + +// // input_fastq = file(params.fastq, type: "file", checkIfExists:true) +// fastp_single(params.unique_id, input_fastq_ch) +// fastp_single.out.processed_fastq +// .set {processed_fastq} +// } + +// ingest(params.unique_id, processed_fastq) +// } \ No newline at end of file From 40a4d1cc7051776e107d56abdb7191dbff9336d5 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 16:30:31 +0100 Subject: [PATCH 11/77] quote localhost --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 0ec4253..7bdf98f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -67,7 +67,7 @@ params { disable_ping = false threads = 2 k2_port = 8080 - k2_host = localhost + k2_host = 'localhost' port = 8080 process_label = "scylla" monochrome_logs = false From 2b5df2530deed0cfce5b42de7f3f049f2bbe9ddc Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 16:31:51 +0100 Subject: [PATCH 12/77] properly include ingest --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 0c936b6..7daaee3 100644 --- a/main.nf +++ b/main.nf @@ -1,4 +1,4 @@ -include { ingest } from './submodules/ingest' +include { ingest } from './subworkflows/ingest' include { fastp_single; fastp_paired} from 
'../modules/fastp' workflow { From 0385711753662ecd0bdaf95029b9d986beda8a91 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 16:32:26 +0100 Subject: [PATCH 13/77] properly include fastp --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 7daaee3..0d67642 100644 --- a/main.nf +++ b/main.nf @@ -1,5 +1,5 @@ include { ingest } from './subworkflows/ingest' -include { fastp_single; fastp_paired} from '../modules/fastp' +include { fastp_single; fastp_paired} from './modules/fastp' workflow { // check input fastq exists and run fastp From 1f137a7a687ca0ec56249fbfc2abb4b6aa3847cb Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 16:40:07 +0100 Subject: [PATCH 14/77] Fix combine stats container --- modules/generate_report.nf | 2 +- modules/qc_checks.nf | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/generate_report.nf b/modules/generate_report.nf index 0f8afff..e4baf1e 100644 --- a/modules/generate_report.nf +++ b/modules/generate_report.nf @@ -6,7 +6,7 @@ process make_report { publishDir path: "${params.out_dir}/${unique_id}", mode: 'copy' conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3" - container "biowilko/scylla_general:0.0.1" + container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" input: val unique_id diff --git a/modules/qc_checks.nf b/modules/qc_checks.nf index c825041..57735e9 100644 --- a/modules/qc_checks.nf +++ b/modules/qc_checks.nf @@ -26,7 +26,8 @@ process combine_stats { publishDir "${params.out_dir}/${unique_id}/qc", mode: 'copy' conda "conda-forge::pandas=1.2.4 conda-forge::numpy=1.20.3" - container "biowilko/scylla@sha256:362e51bf7f4fe4222aa674b7c75f514fe90d875263016b033ff69258f2eeae43" + container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" + input: val unique_id From 8fa35dc745d52420c26e41272238cc716eac8bff Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 16:44:03 +0100 Subject: [PATCH 15/77] no need to use paths to bin --- modules/assemble_taxa.nf | 2 +- modules/extract_taxa.nf | 2 +- modules/generate_report.nf | 2 +- modules/kraken_client.nf | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/assemble_taxa.nf b/modules/assemble_taxa.nf index 01d741f..a2811ab 100644 --- a/modules/assemble_taxa.nf +++ b/modules/assemble_taxa.nf @@ -30,7 +30,7 @@ process extract_reads { script: """ - $projectDir/../bin/extract_kraken_reads.py \ + extract_kraken_reads.py \ -s ${fastq} \ -k ${kraken_assignments} \ -r ${kraken_report} \ diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf index 66a3cbc..996be81 100644 --- a/modules/extract_taxa.nf +++ b/modules/extract_taxa.nf @@ -29,7 +29,7 @@ process extract_reads { path "reads_summary.json", emit: summary script: """ - $projectDir/../bin/extract_kraken_reads.py \ + extract_kraken_reads.py \ -s ${fastq} \ -k ${kraken_assignments} \ -r ${kraken_report} \ diff --git a/modules/generate_report.nf b/modules/generate_report.nf index e4baf1e..bc8dadb 100644 --- a/modules/generate_report.nf +++ b/modules/generate_report.nf @@ -18,7 +18,7 @@ process make_report { script: report_name = "${unique_id}" """ - $projectDir/../bin/make_report.py \ + make_report.py \ --prefix "${report_name}" \ --read_counts ${stats} \ --assignments ${lineages} \ diff --git a/modules/kraken_client.nf b/modules/kraken_client.nf index 8b25fda..3ba91aa 100644 --- a/modules/kraken_client.nf +++ b/modules/kraken_client.nf @@ -46,7 
+46,7 @@ process combine_kraken_outputs { """ } else { """ - $projectDir/../bin/combine_kreports.py \\ + combine_kreports.py \\ -r ${kraken_reports} \\ -o "${params.database_set}.kraken_report.txt" @@ -128,7 +128,7 @@ process bracken_to_json { cat "${bracken_summary}" | cut -f2,6 | tail -n+2 > taxacounts.txt cat "${bracken_summary}" | cut -f2 | tail -n+2 > taxa.txt taxonkit lineage --data-dir ${taxonomy_dir} -R taxa.txt > lineages.txt - $projectDir/../bin/aggregate_lineages_bracken.py \\ + aggregate_lineages_bracken.py \\ -i "lineages.txt" -b "taxacounts.txt" \\ -u "${kraken_report}" \\ -p "temp_bracken" From b1349ac62f26b53e94db57ce0504cd1c642fba56 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 16:45:13 +0100 Subject: [PATCH 16/77] init paired to prevent error msg --- conf/base.config | 6 +----- nextflow.config | 1 + 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/conf/base.config b/conf/base.config index da4b559..2bfc384 100644 --- a/conf/base.config +++ b/conf/base.config @@ -58,9 +58,5 @@ process { withLabel:error_retry { errorStrategy = 'retry' maxRetries = 2 - - } - withName:CUSTOM_DUMPSOFTWAREVERSIONS { - cache = false - } + } diff --git a/nextflow.config b/nextflow.config index 7bdf98f..088fb81 100644 --- a/nextflow.config +++ b/nextflow.config @@ -71,6 +71,7 @@ params { port = 8080 process_label = "scylla" monochrome_logs = false + paired = false validate_params = true show_hidden_params = false From ca5785baf68a079251029007915ff3527a244b70 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 16:46:04 +0100 Subject: [PATCH 17/77] fix base.config formatting --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index 2bfc384..ca1ea86 100644 --- a/conf/base.config +++ b/conf/base.config @@ -58,5 +58,5 @@ process { withLabel:error_retry { errorStrategy = 'retry' maxRetries = 2 - + } } From 6579a0fe00200e6ddea91113b636c66c247de4f2 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 16:56:11 +0100 Subject: [PATCH 18/77] parameterise container checksum --- dockerfiles/scylla/environment.yml | 3 ++- modules/generate_report.nf | 2 +- modules/kraken_client.nf | 8 ++++---- modules/qc_checks.nf | 4 ++-- nextflow.config | 2 +- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/dockerfiles/scylla/environment.yml b/dockerfiles/scylla/environment.yml index 667eba1..b3ac398 100644 --- a/dockerfiles/scylla/environment.yml +++ b/dockerfiles/scylla/environment.yml @@ -13,4 +13,5 @@ dependencies: - fastcat - biopython - pandas - - numpy \ No newline at end of file + - numpy + - taxonkit \ No newline at end of file diff --git a/modules/generate_report.nf b/modules/generate_report.nf index bc8dadb..928c146 100644 --- a/modules/generate_report.nf +++ b/modules/generate_report.nf @@ -6,7 +6,7 @@ process make_report { publishDir path: "${params.out_dir}/${unique_id}", mode: 'copy' conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3" - container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" + container "biowilko/scylla@${params.wf.container_sha}" input: val unique_id diff --git a/modules/kraken_client.nf b/modules/kraken_client.nf index 3ba91aa..dd89ea8 100644 --- a/modules/kraken_client.nf +++ b/modules/kraken_client.nf @@ -7,7 +7,7 @@ process kraken2_client { label 'error_retry' conda "epi2melabs::kraken2-server=0.1.3" - container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" + container 
"biowilko/scylla@${params.wf.container_sha}" input: path fastq @@ -27,7 +27,7 @@ process combine_kraken_outputs { label 'process_single' - container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" + container "biowilko/scylla@${params.wf.container_sha}" publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' @@ -59,7 +59,7 @@ process determine_bracken_length { label "process_low" conda "anaconda::sed=4.8" - container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" + container "biowilko/scylla@${params.wf.container_sha}" input: path database @@ -114,7 +114,7 @@ process bracken_to_json { publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3" - container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" + container "biowilko/scylla@${params.wf.container_sha}" input: diff --git a/modules/qc_checks.nf b/modules/qc_checks.nf index 57735e9..45b23e8 100644 --- a/modules/qc_checks.nf +++ b/modules/qc_checks.nf @@ -5,7 +5,7 @@ process read_stats { label "process_medium" conda "epi2melabs::kraken2-server=0.1.3" - container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" + container "biowilko/scylla@${params.wf.container_sha}" input: path fastq @@ -26,7 +26,7 @@ process combine_stats { publishDir "${params.out_dir}/${unique_id}/qc", mode: 'copy' conda "conda-forge::pandas=1.2.4 conda-forge::numpy=1.20.3" - container "biowilko/scylla@sha256:35fa2117d7dfe8b595dc25a422036888044ea5ae66b65fa26b82cc4ff457a7d9" + container "biowilko/scylla@${params.wf.container_sha}" input: diff --git a/nextflow.config b/nextflow.config index 088fb81..0832834 100644 --- a/nextflow.config +++ b/nextflow.config @@ -84,7 +84,7 @@ params { "--fastq test_data/barcode01/reads.fastq.gz", ] agent = null - container_sha = "sha156f44a25dd22e39eb10e82d12e0584d41dde909" + container_sha = "sha256:aeaf567b0e05bea87081a8d0f86d92a06664707abd0e61d84146fad26597cb1f" } } From e20bca44856b9f0a19ffa88d611f14ce35ca8a2a Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 17:03:31 +0100 Subject: [PATCH 19/77] use existing taxonomy dir --- nextflow.config | 1 + subworkflows/kraken_pipeline.nf | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 0832834..2629838 100644 --- a/nextflow.config +++ b/nextflow.config @@ -72,6 +72,7 @@ params { process_label = "scylla" monochrome_logs = false paired = false + taxonomy_dir = "/shared/public/db/taxonomy/" validate_params = true show_hidden_params = false diff --git a/subworkflows/kraken_pipeline.nf b/subworkflows/kraken_pipeline.nf index ee80101..d4b92ed 100644 --- a/subworkflows/kraken_pipeline.nf +++ b/subworkflows/kraken_pipeline.nf @@ -24,7 +24,7 @@ workflow kraken_pipeline { // } - taxonomy = file("https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz", type: "dir", checkIfExists:true) + taxonomy = file($params.taxonomy_dir, type: "dir", checkIfExists:true) // if (database) { // source_database = database From e5e12383ccc988783eb1353ccfa257e7e051d5e6 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 17:04:26 +0100 Subject: [PATCH 20/77] Holla holla no $ --- subworkflows/kraken_pipeline.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/kraken_pipeline.nf b/subworkflows/kraken_pipeline.nf index d4b92ed..a05ef0d 100644 --- 
a/subworkflows/kraken_pipeline.nf +++ b/subworkflows/kraken_pipeline.nf @@ -24,7 +24,7 @@ workflow kraken_pipeline { // } - taxonomy = file($params.taxonomy_dir, type: "dir", checkIfExists:true) + taxonomy = file(params.taxonomy_dir, type: "dir", checkIfExists:true) // if (database) { // source_database = database From 19d649e1249ae4e52bf7cacb67f912d911135149 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 17:20:35 +0100 Subject: [PATCH 21/77] bgzip fastp out --- modules/fastp.nf | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/modules/fastp.nf b/modules/fastp.nf index bc8c8f5..e75dc0a 100644 --- a/modules/fastp.nf +++ b/modules/fastp.nf @@ -16,8 +16,8 @@ process fastp_paired { output: val(prefix) - path("${prefix}_1.fastp.fastq.gz") - path("${prefix}_2.fastp.fastq.gz") + path("${prefix}_1.fastp.fastq.gz"), optional: true + path("${prefix}_2.fastp.fastq.gz"), optional: true path("${prefix}.fastp.fastq.gz"), emit: processed_fastq path("${prefix}.fastp.json") @@ -26,13 +26,26 @@ process fastp_paired { fastp \\ --in1 ${fastq_1} \\ --in2 ${fastq_2} \\ - --out1 ${prefix}_1.fastp.fastq.gz \\ - --out2 ${prefix}_2.fastp.fastq.gz \\ - --merged-out ${prefix}.fastp.fastq.gz \\ + --out1 ${prefix}_1.fastp.fastq \\ + --out2 ${prefix}_2.fastp.fastq \\ + --merged-out ${prefix}.fastp.fastq \\ --json ${prefix}.fastp.json \\ --thread $task.cpus \\ --detect_adapter_for_pe \\ 2> ${prefix}.fastp.log + + if [ -s ${prefix}_1.fastp.fastq ]; then + bgzip --threads -c $task.cpus > ${prefix}_1.fastp.fastq.gz + fi + + if [ -s ${prefix}_2.fastp.fastq ]; then + bgzip --threads -c $task.cpus > ${prefix}_2.fastp.fastq.gz + fi + + if [ -s ${prefix}.fastp.fastq ]; then + bgzip --threads -c $task.cpus > ${prefix}.fastp.fastq.gz + fi + """ } @@ -62,9 +75,13 @@ process fastp_single { """ fastp \\ --in1 ${fastq} \\ - --out1 ${prefix}.fastp.fastq.gz \\ + --out1 ${prefix}.fastp.fastq \\ --json ${prefix}.fastp.json \\ --thread $task.cpus \\ 2> ${prefix}.fastp.log + + if [ -s ${prefix}.fastp.fastq ]; then + bgzip --threads -c $task.cpus > ${prefix}.fastp.fastq.gz + fi """ } \ No newline at end of file From 88a79dd3a4bf7f475fe0c91ed926a5e27c4aae21 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 17:42:19 +0100 Subject: [PATCH 22/77] move fastp to scylla container --- modules/fastp.nf | 18 ++++++------------ nextflow.config | 2 +- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/modules/fastp.nf b/modules/fastp.nf index e75dc0a..c9a6dc9 100644 --- a/modules/fastp.nf +++ b/modules/fastp.nf @@ -4,10 +4,7 @@ process fastp_paired { publishDir "${params.out_dir}/preprocess/", mode: 'copy' - conda "bioconda::fastp=0.23.4" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/fastp:0.23.4--h5f740d0_0' : - 'biocontainers/fastp:0.23.4--h5f740d0_0' }" + container "biowilko/scylla@${params.wf.container_sha}" input: val(prefix) @@ -35,15 +32,15 @@ process fastp_paired { 2> ${prefix}.fastp.log if [ -s ${prefix}_1.fastp.fastq ]; then - bgzip --threads -c $task.cpus > ${prefix}_1.fastp.fastq.gz + bgzip --threads $task.cpus -c > ${prefix}_1.fastp.fastq.gz fi if [ -s ${prefix}_2.fastp.fastq ]; then - bgzip --threads -c $task.cpus > ${prefix}_2.fastp.fastq.gz + bgzip --threads $task.cpus -c > ${prefix}_2.fastp.fastq.gz fi if [ -s ${prefix}.fastp.fastq ]; then - bgzip --threads -c $task.cpus > ${prefix}.fastp.fastq.gz + bgzip --threads $task.cpus -c > ${prefix}.fastp.fastq.gz fi """ @@ -56,10 +53,7 @@ process fastp_single { publishDir "${params.out_dir}/preprocess/", mode: 'copy' - conda "bioconda::fastp=0.23.4" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/fastp:0.23.4--h5f740d0_0' : - 'biocontainers/fastp:0.23.4--h5f740d0_0' }" + container "biowilko/scylla@${params.wf.container_sha}" input: val(prefix) @@ -81,7 +75,7 @@ process fastp_single { 2> ${prefix}.fastp.log if [ -s ${prefix}.fastp.fastq ]; then - bgzip --threads -c $task.cpus > ${prefix}.fastp.fastq.gz + bgzip --threads $task.cpus -c > ${prefix}.fastp.fastq.gz fi """ } \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 2629838..cfb36b2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -85,7 +85,7 @@ params { "--fastq test_data/barcode01/reads.fastq.gz", ] agent = null - container_sha = "sha256:aeaf567b0e05bea87081a8d0f86d92a06664707abd0e61d84146fad26597cb1f" + container_sha = "sha256:6cfdca7d4a731f0af221f0cfc842e3dbf4524db0503e71198121cba96b7e7700" } } From 4e913d7407e5102c567000f283c0b3a910053d1c Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 17:45:24 +0100 Subject: [PATCH 23/77] ooops wrong hash --- dockerfiles/scylla/environment.yml | 4 +++- nextflow.config | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dockerfiles/scylla/environment.yml b/dockerfiles/scylla/environment.yml index b3ac398..96884ce 100644 --- a/dockerfiles/scylla/environment.yml +++ b/dockerfiles/scylla/environment.yml @@ -14,4 +14,6 @@ dependencies: - biopython - pandas - numpy - - taxonkit \ No newline at end of file + - taxonkit + - fastp + - tabix \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index cfb36b2..7a93445 100644 --- a/nextflow.config +++ b/nextflow.config @@ -85,7 +85,7 @@ params { "--fastq test_data/barcode01/reads.fastq.gz", ] agent = null - container_sha = "sha256:6cfdca7d4a731f0af221f0cfc842e3dbf4524db0503e71198121cba96b7e7700" + container_sha = "sha256:f0a53699c2c367dc98c4dc37e9a683972219ca6242032666a76cf2b081ff5825" } } From 48882d985876a2c8d8151e0293c5576c3043ce94 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 17:52:01 +0100 Subject: [PATCH 24/77] actually compress the fastqs dummy --- modules/fastp.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/fastp.nf b/modules/fastp.nf index c9a6dc9..82fd0bb 100644 --- a/modules/fastp.nf +++ b/modules/fastp.nf @@ -32,15 +32,15 @@ process fastp_paired { 2> ${prefix}.fastp.log if [ -s ${prefix}_1.fastp.fastq ]; then - bgzip --threads $task.cpus -c > ${prefix}_1.fastp.fastq.gz + bgzip --threads $task.cpus -c ${prefix}_1.fastp.fastq > ${prefix}_1.fastp.fastq.gz fi if [ -s 
${prefix}_2.fastp.fastq ]; then - bgzip --threads $task.cpus -c > ${prefix}_2.fastp.fastq.gz + bgzip --threads $task.cpus -c ${prefix}_2.fastp.fastq > ${prefix}_2.fastp.fastq.gz fi if [ -s ${prefix}.fastp.fastq ]; then - bgzip --threads $task.cpus -c > ${prefix}.fastp.fastq.gz + bgzip --threads $task.cpus -c ${prefix}.fastp.fastq > ${prefix}.fastp.fastq.gz fi """ @@ -75,7 +75,7 @@ process fastp_single { 2> ${prefix}.fastp.log if [ -s ${prefix}.fastp.fastq ]; then - bgzip --threads $task.cpus -c > ${prefix}.fastp.fastq.gz + bgzip --threads $task.cpus -c ${prefix}.fastp.fastq > ${prefix}.fastp.fastq.gz fi """ } \ No newline at end of file From 415f191d3159d9c68b2846bc5d0d56961cba7eab Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 18:03:39 +0100 Subject: [PATCH 25/77] add mako to container --- dockerfiles/scylla/environment.yml | 3 ++- nextflow.config | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/dockerfiles/scylla/environment.yml b/dockerfiles/scylla/environment.yml index 96884ce..a41a800 100644 --- a/dockerfiles/scylla/environment.yml +++ b/dockerfiles/scylla/environment.yml @@ -16,4 +16,5 @@ dependencies: - numpy - taxonkit - fastp - - tabix \ No newline at end of file + - tabix + - mako \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 7a93445..d38bceb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -85,7 +85,7 @@ params { "--fastq test_data/barcode01/reads.fastq.gz", ] agent = null - container_sha = "sha256:f0a53699c2c367dc98c4dc37e9a683972219ca6242032666a76cf2b081ff5825" + container_sha = "sha256:4ff33113bd8f0b87ca971ec1e313d8e022b6131829fdd33694d490b399254f9f" } } From b23736a35950ed19009f38bf620f1afa2722c0d6 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 18:15:04 +0100 Subject: [PATCH 26/77] use pathlib in report --- bin/make_report.py | 55 ++++++++++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/bin/make_report.py b/bin/make_report.py index b7680b3..2aea734 100755 --- a/bin/make_report.py +++ b/bin/make_report.py @@ -16,9 +16,11 @@ from Bio import SeqIO -def make_output_report(report_to_generate, template, version, sample, data_for_report={}): +def make_output_report( + report_to_generate, template, version, sample, data_for_report={} +): template_dir = os.path.abspath(os.path.dirname(__file__)) - mylookup = TemplateLookup(directories=[template_dir]) #absolute or relative works + mylookup = TemplateLookup(directories=[template_dir]) # absolute or relative works mytemplate = mylookup.get_template(template) buf = StringIO() @@ -26,26 +28,28 @@ def make_output_report(report_to_generate, template, version, sample, data_for_r print(template_dir) js_path = os.path.join(template_dir, "report_utils", "sankey.js") print(js_path) - with open(js_path,'r') as f: + with open(js_path, "r") as f: data_for_report["sankey_js"] = f.read() - ctx = Context(buf, - date = date.today(), - version = version, - sample=sample, - data_for_report = data_for_report) + ctx = Context( + buf, + date=date.today(), + version=version, + sample=sample, + data_for_report=data_for_report, + ) try: mytemplate.render_context(ctx) except: traceback = RichTraceback() - for (filename, lineno, function, line) in traceback.traceback: + for filename, lineno, function, line in traceback.traceback: print("File %s, line %s, in %s" % (filename, lineno, function)) print(line, "\n") print("%s: %s" % (str(traceback.error.__class__.__name__), traceback.error)) - with open(report_to_generate, 
'w') as fw: + with open(report_to_generate, "w") as fw: print("Generating: " + f"{report_to_generate}") fw.write(buf.getvalue()) @@ -53,34 +57,43 @@ def make_output_report(report_to_generate, template, version, sample, data_for_r def main(): parser = argparse.ArgumentParser() - parser.add_argument("--assignments", help="JSON file of kraken/bracken assignments", required=True) - parser.add_argument("--read_counts", help="JSON file of read_counts", required=False) - parser.add_argument("--sample_id", help="Unique ID of sample", required=False, default="sample") + parser.add_argument( + "--assignments", + help="JSON file of kraken/bracken assignments", + required=True, + type=Path, + ) + parser.add_argument( + "--read_counts", help="JSON file of read_counts", required=False, type=Path + ) + parser.add_argument( + "--sample_id", help="Unique ID of sample", required=False, default="sample" + ) parser.add_argument("--prefix", help="HTML output prefix ", default="scylla") - parser.add_argument("--template", help="HTML template for report", default="scylla.mako.html") + parser.add_argument( + "--template", help="HTML template for report", default="scylla.mako.html" + ) parser.add_argument("--version", help="Scylla version", default="unknown") args = parser.parse_args() sample = args.sample_id - with open(args.assignments, 'r') as bracken_file: + with open(args.assignments.resolve(), "r") as bracken_file: assignments = bracken_file.read().strip().replace('"', '\\"') if args.read_counts: - with open(args.read_counts, 'r') as qc_file: - read_counts = qc_file.read().strip().replace('"', '\\"') + with open(args.read_counts.resolve(), "r") as qc_file: + read_counts = qc_file.read().strip().replace('"', '\\"') else: read_counts = {} - data_for_report = {"sankey_data":assignments, "read_count_data": read_counts} - - + data_for_report = {"sankey_data": assignments, "read_count_data": read_counts} outfile = args.prefix + "_report.html" make_output_report(outfile, args.template, args.version, sample, data_for_report) if __name__ == "__main__": - main() \ No newline at end of file + main() From 135311533d72369b41801d2cf570533895cabaaf Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 18:18:15 +0100 Subject: [PATCH 27/77] specify json is a text file? 
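
In Python 3 the "t" is implicit: open(path, "r") and open(path, "rt") both open
the file in text mode, so this change only spells the intent out. A minimal
sketch (hypothetical filename):

    with open("assignments.json", "rt") as f:  # identical to mode="r"
        assignments = f.read()
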
--- bin/make_report.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/make_report.py b/bin/make_report.py index 2aea734..55f7c1b 100755 --- a/bin/make_report.py +++ b/bin/make_report.py @@ -81,11 +81,11 @@ def main(): sample = args.sample_id - with open(args.assignments.resolve(), "r") as bracken_file: + with open(args.assignments.resolve(), "rt") as bracken_file: assignments = bracken_file.read().strip().replace('"', '\\"') if args.read_counts: - with open(args.read_counts.resolve(), "r") as qc_file: + with open(args.read_counts.resolve(), "rt") as qc_file: read_counts = qc_file.read().strip().replace('"', '\\"') else: read_counts = {} From 26ed5ef87301cf9dd705b35df5ec426bf57dbcad Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 18:23:34 +0100 Subject: [PATCH 28/77] treat bracken jsons in a sane manner --- modules/kraken_client.nf | 1 + subworkflows/kraken_pipeline.nf | 15 ++++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/modules/kraken_client.nf b/modules/kraken_client.nf index dd89ea8..dcc2265 100644 --- a/modules/kraken_client.nf +++ b/modules/kraken_client.nf @@ -124,6 +124,7 @@ process bracken_to_json { path bracken_summary output: path "${params.database_set}.bracken.json" + """ cat "${bracken_summary}" | cut -f2,6 | tail -n+2 > taxacounts.txt cat "${bracken_summary}" | cut -f2 | tail -n+2 > taxa.txt diff --git a/subworkflows/kraken_pipeline.nf b/subworkflows/kraken_pipeline.nf index a05ef0d..8b076b8 100644 --- a/subworkflows/kraken_pipeline.nf +++ b/subworkflows/kraken_pipeline.nf @@ -40,13 +40,18 @@ workflow kraken_pipeline { // start_server(database, taxonomy) run_kraken_and_bracken(unique_id, fastq, database, taxonomy) + + run_kraken_and_bracken.out.json + .collect() + .set { kraken_jsons } + // stop_server(run_kraken_and_bracken.out.bracken_report.collect()) qc_checks(unique_id, fastq) - all_bracken_jsons = Channel.fromPath("${params.out_dir}/${unique_id}/classifications/*.bracken.json") - .concat(run_kraken_and_bracken.out.json) - .unique {it.getName()} - .collect() - generate_report(unique_id, qc_checks.out, all_bracken_jsons ) + // all_bracken_jsons = Channel.fromPath("${params.out_dir}/${unique_id}/classifications/*.bracken.json") + // .concat(run_kraken_and_bracken.out.json) + // .unique {it.getName()} + // .collect() + generate_report(unique_id, qc_checks.out, bracken_jsons ) emit: bracken_report = run_kraken_and_bracken.out.bracken_report kraken_report = run_kraken_and_bracken.out.kraken_report From c1dd4ed844c25b07a3a1afa598d8eaae436bd7c7 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 15 Jun 2023 18:24:36 +0100 Subject: [PATCH 29/77] bracken not kraken dumdum --- subworkflows/kraken_pipeline.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/kraken_pipeline.nf b/subworkflows/kraken_pipeline.nf index 8b076b8..bb44191 100644 --- a/subworkflows/kraken_pipeline.nf +++ b/subworkflows/kraken_pipeline.nf @@ -43,7 +43,7 @@ workflow kraken_pipeline { run_kraken_and_bracken.out.json .collect() - .set { kraken_jsons } + .set { bracken_jsons } // stop_server(run_kraken_and_bracken.out.bracken_report.collect()) qc_checks(unique_id, fastq) From cd6f267befb324b3b07f0515f394cc0bb51b6d1e Mon Sep 17 00:00:00 2001 From: Biowilko Date: Fri, 16 Jun 2023 09:00:57 +0100 Subject: [PATCH 30/77] properly merge paired reads --- modules/fastp.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/fastp.nf b/modules/fastp.nf index 82fd0bb..23616f8 100644 --- 
a/modules/fastp.nf +++ b/modules/fastp.nf @@ -26,9 +26,9 @@ process fastp_paired { --out1 ${prefix}_1.fastp.fastq \\ --out2 ${prefix}_2.fastp.fastq \\ --merged-out ${prefix}.fastp.fastq \\ + -m \\ --json ${prefix}.fastp.json \\ --thread $task.cpus \\ - --detect_adapter_for_pe \\ 2> ${prefix}.fastp.log if [ -s ${prefix}_1.fastp.fastq ]; then From 041fddc0351d11111d3c165bfbdfbaf6ba920cdc Mon Sep 17 00:00:00 2001 From: Biowilko Date: Fri, 16 Jun 2023 09:06:29 +0100 Subject: [PATCH 31/77] fix fastp paired params --- modules/fastp.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/fastp.nf b/modules/fastp.nf index 23616f8..cdf146c 100644 --- a/modules/fastp.nf +++ b/modules/fastp.nf @@ -25,7 +25,7 @@ process fastp_paired { --in2 ${fastq_2} \\ --out1 ${prefix}_1.fastp.fastq \\ --out2 ${prefix}_2.fastp.fastq \\ - --merged-out ${prefix}.fastp.fastq \\ + --merged_out ${prefix}.fastp.fastq \\ -m \\ --json ${prefix}.fastp.json \\ --thread $task.cpus \\ From 05f047efc5244c0cd3ecc5d2158454e21c406bd7 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Mon, 19 Jun 2023 09:31:58 +0100 Subject: [PATCH 32/77] Add k2_server dockerfile --- dockerfiles/kraken2-server/Dockerfile | 9 +++++++++ dockerfiles/kraken2-server/environment.yml | 8 ++++++++ 2 files changed, 17 insertions(+) create mode 100644 dockerfiles/kraken2-server/Dockerfile create mode 100644 dockerfiles/kraken2-server/environment.yml diff --git a/dockerfiles/kraken2-server/Dockerfile b/dockerfiles/kraken2-server/Dockerfile new file mode 100644 index 0000000..b2df307 --- /dev/null +++ b/dockerfiles/kraken2-server/Dockerfile @@ -0,0 +1,9 @@ +FROM condaforge/mambaforge:latest AS conda + +COPY environment.yml . + +RUN /opt/conda/bin/mamba env create -f /environment.yml + +ENV PATH=/opt/conda/envs/kraken2-server/bin:$PATH + +CMD ["/bin/bash"] diff --git a/dockerfiles/kraken2-server/environment.yml b/dockerfiles/kraken2-server/environment.yml new file mode 100644 index 0000000..ade0536 --- /dev/null +++ b/dockerfiles/kraken2-server/environment.yml @@ -0,0 +1,8 @@ +name: kraken2-server +channels: + - bioconda + - conda-forge + - defaults + - epi2melabs +dependencies: + - kraken2-server # not on mac \ No newline at end of file From 26d5e30516dc1569a885c9d884b9dda0152b191c Mon Sep 17 00:00:00 2001 From: Biowilko Date: Mon, 19 Jun 2023 10:23:20 +0100 Subject: [PATCH 33/77] Make fastp output into correct dir --- modules/fastp.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/fastp.nf b/modules/fastp.nf index cdf146c..8720534 100644 --- a/modules/fastp.nf +++ b/modules/fastp.nf @@ -2,7 +2,7 @@ process fastp_paired { label 'process_medium' - publishDir "${params.out_dir}/preprocess/", mode: 'copy' + publishDir "${params.out_dir}/${unique_id}/preprocess/", mode: 'copy' container "biowilko/scylla@${params.wf.container_sha}" From bf7cb65525f457454df219609abf7049915df866 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Mon, 19 Jun 2023 16:59:05 +0100 Subject: [PATCH 34/77] fix publish dir --- modules/assemble_taxa.nf | 2 +- modules/extract_taxa.nf | 2 +- modules/fastp.nf | 4 ++-- modules/generate_report.nf | 2 +- modules/get_params_and_versions.nf | 6 ++---- modules/kraken_client.nf | 6 +++--- modules/qc_checks.nf | 2 +- 7 files changed, 11 insertions(+), 13 deletions(-) diff --git a/modules/assemble_taxa.nf b/modules/assemble_taxa.nf index a2811ab..be71157 100644 --- a/modules/assemble_taxa.nf +++ b/modules/assemble_taxa.nf @@ -11,7 +11,7 @@ process extract_reads { label 'process_medium' - publishDir path: 
"${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy' + publishDir path: "${params.out_dir}/${params.unique_id}/reads_by_taxa", mode: 'copy' conda 'bioconda::biopython=1.78' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf index 996be81..b4c792c 100644 --- a/modules/extract_taxa.nf +++ b/modules/extract_taxa.nf @@ -11,7 +11,7 @@ process extract_reads { label 'process_medium' - publishDir path: "${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy' + publishDir path: "${params.out_dir}/${params.unique_id}/reads_by_taxa", mode: 'copy' conda 'bioconda::biopython=1.78' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/fastp.nf b/modules/fastp.nf index 8720534..e451a70 100644 --- a/modules/fastp.nf +++ b/modules/fastp.nf @@ -2,7 +2,7 @@ process fastp_paired { label 'process_medium' - publishDir "${params.out_dir}/${unique_id}/preprocess/", mode: 'copy' + publishDir "${params.out_dir}/${params.unique_id}/preprocess/", mode: 'copy' container "biowilko/scylla@${params.wf.container_sha}" @@ -51,7 +51,7 @@ process fastp_single { label 'process_medium' - publishDir "${params.out_dir}/preprocess/", mode: 'copy' + publishDir "${params.out_dir}/${params.unique_id}/preprocess/", mode: 'copy' container "biowilko/scylla@${params.wf.container_sha}" diff --git a/modules/generate_report.nf b/modules/generate_report.nf index 928c146..232fa7d 100644 --- a/modules/generate_report.nf +++ b/modules/generate_report.nf @@ -3,7 +3,7 @@ process make_report { label "process_low" - publishDir path: "${params.out_dir}/${unique_id}", mode: 'copy' + publishDir path: "${params.out_dir}/${params.unique_id}/", mode: 'copy' conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3" container "biowilko/scylla@${params.wf.container_sha}" diff --git a/modules/get_params_and_versions.nf b/modules/get_params_and_versions.nf index cea7a8b..7822757 100644 --- a/modules/get_params_and_versions.nf +++ b/modules/get_params_and_versions.nf @@ -5,9 +5,7 @@ import groovy.json.JsonBuilder process get_versions { - - label "scylla" - publishDir "${params.out_dir}/execution", mode: 'copy' + publishDir "${params.out_dir}/${params.unique_id}/execution", mode: 'copy' cpus 1 output: path "versions.txt" @@ -21,7 +19,7 @@ process get_versions { process get_params { - publishDir "${params.out_dir}/execution", mode: 'copy' + publishDir "${params.out_dir}/${params.unique_id}/execution", mode: 'copy' cpus 1 output: path "params.json" diff --git a/modules/kraken_client.nf b/modules/kraken_client.nf index dcc2265..a4979db 100644 --- a/modules/kraken_client.nf +++ b/modules/kraken_client.nf @@ -29,7 +29,7 @@ process combine_kraken_outputs { container "biowilko/scylla@${params.wf.container_sha}" - publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' + publishDir path: "${params.out_dir}/${params.unique_id}/classifications", mode: 'copy' input: val unique_id @@ -81,7 +81,7 @@ process bracken { label 'process_low' - publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' + publishDir path: "${params.out_dir}/${params.unique_id}/classifications", mode: 'copy' conda "bioconda::bracken=2.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
@@ -111,7 +111,7 @@ process bracken_to_json { label "process_low" - publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' + publishDir path: "${params.out_dir}/${params.unique_id}/classifications", mode: 'copy' conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3" container "biowilko/scylla@${params.wf.container_sha}" diff --git a/modules/qc_checks.nf b/modules/qc_checks.nf index 45b23e8..2d384bd 100644 --- a/modules/qc_checks.nf +++ b/modules/qc_checks.nf @@ -23,7 +23,7 @@ process combine_stats { label "process_low" - publishDir "${params.out_dir}/${unique_id}/qc", mode: 'copy' + publishDir "${params.out_dir}/${params.unique_id}/qc", mode: 'copy' conda "conda-forge::pandas=1.2.4 conda-forge::numpy=1.20.3" container "biowilko/scylla@${params.wf.container_sha}" From 4a534768f4568af942628c8304514c2780d71c45 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Tue, 20 Jun 2023 10:27:43 +0100 Subject: [PATCH 35/77] Don't exit 2 if too much human --- bin/extract_kraken_reads.py | 382 +++++++++++++++++++++++++++--------- 1 file changed, 285 insertions(+), 97 deletions(-) diff --git a/bin/extract_kraken_reads.py b/bin/extract_kraken_reads.py index 569098e..33bc107 100755 --- a/bin/extract_kraken_reads.py +++ b/bin/extract_kraken_reads.py @@ -14,30 +14,34 @@ import json from datetime import datetime + def mean(l): if len(l) == 0: return 0 - return sum(l)/len(l) + return sum(l) / len(l) + def median(l): - if len(l)%2 == 0: - i = (len(l))/2 + if len(l) % 2 == 0: + i = (len(l)) / 2 else: - i = (len(l) + 1)/2 + i = (len(l) + 1) / 2 i = int(i) l = sorted(l) return l[i] + def parse_depth(name): parse_name = name.split(" ") depth = 0 for i in parse_name: - if i!="": + if i != "": break depth += 1 - depth = int(depth/2) + depth = int(depth / 2) return depth + def get_bracken_hierarchy(kraken_file, bracken_file, max_human=None): first = kraken_file if not kraken_file: @@ -49,24 +53,43 @@ def get_bracken_hierarchy(kraken_file, bracken_file, max_human=None): for line in f: if line.startswith("% of Seqs"): continue - percentage, num_clade_root, num_direct, raw_rank, ncbi, name = line.strip().split("\t") + ( + percentage, + num_clade_root, + num_direct, + raw_rank, + ncbi, + name, + ) = line.strip().split("\t") percentage = float(percentage) num_clade_root = int(num_clade_root) num_direct = int(num_direct) depth = parse_depth(name) name = name.strip() rank = raw_rank[0] - hierarchy = hierarchy[:depth - 1] + hierarchy = hierarchy[: depth - 1] hierarchy.append(ncbi) - if name in ['Homo sapiens', 'unclassified', 'root']: - if max_human and name == 'Homo sapiens' and num_direct > max_human: - sys.stderr.write("ERROR: found %i human reads, max allowed is %i\n" %(num_direct, max_human)) - sys.exit(2) + if name in ["Homo sapiens", "unclassified", "root"]: + if max_human and name == "Homo sapiens" and num_direct > max_human: + sys.stderr.write( + "ERROR: found %i human reads, max allowed is %i\n" + % (num_direct, max_human) + ) + # sys.exit(2) continue - entries[ncbi] = {"percentage": percentage, "count": num_direct, "count_descendants": num_clade_root, - "raw_rank": raw_rank, "rank": rank, "depth": depth, "name": name, "parents":[], "children":[]} + entries[ncbi] = { + "percentage": percentage, + "count": num_direct, + "count_descendants": num_clade_root, + "raw_rank": raw_rank, + "rank": rank, + "depth": depth, + "name": name, + "parents": [], + "children": [], + } if len(hierarchy) > 1: parent = hierarchy[-2] @@ -77,22 +100,45 @@ def get_bracken_hierarchy(kraken_file, bracken_file, 
max_human=None): if kraken_file and bracken_file: with open(bracken_file, "r") as f: for line in f: - percentage, num_clade_root, num_direct, raw_rank, ncbi, name = line.strip().split("\t") + ( + percentage, + num_clade_root, + num_direct, + raw_rank, + ncbi, + name, + ) = line.strip().split("\t") percentage = float(percentage) num_clade_root = int(num_clade_root) num_direct = int(num_direct) name = name.strip() - if name in ['Homo sapiens', 'unclassified', 'root']: - continue + if name in ["Homo sapiens", "unclassified", "root"]: + continue - entries[ncbi].update({"percentage": percentage, "count": num_direct, "count_descendants": num_clade_root}) + entries[ncbi].update( + { + "percentage": percentage, + "count": num_direct, + "count_descendants": num_clade_root, + } + ) - sys.stdout.write("FOUND %i TAXA IN BRACKEN REPORT\n" %len(entries)) + sys.stdout.write("FOUND %i TAXA IN BRACKEN REPORT\n" % len(entries)) return entries -def get_taxon_id_lists(bracken_hierarchy, names=[], target_ranks=[], min_count=None,min_count_descendants=None, -min_percent=None, top_n=None, include_parents=False, include_children=False): + +def get_taxon_id_lists( + bracken_hierarchy, + names=[], + target_ranks=[], + min_count=None, + min_count_descendants=None, + min_percent=None, + top_n=None, + include_parents=False, + include_children=False, +): lists_to_extract = {} for taxon in bracken_hierarchy: entry = bracken_hierarchy[taxon] @@ -116,8 +162,7 @@ def get_taxon_id_lists(bracken_hierarchy, names=[], target_ranks=[], min_count=N child = children.pop() lists_to_extract[taxon].append(child) children.extend(bracken_hierarchy[child]["children"]) - sys.stdout.write("SELECTED %i TAXA TO EXTRACT\n" %len(lists_to_extract)) - + sys.stdout.write("SELECTED %i TAXA TO EXTRACT\n" % len(lists_to_extract)) if top_n and len(lists_to_extract) > top_n: X = list(lists_to_extract.keys()) @@ -126,18 +171,18 @@ def get_taxon_id_lists(bracken_hierarchy, names=[], target_ranks=[], min_count=N to_delete = ordered[top_n:] for taxon in to_delete: del lists_to_extract[taxon] - sys.stdout.write("REDUCED TO %i TAXA TO EXTRACT\n" %len(lists_to_extract)) - + sys.stdout.write("REDUCED TO %i TAXA TO EXTRACT\n" % len(lists_to_extract)) return lists_to_extract + def check_read_files(reads): - if(reads[-3:] == '.gz'): - read_file = gzip.open(reads,'rt') - zipped=True + if reads[-3:] == ".gz": + read_file = gzip.open(reads, "rt") + zipped = True else: - read_file = open(reads,'rt') - zipped=False + read_file = open(reads, "rt") + zipped = False first = read_file.readline() if len(first) == 0: sys.stderr.write("ERROR: sequence file's first line is blank\n") @@ -151,10 +196,11 @@ def check_read_files(reads): sys.exit(1) return filetype, zipped + def parse_kraken_assignment_line(line): - line_vals = line.strip().split('\t') + line_vals = line.strip().split("\t") if len(line_vals) < 5: - return -1, '' + return -1, "" if "taxid" in line_vals[2]: temp = line_vals[2].split("taxid ")[-1] tax_id = temp[:-1] @@ -162,15 +208,30 @@ def parse_kraken_assignment_line(line): tax_id = line_vals[2] read_id = line_vals[1] - if (tax_id == 'A'): + if tax_id == "A": tax_id = 81077 else: tax_id = tax_id return tax_id, read_id -def extract_taxa(kraken_file, bracken_file, kraken_assignment_file, reads1, reads2, prefix, max_human=None, names=[], target_ranks=[],min_count=None, -min_count_descendants=None, min_percent=None, top_n=None, include_parents=False, include_children=False): +def extract_taxa( + kraken_file, + bracken_file, + kraken_assignment_file, + reads1, + 
reads2, + prefix, + max_human=None, + names=[], + target_ranks=[], + min_count=None, + min_count_descendants=None, + min_percent=None, + top_n=None, + include_parents=False, + include_children=False, +): # open read files filetype, zipped = check_read_files(reads1) s_file1 = SeqIO.index(reads1, filetype) @@ -179,8 +240,17 @@ def extract_taxa(kraken_file, bracken_file, kraken_assignment_file, reads1, read # get taxids to extract bracken_hierarchy = get_bracken_hierarchy(kraken_file, bracken_file, max_human) - lists_to_extract = get_taxon_id_lists(bracken_hierarchy, names, target_ranks,min_count,min_count_descendants, - min_percent, top_n, include_parents, include_children) + lists_to_extract = get_taxon_id_lists( + bracken_hierarchy, + names, + target_ranks, + min_count, + min_count_descendants, + min_percent, + top_n, + include_parents, + include_children, + ) # open output files outfile_handles = {} @@ -194,16 +264,18 @@ def extract_taxa(kraken_file, bracken_file, kraken_assignment_file, reads1, read if key not in keys: keys[key] = [] keys[key].append(taxon) - outfile_handles[taxon] = open("%s.%s.%s" %(prefix, taxon, filetype), "w") - print("opening %s.%s.%s" %(prefix, taxon, filetype)) + outfile_handles[taxon] = open("%s.%s.%s" % (prefix, taxon, filetype), "w") + print("opening %s.%s.%s" % (prefix, taxon, filetype)) out_counts[taxon] = 0 quals[taxon] = [] lens[taxon] = [] - sys.stdout.write("INCLUDING PARENTS/CHILDREN, HAVE %i TAXA TO INCLUDE IN READ FILES\n" %len(keys)) - sys.stdout.write("[%s]\n" %','.join(keys)) - + sys.stdout.write( + "INCLUDING PARENTS/CHILDREN, HAVE %i TAXA TO INCLUDE IN READ FILES\n" + % len(keys) + ) + sys.stdout.write("[%s]\n" % ",".join(keys)) - with open(kraken_assignment_file, 'r') as kfile: + with open(kraken_assignment_file, "r") as kfile: for line in kfile: tax_id, read_id = parse_kraken_assignment_line(line) if tax_id in keys: @@ -212,13 +284,17 @@ def extract_taxa(kraken_file, bracken_file, kraken_assignment_file, reads1, read elif reads2 and read_id in s_file2: read = s_file2[read_id] else: - sys.stderr.write("ERROR: read id %s not found in read files\n" %read_id) + sys.stderr.write( + "ERROR: read id %s not found in read files\n" % read_id + ) sys.exit(1) for taxon in keys[tax_id]: SeqIO.write(read, outfile_handles[taxon], filetype) out_counts[taxon] += 1 - quals[taxon].append(median(read.letter_annotations["phred_quality"])) + quals[taxon].append( + median(read.letter_annotations["phred_quality"]) + ) lens[taxon].append(len(read)) for handle in outfile_handles: @@ -227,81 +303,192 @@ def extract_taxa(kraken_file, bracken_file, kraken_assignment_file, reads1, read summary = [] for taxon in lists_to_extract: - summary.append({"human_readable": bracken_hierarchy[taxon]["name"], "taxon": taxon, "tax_level": bracken_hierarchy[taxon]["rank"], "filename":"%s.%s.%s" %(prefix, taxon, filetype), "qc_metrics":{"num_reads":out_counts[taxon], "avg_qual":mean(quals[taxon]), "mean_len":mean(lens[taxon])}}) - with open("%s_summary.json" %prefix, 'w') as f: + summary.append( + { + "human_readable": bracken_hierarchy[taxon]["name"], + "taxon": taxon, + "tax_level": bracken_hierarchy[taxon]["rank"], + "filename": "%s.%s.%s" % (prefix, taxon, filetype), + "qc_metrics": { + "num_reads": out_counts[taxon], + "avg_qual": mean(quals[taxon]), + "mean_len": mean(lens[taxon]), + }, + } + ) + with open("%s_summary.json" % prefix, "w") as f: print(summary) json.dump(summary, f) return out_counts -#Main method + +# Main method def main(): - #Parse arguments + # Parse arguments 
parser = argparse.ArgumentParser() - parser.add_argument('-k', dest='kraken_assignment_file', required=True, - help='Kraken assignment file to parse') - parser.add_argument('-r', dest='kraken_report_file', required=False, - help='Kraken file of taxon relationships and quantities') - parser.add_argument('-b', dest='bracken_report_file', required=False, - help='Bracken file of taxon relationships and quantities. Quantities used in preference over kraken') - parser.add_argument('-s','-s1', '-1', dest='reads1', required=True, - help='FASTA/FASTQ File containing the raw reads.') - parser.add_argument('-s2', '-2', dest='reads2', default= "", - help='2nd FASTA/FASTQ File containing the raw reads (paired).') - - parser.add_argument('-p', "--prefix",dest='prefix', required=True, default="taxid", - help='Prefix for output files') - - parser.add_argument("--taxid",dest='taxid', required=False, nargs='*', - help='List of taxonomy ID[s] or names to extract (space-delimited) - each to their own file') - parser.add_argument("--rank",dest='rank', required=False, nargs='*', - help='Rank(s) to extract') - parser.add_argument("--max_human",dest='max_human', required=False, type=int, - help='Maximum human reads to allow') - parser.add_argument("--min_count",dest='min_count', required=False, type=int, - help='Minimum direct read count') - parser.add_argument("--min_count_descendants",dest='min_count_descendants', required=False, type=int, - help='Minimum read count at taxon level or descendants') - parser.add_argument("--min_percent",dest='min_percent', required=False, type=float, - help='Minimum percentage of reads e.g 4') - parser.add_argument("--n",dest='top_n', required=False, type=int, - help='Maximum number of taxa to extract (top n)') - parser.add_argument('--include_parents', dest="include_parents", required=False, - action='store_true',default=False, - help='Include reads classified at parent levels of the specified taxids') - parser.add_argument('--include_children', dest='include_children', required=False, - action='store_true',default=False, - help='Include reads classified more specifically than the specified taxids') + parser.add_argument( + "-k", + dest="kraken_assignment_file", + required=True, + help="Kraken assignment file to parse", + ) + parser.add_argument( + "-r", + dest="kraken_report_file", + required=False, + help="Kraken file of taxon relationships and quantities", + ) + parser.add_argument( + "-b", + dest="bracken_report_file", + required=False, + help="Bracken file of taxon relationships and quantities. 
Quantities used in preference over kraken", + ) + parser.add_argument( + "-s", + "-s1", + "-1", + dest="reads1", + required=True, + help="FASTA/FASTQ File containing the raw reads.", + ) + parser.add_argument( + "-s2", + "-2", + dest="reads2", + default="", + help="2nd FASTA/FASTQ File containing the raw reads (paired).", + ) + + parser.add_argument( + "-p", + "--prefix", + dest="prefix", + required=True, + default="taxid", + help="Prefix for output files", + ) + + parser.add_argument( + "--taxid", + dest="taxid", + required=False, + nargs="*", + help="List of taxonomy ID[s] or names to extract (space-delimited) - each to their own file", + ) + parser.add_argument( + "--rank", dest="rank", required=False, nargs="*", help="Rank(s) to extract" + ) + parser.add_argument( + "--max_human", + dest="max_human", + required=False, + type=int, + help="Maximum human reads to allow", + ) + parser.add_argument( + "--min_count", + dest="min_count", + required=False, + type=int, + help="Minimum direct read count", + ) + parser.add_argument( + "--min_count_descendants", + dest="min_count_descendants", + required=False, + type=int, + help="Minimum read count at taxon level or descendants", + ) + parser.add_argument( + "--min_percent", + dest="min_percent", + required=False, + type=float, + help="Minimum percentage of reads e.g 4", + ) + parser.add_argument( + "--n", + dest="top_n", + required=False, + type=int, + help="Maximum number of taxa to extract (top n)", + ) + parser.add_argument( + "--include_parents", + dest="include_parents", + required=False, + action="store_true", + default=False, + help="Include reads classified at parent levels of the specified taxids", + ) + parser.add_argument( + "--include_children", + dest="include_children", + required=False, + action="store_true", + default=False, + help="Include reads classified more specifically than the specified taxids", + ) parser.set_defaults(append=False) - args=parser.parse_args() + args = parser.parse_args() if not args.kraken_report_file and not args.bracken_report_file: - sys.stderr.write("ERROR: require at least one report file from bracken or kraken\n") + sys.stderr.write( + "ERROR: require at least one report file from bracken or kraken\n" + ) sys.exit(1) - #Start Program + # Start Program now = datetime.now() time = now.strftime("%m/%d/%Y, %H:%M:%S") - sys.stdout.write("PROGRAM START TIME: " + time + '\n') - - rank_dict = {'kingdom':'K','domain':'D','phylum':'P','class':'C','order':'O','family':'F','genus':'G','species':'S', - 'K':'K', 'D':'D', 'P':'P', 'C':'C', 'O':'O', 'F':'F', 'G':'G', 'S':'S'} + sys.stdout.write("PROGRAM START TIME: " + time + "\n") + + rank_dict = { + "kingdom": "K", + "domain": "D", + "phylum": "P", + "class": "C", + "order": "O", + "family": "F", + "genus": "G", + "species": "S", + "K": "K", + "D": "D", + "P": "P", + "C": "C", + "O": "O", + "F": "F", + "G": "G", + "S": "S", + } if args.rank: target_ranks = [rank_dict[r] for r in args.rank] else: target_ranks = [] print(target_ranks) - out_counts = extract_taxa(args.kraken_report_file, args.bracken_report_file, args.kraken_assignment_file, - args.reads1, args.reads2, args.prefix, - max_human = args.max_human, target_ranks=target_ranks,min_count=args.min_count, - min_count_descendants=args.min_count_descendants, - min_percent=args.min_percent, top_n=args.top_n, include_parents=args.include_parents, - include_children=args.include_children) + out_counts = extract_taxa( + args.kraken_report_file, + args.bracken_report_file, + args.kraken_assignment_file, + 
args.reads1, + args.reads2, + args.prefix, + max_human=args.max_human, + target_ranks=target_ranks, + min_count=args.min_count, + min_count_descendants=args.min_count_descendants, + min_percent=args.min_percent, + top_n=args.top_n, + include_parents=args.include_parents, + include_children=args.include_children, + ) now = datetime.now() time = now.strftime("%m/%d/%Y, %H:%M:%S") - sys.stdout.write("PROGRAM END TIME: " + time + '\n') + sys.stdout.write("PROGRAM END TIME: " + time + "\n") sys.stdout.write("READ COUNTS: \n") @@ -310,5 +497,6 @@ def main(): sys.exit(0) + if __name__ == "__main__": main() From a6e87101ea47cdb82a8e0148c9e1d5f571e88321 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Tue, 20 Jun 2023 11:35:02 +0100 Subject: [PATCH 36/77] move params to own file --- conf/params.config | 80 ++++++++++++++++++++++++++++++++++++++++++++++ nextflow.config | 78 +------------------------------------------- 2 files changed, 81 insertions(+), 77 deletions(-) create mode 100644 conf/params.config diff --git a/conf/params.config b/conf/params.config new file mode 100644 index 0000000..f935a9b --- /dev/null +++ b/conf/params.config @@ -0,0 +1,80 @@ +params { + // TODO nf-core: Specify your pipeline's command line flags + // Input options + input = null + + // Boilerplate options + outdir = null + tracedir = "${params.outdir}/pipeline_info" + publish_dir_mode = 'copy' + email = null + email_on_fail = null + plaintext_email = false + monochrome_logs = false + hook_url = null + help = false + version = false + validate_params = true + show_hidden_params = false + schema_ignore_params = 'genomes' + + // Max resource options + // Defaults only, expecting to be overwritten + max_memory = '128.GB' + max_cpus = 16 + max_time = '240.h' + + help = false + version = false + wfversion = "v0.0.1" + + out_dir = "output" + store_dir = "store_dir" + + unique_id = null + fastq = null + fastq1 = null + fastq2 = null + fastq_dir = null + metadata = null + + database = null + taxonomy = null + bracken_dist = null + bracken_length = null + bracken_level = 'S' + database_set = "Viral" + + kraken_report = null + bracken_report = null + kraken_assignments = null + assembly_rank="S G" + assembly_min_reads = 10 + assembly_min_percent = 1 + max_human_reads_before_rejection = 1000 + read_type = "illumina" + + disable_ping = false + threads = 2 + k2_port = 8080 + k2_host = 'localhost' + port = 8080 + process_label = "scylla" + monochrome_logs = false + paired = false + taxonomy_dir = "/shared/public/db/taxonomy/" + + validate_params = true + show_hidden_params = false + + analyse_unclassified = false + schema_ignore_params = 'show_hidden_params,validate_params,monochrome_logs,aws_queue,aws_image_prefix,pangolin_version,wfversion,wf,process_label' + + wf { + example_cmd = [ + "--fastq test_data/barcode01/reads.fastq.gz", + ] + agent = null + container_sha = "sha256:4ff33113bd8f0b87ca971ec1e313d8e022b6131829fdd33694d490b399254f9f" + } +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index d38bceb..bb304ab 100644 --- a/nextflow.config +++ b/nextflow.config @@ -10,87 +10,11 @@ // for further help editing this file. 
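// Nextflow applies config files in order and later assignments override
// earlier ones, so a default pulled in via includeConfig can still be
// overridden afterwards. A hypothetical sketch (value not from this patch):
//
//   includeConfig 'conf/params.config'
//   params.database_set = "PlusPF"   // overrides the included "Viral" default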
-params { - // TODO nf-core: Specify your pipeline's command line flags - // Input options - input = null - // Boilerplate options - outdir = null - tracedir = "${params.outdir}/pipeline_info" - publish_dir_mode = 'copy' - email = null - email_on_fail = null - plaintext_email = false - monochrome_logs = false - hook_url = null - help = false - version = false - validate_params = true - show_hidden_params = false - schema_ignore_params = 'genomes' - - // Max resource options - // Defaults only, expecting to be overwritten - max_memory = '128.GB' - max_cpus = 16 - max_time = '240.h' - - help = false - version = false - wfversion = "v0.0.1" - - out_dir = "output" - store_dir = "store_dir" - - unique_id = null - fastq = null - fastq_dir = null - metadata = null - - database = null - taxonomy = null - bracken_dist = null - bracken_length = null - bracken_level = 'S' - database_set = "Viral" - - kraken_report = null - bracken_report = null - kraken_assignments = null - assembly_rank="S G" - assembly_min_reads = 10 - assembly_min_percent = 1 - max_human_reads_before_rejection = 1000 - read_type = "illumina" - - disable_ping = false - threads = 2 - k2_port = 8080 - k2_host = 'localhost' - port = 8080 - process_label = "scylla" - monochrome_logs = false - paired = false - taxonomy_dir = "/shared/public/db/taxonomy/" - - validate_params = true - show_hidden_params = false - - analyse_unclassified = false - schema_ignore_params = 'show_hidden_params,validate_params,monochrome_logs,aws_queue,aws_image_prefix,pangolin_version,wfversion,wf,process_label' - - wf { - example_cmd = [ - "--fastq test_data/barcode01/reads.fastq.gz", - ] - agent = null - container_sha = "sha256:4ff33113bd8f0b87ca971ec1e313d8e022b6131829fdd33694d490b399254f9f" - } -} // Load base.config by default for all pipelines includeConfig 'conf/base.config' +includeConfig 'conf/params.config' manifest { name = 'snowy-leopard/scylla' From 823c75ccc671049d4b5bda934bb4cad5ee813a73 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Tue, 20 Jun 2023 11:54:26 +0100 Subject: [PATCH 37/77] add additional_bracken_jsons parameter: to test on CLIMB --- conf/params.config | 1 + subworkflows/kraken_pipeline.nf | 20 ++++++++++++-------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/conf/params.config b/conf/params.config index f935a9b..68c3a6a 100644 --- a/conf/params.config +++ b/conf/params.config @@ -48,6 +48,7 @@ params { kraken_report = null bracken_report = null kraken_assignments = null + additional_bracken_jsons = null assembly_rank="S G" assembly_min_reads = 10 assembly_min_percent = 1 diff --git a/subworkflows/kraken_pipeline.nf b/subworkflows/kraken_pipeline.nf index bb44191..a3ba436 100644 --- a/subworkflows/kraken_pipeline.nf +++ b/subworkflows/kraken_pipeline.nf @@ -41,16 +41,20 @@ workflow kraken_pipeline { // start_server(database, taxonomy) run_kraken_and_bracken(unique_id, fastq, database, taxonomy) - run_kraken_and_bracken.out.json - .collect() - .set { bracken_jsons } - // stop_server(run_kraken_and_bracken.out.bracken_report.collect()) qc_checks(unique_id, fastq) - // all_bracken_jsons = Channel.fromPath("${params.out_dir}/${unique_id}/classifications/*.bracken.json") - // .concat(run_kraken_and_bracken.out.json) - // .unique {it.getName()} - // .collect() + if (params.additional_bracken_jsons) { + Channel.of(file(params.additional_bracken_jsons, type: "file", checkIfExists:true)) + .concat(run_kraken_and_bracken.out.json) + .unique {it.getName()} + .collect() + .set { bracken_jsons } + } else { + 
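        // no additional jsons supplied: report only on this run's bracken output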
run_kraken_and_bracken.out.json + .collect() + .set { bracken_jsons } + } + generate_report(unique_id, qc_checks.out, bracken_jsons ) emit: bracken_report = run_kraken_and_bracken.out.bracken_report From 1d0f445950989606918c0911043d3dfcc67e30f7 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Tue, 20 Jun 2023 12:08:26 +0100 Subject: [PATCH 38/77] allow carry through of self discovered unique_id --- main.nf | 6 ++++-- modules/assemble_taxa.nf | 2 +- modules/extract_taxa.nf | 2 +- modules/fastp.nf | 6 ++++-- modules/generate_report.nf | 2 +- modules/get_params_and_versions.nf | 14 ++++++++++---- modules/kraken_client.nf | 6 +++--- modules/qc_checks.nf | 2 +- 8 files changed, 25 insertions(+), 15 deletions(-) diff --git a/main.nf b/main.nf index 0d67642..e819e83 100644 --- a/main.nf +++ b/main.nf @@ -2,6 +2,8 @@ include { ingest } from './subworkflows/ingest' include { fastp_single; fastp_paired} from './modules/fastp' workflow { + unique_id = "${params.unique_id}" + // check input fastq exists and run fastp if (params.paired) { @@ -11,14 +13,14 @@ workflow { Channel.of(file(params.fastq2, type: "file", checkIfExists:true)) .set {input_fastq_2_ch} - fastp_paired(params.unique_id, input_fastq_1_ch, input_fastq_2_ch) + fastp_paired(unique_id, unique_id, input_fastq_1_ch, input_fastq_2_ch) fastp_paired.out.processed_fastq .set {processed_fastq} } else { Channel.of(file(params.fastq, type: "file", checkIfExists:true)) .set {input_fastq_ch} - fastp_single(params.unique_id, input_fastq_ch) + fastp_single(unique_id, unique_id input_fastq_ch) fastp_single.out.processed_fastq .set {processed_fastq} } diff --git a/modules/assemble_taxa.nf b/modules/assemble_taxa.nf index be71157..a2811ab 100644 --- a/modules/assemble_taxa.nf +++ b/modules/assemble_taxa.nf @@ -11,7 +11,7 @@ process extract_reads { label 'process_medium' - publishDir path: "${params.out_dir}/${params.unique_id}/reads_by_taxa", mode: 'copy' + publishDir path: "${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy' conda 'bioconda::biopython=1.78' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf index b4c792c..996be81 100644 --- a/modules/extract_taxa.nf +++ b/modules/extract_taxa.nf @@ -11,7 +11,7 @@ process extract_reads { label 'process_medium' - publishDir path: "${params.out_dir}/${params.unique_id}/reads_by_taxa", mode: 'copy' + publishDir path: "${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy' conda 'bioconda::biopython=1.78' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
diff --git a/modules/fastp.nf b/modules/fastp.nf index e451a70..747e431 100644 --- a/modules/fastp.nf +++ b/modules/fastp.nf @@ -2,11 +2,12 @@ process fastp_paired { label 'process_medium' - publishDir "${params.out_dir}/${params.unique_id}/preprocess/", mode: 'copy' + publishDir "${params.out_dir}/${unique_id}/preprocess/", mode: 'copy' container "biowilko/scylla@${params.wf.container_sha}" input: + val(unique_id) val(prefix) path(fastq_1) path(fastq_2) @@ -51,11 +52,12 @@ process fastp_single { label 'process_medium' - publishDir "${params.out_dir}/${params.unique_id}/preprocess/", mode: 'copy' + publishDir "${params.out_dir}/${unique_id}/preprocess/", mode: 'copy' container "biowilko/scylla@${params.wf.container_sha}" input: + val(unique_id) val(prefix) path(fastq) diff --git a/modules/generate_report.nf b/modules/generate_report.nf index 232fa7d..0fccc76 100644 --- a/modules/generate_report.nf +++ b/modules/generate_report.nf @@ -3,7 +3,7 @@ process make_report { label "process_low" - publishDir path: "${params.out_dir}/${params.unique_id}/", mode: 'copy' + publishDir path: "${params.out_dir}/${unique_id}/", mode: 'copy' conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3" container "biowilko/scylla@${params.wf.container_sha}" diff --git a/modules/get_params_and_versions.nf b/modules/get_params_and_versions.nf index 7822757..90f9359 100644 --- a/modules/get_params_and_versions.nf +++ b/modules/get_params_and_versions.nf @@ -5,8 +5,10 @@ import groovy.json.JsonBuilder process get_versions { - publishDir "${params.out_dir}/${params.unique_id}/execution", mode: 'copy' + publishDir "${params.out_dir}/${unique_id}/execution", mode: 'copy' cpus 1 + input: + val unique_id output: path "versions.txt" script: @@ -19,8 +21,10 @@ process get_versions { process get_params { - publishDir "${params.out_dir}/${params.unique_id}/execution", mode: 'copy' + publishDir "${params.out_dir}/${unique_id}/execution", mode: 'copy' cpus 1 + input: + val unique_id output: path "params.json" script: @@ -32,9 +36,11 @@ process get_params { } workflow get_params_and_versions { + take: + unique_ud main: - software_versions = get_versions() - workflow_params = get_params() + software_versions = get_versions(unique_id) + workflow_params = get_params(unique_id) emit: software_versions workflow_params diff --git a/modules/kraken_client.nf b/modules/kraken_client.nf index a4979db..dcc2265 100644 --- a/modules/kraken_client.nf +++ b/modules/kraken_client.nf @@ -29,7 +29,7 @@ process combine_kraken_outputs { container "biowilko/scylla@${params.wf.container_sha}" - publishDir path: "${params.out_dir}/${params.unique_id}/classifications", mode: 'copy' + publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' input: val unique_id @@ -81,7 +81,7 @@ process bracken { label 'process_low' - publishDir path: "${params.out_dir}/${params.unique_id}/classifications", mode: 'copy' + publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' conda "bioconda::bracken=2.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
@@ -111,7 +111,7 @@ process bracken_to_json { label "process_low" - publishDir path: "${params.out_dir}/${params.unique_id}/classifications", mode: 'copy' + publishDir path: "${params.out_dir}/${unique_id}/classifications", mode: 'copy' conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3" container "biowilko/scylla@${params.wf.container_sha}" diff --git a/modules/qc_checks.nf b/modules/qc_checks.nf index 2d384bd..45b23e8 100644 --- a/modules/qc_checks.nf +++ b/modules/qc_checks.nf @@ -23,7 +23,7 @@ process combine_stats { label "process_low" - publishDir "${params.out_dir}/${params.unique_id}/qc", mode: 'copy' + publishDir "${params.out_dir}/${unique_id}/qc", mode: 'copy' conda "conda-forge::pandas=1.2.4 conda-forge::numpy=1.20.3" container "biowilko/scylla@${params.wf.container_sha}" From b1f67dd50ee60c70ba8808a14305e2dfe83c8ea6 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Tue, 20 Jun 2023 12:11:56 +0100 Subject: [PATCH 39/77] typo --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index e819e83..c05d18a 100644 --- a/main.nf +++ b/main.nf @@ -20,7 +20,7 @@ workflow { Channel.of(file(params.fastq, type: "file", checkIfExists:true)) .set {input_fastq_ch} - fastp_single(unique_id, unique_id input_fastq_ch) + fastp_single(unique_id, unique_id, input_fastq_ch) fastp_single.out.processed_fastq .set {processed_fastq} } From d6509ac028ed058c9694d6d732f148ee7a3f2e19 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Tue, 20 Jun 2023 12:14:18 +0100 Subject: [PATCH 40/77] unify input declaration style in fastp --- modules/fastp.nf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/fastp.nf b/modules/fastp.nf index 747e431..22f2620 100644 --- a/modules/fastp.nf +++ b/modules/fastp.nf @@ -57,14 +57,14 @@ process fastp_single { container "biowilko/scylla@${params.wf.container_sha}" input: - val(unique_id) - val(prefix) - path(fastq) + val unique_id + val prefix + path fastq output: - val(prefix) - path("${prefix}.fastp.fastq.gz"), emit: processed_fastq - path("${prefix}.fastp.json") + val prefix + path "${prefix}.fastp.fastq.gz", emit: processed_fastq + path "${prefix}.fastp.json" script: From 9dc4e87fff634e929cb7c8f0e3824ef8c432c55a Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Tue, 20 Jun 2023 12:17:26 +0100 Subject: [PATCH 41/77] unify input declaration style in fastp --- modules/fastp.nf | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/modules/fastp.nf b/modules/fastp.nf index 22f2620..4faf898 100644 --- a/modules/fastp.nf +++ b/modules/fastp.nf @@ -7,17 +7,17 @@ process fastp_paired { container "biowilko/scylla@${params.wf.container_sha}" input: - val(unique_id) - val(prefix) - path(fastq_1) - path(fastq_2) + val unique_id + val prefix + path fastq_1 + path fastq_2 output: - val(prefix) - path("${prefix}_1.fastp.fastq.gz"), optional: true - path("${prefix}_2.fastp.fastq.gz"), optional: true - path("${prefix}.fastp.fastq.gz"), emit: processed_fastq - path("${prefix}.fastp.json") + val prefix + path "${prefix}_1.fastp.fastq.gz", optional: true + path "${prefix}_2.fastp.fastq.gz", optional: true + path "${prefix}.fastp.fastq.gz", emit: processed_fastq + path "${prefix}.fastp.json" script: """ From 90e629e17fae07ab5d52ff3d8d0c605604b10723 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Tue, 20 Jun 2023 12:20:00 +0100 Subject: [PATCH 42/77] typo --- modules/get_params_and_versions.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/modules/get_params_and_versions.nf b/modules/get_params_and_versions.nf
index 90f9359..3b12fc5 100644
--- a/modules/get_params_and_versions.nf
+++ b/modules/get_params_and_versions.nf
@@ -37,7 +37,7 @@ process get_params {
 workflow get_params_and_versions {
     take:
-        unique_ud
+        unique_id
     main:
         software_versions = get_versions(unique_id)
         workflow_params = get_params(unique_id)

From 9726dadc1948b147ceefff041878f9aff8c549d5 Mon Sep 17 00:00:00 2001
From: Rachel Colquhoun
Date: Tue, 20 Jun 2023 12:23:09 +0100
Subject: [PATCH 43/77] remove workflow - it's duplicated exactly in main.nf

---
 subworkflows/ingest.nf | 28 ----------------------------
 1 file changed, 28 deletions(-)

diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf
index 1e21a35..ebcc2f7 100644
--- a/subworkflows/ingest.nf
+++ b/subworkflows/ingest.nf
@@ -17,31 +17,3 @@ workflow ingest {
         html_report = kraken_pipeline.out.report
 }
-
-// workflow {
-//     // check input fastq exists and run fastp
-
-//     if (params.paired) {
-//         Channel.of(file(file(params.fastq1, type: "file", checkIfExists:true)))
-//                 .set {input_fastq_1_ch}
-
-//         Channel.of(file(file(params.fastq2, type: "file", checkIfExists:true)))
-//                 .set {input_fastq_2_ch}
-
-//         // input_fastq_1 = file(params.fastq1, type: "file", checkIfExists:true)
-//         // input_fastq_2 = file(params.fastq2, type: "file", checkIfExists:true)
-//         fastp_paired(params.unique_id, input_fastq_1_ch, input_fastq_2_ch)
-//         fastp_paired.out.processed_fastq
-//                 .set {processed_fastq}
-//     } else {
-//         Channel.of(file(file(params.fastq, type: "file", checkIfExists:true)))
-//                 .set {input_fastq_ch}
-
-//         // input_fastq = file(params.fastq, type: "file", checkIfExists:true)
-//         fastp_single(params.unique_id, input_fastq_ch)
-//         fastp_single.out.processed_fastq
-//                 .set {processed_fastq}
-//     }
-
-//     ingest(params.unique_id, processed_fastq)
-// }
\ No newline at end of file

From 770d30b7fcc4f4040ac2b533e348898d696657d6 Mon Sep 17 00:00:00 2001
From: Biowilko
Date: Wed, 21 Jun 2023 09:53:53 +0100
Subject: [PATCH 44/77] fastp -> preprocess, new pair concat method

---
 bin/concatenate_reads.py | 167 +++++++++++++++++++++++++++++++++++++++
 main.nf                  |   9 ++-
 modules/extract_taxa.nf  |  78 ++++++++++++------
 modules/fastp.nf         |  83 -------------------
 modules/preprocess.nf    |  94 ++++++++++++++++++++++
 5 files changed, 322 insertions(+), 109 deletions(-)
 create mode 100644 bin/concatenate_reads.py
 delete mode 100644 modules/fastp.nf
 create mode 100644 modules/preprocess.nf

diff --git a/bin/concatenate_reads.py b/bin/concatenate_reads.py
new file mode 100644
index 0000000..4d91af7
--- /dev/null
+++ b/bin/concatenate_reads.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python
+"""
+No dependency Python script for joining two paired end FASTQ files.
+Interleaves reads by default, or concatenates each read pair with a separator ("|") via the
+--no-interleave option. Auto-detects gzip'd files, offers header checking via a --strict flag,
+and supports output to STDOUT, or to a gzip'd (--gzip flag) or uncompressed FASTQ file.
+"""
+import argparse
+import gzip
+import sys
+
+
+def enforce_headers(f1_header, f2_header):
+    if f1_header[0] != "@" or f2_header[0] != "@":
+        raise Exception("Invalid input FASTQ files.")
+    if f1_header.strip().split(" ")[0] != f2_header.strip().split(" ")[0]:
+        raise Exception(
+            "Input FASTQ files do not share headers. " "Check and re-run w/o --strict."
+        )
+
+
+def join_w_separator(f1, f2, f_out, strict=False, sep="|"):
+    readlines = True
+    ix = 0
+
+    # Use '|', which is a valid character in Phred quality strings.
+    # http://en.wikipedia.org/wiki/FASTQ_format
+    phred_sep = "|" * len(sep)
+
+    while readlines:
+        f1_line = f1.readline()
+        f2_line = f2.readline()
+
+        if f1_line == "":
+            readlines = False
+            f1.close()
+            f2.close()
+            f_out.close()
+            if f2_line != "":
+                raise Exception("Input FASTQ files do not match in length.")
+            continue
+
+        if ix % 4 == 0:  # Header
+            if strict:  # Fail if they don't match up to the first whitespace
+                enforce_headers(f1_line, f2_line)
+
+            # Write the header out
+            f_out.write(f1_line)
+
+        elif (ix - 1) % 4 == 0:  # Sequence
+            f_out.write(f1_line.strip() + sep + f2_line)
+        elif (ix - 2) % 4 == 0:  # Separator
+            f_out.write("+\n")
+        else:
+            f_out.write(f1_line.strip() + phred_sep + f2_line)
+
+        ix += 1
+
+
+def join_interleaved(f1, f2, f_out, strict=False):
+    readlines = True
+    ix = 0
+    f1_lines = []
+    f2_lines = []
+
+    def flush_buffer(f_out, f1_lines, f2_lines):
+        if f1_lines and f2_lines:
+            assert len(f1_lines) == 4
+            assert len(f2_lines) == 4
+            f_out.write(f1_lines[0])
+            f_out.write(f1_lines[1])
+            f_out.write(f1_lines[2])
+            f_out.write(f1_lines[3])
+            f_out.write(f2_lines[0])
+            f_out.write(f2_lines[1])
+            f_out.write(f2_lines[2])
+            f_out.write(f2_lines[3])
+
+            f1_lines = []
+            f2_lines = []
+
+        return f1_lines, f2_lines
+
+    while readlines:
+        f1_line = f1.readline()
+        f2_line = f2.readline()
+
+        if f1_line == "":
+            readlines = False
+            if f2_line != "":
+                raise Exception("Input FASTQ files do not match in length.")
+            break
+
+        if ix % 4 == 0:  # Header #1
+            if strict:
+                enforce_headers(f1_line, f2_line)
+
+            f1_lines, f2_lines = flush_buffer(f_out, f1_lines, f2_lines)
+
+        # Fill buffer up to 4 lines
+        f1_lines.append(f1_line)
+        f2_lines.append(f2_line)
+        ix += 1
+
+    _, _ = flush_buffer(f_out, f1_lines, f2_lines)
+    f1.close()
+    f2.close()
+    f_out.close()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description=(
+            "Join two FASTQ paired end read files. Defaults to interleaving reads. "
+            "Note: Requires input files to be sorted."
+        )
+    )
+    parser.add_argument("fastq1", help="First input FASTQ.")
+    parser.add_argument("fastq2", help="Second input FASTQ.")
+    parser.add_argument(
+        "output_fastq",
+        nargs="?",
+        help=("Output FASTQ file name (optional; " "streams to STDOUT if missing)."),
+    )
+    parser.add_argument(
+        "--sep",
+        default="|",
+        help=("Optional separator to override default (|)."),
+    )
+    parser.add_argument(
+        "--no-interleave",
+        action="store_true",
+        help="Concatenate the two reads using a separator (--sep) rather than "
+        "interleaving them (default).",
+    )
+    parser.add_argument(
+        "--strict",
+        action="store_true",
+        help=(
+            "Enforce that the headers of the input "
+            "FASTQ files match until the first whitespace "
+            "delimiter (e.g., a space)." 
+ ), + ) + parser.add_argument("--gzip", action="store_true", help=("Gzip output_fastq.")) + args = parser.parse_args() + + if ".gz" in args.fastq1 or ".gzip" in args.fastq1: + f1 = gzip.open(args.fastq1, mode="r") + else: + f1 = open(args.fastq1, mode="r") + if ".gz" in args.fastq2 or ".gzip" in args.fastq2: + f2 = gzip.open(args.fastq2, mode="r") + else: + f2 = open(args.fastq2, mode="r") + + if args.output_fastq is None: + f_out = sys.stdout + elif args.gzip: + f_out = gzip.open(args.output_fastq, mode="w") + else: + f_out = open(args.output_fastq, mode="w") + + if not args.no_interleave: + join_interleaved(f1, f2, f_out, strict=args.strict) + else: + join_w_separator(f1, f2, f_out, strict=args.strict, sep=args.sep) diff --git a/main.nf b/main.nf index c05d18a..a132eb8 100644 --- a/main.nf +++ b/main.nf @@ -1,5 +1,5 @@ include { ingest } from './subworkflows/ingest' -include { fastp_single; fastp_paired} from './modules/fastp' +include { fastp_single; fastp_paired; paired_concatenate } from './modules/preprocess' workflow { unique_id = "${params.unique_id}" @@ -13,14 +13,17 @@ workflow { Channel.of(file(params.fastq2, type: "file", checkIfExists:true)) .set {input_fastq_2_ch} - fastp_paired(unique_id, unique_id, input_fastq_1_ch, input_fastq_2_ch) + fastp_paired(unique_id, input_fastq_1_ch, input_fastq_2_ch) + + + fastp_paired.out.processed_fastq .set {processed_fastq} } else { Channel.of(file(params.fastq, type: "file", checkIfExists:true)) .set {input_fastq_ch} - fastp_single(unique_id, unique_id, input_fastq_ch) + fastp_single(unique_id, input_fastq_ch) fastp_single.out.processed_fastq .set {processed_fastq} } diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf index 996be81..26019f5 100644 --- a/modules/extract_taxa.nf +++ b/modules/extract_taxa.nf @@ -20,7 +20,8 @@ process extract_reads { input: val unique_id - path fastq + path fastq1 + path fastq2 path kraken_assignments path kraken_report path bracken_report @@ -28,24 +29,46 @@ process extract_reads { path "reads.*.f*.gz", emit: reads path "reads_summary.json", emit: summary script: - """ - extract_kraken_reads.py \ - -s ${fastq} \ - -k ${kraken_assignments} \ - -r ${kraken_report} \ - -b ${bracken_report} \ - -p reads \ - --include_children \ - --max_human ${params.max_human_reads_before_rejection} \ - --min_count_descendants ${params.assembly_min_reads} \ - --rank ${params.assembly_rank} \ - --min_percent ${params.assembly_min_percent} - - for f in \$(ls reads.*.f*) - do - gzip \$f - done - """ + if (params.paired) { + """ + extract_kraken_reads.py \ + -s1 ${fastq1} \ + -s2 ${fastq2} \ + -k ${kraken_assignments} \ + -r ${kraken_report} \ + -b ${bracken_report} \ + -p reads \ + --include_children \ + --max_human ${params.max_human_reads_before_rejection} \ + --min_count_descendants ${params.assembly_min_reads} \ + --rank ${params.assembly_rank} \ + --min_percent ${params.assembly_min_percent} + + for f in \$(ls reads.*.f*) + do + gzip \$f + done + """ + } else { + """ + extract_kraken_reads.py \ + -s ${fastq1} \ + -k ${kraken_assignments} \ + -r ${kraken_report} \ + -b ${bracken_report} \ + -p reads \ + --include_children \ + --max_human ${params.max_human_reads_before_rejection} \ + --min_count_descendants ${params.assembly_min_reads} \ + --rank ${params.assembly_rank} \ + --min_percent ${params.assembly_min_percent} + + for f in \$(ls reads.*.f*) + do + gzip \$f + done + """ + } } @@ -53,16 +76,25 @@ process extract_reads { workflow extract_taxa { take: unique_id - fastq + fastq_1 + fastq_2 kraken_assignments 
kraken_report bracken_report main: - extract_reads(unique_id, fastq, kraken_assignments, kraken_report, bracken_report) + extract_reads(unique_id, fastq_1, fastq_2, kraken_assignments, kraken_report, bracken_report) } workflow { - fastq = file("${params.fastq}", type: "file", checkIfExists:true) + + if (params.paired) { + fastq_1 = file(params.fastq1, type: "file", checkIfExists:true) + + fastq_2 = file(params.fastq2, type: "file", checkIfExists:true) + } else { + fastq_1 = file(params.fastq, type: "file", checkIfExists:true) + fastq_2 = None + } unique_id = "${params.unique_id}" if ("${params.unique_id}" == "null") { @@ -73,5 +105,5 @@ workflow { bracken_report = file("${params.bracken_report}") kraken_report = file("${params.kraken_report}") - extract_taxa(unique_id, fastq, kraken_assignments, kraken_report, bracken_report) + extract_taxa(unique_id, fastq_1, fastq_2, kraken_assignments, kraken_report, bracken_report) } \ No newline at end of file diff --git a/modules/fastp.nf b/modules/fastp.nf deleted file mode 100644 index 4faf898..0000000 --- a/modules/fastp.nf +++ /dev/null @@ -1,83 +0,0 @@ -process fastp_paired { - - label 'process_medium' - - publishDir "${params.out_dir}/${unique_id}/preprocess/", mode: 'copy' - - container "biowilko/scylla@${params.wf.container_sha}" - - input: - val unique_id - val prefix - path fastq_1 - path fastq_2 - - output: - val prefix - path "${prefix}_1.fastp.fastq.gz", optional: true - path "${prefix}_2.fastp.fastq.gz", optional: true - path "${prefix}.fastp.fastq.gz", emit: processed_fastq - path "${prefix}.fastp.json" - - script: - """ - fastp \\ - --in1 ${fastq_1} \\ - --in2 ${fastq_2} \\ - --out1 ${prefix}_1.fastp.fastq \\ - --out2 ${prefix}_2.fastp.fastq \\ - --merged_out ${prefix}.fastp.fastq \\ - -m \\ - --json ${prefix}.fastp.json \\ - --thread $task.cpus \\ - 2> ${prefix}.fastp.log - - if [ -s ${prefix}_1.fastp.fastq ]; then - bgzip --threads $task.cpus -c ${prefix}_1.fastp.fastq > ${prefix}_1.fastp.fastq.gz - fi - - if [ -s ${prefix}_2.fastp.fastq ]; then - bgzip --threads $task.cpus -c ${prefix}_2.fastp.fastq > ${prefix}_2.fastp.fastq.gz - fi - - if [ -s ${prefix}.fastp.fastq ]; then - bgzip --threads $task.cpus -c ${prefix}.fastp.fastq > ${prefix}.fastp.fastq.gz - fi - - """ - -} - -process fastp_single { - - label 'process_medium' - - publishDir "${params.out_dir}/${unique_id}/preprocess/", mode: 'copy' - - container "biowilko/scylla@${params.wf.container_sha}" - - input: - val unique_id - val prefix - path fastq - - output: - val prefix - path "${prefix}.fastp.fastq.gz", emit: processed_fastq - path "${prefix}.fastp.json" - - script: - - """ - fastp \\ - --in1 ${fastq} \\ - --out1 ${prefix}.fastp.fastq \\ - --json ${prefix}.fastp.json \\ - --thread $task.cpus \\ - 2> ${prefix}.fastp.log - - if [ -s ${prefix}.fastp.fastq ]; then - bgzip --threads $task.cpus -c ${prefix}.fastp.fastq > ${prefix}.fastp.fastq.gz - fi - """ -} \ No newline at end of file diff --git a/modules/preprocess.nf b/modules/preprocess.nf new file mode 100644 index 0000000..9e68dba --- /dev/null +++ b/modules/preprocess.nf @@ -0,0 +1,94 @@ +process fastp_paired { + + label 'process_medium' + + publishDir "${params.out_dir}/${unique_id}/preprocess/", mode: 'copy' + + container "biowilko/scylla@${params.wf.container_sha}" + + input: + val unique_id + path fastq_1 + path fastq_2 + + output: + val unique_id + path "${unique_id}_1.fastp.fastq", emit: processed_fastq_1 + path "${unique_id}_2.fastp.fastq", emit: processed_fastq_2 + path "${unique_id}.fastp.json" + + script: + 
""" + fastp \\ + --in1 ${fastq_1} \\ + --in2 ${fastq_2} \\ + --out1 ${unique_id}_1.fastp.fastq \\ + --out2 ${unique_id}_2.fastp.fastq \\ + --json ${unique_id}.fastp.json \\ + --thread $task.cpus \\ + 2> ${unique_id}.fastp.log + + """ + +} + +process fastp_single { + + label 'process_medium' + + publishDir "${params.out_dir}/${unique_id}/preprocess/", mode: 'copy' + + container "biowilko/scylla@${params.wf.container_sha}" + + input: + val unique_id + path fastq + + output: + val unique_id + path "${unique_id}.fastp.fastq.gz", emit: processed_fastq + path "${unique_id}.fastp.json" + + script: + + """ + fastp \\ + --in1 ${fastq} \\ + --out1 ${unique_id}.fastp.fastq \\ + --json ${unique_id}.fastp.json \\ + --thread $task.cpus \\ + 2> ${unique_id}.fastp.log + + if [ -s ${unique_id}.fastp.fastq ]; then + bgzip --threads $task.cpus -c ${unique_id}.fastp.fastq > ${unique_id}.fastp.fastq.gz + fi + """ +} + +process paired_concatenate { + + label 'process_low' + + publishDir "${params.out_dir}/${unique_id}/preprocess/", mode: 'copy' + + container "biowilko/scylla@${params.wf.container_sha}" + + input: + val unique_id + path processed_fastq_1 + path processed_fastq_2 + + output: + val unique_id + path "${unique_id}.fastp.fastq.gz", emit: concatented_fastq + + script: + """ + concatenate_reads.py --no-interleave --strict \\ + ${processed_fastq_1} ${processed_fastq_2} \\ + > ${unique_id}.concatenated.fastq + + + bgzip --threads $task.cpus -c ${unique_id}.concatenated.fastq > ${unique_id}.fastp.fastq.gz + """ +} \ No newline at end of file From a059471346a768e3caf4467f5390f0119e50a37a Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 10:04:09 +0100 Subject: [PATCH 45/77] Fix paired concatenate --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index a132eb8..f36503b 100644 --- a/main.nf +++ b/main.nf @@ -15,9 +15,9 @@ workflow { fastp_paired(unique_id, input_fastq_1_ch, input_fastq_2_ch) + paired_concatenate(unique_id, fastp_paired.out.processed_fastq_1, fastp_paired.out.processed_fastq_2) - - fastp_paired.out.processed_fastq + paired_concatenate.out.concatenated_fastq .set {processed_fastq} } else { Channel.of(file(params.fastq, type: "file", checkIfExists:true)) From 71f8a2067fda939056f856c1753dcb0fb3af54b8 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 10:05:59 +0100 Subject: [PATCH 46/77] fix typo --- modules/preprocess.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/preprocess.nf b/modules/preprocess.nf index 9e68dba..da3eacf 100644 --- a/modules/preprocess.nf +++ b/modules/preprocess.nf @@ -80,7 +80,7 @@ process paired_concatenate { output: val unique_id - path "${unique_id}.fastp.fastq.gz", emit: concatented_fastq + path "${unique_id}.fastp.fastq.gz", emit: concatenated_fastq script: """ From 6debe4c7f5f9b276e46873239501801aae646e5a Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 10:12:36 +0100 Subject: [PATCH 47/77] extract taxa workflow rejigger --- subworkflows/ingest.nf | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf index ebcc2f7..3ef74ac 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -12,7 +12,17 @@ workflow ingest { // get_params_and_versions() //clean_metadata(metadata) kraken_pipeline(unique_id, fastq) - extract_taxa(unique_id, fastq, kraken_pipeline.out.kraken_assignments, kraken_pipeline.out.kraken_report, kraken_pipeline.out.bracken_report) + + if (params.paired) { + 
fastq_1 = file(params.fastq1, type: "file", checkIfExists:true) + + fastq_2 = file(params.fastq2, type: "file", checkIfExists:true) + } else { + fastq_1 = file(params.fastq, type: "file", checkIfExists:true) + fastq_2 = None + } + + extract_taxa(unique_id, fastq_1, fastq_2, kraken_pipeline.out.kraken_assignments, kraken_pipeline.out.kraken_report, kraken_pipeline.out.bracken_report) emit: html_report = kraken_pipeline.out.report From 8922f55e0f3fb20d34b336ed6897368115117482 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 10:18:19 +0100 Subject: [PATCH 48/77] make concat executable --- bin/concatenate_reads.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 bin/concatenate_reads.py diff --git a/bin/concatenate_reads.py b/bin/concatenate_reads.py old mode 100644 new mode 100755 From 494ddb1f415379fdac75bdbcc43471f347ab149b Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 10:24:07 +0100 Subject: [PATCH 49/77] proper concatenated.fastq bgzip --- modules/preprocess.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/preprocess.nf b/modules/preprocess.nf index da3eacf..9bdc480 100644 --- a/modules/preprocess.nf +++ b/modules/preprocess.nf @@ -80,7 +80,7 @@ process paired_concatenate { output: val unique_id - path "${unique_id}.fastp.fastq.gz", emit: concatenated_fastq + path "${unique_id}.concatenated.fastq.gz", emit: concatenated_fastq script: """ @@ -89,6 +89,6 @@ process paired_concatenate { > ${unique_id}.concatenated.fastq - bgzip --threads $task.cpus -c ${unique_id}.concatenated.fastq > ${unique_id}.fastp.fastq.gz + bgzip --threads $task.cpus -c ${unique_id}.concatenated.fastq > ${unique_id}.concatenated.fastq.gz """ } \ No newline at end of file From dbaa0ea968349fc0c396e872834078a3ba82e166 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 10:35:20 +0100 Subject: [PATCH 50/77] Move around extract reads (sorry) --- main.nf | 24 +----------------------- modules/preprocess.nf | 13 ++++++++++--- subworkflows/ingest.nf | 36 +++++++++++++++++++++++++++++------- 3 files changed, 40 insertions(+), 33 deletions(-) diff --git a/main.nf b/main.nf index f36503b..271d457 100644 --- a/main.nf +++ b/main.nf @@ -5,28 +5,6 @@ workflow { unique_id = "${params.unique_id}" // check input fastq exists and run fastp - - if (params.paired) { - Channel.of(file(params.fastq1, type: "file", checkIfExists:true)) - .set {input_fastq_1_ch} - Channel.of(file(params.fastq2, type: "file", checkIfExists:true)) - .set {input_fastq_2_ch} - - fastp_paired(unique_id, input_fastq_1_ch, input_fastq_2_ch) - - paired_concatenate(unique_id, fastp_paired.out.processed_fastq_1, fastp_paired.out.processed_fastq_2) - - paired_concatenate.out.concatenated_fastq - .set {processed_fastq} - } else { - Channel.of(file(params.fastq, type: "file", checkIfExists:true)) - .set {input_fastq_ch} - - fastp_single(unique_id, input_fastq_ch) - fastp_single.out.processed_fastq - .set {processed_fastq} - } - - ingest(params.unique_id, processed_fastq) + ingest(params.unique_id) } \ No newline at end of file diff --git a/modules/preprocess.nf b/modules/preprocess.nf index 9bdc480..3349da9 100644 --- a/modules/preprocess.nf +++ b/modules/preprocess.nf @@ -13,8 +13,8 @@ process fastp_paired { output: val unique_id - path "${unique_id}_1.fastp.fastq", emit: processed_fastq_1 - path "${unique_id}_2.fastp.fastq", emit: processed_fastq_2 + path "${unique_id}_1.fastp.fastq.gz", emit: processed_fastq_1 + path "${unique_id}_2.fastp.fastq.gz", emit: 
processed_fastq_2 path "${unique_id}.fastp.json" script: @@ -27,7 +27,14 @@ process fastp_paired { --json ${unique_id}.fastp.json \\ --thread $task.cpus \\ 2> ${unique_id}.fastp.log - + + if [ -s ${unique_id}_1.fastp.fastq ]; then + bgzip --threads $task.cpus -c ${unique_id}_1.fastp.fastq > ${unique_id}_1.fastp.fastq.gz + fi + + if [ -s ${unique_id}_2.fastp.fastq ]; then + bgzip --threads $task.cpus -c ${unique_id}_2.fastp.fastq > ${unique_id}_2.fastp.fastq.gz + fi """ } diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf index 3ef74ac..6f9d181 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -1,28 +1,50 @@ // workflow to run kraken, check for human, run qc checks and generate html report for a single sample fastq include { get_params_and_versions } from '../modules/get_params_and_versions' include { kraken_pipeline } from '../subworkflows/kraken_pipeline' -include { extract_taxa } from '../modules/extract_taxa' +include { extract_reads } from '../modules/extract_taxa' workflow ingest { take: unique_id - fastq //metadata main: + + if (params.paired) { + Channel.of(file(params.fastq1, type: "file", checkIfExists:true)) + .set {input_fastq_1_ch} + + Channel.of(file(params.fastq2, type: "file", checkIfExists:true)) + .set {input_fastq_2_ch} + + fastp_paired(unique_id, input_fastq_1_ch, input_fastq_2_ch) + + paired_concatenate(unique_id, fastp_paired.out.processed_fastq_1, fastp_paired.out.processed_fastq_2) + + paired_concatenate.out.concatenated_fastq + .set {processed_fastq} + } else { + Channel.of(file(params.fastq, type: "file", checkIfExists:true)) + .set {input_fastq_ch} + + fastp_single(unique_id, input_fastq_ch) + fastp_single.out.processed_fastq + .set {processed_fastq} + } + // get_params_and_versions() //clean_metadata(metadata) - kraken_pipeline(unique_id, fastq) + kraken_pipeline(unique_id, processed_fastq) if (params.paired) { - fastq_1 = file(params.fastq1, type: "file", checkIfExists:true) + fastq_1 = fastp_paired.out.processed_fastq_1 - fastq_2 = file(params.fastq2, type: "file", checkIfExists:true) + fastq_2 = fastp_paired.out.processed_fastq_2 } else { - fastq_1 = file(params.fastq, type: "file", checkIfExists:true) + fastq_1 = proccessed_fastq fastq_2 = None } - extract_taxa(unique_id, fastq_1, fastq_2, kraken_pipeline.out.kraken_assignments, kraken_pipeline.out.kraken_report, kraken_pipeline.out.bracken_report) + extract_reads(unique_id, fastq_1, fastq_2, kraken_pipeline.out.kraken_assignments, kraken_pipeline.out.kraken_report, kraken_pipeline.out.bracken_report) emit: html_report = kraken_pipeline.out.report From 4ee6c148e60f58f5e8ed8fe1f0ba839941ce6b9d Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 10:40:32 +0100 Subject: [PATCH 51/77] import preprocess in ingest --- subworkflows/ingest.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf index 6f9d181..d2eccad 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -2,6 +2,7 @@ include { get_params_and_versions } from '../modules/get_params_and_versions' include { kraken_pipeline } from '../subworkflows/kraken_pipeline' include { extract_reads } from '../modules/extract_taxa' +include { fastp_single; fastp_paired; paired_concatenate } from './modules/preprocess' workflow ingest { take: From 593e82c7fcedb919d2cda5e5321c2ae60def184c Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 10:44:07 +0100 Subject: [PATCH 52/77] proper preprocess include statement --- main.nf | 1 - 1 file changed, 1 deletion(-) 
diff --git a/main.nf b/main.nf index 271d457..37c4bdd 100644 --- a/main.nf +++ b/main.nf @@ -1,5 +1,4 @@ include { ingest } from './subworkflows/ingest' -include { fastp_single; fastp_paired; paired_concatenate } from './modules/preprocess' workflow { unique_id = "${params.unique_id}" From f67b115adb8a580f70cf4e25bc5c26033c3bcc4f Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 10:45:03 +0100 Subject: [PATCH 53/77] proper preprocess include statement --- subworkflows/ingest.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf index d2eccad..1027004 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -2,7 +2,7 @@ include { get_params_and_versions } from '../modules/get_params_and_versions' include { kraken_pipeline } from '../subworkflows/kraken_pipeline' include { extract_reads } from '../modules/extract_taxa' -include { fastp_single; fastp_paired; paired_concatenate } from './modules/preprocess' +include { fastp_single; fastp_paired; paired_concatenate } from '../modules/preprocess' workflow ingest { take: From 272c6ced26be2035e1f5062348590cb69f231653 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 11:04:01 +0100 Subject: [PATCH 54/77] use bio.bgzf not gzip --- bin/concatenate_reads.py | 8 ++++---- modules/preprocess.nf | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/bin/concatenate_reads.py b/bin/concatenate_reads.py index 4d91af7..56890c9 100755 --- a/bin/concatenate_reads.py +++ b/bin/concatenate_reads.py @@ -6,7 +6,7 @@ and supports output to STDOUT a gzip'd FASTQ or an uncompressed FASTQ (--uncompressed flag). """ import argparse -import gzip +from Bio import bgzf import sys @@ -146,18 +146,18 @@ def flush_buffer(f_out, f1_lines, f2_lines): args = parser.parse_args() if ".gz" in args.fastq1 or ".gzip" in args.fastq1: - f1 = gzip.open(args.fastq1, mode="r") + f1 = bgzf.open(args.fastq1, mode="r") else: f1 = open(args.fastq1, mode="r") if ".gz" in args.fastq2 or ".gzip" in args.fastq2: - f2 = gzip.open(args.fastq2, mode="r") + f2 = bgzf.open(args.fastq2, mode="r") else: f2 = open(args.fastq2, mode="r") if args.output_fastq is None: f_out = sys.stdout elif args.gzip: - f_out = gzip.open(args.output_fastq, mode="w") + f_out = bgzf.open(args.output_fastq, mode="w") else: f_out = open(args.output_fastq, mode="w") diff --git a/modules/preprocess.nf b/modules/preprocess.nf index 3349da9..6fa67e5 100644 --- a/modules/preprocess.nf +++ b/modules/preprocess.nf @@ -34,7 +34,7 @@ process fastp_paired { if [ -s ${unique_id}_2.fastp.fastq ]; then bgzip --threads $task.cpus -c ${unique_id}_2.fastp.fastq > ${unique_id}_2.fastp.fastq.gz - fi + fi """ } From 465334de1a12aebc59d4aa789a60cd7739661a3c Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Wed, 21 Jun 2023 11:18:04 +0100 Subject: [PATCH 55/77] typo --- subworkflows/ingest.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf index 1027004..49c1040 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -41,7 +41,7 @@ workflow ingest { fastq_2 = fastp_paired.out.processed_fastq_2 } else { - fastq_1 = proccessed_fastq + fastq_1 = processed_fastq fastq_2 = None } From 12e36d24d3597eaf21df86eff9eb0ecb19206561 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Wed, 21 Jun 2023 11:18:53 +0100 Subject: [PATCH 56/77] try null --- subworkflows/ingest.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/ingest.nf 
b/subworkflows/ingest.nf index 49c1040..7957350 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -42,7 +42,7 @@ workflow ingest { fastq_2 = fastp_paired.out.processed_fastq_2 } else { fastq_1 = processed_fastq - fastq_2 = None + fastq_2 = null } extract_reads(unique_id, fastq_1, fastq_2, kraken_pipeline.out.kraken_assignments, kraken_pipeline.out.kraken_report, kraken_pipeline.out.bracken_report) From 41f96d896316db78a42bfde35fb28215afc4e36f Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Wed, 21 Jun 2023 11:25:14 +0100 Subject: [PATCH 57/77] split if else for paired --- modules/extract_taxa.nf | 47 ++++++++++++++++++++++++++++------------- subworkflows/ingest.nf | 8 +++---- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf index 26019f5..5e9e8f5 100644 --- a/modules/extract_taxa.nf +++ b/modules/extract_taxa.nf @@ -7,7 +7,7 @@ // probably want to count up how many have been found here for run log // ALSO this step will currently "fail" with exitcode 2 if the number of human reads found exceeds the number specified // in config so could be good dehuman sanity check -process extract_reads { +process extract_paired_reads { label 'process_medium' @@ -29,7 +29,6 @@ process extract_reads { path "reads.*.f*.gz", emit: reads path "reads_summary.json", emit: summary script: - if (params.paired) { """ extract_kraken_reads.py \ -s1 ${fastq1} \ @@ -49,7 +48,29 @@ process extract_reads { gzip \$f done """ - } else { +} + +process extract_reads { + + label 'process_medium' + + publishDir path: "${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy' + + conda 'bioconda::biopython=1.78' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/biopython%3A1.76' : + 'biocontainers/biopython@sha256:b0204cf662a3d858f6c28627124b83ed6f564e2b156b8788092f2dd9256c9290' }" + + input: + val unique_id + path fastq + path kraken_assignments + path kraken_report + path bracken_report + output: + path "reads.*.f*.gz", emit: reads + path "reads_summary.json", emit: summary + script: """ extract_kraken_reads.py \ -s ${fastq1} \ @@ -68,7 +89,6 @@ process extract_reads { gzip \$f done """ - } } @@ -86,16 +106,6 @@ workflow extract_taxa { } workflow { - - if (params.paired) { - fastq_1 = file(params.fastq1, type: "file", checkIfExists:true) - - fastq_2 = file(params.fastq2, type: "file", checkIfExists:true) - } else { - fastq_1 = file(params.fastq, type: "file", checkIfExists:true) - fastq_2 = None - } - unique_id = "${params.unique_id}" if ("${params.unique_id}" == "null") { unique_id = "${fastq.simpleName}" @@ -105,5 +115,12 @@ workflow { bracken_report = file("${params.bracken_report}") kraken_report = file("${params.kraken_report}") - extract_taxa(unique_id, fastq_1, fastq_2, kraken_assignments, kraken_report, bracken_report) + if (params.paired) { + fastq_1 = file(params.fastq1, type: "file", checkIfExists:true) + fastq_2 = file(params.fastq2, type: "file", checkIfExists:true) + extract_paired_reads(unique_id, fastq_1, fastq_2, kraken_assignments, kraken_report, bracken_report) + } else { + fastq = file(params.fastq, type: "file", checkIfExists:true) + extract_reads(unique_id, fastq, kraken_assignments, kraken_report, bracken_report) + } } \ No newline at end of file diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf index 7957350..6a95af7 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -38,14 +38,14 @@ workflow ingest { if (params.paired) { fastq_1 = fastp_paired.out.processed_fastq_1 - fastq_2 = fastp_paired.out.processed_fastq_2 + extract_paired_reads(unique_id, fastq_1, fastq_2, kraken_pipeline.out.kraken_assignments, kraken_pipeline.out.kraken_report, kraken_pipeline.out.bracken_report) + } else { - fastq_1 = processed_fastq - fastq_2 = null + fastq = processed_fastq + extract_reads(unique_id, fastq, kraken_pipeline.out.kraken_assignments, kraken_pipeline.out.kraken_report, kraken_pipeline.out.bracken_report) } - extract_reads(unique_id, fastq_1, fastq_2, kraken_pipeline.out.kraken_assignments, kraken_pipeline.out.kraken_report, kraken_pipeline.out.bracken_report) emit: html_report = kraken_pipeline.out.report From d515aabbe88ad2c6a4efcbf1db8105a162c520c0 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Wed, 21 Jun 2023 11:25:52 +0100 Subject: [PATCH 58/77] add import --- subworkflows/ingest.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/ingest.nf b/subworkflows/ingest.nf index 6a95af7..3a2a67c 100644 --- a/subworkflows/ingest.nf +++ b/subworkflows/ingest.nf @@ -1,7 +1,7 @@ // workflow to run kraken, check for human, run qc checks and generate html report for a single sample fastq include { get_params_and_versions } from '../modules/get_params_and_versions' include { kraken_pipeline } from '../subworkflows/kraken_pipeline' -include { extract_reads } from '../modules/extract_taxa' +include { extract_reads; extract_paired_reads } from '../modules/extract_taxa' include { fastp_single; fastp_paired; paired_concatenate } from '../modules/preprocess' workflow ingest { From 46c545c7b81ade3066516fbb934d53d6d4981d49 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Wed, 21 Jun 2023 11:29:27 +0100 Subject: [PATCH 59/77] 
EXPERIMENTAL: overwrite some params with climb specifics --- conf/climb.config | 14 ++++++++ conf/params.config | 20 +++++++++++ nextflow.config | 4 +++ subworkflows/kraken_pipeline.nf | 62 +++++++++++++++++++-------------- 4 files changed, 74 insertions(+), 26 deletions(-) create mode 100644 conf/climb.config diff --git a/conf/climb.config b/conf/climb.config new file mode 100644 index 0000000..745cafa --- /dev/null +++ b/conf/climb.config @@ -0,0 +1,14 @@ +params { + // Max resource options + // Defaults only, expecting to be overwritten + max_memory = '128.GB' + max_cpus = 16 + max_time = '240.h' + + k2_port = 8080 + k2_host = "10.1.185.58" + + taxonomy = "/shared/public/db/taxonomy/" + database = "/shared/public/db/kraken2/k2_pluspf/" + database_set = "PlusPF" +} \ No newline at end of file diff --git a/conf/params.config b/conf/params.config index 68c3a6a..9ed6464 100644 --- a/conf/params.config +++ b/conf/params.config @@ -31,6 +31,8 @@ params { out_dir = "output" store_dir = "store_dir" + climb = false + unique_id = null fastq = null fastq1 = null @@ -44,6 +46,24 @@ params { bracken_length = null bracken_level = 'S' database_set = "Viral" + database_sets = [ + 'Viral': [ + 'database': 'https://genome-idx.s3.amazonaws.com/kraken/k2_viral_20221209.tar.gz', + 'taxonomy': 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz' + ], + 'EuPath': [ + 'database': 'https://genome-idx.s3.amazonaws.com/kraken/k2_eupathdb48_20201113.tar.gz', + 'taxonomy': 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz' + ], + 'PlusPF-8': [ + 'database': 'https://genome-idx.s3.amazonaws.com/kraken/k2_pluspf_8gb_20210517.tar.gz', + 'taxonomy': 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz' + ], + 'ncbi_16s_18s': [ + 'database': 'https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-metagenomics/ncbi_16s_18s/ncbi_targeted_loci_kraken2.tar.gz', + 'taxonomy': 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz' + ] + ] kraken_report = null bracken_report = null diff --git a/nextflow.config b/nextflow.config index bb304ab..2766db6 100644 --- a/nextflow.config +++ b/nextflow.config @@ -16,6 +16,10 @@ includeConfig 'conf/base.config' includeConfig 'conf/params.config' +if (params.climb) { + includeConfig 'conf/climb.config' +} + manifest { name = 'snowy-leopard/scylla' author = 'Snowy Leopard' diff --git a/subworkflows/kraken_pipeline.nf b/subworkflows/kraken_pipeline.nf index a3ba436..268fa80 100644 --- a/subworkflows/kraken_pipeline.nf +++ b/subworkflows/kraken_pipeline.nf @@ -8,35 +8,45 @@ workflow kraken_pipeline { fastq main: - // Check source param is valid - // sources = params.database_sets - // source_name = params.database_set - // source_data = sources.get(source_name, false) - // if (!sources.containsKey(source_name) || !source_data) { - // keys = sources.keySet() - // throw new Exception("Source $params.source is invalid, must be one of $keys") - // } - // Grab taxonomy files - // taxonomy = file(sources[source_name]["taxonomy"], type: "file") - // if (params.taxonomy) { - // log.info("Checking custom taxonomy mapping exists") - - // } + // Check source param is valid if applicable + if not (params.database and params.taxonomy_dir) { + sources = params.database_sets + source_name = params.database_set + source_data = sources.get(source_name, false) + if (!sources.containsKey(source_name) || !source_data) { + keys = sources.keySet() + throw new Exception("Source $params.source is invalid, must be one of $keys") + } + } - taxonomy = file(params.taxonomy_dir, type: 
"dir", checkIfExists:true) + // Grab taxonomy files + if (params.taxonomy) { + taxonomy = file(params.taxonomy, type: "dir", checkIfExists:true) + } else { + default_taxonomy = source_data.get("taxonomy", false) + if (!default_taxonomy) { + throw new Exception( + "Error: Source $source_name does not include a taxonomy for " + + "use with kraken, please choose another source or" + + "provide a custom taxonomy.") + } + taxonomy = unpackTaxonomy(default_taxonomy) + } - // if (database) { - // source_database = database - // } else { - // source_database = source_data.get("database", false) - // if (!source_database) { - // throw new Exception( - // "Error: Source $source_name does not include a database for " - // + "use with kraken, please choose another source, " - // + "provide a custom database or disable kraken.") - // } - database = file(params.db, type: "dir", checkIfExists:true) + // Grab database files + if (params.database) { + database = file(params.database, type: "dir", checkIfExists:true) + } else { + default_database = source_data.get("database", false) + if (!default_database) { + throw new Exception( + "Error: Source $source_name does not include a database for " + + "use with kraken, please choose another source or " + + "provide a custom database.") + } + database = unpackDatabase(default_database) + } // start_server(database, taxonomy) run_kraken_and_bracken(unique_id, fastq, database, taxonomy) From ebe6083d2920a60e789353fe37a2b68241c8920b Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Wed, 21 Jun 2023 11:32:00 +0100 Subject: [PATCH 60/77] debug --- subworkflows/kraken_pipeline.nf | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/subworkflows/kraken_pipeline.nf b/subworkflows/kraken_pipeline.nf index 268fa80..e6976fb 100644 --- a/subworkflows/kraken_pipeline.nf +++ b/subworkflows/kraken_pipeline.nf @@ -7,10 +7,8 @@ workflow kraken_pipeline { unique_id fastq main: - - // Check source param is valid if applicable - if not (params.database and params.taxonomy_dir) { + if (!params.database and !params.taxonomy_dir) { sources = params.database_sets source_name = params.database_set source_data = sources.get(source_name, false) @@ -34,7 +32,7 @@ workflow kraken_pipeline { taxonomy = unpackTaxonomy(default_taxonomy) } - // Grab database files + // Grab database files if (params.database) { database = file(params.database, type: "dir", checkIfExists:true) } else { From 3ae3b2654ee74d5ca66a3b5e7a98d218b51bd4ab Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Wed, 21 Jun 2023 11:33:54 +0100 Subject: [PATCH 61/77] debug --- subworkflows/kraken_pipeline.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/kraken_pipeline.nf b/subworkflows/kraken_pipeline.nf index e6976fb..28cbf3f 100644 --- a/subworkflows/kraken_pipeline.nf +++ b/subworkflows/kraken_pipeline.nf @@ -8,7 +8,7 @@ workflow kraken_pipeline { fastq main: // Check source param is valid if applicable - if (!params.database and !params.taxonomy_dir) { + if (!params.database and !params.taxonomy) { sources = params.database_sets source_name = params.database_set source_data = sources.get(source_name, false) From efda31a3be77de5ca84fdd5ee4ed04fcca5370cc Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Wed, 21 Jun 2023 11:35:39 +0100 Subject: [PATCH 62/77] debug --- subworkflows/kraken_pipeline.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/subworkflows/kraken_pipeline.nf b/subworkflows/kraken_pipeline.nf index 28cbf3f..aa97ece 100644 --- 
a/subworkflows/kraken_pipeline.nf +++ b/subworkflows/kraken_pipeline.nf @@ -1,4 +1,6 @@ include { run_kraken_and_bracken } from '../modules/kraken_client' +include { unpackDatabase } from '../modules/kraken_server' +include { unpackTaxonomy } from '../modules/kraken_server' include { qc_checks } from '../modules/qc_checks' include { generate_report } from '../modules/generate_report' From afe025253270b6b6d6a522b317851279f311f1bb Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Wed, 21 Jun 2023 11:38:45 +0100 Subject: [PATCH 63/77] typo --- modules/extract_taxa.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf index 5e9e8f5..67187a1 100644 --- a/modules/extract_taxa.nf +++ b/modules/extract_taxa.nf @@ -73,7 +73,7 @@ process extract_reads { script: """ extract_kraken_reads.py \ - -s ${fastq1} \ + -s ${fastq} \ -k ${kraken_assignments} \ -r ${kraken_report} \ -b ${bracken_report} \ From 716191e8c9546148d520bbceb7e24dc341a32d85 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 11:41:17 +0100 Subject: [PATCH 64/77] Properly output paired reads --- bin/extract_kraken_reads.py | 80 +++++++++++++++++++++++++++---------- 1 file changed, 58 insertions(+), 22 deletions(-) diff --git a/bin/extract_kraken_reads.py b/bin/extract_kraken_reads.py index 33bc107..fb14acb 100755 --- a/bin/extract_kraken_reads.py +++ b/bin/extract_kraken_reads.py @@ -264,8 +264,18 @@ def extract_taxa( if key not in keys: keys[key] = [] keys[key].append(taxon) - outfile_handles[taxon] = open("%s.%s.%s" % (prefix, taxon, filetype), "w") - print("opening %s.%s.%s" % (prefix, taxon, filetype)) + if reads2: + outfile_handles[taxon] = { + 1: open("%s.%s_1.%s" % (prefix, taxon, filetype), "w"), + 2: open("%s.%s_2.%s" % (prefix, taxon, filetype), "w"), + } + print( + "opening %s.%s_1.%s and %s.%s_2.%s" + % (prefix, taxon, filetype, prefix, taxon, filetype) + ) + else: + outfile_handles[taxon] = open("%s.%s.%s" % (prefix, taxon, filetype), "w") + print("opening %s.%s.%s" % (prefix, taxon, filetype)) out_counts[taxon] = 0 quals[taxon] = [] lens[taxon] = [] @@ -279,27 +289,53 @@ def extract_taxa( for line in kfile: tax_id, read_id = parse_kraken_assignment_line(line) if tax_id in keys: - if read_id in s_file1: - read = s_file1[read_id] - elif reads2 and read_id in s_file2: - read = s_file2[read_id] - else: - sys.stderr.write( - "ERROR: read id %s not found in read files\n" % read_id - ) - sys.exit(1) + if reads2: + if read_id in s_file1 and read_id in s_file2: + read1 = s_file1[read_id] + read2 = s_file2[read_id] + else: + sys.stderr.write( + "ERROR: read id %s not found in read files\n" % read_id + ) + sys.exit(1) + + for taxon in keys[tax_id]: + SeqIO.write(read1, outfile_handles[taxon][1], filetype) + SeqIO.write(read2, outfile_handles[taxon][2], filetype) + out_counts[taxon] += 2 + quals[taxon].append( + median(read1.letter_annotations["phred_quality"]) + ) + quals[taxon].append( + median(read2.letter_annotations["phred_quality"]) + ) + lens[taxon].append(len(read1)) + lens[taxon].append(len(read2)) - for taxon in keys[tax_id]: - SeqIO.write(read, outfile_handles[taxon], filetype) - out_counts[taxon] += 1 - quals[taxon].append( - median(read.letter_annotations["phred_quality"]) - ) - lens[taxon].append(len(read)) - - for handle in outfile_handles: - if outfile_handles[handle]: - outfile_handles[handle].close() + else: + if read_id in s_file1: + read = s_file1[read_id] + else: + sys.stderr.write( + "ERROR: read id %s not found in read file\n" % 
read_id + ) + sys.exit(1) + + for taxon in keys[tax_id]: + SeqIO.write(read, outfile_handles[taxon], filetype) + out_counts[taxon] += 1 + quals[taxon].append( + median(read.letter_annotations["phred_quality"]) + ) + lens[taxon].append(len(read)) + if reads2: + for handle_dict in outfile_handles: + outfile_handles[handle_dict][1].close() + outfile_handles[handle_dict][2].close() + else: + for handle in outfile_handles: + if outfile_handles[handle]: + outfile_handles[handle].close() summary = [] for taxon in lists_to_extract: From 983becbff49ca5e7a778b1231c72415c031f38f7 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 12:14:05 +0100 Subject: [PATCH 65/77] Comment out extract taxa workflow - unused --- modules/extract_taxa.nf | 62 ++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf index 67187a1..03204c0 100644 --- a/modules/extract_taxa.nf +++ b/modules/extract_taxa.nf @@ -93,34 +93,34 @@ process extract_reads { -workflow extract_taxa { - take: - unique_id - fastq_1 - fastq_2 - kraken_assignments - kraken_report - bracken_report - main: - extract_reads(unique_id, fastq_1, fastq_2, kraken_assignments, kraken_report, bracken_report) -} - -workflow { - unique_id = "${params.unique_id}" - if ("${params.unique_id}" == "null") { - unique_id = "${fastq.simpleName}" - } - - kraken_assignments = file("${params.kraken_assignments}") - bracken_report = file("${params.bracken_report}") - kraken_report = file("${params.kraken_report}") - - if (params.paired) { - fastq_1 = file(params.fastq1, type: "file", checkIfExists:true) - fastq_2 = file(params.fastq2, type: "file", checkIfExists:true) - extract_paired_reads(unique_id, fastq_1, fastq_2, kraken_assignments, kraken_report, bracken_report) - } else { - fastq = file(params.fastq, type: "file", checkIfExists:true) - extract_reads(unique_id, fastq, kraken_assignments, kraken_report, bracken_report) - } -} \ No newline at end of file +// workflow extract_taxa { +// take: +// unique_id +// fastq_1 +// fastq_2 +// kraken_assignments +// kraken_report +// bracken_report +// main: +// extract_reads(unique_id, fastq_1, fastq_2, kraken_assignments, kraken_report, bracken_report) +// } + +// workflow { +// unique_id = "${params.unique_id}" +// if ("${params.unique_id}" == "null") { +// unique_id = "${fastq.simpleName}" +// } + +// kraken_assignments = file("${params.kraken_assignments}") +// bracken_report = file("${params.bracken_report}") +// kraken_report = file("${params.kraken_report}") + +// if (params.paired) { +// fastq_1 = file(params.fastq1, type: "file", checkIfExists:true) +// fastq_2 = file(params.fastq2, type: "file", checkIfExists:true) +// extract_paired_reads(unique_id, fastq_1, fastq_2, kraken_assignments, kraken_report, bracken_report) +// } else { +// fastq = file(params.fastq, type: "file", checkIfExists:true) +// extract_reads(unique_id, fastq, kraken_assignments, kraken_report, bracken_report) +// } +// } \ No newline at end of file From 63582b5fbfa02cb65699e19fed13ceb0af1d84eb Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 12:16:21 +0100 Subject: [PATCH 66/77] bgzip extracted taxa fastqs --- modules/extract_taxa.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf index 03204c0..734f00e 100644 --- a/modules/extract_taxa.nf +++ b/modules/extract_taxa.nf @@ -45,7 +45,7 @@ process extract_paired_reads { for f in \$(ls reads.*.f*) do - gzip 
\$f + bgzip --threads $task.cpus \$f done """ } @@ -85,9 +85,9 @@ process extract_reads { --min_percent ${params.assembly_min_percent} for f in \$(ls reads.*.f*) - do - gzip \$f - done + do + bgzip --threads $task.cpus \$f + done """ } From a709b5f279a8c78c3600505442a2cdce571592b3 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 12:31:29 +0100 Subject: [PATCH 67/77] run extract taxa in scylla container --- modules/extract_taxa.nf | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf index 734f00e..337b541 100644 --- a/modules/extract_taxa.nf +++ b/modules/extract_taxa.nf @@ -13,10 +13,8 @@ process extract_paired_reads { publishDir path: "${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy' - conda 'bioconda::biopython=1.78' - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/biopython%3A1.76' : - 'biocontainers/biopython@sha256:b0204cf662a3d858f6c28627124b83ed6f564e2b156b8788092f2dd9256c9290' }" + conda 'bioconda::biopython=1.78 bioconda::tabix=1.11' + container "biowilko/scylla@${params.wf.container_sha}" input: val unique_id @@ -56,11 +54,9 @@ process extract_reads { publishDir path: "${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy' - conda 'bioconda::biopython=1.78' - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/biopython%3A1.76' : - 'biocontainers/biopython@sha256:b0204cf662a3d858f6c28627124b83ed6f564e2b156b8788092f2dd9256c9290' }" - + conda 'bioconda::biopython=1.78 bioconda::tabix=1.11' + container "biowilko/scylla@${params.wf.container_sha}" + input: val unique_id path fastq From c189f717257530afef3c1ab06d699d1bbe5f2dc6 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 12:49:18 +0100 Subject: [PATCH 68/77] Properly output per-run execution reports --- conf/params.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/params.config b/conf/params.config index 9ed6464..7443080 100644 --- a/conf/params.config +++ b/conf/params.config @@ -5,7 +5,7 @@ params { // Boilerplate options outdir = null - tracedir = "${params.outdir}/pipeline_info" + tracedir = "${params.out_dir}/${params.unique_id}/pipeline_info" publish_dir_mode = 'copy' email = null email_on_fail = null From 519523036f212579a30455f0f72170f72bfc5a74 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 12:56:29 +0100 Subject: [PATCH 69/77] extract taxa summary, handle paired fnames --- bin/extract_kraken_reads.py | 46 ++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/bin/extract_kraken_reads.py b/bin/extract_kraken_reads.py index fb14acb..5f7c9d8 100755 --- a/bin/extract_kraken_reads.py +++ b/bin/extract_kraken_reads.py @@ -339,19 +339,39 @@ def extract_taxa( summary = [] for taxon in lists_to_extract: - summary.append( - { - "human_readable": bracken_hierarchy[taxon]["name"], - "taxon": taxon, - "tax_level": bracken_hierarchy[taxon]["rank"], - "filename": "%s.%s.%s" % (prefix, taxon, filetype), - "qc_metrics": { - "num_reads": out_counts[taxon], - "avg_qual": mean(quals[taxon]), - "mean_len": mean(lens[taxon]), - }, - } - ) + if reads2: + summary.append( + { + "human_readable": bracken_hierarchy[taxon]["name"], + "taxon": taxon, + "tax_level": bracken_hierarchy[taxon]["rank"], + "filenames": [ + "%s.%s_1.%s" % (prefix, 
taxon, filetype), + "%s.%s_2.%s" % (prefix, taxon, filetype), + ], + "qc_metrics": { + "num_reads": out_counts[taxon], + "avg_qual": mean(quals[taxon]), + "mean_len": mean(lens[taxon]), + }, + } + ) + else: + summary.append( + { + "human_readable": bracken_hierarchy[taxon]["name"], + "taxon": taxon, + "tax_level": bracken_hierarchy[taxon]["rank"], + "filenames": [ + "%s.%s.%s" % (prefix, taxon, filetype), + ], + "qc_metrics": { + "num_reads": out_counts[taxon], + "avg_qual": mean(quals[taxon]), + "mean_len": mean(lens[taxon]), + }, + } + ) with open("%s_summary.json" % prefix, "w") as f: print(summary) json.dump(summary, f) From 61bf897112ffb9431d4d40c10168527e76311b87 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Wed, 21 Jun 2023 13:05:02 +0100 Subject: [PATCH 70/77] pass through command line parameter --- bin/extract_kraken_reads.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/extract_kraken_reads.py b/bin/extract_kraken_reads.py index 5f7c9d8..7abbac6 100755 --- a/bin/extract_kraken_reads.py +++ b/bin/extract_kraken_reads.py @@ -533,6 +533,7 @@ def main(): args.reads2, args.prefix, max_human=args.max_human, + names = args.taxid, target_ranks=target_ranks, min_count=args.min_count, min_count_descendants=args.min_count_descendants, From 57ab71d85de012fbe935f68ace2450e11261b445 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Wed, 21 Jun 2023 13:07:31 +0100 Subject: [PATCH 71/77] allow multi jsons as per another scylla branch and issue #11 --- bin/make_report.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/bin/make_report.py b/bin/make_report.py index 55f7c1b..586c6df 100755 --- a/bin/make_report.py +++ b/bin/make_report.py @@ -59,7 +59,8 @@ def main(): parser.add_argument( "--assignments", - help="JSON file of kraken/bracken assignments", + help="JSON file(s) of kraken/bracken assignments", + nargs='+', required=True, type=Path, ) @@ -81,8 +82,14 @@ def main(): sample = args.sample_id - with open(args.assignments.resolve(), "rt") as bracken_file: - assignments = bracken_file.read().strip().replace('"', '\\"') + assignments = None + for bracken_file in args.assignments: + with open(bracken_file.resolve(), "rt") as bracken_handle: + contents = bracken_handle.read().strip().replace('"', '\\"') + if not assignments: + assignments = contents + else: + assignments = assignments[:-1] + ", " + contents if args.read_counts: with open(args.read_counts.resolve(), "rt") as qc_file: From 1869c2feac02aeea374852c421df793fd69e290c Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 13:22:26 +0100 Subject: [PATCH 72/77] Up resources available to bgzip taxon fastqs --- modules/extract_taxa.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf index 337b541..e3e8198 100644 --- a/modules/extract_taxa.nf +++ b/modules/extract_taxa.nf @@ -9,7 +9,7 @@ // in config so could be good dehuman sanity check process extract_paired_reads { - label 'process_medium' + label 'process_high' publishDir path: "${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy' @@ -56,7 +56,7 @@ process extract_reads { conda 'bioconda::biopython=1.78 bioconda::tabix=1.11' container "biowilko/scylla@${params.wf.container_sha}" - + input: val unique_id path fastq From 3e017d2fbf064f9a74b98193b079785d04022e5d Mon Sep 17 00:00:00 2001 From: Biowilko Date: Wed, 21 Jun 2023 13:28:19 +0100 Subject: [PATCH 73/77] Re-enable human threshold fail --- bin/extract_kraken_reads.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-)

diff --git a/bin/extract_kraken_reads.py b/bin/extract_kraken_reads.py
index 7abbac6..c8e4b10 100755
--- a/bin/extract_kraken_reads.py
+++ b/bin/extract_kraken_reads.py
@@ -76,7 +76,7 @@ def get_bracken_hierarchy(kraken_file, bracken_file, max_human=None):
                 "ERROR: found %i human reads, max allowed is %i\n"
                 % (num_direct, max_human)
             )
-            # sys.exit(2)
+            sys.exit(2)
             continue
 
         entries[ncbi] = {
@@ -533,7 +533,7 @@ def main():
         args.reads2,
         args.prefix,
         max_human=args.max_human,
-        names = args.taxid,
+        names=args.taxid,
         target_ranks=target_ranks,
         min_count=args.min_count,
         min_count_descendants=args.min_count_descendants,

From 6ac4a84f9c567b020ba25ca32e3863f3739939f1 Mon Sep 17 00:00:00 2001
From: Biowilko
Date: Wed, 21 Jun 2023 13:52:10 +0100
Subject: [PATCH 74/77] increase extract_reads resource again

---
 modules/extract_taxa.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf
index e3e8198..dd6b434 100644
--- a/modules/extract_taxa.nf
+++ b/modules/extract_taxa.nf
@@ -50,7 +50,7 @@ process extract_paired_reads {
 
 process extract_reads {
 
-    label 'process_medium'
+    label 'process_high'
 
     publishDir path: "${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy'

From ae6ef527d4da431444f330a11fe0082d87144eed Mon Sep 17 00:00:00 2001
From: Rachel Colquhoun
Date: Wed, 21 Jun 2023 14:27:26 +0100
Subject: [PATCH 75/77] fix bug in extract_kraken_reads

---
 bin/extract_kraken_reads.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bin/extract_kraken_reads.py b/bin/extract_kraken_reads.py
index 7abbac6..325bdd5 100755
--- a/bin/extract_kraken_reads.py
+++ b/bin/extract_kraken_reads.py
@@ -430,6 +430,7 @@ def main():
         dest="taxid",
         required=False,
         nargs="*",
+        default=[],
         help="List of taxonomy ID[s] or names to extract (space-delimited) - each to their own file",
     )
     parser.add_argument(

From 194a8522efb3d3885945bc9b8f422ebbb284b46c Mon Sep 17 00:00:00 2001
From: Rachel Colquhoun
Date: Wed, 21 Jun 2023 14:27:40 +0100
Subject: [PATCH 76/77] add more dependencies

---
 environment.yml | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/environment.yml b/environment.yml
index 2bc591f..77b0d01 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,4 +1,4 @@
-name: scylla
+name: scylla_dev
 channels:
   - bioconda
   - conda-forge
@@ -15,4 +15,10 @@ dependencies:
   - fastcat
   - biopython
   - flye
-  - megahit
\ No newline at end of file
+  - megahit
+  - pandas
+  - numpy
+  - taxonkit
+  - fastp
+  - tabix
+  - mako

From 0503fd0236b04d043748a6f27ccad087f309b458 Mon Sep 17 00:00:00 2001
From: Biowilko
Date: Wed, 21 Jun 2023 14:44:24 +0100
Subject: [PATCH 77/77] ignore extract taxa fails with exit code 2

---
 modules/extract_taxa.nf | 4 ++++
 nextflow.config         | 8 ++++----
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/modules/extract_taxa.nf b/modules/extract_taxa.nf
index dd6b434..3484984 100644
--- a/modules/extract_taxa.nf
+++ b/modules/extract_taxa.nf
@@ -11,6 +11,8 @@ process extract_paired_reads {
 
     label 'process_high'
 
+    errorStrategy {task.exitStatus == 2 ? 
'ignore' : 'terminate'} + publishDir path: "${params.out_dir}/${unique_id}/reads_by_taxa", mode: 'copy' conda 'bioconda::biopython=1.78 bioconda::tabix=1.11' diff --git a/nextflow.config b/nextflow.config index 2766db6..b6980eb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -143,19 +143,19 @@ process.shell = ['/bin/bash', '-euo', 'pipefail'] def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { enabled = true - file = "${params.tracedir}/execution_timeline_${trace_timestamp}.html" + file = "${params.tracedir}/execution_timeline_${params.unique_id}.html" } report { enabled = true - file = "${params.tracedir}/execution_report_${trace_timestamp}.html" + file = "${params.tracedir}/execution_report_${params.unique_id}.html" } trace { enabled = true - file = "${params.tracedir}/execution_trace_${trace_timestamp}.txt" + file = "${params.tracedir}/execution_trace_${params.unique_id}.txt" } dag { enabled = true - file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.html" + file = "${params.tracedir}/pipeline_dag_${params.unique_id}.html" }
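
For reference, the single-record format produced by bin/concatenate_reads.py in --no-interleave mode (introduced in PATCH 44) can be demonstrated with a short, self-contained sketch. This is an editorial illustration, not part of the patch series: the read contents are invented and concat_pair is a hypothetical helper that mirrors the ix % 4 line-cycling logic of join_w_separator.

#!/usr/bin/env python
# Illustrative sketch only -- not from the repository.
import io

R1 = "@read1/1\nACGTACGT\n+\nIIIIIIII\n"  # invented forward read
R2 = "@read1/2\nTTTTGGGG\n+\nJJJJJJJJ\n"  # invented reverse read


def concat_pair(fastq1, fastq2, sep="|"):
    # Hypothetical helper mirroring join_w_separator's ix % 4 line cycle.
    f1, f2 = io.StringIO(fastq1), io.StringIO(fastq2)
    phred_sep = "|" * len(sep)  # quality placeholder for the separator bases
    out = []
    for ix, (l1, l2) in enumerate(zip(f1, f2)):
        if ix % 4 == 0:                    # header: keep R1's
            out.append(l1)
        elif ix % 4 == 1:                  # sequence: R1 + sep + R2
            out.append(l1.strip() + sep + l2)
        elif ix % 4 == 2:                  # '+' separator line
            out.append("+\n")
        else:                              # quality: padded to match sequence length
            out.append(l1.strip() + phred_sep + l2)
    return "".join(out)


print(concat_pair(R1, R2), end="")
# @read1/1
# ACGTACGT|TTTTGGGG
# +
# IIIIIIII|JJJJJJJJ

Padding the quality line with '|' (a valid Phred character) keeps sequence and quality the same length, so the fused pair remains a well-formed FASTQ record for the downstream kraken step.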
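A note on the exit-code contract established by the last two patches: PATCH 73 re-enables sys.exit(2) in extract_kraken_reads.py when the human-read count exceeds params.max_human_reads_before_rejection, and PATCH 77's errorStrategy maps exactly that status to 'ignore', so a sample failing the human-read sanity check is skipped without terminating the whole run. The sketch below is an editorial illustration of that contract, not project code; the threshold value and function name are invented.

#!/usr/bin/env python
# Illustrative sketch only -- not from the repository.
# Exceeding the human-read threshold exits with status 2, which the
# errorStrategy {task.exitStatus == 2 ? 'ignore' : 'terminate'} added in
# PATCH 77 treats as a soft failure; any other non-zero status terminates.
import sys

MAX_HUMAN = 1000  # assumed stand-in for params.max_human_reads_before_rejection


def check_human_threshold(num_direct, max_human):
    # Mirrors the guard re-enabled in PATCH 73.
    if max_human is not None and num_direct > max_human:
        sys.stderr.write(
            "ERROR: found %i human reads, max allowed is %i\n"
            % (num_direct, max_human)
        )
        sys.exit(2)  # soft fail: ignored by the pipeline


if __name__ == "__main__":
    check_human_threshold(50, MAX_HUMAN)    # under threshold: continues
    check_human_threshold(5000, MAX_HUMAN)  # over threshold: exits with code 2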