From 44776b3e6fe3f31204bf0edc983aa7a8c16be742 Mon Sep 17 00:00:00 2001
From: Priyanka Surana <ps22@sanger.ac.uk>
Date: Wed, 27 Sep 2023 13:52:27 +0100
Subject: [PATCH] prepare genome subworkflow

---
 conf/modules.config                           | 16 +++--
 modules.json                                  | 10 +++
 modules/nf-core/windowmasker/mkcounts/main.nf | 55 +++++++++++++++
 .../nf-core/windowmasker/mkcounts/meta.yml    | 40 +++++++++++
 modules/nf-core/windowmasker/ustat/main.nf    | 69 +++++++++++++++++++
 modules/nf-core/windowmasker/ustat/meta.yml   | 48 +++++++++++++
 nextflow.config                               |  1 +
 nextflow_schema.json                          |  8 ++-
 subworkflows/local/prepare_genome.nf          | 49 +++++++++++++
 workflows/blobtoolkit.nf                      | 22 +++---
 10 files changed, 300 insertions(+), 18 deletions(-)
 create mode 100644 modules/nf-core/windowmasker/mkcounts/main.nf
 create mode 100644 modules/nf-core/windowmasker/mkcounts/meta.yml
 create mode 100644 modules/nf-core/windowmasker/ustat/main.nf
 create mode 100644 modules/nf-core/windowmasker/ustat/meta.yml
 create mode 100644 subworkflows/local/prepare_genome.nf

diff --git a/conf/modules.config b/conf/modules.config
index 069dea0b..d29e500f 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -20,6 +20,14 @@ process {
         ]
     }
 
+    withName: "WINDOWMASKER_MKCOUNTS" {
+        ext.args = "-infmt fasta -sformat obinary"
+    }
+
+    withName: "WINDOWMASKER_USTAT" {
+        ext.args = "-infmt fasta -dust T -outfmt fasta"
+    }
+
     withName: "MINIMAP2_HIC" {
         ext.args = "-ax sr"
     }
@@ -40,6 +48,10 @@ process {
         ext.args = "-ax map-ont"
     }
 
+    withName: "SAMTOOLS_VIEW" {
+        ext.args = "--output-fmt bam --write-index"
+    }
+
     withName: "SAMTOOLS_INDEX" {
         ext.args = "-c"
     }
@@ -48,10 +60,6 @@ process {
         ext.args = "--lineage --busco"
     }
 
-    withName: "SAMTOOLS_VIEW" {
-        ext.args = "--output-fmt bam --write-index"
-    }
-
     withName: "BUSCO" {
         scratch = true
         // Overridden in the test profile, see at the end of this file
diff --git a/modules.json b/modules.json
index 5a525a75..ee078010 100644
--- a/modules.json
+++ b/modules.json
@@ -65,6 +65,16 @@
                         "branch": "master",
                         "git_sha": "3ffae3598260a99e8db3207dead9f73f87f90d1f",
                         "installed_by": ["modules"]
+                    },
+                    "windowmasker/mkcounts": {
+                        "branch": "master",
+                        "git_sha": "30c3ed32e8bd5ddaf349ba2f4f99d38182fdc08c",
+                        "installed_by": ["modules"]
+                    },
+                    "windowmasker/ustat": {
+                        "branch": "master",
+                        "git_sha": "726ee59cd9360a965d96ea9ea8770f16b8ddd6cc",
+                        "installed_by": ["modules"]
                     }
                 }
             },
diff --git a/modules/nf-core/windowmasker/mkcounts/main.nf b/modules/nf-core/windowmasker/mkcounts/main.nf
new file mode 100644
index 00000000..bfa66f35
--- /dev/null
+++ b/modules/nf-core/windowmasker/mkcounts/main.nf
@@ -0,0 +1,55 @@
+process WINDOWMASKER_MKCOUNTS {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "bioconda::blast=2.14.0"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/blast:2.14.0--h7d5a4b4_1':
+        'biocontainers/blast:2.14.0--h7d5a4b4_1' }"
+
+    input:
+    tuple val(meta), path(ref)
+
+    output:
+    tuple val(meta), path("*.txt")  , emit: counts
+    path "versions.yml"             , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args    = task.ext.args     ?: ""
+    def prefix  = task.ext.prefix   ?: "${meta.id}"
+
+    def memory = 3072
+    if (!task.memory) {
+        log.info '[WINDOWMASKER: MK_COUNTS] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'
+    } else {
+        memory = (task.memory.toMega()).intValue()
+    }
+
+    """
+    windowmasker -mk_counts \\
+        $args \\
+        -mem ${memory} \\
+        -in ${ref} \\
+        -out ${prefix}.txt
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        windowmasker: \$(windowmasker -version-full | head -n 1 | sed 's/^.*windowmasker: //; s/ .*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix  = task.ext.prefix   ?: "${meta.id}"
+
+    """
+    touch ${prefix}.txt
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        windowmasker: \$(windowmasker -version-full | head -n 1 | sed 's/^.*windowmasker: //; s/ .*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/windowmasker/mkcounts/meta.yml b/modules/nf-core/windowmasker/mkcounts/meta.yml
new file mode 100644
index 00000000..788dc96c
--- /dev/null
+++ b/modules/nf-core/windowmasker/mkcounts/meta.yml
@@ -0,0 +1,40 @@
+name: windowmasker_mkcounts
+description: A program to generate frequency counts of repetitive units.
+keywords:
+  - fasta
+  - interval
+  - windowmasker
+tools:
+  - windowmasker:
+      description: |
+        A program to mask highly repetitive and low complexity DNA sequences within a genome.
+      homepage: https://github.com/ncbi/ncbi-cxx-toolkit-public
+      documentation: https://ncbi.github.io/cxx-toolkit/
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - ref:
+      type: file
+      description: An input nucleotide fasta file.
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - counts:
+      type: file
+      description: |
+        An output file containing frequency counts of
+        repetitive units, as written by windowmasker -mk_counts
+      pattern: "${prefix}.txt"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@DLBPointon"
diff --git a/modules/nf-core/windowmasker/ustat/main.nf b/modules/nf-core/windowmasker/ustat/main.nf
new file mode 100644
index 00000000..72a19dbf
--- /dev/null
+++ b/modules/nf-core/windowmasker/ustat/main.nf
@@ -0,0 +1,69 @@
+process WINDOWMASKER_USTAT {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "bioconda::blast=2.14.0"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/blast:2.14.0--h7d5a4b4_1':
+        'biocontainers/blast:2.14.0--h7d5a4b4_1' }"
+
+    input:
+    tuple val(meta) , path(counts)
+    tuple val(meta2), path(ref)
+
+    output:
+    tuple val(meta), path("${output}")  , emit: intervals
+    path "versions.yml"                 , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args    =   task.ext.args         ?: ""
+    def prefix  =   task.ext.prefix       ?: "${meta.id}"
+    def outfmt  =   args.contains('-outfmt fasta')                ? 'fasta'               :
+                    args.contains('-outfmt maskinfo_asn1_bin')    ? 'maskinfo_asn1_bin'   :
+                    args.contains('-outfmt maskinfo_asn1_text')   ? 'maskinfo_asn1_text'  :
+                    args.contains('-outfmt maskinfo_xml')         ? 'maskinfo_xml'        :
+                    args.contains('-outfmt seqloc_asn1_bin')      ? 'seqloc_asn1_bin'     :
+                    args.contains('-outfmt seqloc_asn1_text')     ? 'seqloc_asn1_text'    :
+                    args.contains('-outfmt seqloc_xml')           ? 'seqloc_xml'          :
+                    'interval'
+
+    output  = "${prefix}.${outfmt}"
+
+    """
+    windowmasker -ustat \\
+        ${counts} \\
+        $args \\
+        -in ${ref} \\
+        -out ${output}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        windowmasker: \$(windowmasker -version-full | head -n 1 | sed 's/^.*windowmasker: //; s/ .*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    def args    =   task.ext.args         ?: ""
+    def prefix  =   task.ext.prefix       ?: "${meta.id}"
+    def outfmt  =   args.contains('-outfmt fasta')                ? 'fasta'               :
+                    args.contains('-outfmt maskinfo_asn1_bin')    ? 'maskinfo_asn1_bin'   :
+                    args.contains('-outfmt maskinfo_asn1_text')   ? 'maskinfo_asn1_text'  :
+                    args.contains('-outfmt maskinfo_xml')         ? 'maskinfo_xml'        :
+                    args.contains('-outfmt seqloc_asn1_bin')      ? 'seqloc_asn1_bin'     :
+                    args.contains('-outfmt seqloc_asn1_text')     ? 'seqloc_asn1_text'    :
+                    args.contains('-outfmt seqloc_xml')           ? 'seqloc_xml'          :
+                    'interval'
+
+    output  = "${prefix}.${outfmt}"
+    """
+    touch ${output}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        windowmasker: \$(windowmasker -version-full | head -n 1 | sed 's/^.*windowmasker: //; s/ .*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/windowmasker/ustat/meta.yml b/modules/nf-core/windowmasker/ustat/meta.yml
new file mode 100644
index 00000000..6acf2e50
--- /dev/null
+++ b/modules/nf-core/windowmasker/ustat/meta.yml
@@ -0,0 +1,48 @@
+name: windowmasker_ustat
+description: A program that takes a counts file and creates a file of genomic coordinates to be masked.
+keywords:
+  - fasta
+  - interval
+  - windowmasker
+tools:
+  - windowmasker:
+      description: |
+        A program to mask highly repetitive and low complexity DNA sequences within a genome.
+      homepage: https://github.com/ncbi/ncbi-cxx-toolkit-public
+      documentation: https://ncbi.github.io/cxx-toolkit/
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test' ]
+  - counts:
+      type: file
+      description: Contains count data of repetitive regions.
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - ref:
+      type: file
+      description: An input nucleotide fasta file.
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - intervals:
+      type: file
+      description: |
+        An output file containing genomic locations of low
+        complexity and highly repetitive regions
+      pattern: "${output}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@DLBPointon"
diff --git a/nextflow.config b/nextflow.config
index 5c7d1f0d..91844aa9 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -14,6 +14,7 @@ params {
     input                      = null
     yaml                       = null
     align                      = false
+    mask                       = false
 
     // Reference options    
     fasta                      = null
diff --git a/nextflow_schema.json b/nextflow_schema.json
index c8f95768..0597fdcb 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -24,7 +24,13 @@
                 },
                 "align": {
                     "type": "boolean",
-                    "description": "Boolean to turn on optional alignment before running the rest of the pipeline."
+                    "description": "Turn on optional alignment before running the rest of the pipeline.",
+                    "fa_icon": "fas fa-toggle-off"
+                },
+                "mask": {
+                    "type": "boolean",
+                    "description": "Turn on optional genome masking if needed.",
+                    "fa_icon": "fas fa-toggle-off"
                 },
                 "yaml": {
                     "type": "string",
diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf
new file mode 100644
index 00000000..d1e31a72
--- /dev/null
+++ b/subworkflows/local/prepare_genome.nf
@@ -0,0 +1,49 @@
+//
+// Prepare genome for downstream processing
+//
+
+include { GUNZIP                } from '../../modules/nf-core/gunzip/main'
+include { WINDOWMASKER_MKCOUNTS } from '../../modules/nf-core/windowmasker/mkcounts/main'
+include { WINDOWMASKER_USTAT    } from '../../modules/nf-core/windowmasker/ustat/main'
+
+
+workflow PREPARE_GENOME {
+    take:
+    fasta     // channel: [ meta, path(genome) ]
+
+
+    main:
+    ch_versions = Channel.empty()
+
+
+    //
+    // MODULE: Decompress FASTA file if needed
+    //
+    if ( params.fasta.endsWith('.gz') ) {
+        ch_genome   = GUNZIP ( fasta ).gunzip
+        ch_versions = ch_versions.mix ( GUNZIP.out.versions )
+    } else {
+        ch_genome   = fasta
+    }
+
+
+    //
+    // MODULES: Mask the genome with WindowMasker when params.mask is set
+    //
+    if ( params.mask ) {
+        WINDOWMASKER_MKCOUNTS ( ch_genome )
+        ch_versions = ch_versions.mix ( WINDOWMASKER_MKCOUNTS.out.versions )
+
+        WINDOWMASKER_USTAT ( WINDOWMASKER_MKCOUNTS.out.counts, ch_genome )
+        ch_versions = ch_versions.mix ( WINDOWMASKER_USTAT.out.versions )
+
+        ch_fasta = WINDOWMASKER_USTAT.out.intervals
+    } else {
+        ch_fasta = ch_genome
+    }
+
+    
+    emit:
+    genome   = ch_fasta            // channel: [ meta, path(genome) ]
+    versions = ch_versions         // channel: [ versions.yml ]
+}
\ No newline at end of file
diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf
index e1a71ed6..919e17bb 100644
--- a/workflows/blobtoolkit.nf
+++ b/workflows/blobtoolkit.nf
@@ -16,7 +16,7 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true
 
 // Check mandatory parameters
 if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }
-if (params.fasta && params.accession) { ch_fasta = Channel.of([ [ 'id': params.accession ], params.fasta ]).collect() } else { exit 1, 'Genome fasta file and accession must be specified!' }
+if (params.fasta && params.accession) { ch_fasta = Channel.of([ [ 'id': params.accession ], params.fasta ]).first() } else { exit 1, 'Genome fasta file and accession must be specified!' }
 if (params.taxon) { ch_taxon = Channel.of(params.taxon) } else { exit 1, 'NCBI Taxon ID not specified!' }
 if (params.uniprot) { ch_uniprot = file(params.uniprot) } else { exit 1, 'Diamond BLASTp database not specified!' }
 if (params.taxdump) { ch_taxdump = file(params.taxdump) } else { exit 1, 'NCBI Taxonomy database not specified!' }
@@ -50,6 +50,7 @@ include { BLOBTOOLKIT_CONFIG } from '../modules/local/blobtoolkit/config'
 //
 // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
 //
+include { PREPARE_GENOME     } from '../subworkflows/local/prepare_genome'
 include { MINIMAP2_ALIGNMENT } from '../subworkflows/local/minimap_alignment'
 include { INPUT_CHECK        } from '../subworkflows/local/input_check'
 include { COVERAGE_STATS     } from '../subworkflows/local/coverage_stats'
@@ -67,7 +68,6 @@ include { VIEW               } from '../subworkflows/local/view'
 //
 // MODULE: Installed directly from nf-core/modules
 //
-include { GUNZIP                      } from '../modules/nf-core/gunzip/main'
 include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'
 include { MULTIQC                     } from '../modules/nf-core/multiqc/main'
 
@@ -85,14 +85,10 @@ workflow BLOBTOOLKIT {
     ch_versions = Channel.empty()
 
     //
-    // MODULE: Decompress FASTA file if needed
+    // SUBWORKFLOW: Prepare genome for downstream processing
     //
-    if ( params.fasta.endsWith('.gz') ) {
-        ch_genome   = GUNZIP ( ch_fasta ).gunzip
-        ch_versions = ch_versions.mix ( GUNZIP.out.versions.first() )
-    } else {
-        ch_genome   = ch_fasta
-    }
+    PREPARE_GENOME ( ch_fasta )
+    ch_versions = ch_versions.mix ( PREPARE_GENOME.out.versions )
 
     //
     // SUBWORKFLOW: Check samplesheet and create channels for downstream analysis
@@ -104,7 +100,7 @@ workflow BLOBTOOLKIT {
     // SUBWORKFLOW: Optional read alignment
     //
     if ( params.align ) {
-        MINIMAP2_ALIGNMENT ( INPUT_CHECK.out.aln, ch_genome )
+        MINIMAP2_ALIGNMENT ( INPUT_CHECK.out.aln, PREPARE_GENOME.out.genome )
         ch_versions = ch_versions.mix ( MINIMAP2_ALIGNMENT.out.versions )
         ch_aligned = MINIMAP2_ALIGNMENT.out.aln
     } else {
@@ -114,7 +110,7 @@ workflow BLOBTOOLKIT {
     //
     // SUBWORKFLOW: Calculate genome coverage and statistics 
     //
-    COVERAGE_STATS ( ch_aligned, ch_genome )
+    COVERAGE_STATS ( ch_aligned, PREPARE_GENOME.out.genome )
     ch_versions = ch_versions.mix ( COVERAGE_STATS.out.versions )
 
     //
@@ -128,7 +124,7 @@ workflow BLOBTOOLKIT {
     }
 
     BUSCO_DIAMOND ( 
-        ch_genome, 
+        PREPARE_GENOME.out.genome, 
         ch_taxon_taxa, 
         ch_busco_db, 
         ch_uniprot, 
@@ -153,7 +149,7 @@ workflow BLOBTOOLKIT {
     // SUBWORKFLOW: Create BlobTools dataset
     //
     if ( !params.yaml ) {
-        BLOBTOOLKIT_CONFIG ( ch_genome )
+        BLOBTOOLKIT_CONFIG ( PREPARE_GENOME.out.genome )
         ch_config   = BLOBTOOLKIT_CONFIG.out.yaml
         ch_versions = ch_versions.mix ( BLOBTOOLKIT_CONFIG.out.versions.first() )
     } else {