From 44776b3e6fe3f31204bf0edc983aa7a8c16be742 Mon Sep 17 00:00:00 2001 From: Priyanka Surana Date: Wed, 27 Sep 2023 13:52:27 +0100 Subject: [PATCH] prepare genome subworkflow --- conf/modules.config | 16 +++-- modules.json | 10 +++ modules/nf-core/windowmasker/mkcounts/main.nf | 55 +++++++++++++++ .../nf-core/windowmasker/mkcounts/meta.yml | 40 +++++++++++ modules/nf-core/windowmasker/ustat/main.nf | 69 +++++++++++++++++++ modules/nf-core/windowmasker/ustat/meta.yml | 48 +++++++++++++ nextflow.config | 1 + nextflow_schema.json | 8 ++- subworkflows/local/prepare_genome.nf | 49 +++++++++++++ workflows/blobtoolkit.nf | 22 +++--- 10 files changed, 300 insertions(+), 18 deletions(-) create mode 100644 modules/nf-core/windowmasker/mkcounts/main.nf create mode 100644 modules/nf-core/windowmasker/mkcounts/meta.yml create mode 100644 modules/nf-core/windowmasker/ustat/main.nf create mode 100644 modules/nf-core/windowmasker/ustat/meta.yml create mode 100644 subworkflows/local/prepare_genome.nf diff --git a/conf/modules.config b/conf/modules.config index 069dea0b..d29e500f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -20,6 +20,14 @@ process { ] } + withName: "WINDOWMASKER_MKCOUNTS" { + ext.args = "-infmt fasta -sformat obinary" + } + + withName: "WINDOWMASKER_USTAT" { + ext.args = "-infmt fasta -dust T -outfmt fasta" + } + withName: "MINIMAP2_HIC" { ext.args = "-ax sr" } @@ -40,6 +48,10 @@ process { ext.args = "-ax map-ont" } + withName: "SAMTOOLS_VIEW" { + ext.args = "--output-fmt bam --write-index" + } + withName: "SAMTOOLS_INDEX" { ext.args = "-c" } @@ -48,10 +60,6 @@ process { ext.args = "--lineage --busco" } - withName: "SAMTOOLS_VIEW" { - ext.args = "--output-fmt bam --write-index" - } - withName: "BUSCO" { scratch = true // Overridden in the test profile, see at the end of this file diff --git a/modules.json b/modules.json index 5a525a75..ee078010 100644 --- a/modules.json +++ b/modules.json @@ -65,6 +65,16 @@ "branch": "master", "git_sha": 
"3ffae3598260a99e8db3207dead9f73f87f90d1f", "installed_by": ["modules"] + }, + "windowmasker/mkcounts": { + "branch": "master", + "git_sha": "30c3ed32e8bd5ddaf349ba2f4f99d38182fdc08c", + "installed_by": ["modules"] + }, + "windowmasker/ustat": { + "branch": "master", + "git_sha": "726ee59cd9360a965d96ea9ea8770f16b8ddd6cc", + "installed_by": ["modules"] } } }, diff --git a/modules/nf-core/windowmasker/mkcounts/main.nf b/modules/nf-core/windowmasker/mkcounts/main.nf new file mode 100644 index 00000000..bfa66f35 --- /dev/null +++ b/modules/nf-core/windowmasker/mkcounts/main.nf @@ -0,0 +1,55 @@ +process WINDOWMASKER_MKCOUNTS { + tag "$meta.id" + label 'process_low' + + conda "bioconda::blast=2.14.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/blast:2.14.0--h7d5a4b4_1': + 'biocontainers/blast:2.14.0--h7d5a4b4_1' }" + + input: + tuple val(meta), path(ref) + + output: + tuple val(meta), path("*.txt") , emit: counts + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" + + def memory = 3072 + if (!task.memory) { + log.info '[WINDOWMASKER: MK_COUNTS] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' 
+ } else { + memory = (task.memory.toMega()).intValue() + } + + """ + windowmasker -mk_counts \\ + $args \\ + -mem ${memory} \\ + -in ${ref} \\ + -out ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + windowmasker: \$(windowmasker -version-full | head -n 1 | sed 's/^.*windowmasker: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + windowmasker: \$(windowmasker -version-full | head -n 1 | sed 's/^.*windowmasker: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/windowmasker/mkcounts/meta.yml b/modules/nf-core/windowmasker/mkcounts/meta.yml new file mode 100644 index 00000000..788dc96c --- /dev/null +++ b/modules/nf-core/windowmasker/mkcounts/meta.yml @@ -0,0 +1,40 @@ +name: windowmasker_mkcounts +description: A program to generate frequency counts of repetitive units. +keywords: + - fasta + - interval + - windowmasker +tools: + - windowmasker: + description: | + A program to mask highly repetitive and low complexity DNA sequences within a genome. + homepage: https://github.com/ncbi/ncbi-cxx-toolkit-public + documentation: https://ncbi.github.io/cxx-toolkit/ + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ref: + type: file + description: An input nucleotide fasta file. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - counts: + type: file + description: | + An output file containing frequency counts of the + repetitive units found in the input sequences + pattern: "${prefix}.txt" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@DLBPointon" diff --git a/modules/nf-core/windowmasker/ustat/main.nf b/modules/nf-core/windowmasker/ustat/main.nf new file mode 100644 index 00000000..72a19dbf --- /dev/null +++ b/modules/nf-core/windowmasker/ustat/main.nf @@ -0,0 +1,69 @@ +process WINDOWMASKER_USTAT { + tag "$meta.id" + label 'process_low' + + conda "bioconda::blast=2.14.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/blast:2.14.0--h7d5a4b4_1': + 'biocontainers/blast:2.14.0--h7d5a4b4_1' }" + + input: + tuple val(meta) , path(counts) + tuple val(meta2), path(ref) + + output: + tuple val(meta), path("${output}") , emit: intervals + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" + def outfmt = args.contains('-outfmt fasta') ? 'fasta' : + args.contains('-outfmt maskinfo_asn1_bin') ? 'maskinfo_asn1_bin' : + args.contains('-outfmt maskinfo_asn1_text') ? 'maskinfo_asn1_text' : + args.contains('-outfmt maskinfo_xml') ? 'maskinfo_xml' : + args.contains('-outfmt seqloc_asn1_bin') ? 'seqloc_asn1_bin' : + args.contains('-outfmt seqloc_asn1_text') ? 'seqloc_asn1_text' : + args.contains('-outfmt seqloc_xml') ?
'seqloc_xml' : + 'interval' + + output = "${prefix}.${outfmt}" + + """ + windowmasker -ustat \\ + ${counts} \\ + $args \\ + -in ${ref} \\ + -out ${output} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + windowmasker: \$(windowmasker -version-full | head -n 1 | sed 's/^.*windowmasker: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" + def outfmt = args.contains('-outfmt fasta') ? 'fasta' : + args.contains('-outfmt maskinfo_asn1_bin') ? 'maskinfo_asn1_bin' : + args.contains('-outfmt maskinfo_asn1_text') ? 'maskinfo_asn1_text' : + args.contains('-outfmt maskinfo_xml') ? 'maskinfo_xml' : + args.contains('-outfmt seqloc_asn1_bin') ? 'seqloc_asn1_bin' : + args.contains('-outfmt seqloc_asn1_text') ? 'seqloc_asn1_text' : + args.contains('-outfmt seqloc_xml') ? 'seqloc_xml' : + 'interval' + + output = "${prefix}.${outfmt}" + """ + touch ${output} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + windowmasker: \$(windowmasker -version-full | head -n 1 | sed 's/^.*windowmasker: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/windowmasker/ustat/meta.yml b/modules/nf-core/windowmasker/ustat/meta.yml new file mode 100644 index 00000000..6acf2e50 --- /dev/null +++ b/modules/nf-core/windowmasker/ustat/meta.yml @@ -0,0 +1,48 @@ +name: windowmasker_ustat +description: A program that takes a counts file and creates a file of genomic co-ordinates to be masked. +keywords: + - fasta + - interval + - windowmasker +tools: + - windowmasker: + description: | + A program to mask highly repetitive and low complexity DNA sequences within a genome. + homepage: https://github.com/ncbi/ncbi-cxx-toolkit-public + documentation: https://ncbi.github.io/cxx-toolkit/ + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - counts: + type: file + description: Contains count data of repetitive regions.
+ - meta2: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ref: + type: file + description: An input nucleotide fasta file. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - intervals: + type: file + description: | + An output file containing genomic locations of low + complexity and highly repetitive regions + pattern: "${output}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@DLBPointon" diff --git a/nextflow.config b/nextflow.config index 5c7d1f0d..91844aa9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -14,6 +14,7 @@ params { input = null yaml = null align = false + mask = false // Reference options fasta = null diff --git a/nextflow_schema.json b/nextflow_schema.json index c8f95768..0597fdcb 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -24,7 +24,13 @@ }, "align": { "type": "boolean", - "description": "Boolean to turn on optional alignment before running the rest of the pipeline."
+ "description": "Turn on optional alignment before running the rest of the pipeline.", + "fa_icon": "fas fa-toggle-off" + }, + "mask": { + "type": "boolean", + "description": "Turn on optional genome masking if needed.", + "fa_icon": "fas fa-toggle-off" }, "yaml": { "type": "string", diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf new file mode 100644 index 00000000..d1e31a72 --- /dev/null +++ b/subworkflows/local/prepare_genome.nf @@ -0,0 +1,49 @@ +// +// Prepare genome for downstream processing +// + +include { GUNZIP } from '../../modules/nf-core/gunzip/main' +include { WINDOWMASKER_MKCOUNTS } from '../../modules/nf-core/windowmasker/mkcounts/main' +include { WINDOWMASKER_USTAT } from '../../modules/nf-core/windowmasker/ustat/main' + + +workflow PREPARE_GENOME { + take: + fasta // channel: [ meta, path(genome) ] + + + main: + ch_versions = Channel.empty() + + + // + // MODULE: Decompress FASTA file if needed + // + if ( params.fasta.endsWith('.gz') ) { + ch_genome = GUNZIP ( fasta ).gunzip + ch_versions = ch_versions.mix ( GUNZIP.out.versions ) + } else { + ch_genome = fasta + } + + + // + // MODULES: Mask the genome if needed + // + if ( params.mask ) { + WINDOWMASKER_MKCOUNTS ( ch_genome ) + ch_versions = ch_versions.mix ( WINDOWMASKER_MKCOUNTS.out.versions ) + + WINDOWMASKER_USTAT ( WINDOWMASKER_MKCOUNTS.out.counts, ch_genome ) + ch_versions = ch_versions.mix ( WINDOWMASKER_USTAT.out.versions ) + + ch_fasta = WINDOWMASKER_USTAT.out.intervals + } else { + ch_fasta = ch_genome + } + + + emit: + genome = ch_fasta // channel: [ meta, path(genome) ] + versions = ch_versions // channel: [ versions.yml ] +} \ No newline at end of file diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index e1a71ed6..919e17bb 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -16,7 +16,7 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true // Check mandatory parameters if 
(params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } -if (params.fasta && params.accession) { ch_fasta = Channel.of([ [ 'id': params.accession ], params.fasta ]).collect() } else { exit 1, 'Genome fasta file and accession must be specified!' } +if (params.fasta && params.accession) { ch_fasta = Channel.of([ [ 'id': params.accession ], params.fasta ]).first() } else { exit 1, 'Genome fasta file and accession must be specified!' } if (params.taxon) { ch_taxon = Channel.of(params.taxon) } else { exit 1, 'NCBI Taxon ID not specified!' } if (params.uniprot) { ch_uniprot = file(params.uniprot) } else { exit 1, 'Diamond BLASTp database not specified!' } if (params.taxdump) { ch_taxdump = file(params.taxdump) } else { exit 1, 'NCBI Taxonomy database not specified!' } @@ -50,6 +50,7 @@ include { BLOBTOOLKIT_CONFIG } from '../modules/local/blobtoolkit/config' // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // +include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome' include { MINIMAP2_ALIGNMENT } from '../subworkflows/local/minimap_alignment' include { INPUT_CHECK } from '../subworkflows/local/input_check' include { COVERAGE_STATS } from '../subworkflows/local/coverage_stats' @@ -67,7 +68,6 @@ include { VIEW } from '../subworkflows/local/view' // // MODULE: Installed directly from nf-core/modules // -include { GUNZIP } from '../modules/nf-core/gunzip/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' @@ -85,14 +85,10 @@ workflow BLOBTOOLKIT { ch_versions = Channel.empty() // - // MODULE: Decompress FASTA file if needed + // SUBWORKFLOW: Prepare genome for downstream processing // - if ( params.fasta.endsWith('.gz') ) { - ch_genome = GUNZIP ( ch_fasta ).gunzip - ch_versions = ch_versions.mix ( GUNZIP.out.versions.first() ) - } else { - ch_genome = ch_fasta - } + PREPARE_GENOME ( 
ch_fasta ) + ch_versions = ch_versions.mix ( PREPARE_GENOME.out.versions ) // // SUBWORKFLOW: Check samplesheet and create channels for downstream analysis @@ -104,7 +100,7 @@ workflow BLOBTOOLKIT { // SUBWORKFLOW: Optional read alignment // if ( params.align ) { - MINIMAP2_ALIGNMENT ( INPUT_CHECK.out.aln, ch_genome ) + MINIMAP2_ALIGNMENT ( INPUT_CHECK.out.aln, PREPARE_GENOME.out.genome ) ch_versions = ch_versions.mix ( MINIMAP2_ALIGNMENT.out.versions ) ch_aligned = MINIMAP2_ALIGNMENT.out.aln } else { @@ -114,7 +110,7 @@ workflow BLOBTOOLKIT { // // SUBWORKFLOW: Calculate genome coverage and statistics // - COVERAGE_STATS ( ch_aligned, ch_genome ) + COVERAGE_STATS ( ch_aligned, PREPARE_GENOME.out.genome ) ch_versions = ch_versions.mix ( COVERAGE_STATS.out.versions ) // @@ -128,7 +124,7 @@ workflow BLOBTOOLKIT { } BUSCO_DIAMOND ( - ch_genome, + PREPARE_GENOME.out.genome, ch_taxon_taxa, ch_busco_db, ch_uniprot, @@ -153,7 +149,7 @@ workflow BLOBTOOLKIT { // SUBWORKFLOW: Create BlobTools dataset // if ( !params.yaml ) { - BLOBTOOLKIT_CONFIG ( ch_genome ) + BLOBTOOLKIT_CONFIG ( PREPARE_GENOME.out.genome ) ch_config = BLOBTOOLKIT_CONFIG.out.yaml ch_versions = ch_versions.mix ( BLOBTOOLKIT_CONFIG.out.versions.first() ) } else {