diff --git a/conf/modules.config b/conf/modules.config
index c0bcbe4..41fec0b 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -17,11 +17,25 @@ process {
         publishDir = [
             path: { "${meta.outdir}/assembly" },
             mode: 'copy',
-            saveAs: { filename -> filename.endsWith('assembly_report.txt') || filename.endsWith('assembly_stats.txt') || filename.endsWith("ACCESSION") ? filename : null }
+            saveAs: { filename ->
+                filename.endsWith('assembly_report.txt') ||
+                filename.endsWith('assembly_stats.txt') ||
+                filename.endsWith("ACCESSION") ||
+                filename.endsWith("SOURCE") ? filename : null
+            }
         ]
     }

     withName: '.*:.*:PREPARE_UNMASKED_FASTA:.*' {
+        publishDir = [
+            path: { "${meta.outdir}/assembly" },
+            mode: 'copy',
+            saveAs: { filename ->
+                filename.equals('versions.yml') || filename.endsWith('.dict') ? null : filename
+            }
+        ]
+    }
+    withName: '.*:.*:PREPARE_UNMASKED_HEADER:.*' {
         publishDir = [
             path: { "${meta.outdir}/assembly" },
             mode: 'copy',
@@ -33,7 +47,9 @@ process {
         publishDir = [
             path: { "${meta.outdir}/repeats/ncbi" },
             mode: 'copy',
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            saveAs: { filename ->
+                filename.equals('versions.yml') || filename.endsWith('.dict') ? null : filename
+            }
         ]
     }

diff --git a/docs/usage.md b/docs/usage.md
index f785a05..f3ca14e 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -48,7 +48,7 @@ A samplesheet may contain:
 - only one row per assembly

 All samplesheet columns correspond exactly to their corresponding command-line parameter,
-except `outdir` which overrides or complements `--oudir`.
+except `outdir` which overrides or complements `--outdir`.

 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.

@@ -105,7 +105,7 @@ First, go to the [sanger-tol/insdcdownload releases page](https://github.com/san
 This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. For example, at the bottom of the MultiQC reports.

-To further assist in reproducbility, you can use share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter.
+To further assist in reproducibility, you can share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter.

 > 💡 If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles.

@@ -126,7 +126,7 @@ The pipeline also dynamically loads configurations from [https://github.com/nf-c
 Note that multiple profiles can be loaded, for example: `-profile test,docker` - the order of arguments is important! They are loaded in sequence, so later profiles can overwrite earlier profiles.

-If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended, since it can lead to different results on different machines dependent on the computer enviroment.
+If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended, since it can lead to different results on different machines dependent on the computer environment.

 - `docker` - A generic configuration profile to be used with [Docker](https://docker.com/)
diff --git a/modules/local/build_sam_header.nf b/modules/local/build_sam_header.nf
new file mode 100644
index 0000000..9b57e8d
--- /dev/null
+++ b/modules/local/build_sam_header.nf
@@ -0,0 +1,104 @@
+// Module that parses an NCBI assembly report and a SAMtools sequence
+// dictionary (.dict) and outputs a SAM header template
+process BUILD_SAM_HEADER {
+    tag "${meta.id}"
+    label 'process_single'
+
+    conda "conda-forge::gawk=5.1.0"
+    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/gawk:5.1.0' :
+        'biocontainers/gawk:5.1.0'}"
+
+    input:
+    tuple val(meta), path(dict), path(report), path(source)
+
+    output:
+    tuple val(meta), path(filename_header), emit: header
+    path "versions.yml", emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    filename_header = "${prefix}.header.sam"
+
+    // Use the supplied speciesRegex, or this default. [[:space:]] is used instead
+    // of \s so that the pattern survives both Groovy string escaping and the
+    // escape processing awk applies to -v values.
+    def speciesRegex = task.ext.speciesRegex ?: '# Organism name:[[:space:]]*([^(]*)[[:space:]]*(.*)'

+    """
+    sourcePath=\$(cat $source | tr -d '\\n')
+    genBankAccession=\$(awk '/^# GenBank assembly accession:/ { gsub("\\r", ""); print \$NF }' $report)
+
+    awk -v species_regex='$speciesRegex' -v genBankAccession="\$genBankAccession" -v sourcePath="\$sourcePath" '
+    BEGIN {
+        OFS = "\\t";
+        FS = "\\t";
+        AS = "AS:" genBankAccession;
+        species_name = "";
+    }
+    # First file (the assembly report): record the species name, and map each
+    # sequence accession to its assigned-molecule name (assembled molecules)
+    # or its sequence name (everything else)
+    NR == FNR {
+        if (\$0 ~ /^# Organism name:/) {
+            match(\$0, species_regex, arr);
+            species_name = arr[1];
+            sub(/[[:space:]]+\$/, "", species_name);
+        }
+        if (\$0 !~ /^#/) {
+            split(\$0, fields, "\\t");
+            if (fields[2] == "assembled-molecule") {
+                lookup[fields[5]] = fields[3];
+            } else {
+                lookup[fields[5]] = fields[1];
+            }
+        }
+        next;
+    }
+    /^@HD/ {
+        print;
+        next;
+    }
+    # Second file (the .dict): rewrite the UR: field to the download URL, and
+    # append AS:, AN: (when known) and SP: tags to every @SQ line
+    /^@SQ/ {
+        split(\$0, fields, "\\t");
+        sn = "";
+        for (i in fields) {
+            if (fields[i] ~ /^SN:/) {
+                split(fields[i], sn_field, ":");
+                sn = sn_field[2];
+            }
+            if (fields[i] ~ /^UR:/) {
+                fields[i] = "UR:" sourcePath;
+            }
+        }
+        out = join(fields, OFS) OFS AS;
+        if (sn in lookup) {
+            out = out OFS "AN:" lookup[sn];
+        }
+        print out, "SP:" species_name;
+        next;
+    }
+    {
+        print;
+    }
+    # The extra parameters make result and i local to the function
+    function join(arr, sep,    result, i) {
+        result = arr[1];
+        for (i = 2; i <= length(arr); i++) {
+            result = result sep arr[i];
+        }
+        return result;
+    }
+    ' $report $dict > $filename_header
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        GNU Awk: \$(echo \$(awk --version 2>&1) | grep -i awk | sed 's/GNU Awk //; s/,.*//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/ncbi_download.nf b/modules/local/ncbi_download.nf
index ae63e52..c893c67 100644
--- a/modules/local/ncbi_download.nf
+++ b/modules/local/ncbi_download.nf
@@ -19,6 +19,7 @@ process NCBI_DOWNLOAD {
     tuple val(meta), path(filename_assembly_report), emit: assembly_report
     tuple val(meta), path(filename_assembly_stats) , emit: assembly_stats
     tuple val(meta), path(filename_accession)      , emit: accession
+    tuple val(meta), path(filename_source)         , emit: source
     path "versions.yml"                            , emit: versions

     when:
@@ -43,6 +44,7 @@ process NCBI_DOWNLOAD {
     filename_assembly_stats = "${prefix}.assembly_stats.txt"
     filename_fasta          = "${prefix}.masked.ncbi.fa"    // NOTE: this channel eventually sees ".masked.ncbi" being added to meta.id
     filename_accession      = "ACCESSION"
+    filename_source         = "SOURCE"    // store URL

     """
     wget ${ftp_path}/${remote_filename_stem}_assembly_report.txt
@@ -56,6 +58,7 @@ process NCBI_DOWNLOAD {
     mv ${remote_filename_stem}_assembly_report.txt ${filename_assembly_report}
     mv ${remote_filename_stem}_assembly_stats.txt ${filename_assembly_stats}
     echo "${assembly_accession}" > ${filename_accession}
+    echo "${ftp_path}/${remote_filename_stem}_genomic.fna.gz" > ${filename_source}
     zcat ${remote_filename_stem}_genomic.fna.gz > ${filename_fasta}

     cat <<-END_VERSIONS > versions.yml
diff --git a/subworkflows/local/download_genome.nf b/subworkflows/local/download_genome.nf
index dd4da55..9771681 100644
--- a/subworkflows/local/download_genome.nf
+++ b/subworkflows/local/download_genome.nf
@@ -15,7 +15,12 @@ workflow DOWNLOAD_GENOME {
     main:
     ch_versions = Channel.empty()
+    // Download assembly
     ch_masked_fasta = NCBI_DOWNLOAD ( assembly_params ).fasta
+
+    // Capture the assembly report; it is parsed later to build the SAM header template
+    ch_assembly_report = NCBI_DOWNLOAD.out.assembly_report
+
     ch_versions = ch_versions.mix(NCBI_DOWNLOAD.out.versions.first())

     // Fix meta.id
     ch_masked_fasta_id = ch_masked_fasta.map { [it[0] + [id: it[0]["id"] + ".masked.ncbi"], it[1]] }
@@ -28,5 +33,7 @@ workflow DOWNLOAD_GENOME {
     emit:
     fasta_unmasked  = ch_unmasked_fasta          // path: genome.unmasked.fa
     fasta_masked    = ch_masked_fasta_id         // path: genome.masked.ncbi.fa
+    assembly_report = ch_assembly_report         // path: genome.assembly_report.txt
+    source          = NCBI_DOWNLOAD.out.source   // path: SOURCE (contains the download URL)
     versions        = ch_versions.ifEmpty(null)  // channel: [ versions.yml ]
 }
diff --git a/subworkflows/local/prepare_header.nf b/subworkflows/local/prepare_header.nf
new file mode 100644
index 0000000..38bec86
--- /dev/null
+++ b/subworkflows/local/prepare_header.nf
@@ -0,0 +1,32 @@
+// Create a SAM header template from the NCBI assembly report and the SAMtools .dict
+include { BUILD_SAM_HEADER } from '../../modules/local/build_sam_header'
+
+workflow PREPARE_HEADER {
+
+    take:
+    dict       // file: /path/to/genome.dict
+    report     // file: /path/to/genome.assembly_report.txt
+    source     // file: /path/to/SOURCE (file containing the download URL)
+
+    main:
+    ch_versions = Channel.empty()
+
+    // The meta maps differ, so join the channels by meta.id
+    dict_mapped   = dict.map   { meta, path -> [meta.id, meta, path] }
+    report_mapped = report.map { meta, path -> [meta.id, path] }
+    source_mapped = source.map { meta, path -> [meta.id, path] }
+
+    joined = dict_mapped
+        | join(report_mapped)
+        | join(source_mapped)
+        | map { it[1..-1] }    // remove the leading meta.id
+
+    // Build the header template
+    ch_header = BUILD_SAM_HEADER(joined).header
+
+    ch_versions = ch_versions.mix(BUILD_SAM_HEADER.out.versions.first())
+
+    emit:
+    header   = ch_header                     // path: genome.header.sam
+    versions = ch_versions.ifEmpty(null)     // channel: [ versions.yml ]
+}
diff --git a/workflows/insdcdownload.nf b/workflows/insdcdownload.nf
index 67793b7..fd542f9 100644
--- a/workflows/insdcdownload.nf
+++ b/workflows/insdcdownload.nf
@@ -18,11 +18,12 @@ WorkflowInsdcdownload.initialise(params, log)
 //
 // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
 //
-include { DOWNLOAD_GENOME } from '../subworkflows/local/download_genome'
-include { PARAMS_CHECK } from '../subworkflows/local/params_check'
-include { PREPARE_FASTA as PREPARE_UNMASKED_FASTA } from '../subworkflows/local/prepare_fasta'
-include { PREPARE_FASTA as PREPARE_REPEAT_MASKED_FASTA } from '../subworkflows/local/prepare_fasta'
-include { PREPARE_REPEATS } from '../subworkflows/local/prepare_repeats'
+include { DOWNLOAD_GENOME                               } from '../subworkflows/local/download_genome'
+include { PARAMS_CHECK                                  } from '../subworkflows/local/params_check'
+include { PREPARE_FASTA as PREPARE_UNMASKED_FASTA       } from '../subworkflows/local/prepare_fasta'
+include { PREPARE_FASTA as PREPARE_REPEAT_MASKED_FASTA  } from '../subworkflows/local/prepare_fasta'
+include { PREPARE_HEADER as PREPARE_UNMASKED_HEADER     } from '../subworkflows/local/prepare_header'
+include { PREPARE_REPEATS                               } from '../subworkflows/local/prepare_repeats'

 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -69,11 +70,20 @@ workflow INSDCDOWNLOAD {
     )
     ch_versions = ch_versions.mix(PREPARE_UNMASKED_FASTA.out.versions)

+    // Header for unmasked fasta
+    PREPARE_UNMASKED_HEADER (
+        PREPARE_UNMASKED_FASTA.out.dict,
+        DOWNLOAD_GENOME.out.assembly_report,
+        DOWNLOAD_GENOME.out.source
+    )
+    ch_versions = ch_versions.mix(PREPARE_UNMASKED_HEADER.out.versions)
+
     // Preparation of repeat-masking files
     PREPARE_REPEAT_MASKED_FASTA (
         DOWNLOAD_GENOME.out.fasta_masked
     )
     ch_versions = ch_versions.mix(PREPARE_REPEAT_MASKED_FASTA.out.versions)
+
     PREPARE_REPEATS (
         PREPARE_REPEAT_MASKED_FASTA.out.fasta_gz
     )
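Note for reviewers: a minimal sketch of what BUILD_SAM_HEADER produces, using hypothetical values (the accessions, length, checksum, URL and species below are invented for illustration; fields are tab-separated in the real files but shown space-separated here). For an assembled molecule, the assembly-report row maps the GenBank accession (column 5) to the assigned-molecule name (column 3), which becomes the AN: tag; UR: is rewritten to the SOURCE URL, and AS:/SP: come from the report's header lines:

    # assembly_report.txt data row (hypothetical)
    1  assembled-molecule  1  Chromosome  CM000001.1  =  NC_000001.1  Primary Assembly  248956422  chr1

    # @SQ line from the samtools .dict (hypothetical)
    @SQ  SN:CM000001.1  LN:248956422  M5:abc123  UR:file:/tmp/genome.unmasked.fa

    # corresponding line of the emitted header template
    @SQ  SN:CM000001.1  LN:248956422  M5:abc123  UR:https://ftp.ncbi.nlm.nih.gov/.../GCA_000000000.1_asm_genomic.fna.gz  AS:GCA_000000000.1  AN:1  SP:Homo sapiens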
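And a sketch of the meta.id join inside PREPARE_HEADER, with hypothetical channel contents (the id and file names are invented for illustration). Each channel is first keyed by meta.id, joined pairwise, and the key is then dropped so the tuple matches BUILD_SAM_HEADER's input declaration:

    // dict:    [ [id:'GCA_1'], GCA_1.dict ]        -> keyed: [ 'GCA_1', [id:'GCA_1'], GCA_1.dict ]
    // report:  [ [id:'GCA_1'], GCA_1.report.txt ]  -> keyed: [ 'GCA_1', GCA_1.report.txt ]
    // source:  [ [id:'GCA_1'], SOURCE ]            -> keyed: [ 'GCA_1', SOURCE ]
    //
    // after join(report_mapped) | join(source_mapped):
    //          [ 'GCA_1', [id:'GCA_1'], GCA_1.dict, GCA_1.report.txt, SOURCE ]
    // after map { it[1..-1] }:
    //          [ [id:'GCA_1'], GCA_1.dict, GCA_1.report.txt, SOURCE ]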