Merge pull request #20 from tkchafin/samtools_reheader

Formatting custom SAM header
sanger-tol · Jun 26, 2024 · 2bea0fc · 2bea0fc
2 parents 8976796 + 822760d
commit 2bea0fc
Show file tree

Hide file tree

Showing 7 changed files with 186 additions and 9 deletions.
diff --git a/conf/modules.config b/conf/modules.config
@@ -17,7 +17,12 @@ process {
         publishDir = [
             path: { "${meta.outdir}/assembly" },
             mode: 'copy',
-            saveAs: { filename -> filename.endsWith('assembly_report.txt') || filename.endsWith('assembly_stats.txt') || filename.endsWith("ACCESSION") ? filename : null }
+            saveAs: { filename ->
+                filename.endsWith('assembly_report.txt') ||
+                filename.endsWith('assembly_stats.txt') ||
+                filename.endsWith("ACCESSION") ||
+                filename.endsWith("SOURCE") ? filename : null
+            }
         ]
     }
 
@@ -28,6 +33,13 @@ process {
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
     }
+    withName: '.*:.*:PREPARE_UNMASKED_HEADER:.*' {
+        publishDir = [
+            path: { "${meta.outdir}/assembly" },
+            mode: 'copy',
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
 
     withName: '.*:.*:(PREPARE_REPEAT_MASKED_FASTA:.*|PREPARE_REPEATS:TABIX_.*)' {
         publishDir = [

diff --git a/docs/usage.md b/docs/usage.md
@@ -48,7 +48,7 @@ A samplesheet may contain:
 - only one row per assembly
 
 All samplesheet columns correspond exactly to their corresponding command-line parameter,
-except `outdir` which overrides or complements `--oudir`.
+except `outdir` which overrides or complements `--outdir`.
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
 
@@ -105,7 +105,7 @@ First, go to the [sanger-tol/insdcdownload releases page](https://github.com/san
 
 This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. For example, at the bottom of the MultiQC reports.
 
-To further assist in reproducbility, you can use share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter.
+To further assist in reproducibility, you can use share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter.
 
 > 💡 If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles.
 
@@ -126,7 +126,7 @@ The pipeline also dynamically loads configurations from [https://github.com/nf-c
 Note that multiple profiles can be loaded, for example: `-profile test,docker` - the order of arguments is important!
 They are loaded in sequence, so later profiles can overwrite earlier profiles.
 
-If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended, since it can lead to different results on different machines dependent on the computer enviroment.
+If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended, since it can lead to different results on different machines dependent on the computer environment.
 
 - `docker`
   - A generic configuration profile to be used with [Docker](https://docker.com/)

diff --git a/modules/local/build_sam_header.nf b/modules/local/build_sam_header.nf
@@ -0,0 +1,96 @@
+// Module that parses an NCBI assembly and assembly report and outputs
+// a SAM header template
+process BUILD_SAM_HEADER {
+    tag "$genome"
+    label 'process_single'
+
+    conda "conda-forge::gawk=5.1.0"
+    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/gawk:5.1.0' :
+        'biocontainers/gawk:5.1.0'}"
+
+    input:
+    tuple val(meta), path(dict), path(report), path(source)
+
+    output:
+    tuple val(meta), path(filename_header), emit: header
+    path "versions.yml", emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    filename_header = "${prefix}.header.sam"
+
+    // Use the supplied speciesRegex or default if not provided
+    def speciesRegex = task.ext.speciesRegex ?: '# Organism name:\s*([^\\(]*)\s*\\(.*\\)'
+
+    """
+    sourcePath=\$(cat $source | tr -d '\\n')
+    genBankAccession=\$(awk '/^# GenBank assembly accession:/ { gsub("\\r", ""); print \$NF }' $report)
+    awk -v species_regex='$speciesRegex' -v genBankAccession=\$genBankAccession -v sourcePath=\$sourcePath '
+    BEGIN {
+        OFS = "\\t";
+        IFS = "\\t";
+        AS = "AS:" genBankAccession;
+        species_name = "";
+    }
+    NR == FNR {
+        if (\$0 ~ /^# Organism name:/) {
+            match(\$0, species_regex, arr);
+            species_name = arr[1];
+        }
+        if (\$0 !~ /^#/) {
+            split(\$0, fields, "\\t");
+            lookup[fields[5]] = fields[3];
+        }
+        next;
+    }
+    /^@HD/ {
+        print;
+        next;
+    }
+    /^@SQ/ {
+        split(\$0, fields, "\\t");
+        sn = "";
+        for (i in fields) {
+            if (fields[i] ~ /^SN:/) {
+                split(fields[i], sn_field, ":");
+                sn = sn_field[2];
+            }
+            if (fields[i] ~ /^UR:/) {
+                fields[i] = "UR:" sourcePath;
+            }
+        }
+        if (sn in lookup) {
+            new_field = "AM:" lookup[sn];
+        } else {
+            new_field = "AM:na";
+        }
+        new_sp = "SP:" species_name;
+        print join(fields, OFS), AS, new_field, new_sp;
+        next;
+    }
+    {
+        print;
+    }
+    function join(arr, sep) {
+        result = arr[1];
+        for (i = 2; i <= length(arr); i++) {
+            result = result sep arr[i];
+        }
+        return result;
+    }
+    ' $report $dict > $filename_header
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        GNU Awk: \$(echo \$(awk --version 2>&1) | grep -i awk | sed 's/GNU Awk //; s/,.*//')
+    END_VERSIONS
+    """
+}
+
+
+
+
diff --git a/modules/local/ncbi_download.nf b/modules/local/ncbi_download.nf
@@ -19,6 +19,7 @@ process NCBI_DOWNLOAD {
     tuple val(meta), path(filename_assembly_report), emit: assembly_report
     tuple val(meta), path(filename_assembly_stats) , emit: assembly_stats
     tuple val(meta), path(filename_accession)      , emit: accession
+    tuple val(meta), path(filename_source)         , emit: source
     path  "versions.yml"                           , emit: versions
 
     when:
@@ -43,6 +44,7 @@ process NCBI_DOWNLOAD {
     filename_assembly_stats = "${prefix}.assembly_stats.txt"
     filename_fasta = "${prefix}.masked.ncbi.fa"  // NOTE: this channel eventually sees ".masked.ncbi" being added to meta.id
     filename_accession = "ACCESSION"
+    filename_source = "SOURCE"  // store URL
 
     """
     wget ${ftp_path}/${remote_filename_stem}_assembly_report.txt
@@ -56,6 +58,7 @@ process NCBI_DOWNLOAD {
     mv ${remote_filename_stem}_assembly_report.txt ${filename_assembly_report}
     mv ${remote_filename_stem}_assembly_stats.txt  ${filename_assembly_stats}
     echo "${assembly_accession}"                 > ${filename_accession}
+    echo "${ftp_path}"                           > ${filename_source}
     zcat ${remote_filename_stem}_genomic.fna.gz  > ${filename_fasta}
 
     cat <<-END_VERSIONS > versions.yml

diff --git a/subworkflows/local/download_genome.nf b/subworkflows/local/download_genome.nf
@@ -15,7 +15,12 @@ workflow DOWNLOAD_GENOME {
     main:
     ch_versions = Channel.empty()
 
+    // Download assembly
     ch_masked_fasta     = NCBI_DOWNLOAD ( assembly_params ).fasta
+
+    // Parse assembly to build header template
+    ch_assembly_report  = NCBI_DOWNLOAD.out.assembly_report
+
     ch_versions         = ch_versions.mix(NCBI_DOWNLOAD.out.versions.first())
     // Fix meta.id
     ch_masked_fasta_id  = ch_masked_fasta.map { [it[0] + [id: it[0]["id"] + ".masked.ncbi"], it[1]] }
@@ -28,5 +33,7 @@ workflow DOWNLOAD_GENOME {
     emit:
     fasta_unmasked  = ch_unmasked_fasta         // path: genome.unmasked.fa
     fasta_masked    = ch_masked_fasta_id        // path: genome.masked.ncbi.fa
+    assembly_report = ch_assembly_report        // path: genome.assembly_report.txt
+    source          = NCBI_DOWNLOAD.out.source  // path: SOURCE (contains URL)
     versions        = ch_versions.ifEmpty(null) // channel: [ versions.yml ]
 }
diff --git a/subworkflows/local/prepare_header.nf b/subworkflows/local/prepare_header.nf
@@ -0,0 +1,39 @@
+// Create a SAM header template from NCBI assembly report and SAMtools .dict
+include { BUILD_SAM_HEADER } from '../../modules/local/build_sam_header'
+
+workflow PREPARE_HEADER {
+
+    take:
+    dict    // file: /path/to/genome.dict
+    report  // file: /path/to/genome.assembly_report.txt
+    source  // file: /path/to/SOURCE (ftp path as string)
+
+    main:
+    ch_versions = Channel.empty()
+
+    // Normalize the dict ID by removing .masked.ncbi if present and join
+    dict_mapped = dict.map { meta, path ->
+        def id = meta.id.replace('.masked.ncbi', '')
+        [id, [meta, path]]
+    }
+    report_mapped = report.map { meta, path -> [meta.id, path] }
+    source_mapped = source.map { meta, path -> [meta.id, path] }
+
+    joined = dict_mapped.join(report_mapped).join(source_mapped)
+
+    // Create input tuple with original meta.id
+    formatted_joined = joined.map { id, dict_tuple, report_path, source_path ->
+        def original_meta = dict_tuple[0] // Get the original ID with .masked.ncbi if present
+        def dict_path = dict_tuple[1]
+        return [original_meta, dict_path, report_path, source_path]
+    }
+
+    // Get header template
+    ch_header = BUILD_SAM_HEADER(formatted_joined).header
+
+    ch_versions = ch_versions.mix(BUILD_SAM_HEADER.out.versions.first())
+
+    emit:
+    header = ch_header                   // path: genome.header.sam
+    versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ]
+}
diff --git a/workflows/insdcdownload.nf b/workflows/insdcdownload.nf
@@ -18,11 +18,13 @@ WorkflowInsdcdownload.initialise(params, log)
 //
 // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
 //
-include { DOWNLOAD_GENOME                              } from '../subworkflows/local/download_genome'
-include { PARAMS_CHECK                                 } from '../subworkflows/local/params_check'
-include { PREPARE_FASTA as PREPARE_UNMASKED_FASTA      } from '../subworkflows/local/prepare_fasta'
-include { PREPARE_FASTA as PREPARE_REPEAT_MASKED_FASTA } from '../subworkflows/local/prepare_fasta'
-include { PREPARE_REPEATS                              } from '../subworkflows/local/prepare_repeats'
+include { DOWNLOAD_GENOME                                } from '../subworkflows/local/download_genome'
+include { PARAMS_CHECK                                   } from '../subworkflows/local/params_check'
+include { PREPARE_FASTA as PREPARE_UNMASKED_FASTA        } from '../subworkflows/local/prepare_fasta'
+include { PREPARE_FASTA as PREPARE_REPEAT_MASKED_FASTA   } from '../subworkflows/local/prepare_fasta'
+include { PREPARE_HEADER as PREPARE_UNMASKED_HEADER      } from '../subworkflows/local/prepare_header'
+include { PREPARE_HEADER as PREPARE_REPEAT_MASKED_HEADER } from '../subworkflows/local/prepare_header'
+include { PREPARE_REPEATS                                } from '../subworkflows/local/prepare_repeats'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -69,16 +71,34 @@ workflow INSDCDOWNLOAD {
     )
     ch_versions         = ch_versions.mix(PREPARE_UNMASKED_FASTA.out.versions)
 
+    // Header for unmasked fasta
+    PREPARE_UNMASKED_HEADER (
+        PREPARE_UNMASKED_FASTA.out.dict,
+        DOWNLOAD_GENOME.out.assembly_report,
+        DOWNLOAD_GENOME.out.source
+    )
+    ch_versions         = ch_versions.mix(PREPARE_UNMASKED_HEADER.out.versions)
+
     // Preparation of repeat-masking files
     PREPARE_REPEAT_MASKED_FASTA (
         DOWNLOAD_GENOME.out.fasta_masked
     )
     ch_versions         = ch_versions.mix(PREPARE_REPEAT_MASKED_FASTA.out.versions)
+
     PREPARE_REPEATS (
         PREPARE_REPEAT_MASKED_FASTA.out.fasta_gz
     )
     ch_versions         = ch_versions.mix(PREPARE_REPEATS.out.versions)
 
+    // Header for masked fasta
+    PREPARE_REPEAT_MASKED_HEADER (
+        PREPARE_REPEAT_MASKED_FASTA.out.dict,
+        DOWNLOAD_GENOME.out.assembly_report,
+        DOWNLOAD_GENOME.out.source
+    )
+    ch_versions         = ch_versions.mix(PREPARE_REPEAT_MASKED_HEADER.out.versions)
+
+
     CUSTOM_DUMPSOFTWAREVERSIONS (
         ch_versions.unique().collectFile(name: 'collated_versions.yml')
     )