-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #20 from tkchafin/samtools_reheader
Formatting custom SAM header
- Loading branch information
Showing
7 changed files
with
186 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
// Module that parses an NCBI assembly and assembly report and outputs | ||
// a SAM header template | ||
process BUILD_SAM_HEADER {
    // Tag tasks with the sample ID: `meta` is the only identifier among the declared inputs
    tag "$meta.id"
    label 'process_single'

    conda "conda-forge::gawk=5.1.0"
    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/gawk:5.1.0' :
        'biocontainers/gawk:5.1.0'}"

    input:
    // meta   : map carrying at least `id` (used for the tag and the output prefix)
    // dict   : sequence dictionary whose @HD/@SQ lines are rewritten
    // report : assembly report (tab-separated, '#'-prefixed metadata lines)
    // source : text file whose content (newlines removed) becomes the UR: value
    tuple val(meta), path(dict), path(report), path(source)

    output:
    tuple val(meta), path(filename_header), emit: header
    path "versions.yml"                   , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def prefix = task.ext.prefix ?: "${meta.id}"
    // Deliberately not `def`: the output block resolves this name
    filename_header = "${prefix}.header.sam"

    // Use the supplied speciesRegex or default if not provided.
    // NOTE(review): the literal is kept exactly as it was; confirm how `\s` survives
    // Groovy escaping and the escape processing awk applies to -v assignments.
    def speciesRegex = task.ext.speciesRegex ?: '# Organism name:\s*([^\\(]*)\s*\\(.*\\)'

    """
    # Source location written into every UR: field (newlines stripped)
    sourcePath=\$(tr -d '\\n' < $source)

    # First GenBank accession found in the report, carriage returns removed
    genBankAccession=\$(awk '/^# GenBank assembly accession:/ { gsub("\\r", ""); print \$NF; exit }' $report)

    awk -v species_regex='$speciesRegex' -v genBankAccession="\$genBankAccession" -v sourcePath="\$sourcePath" '
        # Join arr[1..n] with sep; i, n and result are declared as extra
        # parameters so that they stay local to the function.
        function join(arr, sep,    i, n, result) {
            n = length(arr);
            result = arr[1];
            for (i = 2; i <= n; i++) {
                result = result sep arr[i];
            }
            return result;
        }
        BEGIN {
            OFS = "\\t";
            FS = "\\t";
            AS = "AS:" genBankAccession;
            species_name = "";
        }
        # First file (assembly report): collect the species name and the
        # column 5 -> column 3 lookup table from the non-comment rows.
        NR == FNR {
            if (\$0 ~ /^# Organism name:/) {
                match(\$0, species_regex, arr);
                species_name = arr[1];
                # The capture group can swallow the blanks that precede the
                # opening parenthesis; strip them so the SP value is clean.
                gsub(/^[[:space:]]+|[[:space:]]+\$/, "", species_name);
            }
            if (\$0 !~ /^#/) {
                split(\$0, fields, "\\t");
                lookup[fields[5]] = fields[3];
            }
            next;
        }
        # Second file (dict): the @HD line is passed through untouched
        /^@HD/ {
            print;
            next;
        }
        # @SQ lines: replace UR, then append the AS, AM and SP tags
        /^@SQ/ {
            split(\$0, fields, "\\t");
            sn = "";
            for (i in fields) {
                if (fields[i] ~ /^SN:/) {
                    split(fields[i], sn_field, ":");
                    sn = sn_field[2];
                }
                if (fields[i] ~ /^UR:/) {
                    fields[i] = "UR:" sourcePath;
                }
            }
            if (sn in lookup) {
                new_field = "AM:" lookup[sn];
            } else {
                new_field = "AM:na";
            }
            new_sp = "SP:" species_name;
            print join(fields, OFS), AS, new_field, new_sp;
            next;
        }
        # Any other line is copied verbatim
        {
            print;
        }
    ' $report $dict > $filename_header

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        GNU Awk: \$(echo \$(awk --version 2>&1) | grep -i awk | sed 's/GNU Awk //; s/,.*//')
    END_VERSIONS
    """
}
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
// Create a SAM header template from NCBI assembly report and SAMtools .dict | ||
include { BUILD_SAM_HEADER } from '../../modules/local/build_sam_header' | ||
|
||
workflow PREPARE_HEADER {

    take:
    dict    // channel: [ meta, /path/to/genome.dict ]
    report  // channel: [ meta, /path/to/genome.assembly_report.txt ]
    source  // channel: [ meta, /path/to/SOURCE ] (ftp path as string)

    main:
    ch_versions = Channel.empty()

    // Key every channel by ID so they can be joined. The dict ID has any
    // '.masked.ncbi' part removed for matching, but its original meta map
    // travels along next to the dict file.
    ch_dict_keyed   = dict.map   { meta, dict_file   -> [ meta.id.replace('.masked.ncbi', ''), [ meta, dict_file ] ] }
    ch_report_keyed = report.map { meta, report_file -> [ meta.id, report_file ] }
    ch_source_keyed = source.map { meta, source_file -> [ meta.id, source_file ] }

    // Join on the shared key, then drop the key and restore the original
    // (un-normalised) meta as the first element of the process input tuple.
    ch_header_input = ch_dict_keyed
        .join(ch_report_keyed)
        .join(ch_source_keyed)
        .map { id, dict_pair, report_file, source_file ->
            def (meta, dict_file) = dict_pair
            [ meta, dict_file, report_file, source_file ]
        }

    // Build the header template
    BUILD_SAM_HEADER(ch_header_input)
    ch_header   = BUILD_SAM_HEADER.out.header
    ch_versions = ch_versions.mix(BUILD_SAM_HEADER.out.versions.first())

    emit:
    header   = ch_header                    // path: genome.header.sam
    versions = ch_versions.ifEmpty(null)    // channel: [ versions.yml ]
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters