From 45de788c5d25833c974f1acc392d464d92ce15cc Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Fri, 17 Nov 2023 12:37:35 +0000 Subject: [PATCH 01/33] copy align_pacbio subworkflow from readmapping pipeline dev branch, 093b313e29ed68480d81d796cc1609536518ee5a and install all the related modules. --- modules.json | 40 ++++++++++ modules/local/pacbio_filter.nf | 30 +++++++ modules/nf-core/blast/blastn/environment.yml | 7 ++ modules/nf-core/blast/blastn/main.nf | 57 ++++++++++++++ modules/nf-core/blast/blastn/meta.yml | 55 +++++++++++++ .../nf-core/blast/blastn/tests/main.nf.test | 71 +++++++++++++++++ .../blast/blastn/tests/nextflow.config | 5 ++ modules/nf-core/blast/blastn/tests/tags.yml | 2 + modules/nf-core/gunzip/environment.yml | 7 ++ modules/nf-core/gunzip/main.nf | 48 ++++++++++++ modules/nf-core/gunzip/meta.yml | 39 ++++++++++ modules/nf-core/gunzip/tests/main.nf.test | 35 +++++++++ .../nf-core/gunzip/tests/main.nf.test.snap | 31 ++++++++ modules/nf-core/gunzip/tests/tags.yml | 2 + .../nf-core/minimap2/align/environment.yml | 8 ++ modules/nf-core/minimap2/align/main.nf | 48 ++++++++++++ modules/nf-core/minimap2/align/meta.yml | 75 ++++++++++++++++++ .../nf-core/samtools/collate/environment.yml | 7 ++ modules/nf-core/samtools/collate/main.nf | 46 +++++++++++ modules/nf-core/samtools/collate/meta.yml | 43 ++++++++++ .../nf-core/samtools/fasta/environment.yml | 7 ++ modules/nf-core/samtools/fasta/main.nf | 44 +++++++++++ modules/nf-core/samtools/fasta/meta.yml | 60 ++++++++++++++ .../nf-core/samtools/flagstat/environment.yml | 7 ++ modules/nf-core/samtools/flagstat/main.nf | 46 +++++++++++ modules/nf-core/samtools/flagstat/meta.yml | 51 ++++++++++++ .../samtools/flagstat/tests/main.nf.test | 35 +++++++++ .../samtools/flagstat/tests/main.nf.test.snap | 16 ++++ .../nf-core/samtools/flagstat/tests/tags.yml | 2 + .../nf-core/samtools/idxstats/environment.yml | 7 ++ modules/nf-core/samtools/idxstats/main.nf | 48 ++++++++++++ modules/nf-core/samtools/idxstats/meta.yml 
| 52 +++++++++++++ .../samtools/idxstats/tests/main.nf.test | 35 +++++++++ .../samtools/idxstats/tests/main.nf.test.snap | 16 ++++ .../nf-core/samtools/idxstats/tests/tags.yml | 2 + .../nf-core/samtools/stats/environment.yml | 7 ++ modules/nf-core/samtools/stats/main.nf | 49 ++++++++++++ modules/nf-core/samtools/stats/meta.yml | 63 +++++++++++++++ .../nf-core/samtools/stats/tests/main.nf.test | 78 +++++++++++++++++++ .../samtools/stats/tests/main.nf.test.snap | 64 +++++++++++++++ modules/nf-core/samtools/stats/tests/tags.yml | 2 + subworkflows/local/align_pacbio.nf | 70 +++++++++++++++++ subworkflows/local/convert_stats.nf | 54 +++++++++++++ subworkflows/local/filter_pacbio.nf | 77 ++++++++++++++++++ 44 files changed, 1548 insertions(+) create mode 100644 modules/local/pacbio_filter.nf create mode 100644 modules/nf-core/blast/blastn/environment.yml create mode 100644 modules/nf-core/blast/blastn/main.nf create mode 100644 modules/nf-core/blast/blastn/meta.yml create mode 100644 modules/nf-core/blast/blastn/tests/main.nf.test create mode 100644 modules/nf-core/blast/blastn/tests/nextflow.config create mode 100644 modules/nf-core/blast/blastn/tests/tags.yml create mode 100644 modules/nf-core/gunzip/environment.yml create mode 100644 modules/nf-core/gunzip/main.nf create mode 100644 modules/nf-core/gunzip/meta.yml create mode 100644 modules/nf-core/gunzip/tests/main.nf.test create mode 100644 modules/nf-core/gunzip/tests/main.nf.test.snap create mode 100644 modules/nf-core/gunzip/tests/tags.yml create mode 100644 modules/nf-core/minimap2/align/environment.yml create mode 100644 modules/nf-core/minimap2/align/main.nf create mode 100644 modules/nf-core/minimap2/align/meta.yml create mode 100644 modules/nf-core/samtools/collate/environment.yml create mode 100644 modules/nf-core/samtools/collate/main.nf create mode 100644 modules/nf-core/samtools/collate/meta.yml create mode 100644 modules/nf-core/samtools/fasta/environment.yml create mode 100644 
modules/nf-core/samtools/fasta/main.nf create mode 100644 modules/nf-core/samtools/fasta/meta.yml create mode 100644 modules/nf-core/samtools/flagstat/environment.yml create mode 100644 modules/nf-core/samtools/flagstat/main.nf create mode 100644 modules/nf-core/samtools/flagstat/meta.yml create mode 100644 modules/nf-core/samtools/flagstat/tests/main.nf.test create mode 100644 modules/nf-core/samtools/flagstat/tests/main.nf.test.snap create mode 100644 modules/nf-core/samtools/flagstat/tests/tags.yml create mode 100644 modules/nf-core/samtools/idxstats/environment.yml create mode 100644 modules/nf-core/samtools/idxstats/main.nf create mode 100644 modules/nf-core/samtools/idxstats/meta.yml create mode 100644 modules/nf-core/samtools/idxstats/tests/main.nf.test create mode 100644 modules/nf-core/samtools/idxstats/tests/main.nf.test.snap create mode 100644 modules/nf-core/samtools/idxstats/tests/tags.yml create mode 100644 modules/nf-core/samtools/stats/environment.yml create mode 100644 modules/nf-core/samtools/stats/main.nf create mode 100644 modules/nf-core/samtools/stats/meta.yml create mode 100644 modules/nf-core/samtools/stats/tests/main.nf.test create mode 100644 modules/nf-core/samtools/stats/tests/main.nf.test.snap create mode 100644 modules/nf-core/samtools/stats/tests/tags.yml create mode 100644 subworkflows/local/align_pacbio.nf create mode 100644 subworkflows/local/convert_stats.nf create mode 100644 subworkflows/local/filter_pacbio.nf diff --git a/modules.json b/modules.json index 1ab14cf..1e3636d 100644 --- a/modules.json +++ b/modules.json @@ -10,6 +10,11 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, + "blast/blastn": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, "cat/cat": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", @@ -25,11 +30,41 @@ "git_sha": "516189e968feb4ebdd9921806988b4c12b4ac2dc", "installed_by": 
["modules"] }, + "gunzip": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "minimap2/align": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "samtools/collate": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, "samtools/faidx": { "branch": "master", "git_sha": "fd742419940e01ba1c5ecb172c3e32ec840662fe", "installed_by": ["modules"] }, + "samtools/fasta": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "samtools/flagstat": { + "branch": "master", + "git_sha": "63e817de8c617131447192ab2c4e70b4ed4071f7", + "installed_by": ["modules"] + }, + "samtools/idxstats": { + "branch": "master", + "git_sha": "63e817de8c617131447192ab2c4e70b4ed4071f7", + "installed_by": ["modules"] + }, "samtools/merge": { "branch": "master", "git_sha": "e7ce60acc8a33fa17429e966364657a63016e870", @@ -41,6 +76,11 @@ "git_sha": "a0f7be95788366c1923171e358da7d049eb440f9", "installed_by": ["modules"] }, + "samtools/stats": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, "samtools/view": { "branch": "master", "git_sha": "3ffae3598260a99e8db3207dead9f73f87f90d1f", diff --git a/modules/local/pacbio_filter.nf b/modules/local/pacbio_filter.nf new file mode 100644 index 0000000..18dd11c --- /dev/null +++ b/modules/local/pacbio_filter.nf @@ -0,0 +1,30 @@ +process PACBIO_FILTER { + tag "$meta.id" + label 'process_single' + + conda "conda-forge::gawk=5.1.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/gawk:5.1.0' : + 'quay.io/biocontainers/gawk:5.1.0' }" + + input: + tuple val(meta), path(txt) + + output: + path("*.blocklist"), emit: list + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + pacbio_filter.sh $txt ${prefix}.blocklist + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + GNU Awk: \$(echo \$(awk --version 2>&1) | grep -i awk | sed 's/GNU Awk //; s/,.*//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/blast/blastn/environment.yml b/modules/nf-core/blast/blastn/environment.yml new file mode 100644 index 0000000..cb9b15d --- /dev/null +++ b/modules/nf-core/blast/blastn/environment.yml @@ -0,0 +1,7 @@ +name: blast_blastn +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::blast=2.14.1 diff --git a/modules/nf-core/blast/blastn/main.nf b/modules/nf-core/blast/blastn/main.nf new file mode 100644 index 0000000..e8b96ad --- /dev/null +++ b/modules/nf-core/blast/blastn/main.nf @@ -0,0 +1,57 @@ +process BLAST_BLASTN { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/blast:2.14.1--pl5321h6f7f691_0': + 'biocontainers/blast:2.14.1--pl5321h6f7f691_0' }" + + input: + tuple val(meta) , path(fasta) + tuple val(meta2), path(db) + + output: + tuple val(meta), path('*.txt'), emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def is_compressed = fasta.getExtension() == "gz" ? true : false + def fasta_name = is_compressed ? 
fasta.getBaseName() : fasta + + """ + if [ "${is_compressed}" == "true" ]; then + gzip -c -d ${fasta} > ${fasta_name} + fi + + DB=`find -L ./ -name "*.nin" | sed 's/\\.nin\$//'` + blastn \\ + -num_threads ${task.cpus} \\ + -db \$DB \\ + -query ${fasta_name} \\ + ${args} \\ + -out ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + blast: \$(blastn -version 2>&1 | sed 's/^.*blastn: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + blast: \$(blastn -version 2>&1 | sed 's/^.*blastn: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/blast/blastn/meta.yml b/modules/nf-core/blast/blastn/meta.yml new file mode 100644 index 0000000..a0d64dd --- /dev/null +++ b/modules/nf-core/blast/blastn/meta.yml @@ -0,0 +1,55 @@ +name: blast_blastn +description: Queries a BLAST DNA database +keywords: + - fasta + - blast + - blastn + - DNA sequence +tools: + - blast: + description: | + BLAST finds regions of similarity between biological sequences. + homepage: https://blast.ncbi.nlm.nih.gov/Blast.cgi + documentation: https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=Blastdocs + doi: 10.1016/S0022-2836(05)80360-2 + licence: ["US-Government-Work"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Input fasta file containing queries sequences + pattern: "*.{fa,fasta,fa.gz,fasta.gz}" + - meta2: + type: map + description: | + Groovy Map containing db information + e.g. [ id:'test2', single_end:false ] + - db: + type: directory + description: Directory containing the blast database + pattern: "*" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - txt: + type: file + description: File containing blastn hits + pattern: "*.txt" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" +maintainers: + - "@joseespinosa" + - "@drpatelh" + - "@vagkaratzas" diff --git a/modules/nf-core/blast/blastn/tests/main.nf.test b/modules/nf-core/blast/blastn/tests/main.nf.test new file mode 100644 index 0000000..1058c81 --- /dev/null +++ b/modules/nf-core/blast/blastn/tests/main.nf.test @@ -0,0 +1,71 @@ +nextflow_process { + + name "Test Process BLAST_BLASTN" + script "../main.nf" + process "BLAST_BLASTN" + config "./nextflow.config" + tag "modules" + tag "modules_nfcore" + tag "blast" + tag "blast/blastn" + + setup { + run("BLAST_MAKEBLASTDB") { + script "../../makeblastdb/main.nf" + process { + """ + input[0] = [ [id:'test2'], file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ] + """ + } + } + } + + test("Should search for nucleotide hits against a blast db") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [id:'test'], file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ] + input[1] = BLAST_MAKEBLASTDB.out.db + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.txt.get(0).get(1)).getText().contains("Query= MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate") }, + { assert process.out.versions } + ) + } + + } + + test("Should search for zipped nucleotide hits against a blast db") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [id:'test'], file(params.test_data['sarscov2']['genome']['genome_fasta_gz'], checkIfExists: true) ] + input[1] = BLAST_MAKEBLASTDB.out.db + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.txt.get(0).get(1)).getText().contains("Query= 
MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate") }, + { assert process.out.versions } + ) + } + + } + +} diff --git a/modules/nf-core/blast/blastn/tests/nextflow.config b/modules/nf-core/blast/blastn/tests/nextflow.config new file mode 100644 index 0000000..0899289 --- /dev/null +++ b/modules/nf-core/blast/blastn/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: BLAST_MAKEBLASTDB { + ext.args = '-dbtype nucl' + } +} diff --git a/modules/nf-core/blast/blastn/tests/tags.yml b/modules/nf-core/blast/blastn/tests/tags.yml new file mode 100644 index 0000000..b4588ab --- /dev/null +++ b/modules/nf-core/blast/blastn/tests/tags.yml @@ -0,0 +1,2 @@ +blast/blastn: + - modules/nf-core/blast/blastn/** diff --git a/modules/nf-core/gunzip/environment.yml b/modules/nf-core/gunzip/environment.yml new file mode 100644 index 0000000..25910b3 --- /dev/null +++ b/modules/nf-core/gunzip/environment.yml @@ -0,0 +1,7 @@ +name: gunzip +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::sed=4.7 diff --git a/modules/nf-core/gunzip/main.nf b/modules/nf-core/gunzip/main.nf new file mode 100644 index 0000000..468a6f2 --- /dev/null +++ b/modules/nf-core/gunzip/main.nf @@ -0,0 +1,48 @@ +process GUNZIP { + tag "$archive" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("$gunzip"), emit: gunzip + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + gunzip = archive.toString() - '.gz' + """ + # Not calling gunzip itself because it creates files + # with the original group ownership rather than the + # default one for that user / the work directory + gzip \\ + -cd \\ + $args \\ + $archive \\ + > $gunzip + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunzip: \$(echo \$(gunzip --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//') + END_VERSIONS + """ + + stub: + gunzip = archive.toString() - '.gz' + """ + touch $gunzip + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunzip: \$(echo \$(gunzip --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gunzip/meta.yml b/modules/nf-core/gunzip/meta.yml new file mode 100644 index 0000000..231034f --- /dev/null +++ b/modules/nf-core/gunzip/meta.yml @@ -0,0 +1,39 @@ +name: gunzip +description: Compresses and decompresses files. +keywords: + - gunzip + - compression + - decompression +tools: + - gunzip: + description: | + gzip is a file format and a software application used for file compression and decompression. + documentation: https://www.gnu.org/software/gzip/manual/gzip.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Optional groovy Map containing meta information + e.g. 
[ id:'test', single_end:false ] + - archive: + type: file + description: File to be compressed/uncompressed + pattern: "*.*" +output: + - gunzip: + type: file + description: Compressed/uncompressed file + pattern: "*.*" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@jfy133" +maintainers: + - "@joseespinosa" + - "@drpatelh" + - "@jfy133" diff --git a/modules/nf-core/gunzip/tests/main.nf.test b/modules/nf-core/gunzip/tests/main.nf.test new file mode 100644 index 0000000..d031792 --- /dev/null +++ b/modules/nf-core/gunzip/tests/main.nf.test @@ -0,0 +1,35 @@ +nextflow_process { + + name "Test Process GUNZIP" + script "../main.nf" + process "GUNZIP" + tag "gunzip" + tag "modules_nfcore" + tag "modules" + + test("Should run without failures") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [], + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/gunzip/tests/main.nf.test.snap b/modules/nf-core/gunzip/tests/main.nf.test.snap new file mode 100644 index 0000000..720fd9f --- /dev/null +++ b/modules/nf-core/gunzip/tests/main.nf.test.snap @@ -0,0 +1,31 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + [ + [ + + ], + "test_1.fastq:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "1": [ + "versions.yml:md5,54376d32aca20e937a4ec26dac228e84" + ], + "gunzip": [ + [ + [ + + ], + "test_1.fastq:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "versions": [ + "versions.yml:md5,54376d32aca20e937a4ec26dac228e84" + ] + } + ], + "timestamp": "2023-10-17T15:35:37.690477896" + } +} \ No newline at end of file diff --git a/modules/nf-core/gunzip/tests/tags.yml b/modules/nf-core/gunzip/tests/tags.yml new file mode 100644 index 
0000000..fd3f691 --- /dev/null +++ b/modules/nf-core/gunzip/tests/tags.yml @@ -0,0 +1,2 @@ +gunzip: + - modules/nf-core/gunzip/** diff --git a/modules/nf-core/minimap2/align/environment.yml b/modules/nf-core/minimap2/align/environment.yml new file mode 100644 index 0000000..60b9a8b --- /dev/null +++ b/modules/nf-core/minimap2/align/environment.yml @@ -0,0 +1,8 @@ +name: minimap2_align +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::minimap2=2.24 + - bioconda::samtools=1.14 diff --git a/modules/nf-core/minimap2/align/main.nf b/modules/nf-core/minimap2/align/main.nf new file mode 100644 index 0000000..fa3ae50 --- /dev/null +++ b/modules/nf-core/minimap2/align/main.nf @@ -0,0 +1,48 @@ +process MINIMAP2_ALIGN { + tag "$meta.id" + label 'process_medium' + + // Note: the versions here need to match the versions used in the mulled container below and minimap2/index + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' : + 'biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' }" + + input: + tuple val(meta), path(reads) + tuple val(meta2), path(reference) + val bam_format + val cigar_paf_format + val cigar_bam + + output: + tuple val(meta), path("*.paf"), optional: true, emit: paf + tuple val(meta), path("*.bam"), optional: true, emit: bam + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def bam_output = bam_format ? "-a | samtools sort | samtools view -@ ${task.cpus} -b -h -o ${prefix}.bam" : "-o ${prefix}.paf" + def cigar_paf = cigar_paf_format && !bam_format ? "-c" : '' + def set_cigar_bam = cigar_bam && bam_format ? 
"-L" : '' + """ + minimap2 \\ + $args \\ + -t $task.cpus \\ + "${reference ?: reads}" \\ + "$reads" \\ + $cigar_paf \\ + $set_cigar_bam \\ + $bam_output + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + minimap2: \$(minimap2 --version 2>&1) + END_VERSIONS + """ +} diff --git a/modules/nf-core/minimap2/align/meta.yml b/modules/nf-core/minimap2/align/meta.yml new file mode 100644 index 0000000..408522d --- /dev/null +++ b/modules/nf-core/minimap2/align/meta.yml @@ -0,0 +1,75 @@ +name: minimap2_align +description: A versatile pairwise aligner for genomic and spliced nucleotide sequences +keywords: + - align + - fasta + - fastq + - genome + - paf + - reference +tools: + - minimap2: + description: | + A versatile pairwise aligner for genomic and spliced nucleotide sequences. + homepage: https://github.com/lh3/minimap2 + documentation: https://github.com/lh3/minimap2#uguide + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FASTA or FASTQ files of size 1 and 2 for single-end + and paired-end data, respectively. + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test_ref'] + - reference: + type: file + description: | + Reference database in FASTA format. + - bam_format: + type: boolean + description: Specify that output should be in BAM format + - cigar_paf_format: + type: boolean + description: Specify that output CIGAR should be in PAF format + - cigar_bam: + type: boolean + description: | + Write CIGAR with >65535 ops at the CG tag. This is recommended when + doing XYZ (https://github.com/lh3/minimap2#working-with-65535-cigar-operations) +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - paf: + type: file + description: Alignment in PAF format + pattern: "*.paf" + - bam: + type: file + description: Alignment in BAM format + pattern: "*.bam" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@heuermh" + - "@sofstam" + - "@sateeshperi" + - "@jfy133" +maintainers: + - "@heuermh" + - "@sofstam" + - "@sateeshperi" + - "@jfy133" diff --git a/modules/nf-core/samtools/collate/environment.yml b/modules/nf-core/samtools/collate/environment.yml new file mode 100644 index 0000000..0fb861b --- /dev/null +++ b/modules/nf-core/samtools/collate/environment.yml @@ -0,0 +1,7 @@ +name: samtools_collate +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/collate/main.nf b/modules/nf-core/samtools/collate/main.nf new file mode 100644 index 0000000..38a4daf --- /dev/null +++ b/modules/nf-core/samtools/collate/main.nf @@ -0,0 +1,46 @@ +process SAMTOOLS_COLLATE { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0': + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input) + path fasta + + output: + tuple val(meta), path("*.bam"), emit: bam, optional: true + tuple val(meta), path("*.cram"), emit: cram, optional: true + tuple val(meta), path("*.sam"), emit: sam, optional: true + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? "--reference ${fasta}" : "" + def extension = args.contains("--output-fmt sam") ? "sam" : + args.contains("--output-fmt bam") ? "bam" : + args.contains("--output-fmt cram") ? 
"cram" : + "bam" + if ("$input" == "${prefix}.${extension}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + samtools \\ + collate \\ + $args \\ + ${reference} \\ + -@ $task.cpus \\ + -o ${prefix}.${extension} \\ + $input + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/collate/meta.yml b/modules/nf-core/samtools/collate/meta.yml new file mode 100644 index 0000000..3c79927 --- /dev/null +++ b/modules/nf-core/samtools/collate/meta.yml @@ -0,0 +1,43 @@ +name: "samtools_collate" +description: shuffles and groups reads together by their names +keywords: + - collate + - bam +tools: + - "samtools": + description: "Tools for dealing with SAM, BAM and CRAM files" + homepage: "http://www.htslib.org" + documentation: "https://www.htslib.org/doc/samtools-collate.html" + tool_dev_url: "https://github.com/samtools/samtools" + doi: "10.1093/bioinformatics/btp352" + licence: "['MIT']" +input: + # Only when we have meta + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +output: + #Only when we have meta + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - output: + type: file + description: Collated BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +authors: + - "@priyanka-surana" +maintainers: + - "@priyanka-surana" diff --git a/modules/nf-core/samtools/fasta/environment.yml b/modules/nf-core/samtools/fasta/environment.yml new file mode 100644 index 0000000..8a82f9e --- /dev/null +++ b/modules/nf-core/samtools/fasta/environment.yml @@ -0,0 +1,7 @@ +name: samtools_fasta +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/fasta/main.nf b/modules/nf-core/samtools/fasta/main.nf new file mode 100644 index 0000000..925ed62 --- /dev/null +++ b/modules/nf-core/samtools/fasta/main.nf @@ -0,0 +1,44 @@ +process SAMTOOLS_FASTA { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input) + val(interleave) + + output: + tuple val(meta), path("*_{1,2}.fasta.gz") , optional:true, emit: fasta + tuple val(meta), path("*_interleaved.fasta.gz"), optional:true, emit: interleaved + tuple val(meta), path("*_singleton.fasta.gz") , optional:true, emit: singleton + tuple val(meta), path("*_other.fasta.gz") , optional:true, emit: other + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def output = ( interleave && ! meta.single_end ) ? "> ${prefix}_interleaved.fasta.gz" : + meta.single_end ? 
"-1 ${prefix}_1.fasta.gz -s ${prefix}_singleton.fasta.gz" : + "-1 ${prefix}_1.fasta.gz -2 ${prefix}_2.fasta.gz -s ${prefix}_singleton.fasta.gz" + """ + samtools \\ + fasta \\ + $args \\ + --threads ${task.cpus-1} \\ + -0 ${prefix}_other.fasta.gz \\ + $input \\ + $output + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/fasta/meta.yml b/modules/nf-core/samtools/fasta/meta.yml new file mode 100644 index 0000000..1d07ea1 --- /dev/null +++ b/modules/nf-core/samtools/fasta/meta.yml @@ -0,0 +1,60 @@ +name: "samtools_fasta" +description: Converts a SAM/BAM/CRAM file to FASTA +keywords: + - bam + - sam + - cram + - fasta +tools: + - "samtools": + description: "Tools for dealing with SAM, BAM and CRAM files" + homepage: "http://www.htslib.org" + documentation: "https://www.htslib.org/doc/samtools-fasta.html" + tool_dev_url: "https://github.com/samtools/samtools" + doi: "10.1093/bioinformatics/btp352" + licence: "['MIT']" +input: + # Only when we have meta + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - interleave: + type: boolean + description: Set true for interleaved fasta files +output: + #Only when we have meta + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fasta: + type: file + description: Compressed FASTA file(s) with reads with either the READ1 or READ2 flag set in separate files. + pattern: "*_{1,2}.fasta.gz" + - interleaved: + type: file + description: Compressed FASTA file with reads with either the READ1 or READ2 flag set in a combined file. 
Needs collated input file. + pattern: "*_interleaved.fasta.gz" + - singleton: + type: file + description: Compressed FASTA file with singleton reads + pattern: "*_singleton.fasta.gz" + - other: + type: file + description: Compressed FASTA file with reads with either both READ1 and READ2 flags set or unset + pattern: "*_other.fasta.gz" +authors: + - "@priyanka-surana" +maintainers: + - "@priyanka-surana" diff --git a/modules/nf-core/samtools/flagstat/environment.yml b/modules/nf-core/samtools/flagstat/environment.yml new file mode 100644 index 0000000..22bdb5c --- /dev/null +++ b/modules/nf-core/samtools/flagstat/environment.yml @@ -0,0 +1,7 @@ +name: samtools_flagstat +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/flagstat/main.nf b/modules/nf-core/samtools/flagstat/main.nf new file mode 100644 index 0000000..9dee35a --- /dev/null +++ b/modules/nf-core/samtools/flagstat/main.nf @@ -0,0 +1,46 @@ +process SAMTOOLS_FLAGSTAT { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(bam), path(bai) + + output: + tuple val(meta), path("*.flagstat"), emit: flagstat + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + samtools \\ + flagstat \\ + --threads ${task.cpus} \\ + $bam \\ + > ${prefix}.flagstat + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.flagstat + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/flagstat/meta.yml b/modules/nf-core/samtools/flagstat/meta.yml new file mode 100644 index 0000000..9799135 --- /dev/null +++ b/modules/nf-core/samtools/flagstat/meta.yml @@ -0,0 +1,51 @@ +name: samtools_flagstat +description: Counts the number of alignments in a BAM/CRAM/SAM file for each FLAG type +keywords: + - stats + - mapping + - counts + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - bai: + type: file + description: Index for BAM/CRAM/SAM file + pattern: "*.{bai,crai,sai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - flagstat: + type: file + description: File containing samtools flagstat output + pattern: "*.{flagstat}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" +maintainers: + - "@drpatelh" diff --git a/modules/nf-core/samtools/flagstat/tests/main.nf.test b/modules/nf-core/samtools/flagstat/tests/main.nf.test new file mode 100644 index 0000000..c618de7 --- /dev/null +++ b/modules/nf-core/samtools/flagstat/tests/main.nf.test @@ -0,0 +1,35 @@ +nextflow_process { + + name "Test Process SAMTOOLS_FLAGSTAT" + script "../main.nf" + process "SAMTOOLS_FLAGSTAT" + tag "modules" + tag "modules_nfcore" + tag "samtools/flagstat" + + test("BAM") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.flagstat).match() }, + { assert path(process.out.versions.get(0)).getText().contains("samtools") } + ) + } + } +} diff --git a/modules/nf-core/samtools/flagstat/tests/main.nf.test.snap b/modules/nf-core/samtools/flagstat/tests/main.nf.test.snap new file mode 100644 index 0000000..880019f --- /dev/null +++ b/modules/nf-core/samtools/flagstat/tests/main.nf.test.snap @@ -0,0 +1,16 @@ +{ + "BAM": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + 
"test.flagstat:md5,4f7ffd1e6a5e85524d443209ac97d783" + ] + ] + ], + "timestamp": "2023-11-14T15:49:22.577133" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/flagstat/tests/tags.yml b/modules/nf-core/samtools/flagstat/tests/tags.yml new file mode 100644 index 0000000..2d2b725 --- /dev/null +++ b/modules/nf-core/samtools/flagstat/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/flagstat: + - modules/nf-core/samtools/flagstat/** diff --git a/modules/nf-core/samtools/idxstats/environment.yml b/modules/nf-core/samtools/idxstats/environment.yml new file mode 100644 index 0000000..89bd272 --- /dev/null +++ b/modules/nf-core/samtools/idxstats/environment.yml @@ -0,0 +1,7 @@ +name: samtools_idxstats +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/idxstats/main.nf b/modules/nf-core/samtools/idxstats/main.nf new file mode 100644 index 0000000..b22d084 --- /dev/null +++ b/modules/nf-core/samtools/idxstats/main.nf @@ -0,0 +1,48 @@ +process SAMTOOLS_IDXSTATS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(bam), path(bai) + + output: + tuple val(meta), path("*.idxstats"), emit: idxstats + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + samtools \\ + idxstats \\ + --threads ${task.cpus-1} \\ + $bam \\ + > ${prefix}.idxstats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}.idxstats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/idxstats/meta.yml b/modules/nf-core/samtools/idxstats/meta.yml new file mode 100644 index 0000000..344e92a --- /dev/null +++ b/modules/nf-core/samtools/idxstats/meta.yml @@ -0,0 +1,52 @@ +name: samtools_idxstats +description: Reports alignment summary statistics for a BAM/CRAM/SAM file +keywords: + - stats + - mapping + - counts + - chromosome + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - bai: + type: file + description: Index for BAM/CRAM/SAM file + pattern: "*.{bai,crai,sai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - idxstats: + type: file + description: File containing samtools idxstats output + pattern: "*.{idxstats}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" +maintainers: + - "@drpatelh" diff --git a/modules/nf-core/samtools/idxstats/tests/main.nf.test b/modules/nf-core/samtools/idxstats/tests/main.nf.test new file mode 100644 index 0000000..0174a9e --- /dev/null +++ b/modules/nf-core/samtools/idxstats/tests/main.nf.test @@ -0,0 +1,35 @@ +nextflow_process { + + name "Test Process SAMTOOLS_IDXSTATS" + script "../main.nf" + process "SAMTOOLS_IDXSTATS" + tag "modules" + tag "modules_nfcore" + tag "samtools/idxstats" + + test("BAM") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.idxstats).match() }, + { assert path(process.out.versions.get(0)).getText().contains("samtools") } + ) + } + } +} diff --git a/modules/nf-core/samtools/idxstats/tests/main.nf.test.snap b/modules/nf-core/samtools/idxstats/tests/main.nf.test.snap new file mode 100644 index 0000000..4c6c12b --- /dev/null +++ b/modules/nf-core/samtools/idxstats/tests/main.nf.test.snap @@ -0,0 +1,16 @@ +{ + "BAM": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + 
"test.idxstats:md5,df60a8c8d6621100d05178c93fb053a2" + ] + ] + ], + "timestamp": "2023-11-14T15:52:19.875194" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/idxstats/tests/tags.yml b/modules/nf-core/samtools/idxstats/tests/tags.yml new file mode 100644 index 0000000..d3057c6 --- /dev/null +++ b/modules/nf-core/samtools/idxstats/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/idxstats: + - modules/nf-core/samtools/idxstats/** diff --git a/modules/nf-core/samtools/stats/environment.yml b/modules/nf-core/samtools/stats/environment.yml new file mode 100644 index 0000000..ed4e896 --- /dev/null +++ b/modules/nf-core/samtools/stats/environment.yml @@ -0,0 +1,7 @@ +name: samtools_stats +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/stats/main.nf b/modules/nf-core/samtools/stats/main.nf new file mode 100644 index 0000000..07286ef --- /dev/null +++ b/modules/nf-core/samtools/stats/main.nf @@ -0,0 +1,49 @@ +process SAMTOOLS_STATS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input), path(input_index) + tuple val(meta2), path(fasta) + + output: + tuple val(meta), path("*.stats"), emit: stats + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? 
"--reference ${fasta}" : "" + """ + samtools \\ + stats \\ + --threads ${task.cpus} \\ + ${reference} \\ + ${input} \\ + > ${prefix}.stats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.stats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/stats/meta.yml b/modules/nf-core/samtools/stats/meta.yml new file mode 100644 index 0000000..735ff81 --- /dev/null +++ b/modules/nf-core/samtools/stats/meta.yml @@ -0,0 +1,63 @@ +name: samtools_stats +description: Produces comprehensive statistics from SAM/BAM/CRAM file +keywords: + - statistics + - counts + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM file from alignment + pattern: "*.{bam,cram}" + - input_index: + type: file + description: BAI/CRAI file from alignment + pattern: "*.{bai,crai}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'genome' ] + - fasta: + type: file + description: Reference file the CRAM was created with (optional) + pattern: "*.{fasta,fa}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - stats: + type: file + description: File containing samtools stats output + pattern: "*.{stats}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@FriederikeHanssen" + - "@ramprasadn" +maintainers: + - "@drpatelh" + - "@FriederikeHanssen" + - "@ramprasadn" diff --git a/modules/nf-core/samtools/stats/tests/main.nf.test b/modules/nf-core/samtools/stats/tests/main.nf.test new file mode 100644 index 0000000..e037132 --- /dev/null +++ b/modules/nf-core/samtools/stats/tests/main.nf.test @@ -0,0 +1,78 @@ +nextflow_process { + + name "Test Process SAMTOOLS_STATS" + script "../main.nf" + process "SAMTOOLS_STATS" + tag "modules" + tag "modules/nf-core" + tag "samtools" + tag "samtools/stats" + + test("SAMTOOLS STATS Should run without failures") { + + when { + params { + + outdir = "$outputDir" + } + process { + """ + // define inputs of the process here. 
+ input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExists: true) + + ] + input[1] = [[],[]] + """ + + } + } + + then { + assertAll( + {assert process.success}, + {assert snapshot(process.out).match()} + ) + } + + } + + test("SAMTOOLS CRAM Should run without failures") { + + when { + params { + + outdir = "$outputDir" + } + process { + """ + // define inputs of the process here + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_recalibrated_sorted_cram'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_recalibrated_sorted_cram_crai'], checkIfExists: true) + + ] + input[1] = [ + [ id:'genome' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + + + } + + then { + assertAll( + {assert process.success}, + {assert snapshot(process.out).match()} + ) + } + + } + + +} diff --git a/modules/nf-core/samtools/stats/tests/main.nf.test.snap b/modules/nf-core/samtools/stats/tests/main.nf.test.snap new file mode 100644 index 0000000..516b2b0 --- /dev/null +++ b/modules/nf-core/samtools/stats/tests/main.nf.test.snap @@ -0,0 +1,64 @@ +{ + "SAMTOOLS STATS Should run without failures": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,6e768486d5df0257351c5419a79f9c9b" + ] + ], + "1": [ + "versions.yml:md5,08035f3409d934d47a416150884bb0df" + ], + "stats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,6e768486d5df0257351c5419a79f9c9b" + ] + ], + "versions": [ + "versions.yml:md5,08035f3409d934d47a416150884bb0df" + ] + } + ], + "timestamp": "2023-10-18T12:12:42.998746" + }, + "SAMTOOLS CRAM Should run without failures": { + "content": [ + { + 
"0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,7c9ee5747793cceb9d6f4d733345641a" + ] + ], + "1": [ + "versions.yml:md5,08035f3409d934d47a416150884bb0df" + ], + "stats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,7c9ee5747793cceb9d6f4d733345641a" + ] + ], + "versions": [ + "versions.yml:md5,08035f3409d934d47a416150884bb0df" + ] + } + ], + "timestamp": "2023-10-18T12:13:30.747222" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/stats/tests/tags.yml b/modules/nf-core/samtools/stats/tests/tags.yml new file mode 100644 index 0000000..7c28e30 --- /dev/null +++ b/modules/nf-core/samtools/stats/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/stats: + - modules/nf-core/samtools/stats/** diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf new file mode 100644 index 0000000..767563b --- /dev/null +++ b/subworkflows/local/align_pacbio.nf @@ -0,0 +1,70 @@ +// +// Align PacBio read files against the genome +// + +include { FILTER_PACBIO } from '../../subworkflows/local/filter_pacbio' +include { MINIMAP2_ALIGN } from '../../modules/nf-core/minimap2/align/main' +include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' +include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main' +include { CONVERT_STATS } from '../../subworkflows/local/convert_stats' + + +workflow ALIGN_PACBIO { + take: + fasta // channel: [ val(meta), /path/to/fasta ] + reads // channel: [ val(meta), /path/to/datafile ] + db // channel: /path/to/vector_db + + + main: + ch_versions = Channel.empty() + + + // Filter BAM and output as FASTQ + FILTER_PACBIO ( reads, db ) + ch_versions = ch_versions.mix ( FILTER_PACBIO.out.versions ) + + + // Align Fastq to Genome + fasta + | map { meta, file -> file } + | set { ch_fasta } + + MINIMAP2_ALIGN ( FILTER_PACBIO.out.fastq, ch_fasta, true, false, false ) + ch_versions = ch_versions.mix ( MINIMAP2_ALIGN.out.versions.first() ) + + + // 
Collect all alignment output by sample name + MINIMAP2_ALIGN.out.bam + | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], bam] } + | groupTuple ( by: [0] ) + | set { ch_bams } + + + // Merge + SAMTOOLS_MERGE ( ch_bams, [], [] ) + ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) + + + // Position sort BAM file + SAMTOOLS_SORT ( SAMTOOLS_MERGE.out.bam ) + ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions.first() ) + + + // Convert merged BAM to CRAM and calculate indices and statistics + SAMTOOLS_SORT.out.bam + | map { meta, bam -> [ meta, bam, [] ] } + | set { ch_sort } + + CONVERT_STATS ( ch_sort, ch_fasta ) + ch_versions = ch_versions.mix ( CONVERT_STATS.out.versions ) + + + emit: + cram = CONVERT_STATS.out.cram // channel: [ val(meta), /path/to/cram ] + crai = CONVERT_STATS.out.crai // channel: [ val(meta), /path/to/crai ] + stats = CONVERT_STATS.out.stats // channel: [ val(meta), /path/to/stats ] + idxstats = CONVERT_STATS.out.idxstats // channel: [ val(meta), /path/to/idxstats ] + flagstat = CONVERT_STATS.out.flagstat // channel: [ val(meta), /path/to/flagstat ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/convert_stats.nf b/subworkflows/local/convert_stats.nf new file mode 100644 index 0000000..7c381fd --- /dev/null +++ b/subworkflows/local/convert_stats.nf @@ -0,0 +1,54 @@ +// +// Convert BAM to CRAM, create index and calculate statistics +// + +include { SAMTOOLS_VIEW } from '../../modules/nf-core/samtools/view/main' +include { SAMTOOLS_STATS } from '../../modules/nf-core/samtools/stats/main' +include { SAMTOOLS_FLAGSTAT } from '../../modules/nf-core/samtools/flagstat/main' +include { SAMTOOLS_IDXSTATS } from '../../modules/nf-core/samtools/idxstats/main' + + +workflow CONVERT_STATS { + take: + bam // channel: [ val(meta), /path/to/bam, /path/to/bai] + fasta // channel: /path/to/fasta + + + main: + ch_versions = Channel.empty() + + + // 
Convert BAM to CRAM + SAMTOOLS_VIEW ( bam, fasta, [] ) + ch_versions = ch_versions.mix ( SAMTOOLS_VIEW.out.versions.first() ) + + + // Combine CRAM and CRAI into one channel + SAMTOOLS_VIEW.out.cram + | join ( SAMTOOLS_VIEW.out.crai ) + | set { ch_cram_crai } + + + // Calculate statistics + SAMTOOLS_STATS ( ch_cram_crai, fasta ) + ch_versions = ch_versions.mix ( SAMTOOLS_STATS.out.versions.first() ) + + + // Calculate statistics based on flag values + SAMTOOLS_FLAGSTAT ( ch_cram_crai ) + ch_versions = ch_versions.mix ( SAMTOOLS_FLAGSTAT.out.versions.first() ) + + + // Calculate index statistics + SAMTOOLS_IDXSTATS ( ch_cram_crai ) + ch_versions = ch_versions.mix ( SAMTOOLS_IDXSTATS.out.versions.first() ) + + + emit: + cram = SAMTOOLS_VIEW.out.cram // channel: [ val(meta), /path/to/cram ] + crai = SAMTOOLS_VIEW.out.crai // channel: [ val(meta), /path/to/crai ] + stats = SAMTOOLS_STATS.out.stats // channel: [ val(meta), /path/to/stats ] + flagstat = SAMTOOLS_FLAGSTAT.out.flagstat // channel: [ val(meta), /path/to/flagstat ] + idxstats = SAMTOOLS_IDXSTATS.out.idxstats // channel: [ val(meta), /path/to/idxstats ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/filter_pacbio.nf b/subworkflows/local/filter_pacbio.nf new file mode 100644 index 0000000..6078d4d --- /dev/null +++ b/subworkflows/local/filter_pacbio.nf @@ -0,0 +1,77 @@ +// +// Filter PacBio reads +// Original protocol is a modified version by Shane of the original program, HiFiAdapterFilt +// + +include { SAMTOOLS_VIEW as SAMTOOLS_CONVERT } from '../../modules/nf-core/samtools/view/main' +include { SAMTOOLS_COLLATE } from '../../modules/nf-core/samtools/collate/main' +include { SAMTOOLS_FASTA } from '../../modules/nf-core/samtools/fasta/main' +include { GUNZIP } from '../../modules/nf-core/gunzip/main' +include { BLAST_BLASTN } from '../../modules/nf-core/blast/blastn/main' +include { PACBIO_FILTER } from '../../modules/local/pacbio_filter' +include { SAMTOOLS_VIEW as
SAMTOOLS_FILTER } from '../../modules/nf-core/samtools/view/main' +include { SAMTOOLS_FASTQ } from '../../modules/nf-core/samtools/fastq/main' + + +workflow FILTER_PACBIO { + take: + reads // channel: [ val(meta), /path/to/datafile ] + db // channel: /path/to/vector_db + + + main: + ch_versions = Channel.empty() + + + // Convert from PacBio BAM to Samtools BAM + reads + | map { meta, bam -> [ meta, bam, [] ] } + | set { ch_pacbio } + + SAMTOOLS_CONVERT ( ch_pacbio, [], [] ) + ch_versions = ch_versions.mix ( SAMTOOLS_CONVERT.out.versions.first() ) + + + // Collate BAM file to create interleaved FASTA + SAMTOOLS_COLLATE ( SAMTOOLS_CONVERT.out.bam, [] ) + ch_versions = ch_versions.mix ( SAMTOOLS_COLLATE.out.versions.first() ) + + + // Convert BAM to FASTA + SAMTOOLS_FASTA ( SAMTOOLS_COLLATE.out.bam, true ) + ch_versions = ch_versions.mix ( SAMTOOLS_FASTA.out.versions.first() ) + + + // Gunzip FASTA file to BLAST + GUNZIP ( SAMTOOLS_FASTA.out.other ) + ch_versions = ch_versions.mix ( GUNZIP.out.versions.first() ) + + + // Nucleotide BLAST + BLAST_BLASTN ( GUNZIP.out.gunzip, db ) + ch_versions = ch_versions.mix ( BLAST_BLASTN.out.versions.first() ) + + + // Filter BLAST output + PACBIO_FILTER ( BLAST_BLASTN.out.txt ) + ch_versions = ch_versions.mix ( PACBIO_FILTER.out.versions.first() ) + + + // Create filtered BAM file + SAMTOOLS_CONVERT.out.bam + | join ( SAMTOOLS_CONVERT.out.csi ) + | set { ch_reads } + + SAMTOOLS_FILTER ( ch_reads, [], PACBIO_FILTER.out.list ) + ch_versions = ch_versions.mix ( SAMTOOLS_FILTER.out.versions.first() ) + + + // Convert BAM to FASTQ + SAMTOOLS_FASTQ ( SAMTOOLS_FILTER.out.unoutput, true ) + ch_versions = ch_versions.mix ( SAMTOOLS_FASTQ.out.versions.first() ) + + + emit: + fastq = SAMTOOLS_FASTQ.out.other // channel: [ meta, /path/to/fastq ] + versions = ch_versions // channel: [ versions.yml ] +} From a5b1d968e92fe68f891dcacb1770e79274fa550e Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Mon, 20 Nov 2023 10:28:59 +0000 Subject: [PATCH 
02/33] add align option for pacbio_align subworkflow --- assets/vectorDB.tar.gz | Bin 0 -> 4640 bytes modules.json | 11 + .../nf-core/samtools/fastq/environment.yml | 7 + modules/nf-core/samtools/fastq/main.nf | 44 ++ modules/nf-core/samtools/fastq/meta.yml | 62 +++ modules/nf-core/untar/environment.yml | 9 + modules/nf-core/untar/main.nf | 63 +++ modules/nf-core/untar/meta.yml | 46 ++ modules/nf-core/untar/tests/main.nf.test | 77 +++ modules/nf-core/untar/tests/main.nf.test.snap | 513 ++++++++++++++++++ modules/nf-core/untar/tests/tags.yml | 2 + nextflow.config | 2 + nextflow_schema.json | 10 + workflows/variantcalling.nf | 30 +- 14 files changed, 875 insertions(+), 1 deletion(-) create mode 100644 assets/vectorDB.tar.gz create mode 100644 modules/nf-core/samtools/fastq/environment.yml create mode 100644 modules/nf-core/samtools/fastq/main.nf create mode 100644 modules/nf-core/samtools/fastq/meta.yml create mode 100644 modules/nf-core/untar/environment.yml create mode 100644 modules/nf-core/untar/main.nf create mode 100644 modules/nf-core/untar/meta.yml create mode 100644 modules/nf-core/untar/tests/main.nf.test create mode 100644 modules/nf-core/untar/tests/main.nf.test.snap create mode 100644 modules/nf-core/untar/tests/tags.yml diff --git a/assets/vectorDB.tar.gz b/assets/vectorDB.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..f9b08d5df9a1e5a7750ff4fa3e31b9829063bc27 GIT binary patch literal 4640 zcmY+GWmFS@yT&D?OBzH#q+3F|K}3*_NsTciM@cif1q6defq`^7YJ{{P-5@bSKzblC zVz6EBx&M2Q_tW#?dCz&?^SmG4A4d|&9iusK`#S`OHsrphvn}kj?Cab#&$><=$;iwI zMjm~n(X`u+ftcD_$e6(eSwE1nLX5OW3h-gQx;5x@0qTcBw^zYqvI*ey8IOT`qFyTkpnBDA6=r8F+w*RL!9k1J&S}3E ztjQ+v1IG`2=WK+ z_o2ETLMk#SyMAA-7mFJmnXJPpAVm$}Teuo}i^Q{V1Gr@NO(&1IJ=;a#CzrgLZhef6 z&rNrT=R_uWob)m&T8l?c4byk<$rspV`%8?Bo<7j$kYytqxlJ9MbhB{R6<~;7O8O?L z+m2cCYh4f_ylr4yyNyvqh}hdKS#`#zhPi<-BO>VP0ZkW z2&^o2ZbHZlO7uAJQ}8gKSH_i9rcp4{@w0qqsR48aGf;P_m?%EG7tL;$t^O4@q+!>zqc?bObeo)>Jh=K;^0(DNJ53co?Sdd@TeCJdWP40P(uU 
z?$VTgA2xhJMu8-toHcCVq=<+uBQP`yrm>q;FSv}**&^I4C!LCIQS*w+(I|L8Q2T@E zCry<(P|S;Yq(n(QH8PCgGQM1mr%UA|`9^Y6_bZ4Lkfgd6XxIsJ2iWy#FpyXiwA))U zI_tP9iPO!KJK9kV-K?pPW%)78$5Hz3v33xGjV07uoK++!8zfA`%%)|GB={Zlh$(u- z5pi!Qq9wY3#XXJ7-D&AaJM#ujXZs%%gT@7IWE9I3$alMeMXcIrPACjanqnR)%xPlB z?z~QM$e#6NqPW3$f}kZ`V@B@P7(O^fLeB&j5j)2dzCpGC#LyEe_=4yu`^^BP8cTMZ zqJuGbER+Yf+u+RyRO$ul;*SbtG%WEbU)sXBBgteMja$0BQp~~j%d{DL>6fysgnNBt zN<)c$va~y5u&Rf`C0YKGC8L=yeL;TsEs_&TUdTXvzXu*c@(`FBK>DrAJ!2`#0%}?U z9?)TAf8;ROIfvgog@{bpg()@}6j9^1+?7Z+XBR~#yec(m5&2{uFB`RFc1-5$v+01QfXF%uca#TTGD7 zuy+xezUBnfXmveo>@KWZ>|-9)b9d#P(+3!c3x5~q6}0v18JLYa*9d)7*MF~vR{S@V z-;>#qMQAiiM?z06OMkL}BK@mIyB+Vno`xRAen-djo-PI=f0F@~XQ|F%BQvFbs=s$4 z@g=z!D_T);d(W7L7u{z_qxZ#A-J}`=A4iht?dEWoh}F2cja-|TNQsYf;Z>qkAH{~$ zfaL~G?$1}7P%Vmkagx(nD-%)col#WUCd<9Emb%MNQ;;^xQ`qE^jsn==y}4Q;%vBW? zKQEYgj8lJ`OqyQ=R8LQlU_wCEQrEZ|kZY~^V9RN5-~fnj2y7S$t0(F+?f*(Uup623 zMqnjotwgu46<@-%cE1%cUkzv}f@WFv7_O)mMkEph$dv`a7|IhvNyGu`lD;zhwJe&> zA|h_Nc!fkl1MXBPa91xVH^TB)2~~PlwK|4MI0CU!uNFfH^Pc`STGzy4P6Yy(zRYF% z;klqdXSVtK88999Thx%PJl}aNdqn zqVN!it=W`Cy_Rjxcsn*uxlFjfxF|pR(zy+9iN|BlhP4JTKmwpKryz=xm==r$g;mKk zs=nt-kgz_II%lMBUK9TqX{A{wK9bmKyH*A>L=iWpWc?Q77s)o0&ns16b1ARs9s`Z3 z4YHq8MTw1Pf(g(!-gSD!Ym&WSp(%RC$8luKeS(y+{B4{?std`pEKDy7 zM;bn>iki)6q#;#bt`WuCJbD%RGIq8|k-iwV+6?+FZP@><>J46b%IloOyUZX&(CE~J zm30Hi4Iy7|-l)Bfp%7w&DZhK-sz3;${e+-E|!ud?xa#)eL_v8h9 zTb%Fm^W4D}DD}S@|H{X=U}FsRZ*$8U20Lx2OVPQt0Sq>P^9GDRA=QNrLhs#6M;Up7 z54#Ab^OJCq4~I5`9-a&BaQC!B-TOB$bbYHKOj}p}@bfhH z1V{-aU>NPQ@Ouy)1G(=|`R?!?B>#uMC6 zU~Ad0vEz*WW+)sSRW#=H1=C6UpW^>pj=yR3VLG3<;W(~1CaW;xn!s0V|J;To=J%r; zY%OV%#V;TXhvWLUnpLvEqj^~?aST(3wFReG?a%wBw>CLm1Pj*};kSxdpG6$q{|h3* z#)j_)hrUPQ3>E9IPM;!hNxEp;FdT7auhvDl^S$-}&%w%V{gHg!TEp_>o88KQc3H2f zohqB4N2t8KbIGuam#Y~akBHf@+Dla~NMK$#vM~8PpnCB0>znH(K8x*UmO8&V;?8Gn zHT!*CKJ%e<=;n{@$l#mxZ`2&CGnoEC83uo#b;r@+w3FgKv9H+j>Fo7g6XN8sV1T*D zRm1uGYR7`K)8wP3^ZXY;V?=jmVMd#ai8#M)9Jw6}Gc zFTN^E@fJq8rJZpe2p%#ByV%f{q1XSDFF`z~H!P3b|Cob;>a)+T|2ztxS1NWlYwoiZ 
zgNz>5iN5MDSYi5A_7R|o#Xe-Pm#RrtS6bG1Ho6P<{7}iwY&jZnX^UA@Qe8EzC)!p z9`wYO8Y)tn0CEa87|73=4|llknI8P!L>+GdJVC$lNcRO@mP=fKC!0)wPjPPZ-*t^j z*|zfqs(1Xn=3#zdH~oPOcKMdAi56;)&&>jUorf8~exA=uSb&+?Y%#)WDoFv6xopg8 z(Yi2sVXBQTMdz7(5Sz)8Yu`?AXx2xh3uI*jher=}A|9G8%N6k{dU^ z&jDS_HUhR-|#7x@&tU{7ckg8qomqmJUt(a)&=E3pML*k46l;rH=gYI?Am>#o- zLePnWYuv)cTo&oJlkkDPZ+mze*t1xney+pwtt*GdNG)+$$H#Y|1b%l^l2TPib!SKF3lKwT+jN8`aC(l7X} zZ)d^FhG?+aj2CM5G3I zy@Q~P@grkA&W=ORFt}&{x@QB#x8k3=V8q@I_Rs>XqLQ_nGsnjcfVG~>@u$N+nVl*! z?Vd{!a$5rx7IXJA<)pi%q1cXeP$;ydI7J|!1=`wkor03{YG^uz7oFFLJpVKAkGh&# z?Y&lJ&IxLtyV~B128AA-?dO~Bv%j#BWy+qe_h~0FzY?YHU&E!TirIx=C zt~!nfVhqv07B$0km9YLtKbk7g?_cj@CzBSxjT z#2D*I%DuNSi{+TEV64vVYn&bMzh*8Bj0zH0t+mGQlcr=c2R@KR{gItkERX(-Zu@;1eYH{`Hjqbm z%tV~@{B;!i&4S!+N5r>2AU!|-@$W^+e#y3Rjx)xqdh(gNf`euzs8dGZt(pi2sgy;qv-8aRYD@d4CKuX`nu z_!i#R9$;_Vyvv~aq~+joj<(q*1sUSFGA{_)!)IwRoWeQmzNj^zHy zKbnPfOS19;?|1s7Q}kWA ${prefix}_interleaved.fastq.gz" : + meta.single_end ? "-1 ${prefix}_1.fastq.gz -s ${prefix}_singleton.fastq.gz" : + "-1 ${prefix}_1.fastq.gz -2 ${prefix}_2.fastq.gz -s ${prefix}_singleton.fastq.gz" + """ + samtools \\ + fastq \\ + $args \\ + --threads ${task.cpus-1} \\ + -0 ${prefix}_other.fastq.gz \\ + $input \\ + $output + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/fastq/meta.yml b/modules/nf-core/samtools/fastq/meta.yml new file mode 100644 index 0000000..c4002a4 --- /dev/null +++ b/modules/nf-core/samtools/fastq/meta.yml @@ -0,0 +1,62 @@ +name: samtools_fastq +description: Converts a SAM/BAM/CRAM file to FASTQ +keywords: + - bam + - sam + - cram + - fastq +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. 
+ homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - interleave: + type: boolean + description: Set true for interleaved fastq file +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fastq: + type: file + description: Compressed FASTQ file(s) with reads with either the READ1 or READ2 flag set in separate files. + pattern: "*_{1,2}.fastq.gz" + - interleaved: + type: file + description: Compressed FASTQ file with reads with either the READ1 or READ2 flag set in a combined file. Needs collated input file. 
+ pattern: "*_interleaved.fastq.gz" + - singleton: + type: file + description: Compressed FASTQ file with singleton reads + pattern: "*_singleton.fastq.gz" + - other: + type: file + description: Compressed FASTQ file with reads with either both READ1 and READ2 flags set or unset + pattern: "*_other.fastq.gz" +authors: + - "@priyanka-surana" + - "@suzannejin" +maintainers: + - "@priyanka-surana" + - "@suzannejin" diff --git a/modules/nf-core/untar/environment.yml b/modules/nf-core/untar/environment.yml new file mode 100644 index 0000000..d6917da --- /dev/null +++ b/modules/nf-core/untar/environment.yml @@ -0,0 +1,9 @@ +name: untar +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::sed=4.7 + - conda-forge::grep=3.11 + - conda-forge::tar=1.34 diff --git a/modules/nf-core/untar/main.nf b/modules/nf-core/untar/main.nf new file mode 100644 index 0000000..8a75bb9 --- /dev/null +++ b/modules/nf-core/untar/main.nf @@ -0,0 +1,63 @@ +process UNTAR { + tag "$archive" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("$prefix"), emit: untar + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + prefix = task.ext.prefix ?: ( meta.id ? 
"${meta.id}" : archive.baseName.toString().replaceFirst(/\.tar$/, "")) + + """ + mkdir $prefix + + ## Ensures --strip-components only applied when top level of tar contents is a directory + ## If just files or multiple directories, place all in prefix + if [[ \$(tar -taf ${archive} | grep -o -P "^.*?\\/" | uniq | wc -l) -eq 1 ]]; then + tar \\ + -C $prefix --strip-components 1 \\ + -xavf \\ + $args \\ + $archive \\ + $args2 + else + tar \\ + -C $prefix \\ + -xavf \\ + $args \\ + $archive \\ + $args2 + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: ( meta.id ? "${meta.id}" : archive.toString().replaceFirst(/\.[^\.]+(.gz)?$/, "")) + """ + mkdir $prefix + touch ${prefix}/file.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/untar/meta.yml b/modules/nf-core/untar/meta.yml new file mode 100644 index 0000000..a9a2110 --- /dev/null +++ b/modules/nf-core/untar/meta.yml @@ -0,0 +1,46 @@ +name: untar +description: Extract files. +keywords: + - untar + - uncompress + - extract +tools: + - untar: + description: | + Extract tar.gz files. + documentation: https://www.gnu.org/software/tar/manual/ + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - archive: + type: file + description: File to be untar + pattern: "*.{tar}.{gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - untar: + type: directory + description: Directory containing contents of archive + pattern: "*/" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@matthdsm" + - "@jfy133" +maintainers: + - "@joseespinosa" + - "@drpatelh" + - "@matthdsm" + - "@jfy133" diff --git a/modules/nf-core/untar/tests/main.nf.test b/modules/nf-core/untar/tests/main.nf.test new file mode 100644 index 0000000..d40db13 --- /dev/null +++ b/modules/nf-core/untar/tests/main.nf.test @@ -0,0 +1,77 @@ +nextflow_process { + + name "Test Process UNTAR" + script "../main.nf" + process "UNTAR" + + tag "modules" + tag "modules_nfcore" + tag "untar" + + test("test_untar") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [], file(params.test_data['sarscov2']['genome']['kraken2_tar_gz'], checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.untar).match("test_untar") }, + ) + } + + } + + test("test_untar_different_output_path") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [], file(params.test_data['homo_sapiens']['illumina']['test_flowcell'], checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.untar).match("test_untar_different_output_path") }, + ) + } + + } + + test("test_untar_onlyfiles") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [], file(params.test_data['generic']['tar']['tar_gz'], checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.untar).match("test_untar_onlyfiles") }, + ) + } + + } + +} diff --git a/modules/nf-core/untar/tests/main.nf.test.snap b/modules/nf-core/untar/tests/main.nf.test.snap new file mode 100644 index 0000000..146c867 
--- /dev/null +++ b/modules/nf-core/untar/tests/main.nf.test.snap @@ -0,0 +1,513 @@ +{ + "test_untar_different_output_path": { + "content": [ + [ + [ + [ + + ], + [ + [ + [ + [ + [ + [ + "s_1_1101.bcl:md5,ad01889e2ff43e2f194224e20bdb600c", + "s_1_1101.stats:md5,4bbbf103454b37fbc3138fadf1b4446b" + ], + [ + "s_1_1101.bcl:md5,565384bbe67a694dfd690bae6d1d30c2", + "s_1_1101.stats:md5,55e5abd8f129ff38ef169873547abdb8" + ], + [ + "s_1_1101.bcl:md5,650fa58a630a9148835ba79e323d4237", + "s_1_1101.stats:md5,77403669ca1b05340c390dff64425c1e" + ], + [ + "s_1_1101.bcl:md5,54471c9e97299cd141e202e204637702", + "s_1_1101.stats:md5,67b14c9a89b7f8556674a7524d5cfb2d" + ], + [ + "s_1_1101.bcl:md5,74e4f929fc7476c380fd9d741ddb6700", + "s_1_1101.stats:md5,5730a4c35463eaa12a06b6758710b98c" + ], + [ + "s_1_1101.bcl:md5,c785f472f4350c120c02c888c8189590", + "s_1_1101.stats:md5,fee4ec63895ea81007e06ee6a36ba5e0" + ], + [ + "s_1_1101.bcl:md5,b7ea50bb25f08d43c301741d77050a9b", + "s_1_1101.stats:md5,fa7c68f3122c74d14364e6f7b011af70" + ], + [ + "s_1_1101.bcl:md5,9d5087dc4bcae39d66486363d4f68ecf", + "s_1_1101.stats:md5,23cdceee4d82c4b8e7c60018b9276ace" + ], + [ + "s_1_1101.bcl:md5,581e0c5ee94e8f2de14b2b1d8e777530", + "s_1_1101.stats:md5,9a3536d573c97f66bb56b49463612607" + ], + [ + "s_1_1101.bcl:md5,296fc026bb34c67bbe2b44845fe0d1de", + "s_1_1101.stats:md5,a7f57a7770fb9c5ae2a0fb1ef403ec4f" + ], + [ + "s_1_1101.bcl:md5,2a3ca15531556c36d10d132a9e051de8", + "s_1_1101.stats:md5,2d0bcdb0a1b51d3d79e415db2ab2d3b1" + ], + [ + "s_1_1101.bcl:md5,1150d46a2ccd4ac58aee0585d3e4ffd7", + "s_1_1101.stats:md5,2e97550bd5b5864ffd0565bb7a3f6d40" + ], + [ + "s_1_1101.bcl:md5,0b85c4b3da0de95e7b862d849c5333ae", + "s_1_1101.stats:md5,6eab9746fbeb783b0cd70398f44e0c1a" + ], + [ + "s_1_1101.bcl:md5,e0e9c91f4698804d7a6d1058ef68b34f", + "s_1_1101.stats:md5,790022cdc7878a02b2ebd166e1ddf0a7" + ], + [ + "s_1_1101.bcl:md5,38cd0ad4de359e651c8ac0d5777ea625", + "s_1_1101.stats:md5,a1b1d5ea5371d326abb029774483c5e6" + ], + [ + 
"s_1_1101.bcl:md5,b0ddc05c4012ccba24e712a1cfec748f", + "s_1_1101.stats:md5,af3d232f839d720f76f40ba06caa2987" + ], + [ + "s_1_1101.bcl:md5,af32fcc5dc3b836cf7a5ba3db85a75dd", + "s_1_1101.stats:md5,f93f2c09bd4e486c74a5f6e2040f7296" + ], + [ + "s_1_1101.bcl:md5,54b7428e037ca87816107647d4a3d9db", + "s_1_1101.stats:md5,e5ac77a72cd7bed5e9bf03cccda0e48c" + ], + [ + "s_1_1101.bcl:md5,fc8b4eacd493bf3d0b20bc23998dc7ff", + "s_1_1101.stats:md5,190315e159e2f4bc4c057ded7470dc52" + ], + [ + "s_1_1101.bcl:md5,9484ecffda489927fce424ac6a44fa9d", + "s_1_1101.stats:md5,0825feeb457ecc9efcf6f8526ba32311" + ], + [ + "s_1_1101.bcl:md5,eec59e21036e31c95ce1e847bfb0a9c4", + "s_1_1101.stats:md5,9acc13f63c98e5a8445e7be70d49222b" + ], + [ + "s_1_1101.bcl:md5,a9fb24476f87cba4fba68e2b3c3f2c07", + "s_1_1101.stats:md5,dc0aa7db9790733291c3e6480ca2a0fc" + ], + [ + "s_1_1101.bcl:md5,ed950b3e82c500927c2e236c9df005c6", + "s_1_1101.stats:md5,dccb71ec47d1f9d33a192da6d5660a45" + ], + [ + "s_1_1101.bcl:md5,b3e992025e995ca56b5ea2820144ef47", + "s_1_1101.stats:md5,a6a829bf2cffb26ac5d9dc3012057699" + ], + [ + "s_1_1101.bcl:md5,89edc726a5a4e0b4ff8ca3899ed0232b", + "s_1_1101.stats:md5,5b9b4fd8110577a59b82d0c419519d29" + ], + [ + "s_1_1101.bcl:md5,4dc696149169f232c451225f563cb5cd", + "s_1_1101.stats:md5,d3514a71ea3adc60e2943c6b8f6e2598" + ], + [ + "s_1_1101.bcl:md5,35b992d0318afb7c825ceaa31b0755e6", + "s_1_1101.stats:md5,2826093acc175c16c3795de7c4ca8f07" + ], + [ + "s_1_1101.bcl:md5,7bc927f56a362e49c00b5d76ee048901", + "s_1_1101.stats:md5,e47d862b795fd6b88a31d7d482ab22f6" + ], + [ + "s_1_1101.bcl:md5,84742233ff2a651626fe9036f27f7cb2", + "s_1_1101.stats:md5,b78fad11d3c50bc76b722cdc03e3028b" + ], + [ + "s_1_1101.bcl:md5,3935341c86263a7938e8c49620ef39f8", + "s_1_1101.stats:md5,cc6585b2daac5354073d150874da9704" + ], + [ + "s_1_1101.bcl:md5,3627f4fd548bf6e64aaf08fba3a342be", + "s_1_1101.stats:md5,120ae4831ae004ff7d16728aef36e82f" + ], + [ + "s_1_1101.bcl:md5,07631014bc35124149fabd80ef19f933", + 
"s_1_1101.stats:md5,eadd63d91f47cc6db6b6f0a967a23927" + ], + [ + "s_1_1101.bcl:md5,a1149c80415dc2f34d768eeb397c43fb", + "s_1_1101.stats:md5,ca89a9def67611a9151c6ce685b7cce1" + ], + [ + "s_1_1101.bcl:md5,eb5f71d4741d2f40618756bc72eaf8b4", + "s_1_1101.stats:md5,90f48501e735e5915b843478e23d1ae2" + ], + [ + "s_1_1101.bcl:md5,9bf270fe3f6add1a591ebc24fff10078", + "s_1_1101.stats:md5,a4e429671d4098034293c638aa655e16" + ], + [ + "s_1_1101.bcl:md5,219bedcbd24bae54fe4cf05dae05282c", + "s_1_1101.stats:md5,dd97525b65b68207137d51fcf19132c7" + ], + [ + "s_1_1101.bcl:md5,5163bc00a68fd57ae50cae0b76350892", + "s_1_1101.stats:md5,b606a5368eff1f012f3ea5d11ccdf2e0" + ], + [ + "s_1_1101.bcl:md5,fc429195a5af59a59e0cc4c48e6c05ea", + "s_1_1101.stats:md5,d809aa19698053f90d639da4dcad8008" + ], + [ + "s_1_1101.bcl:md5,383340219a1dd77076a092a64a71a7e4", + "s_1_1101.stats:md5,b204a5cf256378679ffc906c15cc1bae" + ], + [ + "s_1_1101.bcl:md5,0c369540d3e24696cf1f9c55bab69315", + "s_1_1101.stats:md5,a2bc69a4031a22ce9621dcc623a0bf4b" + ], + [ + "s_1_1101.bcl:md5,3127abc8016ba8eb954f8f8015dff387", + "s_1_1101.stats:md5,5deafff31150b7bf757f814e49a53bc2" + ], + [ + "s_1_1101.bcl:md5,045f40c82de676bafec3d59f91376a7a", + "s_1_1101.stats:md5,890700edc20687c090ef52248c7884b1" + ], + [ + "s_1_1101.bcl:md5,78af269aa2b39a1d765703f0a4739a86", + "s_1_1101.stats:md5,303cf457aa1543a8208544f694cbc531" + ], + [ + "s_1_1101.bcl:md5,0ab8c781959b783b62888e9274364a46", + "s_1_1101.stats:md5,2605b0e8322f83aa4d0dae5da4ec7a7a" + ], + [ + "s_1_1101.bcl:md5,d0cf823ffe352e8b3f75d589544ab617", + "s_1_1101.stats:md5,efa3c0e01e3db71e12fd961cb2d03739" + ], + [ + "s_1_1101.bcl:md5,db4ca4ab7a01e03c246f9160c3758d82", + "s_1_1101.stats:md5,f61550d9e4a90df6b860e68f41f82f60" + ], + [ + "s_1_1101.bcl:md5,1af39a2c7e5ff20ece91cb8160b51d17", + "s_1_1101.stats:md5,d0e20879afcaf6dfcd88c73f1c5c78cf" + ], + [ + "s_1_1101.bcl:md5,4cf7123bb0fffcd79266df03aef01665", + "s_1_1101.stats:md5,29bff4075109a121b087116b58d7e927" + ], + [ + 
"s_1_1101.bcl:md5,aa9980428cb60cd6320f4b48f4dd0d74", + "s_1_1101.stats:md5,6b0e20bde93133117a8d1a6df3d6f37b" + ], + [ + "s_1_1101.bcl:md5,0f6e440374e15b9b491d52fb83a8adfe", + "s_1_1101.stats:md5,55cb5eb0ecdabd23dca39ab8c4607598" + ], + [ + "s_1_1101.bcl:md5,2c645d7bdaddaa403f6e304d36df9e4b", + "s_1_1101.stats:md5,53acf33d21f832779b400c2447386ce4" + ], + [ + "s_1_1101.bcl:md5,3bbf0863b423b770c879203644420206", + "s_1_1101.stats:md5,579bdc7293cac8c3d7407249cacf4c25" + ], + [ + "s_1_1101.bcl:md5,6658a08409e81d29cfeb2d096b491985", + "s_1_1101.stats:md5,bb559ffbea46d612f9933cefa84c4c03" + ], + [ + "s_1_1101.bcl:md5,1700d9a13d3d4f7643af2943ef838acb", + "s_1_1101.stats:md5,f01cb6050ebfb15da1e0399ebd791eb4" + ], + [ + "s_1_1101.bcl:md5,1ac7aa9ffae25eb103f755f33e4a39c6", + "s_1_1101.stats:md5,0b9d45d7929ccf336d5e5b95373ed3c2" + ], + [ + "s_1_1101.bcl:md5,812a97af2e983a53226e18c75190b06c", + "s_1_1101.stats:md5,d2410c7b0e506dab2972e77e2398de1e" + ], + [ + "s_1_1101.bcl:md5,c981e8e4dcc434956c2b86159da268bc", + "s_1_1101.stats:md5,e9c826e85361ce673f1f248786c9a611" + ], + [ + "s_1_1101.bcl:md5,88e09e99a0a4ef3357b203a41b22f77c", + "s_1_1101.stats:md5,ef06f2e5ad667bbd383f9ed6a05b7b42" + ], + [ + "s_1_1101.bcl:md5,461c8b146fc8a7938be38689978ecd09", + "s_1_1101.stats:md5,65115693935da66f9791b27136e22fb0" + ], + [ + "s_1_1101.bcl:md5,c7b827df5ce20e0f21916fe60860ca3f", + "s_1_1101.stats:md5,87be73613aeb507847f94d3cac5bb30a" + ], + [ + "s_1_1101.bcl:md5,7c4cc3dc9c8a1b0f15917b282dfb40ce", + "s_1_1101.stats:md5,bdd9181fa89debbfafe7b6ea3e064065" + ], + [ + "s_1_1101.bcl:md5,19f4debaf91e118aca8934517179ac33", + "s_1_1101.stats:md5,1143082719e136241d21b14a6b19b8a2" + ], + [ + "s_1_1101.bcl:md5,38aa256ad2d697d84b0b2c0e876a3eba", + "s_1_1101.stats:md5,64dd82f03df23f7f437eede2671ed4fe" + ], + [ + "s_1_1101.bcl:md5,b7929970378949571fed922c1b8cab32", + "s_1_1101.stats:md5,3d6d7985a41629fe196e4342d7fe36aa" + ], + [ + "s_1_1101.bcl:md5,fb2ed0bf6e89d79624ee78754e773491", + 
"s_1_1101.stats:md5,f34940810ff255aee79953496a12716d" + ], + [ + "s_1_1101.bcl:md5,4f8a8311f5f9c3a7629c1a973a7b280e", + "s_1_1101.stats:md5,4fd7cd28c09f4e152e7c2ad1ab541cd2" + ], + [ + "s_1_1101.bcl:md5,9eb46c903d0344e25af51f88cc311d60", + "s_1_1101.stats:md5,df3abd5f620d9e7f99496098d9fd3f7f" + ], + [ + "s_1_1101.bcl:md5,3ecbc17f3660e2014b58d7fe70ae62d5", + "s_1_1101.stats:md5,8e89a13c85a6d6ab3ccd251b66d1f165" + ], + [ + "s_1_1101.bcl:md5,5d59cc2499a77791233a64f73fe82894", + "s_1_1101.stats:md5,32ec99cd400f4b80cb26e2fa8e07ece0" + ], + [ + "s_1_1101.bcl:md5,1c052da47b9ae8554388f0fa3aade482", + "s_1_1101.stats:md5,d23f438772673688aa7bc92421dc6dce" + ], + [ + "s_1_1101.bcl:md5,1a52bd4f23130c0c96bc967ccd448a2b", + "s_1_1101.stats:md5,9b597e3388d59ef1f61aba30ac90ea79" + ], + [ + "s_1_1101.bcl:md5,8a1e84b79cf3f80794c20e3a0cc84688", + "s_1_1101.stats:md5,9561f7b6ef4b1849afc72b2bb49792bd" + ], + [ + "s_1_1101.bcl:md5,75c00111051f3fa95d04286823cb9109", + "s_1_1101.stats:md5,1fe786cdf8181767deafbd60b3c76610" + ], + [ + "s_1_1101.bcl:md5,529255d8deee0873ed5565e6d1a2ebda", + "s_1_1101.stats:md5,3fa7f467e97a75880f32d17b7429d316" + ], + [ + "s_1_1101.bcl:md5,ea4d960e3d9355d2149da71b88a21df4", + "s_1_1101.stats:md5,2540fe65586e8e800c1ddd8cddd1e8cd" + ], + [ + "s_1_1101.bcl:md5,0dfe1fd92a2dce2f23119aa483429744", + "s_1_1101.stats:md5,78257b2169fb9f0cf40966e06e847e86" + ], + [ + "s_1_1101.bcl:md5,f692ddc9aa3ab849271d07c666d0b3b9", + "s_1_1101.stats:md5,aa2ec6a3e3a9c116e34fe74a21e6459e" + ], + [ + "s_1_1101.bcl:md5,29cc4c239eae7c871c9a1adf92ebdb98", + "s_1_1101.stats:md5,263184813090acd740a5bf25304aed3a" + ], + [ + "s_1_1101.bcl:md5,e005af6a84925e326afbfe264241f047", + "s_1_1101.stats:md5,b6fb20868eebaffcc19daa694a449795" + ], + [ + "s_1_1101.bcl:md5,02f1a699b1ba9967accccf99a7af3d24", + "s_1_1101.stats:md5,4f007efacecaf26dc0e0231aede28754" + ], + [ + "s_1_1101.bcl:md5,df308c72a2dcc655cd95e98f5457187a", + "s_1_1101.stats:md5,130c4b07f4c14030bab012824cbe34da" + ], + [ + 
"s_1_1101.bcl:md5,f3ce10d8d2406b72355023bfa8c96822", + "s_1_1101.stats:md5,2638f4db393ed5b699ec2ce59ff0ec19" + ], + [ + "s_1_1101.bcl:md5,cc2f6d675ad1593ff96f734b172d249e", + "s_1_1101.stats:md5,f5b13f1e1ababc9e1a7a73b0b993cbf1" + ], + [ + "s_1_1101.bcl:md5,7938a0b21448305a951b023b1845b3a7", + "s_1_1101.stats:md5,fcd57511adabfc3ba1ac045165330006" + ], + [ + "s_1_1101.bcl:md5,44879bc6a38df1fee8def61868115041", + "s_1_1101.stats:md5,517e20e4b58a8023a37f9af62e0e2036" + ], + [ + "s_1_1101.bcl:md5,8749611e62406a7d2f34c610a55e56af", + "s_1_1101.stats:md5,8ccf24b3676ef84f2e513be8f2a9f3d1" + ], + [ + "s_1_1101.bcl:md5,a9846a037611cda3721958088f714c0e", + "s_1_1101.stats:md5,6438fa5a1892f328cab1605a95d80a3b" + ], + [ + "s_1_1101.bcl:md5,d6c4a2a726496476eb826532f974ed5f", + "s_1_1101.stats:md5,8c2c65b5e8b00dbf61ada65252aeb266" + ], + [ + "s_1_1101.bcl:md5,be3dde6cae7dd85855a6bf295ebfacfe", + "s_1_1101.stats:md5,93bc13f3b0749b2b8d8bcb0b1199f4f0" + ], + [ + "s_1_1101.bcl:md5,7c64514735a6cf1565b60647edd17d20", + "s_1_1101.stats:md5,4a0aa6c49b24f876415e5878cef7f805" + ], + [ + "s_1_1101.bcl:md5,3983b4043bc9df4b505202a5134ccf03", + "s_1_1101.stats:md5,1c9d9a8558adc1279ca27c96bc1b9758" + ], + [ + "s_1_1101.bcl:md5,a0b8d77f116ec95975f9253dcb768136", + "s_1_1101.stats:md5,c3992b786756e7ec42f65ef4b13b50d4" + ], + [ + "s_1_1101.bcl:md5,43c95ba35d06bb7c57fbd16f3d1cfd6c", + "s_1_1101.stats:md5,3cb69d04698c39f97f962e5bf1eea7f0" + ], + [ + "s_1_1101.bcl:md5,3dbeea0cad7052f19f53ff6f19dd4d90", + "s_1_1101.stats:md5,58bbc8254f0f5f4a244531e8e9c12a04" + ], + [ + "s_1_1101.bcl:md5,da56d088996376c898d855b6cd0a7dfc", + "s_1_1101.stats:md5,9f2d78af6908ce1576b89cdc059844ff" + ], + [ + "s_1_1101.bcl:md5,7b641a5565f095e9a6ffcad9e4305033", + "s_1_1101.stats:md5,3ada06c59b4fb41b83ab6abd0979e9fc" + ], + [ + "s_1_1101.bcl:md5,a3843d397a01d51657825bb652c191e5", + "s_1_1101.stats:md5,19341e52a4bfc7d9d48e9d2acc68c519" + ], + [ + "s_1_1101.bcl:md5,048e3ebfc8efeb8012def6b741c9060d", + 
"s_1_1101.stats:md5,88bd38deca1e87d700effab1fd099565" + ], + [ + "s_1_1101.bcl:md5,b340db0e07e829dd5da22371916a1a9e", + "s_1_1101.stats:md5,e44cfaddcc4ffb968e5b1a2f41ac48a5" + ], + [ + "s_1_1101.bcl:md5,e6011ec6eabbc2b8792deb283c621ce0", + "s_1_1101.stats:md5,090875dcd1a431af24bc631333f089c4" + ], + [ + "s_1_1101.bcl:md5,a08f216e3352345031ed100ec4245082", + "s_1_1101.stats:md5,97b949ef4b96219e1369f673cf5f8a6c" + ], + [ + "s_1_1101.bcl:md5,b43337c76fb037dfcf5f8f7bcb3618e5", + "s_1_1101.stats:md5,ddef585805e79951f69d23ab7354f69b" + ], + [ + "s_1_1101.bcl:md5,8c61fd004104397b360855e058bbf1bf", + "s_1_1101.stats:md5,0f8d253816d594dcfea3ccf48c826401" + ], + [ + "s_1_1101.bcl:md5,594d06310d328b188aa0b3edfff22cb2", + "s_1_1101.stats:md5,3160bf271b39aeb7590e4fd2984710ba" + ], + [ + "s_1_1101.bcl:md5,4c9eada67c9d55437211d83e111961d5", + "s_1_1101.stats:md5,2901b46ab16ec4863d30e4c84ec29c97" + ], + [ + "s_1_1101.bcl:md5,e03971ae5282f0accc0c1b7374d9ef1b", + "s_1_1101.stats:md5,60d2a19ce59bf70a21a28555484cead8" + ], + [ + "s_1_1101.bcl:md5,e1c6f7a06e63d149895d3e48e63df155", + "s_1_1101.stats:md5,44beb10af847ea3dddaf06dda7031126" + ], + [ + "s_1_1101.bcl:md5,960a99bf29a8f9d936e9b8582d46c9c6", + "s_1_1101.stats:md5,544cd1a7aaaa841914b40ece43399334" + ], + [ + "s_1_1101.bcl:md5,5706679f349fd4a6b6313bc2c41c7a42", + "s_1_1101.stats:md5,627eea844b26dae033848c2f9f69177b" + ], + [ + "s_1_1101.bcl:md5,21da5abc4b0402bbac14b5ab998b0b4f", + "s_1_1101.stats:md5,515bd140b095ad90473ca7a9a69877ab" + ], + "s_1_1101.control:md5,08a72e2198ae95150718e8adf011d105", + "s_1_1101.filter:md5,3a72bc73b323c8cb0ac5bfeb62d98989" + ] + ], + [ + "s_1_1101.locs:md5,0827ea802e5257cc5b20e757a33d4c98" + ], + "RTAConfiguration.xml:md5,c7d6e257bc374f142dc64b9d2281d4c9", + "config.xml:md5,9a4cc7ec01fefa2f1ce9bcb45bbad6e9" + ] + ], + [ + "ControlMetricsOut.bin:md5,6d77b38d0793a6e1ce1e85706e488953", + "CorrectedIntMetricsOut.bin:md5,2bbf84d3be72734addaa2fe794711434", + 
"ErrorMetricsOut.bin:md5,38c88def138e9bb832539911affdb286", + "ExtractionMetricsOut.bin:md5,7497c3178837eea8f09350b5cd252e99", + "IndexMetricsOut.bin:md5,d41d8cd98f00b204e9800998ecf8427e", + "QMetricsOut.bin:md5,7e9f198d53ebdfbb699a5f94cf1ed51c", + "TileMetricsOut.bin:md5,83891751ec1c91a425a524b476b6ca3c" + ], + "RunInfo.xml:md5,03038959f4dd181c86bc97ae71fe270a" + ] + ] + ] + ], + "timestamp": "2023-10-18T11:56:39.562418" + }, + "test_untar_onlyfiles": { + "content": [ + [ + [ + [ + + ], + [ + "hello.txt:md5,e59ff97941044f85df5297e1c302d260" + ] + ] + ] + ], + "timestamp": "2023-10-18T11:56:46.878844" + }, + "test_untar": { + "content": [ + [ + [ + [ + + ], + [ + "hash.k2d:md5,8b8598468f54a7087c203ad0190555d9", + "opts.k2d:md5,a033d00cf6759407010b21700938f543", + "taxo.k2d:md5,094d5891cdccf2f1468088855c214b2c" + ] + ] + ] + ], + "timestamp": "2023-10-18T11:56:08.16574" + } +} \ No newline at end of file diff --git a/modules/nf-core/untar/tests/tags.yml b/modules/nf-core/untar/tests/tags.yml new file mode 100644 index 0000000..feb6f15 --- /dev/null +++ b/modules/nf-core/untar/tests/tags.yml @@ -0,0 +1,2 @@ +untar: + - modules/nf-core/untar/** diff --git a/nextflow.config b/nextflow.config index 90da5d8..bcc78b0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,10 +13,12 @@ params { input = null fasta = null fai = null + align = false interval = null include_positions = null exclude_positions = null split_fasta_cutoff = 100000 + vector_db = "${projectDir}/assets/vectorDB.tar.gz" // Boilerplate options outdir = 'results' diff --git a/nextflow_schema.json b/nextflow_schema.json index e035219..958cdb9 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -28,6 +28,12 @@ "description": "The output directory where the results will be saved. 
You have to use absolute paths to storage on Cloud infrastructure.", "fa_icon": "fas fa-folder-open" }, + "vector_db": { + "type": "string", + "default": "${projectDir}/assets/vectorDB.tar.gz", + "description": "Path to directory or tar.gz archive for pre-built PacBio vector database.", + "format": "file-path" + }, "email": { "type": "string", "description": "Email address for completion summary.", @@ -51,6 +57,10 @@ "type": "string", "description": "Path to the index file of the FASTA genome file, either fai or gzi." }, + "align": { + "type": "boolean", + "description": "Align the input reads to the reference" + }, "interval": { "type": "string", "description": "Interval bed file." diff --git a/workflows/variantcalling.nf b/workflows/variantcalling.nf index c88480f..54f442f 100644 --- a/workflows/variantcalling.nf +++ b/workflows/variantcalling.nf @@ -60,11 +60,13 @@ if ( (params.include_positions) && (params.exclude_positions) ){ // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { ALIGN_PACBIO } from '../subworkflows/local/align_pacbio' include { INPUT_MERGE } from '../subworkflows/local/input_merge' include { INPUT_FILTER_SPLIT } from '../subworkflows/local/input_filter_split' include { DEEPVARIANT_CALLER } from '../subworkflows/local/deepvariant_caller' include { PROCESS_VCF } from '../subworkflows/local/process_vcf' + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT NF-CORE MODULES/SUBWORKFLOWS @@ -75,7 +77,8 @@ include { PROCESS_VCF } from '../subworkflows/local/process_vcf' // MODULE: Installed directly from nf-core/modules // include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' -include { SAMTOOLS_FAIDX } from '../modules/nf-core/samtools/faidx/main' +include { SAMTOOLS_FAIDX } from '../modules/nf-core/samtools/faidx/main' +include { UNTAR } from
'../modules/nf-core/untar/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -127,6 +130,31 @@ workflow VARIANTCALLING { ) ch_versions = ch_versions.mix( INPUT_CHECK.out.versions ) + // + // SUBWORKFLOW: align reads if required + // + if( params.align ){ + + if ( params.vector_db.endsWith( '.tar.gz' ) ) { + + UNTAR ( [ [:], params.vector_db ] ).untar + | map { meta, file -> file } + | set { ch_vector_db } + ch_versions = ch_versions.mix ( UNTAR.out.versions ) + } else { + Channel.fromPath ( params.vector_db ) + | set { ch_vector_db } + } + + ALIGN_PACBIO ( + ch_fasta, + INPUT_CHECK.out.reads, + ch_vector_db + ) + ch_versions = ch_versions.mix( ALIGN_PACBIO.out.versions ) + } + + // // SUBWORKFLOW: merge the input reads by sample name // From 4c8367e60879412b7ee2ae66c3ac0a32576ebd15 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Thu, 23 Nov 2023 11:02:57 +0000 Subject: [PATCH 03/33] re-patch samtools view --- modules.json | 2 +- modules/nf-core/samtools/view/environment.yml | 7 ++++++ modules/nf-core/samtools/view/main.nf | 5 +++-- modules/nf-core/samtools/view/meta.yml | 5 +++++ .../nf-core/samtools/view/samtools-view.diff | 22 +++++++++++++++++++ 5 files changed, 38 insertions(+), 3 deletions(-) create mode 100644 modules/nf-core/samtools/view/environment.yml create mode 100644 modules/nf-core/samtools/view/samtools-view.diff diff --git a/modules.json b/modules.json index 6ef43e2..bc30640 100644 --- a/modules.json +++ b/modules.json @@ -88,7 +88,7 @@ }, "samtools/view": { "branch": "master", - "git_sha": "3ffae3598260a99e8db3207dead9f73f87f90d1f", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"], "patch": "modules/nf-core/samtools/view/samtools-view.diff" }, diff --git a/modules/nf-core/samtools/view/environment.yml b/modules/nf-core/samtools/view/environment.yml new file mode 100644 index 0000000..141e7bd --- /dev/null +++ b/modules/nf-core/samtools/view/environment.yml @@ -0,0 
+1,7 @@ +name: samtools_view +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/view/main.nf b/modules/nf-core/samtools/view/main.nf index cb91fac..613c6e7 100644 --- a/modules/nf-core/samtools/view/main.nf +++ b/modules/nf-core/samtools/view/main.nf @@ -2,7 +2,7 @@ process SAMTOOLS_VIEW { tag "$meta.id" label 'process_low' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : 'biocontainers/samtools:1.17--h00cdaf9_0' }" @@ -19,6 +19,7 @@ process SAMTOOLS_VIEW { tuple val(meta), path("*.bai"), emit: bai, optional: true tuple val(meta), path("*.csi"), emit: csi, optional: true tuple val(meta), path("*.crai"), emit: crai, optional: true + tuple val(meta), path("*.unoutput"), emit: unoutput, optional: true path "versions.yml", emit: versions when: @@ -29,7 +30,7 @@ process SAMTOOLS_VIEW { def args2 = task.ext.args2 ?: '' def prefix = task.ext.prefix ?: "${meta.id}" def reference = fasta ? "--reference ${fasta}" : "" - def readnames = qname ? "--qname-file ${qname}": "" + def readnames = qname ? "--qname-file ${qname} --unoutput ${prefix}.unoutput": "" def file_type = args.contains("--output-fmt sam") ? "sam" : args.contains("--output-fmt bam") ? "bam" : args.contains("--output-fmt cram") ? 
"cram" : diff --git a/modules/nf-core/samtools/view/meta.yml b/modules/nf-core/samtools/view/meta.yml index 3b05450..3dadafa 100644 --- a/modules/nf-core/samtools/view/meta.yml +++ b/modules/nf-core/samtools/view/meta.yml @@ -82,3 +82,8 @@ authors: - "@joseespinosa" - "@FriederikeHanssen" - "@priyanka-surana" +maintainers: + - "@drpatelh" + - "@joseespinosa" + - "@FriederikeHanssen" + - "@priyanka-surana" diff --git a/modules/nf-core/samtools/view/samtools-view.diff b/modules/nf-core/samtools/view/samtools-view.diff new file mode 100644 index 0000000..1fa860a --- /dev/null +++ b/modules/nf-core/samtools/view/samtools-view.diff @@ -0,0 +1,22 @@ +Changes in module 'nf-core/samtools/view' +--- modules/nf-core/samtools/view/main.nf ++++ modules/nf-core/samtools/view/main.nf +@@ -19,6 +19,7 @@ + tuple val(meta), path("*.bai"), emit: bai, optional: true + tuple val(meta), path("*.csi"), emit: csi, optional: true + tuple val(meta), path("*.crai"), emit: crai, optional: true ++ tuple val(meta), path("*.unoutput"), emit: unoutput, optional: true + path "versions.yml", emit: versions + + when: +@@ -29,7 +30,7 @@ + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? "--reference ${fasta}" : "" +- def readnames = qname ? "--qname-file ${qname}": "" ++ def readnames = qname ? "--qname-file ${qname} --unoutput ${prefix}.unoutput": "" + def file_type = args.contains("--output-fmt sam") ? "sam" : + args.contains("--output-fmt bam") ? "bam" : + args.contains("--output-fmt cram") ? 
"cram" : + +************************************************************ From b4f582dc924b07de09ab2bf0421fd82e075b47c2 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Mon, 27 Nov 2023 10:08:56 +0000 Subject: [PATCH 04/33] nf-core modules update vcftools --- modules.json | 2 +- modules/nf-core/vcftools/environment.yml | 1 + modules/nf-core/vcftools/main.nf | 1 - modules/nf-core/vcftools/vcftools.diff | 33 ------------------------ 4 files changed, 2 insertions(+), 35 deletions(-) delete mode 100644 modules/nf-core/vcftools/vcftools.diff diff --git a/modules.json b/modules.json index bc30640..e7bcfa1 100644 --- a/modules.json +++ b/modules.json @@ -99,7 +99,7 @@ }, "vcftools": { "branch": "master", - "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", + "git_sha": "485558b40040fc3ace093d9084210125d8ba4c97", "installed_by": ["modules"], "patch": "modules/nf-core/vcftools/vcftools.diff" } diff --git a/modules/nf-core/vcftools/environment.yml b/modules/nf-core/vcftools/environment.yml index 875817e..503449e 100644 --- a/modules/nf-core/vcftools/environment.yml +++ b/modules/nf-core/vcftools/environment.yml @@ -1,3 +1,4 @@ +name: vcftools channels: - conda-forge - bioconda diff --git a/modules/nf-core/vcftools/main.nf b/modules/nf-core/vcftools/main.nf index b36c429..0e61955 100644 --- a/modules/nf-core/vcftools/main.nf +++ b/modules/nf-core/vcftools/main.nf @@ -94,7 +94,6 @@ process VCFTOOLS { (args.contains('--hapcount')) ? "--hapcount ${bed}" : (args.contains('--positions')) ? "--positions ${bed}" : (args.contains('--exclude-positions')) ? 
"--exclude-positions ${bed}" : '' - args_list.removeIf { it.contains('--bed') } args_list.removeIf { it.contains('--exclude-bed') } args_list.removeIf { it.contains('--hapcount') } diff --git a/modules/nf-core/vcftools/vcftools.diff b/modules/nf-core/vcftools/vcftools.diff deleted file mode 100644 index fd2e9ec..0000000 --- a/modules/nf-core/vcftools/vcftools.diff +++ /dev/null @@ -1,33 +0,0 @@ -Changes in module 'nf-core/vcftools' ---- modules/nf-core/vcftools/meta.yml -+++ modules/nf-core/vcftools/meta.yml -@@ -1,6 +1,7 @@ - name: vcftools - description: A set of tools written in Perl and C++ for working with VCF files - keywords: -+ - VCFtools - - VCF - - sort - tools: - ---- modules/nf-core/vcftools/main.nf -+++ modules/nf-core/vcftools/main.nf -@@ -91,10 +91,15 @@ - - def bed_arg = (args.contains('--bed')) ? "--bed ${bed}" : - (args.contains('--exclude-bed')) ? "--exclude-bed ${bed}" : -- (args.contains('--hapcount')) ? "--hapcount ${bed}" : '' -+ (args.contains('--hapcount')) ? "--hapcount ${bed}" : -+ (args.contains('--positions')) ? "--positions ${bed}" : -+ (args.contains('--exclude-positions')) ? "--exclude-positions ${bed}" : '' -+ - args_list.removeIf { it.contains('--bed') } - args_list.removeIf { it.contains('--exclude-bed') } - args_list.removeIf { it.contains('--hapcount') } -+ args_list.removeIf { it.contains('--positions') } -+ args_list.removeIf { it.contains('--exclude-positions') } - - def diff_variant_arg = (args.contains('--diff')) ? "--diff ${diff_variant_file}" : - (args.contains('--gzdiff')) ? 
"--gzdiff ${diff_variant_file}" : - -************************************************************ From f2cefafc13a71f71a679431a478916b5efc4d8c6 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Mon, 27 Nov 2023 16:07:12 +0000 Subject: [PATCH 05/33] update the way calling module or subworkflow --- bin/pacbio_filter.sh | 6 ++++ conf/modules.config | 46 +++++++++++++++++++++++++++++ subworkflows/local/filter_pacbio.nf | 7 +++-- workflows/variantcalling.nf | 23 +++++++-------- 4 files changed, 67 insertions(+), 15 deletions(-) create mode 100755 bin/pacbio_filter.sh diff --git a/bin/pacbio_filter.sh b/bin/pacbio_filter.sh new file mode 100755 index 0000000..73d7caa --- /dev/null +++ b/bin/pacbio_filter.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +input=$1 +output=$2 + +grep -v 'MG551957' $input | awk -v OFS='\t' '{if (($2 ~ /NGB00972/ && $3 >= 97 && $4 >= 44) || ($2 ~ /NGB00973/ && $3 >= 97 && $4 >= 34) || ($2 ~ /^bc/ && $3 >= 99 && $4 >= 16)) print $1}' | sort -u > $output diff --git a/conf/modules.config b/conf/modules.config index cb326e7..c8bbc2f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -20,6 +20,52 @@ process { ] } + withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_CONVERT' { + ext.args = "-be '[rq]>=0.99' -x fi -x fp -x ri -x rp --write-index" + } + + withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_FILTER' { + ext.prefix = { "${meta.id}.filter" } + } + + withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_COLLATE' { + ext.prefix = { "${meta.id}.collate" } + } + + withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:BLAST_BLASTN' { + ext.args = '-task blastn -reward 1 -penalty -5 -gapopen 3 -gapextend 3 -dust yes -soft_masking true -evalue .01 -searchsp 1750000000000 -outfmt 6' + } + + withName: '.*:.*:ALIGN_PACBIO:MINIMAP2_ALIGN' { + ext.args = '-ax map-hifi --cs=short' + } + + withName: '.*:.*:ALIGN_PACBIO:SAMTOOLS_MERGE' { + ext.args = { "-c -p" } + ext.prefix = { "${meta.id}.merge" } + } + + withName: '.*:.*:ALIGN_PACBIO:SAMTOOLS_SORT' { + ext.prefix = { 
"${meta.id}.sort" } + } + + withName: '.*:CONVERT_STATS:SAMTOOLS_VIEW' { + ext.prefix = { "${fasta.baseName}.${meta.datatype}.${meta.id}" } + ext.args = '--output-fmt cram --write-index' + } + + withName: '.*:CONVERT_STATS:SAMTOOLS_STATS' { + ext.prefix = { "${input.baseName}" } + } + + withName: '.*:CONVERT_STATS:SAMTOOLS_FLAGSTAT' { + ext.prefix = { "${bam.baseName}" } + } + + withName: '.*:CONVERT_STATS:SAMTOOLS_IDXSTATS' { + ext.prefix = { "${bam.baseName}" } + } + withName: '.*:INPUT_FILTER_SPLIT:SAMTOOLS_VIEW' { ext.args = '--output-fmt cram --write-index -F 0x900' ext.prefix = { "${meta.id}_filtered" } diff --git a/subworkflows/local/filter_pacbio.nf b/subworkflows/local/filter_pacbio.nf index 6078d4d..2e306bf 100644 --- a/subworkflows/local/filter_pacbio.nf +++ b/subworkflows/local/filter_pacbio.nf @@ -28,7 +28,7 @@ workflow FILTER_PACBIO { | map { meta, bam -> [ meta, bam, [] ] } | set { ch_pacbio } - SAMTOOLS_CONVERT ( ch_pacbio, [], [] ) + SAMTOOLS_CONVERT (ch_pacbio, [ [], [] ], [] ) ch_versions = ch_versions.mix ( SAMTOOLS_CONVERT.out.versions.first() ) @@ -48,7 +48,8 @@ workflow FILTER_PACBIO { // Nucleotide BLAST - BLAST_BLASTN ( GUNZIP.out.gunzip, db ) + db.map{db -> [ [], db]}.set{ch_db} + BLAST_BLASTN ( GUNZIP.out.gunzip, ch_db ) ch_versions = ch_versions.mix ( BLAST_BLASTN.out.versions.first() ) @@ -62,7 +63,7 @@ workflow FILTER_PACBIO { | join ( SAMTOOLS_CONVERT.out.csi ) | set { ch_reads } - SAMTOOLS_FILTER ( ch_reads, [], PACBIO_FILTER.out.list ) + SAMTOOLS_FILTER ( ch_reads, [ [], [] ], PACBIO_FILTER.out.list ) ch_versions = ch_versions.mix ( SAMTOOLS_FILTER.out.versions.first() ) diff --git a/workflows/variantcalling.nf b/workflows/variantcalling.nf index 54f442f..b200086 100644 --- a/workflows/variantcalling.nf +++ b/workflows/variantcalling.nf @@ -44,12 +44,6 @@ if ( (params.include_positions) && (params.exclude_positions) ){ ch_positions = [] } -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - 
CONFIG FILES -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT LOCAL MODULES/SUBWORKFLOWS @@ -66,7 +60,6 @@ include { INPUT_FILTER_SPLIT } from '../subworkflows/local/input_filter_split' include { DEEPVARIANT_CALLER } from '../subworkflows/local/deepvariant_caller' include { PROCESS_VCF } from '../subworkflows/local/process_vcf' - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT NF-CORE MODULES/SUBWORKFLOWS @@ -122,6 +115,7 @@ workflow VARIANTCALLING { ch_index = ch_fai } + // // SUBWORKFLOW: Read in samplesheet, validate and stage input files // @@ -130,6 +124,7 @@ workflow VARIANTCALLING { ) ch_versions = ch_versions.mix( INPUT_CHECK.out.versions ) + // // SUBWORKFLOW: align reads if required // @@ -146,11 +141,12 @@ workflow VARIANTCALLING { | set { ch_vector_db } } - ALIGN_PACBIO ( - ch_fasta, - INPUT_CHECK.out.reads, - ch_vector_db - ) + ch_fasta.map{ fasta -> [[], fasta]}.set{fasta_meta} + ALIGN_PACBIO ( + fasta_meta, + INPUT_CHECK.out.reads, + ch_vector_db + ) ch_versions = ch_versions.mix( ALIGN_PACBIO.out.versions ) } @@ -177,6 +173,7 @@ workflow VARIANTCALLING { ) ch_versions = ch_versions.mix( INPUT_FILTER_SPLIT.out.versions ) + // // SUBWORKFLOW: call deepvariant // @@ -185,6 +182,7 @@ workflow VARIANTCALLING { ) ch_versions = ch_versions.mix( DEEPVARIANT_CALLER.out.versions ) + // // convert VCF channel meta id // @@ -198,6 +196,7 @@ workflow VARIANTCALLING { PROCESS_VCF( vcf, ch_positions ) ch_versions = ch_versions.mix( PROCESS_VCF.out.versions ) + // // MODULE: Combine different version together // From 27e5d9e318259ddef15b8fe5bd359fbfec7a915f Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Mon, 27 Nov 2023 16:07:41 +0000 Subject: [PATCH 06/33] add a new test_align profile --- assets/samplesheet_test_align.csv | 2 ++ conf/test_align.config | 27 
+++++++++++++++++++++++++++ nextflow.config | 1 + 3 files changed, 30 insertions(+) create mode 100644 assets/samplesheet_test_align.csv create mode 100644 conf/test_align.config diff --git a/assets/samplesheet_test_align.csv b/assets/samplesheet_test_align.csv new file mode 100644 index 0000000..381e21d --- /dev/null +++ b/assets/samplesheet_test_align.csv @@ -0,0 +1,2 @@ +sample,datatype,datafile +mMelMel3,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/hic/35528_2%231.subset.cram \ No newline at end of file diff --git a/conf/test_align.config b/conf/test_align.config new file mode 100644 index 0000000..9680e67 --- /dev/null +++ b/conf/test_align.config @@ -0,0 +1,27 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run sanger-tol/variantcalling -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test align profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = "${projectDir}/assets/samplesheet_test_align.csv" + + // Fasta references + fasta = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta" +} diff --git a/nextflow.config b/nextflow.config index bcc78b0..93512e2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -169,6 +169,7 @@ profiles { executor.memory = 60.GB } test { includeConfig 'conf/test.config' } + test_align { includeConfig 'conf/test_align.config' } test_full { includeConfig 'conf/test_full.config' } } From 59892b0203a7d3836666a707217fcb7b4d0e8df7 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Mon, 27 Nov 2023 16:43:07 +0000 Subject: [PATCH 07/33] make samtools convert output as bam --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index c8bbc2f..29fb899 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -21,7 +21,7 @@ process { } withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_CONVERT' { - ext.args = "-be '[rq]>=0.99' -x fi -x fp -x ri -x rp --write-index" + ext.args = "-be '[rq]>=0.99' -x fi -x fp -x ri -x rp --write-index --output-fmt bam" } withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_FILTER' { From eef4d82093a870087847627f69bdb63a5b8b28c2 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Tue, 28 Nov 2023 09:46:30 +0000 Subject: [PATCH 08/33] use compressed fasta file --- conf/test_align.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/conf/test_align.config b/conf/test_align.config index 9680e67..746d4a8 100644 --- a/conf/test_align.config +++ b/conf/test_align.config @@ -23,5 +23,5 @@ params { input = "${projectDir}/assets/samplesheet_test_align.csv" // Fasta references - fasta = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta" + fasta = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz" } From 9347f34508247f87e2761fc530c373ae3eeadc07 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Tue, 28 Nov 2023 10:51:13 +0000 Subject: [PATCH 09/33] extra modules configs --- conf/modules.config | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 29fb899..f909984 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -24,10 +24,6 @@ process { ext.args = "-be '[rq]>=0.99' -x fi -x fp -x ri -x rp --write-index --output-fmt bam" } - withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_FILTER' { - ext.prefix = { "${meta.id}.filter" } - } - withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_COLLATE' { ext.prefix = { "${meta.id}.collate" } } @@ -36,8 +32,16 @@ process { ext.args = '-task blastn -reward 1 -penalty -5 -gapopen 3 -gapextend 3 -dust yes -soft_masking true -evalue .01 -searchsp 1750000000000 -outfmt 6' } + withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_FILTER' { + ext.prefix = { "${meta.id}.filter" } + } + + withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_FASTQ' { + ext.args = '-F 0x200 -nt' + } + withName: '.*:.*:ALIGN_PACBIO:MINIMAP2_ALIGN' { - ext.args = '-ax map-hifi --cs=short' + ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group}" } } withName: '.*:.*:ALIGN_PACBIO:SAMTOOLS_MERGE' { From 2d70af5fed6ff976350cef3e58e43cad3147cb8e Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Tue, 28 Nov 2023 18:01:39 +0000 Subject: [PATCH 10/33] remove -b flag 
for samtools view --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index f909984..a92a73d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -21,7 +21,7 @@ process { } withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_CONVERT' { - ext.args = "-be '[rq]>=0.99' -x fi -x fp -x ri -x rp --write-index --output-fmt bam" + ext.args = "-e '[rq]>=0.99' -x fi -x fp -x ri -x rp --write-index --output-fmt bam" } withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_COLLATE' { From f7d5faf742155f3d866aa7b62dd557ad67f1e171 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Wed, 29 Nov 2023 11:02:47 +0000 Subject: [PATCH 11/33] add RG group to meta data in sample checking --- subworkflows/local/deepvariant_caller.nf | 6 ++--- subworkflows/local/input_check.nf | 30 ++++++++++++++++++++---- subworkflows/local/input_merge.nf | 2 +- 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/subworkflows/local/deepvariant_caller.nf b/subworkflows/local/deepvariant_caller.nf index 10b5359..31b42e0 100644 --- a/subworkflows/local/deepvariant_caller.nf +++ b/subworkflows/local/deepvariant_caller.nf @@ -14,7 +14,7 @@ workflow DEEPVARIANT_CALLER { ch_versions = Channel.empty() reads_fasta.map { meta, cram, crai, interval, fasta_file_name, fasta, fai -> - [ [ id: meta.id + "_" + fasta_file_name, sample: meta.id, type: meta.type ], + [ [ id: meta.id + "_" + fasta_file_name, sample: meta.id, type: meta.datatype ], cram, crai, interval @@ -23,14 +23,14 @@ workflow DEEPVARIANT_CALLER { // fasta fasta = reads_fasta.map { meta, cram, crai, interval, fasta_file_name, fasta, fai -> - [ [ id: meta.id + "_" + fasta_file_name, sample: meta.id, type: meta.type ], + [ [ id: meta.id + "_" + fasta_file_name, sample: meta.id, type: meta.datatype ], fasta ] } // fai fai = reads_fasta.map{ meta, cram, crai, interval, fasta_file_name, fasta, fai -> - [ [ id: meta.id + "_" + fasta_file_name, sample: meta.id, type: 
meta.type ], + [ [ id: meta.id + "_" + fasta_file_name, sample: meta.id, type: meta.datatype ], fai ] } diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 7e9f667..ec8a9f4 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -12,13 +12,33 @@ workflow INPUT_CHECK { SAMPLESHEET_CHECK ( samplesheet ) .csv .splitCsv ( header:true, sep:',' ) - .map { [ - [ id: it.sample, sample: it.sample, type: it.datatype ], - file(it.datafile) - ] } + .map { create_data_channel( it ) } .set { reads } - + emit: reads // channel: [ val(meta), data ] versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] } + +// Function to get list of [ meta, reads ] +def create_data_channel ( LinkedHashMap row ) { + // create meta map + def meta = [:] + meta.id = row.sample + meta.sample = row.sample + meta.datatype = row.datatype + + if ( meta.datatype == "pacbio" ) { + platform = "PACBIO" + } + meta.read_group = "\'@RG\\tID:" + row.datafile.split('/')[-1].split('\\.')[0] + "\\tPL:" + platform + "\\tSM:" + meta.sample + "\'" + + // add path(s) of the read file(s) to the meta map + def data_meta = [] + if ( !file(row.datafile).exists() ) { + exit 1, "ERROR: Please check input samplesheet -> Data file does not exist!\n${row.datafile}" + } else { + data_meta = [ meta, file(row.datafile) ] + } + return data_meta +} diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf index 90bb82f..c63c679 100644 --- a/subworkflows/local/input_merge.nf +++ b/subworkflows/local/input_merge.nf @@ -38,7 +38,7 @@ workflow INPUT_MERGE { .join( grouped_reads ) .map { sample, meta, bam_cram_list -> [ [ id: ( bam_cram_list.size() == 1 ) ? 
sample : sample + '_combined', - type: meta.type + type: meta.datatype ], bam_cram_list ]} From ce47ad9c0cb61b11133fc01476c66b2f79cd9c6b Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Wed, 29 Nov 2023 11:38:30 +0000 Subject: [PATCH 12/33] update channel with meta --- subworkflows/local/align_pacbio.nf | 11 ++++------- subworkflows/local/convert_stats.nf | 5 ++--- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf index 767563b..5c31f34 100644 --- a/subworkflows/local/align_pacbio.nf +++ b/subworkflows/local/align_pacbio.nf @@ -26,23 +26,20 @@ workflow ALIGN_PACBIO { // Align Fastq to Genome - fasta - | map { meta, file -> file } - | set { ch_fasta } - MINIMAP2_ALIGN ( FILTER_PACBIO.out.fastq, ch_fasta, true, false, false ) + MINIMAP2_ALIGN ( FILTER_PACBIO.out.fastq, fasta, true, false, false ) ch_versions = ch_versions.mix ( MINIMAP2_ALIGN.out.versions.first() ) // Collect all alignment output by sample name MINIMAP2_ALIGN.out.bam - | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], bam] } + | map { meta, bam -> [['id': meta.id, 'datatype': meta.datatype], bam] } | groupTuple ( by: [0] ) | set { ch_bams } // Merge - SAMTOOLS_MERGE ( ch_bams, [], [] ) + SAMTOOLS_MERGE ( ch_bams, [ [], [] ], [ [], [] ] ) ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) @@ -56,7 +53,7 @@ workflow ALIGN_PACBIO { | map { meta, bam -> [ meta, bam, [] ] } | set { ch_sort } - CONVERT_STATS ( ch_sort, ch_fasta ) + CONVERT_STATS ( ch_sort, fasta ) ch_versions = ch_versions.mix ( CONVERT_STATS.out.versions ) diff --git a/subworkflows/local/convert_stats.nf b/subworkflows/local/convert_stats.nf index 7c381fd..9118e8d 100644 --- a/subworkflows/local/convert_stats.nf +++ b/subworkflows/local/convert_stats.nf @@ -11,15 +11,14 @@ include { SAMTOOLS_IDXSTATS } from '../../modules/nf-core/samtools/idxstats/main workflow CONVERT_STATS { take: bam // 
channel: [ val(meta), /path/to/bam, /path/to/bai] - fasta // channel: /path/to/fasta + fasta // channel: [ val(meta), /path/to/fasta ] main: ch_versions = Channel.empty() - // Convert BAM to CRAM - SAMTOOLS_VIEW ( bam, fasta, [] ) + SAMTOOLS_VIEW ( bam, fasta, [ ] ) ch_versions = ch_versions.mix ( SAMTOOLS_VIEW.out.versions.first() ) From 80e83c78c98dfd3a94ae4de6ee3561c1113ea339 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Wed, 29 Nov 2023 11:50:08 +0000 Subject: [PATCH 13/33] publish aligned cram files with stats --- conf/modules.config | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/conf/modules.config b/conf/modules.config index a92a73d..2ce358a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -70,6 +70,14 @@ process { ext.prefix = { "${bam.baseName}" } } + withName: '.*:ALIGN_PACBIO:CONVERT_STATS:.*' { + publishDir = [ + path: { "${params.outdir}/variant_calling" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: '.*:INPUT_FILTER_SPLIT:SAMTOOLS_VIEW' { ext.args = '--output-fmt cram --write-index -F 0x900' ext.prefix = { "${meta.id}_filtered" } From 8e2ccc8cf052df815951f253ede12b02ff1396d0 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Wed, 29 Nov 2023 13:33:37 +0000 Subject: [PATCH 14/33] Remove samtools sort after minimap_align and before merging. 
--- subworkflows/local/align_pacbio.nf | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf index 5c31f34..26c575a 100644 --- a/subworkflows/local/align_pacbio.nf +++ b/subworkflows/local/align_pacbio.nf @@ -26,30 +26,23 @@ workflow ALIGN_PACBIO { // Align Fastq to Genome - MINIMAP2_ALIGN ( FILTER_PACBIO.out.fastq, fasta, true, false, false ) ch_versions = ch_versions.mix ( MINIMAP2_ALIGN.out.versions.first() ) // Collect all alignment output by sample name MINIMAP2_ALIGN.out.bam - | map { meta, bam -> [['id': meta.id, 'datatype': meta.datatype], bam] } + | map { meta, bam -> [['id': meta.id, 'datatype': meta.datatype, 'sample': meta.sample ], bam] } | groupTuple ( by: [0] ) | set { ch_bams } - // Merge SAMTOOLS_MERGE ( ch_bams, [ [], [] ], [ [], [] ] ) ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) - // Position sort BAM file - SAMTOOLS_SORT ( SAMTOOLS_MERGE.out.bam ) - ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions.first() ) - - // Convert merged BAM to CRAM and calculate indices and statistics - SAMTOOLS_SORT.out.bam + SAMTOOLS_MERGE.out.bam | map { meta, bam -> [ meta, bam, [] ] } | set { ch_sort } From 2815232aee217faf7111449dd8cfc2070797b0e0 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Wed, 29 Nov 2023 14:39:07 +0000 Subject: [PATCH 15/33] add conditions pacbio align modules configs --- conf/modules.config | 91 +++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 44 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 2ce358a..a2f44ea 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -20,62 +20,65 @@ process { ] } - withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_CONVERT' { - ext.args = "-e '[rq]>=0.99' -x fi -x fp -x ri -x rp --write-index --output-fmt bam" - } + if( params.align ) { - withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_COLLATE' { - ext.prefix = { 
"${meta.id}.collate" } - } + withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_CONVERT' { + ext.args = "-e '[rq]>=0.99' -x fi -x fp -x ri -x rp --write-index --output-fmt bam" + } - withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:BLAST_BLASTN' { - ext.args = '-task blastn -reward 1 -penalty -5 -gapopen 3 -gapextend 3 -dust yes -soft_masking true -evalue .01 -searchsp 1750000000000 -outfmt 6' - } + withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_COLLATE' { + ext.prefix = { "${meta.id}.collate" } + } - withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_FILTER' { - ext.prefix = { "${meta.id}.filter" } - } + withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:BLAST_BLASTN' { + ext.args = '-task blastn -reward 1 -penalty -5 -gapopen 3 -gapextend 3 -dust yes -soft_masking true -evalue .01 -searchsp 1750000000000 -outfmt 6' + } - withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_FASTQ' { - ext.args = '-F 0x200 -nt' - } + withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_FILTER' { + ext.prefix = { "${meta.id}.filter" } + } - withName: '.*:.*:ALIGN_PACBIO:MINIMAP2_ALIGN' { - ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group}" } - } + withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_FASTQ' { + ext.args = '-F 0x200 -nt' + } - withName: '.*:.*:ALIGN_PACBIO:SAMTOOLS_MERGE' { - ext.args = { "-c -p" } - ext.prefix = { "${meta.id}.merge" } - } + withName: '.*:.*:ALIGN_PACBIO:MINIMAP2_ALIGN' { + ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group}" } + } - withName: '.*:.*:ALIGN_PACBIO:SAMTOOLS_SORT' { - ext.prefix = { "${meta.id}.sort" } - } + withName: '.*:.*:ALIGN_PACBIO:SAMTOOLS_MERGE' { + ext.args = { "-c -p" } + ext.prefix = { "${meta.id}.merge" } + } - withName: '.*:CONVERT_STATS:SAMTOOLS_VIEW' { - ext.prefix = { "${fasta.baseName}.${meta.datatype}.${meta.id}" } - ext.args = '--output-fmt cram --write-index' - } + withName: '.*:.*:ALIGN_PACBIO:SAMTOOLS_SORT' { + ext.prefix = { "${meta.id}.sort" } + } - withName: '.*:CONVERT_STATS:SAMTOOLS_STATS' { - ext.prefix = { 
"${input.baseName}" } - } + withName: '.*:CONVERT_STATS:SAMTOOLS_VIEW' { + ext.prefix = { "${fasta.baseName}.${meta.datatype}.${meta.id}" } + ext.args = '--output-fmt cram --write-index' + } - withName: '.*:CONVERT_STATS:SAMTOOLS_FLAGSTAT' { - ext.prefix = { "${bam.baseName}" } - } + withName: '.*:CONVERT_STATS:SAMTOOLS_STATS' { + ext.prefix = { "${input.baseName}" } + } - withName: '.*:CONVERT_STATS:SAMTOOLS_IDXSTATS' { - ext.prefix = { "${bam.baseName}" } - } + withName: '.*:CONVERT_STATS:SAMTOOLS_FLAGSTAT' { + ext.prefix = { "${bam.baseName}" } + } - withName: '.*:ALIGN_PACBIO:CONVERT_STATS:.*' { - publishDir = [ - path: { "${params.outdir}/variant_calling" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] + withName: '.*:CONVERT_STATS:SAMTOOLS_IDXSTATS' { + ext.prefix = { "${bam.baseName}" } + } + + withName: '.*:ALIGN_PACBIO:CONVERT_STATS:.*' { + publishDir = [ + path: { "${params.outdir}/variant_calling" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } } withName: '.*:INPUT_FILTER_SPLIT:SAMTOOLS_VIEW' { From 7125e18e39271c92f9d7f679e69f0b5d4439f1b5 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Wed, 29 Nov 2023 14:40:01 +0000 Subject: [PATCH 16/33] pass the aligned reads to variant calling --- workflows/variantcalling.nf | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/workflows/variantcalling.nf b/workflows/variantcalling.nf index b200086..714cdc3 100644 --- a/workflows/variantcalling.nf +++ b/workflows/variantcalling.nf @@ -141,15 +141,20 @@ workflow VARIANTCALLING { | set { ch_vector_db } } - ch_fasta.map{ fasta -> [[], fasta]}.set{fasta_meta} + ch_fasta.map{ fasta -> [[], fasta] }.set{ fasta_meta } ALIGN_PACBIO ( fasta_meta, INPUT_CHECK.out.reads, ch_vector_db ) ch_versions = ch_versions.mix( ALIGN_PACBIO.out.versions ) - } + ch_aligned_reads = ALIGN_PACBIO.out.cram + + } else { + + ch_aligned_reads = INPUT_CHECK.out.reads + } // // SUBWORKFLOW: merge the input reads by sample name @@ -157,7 +162,7 @@ workflow VARIANTCALLING { INPUT_MERGE ( ch_fasta, ch_index, - INPUT_CHECK.out.reads, + ch_aligned_reads, ) ch_versions = ch_versions.mix( INPUT_MERGE.out.versions ) From adcda19b2a4835d0a6e26f4e3e3d15946727257b Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Wed, 29 Nov 2023 16:18:55 +0000 Subject: [PATCH 17/33] remove unused module config --- assets/samplesheet_test_align.csv | 2 +- conf/modules.config | 4 ---- conf/test_align.config | 2 +- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/assets/samplesheet_test_align.csv b/assets/samplesheet_test_align.csv index 381e21d..a2bae25 100644 --- a/assets/samplesheet_test_align.csv +++ b/assets/samplesheet_test_align.csv @@ -1,2 +1,2 @@ sample,datatype,datafile -mMelMel3,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/hic/35528_2%231.subset.cram \ No newline at end of file 
+icCanRufa1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/genomic_data/icCanRufa1/pacbio/m64094_200730_174533.ccs.bc1010_BAK8A_OA--bc1010_BAK8A_OA_0_03.bam diff --git a/conf/modules.config b/conf/modules.config index a2f44ea..7a0528e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -51,10 +51,6 @@ process { ext.prefix = { "${meta.id}.merge" } } - withName: '.*:.*:ALIGN_PACBIO:SAMTOOLS_SORT' { - ext.prefix = { "${meta.id}.sort" } - } - withName: '.*:CONVERT_STATS:SAMTOOLS_VIEW' { ext.prefix = { "${fasta.baseName}.${meta.datatype}.${meta.id}" } ext.args = '--output-fmt cram --write-index' diff --git a/conf/test_align.config b/conf/test_align.config index 746d4a8..6284ff5 100644 --- a/conf/test_align.config +++ b/conf/test_align.config @@ -23,5 +23,5 @@ params { input = "${projectDir}/assets/samplesheet_test_align.csv" // Fasta references - fasta = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz" + fasta = "https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz" } From 6b545450ec8d0352d4e922516de3809652c638a7 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Wed, 29 Nov 2023 17:17:49 +0000 Subject: [PATCH 18/33] no need to sort merge after alignment --- conf/modules.config | 9 +++++---- subworkflows/local/input_merge.nf | 2 +- workflows/variantcalling.nf | 29 +++++++++++++++-------------- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 7a0528e..e963a11 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -75,6 +75,11 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] } + } else{ + + withName: '.*:INPUT_MERGE:SAMTOOLS_MERGE' { + ext.args = '--write-index' + } } withName: '.*:INPUT_FILTER_SPLIT:SAMTOOLS_VIEW' { @@ -82,10 +87,6 @@ process { ext.prefix = { "${meta.id}_filtered" } } - withName: '.*:INPUT_MERGE:SAMTOOLS_MERGE' { - ext.args = '--write-index' - } - withName: '.*:DEEPVARIANT_CALLER:DEEPVARIANT' { ext.args = '--model_type=PACBIO' } diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf index c63c679..bbbec3e 100644 --- a/subworkflows/local/input_merge.nf +++ b/subworkflows/local/input_merge.nf @@ -38,7 +38,7 @@ workflow INPUT_MERGE { .join( grouped_reads ) .map { sample, meta, bam_cram_list -> [ [ id: ( bam_cram_list.size() == 1 ) ? sample : sample + '_combined', - type: meta.datatype + datatype: meta.datatype ], bam_cram_list ]} diff --git a/workflows/variantcalling.nf b/workflows/variantcalling.nf index 714cdc3..50970c5 100644 --- a/workflows/variantcalling.nf +++ b/workflows/variantcalling.nf @@ -149,30 +149,31 @@ workflow VARIANTCALLING { ) ch_versions = ch_versions.mix( ALIGN_PACBIO.out.versions ) - ch_aligned_reads = ALIGN_PACBIO.out.cram + ALIGN_PACBIO.out.cram + .join( ALIGN_PACBIO.out.crai ) + .set{ ch_aligned_reads } } else { - ch_aligned_reads = INPUT_CHECK.out.reads - } - - // - // SUBWORKFLOW: merge the input reads by sample name - // - INPUT_MERGE ( - ch_fasta, - ch_index, - ch_aligned_reads, - ) - ch_versions = ch_versions.mix( INPUT_MERGE.out.versions ) + // + // SUBWORKFLOW: merge the input reads by sample name + // + INPUT_MERGE ( + ch_fasta, + ch_index, + INPUT_CHECK.out.reads, + ) + ch_versions = ch_versions.mix( INPUT_MERGE.out.versions ) + ch_aligned_reads = INPUT_MERGE.out.indexed_merged_reads + } // // SUBWORKFLOW: split the input fasta file and filter input reads // INPUT_FILTER_SPLIT ( ch_fasta, - INPUT_MERGE.out.indexed_merged_reads, + ch_aligned_reads, ch_interval, split_fasta_cutoff ) From e03a4ee5f38a25ff8ee20a32e327b5491d125f6f Mon Sep 17 
00:00:00 2001 From: Guoying Qi Date: Wed, 29 Nov 2023 17:48:35 +0000 Subject: [PATCH 19/33] add two test sample files for alignments --- assets/samplesheet_test_align.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/assets/samplesheet_test_align.csv b/assets/samplesheet_test_align.csv index a2bae25..b7f429e 100644 --- a/assets/samplesheet_test_align.csv +++ b/assets/samplesheet_test_align.csv @@ -1,2 +1,3 @@ sample,datatype,datafile icCanRufa1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/genomic_data/icCanRufa1/pacbio/m64094_200730_174533.ccs.bc1010_BAK8A_OA--bc1010_BAK8A_OA_0_03.bam +icCanRufa1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/genomic_data/icCanRufa1/pacbio/m64094_200730_174533.ccs.bc1010_BAK8A_OA--bc1010_BAK8A_OA_0_02.bam From 1540017f25dd8e590281518c3825514bde316de4 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Thu, 30 Nov 2023 11:32:52 +0000 Subject: [PATCH 20/33] convert fasta channel to value channel --- subworkflows/local/align_pacbio.nf | 1 + subworkflows/local/input_merge.nf | 11 ++++----- workflows/variantcalling.nf | 37 ++++++++++++++---------------- 3 files changed, 22 insertions(+), 27 deletions(-) diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf index 26c575a..12a8255 100644 --- a/subworkflows/local/align_pacbio.nf +++ b/subworkflows/local/align_pacbio.nf @@ -36,6 +36,7 @@ workflow ALIGN_PACBIO { | groupTuple ( by: [0] ) | set { ch_bams } + // Merge SAMTOOLS_MERGE ( ch_bams, [ [], [] ], [ [], [] ] ) ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf index bbbec3e..4df8957 100644 --- a/subworkflows/local/input_merge.nf +++ b/subworkflows/local/input_merge.nf @@ -7,8 +7,8 @@ include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort' workflow INPUT_MERGE { take: - fasta // file: /path/to/genome.fasta or /path/to/genome.fasta.gz - fai // file: 
/path/to/genome.*.fai or /path/to/genome.fasta.gz.gzi + fasta // channel: [ val(meta), /path/to/genome.fasta or /path/to/genome.fasta.gz ] + fai // channel: [ val(meta), /path/to/genome.*.fai or /path/to/genome.fasta.gz.gzi ] reads // channel: [ val(meta), data ] main: @@ -45,12 +45,9 @@ workflow INPUT_MERGE { .set { grouped_reads_with_meta } // call samtool merge - ch_fasta = fasta.map { fasta -> [ [ 'id': fasta.baseName ], fasta ] }.first() - ch_fai = fai.map { fai -> [ [ 'id': fai.baseName ], fai ] }.first() - SAMTOOLS_MERGE( grouped_reads_with_meta, - ch_fasta, - ch_fai + fasta, + fai ) ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions ) diff --git a/workflows/variantcalling.nf b/workflows/variantcalling.nf index 50970c5..ef6cde9 100644 --- a/workflows/variantcalling.nf +++ b/workflows/variantcalling.nf @@ -84,38 +84,32 @@ include { UNTAR } from '../modules/nf-core/untar/main' workflow VARIANTCALLING { ch_versions = Channel.empty() + ch_fasta + .map { fasta -> [ [ 'id': fasta.baseName ], fasta ] } + .first() + .set { ch_genome } // // check reference fasta index given or not // if( params.fai == null ){ - ch_fasta - .map { fasta -> [ [ 'id': fasta.baseName ], fasta ] } - .set { ch_genome } - SAMTOOLS_FAIDX ( ch_genome, [[], []] ) ch_versions = ch_versions.mix( SAMTOOLS_FAIDX.out.versions ) - SAMTOOLS_FAIDX.out.fai - .map{ mata, fai -> fai } - .set{ ch_fai } - - SAMTOOLS_FAIDX.out.gzi - .map{ meta, gzi -> gzi } - .set{ ch_gzi } - if( params.fasta.endsWith('.gz') ){ - ch_index = ch_gzi + ch_genome_index = SAMTOOLS_FAIDX.out.gzi }else{ - ch_index = ch_fai + ch_genome_index = SAMTOOLS_FAIDX.out.fai } }else{ - ch_index = ch_fai + ch_index + .map { fai -> [ [ 'id': fai.baseName ], fai ] } + .first() + .set { ch_genome_index } } - // // SUBWORKFLOW: Read in samplesheet, validate and stage input files // @@ -136,14 +130,17 @@ workflow VARIANTCALLING { | map { meta, file -> file } | set { ch_vector_db } ch_versions = ch_versions.mix ( UNTAR.out.versions ) + + 
} else { + Channel.fromPath ( params.vector_db ) | set { ch_vector_db } + } - ch_fasta.map{ fasta -> [[], fasta] }.set{ fasta_meta } ALIGN_PACBIO ( - fasta_meta, + ch_genome, INPUT_CHECK.out.reads, ch_vector_db ) @@ -159,8 +156,8 @@ workflow VARIANTCALLING { // SUBWORKFLOW: merge the input reads by sample name // INPUT_MERGE ( - ch_fasta, - ch_index, + ch_genome, + ch_genome_index, INPUT_CHECK.out.reads, ) ch_versions = ch_versions.mix( INPUT_MERGE.out.versions ) From 8463d43733dd750708e55652f4d868b9cc3fd57d Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Thu, 30 Nov 2023 11:58:50 +0000 Subject: [PATCH 21/33] put _T1 back to distinguish the same samples --- bin/check_samplesheet.py | 6 +++++- subworkflows/local/input_check.nf | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index d088e65..c91f170 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -108,7 +108,11 @@ def validate_unique_samples(self): """ if len(self._seen) != len(self.validated): raise AssertionError("The combination of sample name and data file must be unique.") - + seen = Counter() + for row in self.validated: + sample = row[self._sample_col] + seen[sample] += 1 + row[self._sample_col] = f"{sample}_T{seen[sample]}" def read_head(handle, num_lines=10): """Read the specified number of lines from the current position in the file.""" diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index ec8a9f4..b71f3fd 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -25,13 +25,13 @@ def create_data_channel ( LinkedHashMap row ) { // create meta map def meta = [:] meta.id = row.sample - meta.sample = row.sample + meta.sample = row.sample.split('_')[0..-2].join('_') meta.datatype = row.datatype if ( meta.datatype == "pacbio" ) { platform = "PACBIO" } - meta.read_group = "\'@RG\\tID:" + row.datafile.split('/')[-1].split('\\.')[0] + "\\tPL:" + platform + 
"\\tSM:" + meta.sample + "\'" + meta.read_group = "\'@RG\\tID:" + row.datafile.split('/')[-1].split('\\.')[0..-2].join('.') + "\\tPL:" + platform + "\\tSM:" + meta.sample + "\'" // add path(s) of the read file(s) to the meta map def data_meta = [] From 50df394e0ec07a98ff9ea6f37ab64a893b95dd6c Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Thu, 30 Nov 2023 12:15:55 +0000 Subject: [PATCH 22/33] add combined in the aligned bam/cram file name if sample being combined --- assets/samplesheet_test_align.csv | 1 + subworkflows/local/align_pacbio.nf | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/assets/samplesheet_test_align.csv b/assets/samplesheet_test_align.csv index b7f429e..4b5a9b2 100644 --- a/assets/samplesheet_test_align.csv +++ b/assets/samplesheet_test_align.csv @@ -1,3 +1,4 @@ sample,datatype,datafile icCanRufa1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/genomic_data/icCanRufa1/pacbio/m64094_200730_174533.ccs.bc1010_BAK8A_OA--bc1010_BAK8A_OA_0_03.bam icCanRufa1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/genomic_data/icCanRufa1/pacbio/m64094_200730_174533.ccs.bc1010_BAK8A_OA--bc1010_BAK8A_OA_0_02.bam +icCanRufa1XXXXX,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/genomic_data/icCanRufa1/pacbio/m64094_200730_174533.ccs.bc1010_BAK8A_OA--bc1010_BAK8A_OA_0_02.bam diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf index 12a8255..4c98f2c 100644 --- a/subworkflows/local/align_pacbio.nf +++ b/subworkflows/local/align_pacbio.nf @@ -32,10 +32,20 @@ workflow ALIGN_PACBIO { // Collect all alignment output by sample name MINIMAP2_ALIGN.out.bam - | map { meta, bam -> [['id': meta.id, 'datatype': meta.datatype, 'sample': meta.sample ], bam] } + | map { meta, bam -> [['id': meta.sample, 'datatype': meta.datatype, 'sample': meta.sample ], bam] } | groupTuple ( by: [0] ) + | map { meta, bam_list -> + [ + [ + 'id': ( bam_list.size() == 1 ) ? 
meta.sample : meta.sample + '_combined', + 'sample' : meta.sample, + 'datatype': meta.datatype + ], + bam_list + ] + } | set { ch_bams } - + // Merge SAMTOOLS_MERGE ( ch_bams, [ [], [] ], [ [], [] ] ) From add2bcb53877eea227dbc8e061ef2d2bd299dcd4 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Thu, 30 Nov 2023 12:26:31 +0000 Subject: [PATCH 23/33] add a full test with alignment --- assets/samplesheet_test_full_align.csv | 2 ++ conf/test_align.config | 6 +++--- conf/test_full_align.config | 25 +++++++++++++++++++++++++ nextflow.config | 7 ++++--- 4 files changed, 34 insertions(+), 6 deletions(-) create mode 100644 assets/samplesheet_test_full_align.csv create mode 100644 conf/test_full_align.config diff --git a/assets/samplesheet_test_full_align.csv b/assets/samplesheet_test_full_align.csv new file mode 100644 index 0000000..d5b0ac4 --- /dev/null +++ b/assets/samplesheet_test_full_align.csv @@ -0,0 +1,2 @@ +sample,datatype,datafile +ilPolIcar1,pacbio,/lustre/scratch124/tol/projects/darwin/data/insects/Polyommatus_icarus/genomic_data/ilPolIcar1/pacbio/m64016_191206_183623.ccs.bc1019_BAK8B_OA--bc1019_BAK8B_OA.bam diff --git a/conf/test_align.config b/conf/test_align.config index 6284ff5..8da6b65 100644 --- a/conf/test_align.config +++ b/conf/test_align.config @@ -5,14 +5,14 @@ Defines input files and everything required to run a fast and simple pipeline test. 
Use as follows: - nextflow run sanger-tol/variantcalling -profile test, --outdir + nextflow run sanger-tol/variantcalling -profile test_align, --outdir --align ---------------------------------------------------------------------------------------- */ params { - config_profile_name = 'Test align profile' - config_profile_description = 'Minimal test dataset to check pipeline function' + config_profile_name = 'Test profile with alignment' + config_profile_description = 'Minimal unaligned test dataset to check pipeline function' // Limit resources so that this can run on GitHub Actions max_cpus = 2 diff --git a/conf/test_full_align.config b/conf/test_full_align.config new file mode 100644 index 0000000..79b9fd7 --- /dev/null +++ b/conf/test_full_align.config @@ -0,0 +1,25 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running full-size tests with alignment +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a full size pipeline test. 
+ + Use as follows: + nextflow run sanger-tol/variantcalling -profile test_full_align, --outdir --align + +---------------------------------------------------------------------------------------- +*/ + +cleanup = true + +params { + config_profile_name = 'Full test profile with alignment' + config_profile_description = 'Full non-aligned test dataset to check pipeline function' + + // Input data for full size test + input = "${projectDir}/assets/samplesheet_test_full_align.csv" + + // Fasta references + fasta = "/lustre/scratch124/tol/projects/darwin/data/insects/Polyommatus_icarus/assembly/release/ilPolIcar1.1/insdc/GCA_937595015.1.fasta.gz" + +} diff --git a/nextflow.config b/nextflow.config index 93512e2..bb1f0f6 100644 --- a/nextflow.config +++ b/nextflow.config @@ -168,9 +168,10 @@ profiles { executor.cpus = 16 executor.memory = 60.GB } - test { includeConfig 'conf/test.config' } - test_align { includeConfig 'conf/test_align.config' } - test_full { includeConfig 'conf/test_full.config' } + test { includeConfig 'conf/test.config' } + test_align { includeConfig 'conf/test_align.config' } + test_full { includeConfig 'conf/test_full.config' } + test_full_align { includeConfig 'conf/test_full_align.config' } } From 4285d4c373bae0f69ba5907315aa88d8baf1c26e Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Thu, 30 Nov 2023 12:34:48 +0000 Subject: [PATCH 24/33] add one alignment test in the simple ci test --- .github/workflows/ci.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index dbc6e02..567ac13 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -41,3 +41,7 @@ jobs: # Remember that you can parallelise this by using strategy.matrix run: | nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results + + - name: Run pipeline with unaligned test data + run: | + nextflow run ${GITHUB_WORKSPACE} -profile test_align,docker --outdir ./results --align From 
ef82d4975e69d5da2be0a5e6a0efd730d1800736 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Thu, 30 Nov 2023 12:38:23 +0000 Subject: [PATCH 25/33] black check --- bin/check_samplesheet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index c91f170..3a6b9d7 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -114,6 +114,7 @@ def validate_unique_samples(self): seen[sample] += 1 row[self._sample_col] = f"{sample}_T{seen[sample]}" + def read_head(handle, num_lines=10): """Read the specified number of lines from the current position in the file.""" lines = [] From 6bf3a19daafbb6ee83cac92411384109359889f7 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Thu, 30 Nov 2023 13:16:13 +0000 Subject: [PATCH 26/33] EditorConfig linting --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index e963a11..22d84ce 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -67,7 +67,7 @@ process { withName: '.*:CONVERT_STATS:SAMTOOLS_IDXSTATS' { ext.prefix = { "${bam.baseName}" } } - + withName: '.*:ALIGN_PACBIO:CONVERT_STATS:.*' { publishDir = [ path: { "${params.outdir}/variant_calling" }, From 006eced3bdb8450099226a67ef529f78ac78082b Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Thu, 30 Nov 2023 13:25:22 +0000 Subject: [PATCH 27/33] make the sanger farm test with alignment --- .github/workflows/sanger_test.yml | 3 ++- .github/workflows/sanger_test_full.yml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/sanger_test.yml b/.github/workflows/sanger_test.yml index 28f7625..a4e5426 100644 --- a/.github/workflows/sanger_test.yml +++ b/.github/workflows/sanger_test.yml @@ -19,8 +19,9 @@ jobs: parameters: | { "outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ github.sha }}", + "align": true } - profiles: test,sanger,singularity,cleanup + profiles: 
test_align,sanger,singularity,cleanup - uses: actions/upload-artifact@v3 with: diff --git a/.github/workflows/sanger_test_full.yml b/.github/workflows/sanger_test_full.yml index e028c6b..a552c63 100644 --- a/.github/workflows/sanger_test_full.yml +++ b/.github/workflows/sanger_test_full.yml @@ -32,8 +32,9 @@ jobs: parameters: | { "outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ env.REVISION }}", + "align": true, } - profiles: test_full,sanger,singularity,cleanup + profiles: test_full_align,sanger,singularity,cleanup - uses: actions/upload-artifact@v3 with: From 9a0003ae4fb4cabdf0d7ba14de89e89cd0edb4ba Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Fri, 1 Dec 2023 10:59:32 +0000 Subject: [PATCH 28/33] Update module 'samtools/fasta --- modules.json | 2 +- modules/nf-core/samtools/fasta/main.nf | 2 +- modules/nf-core/samtools/fasta/meta.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules.json b/modules.json index e7bcfa1..f5561b1 100644 --- a/modules.json +++ b/modules.json @@ -52,7 +52,7 @@ }, "samtools/fasta": { "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "git_sha": "6f4299292ef2c5b66e6829527b2647c301b77cc9", "installed_by": ["modules"] }, "samtools/fastq": { diff --git a/modules/nf-core/samtools/fasta/main.nf b/modules/nf-core/samtools/fasta/main.nf index 925ed62..dc4ad98 100644 --- a/modules/nf-core/samtools/fasta/main.nf +++ b/modules/nf-core/samtools/fasta/main.nf @@ -38,7 +38,7 @@ process SAMTOOLS_FASTA { cat <<-END_VERSIONS > versions.yml "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )) + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') END_VERSIONS """ } diff --git a/modules/nf-core/samtools/fasta/meta.yml b/modules/nf-core/samtools/fasta/meta.yml index 1d07ea1..eae26f0 100644 --- a/modules/nf-core/samtools/fasta/meta.yml +++ 
b/modules/nf-core/samtools/fasta/meta.yml @@ -12,7 +12,7 @@ tools: documentation: "https://www.htslib.org/doc/samtools-fasta.html" tool_dev_url: "https://github.com/samtools/samtools" doi: "10.1093/bioinformatics/btp352" - licence: "['MIT']" + licence: ["MIT"] input: # Only when we have meta - meta: From 4db4cbc9c904c19d0a798c740ce60dc007cf3721 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Thu, 7 Dec 2023 10:50:17 +0000 Subject: [PATCH 29/33] remove the combined in the output file name if the same sample input file being combined. --- subworkflows/local/align_pacbio.nf | 10 ---------- subworkflows/local/input_merge.nf | 2 +- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf index 4c98f2c..75f4ac2 100644 --- a/subworkflows/local/align_pacbio.nf +++ b/subworkflows/local/align_pacbio.nf @@ -34,16 +34,6 @@ workflow ALIGN_PACBIO { MINIMAP2_ALIGN.out.bam | map { meta, bam -> [['id': meta.sample, 'datatype': meta.datatype, 'sample': meta.sample ], bam] } | groupTuple ( by: [0] ) - | map { meta, bam_list -> - [ - [ - 'id': ( bam_list.size() == 1 ) ? meta.sample : meta.sample + '_combined', - 'sample' : meta.sample, - 'datatype': meta.datatype - ], - bam_list - ] - } | set { ch_bams } diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf index 4df8957..12f2653 100644 --- a/subworkflows/local/input_merge.nf +++ b/subworkflows/local/input_merge.nf @@ -37,7 +37,7 @@ workflow INPUT_MERGE { .map { sample, meta_list -> [sample, meta_list[0]] } .join( grouped_reads ) .map { sample, meta, bam_cram_list -> [ - [ id: ( bam_cram_list.size() == 1 ) ? 
sample : sample + '_combined', + [ id: sample, datatype: meta.datatype ], bam_cram_list From 625f649545cdafdc2bc91dedf8a7edac8639dc4b Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Thu, 7 Dec 2023 13:47:40 +0000 Subject: [PATCH 30/33] update file name for VCf output files --- subworkflows/local/deepvariant_caller.nf | 26 +++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/subworkflows/local/deepvariant_caller.nf b/subworkflows/local/deepvariant_caller.nf index 31b42e0..7e4adf0 100644 --- a/subworkflows/local/deepvariant_caller.nf +++ b/subworkflows/local/deepvariant_caller.nf @@ -14,7 +14,11 @@ workflow DEEPVARIANT_CALLER { ch_versions = Channel.empty() reads_fasta.map { meta, cram, crai, interval, fasta_file_name, fasta, fai -> - [ [ id: meta.id + "_" + fasta_file_name, sample: meta.id, type: meta.datatype ], + [ [ id: meta.id + "_" + fasta_file_name, + sample: meta.id, + type: meta.datatype, + fasta_file_name: fasta_file_name + ], cram, crai, interval @@ -44,9 +48,15 @@ workflow DEEPVARIANT_CALLER { // group the vcf files together by sample DEEPVARIANT.out.vcf - .map { meta, vcf -> [ meta.sample, vcf ] } + .map { meta, vcf -> [ + [ id: meta.fasta_file_name.tokenize(".")[0..-2].join(".") + + "." + meta.type + + "." + meta.sample + ], + vcf + ] } .groupTuple() - .map { sample, vcf -> [ [id: sample], vcf, [] ] } + .map { meta, vcf -> [ meta, vcf, [] ] } .set { vcf } // catcat vcf files @@ -55,9 +65,15 @@ workflow DEEPVARIANT_CALLER { // group the g vcf files together by sample DEEPVARIANT.out.gvcf - .map { meta, gvcf -> [ meta.sample, gvcf ] } + .map { meta, gvcf -> [ + [ id: meta.fasta_file_name.tokenize(".")[0..-2].join(".") + + "." + meta.type + + "." 
+ meta.sample + ], + gvcf + ] } .groupTuple() - .map { sample, gvcf -> [ [ id: sample ], gvcf, [] ] } + .map { meta, gvcf -> [ meta, gvcf, [] ] } .set { g_vcf } // catcat g vcf files From d348591fec9c86414619a98d1836d30a4358528e Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Thu, 7 Dec 2023 15:33:59 +0000 Subject: [PATCH 31/33] make sure fasta or fa being removed from the fasta file name --- conf/modules.config | 2 +- workflows/variantcalling.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 22d84ce..acc63f5 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -52,7 +52,7 @@ process { } withName: '.*:CONVERT_STATS:SAMTOOLS_VIEW' { - ext.prefix = { "${fasta.baseName}.${meta.datatype}.${meta.id}" } + ext.prefix = { "${meta2.id}.${meta.datatype}.${meta.id}" } ext.args = '--output-fmt cram --write-index' } diff --git a/workflows/variantcalling.nf b/workflows/variantcalling.nf index ef6cde9..f3e7450 100644 --- a/workflows/variantcalling.nf +++ b/workflows/variantcalling.nf @@ -85,7 +85,7 @@ workflow VARIANTCALLING { ch_versions = Channel.empty() ch_fasta - .map { fasta -> [ [ 'id': fasta.baseName ], fasta ] } + .map { fasta -> [ [ 'id': fasta.baseName - '.fasta' - '.fa' ], fasta ] } .first() .set { ch_genome } From 7d20b1d2db04c13576dcd10ff5e37170f49d832f Mon Sep 17 00:00:00 2001 From: Guoying Qi <729395+gq1@users.noreply.github.com> Date: Thu, 7 Dec 2023 15:42:08 +0000 Subject: [PATCH 32/33] Update nextflow_schema.json wrong default value Co-authored-by: Matthieu Muffato --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 958cdb9..c857174 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -30,7 +30,7 @@ }, "vector_db": { "type": "string", - "default": "/Users/gq2/git3/variantcalling/assets/vectorDB.tar.gz", + "default": "${projectDir}/assets/vectorDB.tar.gz", "description": "Path to 
directory or tar.gz archive for pre-built PacBio vector database.", "format": "file-path" }, From 45a7ba4d3b5a8856003cbf1cbb7a79725be41f17 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Thu, 7 Dec 2023 15:54:06 +0000 Subject: [PATCH 33/33] change maxRetries to 5. --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index c4ea647..e08d741 100644 --- a/conf/base.config +++ b/conf/base.config @@ -16,7 +16,7 @@ process { time = { check_max( 4.h * task.attempt, 'time' ) } errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } - maxRetries = 1 + maxRetries = 5 maxErrors = '-1' // Process-specific resource requirements