From 3f89203dcc7c1182b9f784c17e65cc1b6d9fcbfd Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Thu, 5 Oct 2023 14:15:19 +0100 Subject: [PATCH 01/27] Remove index file from the samplesheet and update checking script --- assets/samplesheet.csv | 9 +++++---- assets/samplesheet_test.csv | 9 +++++---- assets/samplesheet_test_full.csv | 4 ++-- assets/schema_input.json | 7 +------ bin/check_samplesheet.py | 29 +++++++---------------------- 5 files changed, 20 insertions(+), 38 deletions(-) diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 2ea95db..9de2e5b 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,4 +1,5 @@ -sample,datatype,datafile,indexfile -sample1,pacbio,/path/to/data/file/file1.bam,/path/to/index/file/file1.bam.bai -sample2,pacbio,/path/to/data/file/file2.cram,/path/to/index/file/file2.cram.crai -sample3,pacbio,/path/to/data/file/file3.bam,/path/to/index/file/file3.bam.csi +sample,datatype,datafile +sample1,pacbio,/path/to/data/file/file1.bam +sample2,pacbio,/path/to/data/file/file2.cram +sample3,pacbio,/path/to/data/file/file3-1.bam +sample3,pacbio,/path/to/data/file/file3-2.cram diff --git a/assets/samplesheet_test.csv b/assets/samplesheet_test.csv index cf5546a..6eb03e5 100644 --- a/assets/samplesheet_test.csv +++ b/assets/samplesheet_test.csv @@ -1,4 +1,5 @@ -sample,datatype,datafile,indexfile -icCanRufa1_crai,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram.crai -icCanRufa1_bai,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam.bai -icCanRufa1_csi,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam.csi +sample,datatype,datafile +icCanRufa1_cram,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram +icCanRufa1_bam,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam +icCanRufa1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram +icCanRufa1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam diff --git a/assets/samplesheet_test_full.csv b/assets/samplesheet_test_full.csv index 4495dff..1e40e2b 100644 --- a/assets/samplesheet_test_full.csv +++ b/assets/samplesheet_test_full.csv @@ -1,2 +1,2 @@ -sample,datatype,datafile,indexfile -icCanRufa1,pacbio,/lustre/scratch123/tol/resources/nextflow/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1.cram,/lustre/scratch123/tol/resources/nextflow/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1.cram.crai +sample,datatype,datafile +icCanRufa1,pacbio,/lustre/scratch123/tol/resources/nextflow/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1.cram diff --git a/assets/schema_input.json b/assets/schema_input.json index 43497e9..f264cf6 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -21,13 +21,8 @@ "type": "string", "pattern": "^\\S+\\.(bam|cram)$", "errorMessage": "Data file for reads cannot contain spaces and must have extension 'cram' or 'bam'" - }, - "indexfile": { - "type": "string", - "pattern": "^\\S+\\.(bai|csi|crai)$", - "errorMessage": "Data index file for reads cannot contain spaces and must have extension 'bai', 'csi' or 'crai'" } }, - "required": ["sample", "datatype", "datafile", "indexfile"] + "required": ["sample", "datatype", "datafile"] } } diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 6bbd806..52af146 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -35,7 +35,6 @@ def __init__( sample_col="sample", type_col="datatype", file_col="datafile", - index_col="indexfile", **kwargs, ): """ @@ -48,8 +47,6 @@ def __init__( the read data (default "datatype"). file_col (str): The name of the column that contains the file path for the read data (default "datafile"). - index_col (str): The name of the column that contains the index file - for the data (default "indexfile"). """ super().__init__(**kwargs) @@ -57,7 +54,6 @@ def __init__( self._sample_col = sample_col self._type_col = type_col self._file_col = file_col - self._index_col = index_col self._seen = set() self.modified = [] @@ -73,7 +69,6 @@ def validate_and_transform(self, row): self._validate_sample(row) self._validate_type(row) self._validate_data_file(row) - self._validate_index_file(row) self._seen.add((row[self._sample_col], row[self._file_col])) self.modified.append(row) @@ -98,17 +93,6 @@ def _validate_data_file(self, row): raise AssertionError("Data file is required.") self._validate_data_format(row[self._file_col]) - def _validate_index_file(self, row): - """Assert that the indexfile is non-empty and has the right format.""" - if len(row[self._index_col]) <= 0: - raise AssertionError("Data index file is required.") - if row[self._file_col].endswith("bam") and not ( - row[self._index_col].endswith("bai") or row[self._index_col].endswith("csi") - ): - raise AssertionError("bai or csi index file should be given for bam file.") - if row[self._file_col].endswith("cram") and not row[self._index_col].endswith("crai"): - raise AssertionError("crai index file shuld be given for cram file.") - def _validate_data_format(self, filename): """Assert that a given filename has one of the expected read data file extensions.""" if not any(filename.endswith(extension) for extension in self.DATA_VALID_FORMATS): @@ -162,7 +146,7 @@ def sniff_format(handle): peek = read_head(handle) handle.seek(0) sniffer = csv.Sniffer() - # same input file could retrun random true or false + # same input file could return random true or false # disable it now # the following validation should be enough # if not sniffer.has_header(peek): @@ -188,16 +172,17 @@ def check_samplesheet(file_in, file_out): This function checks that the samplesheet follows the following structure, see also the `variantcalling samplesheet`_:: - sample,datatype,datafile,indexfile - sample1,pacbio,/path/to/data/file/file1.bam,/path/to/index/file/file1.bam.bai - sample2,pacbio,/path/to/data/file/file2.cram,/path/to/index/file/file2.cram.crai - sample3,pacbio,/path/to/data/file/file3.bam,/path/to/index/file/file3.bam.csi + sample,datatype,datafile + sample1,pacbio,/path/to/data/file/file1.bam + sample2,pacbio,/path/to/data/file/file2.cram + sample3,pacbio,/path/to/data/file/file3-1.bam + sample3,pacbio,/path/to/data/file/file3-2.cram .. _variantcalling samplesheet: https://raw.githubusercontent.com/sanger-tol/variantcalling/main/assets/samplesheet.csv """ - required_columns = {"sample", "datatype", "datafile", "indexfile"} + required_columns = {"sample", "datatype", "datafile"} # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. with file_in.open(newline="") as in_handle: reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) From cef527b538f5c6e72d014cd7f9eae9c0eec584dc Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Thu, 5 Oct 2023 14:50:54 +0100 Subject: [PATCH 02/27] Remove indexfile from Input_check workflow --- subworkflows/local/input_check.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index d2f72e9..30a07d5 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -12,10 +12,10 @@ workflow INPUT_CHECK { SAMPLESHEET_CHECK ( samplesheet ) .csv .splitCsv ( header:true, sep:',' ) - .map { [[id: it.sample, type: it.datatype], file(it.datafile), file(it.indexfile)] } + .map { [[id: it.sample, type: it.datatype], file(it.datafile)] } .set { reads } emit: - reads // channel: [ val(meta), data, index ] + reads // channel: [ val(meta), data ] versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] } From 67826e3eeaaa6a7007b562103b72551f83332f9e Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Thu, 5 Oct 2023 17:42:56 +0100 Subject: [PATCH 03/27] add sample name to the meta data --- subworkflows/local/input_check.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 30a07d5..aa0bbc9 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -12,9 +12,9 @@ workflow INPUT_CHECK { SAMPLESHEET_CHECK ( samplesheet ) .csv .splitCsv ( header:true, sep:',' ) - .map { [[id: it.sample, type: it.datatype], file(it.datafile)] } + .map { [[id: it.sample, sample: it.sample.replaceAll(/_T\d+$/, ''), type: it.datatype], file(it.datafile)] } .set { reads } - + emit: reads // channel: [ val(meta), data ] versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] From 2d30d826a2a48bece2af926d00d93a672aa1e47d Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Thu, 5 Oct 2023 17:55:56 +0100 Subject: [PATCH 04/27] nf-core modules install samtools/merge --- modules.json | 5 ++ modules/nf-core/samtools/merge/main.nf | 56 +++++++++++++++++++ modules/nf-core/samtools/merge/meta.yml | 73 +++++++++++++++++++++++++ 3 files changed, 134 insertions(+) create mode 100644 modules/nf-core/samtools/merge/main.nf create mode 100644 modules/nf-core/samtools/merge/meta.yml diff --git a/modules.json b/modules.json index bc363d0..6f79373 100644 --- a/modules.json +++ b/modules.json @@ -30,6 +30,11 @@ "git_sha": "fd742419940e01ba1c5ecb172c3e32ec840662fe", "installed_by": ["modules"] }, + "samtools/merge": { + "branch": "master", + "git_sha": "0460d316170f75f323111b4a2c0a2989f0c32013", + "installed_by": ["modules"] + }, "samtools/view": { "branch": "master", "git_sha": "3ffae3598260a99e8db3207dead9f73f87f90d1f", diff --git a/modules/nf-core/samtools/merge/main.nf b/modules/nf-core/samtools/merge/main.nf new file mode 100644 index 0000000..b73b7cb --- /dev/null +++ b/modules/nf-core/samtools/merge/main.nf @@ -0,0 +1,56 @@ +process SAMTOOLS_MERGE { + tag "$meta.id" + label 'process_low' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input_files, stageAs: "?/*") + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + + output: + tuple val(meta), path("${prefix}.bam") , optional:true, emit: bam + tuple val(meta), path("${prefix}.cram"), optional:true, emit: cram + tuple val(meta), path("*.csi") , optional:true, emit: csi + path "versions.yml" , emit: versions + + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def file_type = input_files instanceof List ? input_files[0].getExtension() : input_files.getExtension() + def reference = fasta ? "--reference ${fasta}" : "" + """ + samtools \\ + merge \\ + --threads ${task.cpus-1} \\ + $args \\ + ${reference} \\ + ${prefix}.${file_type} \\ + $input_files + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.suffix ? "${meta.id}${task.ext.suffix}" : "${meta.id}" + def file_type = input_files instanceof List ? input_files[0].getExtension() : input_files.getExtension() + """ + touch ${prefix}.${file_type} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/merge/meta.yml b/modules/nf-core/samtools/merge/meta.yml new file mode 100644 index 0000000..3a815f7 --- /dev/null +++ b/modules/nf-core/samtools/merge/meta.yml @@ -0,0 +1,73 @@ +name: samtools_merge +description: Merge BAM or CRAM file +keywords: + - merge + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input_files: + type: file + description: BAM/CRAM file + pattern: "*.{bam,cram,sam}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: Reference file the CRAM was created with (optional) + pattern: "*.{fasta,fa}" + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fai: + type: file + description: Index of the reference file the CRAM was created with (optional) + pattern: "*.fai" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM file + pattern: "*.{bam}" + - cram: + type: file + description: CRAM file + pattern: "*.{cram}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - csi: + type: file + description: BAM index file (optional) + pattern: "*.csi" +authors: + - "@drpatelh" + - "@yuukiiwa " + - "@maxulysse" + - "@FriederikeHanssen" + - "@ramprasadn" From 953658f86209de21f23627f376f9d688ecc73acb Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Thu, 5 Oct 2023 21:12:54 +0100 Subject: [PATCH 05/27] Add input merge sub workflow --- subworkflows/local/input_merge.nf | 44 +++++++++++++++++++++++++++++++ workflows/variantcalling.nf | 10 +++++++ 2 files changed, 54 insertions(+) create mode 100644 subworkflows/local/input_merge.nf diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf new file mode 100644 index 0000000..121c246 --- /dev/null +++ b/subworkflows/local/input_merge.nf @@ -0,0 +1,44 @@ +// +// Merge READS(bam or cram files) together by sample name +// + +include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge' + +workflow INPUT_MERGE { + take: + fasta // file: /path/to/genome.fasta or /path/to/genome.fasta.gz + fai // file: /path/to/genome.*.fai + gzi // file: /path/to/genome.fasta.gz.gzi or null + reads // channel: [ val(meta), data ] + + main: + // group input reads file by sample name + reads + .map{ it -> [ it[0].sample, it[1] ] } + .groupTuple() + .set{ merged_reads } + + // group input meta data together by sample name as well + // use the first meta data for the combined reads + reads + .map{ it -> [ it[0].sample, it[0] ] } + .groupTuple() + .map { it -> [it[0], it[1][0]] } + .join( merged_reads ) + .map { it -> [ it[1] , it [2] ]} + .set { merged_reads_with_meta } + + // call samtool merge + SAMTOOLS_MERGE( merged_reads_with_meta, + [ [], fasta ], + [ [], fai ], + [ [], gzi ] + ) + + emit: + bam = SAMTOOLS_MERGE.out.bam + cram = SAMTOOLS_MERGE.out.cram + csi = SAMTOOLS_MERGE.out.csi + versions = SAMTOOLS_MERGE.out.versions // channel: [ versions.yml ] + +} diff --git a/workflows/variantcalling.nf b/workflows/variantcalling.nf index 6c6ce09..de68f0b 100644 --- a/workflows/variantcalling.nf +++ b/workflows/variantcalling.nf @@ -47,6 +47,7 @@ if (params.split_fasta_cutoff ) { split_fasta_cutoff = params.split_fasta_cutoff // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { INPUT_MERGE } from '../subworkflows/local/input_merge' include { INPUT_FILTER_SPLIT } from '../subworkflows/local/input_filter_split' include { DEEPVARIANT_CALLER } from '../subworkflows/local/deepvariant_caller' @@ -81,6 +82,15 @@ workflow VARIANTCALLING { ) ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) + INPUT_MERGE ( + fasta_file, + fai_file, + gzi_file, + INPUT_CHECK.out.reads + ) + ch_versions = ch_versions.mix(INPUT_MERGE.out.versions) + + // // SUBWORKFLOW: split the input fasta file and filter input reads // From 0d0d06e80093fb9a81cab0ea60032c9d1434bb70 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Thu, 5 Oct 2023 21:41:47 +0100 Subject: [PATCH 06/27] comments --- workflows/variantcalling.nf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/workflows/variantcalling.nf b/workflows/variantcalling.nf index de68f0b..5b2c581 100644 --- a/workflows/variantcalling.nf +++ b/workflows/variantcalling.nf @@ -82,6 +82,9 @@ workflow VARIANTCALLING { ) ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) + // + // SUBWORKFLOW: merge the input reads by sample name + // INPUT_MERGE ( fasta_file, fai_file, From 224c7f23a0954875fdd1e10179e4d2c33dfa5b4f Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Thu, 5 Oct 2023 21:45:11 +0100 Subject: [PATCH 07/27] patch samtools_merge module to allow using fasta.gz file with gzi index file. --- modules.json | 3 ++- modules/nf-core/samtools/merge/main.nf | 1 + modules/nf-core/samtools/merge/meta.yml | 4 +++ .../samtools/merge/samtools-merge.diff | 27 +++++++++++++++++++ 4 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 modules/nf-core/samtools/merge/samtools-merge.diff diff --git a/modules.json b/modules.json index 6f79373..133358e 100644 --- a/modules.json +++ b/modules.json @@ -33,7 +33,8 @@ "samtools/merge": { "branch": "master", "git_sha": "0460d316170f75f323111b4a2c0a2989f0c32013", - "installed_by": ["modules"] + "installed_by": ["modules"], + "patch": "modules/nf-core/samtools/merge/samtools-merge.diff" }, "samtools/view": { "branch": "master", diff --git a/modules/nf-core/samtools/merge/main.nf b/modules/nf-core/samtools/merge/main.nf index b73b7cb..3c7faf4 100644 --- a/modules/nf-core/samtools/merge/main.nf +++ b/modules/nf-core/samtools/merge/main.nf @@ -11,6 +11,7 @@ process SAMTOOLS_MERGE { tuple val(meta), path(input_files, stageAs: "?/*") tuple val(meta2), path(fasta) tuple val(meta3), path(fai) + tuple val(meta4), path(gzi) output: tuple val(meta), path("${prefix}.bam") , optional:true, emit: bam diff --git a/modules/nf-core/samtools/merge/meta.yml b/modules/nf-core/samtools/merge/meta.yml index 3a815f7..4a6bc23 100644 --- a/modules/nf-core/samtools/merge/meta.yml +++ b/modules/nf-core/samtools/merge/meta.yml @@ -43,6 +43,10 @@ input: type: file description: Index of the reference file the CRAM was created with (optional) pattern: "*.fai" + - gzi: + type: file + description: Index of the reference file the CRAM was created with (optional) + pattern: "*.gzi" output: - meta: type: map diff --git a/modules/nf-core/samtools/merge/samtools-merge.diff b/modules/nf-core/samtools/merge/samtools-merge.diff new file mode 100644 index 0000000..4c48cc0 --- /dev/null +++ b/modules/nf-core/samtools/merge/samtools-merge.diff @@ -0,0 +1,27 @@ +Changes in module 'nf-core/samtools/merge' +--- modules/nf-core/samtools/merge/meta.yml ++++ modules/nf-core/samtools/merge/meta.yml +@@ -43,6 +43,10 @@ + type: file + description: Index of the reference file the CRAM was created with (optional) + pattern: "*.fai" ++ - gzi: ++ type: file ++ description: Index of the reference file the CRAM was created with (optional) ++ pattern: "*.gzi" + output: + - meta: + type: map + +--- modules/nf-core/samtools/merge/main.nf ++++ modules/nf-core/samtools/merge/main.nf +@@ -11,6 +11,7 @@ + tuple val(meta), path(input_files, stageAs: "?/*") + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) ++ tuple val(meta4), path(gzi) + + output: + tuple val(meta), path("${prefix}.bam") , optional:true, emit: bam + +************************************************************ From 36f4c677e30bc4b630144d678bfc945fe3474f16 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Fri, 6 Oct 2023 09:26:55 +0100 Subject: [PATCH 08/27] use original sample name for id if just 1, otherwise add _combined. --- subworkflows/local/input_merge.nf | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf index 121c246..d2eafb5 100644 --- a/subworkflows/local/input_merge.nf +++ b/subworkflows/local/input_merge.nf @@ -25,7 +25,12 @@ workflow INPUT_MERGE { .groupTuple() .map { it -> [it[0], it[1][0]] } .join( merged_reads ) - .map { it -> [ it[1] , it [2] ]} + .map { it -> [ + [ id: ( it[2].size() == 1 ) ? it[1].sample : it[1].sample + '_combined', + type: it[1].type + ], + it[2] + ]} .set { merged_reads_with_meta } // call samtool merge @@ -38,7 +43,7 @@ workflow INPUT_MERGE { emit: bam = SAMTOOLS_MERGE.out.bam cram = SAMTOOLS_MERGE.out.cram - csi = SAMTOOLS_MERGE.out.csi + csi = SAMTOOLS_MERGE.out.csi versions = SAMTOOLS_MERGE.out.versions // channel: [ versions.yml ] } From a808cf24897fc3e8c3a87b35a55dcd99cd07faa4 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Fri, 6 Oct 2023 10:16:31 +0100 Subject: [PATCH 09/27] Path samtools_merge module again add indexing, emit crai index file as well --- modules/nf-core/samtools/merge/main.nf | 2 ++ modules/nf-core/samtools/merge/meta.yml | 4 +++ .../samtools/merge/samtools-merge.diff | 27 ++++++++++++++++++- 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/modules/nf-core/samtools/merge/main.nf b/modules/nf-core/samtools/merge/main.nf index 3c7faf4..90ddfbe 100644 --- a/modules/nf-core/samtools/merge/main.nf +++ b/modules/nf-core/samtools/merge/main.nf @@ -17,6 +17,7 @@ process SAMTOOLS_MERGE { tuple val(meta), path("${prefix}.bam") , optional:true, emit: bam tuple val(meta), path("${prefix}.cram"), optional:true, emit: cram tuple val(meta), path("*.csi") , optional:true, emit: csi + tuple val(meta), path("*.crai") , optional:true, emit: crai path "versions.yml" , emit: versions @@ -32,6 +33,7 @@ process SAMTOOLS_MERGE { samtools \\ merge \\ --threads ${task.cpus-1} \\ + --write-index \\ $args \\ ${reference} \\ ${prefix}.${file_type} \\ diff --git a/modules/nf-core/samtools/merge/meta.yml b/modules/nf-core/samtools/merge/meta.yml index 4a6bc23..056a95f 100644 --- a/modules/nf-core/samtools/merge/meta.yml +++ b/modules/nf-core/samtools/merge/meta.yml @@ -69,6 +69,10 @@ output: type: file description: BAM index file (optional) pattern: "*.csi" + - crai: + type: file + description: CRAM index file (optional) + pattern: "*.crai" authors: - "@drpatelh" - "@yuukiiwa " diff --git a/modules/nf-core/samtools/merge/samtools-merge.diff b/modules/nf-core/samtools/merge/samtools-merge.diff index 4c48cc0..a740ce0 100644 --- a/modules/nf-core/samtools/merge/samtools-merge.diff +++ b/modules/nf-core/samtools/merge/samtools-merge.diff @@ -12,10 +12,21 @@ Changes in module 'nf-core/samtools/merge' output: - meta: type: map +@@ -65,6 +69,10 @@ + type: file + description: BAM index file (optional) + pattern: "*.csi" ++ - crai: ++ type: file ++ description: CRAM index file (optional) ++ pattern: "*.crai" + authors: + - "@drpatelh" + - "@yuukiiwa " --- modules/nf-core/samtools/merge/main.nf +++ modules/nf-core/samtools/merge/main.nf -@@ -11,6 +11,7 @@ +@@ -11,11 +11,13 @@ tuple val(meta), path(input_files, stageAs: "?/*") tuple val(meta2), path(fasta) tuple val(meta3), path(fai) @@ -23,5 +34,19 @@ Changes in module 'nf-core/samtools/merge' output: tuple val(meta), path("${prefix}.bam") , optional:true, emit: bam + tuple val(meta), path("${prefix}.cram"), optional:true, emit: cram + tuple val(meta), path("*.csi") , optional:true, emit: csi ++ tuple val(meta), path("*.crai") , optional:true, emit: crai + path "versions.yml" , emit: versions + + +@@ -31,6 +33,7 @@ + samtools \\ + merge \\ + --threads ${task.cpus-1} \\ ++ --write-index \\ + $args \\ + ${reference} \\ + ${prefix}.${file_type} \\ ************************************************************ From c646b8b1b4b4e5b979347f9f625f89c7e6ca6e01 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Fri, 6 Oct 2023 10:27:29 +0100 Subject: [PATCH 10/27] emit crai files as well --- subworkflows/local/input_merge.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf index d2eafb5..a0e7220 100644 --- a/subworkflows/local/input_merge.nf +++ b/subworkflows/local/input_merge.nf @@ -44,6 +44,7 @@ workflow INPUT_MERGE { bam = SAMTOOLS_MERGE.out.bam cram = SAMTOOLS_MERGE.out.cram csi = SAMTOOLS_MERGE.out.csi + crai = SAMTOOLS_MERGE.out.crai versions = SAMTOOLS_MERGE.out.versions // channel: [ versions.yml ] } From c4a762fed89d13c9e39ee99c017d30e4a1239ce0 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Fri, 6 Oct 2023 10:43:26 +0100 Subject: [PATCH 11/27] nf-core modules install samtools/sort --- modules.json | 5 +++ modules/nf-core/samtools/sort/main.nf | 49 ++++++++++++++++++++++++++ modules/nf-core/samtools/sort/meta.yml | 48 +++++++++++++++++++++++++ 3 files changed, 102 insertions(+) create mode 100644 modules/nf-core/samtools/sort/main.nf create mode 100644 modules/nf-core/samtools/sort/meta.yml diff --git a/modules.json b/modules.json index 133358e..183f5b3 100644 --- a/modules.json +++ b/modules.json @@ -36,6 +36,11 @@ "installed_by": ["modules"], "patch": "modules/nf-core/samtools/merge/samtools-merge.diff" }, + "samtools/sort": { + "branch": "master", + "git_sha": "a0f7be95788366c1923171e358da7d049eb440f9", + "installed_by": ["modules"] + }, "samtools/view": { "branch": "master", "git_sha": "3ffae3598260a99e8db3207dead9f73f87f90d1f", diff --git a/modules/nf-core/samtools/sort/main.nf b/modules/nf-core/samtools/sort/main.nf new file mode 100644 index 0000000..2b7753f --- /dev/null +++ b/modules/nf-core/samtools/sort/main.nf @@ -0,0 +1,49 @@ +process SAMTOOLS_SORT { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path("*.bam"), emit: bam + tuple val(meta), path("*.csi"), emit: csi, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + samtools sort \\ + $args \\ + -@ $task.cpus \\ + -o ${prefix}.bam \\ + -T $prefix \\ + $bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/sort/meta.yml b/modules/nf-core/samtools/sort/meta.yml new file mode 100644 index 0000000..0732843 --- /dev/null +++ b/modules/nf-core/samtools/sort/meta.yml @@ -0,0 +1,48 @@ +name: samtools_sort +description: Sort SAM/BAM/CRAM file +keywords: + - sort + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - csi: + type: file + description: BAM index file (optional) + pattern: "*.csi" +authors: + - "@drpatelh" + - "@ewels" From 9fa3696989b7d4398654c507e0018d983fb37067 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Fri, 6 Oct 2023 11:37:33 +0100 Subject: [PATCH 12/27] Add an option to sort input if not sorted. --- nextflow.config | 1 + nextflow_schema.json | 4 ++++ subworkflows/local/input_merge.nf | 18 ++++++++++++++++-- workflows/variantcalling.nf | 6 ++++-- 4 files changed, 25 insertions(+), 4 deletions(-) diff --git a/nextflow.config b/nextflow.config index 83e274f..4659669 100644 --- a/nextflow.config +++ b/nextflow.config @@ -11,6 +11,7 @@ params { // Input options input = null + sort_input = false fasta = null fai = null gzi = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 3a5272c..5e04d99 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -28,6 +28,10 @@ "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", "fa_icon": "fas fa-folder-open" }, + "sort_input": { + "type": "boolean", + "description": "Boolean whether to sort input reads files" + }, "email": { "type": "string", "description": "Email address for completion summary.", diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf index a0e7220..53b5e0f 100644 --- a/subworkflows/local/input_merge.nf +++ b/subworkflows/local/input_merge.nf @@ -3,6 +3,7 @@ // include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge' +include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort' workflow INPUT_MERGE { take: @@ -10,10 +11,22 @@ workflow INPUT_MERGE { fai // file: /path/to/genome.*.fai gzi // file: /path/to/genome.fasta.gz.gzi or null reads // channel: [ val(meta), data ] + sort_input // bollean: true or false main: + ch_versions = Channel.empty() + + // sort input reads if asked + if ( sort_input ) { + SAMTOOLS_SORT( reads ) + ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions ) + sorted_reads = SAMTOOLS_SORT.out.bam + } else { + sorted_reads = reads + } + // group input reads file by sample name - reads + sorted_reads .map{ it -> [ it[0].sample, it[1] ] } .groupTuple() .set{ merged_reads } @@ -39,12 +52,13 @@ workflow INPUT_MERGE { [ [], fai ], [ [], gzi ] ) + ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions ) emit: bam = SAMTOOLS_MERGE.out.bam cram = SAMTOOLS_MERGE.out.cram csi = SAMTOOLS_MERGE.out.csi crai = SAMTOOLS_MERGE.out.crai - versions = SAMTOOLS_MERGE.out.versions // channel: [ versions.yml ] + versions = ch_versions // channel: [ versions.yml ] } diff --git a/workflows/variantcalling.nf b/workflows/variantcalling.nf index 5b2c581..2c1a78f 100644 --- a/workflows/variantcalling.nf +++ b/workflows/variantcalling.nf @@ -28,7 +28,8 @@ if (params.gzi) { } // Check optional parameters -if (params.interval) { interval_file = file(params.interval) } else { interval_file = null } +if (params.sort_input) { sort_input = params.sort_input } else { sort_input = false } +if (params.interval) { interval_file = file(params.interval) } else { interval_file = null } if (params.split_fasta_cutoff ) { split_fasta_cutoff = params.split_fasta_cutoff } else { split_fasta_cutoff = 100000 } /* @@ -89,7 +90,8 @@ workflow VARIANTCALLING { fasta_file, fai_file, gzi_file, - INPUT_CHECK.out.reads + INPUT_CHECK.out.reads, + sort_input ) ch_versions = ch_versions.mix(INPUT_MERGE.out.versions) From 5cbea5bd50e0816ab7946c29d006d0975b6532c9 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Fri, 6 Oct 2023 12:03:27 +0100 Subject: [PATCH 13/27] combine merged bam/cram together, add with their index files as well. --- subworkflows/local/input_merge.nf | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf index 53b5e0f..da9047e 100644 --- a/subworkflows/local/input_merge.nf +++ b/subworkflows/local/input_merge.nf @@ -54,11 +54,16 @@ workflow INPUT_MERGE { ) ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions ) + SAMTOOLS_MERGE.out.bam + .join(SAMTOOLS_MERGE.out.csi) + .concat( + SAMTOOLS_MERGE.out.cram + .join(SAMTOOLS_MERGE.out.crai) + ) + .set{ indexed_merged_reads }; + emit: - bam = SAMTOOLS_MERGE.out.bam - cram = SAMTOOLS_MERGE.out.cram - csi = SAMTOOLS_MERGE.out.csi - crai = SAMTOOLS_MERGE.out.crai + indexed_merged_reads = indexed_merged_reads versions = ch_versions // channel: [ versions.yml ] } From 3ac73e9482d849661920ccc216f0d2cb5ff3dcca Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Fri, 6 Oct 2023 12:20:16 +0100 Subject: [PATCH 14/27] add filtered to distinguish the samtools input and output name --- conf/modules.config | 1 + 1 file changed, 1 insertion(+) diff --git a/conf/modules.config b/conf/modules.config index 7dc8677..6672bfc 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -22,6 +22,7 @@ process { withName: '.*:INPUT_FILTER_SPLIT:SAMTOOLS_VIEW' { ext.args = '--output-fmt cram --write-index -F 0x900' + ext.prefix = { "${meta.id}_filtered" } } withName: '.*:DEEPVARIANT_CALLER:DEEPVARIANT' { From bc6e5a0982503673e2898347b8d2253b21da6ea5 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Fri, 6 Oct 2023 12:22:42 +0100 Subject: [PATCH 15/27] use the merged read for the rest of pipeline --- workflows/variantcalling.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/variantcalling.nf b/workflows/variantcalling.nf index 2c1a78f..3b36a99 100644 --- a/workflows/variantcalling.nf +++ b/workflows/variantcalling.nf @@ -103,7 +103,7 @@ workflow VARIANTCALLING { fasta_file, fai_file, gzi_file, - INPUT_CHECK.out.reads, + INPUT_MERGE.out.indexed_merged_reads, interval_file, split_fasta_cutoff ) From a7131a8d087c2e69e46ef9c0164d70e19310e916 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Fri, 6 Oct 2023 19:00:58 +0100 Subject: [PATCH 16/27] covert all input files into channels, and make reference fasta index file optional --- conf/test.config | 4 +- nextflow_schema.json | 2 +- subworkflows/local/input_filter_split.nf | 10 ++-- subworkflows/local/input_merge.nf | 17 +++++-- workflows/variantcalling.nf | 60 +++++++++++++++--------- 5 files changed, 61 insertions(+), 32 deletions(-) diff --git a/conf/test.config b/conf/test.config index be32ebe..f89e185 100644 --- a/conf/test.config +++ b/conf/test.config @@ -26,8 +26,8 @@ params { fasta = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz' // Reference index file - fai = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.fai' - gzi = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.gzi' + // fai = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.fai' + // gzi = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.gzi' // Interval bed file interval = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bed' diff --git a/nextflow_schema.json b/nextflow_schema.json index 5e04d99..dc6428b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -70,7 +70,7 @@ "description": "The minimum fasta file size when splitting the input fasta file by sequence." } }, - "required": ["fasta", "fai"] + "required": ["fasta"] }, "institutional_config_options": { "title": "Institutional config options", diff --git a/subworkflows/local/input_filter_split.nf b/subworkflows/local/input_filter_split.nf index c820901..95555d7 100644 --- a/subworkflows/local/input_filter_split.nf +++ b/subworkflows/local/input_filter_split.nf @@ -19,8 +19,7 @@ workflow INPUT_FILTER_SPLIT { ch_versions = Channel.empty() // split the fasta file into files with one sequence each, group them by file size - Channel - .fromPath ( fasta ) + fasta .splitFasta ( file:true ) .branch { small: it.size() < split_fasta_cutoff @@ -62,13 +61,16 @@ workflow INPUT_FILTER_SPLIT { .set { fasta_fai } // filter reads - SAMTOOLS_VIEW ( reads, [ [], fasta ], [] ) + fasta + .map { fasta -> [ [ 'id': fasta.baseName ], fasta ] } + .set { ch_fasta } + SAMTOOLS_VIEW ( reads, ch_fasta, [] ) ch_versions = ch_versions.mix ( SAMTOOLS_VIEW.out.versions.first() ) // combine reads with splitted references SAMTOOLS_VIEW.out.cram .join ( SAMTOOLS_VIEW.out.crai ) - .map { filtered_reads -> filtered_reads + [interval ?: []] } + .combine(interval) .combine ( fasta_fai ) .set { cram_crai_fasta_fai } diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf index da9047e..05776ce 100644 --- a/subworkflows/local/input_merge.nf +++ b/subworkflows/local/input_merge.nf @@ -18,10 +18,12 @@ workflow INPUT_MERGE { // sort input reads if asked if ( sort_input ) { + SAMTOOLS_SORT( reads ) ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions ) sorted_reads = SAMTOOLS_SORT.out.bam } else { + sorted_reads = reads } @@ -47,10 +49,19 @@ workflow INPUT_MERGE { .set { merged_reads_with_meta } // call samtool merge + fasta + .map { fasta -> [ [ 'id': fasta.baseName ], fasta ] } + .set { ch_fasta } + fai + .map { fai -> [ [ 'id': fai.baseName ], fai ] } + .set { ch_fai } + gzi + .map { gzi -> [ [ 'id': gzi.baseName ], gzi ] } + .set { ch_gzi } SAMTOOLS_MERGE( merged_reads_with_meta, - [ [], fasta ], - [ [], fai ], - [ [], gzi ] + ch_fasta, + ch_fai, + ch_gzi ) ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions ) diff --git a/workflows/variantcalling.nf b/workflows/variantcalling.nf index 3b36a99..8a933b3 100644 --- a/workflows/variantcalling.nf +++ b/workflows/variantcalling.nf @@ -14,22 +14,15 @@ def checkPathParamList = [ params.input, params.fasta, params.fai, params.gzi, p for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } // Check mandatory parameters -if (params.input) { input_file = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } -if (params.fasta) { fasta_file = file(params.fasta) } else { exit 1, 'Reference fasta not specified!' } -if (params.fai) { fai_file = file(params.fai) } else { exit 1, 'Reference fasta index not specified!' } - -// Check gzi being given if compressed fasta is provided -if (params.gzi) { - gzi_file = file(params.gzi) -} else if ( params.fasta.endsWith('fasta.gz') ) { - exit 1, 'Reference fasta index gzi file not specified for fasta.gz file!' -} else { - gzi_file = null -} +if (params.input) { ch_input = Channel.fromPath(params.input) } else { exit 1, 'Input samplesheet not specified!' } +if (params.fasta) { ch_fasta = Channel.fromPath(params.fasta) } else { exit 1, 'Reference fasta not specified!' } // Check optional parameters +if (params.fai) { ch_fai = Channel.fromPath(params.fai) } else { ch_fai = Channel.empty() } +if (params.gzi) { ch_gzi = Channel.fromPath(params.gzi) } else { ch_gzi = Channel.empty() } +if (params.interval){ ch_interval = Channel.fromPath(params.interval) } else { ch_interval = Channel.empty() } + if (params.sort_input) { sort_input = params.sort_input } else { sort_input = false } -if (params.interval) { interval_file = file(params.interval) } else { interval_file = null } if (params.split_fasta_cutoff ) { split_fasta_cutoff = params.split_fasta_cutoff } else { split_fasta_cutoff = 100000 } /* @@ -62,6 +55,7 @@ include { DEEPVARIANT_CALLER } from '../subworkflows/local/deepvariant_caller' // MODULE: Installed directly from nf-core/modules // include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' +include { SAMTOOLS_FAIDX } from '../modules/nf-core/samtools/faidx/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -75,11 +69,33 @@ workflow VARIANTCALLING { ch_versions = Channel.empty() + // + // check reference fasta index given or not + // + if( params.fai == null || ( params.fasta.endsWith('fasta.gz') && params.gzi == null ) ){ + + ch_fasta + .map { fasta -> [ [ 'id': fasta.baseName ], fasta ] } + .set { ch_genome } + + SAMTOOLS_FAIDX ( ch_genome, [[], []]) + ch_versions = ch_versions.mix( SAMTOOLS_FAIDX.out.versions ) + + SAMTOOLS_FAIDX.out.fai + .map{ mata, fai -> fai } + .set{ ch_fai } + + SAMTOOLS_FAIDX.out.gzi + .map{ meta, gzi -> gzi } + .set{ ch_gzi } + + } + // // SUBWORKFLOW: Read in samplesheet, validate and stage input files // INPUT_CHECK ( - input_file + ch_input ) ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) @@ -87,9 +103,9 @@ workflow VARIANTCALLING { // SUBWORKFLOW: merge the input reads by sample name // INPUT_MERGE ( - fasta_file, - fai_file, - gzi_file, + ch_fasta, + ch_fai, + ch_gzi, INPUT_CHECK.out.reads, sort_input ) @@ -100,15 +116,15 @@ workflow VARIANTCALLING { // SUBWORKFLOW: split the input fasta file and filter input reads // INPUT_FILTER_SPLIT ( - fasta_file, - fai_file, - gzi_file, + ch_fasta, + ch_fai, + ch_gzi, INPUT_MERGE.out.indexed_merged_reads, - interval_file, + ch_interval, split_fasta_cutoff ) ch_versions = ch_versions.mix(INPUT_FILTER_SPLIT.out.versions) - + // // SUBWORKFLOW: call deepvariant // From 0265cbf8053998245ab5ae14e3caeb50c8f05e63 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Fri, 6 Oct 2023 21:52:31 +0100 Subject: [PATCH 17/27] use the first for the reference fasta channel --- subworkflows/local/input_filter_split.nf | 5 ++-- subworkflows/local/input_merge.nf | 35 ++++++++++-------------- 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/subworkflows/local/input_filter_split.nf b/subworkflows/local/input_filter_split.nf index 95555d7..0482cf2 100644 --- a/subworkflows/local/input_filter_split.nf +++ b/subworkflows/local/input_filter_split.nf @@ -61,9 +61,8 @@ workflow INPUT_FILTER_SPLIT { .set { fasta_fai } // filter reads - fasta - .map { fasta -> [ [ 'id': fasta.baseName ], fasta ] } - .set { ch_fasta } + ch_fasta = fasta.map { fasta -> [ [ 'id': fasta.baseName ], fasta ] }.first() + SAMTOOLS_VIEW ( reads, ch_fasta, [] ) ch_versions = ch_versions.mix ( SAMTOOLS_VIEW.out.versions.first() ) diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf index 05776ce..4e7efa5 100644 --- a/subworkflows/local/input_merge.nf +++ b/subworkflows/local/input_merge.nf @@ -18,46 +18,39 @@ workflow INPUT_MERGE { // sort input reads if asked if ( sort_input ) { - SAMTOOLS_SORT( reads ) ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions ) sorted_reads = SAMTOOLS_SORT.out.bam - } else { - + } else { sorted_reads = reads } - + // group input reads file by sample name sorted_reads - .map{ it -> [ it[0].sample, it[1] ] } + .map{ meta, bam_cram -> [ meta.sample, bam_cram ] } .groupTuple() .set{ merged_reads } - + // group input meta data together by sample name as well // use the first meta data for the combined reads reads - .map{ it -> [ it[0].sample, it[0] ] } + .map{ meta, bam_cram -> [ meta.sample, meta ] } .groupTuple() - .map { it -> [it[0], it[1][0]] } + .map { sample, meta_list -> [sample, meta_list[0]] } .join( merged_reads ) - .map { it -> [ - [ id: ( it[2].size() == 1 ) ? it[1].sample : it[1].sample + '_combined', - type: it[1].type + .map { sample, meta, bam_cram_list -> [ + [ id: ( bam_cram_list.size() == 1 ) ? sample : sample + '_combined', + type: meta.type ], - it[2] + bam_cram_list ]} .set { merged_reads_with_meta } // call samtool merge - fasta - .map { fasta -> [ [ 'id': fasta.baseName ], fasta ] } - .set { ch_fasta } - fai - .map { fai -> [ [ 'id': fai.baseName ], fai ] } - .set { ch_fai } - gzi - .map { gzi -> [ [ 'id': gzi.baseName ], gzi ] } - .set { ch_gzi } + ch_fasta = fasta.map { fasta -> [ [ 'id': fasta.baseName ], fasta ] }.first() + ch_fai = fai.map { fai -> [ [ 'id': fai.baseName ], fai ] }.first() + ch_gzi = gzi.map { gzi -> [ [ 'id': gzi.baseName ], gzi ] }.first() + SAMTOOLS_MERGE( merged_reads_with_meta, ch_fasta, ch_fai, From d4d38cc08e2100a3a48c0beca5932e834b76017a Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Fri, 6 Oct 2023 22:17:10 +0100 Subject: [PATCH 18/27] move write-index flag to the config file --- conf/modules.config | 4 ++++ modules/nf-core/samtools/merge/main.nf | 1 - modules/nf-core/samtools/merge/samtools-merge.diff | 8 -------- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 6672bfc..0deaa90 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -25,6 +25,10 @@ process { ext.prefix = { "${meta.id}_filtered" } } + withName: '.*:INPUT_MERGE:SAMTOOLS_MERGE' { + ext.args = '--write-index' + } + withName: '.*:DEEPVARIANT_CALLER:DEEPVARIANT' { ext.args = '--model_type=PACBIO' } diff --git a/modules/nf-core/samtools/merge/main.nf b/modules/nf-core/samtools/merge/main.nf index 90ddfbe..2dc4008 100644 --- a/modules/nf-core/samtools/merge/main.nf +++ b/modules/nf-core/samtools/merge/main.nf @@ -33,7 +33,6 @@ process SAMTOOLS_MERGE { samtools \\ merge \\ --threads ${task.cpus-1} \\ - --write-index \\ $args \\ ${reference} \\ ${prefix}.${file_type} \\ diff --git a/modules/nf-core/samtools/merge/samtools-merge.diff b/modules/nf-core/samtools/merge/samtools-merge.diff index a740ce0..afe2536 100644 --- a/modules/nf-core/samtools/merge/samtools-merge.diff +++ b/modules/nf-core/samtools/merge/samtools-merge.diff @@ -40,13 +40,5 @@ Changes in module 'nf-core/samtools/merge' path "versions.yml" , emit: versions -@@ -31,6 +33,7 @@ - samtools \\ - merge \\ - --threads ${task.cpus-1} \\ -+ --write-index \\ - $args \\ - ${reference} \\ - ${prefix}.${file_type} \\ ************************************************************ From ea046b9b3a94661860382e185d53cf8fc79b31a9 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Sat, 7 Oct 2023 11:12:47 +0100 Subject: [PATCH 19/27] make sure work file when no interval file given --- subworkflows/local/input_filter_split.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/input_filter_split.nf b/subworkflows/local/input_filter_split.nf index 0482cf2..2467b68 100644 --- a/subworkflows/local/input_filter_split.nf +++ b/subworkflows/local/input_filter_split.nf @@ -69,7 +69,7 @@ workflow INPUT_FILTER_SPLIT { // combine reads with splitted references SAMTOOLS_VIEW.out.cram .join ( SAMTOOLS_VIEW.out.crai ) - .combine(interval) + .combine(interval.ifEmpty([[]])) .combine ( fasta_fai ) .set { cram_crai_fasta_fai } From 7506e81addf69b1960e521438a1e1cdbe7c81e2f Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Sat, 7 Oct 2023 19:47:51 +0100 Subject: [PATCH 20/27] formating and documents --- README.md | 3 ++- docs/usage.md | 24 ++++++++++++------------ subworkflows/local/input_check.nf | 5 ++++- subworkflows/local/input_merge.nf | 5 +++++ workflows/variantcalling.nf | 2 +- 5 files changed, 24 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 62da4b2..ed207d1 100644 --- a/README.md +++ b/README.md @@ -18,11 +18,12 @@ On release, automated continuous integration tests run the pipeline on a full-si ## Pipeline summary -The pipleline takes aligned PacBio sample reads (CRAM/BAM files and their index files) from a CSV file and the reference file in FASTA format, and then uses DeepVariant tool to make variant calling. +The pipeline takes aligned PacBio sample reads (CRAM/BAM files) from a CSV file and the reference file in FASTA format, and then uses DeepVariant tool to make variant calling. Steps involved: - Split fasta file into smaller files, normally one sequence per file unless the sequences are too small. +- Merge input BAM/CRAM files together if they have the same sample names. - Filter out reads using the `-F 0x900` option to only retain the primary alignments. - Run DeepVariant using filtered BAM/CRAM files against each of split fasta files. - Merge all VCF and GVCF files generated by DeepVariant by sample together for each input BAM/CRAM file. diff --git a/docs/usage.md b/docs/usage.md index af4811c..71cc1c7 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -2,11 +2,11 @@ ## Introduction -The pipleline takes aligned sample reads (CRAM/BAM files and their index files) from a CSV file and a reference file in FASTA format, and then use DeepVariant to call variants. +The pipeline takes aligned sample reads (CRAM/BAM files) from a CSV file and a reference file in FASTA format, and then use DeepVariant to call variants. ## Samplesheet input -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use the `input` parameter to specify the samplesheet location. It has to be a comma-separated file with at least 4 columns, and a header row as shown in the examples below. +You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use the `input` parameter to specify the samplesheet location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. ```bash --input '[path to samplesheet file]' @@ -17,21 +17,22 @@ You will need to create a samplesheet with information about the samples you wou The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. Below is an example for the same sample sequenced across 3 lanes: ```console -sample,datatype,datafile,indexfile -sample1,pacbio,sample1_1.cram,sample1_1.cram.crai -sample1,pacbio,sample1_2.cram,sample1_3.cram.crai -sample1,pacbio,sample1_3.cram,sample1_3.cram.crai +sample,datatype,datafile +sample1,pacbio,sample1_1.cram +sample1,pacbio,sample1_2.cram +sample1,pacbio,sample1_3.cram ``` +If the given BAM/CRAM files are not sorted, you need to add `--sort_input` in the run command to sort them before merging the files together from the same samples. ### Full samplesheet A final samplesheet file consisting of both BAM or CRAM will look like this. Currently this pipeline only supports Pacbio aligned data. ```console -sample,datatype,datafile,indexfile -sample1,pacbio,/path/to/data/file/file1.bam,/path/to/index/file/file1.bam.bai -sample2,pacbio,/path/to/data/file/file2.cram,/path/to/index/file/file2.cram.crai -sample3,pacbio,/path/to/data/file/file3.bam,/path/to/index/file/file3.bam.csi +sample,datatype,datafile +sample1,pacbio,/path/to/data/file/file1.bam +sample2,pacbio,/path/to/data/file/file2.cram +sample3,pacbio,/path/to/data/file/file3.bam ``` | Column | Description | @@ -39,7 +40,6 @@ sample3,pacbio,/path/to/data/file/file3.bam,/path/to/index/file/file3.bam.csi | `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | | `datatype` | Sequencing data type. Must be `pacbio`. | | `datafile` | The location for either BAM or CRAM file. | -| `indexfile` | The location for BAM or CRAM index file – BAI, CSI or CRAI. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. @@ -62,7 +62,7 @@ work # Directory containing the nextflow working files # Other nextflow hidden files, eg. history of pipeline runs and old logs. ``` -The pipeline will split the intput fasta file into smaller files to run DeepVariant parallel. You can set the minimum split fasta file size from the command line. For example to set the minimum size as 10K using `--split_fasta_cutoff 10000`. +The pipeline will split the input fasta file into smaller files to run DeepVariant parallel. You can set the minimum split fasta file size from the command line. For example to set the minimum size as 10K using `--split_fasta_cutoff 10000`. ### Updating the pipeline diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index aa0bbc9..c3e0e49 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -12,7 +12,10 @@ workflow INPUT_CHECK { SAMPLESHEET_CHECK ( samplesheet ) .csv .splitCsv ( header:true, sep:',' ) - .map { [[id: it.sample, sample: it.sample.replaceAll(/_T\d+$/, ''), type: it.datatype], file(it.datafile)] } + .map { [ + [ id: it.sample, sample: it.sample.replaceAll(/_T\d+$/, ''), type: it.datatype ], + file(it.datafile) + ] } .set { reads } emit: diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf index 4e7efa5..58395d0 100644 --- a/subworkflows/local/input_merge.nf +++ b/subworkflows/local/input_merge.nf @@ -18,11 +18,15 @@ workflow INPUT_MERGE { // sort input reads if asked if ( sort_input ) { + SAMTOOLS_SORT( reads ) ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions ) sorted_reads = SAMTOOLS_SORT.out.bam + } else { + sorted_reads = reads + } // group input reads file by sample name @@ -58,6 +62,7 @@ workflow INPUT_MERGE { ) ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions ) + // concat merged bam or cram together along with their index file SAMTOOLS_MERGE.out.bam .join(SAMTOOLS_MERGE.out.csi) .concat( diff --git a/workflows/variantcalling.nf b/workflows/variantcalling.nf index 8a933b3..a4cf47c 100644 --- a/workflows/variantcalling.nf +++ b/workflows/variantcalling.nf @@ -78,7 +78,7 @@ workflow VARIANTCALLING { .map { fasta -> [ [ 'id': fasta.baseName ], fasta ] } .set { ch_genome } - SAMTOOLS_FAIDX ( ch_genome, [[], []]) + SAMTOOLS_FAIDX ( ch_genome, [[], []] ) ch_versions = ch_versions.mix( SAMTOOLS_FAIDX.out.versions ) SAMTOOLS_FAIDX.out.fai From 4dacfa801896b36c07377bd6623729008796ae94 Mon Sep 17 00:00:00 2001 From: nf-core-bot Date: Sat, 7 Oct 2023 18:51:06 +0000 Subject: [PATCH 21/27] [automated] Fix linting with Prettier --- docs/usage.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 71cc1c7..1764889 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -22,6 +22,7 @@ sample1,pacbio,sample1_1.cram sample1,pacbio,sample1_2.cram sample1,pacbio,sample1_3.cram ``` + If the given BAM/CRAM files are not sorted, you need to add `--sort_input` in the run command to sort them before merging the files together from the same samples. ### Full samplesheet @@ -35,11 +36,11 @@ sample2,pacbio,/path/to/data/file/file2.cram sample3,pacbio,/path/to/data/file/file3.bam ``` -| Column | Description | -| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `datatype` | Sequencing data type. Must be `pacbio`. | -| `datafile` | The location for either BAM or CRAM file. | +| Column | Description | +| ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | +| `datatype` | Sequencing data type. Must be `pacbio`. | +| `datafile` | The location for either BAM or CRAM file. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. From ebaa7e47361d9129a103d75e7c01709de92413c4 Mon Sep 17 00:00:00 2001 From: Guoying Qi <729395+gq1@users.noreply.github.com> Date: Fri, 13 Oct 2023 09:49:21 +0100 Subject: [PATCH 22/27] Update conf/test.config Co-authored-by: Matthieu Muffato --- conf/test.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test.config b/conf/test.config index f89e185..ad1e731 100644 --- a/conf/test.config +++ b/conf/test.config @@ -25,7 +25,7 @@ params { // Fasta references fasta = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz' - // Reference index file + // Reference index file (optional) // fai = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.fai' // gzi = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.gzi' From 1b2c45e77be1c11cfadfcb2ba608bed08d575651 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Fri, 27 Oct 2023 15:50:37 +0100 Subject: [PATCH 23/27] nf-core modules update samtools/merge with conda environment file and maintainers list --- modules.json | 2 +- modules/nf-core/samtools/merge/environment.yml | 6 ++++++ modules/nf-core/samtools/merge/main.nf | 2 +- modules/nf-core/samtools/merge/meta.yml | 6 ++++++ 4 files changed, 14 insertions(+), 2 deletions(-) create mode 100644 modules/nf-core/samtools/merge/environment.yml diff --git a/modules.json b/modules.json index 183f5b3..9781734 100644 --- a/modules.json +++ b/modules.json @@ -32,7 +32,7 @@ }, "samtools/merge": { "branch": "master", - "git_sha": "0460d316170f75f323111b4a2c0a2989f0c32013", + "git_sha": "516189e968feb4ebdd9921806988b4c12b4ac2dc", "installed_by": ["modules"], "patch": "modules/nf-core/samtools/merge/samtools-merge.diff" }, diff --git a/modules/nf-core/samtools/merge/environment.yml b/modules/nf-core/samtools/merge/environment.yml new file mode 100644 index 0000000..04c82f1 --- /dev/null +++ b/modules/nf-core/samtools/merge/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/merge/main.nf b/modules/nf-core/samtools/merge/main.nf index 2dc4008..0affdbf 100644 --- a/modules/nf-core/samtools/merge/main.nf +++ b/modules/nf-core/samtools/merge/main.nf @@ -2,7 +2,7 @@ process SAMTOOLS_MERGE { tag "$meta.id" label 'process_low' - conda "bioconda::samtools=1.17" + conda 'modules/nf-core/samtools/merge/environment.yml' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : 'biocontainers/samtools:1.17--h00cdaf9_0' }" diff --git a/modules/nf-core/samtools/merge/meta.yml b/modules/nf-core/samtools/merge/meta.yml index 056a95f..bf0da8b 100644 --- a/modules/nf-core/samtools/merge/meta.yml +++ b/modules/nf-core/samtools/merge/meta.yml @@ -79,3 +79,9 @@ authors: - "@maxulysse" - "@FriederikeHanssen" - "@ramprasadn" +maintainers: + - "@drpatelh" + - "@yuukiiwa " + - "@maxulysse" + - "@FriederikeHanssen" + - "@ramprasadn" From a76298fb06809c6202ebe45ddac6f2ca9b408186 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Wed, 1 Nov 2023 19:22:10 +0000 Subject: [PATCH 24/27] Update samtools/merge module and remove its patch. Combine gzi and fai parameters as one. --- conf/test.config | 2 +- modules.json | 2 +- modules/nf-core/samtools/merge/main.nf | 3 +- modules/nf-core/samtools/merge/meta.yml | 4 -- .../samtools/merge/samtools-merge.diff | 44 ------------------- nextflow.config | 1 - nextflow_schema.json | 8 +--- subworkflows/local/input_filter_split.nf | 2 - subworkflows/local/input_merge.nf | 7 +-- workflows/variantcalling.nf | 34 +++++++++----- 10 files changed, 31 insertions(+), 76 deletions(-) delete mode 100644 modules/nf-core/samtools/merge/samtools-merge.diff diff --git a/conf/test.config b/conf/test.config index ad1e731..49b740b 100644 --- a/conf/test.config +++ b/conf/test.config @@ -27,7 +27,7 @@ params { // Reference index file (optional) // fai = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.fai' - // gzi = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.gzi' + fai = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.gzi' // Interval bed file interval = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bed' diff --git a/modules.json b/modules.json index 9781734..956cd97 100644 --- a/modules.json +++ b/modules.json @@ -32,7 +32,7 @@ }, "samtools/merge": { "branch": "master", - "git_sha": "516189e968feb4ebdd9921806988b4c12b4ac2dc", + "git_sha": "e7ce60acc8a33fa17429e966364657a63016e870", "installed_by": ["modules"], "patch": "modules/nf-core/samtools/merge/samtools-merge.diff" }, diff --git a/modules/nf-core/samtools/merge/main.nf b/modules/nf-core/samtools/merge/main.nf index 0affdbf..21f785c 100644 --- a/modules/nf-core/samtools/merge/main.nf +++ b/modules/nf-core/samtools/merge/main.nf @@ -2,7 +2,7 @@ process SAMTOOLS_MERGE { tag "$meta.id" label 'process_low' - conda 'modules/nf-core/samtools/merge/environment.yml' + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : 'biocontainers/samtools:1.17--h00cdaf9_0' }" @@ -11,7 +11,6 @@ process SAMTOOLS_MERGE { tuple val(meta), path(input_files, stageAs: "?/*") tuple val(meta2), path(fasta) tuple val(meta3), path(fai) - tuple val(meta4), path(gzi) output: tuple val(meta), path("${prefix}.bam") , optional:true, emit: bam diff --git a/modules/nf-core/samtools/merge/meta.yml b/modules/nf-core/samtools/merge/meta.yml index bf0da8b..2e8f3db 100644 --- a/modules/nf-core/samtools/merge/meta.yml +++ b/modules/nf-core/samtools/merge/meta.yml @@ -43,10 +43,6 @@ input: type: file description: Index of the reference file the CRAM was created with (optional) pattern: "*.fai" - - gzi: - type: file - description: Index of the reference file the CRAM was created with (optional) - pattern: "*.gzi" output: - meta: type: map diff --git a/modules/nf-core/samtools/merge/samtools-merge.diff b/modules/nf-core/samtools/merge/samtools-merge.diff deleted file mode 100644 index afe2536..0000000 --- a/modules/nf-core/samtools/merge/samtools-merge.diff +++ /dev/null @@ -1,44 +0,0 @@ -Changes in module 'nf-core/samtools/merge' ---- modules/nf-core/samtools/merge/meta.yml -+++ modules/nf-core/samtools/merge/meta.yml -@@ -43,6 +43,10 @@ - type: file - description: Index of the reference file the CRAM was created with (optional) - pattern: "*.fai" -+ - gzi: -+ type: file -+ description: Index of the reference file the CRAM was created with (optional) -+ pattern: "*.gzi" - output: - - meta: - type: map -@@ -65,6 +69,10 @@ - type: file - description: BAM index file (optional) - pattern: "*.csi" -+ - crai: -+ type: file -+ description: CRAM index file (optional) -+ pattern: "*.crai" - authors: - - "@drpatelh" - - "@yuukiiwa " - ---- modules/nf-core/samtools/merge/main.nf -+++ modules/nf-core/samtools/merge/main.nf -@@ -11,11 +11,13 @@ - tuple val(meta), path(input_files, stageAs: "?/*") - tuple val(meta2), path(fasta) - tuple val(meta3), path(fai) -+ tuple val(meta4), path(gzi) - - output: - tuple val(meta), path("${prefix}.bam") , optional:true, emit: bam - tuple val(meta), path("${prefix}.cram"), optional:true, emit: cram - tuple val(meta), path("*.csi") , optional:true, emit: csi -+ tuple val(meta), path("*.crai") , optional:true, emit: crai - path "versions.yml" , emit: versions - - - -************************************************************ diff --git a/nextflow.config b/nextflow.config index 4659669..c1e8ac3 100644 --- a/nextflow.config +++ b/nextflow.config @@ -14,7 +14,6 @@ params { sort_input = false fasta = null fai = null - gzi = null interval = null split_fasta_cutoff = 100000 diff --git a/nextflow_schema.json b/nextflow_schema.json index dc6428b..66cfeda 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -49,15 +49,11 @@ "properties": { "fasta": { "type": "string", - "description": "Path to FASTA genome file, either fasta or fast.gz" + "description": "Path to FASTA genome file, either fasta or fast.gz." }, "fai": { "type": "string", - "description": "Path to the index file of the FASTA genome file." - }, - "gzi": { - "type": "string", - "description": "Path to the gzi index file of the FASTA genome file. Required if fasta in gz format." + "description": "Path to the index file of the FASTA genome file, either fai or gzi." }, "interval": { "type": "string", diff --git a/subworkflows/local/input_filter_split.nf b/subworkflows/local/input_filter_split.nf index 2467b68..dc0710f 100644 --- a/subworkflows/local/input_filter_split.nf +++ b/subworkflows/local/input_filter_split.nf @@ -9,8 +9,6 @@ include { CAT_CAT } from '../../modules/nf-core/cat/cat/main' workflow INPUT_FILTER_SPLIT { take: fasta // file: /path/to/genome.fasta or /path/to/genome.fasta.gz - fai // file: /path/to/genome.*.fai - gzi // file: /path/to/genome.fasta.gz.gzi or null reads // [ val(meta), data, index ] interval // file: /path/to/intervals.bed split_fasta_cutoff // val(min_file_size) diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf index 58395d0..1c8a588 100644 --- a/subworkflows/local/input_merge.nf +++ b/subworkflows/local/input_merge.nf @@ -8,8 +8,7 @@ include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort' workflow INPUT_MERGE { take: fasta // file: /path/to/genome.fasta or /path/to/genome.fasta.gz - fai // file: /path/to/genome.*.fai - gzi // file: /path/to/genome.fasta.gz.gzi or null + fai // file: /path/to/genome.*.fai or /path/to/genome.fasta.gz.gzi reads // channel: [ val(meta), data ] sort_input // bollean: true or false @@ -53,12 +52,10 @@ workflow INPUT_MERGE { // call samtool merge ch_fasta = fasta.map { fasta -> [ [ 'id': fasta.baseName ], fasta ] }.first() ch_fai = fai.map { fai -> [ [ 'id': fai.baseName ], fai ] }.first() - ch_gzi = gzi.map { gzi -> [ [ 'id': gzi.baseName ], gzi ] }.first() SAMTOOLS_MERGE( merged_reads_with_meta, ch_fasta, - ch_fai, - ch_gzi + ch_fai ) ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions ) diff --git a/workflows/variantcalling.nf b/workflows/variantcalling.nf index a4cf47c..68e2fef 100644 --- a/workflows/variantcalling.nf +++ b/workflows/variantcalling.nf @@ -10,7 +10,7 @@ def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) WorkflowVariantcalling.initialise(params, log) // Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.fasta, params.fai, params.gzi, params.interval ] +def checkPathParamList = [ params.input, params.fasta, params.fai, params.interval ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } // Check mandatory parameters @@ -18,8 +18,17 @@ if (params.input) { ch_input = Channel.fromPath(params.input) } else { exit 1, ' if (params.fasta) { ch_fasta = Channel.fromPath(params.fasta) } else { exit 1, 'Reference fasta not specified!' } // Check optional parameters -if (params.fai) { ch_fai = Channel.fromPath(params.fai) } else { ch_fai = Channel.empty() } -if (params.gzi) { ch_gzi = Channel.fromPath(params.gzi) } else { ch_gzi = Channel.empty() } +if (params.fai){ + if( ( params.fasta.endsWith('.gz') && params.fai.endsWith('.fai') ) + || + ( !params.fasta.endsWith('.gz') && params.fai.endsWith('.gzi') ) + ){ + exit 1, 'Reference fasta and its index file format not matched!' + } + ch_fai = Channel.fromPath(params.fai) +} else { + ch_fai = Channel.empty() +} if (params.interval){ ch_interval = Channel.fromPath(params.interval) } else { ch_interval = Channel.empty() } if (params.sort_input) { sort_input = params.sort_input } else { sort_input = false } @@ -72,7 +81,7 @@ workflow VARIANTCALLING { // // check reference fasta index given or not // - if( params.fai == null || ( params.fasta.endsWith('fasta.gz') && params.gzi == null ) ){ + if( params.fai == null ){ ch_fasta .map { fasta -> [ [ 'id': fasta.baseName ], fasta ] } @@ -80,15 +89,23 @@ workflow VARIANTCALLING { SAMTOOLS_FAIDX ( ch_genome, [[], []] ) ch_versions = ch_versions.mix( SAMTOOLS_FAIDX.out.versions ) - + SAMTOOLS_FAIDX.out.fai .map{ mata, fai -> fai } .set{ ch_fai } - + SAMTOOLS_FAIDX.out.gzi .map{ meta, gzi -> gzi } .set{ ch_gzi } + if( params.fasta.endsWith('.gz') ){ + ch_index = ch_gzi + }else{ + ch_index = ch_fai + } + + }else{ + ch_index = ch_fai } // @@ -104,8 +121,7 @@ workflow VARIANTCALLING { // INPUT_MERGE ( ch_fasta, - ch_fai, - ch_gzi, + ch_index, INPUT_CHECK.out.reads, sort_input ) @@ -117,8 +133,6 @@ workflow VARIANTCALLING { // INPUT_FILTER_SPLIT ( ch_fasta, - ch_fai, - ch_gzi, INPUT_MERGE.out.indexed_merged_reads, ch_interval, split_fasta_cutoff From 3cf953d19c3f7c9fc40e5f7878d379364286bd14 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Wed, 1 Nov 2023 20:42:24 +0000 Subject: [PATCH 25/27] remove sort_input params. Always sort the input before merging. --- conf/test.config | 2 +- docs/usage.md | 2 -- nextflow.config | 1 - nextflow_schema.json | 4 ---- subworkflows/local/input_merge.nf | 35 +++++++++++++------------------ workflows/variantcalling.nf | 2 -- 6 files changed, 16 insertions(+), 30 deletions(-) diff --git a/conf/test.config b/conf/test.config index 49b740b..01515e9 100644 --- a/conf/test.config +++ b/conf/test.config @@ -27,7 +27,7 @@ params { // Reference index file (optional) // fai = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.fai' - fai = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.gzi' + // fai = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.gzi' // Interval bed file interval = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bed' diff --git a/docs/usage.md b/docs/usage.md index 1764889..46b74b3 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -23,8 +23,6 @@ sample1,pacbio,sample1_2.cram sample1,pacbio,sample1_3.cram ``` -If the given BAM/CRAM files are not sorted, you need to add `--sort_input` in the run command to sort them before merging the files together from the same samples. - ### Full samplesheet A final samplesheet file consisting of both BAM or CRAM will look like this. Currently this pipeline only supports Pacbio aligned data. diff --git a/nextflow.config b/nextflow.config index c1e8ac3..399b382 100644 --- a/nextflow.config +++ b/nextflow.config @@ -11,7 +11,6 @@ params { // Input options input = null - sort_input = false fasta = null fai = null interval = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 66cfeda..d40c501 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -28,10 +28,6 @@ "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", "fa_icon": "fas fa-folder-open" }, - "sort_input": { - "type": "boolean", - "description": "Boolean whether to sort input reads files" - }, "email": { "type": "string", "description": "Email address for completion summary.", diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf index 1c8a588..90bb82f 100644 --- a/subworkflows/local/input_merge.nf +++ b/subworkflows/local/input_merge.nf @@ -10,50 +10,45 @@ workflow INPUT_MERGE { fasta // file: /path/to/genome.fasta or /path/to/genome.fasta.gz fai // file: /path/to/genome.*.fai or /path/to/genome.fasta.gz.gzi reads // channel: [ val(meta), data ] - sort_input // bollean: true or false main: ch_versions = Channel.empty() - - // sort input reads if asked - if ( sort_input ) { - - SAMTOOLS_SORT( reads ) - ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions ) - sorted_reads = SAMTOOLS_SORT.out.bam - } else { - - sorted_reads = reads + // group input meta data together by sample name + reads + .map{ meta, bam_cram -> [ meta.sample, meta ] } + .groupTuple() + .set{ grouped_reads_meta } - } + // sort input reads + SAMTOOLS_SORT( reads ) + ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions ) + sorted_reads = SAMTOOLS_SORT.out.bam // group input reads file by sample name sorted_reads .map{ meta, bam_cram -> [ meta.sample, bam_cram ] } .groupTuple() - .set{ merged_reads } + .set{ grouped_reads } - // group input meta data together by sample name as well + // join grouped reads and meta // use the first meta data for the combined reads - reads - .map{ meta, bam_cram -> [ meta.sample, meta ] } - .groupTuple() + grouped_reads_meta .map { sample, meta_list -> [sample, meta_list[0]] } - .join( merged_reads ) + .join( grouped_reads ) .map { sample, meta, bam_cram_list -> [ [ id: ( bam_cram_list.size() == 1 ) ? sample : sample + '_combined', type: meta.type ], bam_cram_list ]} - .set { merged_reads_with_meta } + .set { grouped_reads_with_meta } // call samtool merge ch_fasta = fasta.map { fasta -> [ [ 'id': fasta.baseName ], fasta ] }.first() ch_fai = fai.map { fai -> [ [ 'id': fai.baseName ], fai ] }.first() - SAMTOOLS_MERGE( merged_reads_with_meta, + SAMTOOLS_MERGE( grouped_reads_with_meta, ch_fasta, ch_fai ) diff --git a/workflows/variantcalling.nf b/workflows/variantcalling.nf index 68e2fef..99c35e5 100644 --- a/workflows/variantcalling.nf +++ b/workflows/variantcalling.nf @@ -31,7 +31,6 @@ if (params.fai){ } if (params.interval){ ch_interval = Channel.fromPath(params.interval) } else { ch_interval = Channel.empty() } -if (params.sort_input) { sort_input = params.sort_input } else { sort_input = false } if (params.split_fasta_cutoff ) { split_fasta_cutoff = params.split_fasta_cutoff } else { split_fasta_cutoff = 100000 } /* @@ -123,7 +122,6 @@ workflow VARIANTCALLING { ch_fasta, ch_index, INPUT_CHECK.out.reads, - sort_input ) ch_versions = ch_versions.mix(INPUT_MERGE.out.versions) From d6fa00f37eb2729b6c67b3d0a4d36f02e1536b35 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Thu, 2 Nov 2023 11:41:59 +0000 Subject: [PATCH 26/27] only validate the sample sheet not transform the sample names --- bin/check_samplesheet.py | 22 +++++++--------------- subworkflows/local/input_check.nf | 2 +- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 52af146..d088e65 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -"""Provide a command line tool to validate and transform tabular samplesheets.""" +"""Provide a command line tool to validate tabular samplesheets.""" import argparse @@ -55,9 +55,9 @@ def __init__( self._type_col = type_col self._file_col = file_col self._seen = set() - self.modified = [] + self.validated = [] - def validate_and_transform(self, row): + def validate(self, row): """ Perform all validations on the given row. @@ -70,7 +70,7 @@ def validate_and_transform(self, row): self._validate_type(row) self._validate_data_file(row) self._seen.add((row[self._sample_col], row[self._file_col])) - self.modified.append(row) + self.validated.append(row) def _validate_sample(self, row): """Assert that the sample name exists and convert spaces to underscores.""" @@ -105,17 +105,9 @@ def validate_unique_samples(self): """ Assert that the combination of sample name and data filename is unique. - In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the - number of times the same sample exist, but with different files, e.g., multiple runs per experiment. - """ - if len(self._seen) != len(self.modified): + if len(self._seen) != len(self.validated): raise AssertionError("The combination of sample name and data file must be unique.") - seen = Counter() - for row in self.modified: - sample = row[self._sample_col] - seen[sample] += 1 - row[self._sample_col] = f"{sample}_T{seen[sample]}" def read_head(handle, num_lines=10): @@ -195,7 +187,7 @@ def check_samplesheet(file_in, file_out): checker = RowChecker() for i, row in enumerate(reader): try: - checker.validate_and_transform(row) + checker.validate(row) except AssertionError as error: logger.critical(f"{str(error)} On line {i + 2}.") sys.exit(1) @@ -205,7 +197,7 @@ def check_samplesheet(file_in, file_out): with file_out.open(mode="w", newline="") as out_handle: writer = csv.DictWriter(out_handle, header, delimiter=",") writer.writeheader() - for row in checker.modified: + for row in checker.validated: writer.writerow(row) diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index c3e0e49..7e9f667 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -13,7 +13,7 @@ workflow INPUT_CHECK { .csv .splitCsv ( header:true, sep:',' ) .map { [ - [ id: it.sample, sample: it.sample.replaceAll(/_T\d+$/, ''), type: it.datatype ], + [ id: it.sample, sample: it.sample, type: it.datatype ], file(it.datafile) ] } .set { reads } From 3e8705cdee3e4adb71309ab4c40ba246c3cf9a73 Mon Sep 17 00:00:00 2001 From: Guoying Qi Date: Thu, 2 Nov 2023 11:42:32 +0000 Subject: [PATCH 27/27] update fai file for the full test and formating --- conf/test_full.config | 3 +-- workflows/variantcalling.nf | 5 +++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conf/test_full.config b/conf/test_full.config index 8532e0d..3a2d38e 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -23,6 +23,5 @@ params { fasta = '/lustre/scratch124/tol/projects/darwin/data/insects/Cantharis_rufa/assembly/release/icCanRufa1.1/insdc/GCA_947369205.1.fasta.gz' // Reference index file - fai = '/lustre/scratch124/tol/projects/darwin/data/insects/Cantharis_rufa/assembly/release/icCanRufa1.1/insdc/GCA_947369205.1.fasta.gz.fai' - gzi = '/lustre/scratch124/tol/projects/darwin/data/insects/Cantharis_rufa/assembly/release/icCanRufa1.1/insdc/GCA_947369205.1.fasta.gz.gzi' + fai = '/lustre/scratch124/tol/projects/darwin/data/insects/Cantharis_rufa/assembly/release/icCanRufa1.1/insdc/GCA_947369205.1.fasta.gz.gzi' } diff --git a/workflows/variantcalling.nf b/workflows/variantcalling.nf index 99c35e5..82267f7 100644 --- a/workflows/variantcalling.nf +++ b/workflows/variantcalling.nf @@ -25,10 +25,11 @@ if (params.fai){ ){ exit 1, 'Reference fasta and its index file format not matched!' } - ch_fai = Channel.fromPath(params.fai) + ch_fai = Channel.fromPath(params.fai) } else { - ch_fai = Channel.empty() + ch_fai = Channel.empty() } + if (params.interval){ ch_interval = Channel.fromPath(params.interval) } else { ch_interval = Channel.empty() } if (params.split_fasta_cutoff ) { split_fasta_cutoff = params.split_fasta_cutoff } else { split_fasta_cutoff = 100000 }