diff --git a/README.md b/README.md
index 62da4b2..ed207d1 100644
--- a/README.md
+++ b/README.md
@@ -18,11 +18,12 @@ On release, automated continuous integration tests run the pipeline on a full-si
 
 ## Pipeline summary
 
-The pipleline takes aligned PacBio sample reads (CRAM/BAM files and their index files) from a CSV file and the reference file in FASTA format, and then uses DeepVariant tool to make variant calling.
+The pipeline takes aligned PacBio sample reads (CRAM/BAM files) from a CSV file and the reference file in FASTA format, and then uses the DeepVariant tool to call variants.
 
 Steps involved:
 
 - Split fasta file into smaller files, normally one sequence per file unless the sequences are too small.
+- Merge input BAM/CRAM files together if they share the same sample name.
 - Filter out reads using the `-F 0x900` option to only retain the primary alignments.
 - Run DeepVariant using filtered BAM/CRAM files against each of split fasta files.
 - Merge all VCF and GVCF files generated by DeepVariant by sample together for each input BAM/CRAM file.
diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
index 2ea95db..9de2e5b 100644
--- a/assets/samplesheet.csv
+++ b/assets/samplesheet.csv
@@ -1,4 +1,5 @@
-sample,datatype,datafile,indexfile
-sample1,pacbio,/path/to/data/file/file1.bam,/path/to/index/file/file1.bam.bai
-sample2,pacbio,/path/to/data/file/file2.cram,/path/to/index/file/file2.cram.crai
-sample3,pacbio,/path/to/data/file/file3.bam,/path/to/index/file/file3.bam.csi
+sample,datatype,datafile
+sample1,pacbio,/path/to/data/file/file1.bam
+sample2,pacbio,/path/to/data/file/file2.cram
+sample3,pacbio,/path/to/data/file/file3-1.bam
+sample3,pacbio,/path/to/data/file/file3-2.cram
diff --git a/assets/samplesheet_test.csv b/assets/samplesheet_test.csv
index cf5546a..6eb03e5 100644
--- a/assets/samplesheet_test.csv
+++ b/assets/samplesheet_test.csv
@@ -1,4 +1,5 @@
-sample,datatype,datafile,indexfile
-icCanRufa1_crai,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram.crai
-icCanRufa1_bai,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam.bai
-icCanRufa1_csi,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam.csi
+sample,datatype,datafile
+icCanRufa1_cram,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram
+icCanRufa1_bam,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam
+icCanRufa1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram
+icCanRufa1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam
diff --git a/assets/samplesheet_test_full.csv b/assets/samplesheet_test_full.csv
index 4495dff..1e40e2b 100644
--- a/assets/samplesheet_test_full.csv
+++ b/assets/samplesheet_test_full.csv
@@ -1,2 +1,2 @@
-sample,datatype,datafile,indexfile
-icCanRufa1,pacbio,/lustre/scratch123/tol/resources/nextflow/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1.cram,/lustre/scratch123/tol/resources/nextflow/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1.cram.crai
+sample,datatype,datafile
+icCanRufa1,pacbio,/lustre/scratch123/tol/resources/nextflow/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1.cram
diff --git a/assets/schema_input.json b/assets/schema_input.json
index 43497e9..f264cf6 100644
--- a/assets/schema_input.json
+++ b/assets/schema_input.json
@@ -21,13 +21,8 @@
                 "type": "string",
                 "pattern": "^\\S+\\.(bam|cram)$",
                 "errorMessage": "Data file for reads cannot contain spaces and must have extension 'cram' or 'bam'"
-            },
-            "indexfile": {
-                "type": "string",
-                "pattern": "^\\S+\\.(bai|csi|crai)$",
-                "errorMessage": "Data index file for reads cannot contain spaces and must have extension 'bai', 'csi' or 'crai'"
             }
         },
-        "required": ["sample", "datatype", "datafile", "indexfile"]
+        "required": ["sample", "datatype", "datafile"]
     }
 }
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index 6bbd806..d088e65 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-"""Provide a command line tool to validate and transform tabular samplesheets."""
+"""Provide a command line tool to validate tabular samplesheets."""
 
 
 import argparse
@@ -35,7 +35,6 @@ def __init__(
         sample_col="sample",
         type_col="datatype",
         file_col="datafile",
-        index_col="indexfile",
         **kwargs,
     ):
         """
@@ -48,8 +47,6 @@ def __init__(
                 the read data (default "datatype").
             file_col (str): The name of the column that contains the file path
                 for the read data (default "datafile").
-            index_col (str): The name of the column that contains the index file
-                for the data (default "indexfile").
 
         """
         super().__init__(**kwargs)
@@ -57,11 +54,10 @@ def __init__(
         self._sample_col = sample_col
         self._type_col = type_col
         self._file_col = file_col
-        self._index_col = index_col
         self._seen = set()
-        self.modified = []
+        self.validated = []
 
-    def validate_and_transform(self, row):
+    def validate(self, row):
         """
         Perform all validations on the given row.
 
@@ -73,9 +69,8 @@ def validate_and_transform(self, row):
         self._validate_sample(row)
         self._validate_type(row)
         self._validate_data_file(row)
-        self._validate_index_file(row)
         self._seen.add((row[self._sample_col], row[self._file_col]))
-        self.modified.append(row)
+        self.validated.append(row)
 
     def _validate_sample(self, row):
         """Assert that the sample name exists and convert spaces to underscores."""
@@ -98,17 +93,6 @@ def _validate_data_file(self, row):
             raise AssertionError("Data file is required.")
         self._validate_data_format(row[self._file_col])
 
-    def _validate_index_file(self, row):
-        """Assert that the indexfile is non-empty and has the right format."""
-        if len(row[self._index_col]) <= 0:
-            raise AssertionError("Data index file is required.")
-        if row[self._file_col].endswith("bam") and not (
-            row[self._index_col].endswith("bai") or row[self._index_col].endswith("csi")
-        ):
-            raise AssertionError("bai or csi index file should be given for bam file.")
-        if row[self._file_col].endswith("cram") and not row[self._index_col].endswith("crai"):
-            raise AssertionError("crai index file shuld be given for cram file.")
-
     def _validate_data_format(self, filename):
         """Assert that a given filename has one of the expected read data file extensions."""
         if not any(filename.endswith(extension) for extension in self.DATA_VALID_FORMATS):
@@ -121,17 +105,9 @@ def validate_unique_samples(self):
         """
         Assert that the combination of sample name and data filename is unique.
 
-        In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the
-        number of times the same sample exist, but with different files, e.g., multiple runs per experiment.
-
         """
-        if len(self._seen) != len(self.modified):
+        if len(self._seen) != len(self.validated):
             raise AssertionError("The combination of sample name and data file must be unique.")
-        seen = Counter()
-        for row in self.modified:
-            sample = row[self._sample_col]
-            seen[sample] += 1
-            row[self._sample_col] = f"{sample}_T{seen[sample]}"
 
 
 def read_head(handle, num_lines=10):
@@ -162,7 +138,7 @@ def sniff_format(handle):
     peek = read_head(handle)
     handle.seek(0)
     sniffer = csv.Sniffer()
-    # same input file could retrun random true or false
+    # the same input file could randomly return true or false
     # disable it now
     # the following validation should be enough
     # if not sniffer.has_header(peek):
@@ -188,16 +164,17 @@ def check_samplesheet(file_in, file_out):
    This function checks that the samplesheet follows the following structure,
    see also the `variantcalling samplesheet`_::
 
-        sample,datatype,datafile,indexfile
-        sample1,pacbio,/path/to/data/file/file1.bam,/path/to/index/file/file1.bam.bai
-        sample2,pacbio,/path/to/data/file/file2.cram,/path/to/index/file/file2.cram.crai
-        sample3,pacbio,/path/to/data/file/file3.bam,/path/to/index/file/file3.bam.csi
+        sample,datatype,datafile
+        sample1,pacbio,/path/to/data/file/file1.bam
+        sample2,pacbio,/path/to/data/file/file2.cram
+        sample3,pacbio,/path/to/data/file/file3-1.bam
+        sample3,pacbio,/path/to/data/file/file3-2.cram
 
    .. _variantcalling samplesheet: https://raw.githubusercontent.com/sanger-tol/variantcalling/main/assets/samplesheet.csv
 
    """
-    required_columns = {"sample", "datatype", "datafile", "indexfile"}
+    required_columns = {"sample", "datatype", "datafile"}
 
    # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`.
    with file_in.open(newline="") as in_handle:
        reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle))
@@ -210,7 +187,7 @@ def check_samplesheet(file_in, file_out):
    checker = RowChecker()
    for i, row in enumerate(reader):
        try:
-            checker.validate_and_transform(row)
+            checker.validate(row)
        except AssertionError as error:
            logger.critical(f"{str(error)} On line {i + 2}.")
            sys.exit(1)
@@ -220,7 +197,7 @@ def check_samplesheet(file_in, file_out):
    with file_out.open(mode="w", newline="") as out_handle:
        writer = csv.DictWriter(out_handle, header, delimiter=",")
        writer.writeheader()
-        for row in checker.modified:
+        for row in checker.validated:
            writer.writerow(row)
diff --git a/conf/modules.config b/conf/modules.config
index 7dc8677..0deaa90 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -22,6 +22,11 @@ process {
 
    withName: '.*:INPUT_FILTER_SPLIT:SAMTOOLS_VIEW' {
        ext.args = '--output-fmt cram --write-index -F 0x900'
+        ext.prefix = { "${meta.id}_filtered" }
+    }
+
+    withName: '.*:INPUT_MERGE:SAMTOOLS_MERGE' {
+        ext.args = '--write-index'
    }
 
    withName: '.*:DEEPVARIANT_CALLER:DEEPVARIANT' {
diff --git a/conf/test.config b/conf/test.config
index be32ebe..01515e9 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -25,9 +25,9 @@ params {
    // Fasta references
    fasta = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz'
 
-    // Reference index file
-    fai = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.fai'
-    gzi = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.gzi'
+    // Reference index file (optional)
+    // fai = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.fai'
+    // fai = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.gzi'
 
    // Interval bed file
    interval = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bed'
diff --git a/conf/test_full.config b/conf/test_full.config
index 8532e0d..3a2d38e 100644
--- a/conf/test_full.config
+++ b/conf/test_full.config
@@ -23,6 +23,5 @@ params {
    fasta = '/lustre/scratch124/tol/projects/darwin/data/insects/Cantharis_rufa/assembly/release/icCanRufa1.1/insdc/GCA_947369205.1.fasta.gz'
 
    // Reference index file
-    fai = '/lustre/scratch124/tol/projects/darwin/data/insects/Cantharis_rufa/assembly/release/icCanRufa1.1/insdc/GCA_947369205.1.fasta.gz.fai'
-    gzi = '/lustre/scratch124/tol/projects/darwin/data/insects/Cantharis_rufa/assembly/release/icCanRufa1.1/insdc/GCA_947369205.1.fasta.gz.gzi'
+    fai = '/lustre/scratch124/tol/projects/darwin/data/insects/Cantharis_rufa/assembly/release/icCanRufa1.1/insdc/GCA_947369205.1.fasta.gz.gzi'
 }
diff --git a/docs/usage.md b/docs/usage.md
index af4811c..46b74b3 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -2,11 +2,11 @@
 
 ## Introduction
 
-The pipleline takes aligned sample reads (CRAM/BAM files and their index files) from a CSV file and a reference file in FASTA format, and then use DeepVariant to call variants.
+The pipeline takes aligned sample reads (CRAM/BAM files) from a CSV file and a reference file in FASTA format, and then uses DeepVariant to call variants.
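+
+For example, a typical run could look like the sketch below (illustrative only: the samplesheet, reference path, output directory and `-profile` choice are placeholders to adapt to your own setup):
+
+```bash
+nextflow run sanger-tol/variantcalling \
+    --input samplesheet.csv \
+    --fasta genome.fasta.gz \
+    --outdir results \
+    -profile docker
+```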
 
 ## Samplesheet input
 
-You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use the `input` parameter to specify the samplesheet location. It has to be a comma-separated file with at least 4 columns, and a header row as shown in the examples below.
+You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use the `input` parameter to specify the samplesheet location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below.
 
 ```bash
 --input '[path to samplesheet file]'
 ```
 
@@ -17,10 +17,10 @@ You will need to create a samplesheet with information about the samples you wou
 The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. Below is an example for the same sample sequenced across 3 lanes:
 
 ```console
-sample,datatype,datafile,indexfile
-sample1,pacbio,sample1_1.cram,sample1_1.cram.crai
-sample1,pacbio,sample1_2.cram,sample1_3.cram.crai
-sample1,pacbio,sample1_3.cram,sample1_3.cram.crai
+sample,datatype,datafile
+sample1,pacbio,sample1_1.cram
+sample1,pacbio,sample1_2.cram
+sample1,pacbio,sample1_3.cram
 ```
 
 ### Full samplesheet
 
@@ -28,18 +28,17 @@
 A final samplesheet file consisting of both BAM or CRAM will look like this. Currently this pipeline only supports Pacbio aligned data.
 
 ```console
-sample,datatype,datafile,indexfile
-sample1,pacbio,/path/to/data/file/file1.bam,/path/to/index/file/file1.bam.bai
-sample2,pacbio,/path/to/data/file/file2.cram,/path/to/index/file/file2.cram.crai
-sample3,pacbio,/path/to/data/file/file3.bam,/path/to/index/file/file3.bam.csi
+sample,datatype,datafile
+sample1,pacbio,/path/to/data/file/file1.bam
+sample2,pacbio,/path/to/data/file/file2.cram
+sample3,pacbio,/path/to/data/file/file3.bam
 ```
 
-| Column      | Description                                                                                                                                                                            |
-| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `sample`    | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
-| `datatype`  | Sequencing data type. Must be `pacbio`.                                                                                                                                                |
-| `datafile`  | The location for either BAM or CRAM file.                                                                                                                                              |
-| `indexfile` | The location for BAM or CRAM index file – BAI, CSI or CRAI.                                                                                                                            |
+| Column     | Description                                                                                                                                                                            |
+| ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `sample`   | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
+| `datatype` | Sequencing data type. Must be `pacbio`.                                                                                                                                                |
+| `datafile` | The location for either BAM or CRAM file.                                                                                                                                              |
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
 
@@ -62,7 +61,7 @@ work # Directory containing the nextflow working files
 # Other nextflow hidden files, eg. history of pipeline runs and old logs.
 ```
 
-The pipeline will split the intput fasta file into smaller files to run DeepVariant parallel. You can set the minimum split fasta file size from the command line. For example to set the minimum size as 10K using `--split_fasta_cutoff 10000`.
+The pipeline will split the input fasta file into smaller files to run DeepVariant in parallel. You can set the minimum split fasta file size from the command line. For example, to set the minimum size to 10K, use `--split_fasta_cutoff 10000`.
 
 ### Updating the pipeline
 
diff --git a/modules.json b/modules.json
index bc363d0..956cd97 100644
--- a/modules.json
+++ b/modules.json
@@ -30,6 +30,17 @@
            "git_sha": "fd742419940e01ba1c5ecb172c3e32ec840662fe",
            "installed_by": ["modules"]
        },
+        "samtools/merge": {
+            "branch": "master",
+            "git_sha": "e7ce60acc8a33fa17429e966364657a63016e870",
+            "installed_by": ["modules"],
+            "patch": "modules/nf-core/samtools/merge/samtools-merge.diff"
+        },
+        "samtools/sort": {
+            "branch": "master",
+            "git_sha": "a0f7be95788366c1923171e358da7d049eb440f9",
+            "installed_by": ["modules"]
+        },
        "samtools/view": {
            "branch": "master",
            "git_sha": "3ffae3598260a99e8db3207dead9f73f87f90d1f",
diff --git a/modules/nf-core/samtools/merge/environment.yml b/modules/nf-core/samtools/merge/environment.yml
new file mode 100644
index 0000000..04c82f1
--- /dev/null
+++ b/modules/nf-core/samtools/merge/environment.yml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::samtools=1.17
diff --git a/modules/nf-core/samtools/merge/main.nf b/modules/nf-core/samtools/merge/main.nf
new file mode 100644
index 0000000..21f785c
--- /dev/null
+++ b/modules/nf-core/samtools/merge/main.nf
@@ -0,0 +1,57 @@
+process SAMTOOLS_MERGE {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' :
+        'biocontainers/samtools:1.17--h00cdaf9_0' }"
+
+    input:
+    tuple val(meta), path(input_files, stageAs: "?/*")
+    tuple val(meta2), path(fasta)
+    tuple val(meta3), path(fai)
+
+    output:
+    tuple val(meta), path("${prefix}.bam") , optional:true, emit: bam
+    tuple val(meta), path("${prefix}.cram"), optional:true, emit: cram
+    tuple val(meta), path("*.csi")         , optional:true, emit: csi
+    tuple val(meta), path("*.crai")        , optional:true, emit: crai
+    path "versions.yml"                    , emit: versions
+
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    prefix = task.ext.prefix ?: "${meta.id}"
+    def file_type = input_files instanceof List ? input_files[0].getExtension() : input_files.getExtension()
+    def reference = fasta ? "--reference ${fasta}" : ""
+    """
+    samtools \\
+        merge \\
+        --threads ${task.cpus-1} \\
+        $args \\
+        ${reference} \\
+        ${prefix}.${file_type} \\
+        $input_files
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    prefix = task.ext.suffix ? "${meta.id}${task.ext.suffix}" : "${meta.id}"
+    def file_type = input_files instanceof List ? input_files[0].getExtension() : input_files.getExtension()
+    """
+    touch ${prefix}.${file_type}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/samtools/merge/meta.yml b/modules/nf-core/samtools/merge/meta.yml
new file mode 100644
index 0000000..2e8f3db
--- /dev/null
+++ b/modules/nf-core/samtools/merge/meta.yml
@@ -0,0 +1,83 @@
+name: samtools_merge
+description: Merge BAM or CRAM file
+keywords:
+  - merge
+  - bam
+  - sam
+  - cram
+tools:
+  - samtools:
+      description: |
+        SAMtools is a set of utilities for interacting with and post-processing
+        short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li.
+        These files are generated as output by short read aligners like BWA.
+      homepage: http://www.htslib.org/
+      documentation: http://www.htslib.org/doc/samtools.html
+      doi: 10.1093/bioinformatics/btp352
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - input_files:
+      type: file
+      description: BAM/CRAM file
+      pattern: "*.{bam,cram,sam}"
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing reference information
+        e.g. [ id:'genome' ]
+  - fasta:
+      type: file
+      description: Reference file the CRAM was created with (optional)
+      pattern: "*.{fasta,fa}"
+  - meta3:
+      type: map
+      description: |
+        Groovy Map containing reference information
+        e.g. [ id:'genome' ]
+  - fai:
+      type: file
+      description: Index of the reference file the CRAM was created with (optional)
+      pattern: "*.fai"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: BAM file
+      pattern: "*.{bam}"
+  - cram:
+      type: file
+      description: CRAM file
+      pattern: "*.{cram}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - csi:
+      type: file
+      description: BAM index file (optional)
+      pattern: "*.csi"
+  - crai:
+      type: file
+      description: CRAM index file (optional)
+      pattern: "*.crai"
+authors:
+  - "@drpatelh"
+  - "@yuukiiwa "
+  - "@maxulysse"
+  - "@FriederikeHanssen"
+  - "@ramprasadn"
+maintainers:
+  - "@drpatelh"
+  - "@yuukiiwa "
+  - "@maxulysse"
+  - "@FriederikeHanssen"
+  - "@ramprasadn"
diff --git a/modules/nf-core/samtools/sort/main.nf b/modules/nf-core/samtools/sort/main.nf
new file mode 100644
index 0000000..2b7753f
--- /dev/null
+++ b/modules/nf-core/samtools/sort/main.nf
@@ -0,0 +1,49 @@
+process SAMTOOLS_SORT {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "bioconda::samtools=1.17"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' :
+        'biocontainers/samtools:1.17--h00cdaf9_0' }"
+
+    input:
+    tuple val(meta), path(bam)
+
+    output:
+    tuple val(meta), path("*.bam"), emit: bam
+    tuple val(meta), path("*.csi"), emit: csi, optional: true
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
+    """
+    samtools sort \\
+        $args \\
+        -@ $task.cpus \\
+        -o ${prefix}.bam \\
+        -T $prefix \\
+        $bam
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}.bam
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/samtools/sort/meta.yml b/modules/nf-core/samtools/sort/meta.yml
new file mode 100644
index 0000000..0732843
--- /dev/null
+++ b/modules/nf-core/samtools/sort/meta.yml
@@ -0,0 +1,48 @@
+name: samtools_sort
+description: Sort SAM/BAM/CRAM file
+keywords:
+  - sort
+  - bam
+  - sam
+  - cram
+tools:
+  - samtools:
+      description: |
+        SAMtools is a set of utilities for interacting with and post-processing
+        short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li.
+        These files are generated as output by short read aligners like BWA.
+      homepage: http://www.htslib.org/
+      documentation: http://www.htslib.org/doc/samtools.html
+      doi: 10.1093/bioinformatics/btp352
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: BAM/CRAM/SAM file
+      pattern: "*.{bam,cram,sam}"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: Sorted BAM/CRAM/SAM file
+      pattern: "*.{bam,cram,sam}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - csi:
+      type: file
+      description: BAM index file (optional)
+      pattern: "*.csi"
+authors:
+  - "@drpatelh"
+  - "@ewels"
diff --git a/nextflow.config b/nextflow.config
index 83e274f..399b382 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -13,7 +13,6 @@ params {
    input = null
    fasta = null
    fai = null
-    gzi = null
    interval = null
    split_fasta_cutoff = 100000
 
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 3a5272c..d40c501 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -45,15 +45,11 @@
        "properties": {
            "fasta": {
                "type": "string",
-                "description": "Path to FASTA genome file, either fasta or fast.gz"
+                "description": "Path to FASTA genome file, either fasta or fasta.gz."
            },
            "fai": {
                "type": "string",
-                "description": "Path to the index file of the FASTA genome file."
-            },
-            "gzi": {
-                "type": "string",
-                "description": "Path to the gzi index file of the FASTA genome file. Required if fasta in gz format."
+                "description": "Path to the index file of the FASTA genome file, either fai or gzi."
            },
            "interval": {
                "type": "string",
@@ -66,7 +62,7 @@
                "description": "The minimum fasta file size when splitting the input fasta file by sequence."
            }
        },
-        "required": ["fasta", "fai"]
+        "required": ["fasta"]
    },
    "institutional_config_options": {
        "title": "Institutional config options",
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index d2f72e9..7e9f667 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -12,10 +12,13 @@ workflow INPUT_CHECK {
    SAMPLESHEET_CHECK ( samplesheet )
        .csv
        .splitCsv ( header:true, sep:',' )
-        .map { [[id: it.sample, type: it.datatype], file(it.datafile), file(it.indexfile)] }
+        .map { [
+            [ id: it.sample, sample: it.sample, type: it.datatype ],
+            file(it.datafile)
+        ] }
        .set { reads }
-
+
    emit:
-    reads // channel: [ val(meta), data, index ]
+    reads // channel: [ val(meta), data ]
    versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ]
 }
diff --git a/subworkflows/local/input_filter_split.nf b/subworkflows/local/input_filter_split.nf
index c820901..dc0710f 100644
--- a/subworkflows/local/input_filter_split.nf
+++ b/subworkflows/local/input_filter_split.nf
@@ -9,8 +9,6 @@ include { CAT_CAT } from '../../modules/nf-core/cat/cat/main'
 workflow INPUT_FILTER_SPLIT {
    take:
    fasta // file: /path/to/genome.fasta or /path/to/genome.fasta.gz
-    fai // file: /path/to/genome.*.fai
-    gzi // file: /path/to/genome.fasta.gz.gzi or null
    reads // [ val(meta), data, index ]
    interval // file: /path/to/intervals.bed
    split_fasta_cutoff // val(min_file_size)
@@ -19,8 +17,7 @@ workflow INPUT_FILTER_SPLIT {
    ch_versions = Channel.empty()
 
    // split the fasta file into files with one sequence each, group them by file size
-    Channel
-        .fromPath ( fasta )
+    fasta
        .splitFasta ( file:true )
        .branch {
            small: it.size() < split_fasta_cutoff
@@ -62,13 +59,15 @@ workflow INPUT_FILTER_SPLIT {
        .set { fasta_fai }
 
    // filter reads
-    SAMTOOLS_VIEW ( reads, [ [], fasta ], [] )
+    ch_fasta = fasta.map { fasta -> [ [ 'id': fasta.baseName ], fasta ] }.first()
+
+    SAMTOOLS_VIEW ( reads, ch_fasta, [] )
    ch_versions = ch_versions.mix ( SAMTOOLS_VIEW.out.versions.first() )
 
    // combine reads with splitted references
    SAMTOOLS_VIEW.out.cram
        .join ( SAMTOOLS_VIEW.out.crai )
-        .map { filtered_reads -> filtered_reads + [interval ?: []] }
+        .combine(interval.ifEmpty([[]]))
        .combine ( fasta_fai )
        .set { cram_crai_fasta_fai }
diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf
new file mode 100644
index 0000000..90bb82f
--- /dev/null
+++ b/subworkflows/local/input_merge.nf
@@ -0,0 +1,70 @@
+//
+// Merge reads (BAM or CRAM files) together by sample name
+//
+
+include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge'
+include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort'
+
+workflow INPUT_MERGE {
+    take:
+    fasta // file: /path/to/genome.fasta or /path/to/genome.fasta.gz
+    fai // file: /path/to/genome.*.fai or /path/to/genome.fasta.gz.gzi
+    reads // channel: [ val(meta), data ]
+
+    main:
+    ch_versions = Channel.empty()
+
+    // group input meta data together by sample name
+    reads
+        .map{ meta, bam_cram -> [ meta.sample, meta ] }
+        .groupTuple()
+        .set{ grouped_reads_meta }
+
+    // sort input reads
+    SAMTOOLS_SORT( reads )
+    ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions )
+    sorted_reads = SAMTOOLS_SORT.out.bam
+
+    // group input reads file by sample name
+    sorted_reads
+        .map{ meta, bam_cram -> [ meta.sample, bam_cram ] }
+        .groupTuple()
+        .set{ grouped_reads }
+
+    // join grouped reads and meta
+    // use the first meta data for the combined reads
+    grouped_reads_meta
+        .map { sample, meta_list -> [sample, meta_list[0]] }
+        .join( grouped_reads )
+        .map { sample, meta, bam_cram_list -> [
+            [ id: ( bam_cram_list.size() == 1 ) ? sample : sample + '_combined',
+              type: meta.type
+            ],
+            bam_cram_list
+        ]}
+        .set { grouped_reads_with_meta }
+
+    // call samtools merge
+    ch_fasta = fasta.map { fasta -> [ [ 'id': fasta.baseName ], fasta ] }.first()
+    ch_fai = fai.map { fai -> [ [ 'id': fai.baseName ], fai ] }.first()
+
+    SAMTOOLS_MERGE( grouped_reads_with_meta,
+        ch_fasta,
+        ch_fai
+    )
+    ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions )
+
+    // concat merged bam or cram together along with their index file
+    SAMTOOLS_MERGE.out.bam
+        .join(SAMTOOLS_MERGE.out.csi)
+        .concat(
+            SAMTOOLS_MERGE.out.cram
+                .join(SAMTOOLS_MERGE.out.crai)
+        )
+        .set{ indexed_merged_reads }
+
+    emit:
+    indexed_merged_reads = indexed_merged_reads
+    versions = ch_versions // channel: [ versions.yml ]
+
+}
diff --git a/workflows/variantcalling.nf b/workflows/variantcalling.nf
index 6c6ce09..82267f7 100644
--- a/workflows/variantcalling.nf
+++ b/workflows/variantcalling.nf
@@ -10,25 +10,28 @@ def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params)
 WorkflowVariantcalling.initialise(params, log)
 
 // Check input path parameters to see if they exist
-def checkPathParamList = [ params.input, params.fasta, params.fai, params.gzi, params.interval ]
+def checkPathParamList = [ params.input, params.fasta, params.fai, params.interval ]
 for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
 
 // Check mandatory parameters
-if (params.input) { input_file = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }
-if (params.fasta) { fasta_file = file(params.fasta) } else { exit 1, 'Reference fasta not specified!' }
-if (params.fai) { fai_file = file(params.fai) } else { exit 1, 'Reference fasta index not specified!' }
-
-// Check gzi being given if compressed fasta is provided
-if (params.gzi) {
-    gzi_file = file(params.gzi)
-} else if ( params.fasta.endsWith('fasta.gz') ) {
-    exit 1, 'Reference fasta index gzi file not specified for fasta.gz file!'
-} else {
-    gzi_file = null
-}
+if (params.input) { ch_input = Channel.fromPath(params.input) } else { exit 1, 'Input samplesheet not specified!' }
+if (params.fasta) { ch_fasta = Channel.fromPath(params.fasta) } else { exit 1, 'Reference fasta not specified!' }
 
 // Check optional parameters
-if (params.interval) { interval_file = file(params.interval) } else { interval_file = null }
+if (params.fai){
+    if( ( params.fasta.endsWith('.gz') && params.fai.endsWith('.fai') )
+        ||
+        ( !params.fasta.endsWith('.gz') && params.fai.endsWith('.gzi') )
+    ){
+        exit 1, 'Reference fasta and its index file formats do not match!'
+    }
+    ch_fai = Channel.fromPath(params.fai)
+} else {
+    ch_fai = Channel.empty()
+}
+
+if (params.interval){ ch_interval = Channel.fromPath(params.interval) } else { ch_interval = Channel.empty() }
+
 if (params.split_fasta_cutoff ) { split_fasta_cutoff = params.split_fasta_cutoff } else { split_fasta_cutoff = 100000 }
 
 /*
@@ -47,6 +50,7 @@ if (params.split_fasta_cutoff ) { split_fasta_cutoff = params.split_fasta_cutoff
 //
 // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
 //
 include { INPUT_CHECK } from '../subworkflows/local/input_check'
+include { INPUT_MERGE } from '../subworkflows/local/input_merge'
 include { INPUT_FILTER_SPLIT } from '../subworkflows/local/input_filter_split'
 include { DEEPVARIANT_CALLER } from '../subworkflows/local/deepvariant_caller'
@@ -60,6 +64,7 @@ include { DEEPVARIANT_CALLER } from '../subworkflows/local/deepvariant_caller'
 //
 // MODULE: Installed directly from nf-core/modules
 //
 include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'
+include { SAMTOOLS_FAIDX } from '../modules/nf-core/samtools/faidx/main'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -73,27 +78,66 @@ workflow VARIANTCALLING {
 
     ch_versions = Channel.empty()
 
+    //
+    // check whether the reference fasta index was given
+    //
+    if( params.fai == null ){
+
+        ch_fasta
+            .map { fasta -> [ [ 'id': fasta.baseName ], fasta ] }
+            .set { ch_genome }
+
+        SAMTOOLS_FAIDX ( ch_genome, [[], []] )
+        ch_versions = ch_versions.mix( SAMTOOLS_FAIDX.out.versions )
+
+        SAMTOOLS_FAIDX.out.fai
+            .map{ meta, fai -> fai }
+            .set{ ch_fai }
+
+        SAMTOOLS_FAIDX.out.gzi
+            .map{ meta, gzi -> gzi }
+            .set{ ch_gzi }
+
+        if( params.fasta.endsWith('.gz') ){
+            ch_index = ch_gzi
+        } else {
+            ch_index = ch_fai
+        }
+
+    } else {
+        ch_index = ch_fai
+    }
+
     //
     // SUBWORKFLOW: Read in samplesheet, validate and stage input files
     //
     INPUT_CHECK (
-        input_file
+        ch_input
     )
     ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)
 
+    //
+    // SUBWORKFLOW: merge the input reads by sample name
+    //
+    INPUT_MERGE (
+        ch_fasta,
+        ch_index,
+        INPUT_CHECK.out.reads
+    )
+    ch_versions = ch_versions.mix(INPUT_MERGE.out.versions)
+
+
     //
     // SUBWORKFLOW: split the input fasta file and filter input reads
     //
     INPUT_FILTER_SPLIT (
-        fasta_file,
-        fai_file,
-        gzi_file,
-        INPUT_CHECK.out.reads,
-        interval_file,
+        ch_fasta,
+        INPUT_MERGE.out.indexed_merged_reads,
+        ch_interval,
         split_fasta_cutoff
     )
     ch_versions = ch_versions.mix(INPUT_FILTER_SPLIT.out.versions)
-
+
     //
     // SUBWORKFLOW: call deepvariant
     //