Remove input read indices, make reference index optional and combine same sample together #59

Merged: 27 commits, Nov 3, 2023

Changes from all commits
3f89203
Remove index file from the samplesheet and update checking script
gq1 Oct 5, 2023
cef527b
Remove indexfile from Input_check workflow
gq1 Oct 5, 2023
67826e3
add sample name to the meta data
gq1 Oct 5, 2023
2d30d82
nf-core modules install samtools/merge
gq1 Oct 5, 2023
953658f
Add input merge sub workflow
gq1 Oct 5, 2023
0d0d06e
comments
gq1 Oct 5, 2023
224c7f2
patch samtools_merge module to allow using fasta.gz file with gzi ind…
gq1 Oct 5, 2023
36f4c67
use original sample name for id if just 1, otherwise add _combined.
gq1 Oct 6, 2023
a808cf2
Patch samtools_merge module again add indexing, emit crai index file a…
gq1 Oct 6, 2023
c646b8b
emit crai files as well
gq1 Oct 6, 2023
c4a762f
nf-core modules install samtools/sort
gq1 Oct 6, 2023
9fa3696
Add an option to sort input if not sorted.
gq1 Oct 6, 2023
5cbea5b
combine merged bam/cram together, add with their index files as well.
gq1 Oct 6, 2023
3ac73e9
add filtered to distinguish the samtools input and output name
gq1 Oct 6, 2023
bc6e5a0
use the merged read for the rest of pipeline
gq1 Oct 6, 2023
a7131a8
convert all input files into channels, and make reference fasta index …
gq1 Oct 6, 2023
0265cbf
use the first for the reference fasta channel
gq1 Oct 6, 2023
d4d38cc
move write-index flag to the config file
gq1 Oct 6, 2023
ea046b9
make sure work file when no interval file given
gq1 Oct 7, 2023
7506e81
formatting and documents
gq1 Oct 7, 2023
4dacfa8
[automated] Fix linting with Prettier
nf-core-bot Oct 7, 2023
ebaa7e4
Update conf/test.config
gq1 Oct 13, 2023
1b2c45e
nf-core modules update samtools/merge
gq1 Oct 27, 2023
a76298f
Update samtools/merge module and remove its patch.
gq1 Nov 1, 2023
3cf953d
remove sort_input params. Always sort the input before merging.
gq1 Nov 1, 2023
d6fa00f
only validate the sample sheet not transform the sample names
gq1 Nov 2, 2023
3e8705c
update fai file for the full test and formatting
gq1 Nov 2, 2023
3 changes: 2 additions & 1 deletion README.md
@@ -18,11 +18,12 @@ On release, automated continuous integration tests run the pipeline on a full-si

## Pipeline summary

-The pipleline takes aligned PacBio sample reads (CRAM/BAM files and their index files) from a CSV file and the reference file in FASTA format, and then uses DeepVariant tool to make variant calling.
+The pipeline takes aligned PacBio sample reads (CRAM/BAM files) from a CSV file and the reference file in FASTA format, and then uses DeepVariant tool to make variant calling.

Steps involved:

- Split fasta file into smaller files, normally one sequence per file unless the sequences are too small.
+- Merge input BAM/CRAM files together if they have the same sample names.
- Filter out reads using the `-F 0x900` option to only retain the primary alignments.
- Run DeepVariant using filtered BAM/CRAM files against each of split fasta files.
- Merge all VCF and GVCF files generated by DeepVariant by sample together for each input BAM/CRAM file.
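The read-filtering step above relies on SAM flag arithmetic, which can be illustrated with a short sketch. This is illustrative only, not pipeline code; the bit values are the standard SAM flags for secondary (0x100) and supplementary (0x800) alignments, which together form the `0x900` mask that `samtools view -F` excludes:

```python
# Standard SAM flag bits; `-F 0x900` drops any record with either bit
# set, leaving only primary alignments.
SECONDARY = 0x100
SUPPLEMENTARY = 0x800
FILTER_MASK = SECONDARY | SUPPLEMENTARY  # == 0x900

def keep_read(flag: int) -> bool:
    """Return True if a record with this SAM flag survives `-F 0x900`."""
    return flag & FILTER_MASK == 0

print(keep_read(0x0))    # primary alignment -> True
print(keep_read(0x100))  # secondary alignment -> False
print(keep_read(0x800))  # supplementary alignment -> False
```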
9 changes: 5 additions & 4 deletions assets/samplesheet.csv
@@ -1,4 +1,5 @@
-sample,datatype,datafile,indexfile
-sample1,pacbio,/path/to/data/file/file1.bam,/path/to/index/file/file1.bam.bai
-sample2,pacbio,/path/to/data/file/file2.cram,/path/to/index/file/file2.cram.crai
-sample3,pacbio,/path/to/data/file/file3.bam,/path/to/index/file/file3.bam.csi
+sample,datatype,datafile
+sample1,pacbio,/path/to/data/file/file1.bam
+sample2,pacbio,/path/to/data/file/file2.cram
+sample3,pacbio,/path/to/data/file/file3-1.bam
+sample3,pacbio,/path/to/data/file/file3-2.cram
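The structural change to the sheet is that a `sample` value may now repeat across rows, with the files merged downstream. The grouping the merge step relies on can be sketched like this — a hypothetical helper for illustration, not the pipeline's own code, with made-up paths:

```python
import csv
from collections import defaultdict
from io import StringIO

SHEET = """\
sample,datatype,datafile
sample1,pacbio,/data/file1.bam
sample3,pacbio,/data/file3-1.bam
sample3,pacbio,/data/file3-2.cram
"""

def group_by_sample(handle):
    """Collect the data files belonging to each sample name."""
    groups = defaultdict(list)
    for row in csv.DictReader(handle):
        groups[row["sample"]].append(row["datafile"])
    return dict(groups)

groups = group_by_sample(StringIO(SHEET))
print(groups["sample3"])  # ['/data/file3-1.bam', '/data/file3-2.cram']
```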
9 changes: 5 additions & 4 deletions assets/samplesheet_test.csv
@@ -1,4 +1,5 @@
-sample,datatype,datafile,indexfile
-icCanRufa1_crai,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram.crai
-icCanRufa1_bai,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam.bai
-icCanRufa1_csi,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam.csi
+sample,datatype,datafile
+icCanRufa1_cram,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram
+icCanRufa1_bam,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam
+icCanRufa1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram
+icCanRufa1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam
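Commit 36f4c67 states the id rule for merged output: keep the original sample name when only one file was given, otherwise append `_combined`. A hypothetical sketch of that rule (not the pipeline's actual implementation):

```python
def merged_id(sample: str, n_files: int) -> str:
    """Id naming rule described in commit 36f4c67 (sketch, not pipeline code)."""
    return sample if n_files == 1 else f"{sample}_combined"

print(merged_id("icCanRufa1_bam", 1))  # icCanRufa1_bam
print(merged_id("icCanRufa1", 2))      # icCanRufa1_combined
```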
4 changes: 2 additions & 2 deletions assets/samplesheet_test_full.csv
@@ -1,2 +1,2 @@
-sample,datatype,datafile,indexfile
-icCanRufa1,pacbio,/lustre/scratch123/tol/resources/nextflow/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1.cram,/lustre/scratch123/tol/resources/nextflow/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1.cram.crai
+sample,datatype,datafile
+icCanRufa1,pacbio,/lustre/scratch123/tol/resources/nextflow/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1.cram
7 changes: 1 addition & 6 deletions assets/schema_input.json
@@ -21,13 +21,8 @@
"type": "string",
"pattern": "^\\S+\\.(bam|cram)$",
"errorMessage": "Data file for reads cannot contain spaces and must have extension 'cram' or 'bam'"
-},
-"indexfile": {
-"type": "string",
-"pattern": "^\\S+\\.(bai|csi|crai)$",
-"errorMessage": "Data index file for reads cannot contain spaces and must have extension 'bai', 'csi' or 'crai'"
}
},
-"required": ["sample", "datatype", "datafile", "indexfile"]
+"required": ["sample", "datatype", "datafile"]
}
}
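The surviving `datafile` pattern can be exercised directly. A quick sketch using Python's `re` with the same regex as the schema (the example paths are invented):

```python
import re

# Same pattern as assets/schema_input.json: no whitespace, .bam or .cram only.
DATAFILE_RE = re.compile(r"^\S+\.(bam|cram)$")

print(bool(DATAFILE_RE.match("/path/to/file1.bam")))  # True
print(bool(DATAFILE_RE.match("file2.cram.crai")))     # False: index extensions no longer pass
print(bool(DATAFILE_RE.match("my file.bam")))         # False: contains a space
```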
51 changes: 14 additions & 37 deletions bin/check_samplesheet.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python

"""Provide a command line tool to validate and transform tabular samplesheets."""
"""Provide a command line tool to validate tabular samplesheets."""


import argparse
@@ -35,7 +35,6 @@ def __init__(
sample_col="sample",
type_col="datatype",
file_col="datafile",
index_col="indexfile",
**kwargs,
):
"""
@@ -48,20 +47,17 @@ def __init__(
the read data (default "datatype").
file_col (str): The name of the column that contains the file path for
the read data (default "datafile").
-index_col (str): The name of the column that contains the index file
-for the data (default "indexfile").

"""
super().__init__(**kwargs)

self._sample_col = sample_col
self._type_col = type_col
self._file_col = file_col
-self._index_col = index_col
self._seen = set()
-self.modified = []
+self.validated = []

-def validate_and_transform(self, row):
+def validate(self, row):
"""
Perform all validations on the given row.

@@ -73,9 +69,8 @@ def validate_and_transform(self, row):
self._validate_sample(row)
self._validate_type(row)
self._validate_data_file(row)
-self._validate_index_file(row)
self._seen.add((row[self._sample_col], row[self._file_col]))
-self.modified.append(row)
+self.validated.append(row)

def _validate_sample(self, row):
"""Assert that the sample name exists and convert spaces to underscores."""
@@ -98,17 +93,6 @@ def _validate_data_file(self, row):
raise AssertionError("Data file is required.")
self._validate_data_format(row[self._file_col])

-def _validate_index_file(self, row):
-"""Assert that the indexfile is non-empty and has the right format."""
-if len(row[self._index_col]) <= 0:
-raise AssertionError("Data index file is required.")
-if row[self._file_col].endswith("bam") and not (
-row[self._index_col].endswith("bai") or row[self._index_col].endswith("csi")
-):
-raise AssertionError("bai or csi index file should be given for bam file.")
-if row[self._file_col].endswith("cram") and not row[self._index_col].endswith("crai"):
-raise AssertionError("crai index file shuld be given for cram file.")

def _validate_data_format(self, filename):
"""Assert that a given filename has one of the expected read data file extensions."""
if not any(filename.endswith(extension) for extension in self.DATA_VALID_FORMATS):
@@ -121,17 +105,9 @@ def validate_unique_samples(self):
"""
Assert that the combination of sample name and data filename is unique.

-In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the
-number of times the same sample exist, but with different files, e.g., multiple runs per experiment.

"""
-if len(self._seen) != len(self.modified):
+if len(self._seen) != len(self.validated):
raise AssertionError("The combination of sample name and data file must be unique.")
-seen = Counter()
-for row in self.modified:
-sample = row[self._sample_col]
-seen[sample] += 1
-row[self._sample_col] = f"{sample}_T{seen[sample]}"

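The simplified uniqueness check — set size against row count, with the old `_T{n}` renaming gone — can be sketched standalone. This is a hypothetical mirror of the diff, not an import of the real script:

```python
def validate_unique_samples(rows):
    """Each (sample, datafile) combination must be unique; rows are left unmodified."""
    seen = {(row["sample"], row["datafile"]) for row in rows}
    if len(seen) != len(rows):
        raise AssertionError("The combination of sample name and data file must be unique.")

rows = [
    {"sample": "s1", "datafile": "a.bam"},
    {"sample": "s1", "datafile": "b.bam"},  # same sample, different file: allowed
]
validate_unique_samples(rows)  # no exception raised

rows.append({"sample": "s1", "datafile": "a.bam"})  # exact duplicate
try:
    validate_unique_samples(rows)
except AssertionError as error:
    print(error)
```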

def read_head(handle, num_lines=10):
@@ -162,7 +138,7 @@ def sniff_format(handle):
peek = read_head(handle)
handle.seek(0)
sniffer = csv.Sniffer()
-# same input file could retrun random true or false
+# same input file could return random true or false
# disable it now
# the following validation should be enough
# if not sniffer.has_header(peek):
@@ -188,16 +164,17 @@ def check_samplesheet(file_in, file_out):
This function checks that the samplesheet follows the following structure,
see also the `variantcalling samplesheet`_::

-sample,datatype,datafile,indexfile
-sample1,pacbio,/path/to/data/file/file1.bam,/path/to/index/file/file1.bam.bai
-sample2,pacbio,/path/to/data/file/file2.cram,/path/to/index/file/file2.cram.crai
-sample3,pacbio,/path/to/data/file/file3.bam,/path/to/index/file/file3.bam.csi
+sample,datatype,datafile
+sample1,pacbio,/path/to/data/file/file1.bam
+sample2,pacbio,/path/to/data/file/file2.cram
+sample3,pacbio,/path/to/data/file/file3-1.bam
+sample3,pacbio,/path/to/data/file/file3-2.cram

.. _variantcalling samplesheet:
https://raw.githubusercontent.com/sanger-tol/variantcalling/main/assets/samplesheet.csv

"""
required_columns = {"sample", "datatype", "datafile", "indexfile"}
required_columns = {"sample", "datatype", "datafile"}
# See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`.
with file_in.open(newline="") as in_handle:
reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle))
@@ -210,7 +187,7 @@ def check_samplesheet(file_in, file_out):
checker = RowChecker()
for i, row in enumerate(reader):
try:
-checker.validate_and_transform(row)
+checker.validate(row)
except AssertionError as error:
logger.critical(f"{str(error)} On line {i + 2}.")
sys.exit(1)
@@ -220,7 +197,7 @@ def check_samplesheet(file_in, file_out):
with file_out.open(mode="w", newline="") as out_handle:
writer = csv.DictWriter(out_handle, header, delimiter=",")
writer.writeheader()
-for row in checker.modified:
+for row in checker.validated:
writer.writerow(row)


5 changes: 5 additions & 0 deletions conf/modules.config
@@ -22,6 +22,11 @@ process {

withName: '.*:INPUT_FILTER_SPLIT:SAMTOOLS_VIEW' {
ext.args = '--output-fmt cram --write-index -F 0x900'
+ext.prefix = { "${meta.id}_filtered" }
}

+withName: '.*:INPUT_MERGE:SAMTOOLS_MERGE' {
+ext.args = '--write-index'
+}

withName: '.*:DEEPVARIANT_CALLER:DEEPVARIANT' {
6 changes: 3 additions & 3 deletions conf/test.config
@@ -25,9 +25,9 @@ params {
// Fasta references
fasta = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz'

-// Reference index file
-fai = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.fai'
-gzi = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.gzi'
+// Reference index file (optional)
+// fai = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.fai'
+// fai = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.gzi'

// Interval bed file
interval = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bed'
3 changes: 1 addition & 2 deletions conf/test_full.config
@@ -23,6 +23,5 @@ params {
fasta = '/lustre/scratch124/tol/projects/darwin/data/insects/Cantharis_rufa/assembly/release/icCanRufa1.1/insdc/GCA_947369205.1.fasta.gz'

// Reference index file
-fai = '/lustre/scratch124/tol/projects/darwin/data/insects/Cantharis_rufa/assembly/release/icCanRufa1.1/insdc/GCA_947369205.1.fasta.gz.fai'
-gzi = '/lustre/scratch124/tol/projects/darwin/data/insects/Cantharis_rufa/assembly/release/icCanRufa1.1/insdc/GCA_947369205.1.fasta.gz.gzi'
+fai = '/lustre/scratch124/tol/projects/darwin/data/insects/Cantharis_rufa/assembly/release/icCanRufa1.1/insdc/GCA_947369205.1.fasta.gz.gzi'
}
33 changes: 16 additions & 17 deletions docs/usage.md
@@ -2,11 +2,11 @@

## Introduction

-The pipleline takes aligned sample reads (CRAM/BAM files and their index files) from a CSV file and a reference file in FASTA format, and then use DeepVariant to call variants.
+The pipeline takes aligned sample reads (CRAM/BAM files) from a CSV file and a reference file in FASTA format, and then use DeepVariant to call variants.

## Samplesheet input

-You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use the `input` parameter to specify the samplesheet location. It has to be a comma-separated file with at least 4 columns, and a header row as shown in the examples below.
+You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use the `input` parameter to specify the samplesheet location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below.

```bash
--input '[path to samplesheet file]'
@@ -17,29 +17,28 @@ You will need to create a samplesheet with information about the samples you wou
The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. Below is an example for the same sample sequenced across 3 lanes:

```console
-sample,datatype,datafile,indexfile
-sample1,pacbio,sample1_1.cram,sample1_1.cram.crai
-sample1,pacbio,sample1_2.cram,sample1_3.cram.crai
-sample1,pacbio,sample1_3.cram,sample1_3.cram.crai
+sample,datatype,datafile
+sample1,pacbio,sample1_1.cram
+sample1,pacbio,sample1_2.cram
+sample1,pacbio,sample1_3.cram
```

### Full samplesheet

A final samplesheet file consisting of both BAM or CRAM will look like this. Currently this pipeline only supports Pacbio aligned data.

```console
-sample,datatype,datafile,indexfile
-sample1,pacbio,/path/to/data/file/file1.bam,/path/to/index/file/file1.bam.bai
-sample2,pacbio,/path/to/data/file/file2.cram,/path/to/index/file/file2.cram.crai
-sample3,pacbio,/path/to/data/file/file3.bam,/path/to/index/file/file3.bam.csi
+sample,datatype,datafile
+sample1,pacbio,/path/to/data/file/file1.bam
+sample2,pacbio,/path/to/data/file/file2.cram
+sample3,pacbio,/path/to/data/file/file3.bam
```

-| Column | Description |
-| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
-| `datatype` | Sequencing data type. Must be `pacbio`. |
-| `datafile` | The location for either BAM or CRAM file. |
-| `indexfile` | The location for BAM or CRAM index file – BAI, CSI or CRAI. |
+| Column | Description |
+| ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
+| `datatype` | Sequencing data type. Must be `pacbio`. |
+| `datafile` | The location for either BAM or CRAM file. |

An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.

Expand All @@ -62,7 +61,7 @@ work # Directory containing the nextflow working files
# Other nextflow hidden files, eg. history of pipeline runs and old logs.
```

-The pipeline will split the intput fasta file into smaller files to run DeepVariant parallel. You can set the minimum split fasta file size from the command line. For example to set the minimum size as 10K using `--split_fasta_cutoff 10000`.
+The pipeline will split the input fasta file into smaller files to run DeepVariant parallel. You can set the minimum split fasta file size from the command line. For example to set the minimum size as 10K using `--split_fasta_cutoff 10000`.
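The cutoff behaviour described here — one sequence per split file unless sequences are small — can be sketched as greedy binning over (name, length) pairs. This is a hypothetical illustration of the idea, not the pipeline's actual split module:

```python
def split_fasta(sequences, cutoff=10_000):
    """Bin sequences for parallel DeepVariant runs: a bin is closed as soon
    as it reaches `cutoff` bases, so large sequences go one per file and
    consecutive small ones are grouped together."""
    bins, current, size = [], [], 0
    for name, length in sequences:
        current.append(name)
        size += length
        if size >= cutoff:
            bins.append(current)
            current, size = [], 0
    if current:  # leftover small sequences form a final bin
        bins.append(current)
    return bins

seqs = [("chr1", 50_000), ("scaf1", 3_000), ("scaf2", 4_000), ("scaf3", 8_000)]
print(split_fasta(seqs, cutoff=10_000))  # [['chr1'], ['scaf1', 'scaf2', 'scaf3']]
```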

### Updating the pipeline

11 changes: 11 additions & 0 deletions modules.json
@@ -30,6 +30,11 @@
"git_sha": "fd742419940e01ba1c5ecb172c3e32ec840662fe",
"installed_by": ["modules"]
},
"samtools/merge": {
"branch": "master",
"git_sha": "e7ce60acc8a33fa17429e966364657a63016e870",
"installed_by": ["modules"],
"patch": "modules/nf-core/samtools/merge/samtools-merge.diff"
},
"samtools/sort": {
"branch": "master",
"git_sha": "a0f7be95788366c1923171e358da7d049eb440f9",
"installed_by": ["modules"]
},
"samtools/view": {
"branch": "master",
"git_sha": "3ffae3598260a99e8db3207dead9f73f87f90d1f",
6 changes: 6 additions & 0 deletions modules/nf-core/samtools/merge/environment.yml
