From 3f89203dcc7c1182b9f784c17e65cc1b6d9fcbfd Mon Sep 17 00:00:00 2001
From: Guoying Qi <gq2@sanger.ac.uk>
Date: Thu, 5 Oct 2023 14:15:19 +0100
Subject: [PATCH 01/27] Remove index file from the samplesheet and update
 checking script

---
 assets/samplesheet.csv           |  9 +++++----
 assets/samplesheet_test.csv      |  9 +++++----
 assets/samplesheet_test_full.csv |  4 ++--
 assets/schema_input.json         |  7 +------
 bin/check_samplesheet.py         | 29 +++++++----------------------
 5 files changed, 20 insertions(+), 38 deletions(-)

diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
index 2ea95db..9de2e5b 100644
--- a/assets/samplesheet.csv
+++ b/assets/samplesheet.csv
@@ -1,4 +1,5 @@
-sample,datatype,datafile,indexfile
-sample1,pacbio,/path/to/data/file/file1.bam,/path/to/index/file/file1.bam.bai
-sample2,pacbio,/path/to/data/file/file2.cram,/path/to/index/file/file2.cram.crai
-sample3,pacbio,/path/to/data/file/file3.bam,/path/to/index/file/file3.bam.csi
+sample,datatype,datafile
+sample1,pacbio,/path/to/data/file/file1.bam
+sample2,pacbio,/path/to/data/file/file2.cram
+sample3,pacbio,/path/to/data/file/file3-1.bam
+sample3,pacbio,/path/to/data/file/file3-2.cram
diff --git a/assets/samplesheet_test.csv b/assets/samplesheet_test.csv
index cf5546a..6eb03e5 100644
--- a/assets/samplesheet_test.csv
+++ b/assets/samplesheet_test.csv
@@ -1,4 +1,5 @@
-sample,datatype,datafile,indexfile
-icCanRufa1_crai,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram.crai
-icCanRufa1_bai,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam.bai
-icCanRufa1_csi,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam.csi
+sample,datatype,datafile
+icCanRufa1_cram,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram
+icCanRufa1_bam,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam
+icCanRufa1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram
+icCanRufa1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam
diff --git a/assets/samplesheet_test_full.csv b/assets/samplesheet_test_full.csv
index 4495dff..1e40e2b 100644
--- a/assets/samplesheet_test_full.csv
+++ b/assets/samplesheet_test_full.csv
@@ -1,2 +1,2 @@
-sample,datatype,datafile,indexfile
-icCanRufa1,pacbio,/lustre/scratch123/tol/resources/nextflow/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1.cram,/lustre/scratch123/tol/resources/nextflow/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1.cram.crai
+sample,datatype,datafile
+icCanRufa1,pacbio,/lustre/scratch123/tol/resources/nextflow/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1.cram
diff --git a/assets/schema_input.json b/assets/schema_input.json
index 43497e9..f264cf6 100644
--- a/assets/schema_input.json
+++ b/assets/schema_input.json
@@ -21,13 +21,8 @@
                 "type": "string",
                 "pattern": "^\\S+\\.(bam|cram)$",
                 "errorMessage": "Data file for reads cannot contain spaces and must have extension 'cram' or 'bam'"
-            },
-            "indexfile": {
-                "type": "string",
-                "pattern": "^\\S+\\.(bai|csi|crai)$",
-                "errorMessage": "Data index file for reads cannot contain spaces and must have extension 'bai', 'csi' or 'crai'"
             }
         },
-        "required": ["sample", "datatype", "datafile", "indexfile"]
+        "required": ["sample", "datatype", "datafile"]
     }
 }
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index 6bbd806..52af146 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -35,7 +35,6 @@ def __init__(
         sample_col="sample",
         type_col="datatype",
         file_col="datafile",
-        index_col="indexfile",
         **kwargs,
     ):
         """
@@ -48,8 +47,6 @@ def __init__(
                 the read data (default "datatype").
             file_col (str): The name of the column that contains the file path for
                 the read data (default "datafile").
-            index_col (str): The name of the column that contains the index file
-                for the data (default "indexfile").
 
         """
         super().__init__(**kwargs)
@@ -57,7 +54,6 @@ def __init__(
         self._sample_col = sample_col
         self._type_col = type_col
         self._file_col = file_col
-        self._index_col = index_col
         self._seen = set()
         self.modified = []
 
@@ -73,7 +69,6 @@ def validate_and_transform(self, row):
         self._validate_sample(row)
         self._validate_type(row)
         self._validate_data_file(row)
-        self._validate_index_file(row)
         self._seen.add((row[self._sample_col], row[self._file_col]))
         self.modified.append(row)
 
@@ -98,17 +93,6 @@ def _validate_data_file(self, row):
             raise AssertionError("Data file is required.")
         self._validate_data_format(row[self._file_col])
 
-    def _validate_index_file(self, row):
-        """Assert that the indexfile is non-empty and has the right format."""
-        if len(row[self._index_col]) <= 0:
-            raise AssertionError("Data index file is required.")
-        if row[self._file_col].endswith("bam") and not (
-            row[self._index_col].endswith("bai") or row[self._index_col].endswith("csi")
-        ):
-            raise AssertionError("bai or csi index file should be given for bam file.")
-        if row[self._file_col].endswith("cram") and not row[self._index_col].endswith("crai"):
-            raise AssertionError("crai index file shuld be given for cram file.")
-
     def _validate_data_format(self, filename):
         """Assert that a given filename has one of the expected read data file extensions."""
         if not any(filename.endswith(extension) for extension in self.DATA_VALID_FORMATS):
@@ -162,7 +146,7 @@ def sniff_format(handle):
     peek = read_head(handle)
     handle.seek(0)
     sniffer = csv.Sniffer()
-    # same input file could retrun random true or false
+    # same input file could return random true or false
     # disable it now
     # the following validation should be enough
     # if not sniffer.has_header(peek):
@@ -188,16 +172,17 @@ def check_samplesheet(file_in, file_out):
         This function checks that the samplesheet follows the following structure,
         see also the `variantcalling samplesheet`_::
 
-            sample,datatype,datafile,indexfile
-            sample1,pacbio,/path/to/data/file/file1.bam,/path/to/index/file/file1.bam.bai
-            sample2,pacbio,/path/to/data/file/file2.cram,/path/to/index/file/file2.cram.crai
-            sample3,pacbio,/path/to/data/file/file3.bam,/path/to/index/file/file3.bam.csi
+            sample,datatype,datafile
+            sample1,pacbio,/path/to/data/file/file1.bam
+            sample2,pacbio,/path/to/data/file/file2.cram
+            sample3,pacbio,/path/to/data/file/file3-1.bam
+            sample3,pacbio,/path/to/data/file/file3-2.cram
 
     .. _variantcalling samplesheet:
         https://raw.githubusercontent.com/sanger-tol/variantcalling/main/assets/samplesheet.csv
 
     """
-    required_columns = {"sample", "datatype", "datafile", "indexfile"}
+    required_columns = {"sample", "datatype", "datafile"}
     # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`.
     with file_in.open(newline="") as in_handle:
         reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle))

From cef527b538f5c6e72d014cd7f9eae9c0eec584dc Mon Sep 17 00:00:00 2001
From: Guoying Qi <gq2@sanger.ac.uk>
Date: Thu, 5 Oct 2023 14:50:54 +0100
Subject: [PATCH 02/27] Remove indexfile from Input_check workflow

---
 subworkflows/local/input_check.nf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index d2f72e9..30a07d5 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -12,10 +12,10 @@ workflow INPUT_CHECK {
     SAMPLESHEET_CHECK ( samplesheet )
         .csv
         .splitCsv ( header:true, sep:',' )
-        .map { [[id: it.sample, type: it.datatype], file(it.datafile), file(it.indexfile)] }
+        .map { [[id: it.sample, type: it.datatype], file(it.datafile)] }
         .set { reads }
 
     emit:
-    reads                                     // channel: [ val(meta), data, index ]
+    reads                                     // channel: [ val(meta), data ]
     versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ]
 }

From 67826e3eeaaa6a7007b562103b72551f83332f9e Mon Sep 17 00:00:00 2001
From: Guoying Qi <gq2@sanger.ac.uk>
Date: Thu, 5 Oct 2023 17:42:56 +0100
Subject: [PATCH 03/27] add sample name to the meta data

---
 subworkflows/local/input_check.nf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index 30a07d5..aa0bbc9 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -12,9 +12,9 @@ workflow INPUT_CHECK {
     SAMPLESHEET_CHECK ( samplesheet )
         .csv
         .splitCsv ( header:true, sep:',' )
-        .map { [[id: it.sample, type: it.datatype], file(it.datafile)] }
+        .map { [[id: it.sample, sample: it.sample.replaceAll(/_T\d+$/, ''), type: it.datatype], file(it.datafile)] }
         .set { reads }
-
+        
     emit:
     reads                                     // channel: [ val(meta), data ]
     versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ]

From 2d30d826a2a48bece2af926d00d93a672aa1e47d Mon Sep 17 00:00:00 2001
From: Guoying Qi <gq2@sanger.ac.uk>
Date: Thu, 5 Oct 2023 17:55:56 +0100
Subject: [PATCH 04/27] nf-core modules install samtools/merge

---
 modules.json                            |  5 ++
 modules/nf-core/samtools/merge/main.nf  | 56 +++++++++++++++++++
 modules/nf-core/samtools/merge/meta.yml | 73 +++++++++++++++++++++++++
 3 files changed, 134 insertions(+)
 create mode 100644 modules/nf-core/samtools/merge/main.nf
 create mode 100644 modules/nf-core/samtools/merge/meta.yml

diff --git a/modules.json b/modules.json
index bc363d0..6f79373 100644
--- a/modules.json
+++ b/modules.json
@@ -30,6 +30,11 @@
                         "git_sha": "fd742419940e01ba1c5ecb172c3e32ec840662fe",
                         "installed_by": ["modules"]
                     },
+                    "samtools/merge": {
+                        "branch": "master",
+                        "git_sha": "0460d316170f75f323111b4a2c0a2989f0c32013",
+                        "installed_by": ["modules"]
+                    },
                     "samtools/view": {
                         "branch": "master",
                         "git_sha": "3ffae3598260a99e8db3207dead9f73f87f90d1f",
diff --git a/modules/nf-core/samtools/merge/main.nf b/modules/nf-core/samtools/merge/main.nf
new file mode 100644
index 0000000..b73b7cb
--- /dev/null
+++ b/modules/nf-core/samtools/merge/main.nf
@@ -0,0 +1,56 @@
+process SAMTOOLS_MERGE {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "bioconda::samtools=1.17"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' :
+        'biocontainers/samtools:1.17--h00cdaf9_0' }"
+
+    input:
+    tuple val(meta), path(input_files, stageAs: "?/*")
+    tuple val(meta2), path(fasta)
+    tuple val(meta3), path(fai)
+
+    output:
+    tuple val(meta), path("${prefix}.bam") , optional:true, emit: bam
+    tuple val(meta), path("${prefix}.cram"), optional:true, emit: cram
+    tuple val(meta), path("*.csi")         , optional:true, emit: csi
+    path  "versions.yml"                                  , emit: versions
+
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args   ?: ''
+    prefix   = task.ext.prefix ?: "${meta.id}"
+    def file_type = input_files instanceof List ? input_files[0].getExtension() : input_files.getExtension()
+    def reference = fasta ? "--reference ${fasta}" : ""
+    """
+    samtools \\
+        merge \\
+        --threads ${task.cpus-1} \\
+        $args \\
+        ${reference} \\
+        ${prefix}.${file_type} \\
+        $input_files
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    prefix = task.ext.suffix ? "${meta.id}${task.ext.suffix}" : "${meta.id}"
+    def file_type = input_files instanceof List ? input_files[0].getExtension() : input_files.getExtension()
+    """
+    touch ${prefix}.${file_type}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/samtools/merge/meta.yml b/modules/nf-core/samtools/merge/meta.yml
new file mode 100644
index 0000000..3a815f7
--- /dev/null
+++ b/modules/nf-core/samtools/merge/meta.yml
@@ -0,0 +1,73 @@
+name: samtools_merge
+description: Merge BAM or CRAM file
+keywords:
+  - merge
+  - bam
+  - sam
+  - cram
+tools:
+  - samtools:
+      description: |
+        SAMtools is a set of utilities for interacting with and post-processing
+        short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li.
+        These files are generated as output by short read aligners like BWA.
+      homepage: http://www.htslib.org/
+      documentation: http://www.htslib.org/doc/samtools.html
+      doi: 10.1093/bioinformatics/btp352
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - input_files:
+      type: file
+      description: BAM/CRAM file
+      pattern: "*.{bam,cram,sam}"
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing reference information
+        e.g. [ id:'genome' ]
+  - fasta:
+      type: file
+      description: Reference file the CRAM was created with (optional)
+      pattern: "*.{fasta,fa}"
+  - meta3:
+      type: map
+      description: |
+        Groovy Map containing reference information
+        e.g. [ id:'genome' ]
+  - fai:
+      type: file
+      description: Index of the reference file the CRAM was created with (optional)
+      pattern: "*.fai"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: BAM file
+      pattern: "*.{bam}"
+  - cram:
+      type: file
+      description: CRAM file
+      pattern: "*.{cram}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - csi:
+      type: file
+      description: BAM index file (optional)
+      pattern: "*.csi"
+authors:
+  - "@drpatelh"
+  - "@yuukiiwa "
+  - "@maxulysse"
+  - "@FriederikeHanssen"
+  - "@ramprasadn"

From 953658f86209de21f23627f376f9d688ecc73acb Mon Sep 17 00:00:00 2001
From: Guoying Qi <gq2@sanger.ac.uk>
Date: Thu, 5 Oct 2023 21:12:54 +0100
Subject: [PATCH 05/27] Add input merge sub workflow

---
 subworkflows/local/input_merge.nf | 44 +++++++++++++++++++++++++++++++
 workflows/variantcalling.nf       | 10 +++++++
 2 files changed, 54 insertions(+)
 create mode 100644 subworkflows/local/input_merge.nf

diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf
new file mode 100644
index 0000000..121c246
--- /dev/null
+++ b/subworkflows/local/input_merge.nf
@@ -0,0 +1,44 @@
+//
+// Merge READS(bam or cram files) together by sample name
+//
+
+include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge'
+
+workflow INPUT_MERGE {
+    take:
+    fasta              // file: /path/to/genome.fasta or /path/to/genome.fasta.gz
+    fai                // file: /path/to/genome.*.fai
+    gzi                // file: /path/to/genome.fasta.gz.gzi or null
+    reads              // channel: [ val(meta), data ]
+
+    main:
+    // group input reads file by sample name
+    reads
+     .map{ it -> [ it[0].sample, it[1] ] }
+     .groupTuple()
+     .set{ merged_reads } 
+    
+    // group input meta data together by sample name as well
+    // use the first meta data for the combined reads
+    reads
+     .map{ it -> [ it[0].sample, it[0] ] }
+     .groupTuple()
+     .map { it -> [it[0], it[1][0]] }
+     .join( merged_reads )
+     .map { it -> [ it[1] , it [2] ]}
+     .set { merged_reads_with_meta }
+
+    // call samtool merge
+    SAMTOOLS_MERGE( merged_reads_with_meta, 
+                    [ [], fasta ],
+                    [ [], fai ],
+                    [ [], gzi ]
+    )
+
+    emit:
+    bam      = SAMTOOLS_MERGE.out.bam
+    cram     = SAMTOOLS_MERGE.out.cram 
+    csi      = SAMTOOLS_MERGE.out.csi 
+    versions = SAMTOOLS_MERGE.out.versions // channel: [ versions.yml ]
+
+}
diff --git a/workflows/variantcalling.nf b/workflows/variantcalling.nf
index 6c6ce09..de68f0b 100644
--- a/workflows/variantcalling.nf
+++ b/workflows/variantcalling.nf
@@ -47,6 +47,7 @@ if (params.split_fasta_cutoff ) { split_fasta_cutoff = params.split_fasta_cutoff
 // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
 //
 include { INPUT_CHECK        } from '../subworkflows/local/input_check'
+include { INPUT_MERGE        } from '../subworkflows/local/input_merge'
 include { INPUT_FILTER_SPLIT } from '../subworkflows/local/input_filter_split'
 include { DEEPVARIANT_CALLER } from '../subworkflows/local/deepvariant_caller'
 
@@ -81,6 +82,15 @@ workflow VARIANTCALLING {
     )
     ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)
 
+    INPUT_MERGE (
+        fasta_file,
+        fai_file,
+        gzi_file,
+        INPUT_CHECK.out.reads
+    )
+    ch_versions = ch_versions.mix(INPUT_MERGE.out.versions)
+
+
     //
     // SUBWORKFLOW: split the input fasta file and filter input reads
     //

From 0d0d06e80093fb9a81cab0ea60032c9d1434bb70 Mon Sep 17 00:00:00 2001
From: Guoying Qi <gq2@sanger.ac.uk>
Date: Thu, 5 Oct 2023 21:41:47 +0100
Subject: [PATCH 06/27] comments

---
 workflows/variantcalling.nf | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/workflows/variantcalling.nf b/workflows/variantcalling.nf
index de68f0b..5b2c581 100644
--- a/workflows/variantcalling.nf
+++ b/workflows/variantcalling.nf
@@ -82,6 +82,9 @@ workflow VARIANTCALLING {
     )
     ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)
 
+    //
+    // SUBWORKFLOW: merge the input reads by sample name
+    //
     INPUT_MERGE (
         fasta_file,
         fai_file,

From 224c7f23a0954875fdd1e10179e4d2c33dfa5b4f Mon Sep 17 00:00:00 2001
From: Guoying Qi <gq2@sanger.ac.uk>
Date: Thu, 5 Oct 2023 21:45:11 +0100
Subject: [PATCH 07/27] patch samtools_merge module to allow using fasta.gz
 file with gzi index file.

---
 modules.json                                  |  3 ++-
 modules/nf-core/samtools/merge/main.nf        |  1 +
 modules/nf-core/samtools/merge/meta.yml       |  4 +++
 .../samtools/merge/samtools-merge.diff        | 27 +++++++++++++++++++
 4 files changed, 34 insertions(+), 1 deletion(-)
 create mode 100644 modules/nf-core/samtools/merge/samtools-merge.diff

diff --git a/modules.json b/modules.json
index 6f79373..133358e 100644
--- a/modules.json
+++ b/modules.json
@@ -33,7 +33,8 @@
                     "samtools/merge": {
                         "branch": "master",
                         "git_sha": "0460d316170f75f323111b4a2c0a2989f0c32013",
-                        "installed_by": ["modules"]
+                        "installed_by": ["modules"],
+                        "patch": "modules/nf-core/samtools/merge/samtools-merge.diff"
                     },
                     "samtools/view": {
                         "branch": "master",
diff --git a/modules/nf-core/samtools/merge/main.nf b/modules/nf-core/samtools/merge/main.nf
index b73b7cb..3c7faf4 100644
--- a/modules/nf-core/samtools/merge/main.nf
+++ b/modules/nf-core/samtools/merge/main.nf
@@ -11,6 +11,7 @@ process SAMTOOLS_MERGE {
     tuple val(meta), path(input_files, stageAs: "?/*")
     tuple val(meta2), path(fasta)
     tuple val(meta3), path(fai)
+    tuple val(meta4), path(gzi)
 
     output:
     tuple val(meta), path("${prefix}.bam") , optional:true, emit: bam
diff --git a/modules/nf-core/samtools/merge/meta.yml b/modules/nf-core/samtools/merge/meta.yml
index 3a815f7..4a6bc23 100644
--- a/modules/nf-core/samtools/merge/meta.yml
+++ b/modules/nf-core/samtools/merge/meta.yml
@@ -43,6 +43,10 @@ input:
       type: file
       description: Index of the reference file the CRAM was created with (optional)
       pattern: "*.fai"
+  - gzi:
+      type: file
+      description: Index of the reference file the CRAM was created with (optional)
+      pattern: "*.gzi"
 output:
   - meta:
       type: map
diff --git a/modules/nf-core/samtools/merge/samtools-merge.diff b/modules/nf-core/samtools/merge/samtools-merge.diff
new file mode 100644
index 0000000..4c48cc0
--- /dev/null
+++ b/modules/nf-core/samtools/merge/samtools-merge.diff
@@ -0,0 +1,27 @@
+Changes in module 'nf-core/samtools/merge'
+--- modules/nf-core/samtools/merge/meta.yml
++++ modules/nf-core/samtools/merge/meta.yml
+@@ -43,6 +43,10 @@
+       type: file
+       description: Index of the reference file the CRAM was created with (optional)
+       pattern: "*.fai"
++  - gzi:
++      type: file
++      description: Index of the reference file the CRAM was created with (optional)
++      pattern: "*.gzi"
+ output:
+   - meta:
+       type: map
+
+--- modules/nf-core/samtools/merge/main.nf
++++ modules/nf-core/samtools/merge/main.nf
+@@ -11,6 +11,7 @@
+     tuple val(meta), path(input_files, stageAs: "?/*")
+     tuple val(meta2), path(fasta)
+     tuple val(meta3), path(fai)
++    tuple val(meta4), path(gzi)
+ 
+     output:
+     tuple val(meta), path("${prefix}.bam") , optional:true, emit: bam
+
+************************************************************

From 36f4c677e30bc4b630144d678bfc945fe3474f16 Mon Sep 17 00:00:00 2001
From: Guoying Qi <gq2@sanger.ac.uk>
Date: Fri, 6 Oct 2023 09:26:55 +0100
Subject: [PATCH 08/27] use original sample name for id if just 1, otherwise
 add _combined.

---
 subworkflows/local/input_merge.nf | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf
index 121c246..d2eafb5 100644
--- a/subworkflows/local/input_merge.nf
+++ b/subworkflows/local/input_merge.nf
@@ -25,7 +25,12 @@ workflow INPUT_MERGE {
      .groupTuple()
      .map { it -> [it[0], it[1][0]] }
      .join( merged_reads )
-     .map { it -> [ it[1] , it [2] ]}
+     .map { it -> [ 
+          [ id: ( it[2].size() == 1 ) ? it[1].sample : it[1].sample + '_combined',
+            type: it[1].type 
+          ], 
+            it[2] 
+          ]}
      .set { merged_reads_with_meta }
 
     // call samtool merge
@@ -38,7 +43,7 @@ workflow INPUT_MERGE {
     emit:
     bam      = SAMTOOLS_MERGE.out.bam
     cram     = SAMTOOLS_MERGE.out.cram 
-    csi      = SAMTOOLS_MERGE.out.csi 
+    csi      = SAMTOOLS_MERGE.out.csi
     versions = SAMTOOLS_MERGE.out.versions // channel: [ versions.yml ]
 
 }

From a808cf24897fc3e8c3a87b35a55dcd99cd07faa4 Mon Sep 17 00:00:00 2001
From: Guoying Qi <gq2@sanger.ac.uk>
Date: Fri, 6 Oct 2023 10:16:31 +0100
Subject: [PATCH 09/27] Patch samtools_merge module again to add indexing,
 emit crai index file as well

---
 modules/nf-core/samtools/merge/main.nf        |  2 ++
 modules/nf-core/samtools/merge/meta.yml       |  4 +++
 .../samtools/merge/samtools-merge.diff        | 27 ++++++++++++++++++-
 3 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/modules/nf-core/samtools/merge/main.nf b/modules/nf-core/samtools/merge/main.nf
index 3c7faf4..90ddfbe 100644
--- a/modules/nf-core/samtools/merge/main.nf
+++ b/modules/nf-core/samtools/merge/main.nf
@@ -17,6 +17,7 @@ process SAMTOOLS_MERGE {
     tuple val(meta), path("${prefix}.bam") , optional:true, emit: bam
     tuple val(meta), path("${prefix}.cram"), optional:true, emit: cram
     tuple val(meta), path("*.csi")         , optional:true, emit: csi
+    tuple val(meta), path("*.crai")        , optional:true, emit: crai
     path  "versions.yml"                                  , emit: versions
 
 
@@ -32,6 +33,7 @@ process SAMTOOLS_MERGE {
     samtools \\
         merge \\
         --threads ${task.cpus-1} \\
+        --write-index \\
         $args \\
         ${reference} \\
         ${prefix}.${file_type} \\
diff --git a/modules/nf-core/samtools/merge/meta.yml b/modules/nf-core/samtools/merge/meta.yml
index 4a6bc23..056a95f 100644
--- a/modules/nf-core/samtools/merge/meta.yml
+++ b/modules/nf-core/samtools/merge/meta.yml
@@ -69,6 +69,10 @@ output:
       type: file
       description: BAM index file (optional)
       pattern: "*.csi"
+  - crai:
+      type: file
+      description: CRAM index file (optional)
+      pattern: "*.crai"
 authors:
   - "@drpatelh"
   - "@yuukiiwa "
diff --git a/modules/nf-core/samtools/merge/samtools-merge.diff b/modules/nf-core/samtools/merge/samtools-merge.diff
index 4c48cc0..a740ce0 100644
--- a/modules/nf-core/samtools/merge/samtools-merge.diff
+++ b/modules/nf-core/samtools/merge/samtools-merge.diff
@@ -12,10 +12,21 @@ Changes in module 'nf-core/samtools/merge'
  output:
    - meta:
        type: map
+@@ -65,6 +69,10 @@
+       type: file
+       description: BAM index file (optional)
+       pattern: "*.csi"
++  - crai:
++      type: file
++      description: CRAM index file (optional)
++      pattern: "*.crai"
+ authors:
+   - "@drpatelh"
+   - "@yuukiiwa "
 
 --- modules/nf-core/samtools/merge/main.nf
 +++ modules/nf-core/samtools/merge/main.nf
-@@ -11,6 +11,7 @@
+@@ -11,11 +11,13 @@
      tuple val(meta), path(input_files, stageAs: "?/*")
      tuple val(meta2), path(fasta)
      tuple val(meta3), path(fai)
@@ -23,5 +34,19 @@ Changes in module 'nf-core/samtools/merge'
  
      output:
      tuple val(meta), path("${prefix}.bam") , optional:true, emit: bam
+     tuple val(meta), path("${prefix}.cram"), optional:true, emit: cram
+     tuple val(meta), path("*.csi")         , optional:true, emit: csi
++    tuple val(meta), path("*.crai")        , optional:true, emit: crai
+     path  "versions.yml"                                  , emit: versions
+ 
+ 
+@@ -31,6 +33,7 @@
+     samtools \\
+         merge \\
+         --threads ${task.cpus-1} \\
++        --write-index \\
+         $args \\
+         ${reference} \\
+         ${prefix}.${file_type} \\
 
 ************************************************************

From c646b8b1b4b4e5b979347f9f625f89c7e6ca6e01 Mon Sep 17 00:00:00 2001
From: Guoying Qi <gq2@sanger.ac.uk>
Date: Fri, 6 Oct 2023 10:27:29 +0100
Subject: [PATCH 10/27] emit crai files as well

---
 subworkflows/local/input_merge.nf | 1 +
 1 file changed, 1 insertion(+)

diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf
index d2eafb5..a0e7220 100644
--- a/subworkflows/local/input_merge.nf
+++ b/subworkflows/local/input_merge.nf
@@ -44,6 +44,7 @@ workflow INPUT_MERGE {
     bam      = SAMTOOLS_MERGE.out.bam
     cram     = SAMTOOLS_MERGE.out.cram 
     csi      = SAMTOOLS_MERGE.out.csi
+    crai     = SAMTOOLS_MERGE.out.crai
     versions = SAMTOOLS_MERGE.out.versions // channel: [ versions.yml ]
 
 }

From c4a762fed89d13c9e39ee99c017d30e4a1239ce0 Mon Sep 17 00:00:00 2001
From: Guoying Qi <gq2@sanger.ac.uk>
Date: Fri, 6 Oct 2023 10:43:26 +0100
Subject: [PATCH 11/27] nf-core modules install samtools/sort

---
 modules.json                           |  5 +++
 modules/nf-core/samtools/sort/main.nf  | 49 ++++++++++++++++++++++++++
 modules/nf-core/samtools/sort/meta.yml | 48 +++++++++++++++++++++++++
 3 files changed, 102 insertions(+)
 create mode 100644 modules/nf-core/samtools/sort/main.nf
 create mode 100644 modules/nf-core/samtools/sort/meta.yml

diff --git a/modules.json b/modules.json
index 133358e..183f5b3 100644
--- a/modules.json
+++ b/modules.json
@@ -36,6 +36,11 @@
                         "installed_by": ["modules"],
                         "patch": "modules/nf-core/samtools/merge/samtools-merge.diff"
                     },
+                    "samtools/sort": {
+                        "branch": "master",
+                        "git_sha": "a0f7be95788366c1923171e358da7d049eb440f9",
+                        "installed_by": ["modules"]
+                    },
                     "samtools/view": {
                         "branch": "master",
                         "git_sha": "3ffae3598260a99e8db3207dead9f73f87f90d1f",
diff --git a/modules/nf-core/samtools/sort/main.nf b/modules/nf-core/samtools/sort/main.nf
new file mode 100644
index 0000000..2b7753f
--- /dev/null
+++ b/modules/nf-core/samtools/sort/main.nf
@@ -0,0 +1,49 @@
+process SAMTOOLS_SORT {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "bioconda::samtools=1.17"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' :
+        'biocontainers/samtools:1.17--h00cdaf9_0' }"
+
+    input:
+    tuple val(meta), path(bam)
+
+    output:
+    tuple val(meta), path("*.bam"), emit: bam
+    tuple val(meta), path("*.csi"), emit: csi, optional: true
+    path  "versions.yml"          , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
+    """
+    samtools sort \\
+        $args \\
+        -@ $task.cpus \\
+        -o ${prefix}.bam \\
+        -T $prefix \\
+        $bam
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}.bam
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/samtools/sort/meta.yml b/modules/nf-core/samtools/sort/meta.yml
new file mode 100644
index 0000000..0732843
--- /dev/null
+++ b/modules/nf-core/samtools/sort/meta.yml
@@ -0,0 +1,48 @@
+name: samtools_sort
+description: Sort SAM/BAM/CRAM file
+keywords:
+  - sort
+  - bam
+  - sam
+  - cram
+tools:
+  - samtools:
+      description: |
+        SAMtools is a set of utilities for interacting with and post-processing
+        short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li.
+        These files are generated as output by short read aligners like BWA.
+      homepage: http://www.htslib.org/
+      documentation: http://www.htslib.org/doc/samtools.html
+      doi: 10.1093/bioinformatics/btp352
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: BAM/CRAM/SAM file
+      pattern: "*.{bam,cram,sam}"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: Sorted BAM/CRAM/SAM file
+      pattern: "*.{bam,cram,sam}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - csi:
+      type: file
+      description: BAM index file (optional)
+      pattern: "*.csi"
+authors:
+  - "@drpatelh"
+  - "@ewels"

From 9fa3696989b7d4398654c507e0018d983fb37067 Mon Sep 17 00:00:00 2001
From: Guoying Qi <gq2@sanger.ac.uk>
Date: Fri, 6 Oct 2023 11:37:33 +0100
Subject: [PATCH 12/27] Add an option to sort input if not sorted.

---
 nextflow.config                   |  1 +
 nextflow_schema.json              |  4 ++++
 subworkflows/local/input_merge.nf | 18 ++++++++++++++++--
 workflows/variantcalling.nf       |  6 ++++--
 4 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index 83e274f..4659669 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -11,6 +11,7 @@ params {
 
     // Input options
     input                      = null
+    sort_input                 = false
     fasta                      = null
     fai                        = null
     gzi                        = null
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 3a5272c..5e04d99 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -28,6 +28,10 @@
                     "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.",
                     "fa_icon": "fas fa-folder-open"
                 },
+                "sort_input": {
+                    "type": "boolean",
+                    "description": "Whether to sort the input read files before merging them by sample."
+                },
                 "email": {
                     "type": "string",
                     "description": "Email address for completion summary.",
diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf
index a0e7220..53b5e0f 100644
--- a/subworkflows/local/input_merge.nf
+++ b/subworkflows/local/input_merge.nf
@@ -3,6 +3,7 @@
 //
 
 include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge'
+include { SAMTOOLS_SORT }  from '../../modules/nf-core/samtools/sort'
 
 workflow INPUT_MERGE {
     take:
@@ -10,10 +11,22 @@ workflow INPUT_MERGE {
     fai                // file: /path/to/genome.*.fai
     gzi                // file: /path/to/genome.fasta.gz.gzi or null
     reads              // channel: [ val(meta), data ]
+    sort_input         // boolean: true or false
 
     main:
+    ch_versions = Channel.empty()
+    
+    // sort input reads if asked
+    if ( sort_input ) {
+      SAMTOOLS_SORT( reads )
+      ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions )
+      sorted_reads = SAMTOOLS_SORT.out.bam
+    } else {
+      sorted_reads = reads
+    }
+    
     // group input reads file by sample name
-    reads
+    sorted_reads
      .map{ it -> [ it[0].sample, it[1] ] }
      .groupTuple()
      .set{ merged_reads } 
@@ -39,12 +52,13 @@ workflow INPUT_MERGE {
                     [ [], fai ],
                     [ [], gzi ]
     )
+    ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions )
 
     emit:
     bam      = SAMTOOLS_MERGE.out.bam
     cram     = SAMTOOLS_MERGE.out.cram 
     csi      = SAMTOOLS_MERGE.out.csi
     crai     = SAMTOOLS_MERGE.out.crai
-    versions = SAMTOOLS_MERGE.out.versions // channel: [ versions.yml ]
+    versions = ch_versions // channel: [ versions.yml ]
 
 }
diff --git a/workflows/variantcalling.nf b/workflows/variantcalling.nf
index 5b2c581..2c1a78f 100644
--- a/workflows/variantcalling.nf
+++ b/workflows/variantcalling.nf
@@ -28,7 +28,8 @@ if (params.gzi) {
 }
 
 // Check optional parameters
-if (params.interval)            { interval_file = file(params.interval)          } else { interval_file = null }
+if (params.sort_input)          { sort_input    = params.sort_input              } else { sort_input    = false       }
+if (params.interval)            { interval_file = file(params.interval)          } else { interval_file = null        }
 if (params.split_fasta_cutoff ) { split_fasta_cutoff = params.split_fasta_cutoff } else { split_fasta_cutoff = 100000 }
 
 /*
@@ -89,7 +90,8 @@ workflow VARIANTCALLING {
         fasta_file,
         fai_file,
         gzi_file,
-        INPUT_CHECK.out.reads
+        INPUT_CHECK.out.reads,
+        sort_input
     )
     ch_versions = ch_versions.mix(INPUT_MERGE.out.versions)
 

From 5cbea5bd50e0816ab7946c29d006d0975b6532c9 Mon Sep 17 00:00:00 2001
From: Guoying Qi <gq2@sanger.ac.uk>
Date: Fri, 6 Oct 2023 12:03:27 +0100
Subject: [PATCH 13/27] combine merged bam/cram together, along with their index
 files as well.

---
 subworkflows/local/input_merge.nf | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf
index 53b5e0f..da9047e 100644
--- a/subworkflows/local/input_merge.nf
+++ b/subworkflows/local/input_merge.nf
@@ -54,11 +54,16 @@ workflow INPUT_MERGE {
     )
     ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions )
 
+    SAMTOOLS_MERGE.out.bam
+      .join(SAMTOOLS_MERGE.out.csi)
+      .concat(
+        SAMTOOLS_MERGE.out.cram
+         .join(SAMTOOLS_MERGE.out.crai)
+      )
+    .set{ indexed_merged_reads };
+
     emit:
-    bam      = SAMTOOLS_MERGE.out.bam
-    cram     = SAMTOOLS_MERGE.out.cram 
-    csi      = SAMTOOLS_MERGE.out.csi
-    crai     = SAMTOOLS_MERGE.out.crai
+    indexed_merged_reads = indexed_merged_reads
     versions = ch_versions // channel: [ versions.yml ]
 
 }

From 3ac73e9482d849661920ccc216f0d2cb5ff3dcca Mon Sep 17 00:00:00 2001
From: Guoying Qi <gq2@sanger.ac.uk>
Date: Fri, 6 Oct 2023 12:20:16 +0100
Subject: [PATCH 14/27] add filtered to distinguish the samtools input and
 output name

---
 conf/modules.config | 1 +
 1 file changed, 1 insertion(+)

diff --git a/conf/modules.config b/conf/modules.config
index 7dc8677..6672bfc 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -22,6 +22,7 @@ process {
 
     withName: '.*:INPUT_FILTER_SPLIT:SAMTOOLS_VIEW' {
         ext.args   = '--output-fmt cram --write-index -F 0x900'
+        ext.prefix = { "${meta.id}_filtered" }
     }
 
     withName: '.*:DEEPVARIANT_CALLER:DEEPVARIANT' {

From bc6e5a0982503673e2898347b8d2253b21da6ea5 Mon Sep 17 00:00:00 2001
From: Guoying Qi <gq2@sanger.ac.uk>
Date: Fri, 6 Oct 2023 12:22:42 +0100
Subject: [PATCH 15/27] use the merged read for the rest of pipeline

---
 workflows/variantcalling.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflows/variantcalling.nf b/workflows/variantcalling.nf
index 2c1a78f..3b36a99 100644
--- a/workflows/variantcalling.nf
+++ b/workflows/variantcalling.nf
@@ -103,7 +103,7 @@ workflow VARIANTCALLING {
         fasta_file,
         fai_file,
         gzi_file,
-        INPUT_CHECK.out.reads,
+        INPUT_MERGE.out.indexed_merged_reads,
         interval_file,
         split_fasta_cutoff
     )

From a7131a8d087c2e69e46ef9c0164d70e19310e916 Mon Sep 17 00:00:00 2001
From: Guoying Qi <gq2@sanger.ac.uk>
Date: Fri, 6 Oct 2023 19:00:58 +0100
Subject: [PATCH 16/27] convert all input files into channels, and make
 reference fasta index file optional

---
 conf/test.config                         |  4 +-
 nextflow_schema.json                     |  2 +-
 subworkflows/local/input_filter_split.nf | 10 ++--
 subworkflows/local/input_merge.nf        | 17 +++++--
 workflows/variantcalling.nf              | 60 +++++++++++++++---------
 5 files changed, 61 insertions(+), 32 deletions(-)

diff --git a/conf/test.config b/conf/test.config
index be32ebe..f89e185 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -26,8 +26,8 @@ params {
     fasta = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz'
 
     // Reference index file
-    fai = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.fai'
-    gzi = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.gzi'
+    // fai = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.fai'
+    // gzi = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.gzi'
 
     // Interval bed file
     interval = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bed'
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 5e04d99..dc6428b 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -70,7 +70,7 @@
                     "description": "The minimum fasta file size when splitting the input fasta file by sequence."
                 }
             },
-            "required": ["fasta", "fai"]
+            "required": ["fasta"]
         },
         "institutional_config_options": {
             "title": "Institutional config options",
diff --git a/subworkflows/local/input_filter_split.nf b/subworkflows/local/input_filter_split.nf
index c820901..95555d7 100644
--- a/subworkflows/local/input_filter_split.nf
+++ b/subworkflows/local/input_filter_split.nf
@@ -19,8 +19,7 @@ workflow INPUT_FILTER_SPLIT {
     ch_versions = Channel.empty()
 
     // split the fasta file into files with one sequence each, group them by file size
-    Channel
-     .fromPath ( fasta )
+    fasta
      .splitFasta ( file:true )
      .branch {
         small: it.size() < split_fasta_cutoff
@@ -62,13 +61,16 @@ workflow INPUT_FILTER_SPLIT {
      .set { fasta_fai }
 
     // filter reads
-    SAMTOOLS_VIEW ( reads, [ [], fasta ], [] )
+    fasta
+      .map { fasta -> [ [ 'id': fasta.baseName ], fasta ] }
+      .set { ch_fasta }
+    SAMTOOLS_VIEW ( reads, ch_fasta, [] )
     ch_versions = ch_versions.mix ( SAMTOOLS_VIEW.out.versions.first() )
     
     // combine reads with splitted references
     SAMTOOLS_VIEW.out.cram
      .join ( SAMTOOLS_VIEW.out.crai )
-     .map { filtered_reads -> filtered_reads + [interval ?: []] }
+     .combine(interval)
      .combine ( fasta_fai )
      .set { cram_crai_fasta_fai }
 
diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf
index da9047e..05776ce 100644
--- a/subworkflows/local/input_merge.nf
+++ b/subworkflows/local/input_merge.nf
@@ -18,10 +18,12 @@ workflow INPUT_MERGE {
     
     // sort input reads if asked
     if ( sort_input ) {
+
       SAMTOOLS_SORT( reads )
       ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions )
       sorted_reads = SAMTOOLS_SORT.out.bam
     } else {
+      
       sorted_reads = reads
     }
     
@@ -47,10 +49,19 @@ workflow INPUT_MERGE {
      .set { merged_reads_with_meta }
 
     // call samtool merge
+    fasta
+      .map { fasta -> [ [ 'id': fasta.baseName ], fasta ] }
+      .set { ch_fasta }
+    fai
+      .map { fai -> [ [ 'id': fai.baseName ], fai ] }
+      .set { ch_fai }
+    gzi
+      .map { gzi -> [ [ 'id': gzi.baseName ], gzi ] }
+      .set { ch_gzi }
     SAMTOOLS_MERGE( merged_reads_with_meta, 
-                    [ [], fasta ],
-                    [ [], fai ],
-                    [ [], gzi ]
+                    ch_fasta,
+                    ch_fai,
+                    ch_gzi
     )
     ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions )
 
diff --git a/workflows/variantcalling.nf b/workflows/variantcalling.nf
index 3b36a99..8a933b3 100644
--- a/workflows/variantcalling.nf
+++ b/workflows/variantcalling.nf
@@ -14,22 +14,15 @@ def checkPathParamList = [ params.input, params.fasta, params.fai, params.gzi, p
 for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
 
 // Check mandatory parameters
-if (params.input) { input_file = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }
-if (params.fasta) { fasta_file = file(params.fasta) } else { exit 1, 'Reference fasta not specified!' }
-if (params.fai)   { fai_file   = file(params.fai)   } else { exit 1, 'Reference fasta index not specified!' }
-
-// Check gzi being given if compressed fasta is provided
-if (params.gzi) {
-    gzi_file = file(params.gzi)
-} else if ( params.fasta.endsWith('fasta.gz') ) { 
-    exit 1, 'Reference fasta index gzi file not specified for fasta.gz file!' 
-} else {
-    gzi_file = null
-}
+if (params.input) { ch_input = Channel.fromPath(params.input) } else { exit 1, 'Input samplesheet not specified!' }
+if (params.fasta) { ch_fasta = Channel.fromPath(params.fasta) } else { exit 1, 'Reference fasta not specified!'   }
 
 // Check optional parameters
+if (params.fai)     { ch_fai   = Channel.fromPath(params.fai)         } else { ch_fai      = Channel.empty() }
+if (params.gzi)     { ch_gzi   = Channel.fromPath(params.gzi)         } else { ch_gzi      = Channel.empty() }
+if (params.interval){ ch_interval = Channel.fromPath(params.interval) } else { ch_interval = Channel.empty() }
+
 if (params.sort_input)          { sort_input    = params.sort_input              } else { sort_input    = false       }
-if (params.interval)            { interval_file = file(params.interval)          } else { interval_file = null        }
 if (params.split_fasta_cutoff ) { split_fasta_cutoff = params.split_fasta_cutoff } else { split_fasta_cutoff = 100000 }
 
 /*
@@ -62,6 +55,7 @@ include { DEEPVARIANT_CALLER } from '../subworkflows/local/deepvariant_caller'
 // MODULE: Installed directly from nf-core/modules
 //
 include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'
+include { SAMTOOLS_FAIDX } from '../modules/nf-core/samtools/faidx/main'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -75,11 +69,33 @@ workflow VARIANTCALLING {
 
     ch_versions = Channel.empty()
 
+    //
+    // check reference fasta index given or not
+    //
+    if( params.fai == null || ( params.fasta.endsWith('fasta.gz') && params.gzi == null ) ){ 
+   
+       ch_fasta
+        .map { fasta -> [ [ 'id': fasta.baseName ], fasta ] }
+        .set { ch_genome }
+
+       SAMTOOLS_FAIDX ( ch_genome,  [[], []])
+       ch_versions = ch_versions.mix( SAMTOOLS_FAIDX.out.versions )
+       
+       SAMTOOLS_FAIDX.out.fai
+        .map{ mata, fai -> fai }
+        .set{ ch_fai }
+  
+       SAMTOOLS_FAIDX.out.gzi
+        .map{ meta, gzi -> gzi }
+        .set{ ch_gzi }
+
+    }
+
     //
     // SUBWORKFLOW: Read in samplesheet, validate and stage input files
     //
     INPUT_CHECK (
-        input_file
+        ch_input
     )
     ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)
 
@@ -87,9 +103,9 @@ workflow VARIANTCALLING {
     // SUBWORKFLOW: merge the input reads by sample name
     //
     INPUT_MERGE (
-        fasta_file,
-        fai_file,
-        gzi_file,
+        ch_fasta,
+        ch_fai,
+        ch_gzi,
         INPUT_CHECK.out.reads,
         sort_input
     )
@@ -100,15 +116,15 @@ workflow VARIANTCALLING {
     // SUBWORKFLOW: split the input fasta file and filter input reads
     //
     INPUT_FILTER_SPLIT (
-        fasta_file,
-        fai_file,
-        gzi_file,
+        ch_fasta,
+        ch_fai,
+        ch_gzi,
         INPUT_MERGE.out.indexed_merged_reads,
-        interval_file,
+        ch_interval,
         split_fasta_cutoff
     )
     ch_versions = ch_versions.mix(INPUT_FILTER_SPLIT.out.versions)
-    
+
     //
     // SUBWORKFLOW: call deepvariant
     //

From 0265cbf8053998245ab5ae14e3caeb50c8f05e63 Mon Sep 17 00:00:00 2001
From: Guoying Qi <gq2@sanger.ac.uk>
Date: Fri, 6 Oct 2023 21:52:31 +0100
Subject: [PATCH 17/27] use the first for the reference fasta channel

---
 subworkflows/local/input_filter_split.nf |  5 ++--
 subworkflows/local/input_merge.nf        | 35 ++++++++++--------------
 2 files changed, 16 insertions(+), 24 deletions(-)

diff --git a/subworkflows/local/input_filter_split.nf b/subworkflows/local/input_filter_split.nf
index 95555d7..0482cf2 100644
--- a/subworkflows/local/input_filter_split.nf
+++ b/subworkflows/local/input_filter_split.nf
@@ -61,9 +61,8 @@ workflow INPUT_FILTER_SPLIT {
      .set { fasta_fai }
 
     // filter reads
-    fasta
-      .map { fasta -> [ [ 'id': fasta.baseName ], fasta ] }
-      .set { ch_fasta }
+    ch_fasta = fasta.map { fasta -> [ [ 'id': fasta.baseName ], fasta ] }.first()
+
     SAMTOOLS_VIEW ( reads, ch_fasta, [] )
     ch_versions = ch_versions.mix ( SAMTOOLS_VIEW.out.versions.first() )
     
diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf
index 05776ce..4e7efa5 100644
--- a/subworkflows/local/input_merge.nf
+++ b/subworkflows/local/input_merge.nf
@@ -18,46 +18,39 @@ workflow INPUT_MERGE {
     
     // sort input reads if asked
     if ( sort_input ) {
-
       SAMTOOLS_SORT( reads )
       ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions )
       sorted_reads = SAMTOOLS_SORT.out.bam
-    } else {
-      
+    } else {     
       sorted_reads = reads
     }
-    
+
     // group input reads file by sample name
     sorted_reads
-     .map{ it -> [ it[0].sample, it[1] ] }
+     .map{ meta, bam_cram -> [ meta.sample, bam_cram ] }
      .groupTuple()
      .set{ merged_reads } 
-    
+
     // group input meta data together by sample name as well
     // use the first meta data for the combined reads
     reads
-     .map{ it -> [ it[0].sample, it[0] ] }
+     .map{ meta, bam_cram -> [ meta.sample, meta ] }
      .groupTuple()
-     .map { it -> [it[0], it[1][0]] }
+     .map { sample, meta_list -> [sample, meta_list[0]] }
      .join( merged_reads )
-     .map { it -> [ 
-          [ id: ( it[2].size() == 1 ) ? it[1].sample : it[1].sample + '_combined',
-            type: it[1].type 
+     .map { sample, meta, bam_cram_list -> [ 
+          [ id: ( bam_cram_list.size() == 1 ) ? sample : sample + '_combined',
+            type: meta.type 
           ], 
-            it[2] 
+            bam_cram_list 
           ]}
      .set { merged_reads_with_meta }
 
     // call samtool merge
-    fasta
-      .map { fasta -> [ [ 'id': fasta.baseName ], fasta ] }
-      .set { ch_fasta }
-    fai
-      .map { fai -> [ [ 'id': fai.baseName ], fai ] }
-      .set { ch_fai }
-    gzi
-      .map { gzi -> [ [ 'id': gzi.baseName ], gzi ] }
-      .set { ch_gzi }
+    ch_fasta = fasta.map { fasta -> [ [ 'id': fasta.baseName ], fasta ] }.first()
+    ch_fai = fai.map { fai -> [ [ 'id': fai.baseName ], fai ] }.first()
+    ch_gzi = gzi.map { gzi -> [ [ 'id': gzi.baseName ], gzi ] }.first()
+
     SAMTOOLS_MERGE( merged_reads_with_meta, 
                     ch_fasta,
                     ch_fai,

From d4d38cc08e2100a3a48c0beca5932e834b76017a Mon Sep 17 00:00:00 2001
From: Guoying Qi <gq2@sanger.ac.uk>
Date: Fri, 6 Oct 2023 22:17:10 +0100
Subject: [PATCH 18/27] move write-index flag to the config file

---
 conf/modules.config                                | 4 ++++
 modules/nf-core/samtools/merge/main.nf             | 1 -
 modules/nf-core/samtools/merge/samtools-merge.diff | 8 --------
 3 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 6672bfc..0deaa90 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -25,6 +25,10 @@ process {
         ext.prefix = { "${meta.id}_filtered" }
     }
 
+    withName: '.*:INPUT_MERGE:SAMTOOLS_MERGE' {
+        ext.args   = '--write-index'
+    }
+
     withName: '.*:DEEPVARIANT_CALLER:DEEPVARIANT' {
         ext.args = '--model_type=PACBIO'
     }
diff --git a/modules/nf-core/samtools/merge/main.nf b/modules/nf-core/samtools/merge/main.nf
index 90ddfbe..2dc4008 100644
--- a/modules/nf-core/samtools/merge/main.nf
+++ b/modules/nf-core/samtools/merge/main.nf
@@ -33,7 +33,6 @@ process SAMTOOLS_MERGE {
     samtools \\
         merge \\
         --threads ${task.cpus-1} \\
-        --write-index \\
         $args \\
         ${reference} \\
         ${prefix}.${file_type} \\
diff --git a/modules/nf-core/samtools/merge/samtools-merge.diff b/modules/nf-core/samtools/merge/samtools-merge.diff
index a740ce0..afe2536 100644
--- a/modules/nf-core/samtools/merge/samtools-merge.diff
+++ b/modules/nf-core/samtools/merge/samtools-merge.diff
@@ -40,13 +40,5 @@ Changes in module 'nf-core/samtools/merge'
      path  "versions.yml"                                  , emit: versions
  
  
-@@ -31,6 +33,7 @@
-     samtools \\
-         merge \\
-         --threads ${task.cpus-1} \\
-+        --write-index \\
-         $args \\
-         ${reference} \\
-         ${prefix}.${file_type} \\
 
 ************************************************************

From ea046b9b3a94661860382e185d53cf8fc79b31a9 Mon Sep 17 00:00:00 2001
From: Guoying Qi <gq2@sanger.ac.uk>
Date: Sat, 7 Oct 2023 11:12:47 +0100
Subject: [PATCH 19/27] make sure it works fine when no interval file is given

---
 subworkflows/local/input_filter_split.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/subworkflows/local/input_filter_split.nf b/subworkflows/local/input_filter_split.nf
index 0482cf2..2467b68 100644
--- a/subworkflows/local/input_filter_split.nf
+++ b/subworkflows/local/input_filter_split.nf
@@ -69,7 +69,7 @@ workflow INPUT_FILTER_SPLIT {
     // combine reads with splitted references
     SAMTOOLS_VIEW.out.cram
      .join ( SAMTOOLS_VIEW.out.crai )
-     .combine(interval)
+     .combine(interval.ifEmpty([[]]))
      .combine ( fasta_fai )
      .set { cram_crai_fasta_fai }
 

From 7506e81addf69b1960e521438a1e1cdbe7c81e2f Mon Sep 17 00:00:00 2001
From: Guoying Qi <gq2@sanger.ac.uk>
Date: Sat, 7 Oct 2023 19:47:51 +0100
Subject: [PATCH 20/27] formatting and documentation

---
 README.md                         |  3 ++-
 docs/usage.md                     | 24 ++++++++++++------------
 subworkflows/local/input_check.nf |  5 ++++-
 subworkflows/local/input_merge.nf |  5 +++++
 workflows/variantcalling.nf       |  2 +-
 5 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 62da4b2..ed207d1 100644
--- a/README.md
+++ b/README.md
@@ -18,11 +18,12 @@ On release, automated continuous integration tests run the pipeline on a full-si
 
 ## Pipeline summary
 
-The pipleline takes aligned PacBio sample reads (CRAM/BAM files and their index files) from a CSV file and the reference file in FASTA format, and then uses DeepVariant tool to make variant calling.
+The pipeline takes aligned PacBio sample reads (CRAM/BAM files) from a CSV file and the reference file in FASTA format, and then uses the DeepVariant tool to call variants.
 
 Steps involved:
 
 - Split fasta file into smaller files, normally one sequence per file unless the sequences are too small.
+- Merge input BAM/CRAM files together if they have the same sample names.
 - Filter out reads using the `-F 0x900` option to only retain the primary alignments.
 - Run DeepVariant using filtered BAM/CRAM files against each of split fasta files.
 - Merge all VCF and GVCF files generated by DeepVariant by sample together for each input BAM/CRAM file.
diff --git a/docs/usage.md b/docs/usage.md
index af4811c..71cc1c7 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -2,11 +2,11 @@
 
 ## Introduction
 
-The pipleline takes aligned sample reads (CRAM/BAM files and their index files) from a CSV file and a reference file in FASTA format, and then use DeepVariant to call variants.
+The pipeline takes aligned sample reads (CRAM/BAM files) from a CSV file and a reference file in FASTA format, and then uses DeepVariant to call variants.
 
 ## Samplesheet input
 
-You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use the `input` parameter to specify the samplesheet location. It has to be a comma-separated file with at least 4 columns, and a header row as shown in the examples below.
+You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use the `input` parameter to specify the samplesheet location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below.
 
 ```bash
 --input '[path to samplesheet file]'
@@ -17,21 +17,22 @@ You will need to create a samplesheet with information about the samples you wou
 The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. Below is an example for the same sample sequenced across 3 lanes:
 
 ```console
-sample,datatype,datafile,indexfile
-sample1,pacbio,sample1_1.cram,sample1_1.cram.crai
-sample1,pacbio,sample1_2.cram,sample1_3.cram.crai
-sample1,pacbio,sample1_3.cram,sample1_3.cram.crai
+sample,datatype,datafile
+sample1,pacbio,sample1_1.cram
+sample1,pacbio,sample1_2.cram
+sample1,pacbio,sample1_3.cram
 ```
+If the given BAM/CRAM files are not sorted, you need to add `--sort_input` in the run command to sort them before merging the files together from the same samples.
 
 ### Full samplesheet
 
 A final samplesheet file consisting of both BAM or CRAM will look like this. Currently this pipeline only supports Pacbio aligned data.
 
 ```console
-sample,datatype,datafile,indexfile
-sample1,pacbio,/path/to/data/file/file1.bam,/path/to/index/file/file1.bam.bai
-sample2,pacbio,/path/to/data/file/file2.cram,/path/to/index/file/file2.cram.crai
-sample3,pacbio,/path/to/data/file/file3.bam,/path/to/index/file/file3.bam.csi
+sample,datatype,datafile
+sample1,pacbio,/path/to/data/file/file1.bam
+sample2,pacbio,/path/to/data/file/file2.cram
+sample3,pacbio,/path/to/data/file/file3.bam
 ```
 
 | Column      | Description                                                                                                                                                                            |
@@ -39,7 +40,6 @@ sample3,pacbio,/path/to/data/file/file3.bam,/path/to/index/file/file3.bam.csi
 | `sample`    | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
 | `datatype`  | Sequencing data type. Must be `pacbio`.                                                                                                                                                |
 | `datafile`  | The location for either BAM or CRAM file.                                                                                                                                              |
-| `indexfile` | The location for BAM or CRAM index file – BAI, CSI or CRAI.                                                                                                                            |
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
 
@@ -62,7 +62,7 @@ work                # Directory containing the nextflow working files
 # Other nextflow hidden files, eg. history of pipeline runs and old logs.
 ```
 
-The pipeline will split the intput fasta file into smaller files to run DeepVariant parallel. You can set the minimum split fasta file size from the command line. For example to set the minimum size as 10K using `--split_fasta_cutoff 10000`.
+The pipeline will split the input fasta file into smaller files to run DeepVariant parallel. You can set the minimum split fasta file size from the command line. For example to set the minimum size as 10K using `--split_fasta_cutoff 10000`.
 
 ### Updating the pipeline
 
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index aa0bbc9..c3e0e49 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -12,7 +12,10 @@ workflow INPUT_CHECK {
     SAMPLESHEET_CHECK ( samplesheet )
         .csv
         .splitCsv ( header:true, sep:',' )
-        .map { [[id: it.sample, sample: it.sample.replaceAll(/_T\d+$/, ''), type: it.datatype], file(it.datafile)] }
+        .map { [
+            [ id: it.sample, sample: it.sample.replaceAll(/_T\d+$/, ''), type: it.datatype ], 
+            file(it.datafile)
+            ] }
         .set { reads }
         
     emit:
diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf
index 4e7efa5..58395d0 100644
--- a/subworkflows/local/input_merge.nf
+++ b/subworkflows/local/input_merge.nf
@@ -18,11 +18,15 @@ workflow INPUT_MERGE {
     
     // sort input reads if asked
     if ( sort_input ) {
+  
       SAMTOOLS_SORT( reads )
       ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions )
       sorted_reads = SAMTOOLS_SORT.out.bam
+
     } else {     
+
       sorted_reads = reads
+
     }
 
     // group input reads file by sample name
@@ -58,6 +62,7 @@ workflow INPUT_MERGE {
     )
     ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions )
 
+    // concat merged bam or cram together along with their index file
     SAMTOOLS_MERGE.out.bam
       .join(SAMTOOLS_MERGE.out.csi)
       .concat(
diff --git a/workflows/variantcalling.nf b/workflows/variantcalling.nf
index 8a933b3..a4cf47c 100644
--- a/workflows/variantcalling.nf
+++ b/workflows/variantcalling.nf
@@ -78,7 +78,7 @@ workflow VARIANTCALLING {
         .map { fasta -> [ [ 'id': fasta.baseName ], fasta ] }
         .set { ch_genome }
 
-       SAMTOOLS_FAIDX ( ch_genome,  [[], []])
+       SAMTOOLS_FAIDX ( ch_genome,  [[], []] )
        ch_versions = ch_versions.mix( SAMTOOLS_FAIDX.out.versions )
        
        SAMTOOLS_FAIDX.out.fai

From 4dacfa801896b36c07377bd6623729008796ae94 Mon Sep 17 00:00:00 2001
From: nf-core-bot <core@nf-co.re>
Date: Sat, 7 Oct 2023 18:51:06 +0000
Subject: [PATCH 21/27] [automated] Fix linting with Prettier

---
 docs/usage.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/docs/usage.md b/docs/usage.md
index 71cc1c7..1764889 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -22,6 +22,7 @@ sample1,pacbio,sample1_1.cram
 sample1,pacbio,sample1_2.cram
 sample1,pacbio,sample1_3.cram
 ```
+
 If the given BAM/CRAM files are not sorted, you need to add `--sort_input` in the run command to sort them before merging the files together from the same samples.
 
 ### Full samplesheet
@@ -35,11 +36,11 @@ sample2,pacbio,/path/to/data/file/file2.cram
 sample3,pacbio,/path/to/data/file/file3.bam
 ```
 
-| Column      | Description                                                                                                                                                                            |
-| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `sample`    | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
-| `datatype`  | Sequencing data type. Must be `pacbio`.                                                                                                                                                |
-| `datafile`  | The location for either BAM or CRAM file.                                                                                                                                              |
+| Column     | Description                                                                                                                                                                            |
+| ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `sample`   | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
+| `datatype` | Sequencing data type. Must be `pacbio`.                                                                                                                                                |
+| `datafile` | The location for either BAM or CRAM file.                                                                                                                                              |
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
 

From ebaa7e47361d9129a103d75e7c01709de92413c4 Mon Sep 17 00:00:00 2001
From: Guoying Qi <729395+gq1@users.noreply.github.com>
Date: Fri, 13 Oct 2023 09:49:21 +0100
Subject: [PATCH 22/27] Update conf/test.config

Co-authored-by: Matthieu Muffato <mm49@sanger.ac.uk>
---
 conf/test.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conf/test.config b/conf/test.config
index f89e185..ad1e731 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -25,7 +25,7 @@ params {
     // Fasta references
     fasta = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz'
 
-    // Reference index file
+    // Reference index file (optional)
     // fai = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.fai'
     // gzi = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.gzi'
 

From 1b2c45e77be1c11cfadfcb2ba608bed08d575651 Mon Sep 17 00:00:00 2001
From: Guoying Qi <gq2@sanger.ac.uk>
Date: Fri, 27 Oct 2023 15:50:37 +0100
Subject: [PATCH 23/27] nf-core modules update samtools/merge with conda
 environment file and maintainers list

---
 modules.json                                   | 2 +-
 modules/nf-core/samtools/merge/environment.yml | 6 ++++++
 modules/nf-core/samtools/merge/main.nf         | 2 +-
 modules/nf-core/samtools/merge/meta.yml        | 6 ++++++
 4 files changed, 14 insertions(+), 2 deletions(-)
 create mode 100644 modules/nf-core/samtools/merge/environment.yml

diff --git a/modules.json b/modules.json
index 183f5b3..9781734 100644
--- a/modules.json
+++ b/modules.json
@@ -32,7 +32,7 @@
                     },
                     "samtools/merge": {
                         "branch": "master",
-                        "git_sha": "0460d316170f75f323111b4a2c0a2989f0c32013",
+                        "git_sha": "516189e968feb4ebdd9921806988b4c12b4ac2dc",
                         "installed_by": ["modules"],
                         "patch": "modules/nf-core/samtools/merge/samtools-merge.diff"
                     },
diff --git a/modules/nf-core/samtools/merge/environment.yml b/modules/nf-core/samtools/merge/environment.yml
new file mode 100644
index 0000000..04c82f1
--- /dev/null
+++ b/modules/nf-core/samtools/merge/environment.yml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::samtools=1.17
diff --git a/modules/nf-core/samtools/merge/main.nf b/modules/nf-core/samtools/merge/main.nf
index 2dc4008..0affdbf 100644
--- a/modules/nf-core/samtools/merge/main.nf
+++ b/modules/nf-core/samtools/merge/main.nf
@@ -2,7 +2,7 @@ process SAMTOOLS_MERGE {
     tag "$meta.id"
     label 'process_low'
 
-    conda "bioconda::samtools=1.17"
+    conda 'modules/nf-core/samtools/merge/environment.yml'
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' :
         'biocontainers/samtools:1.17--h00cdaf9_0' }"
diff --git a/modules/nf-core/samtools/merge/meta.yml b/modules/nf-core/samtools/merge/meta.yml
index 056a95f..bf0da8b 100644
--- a/modules/nf-core/samtools/merge/meta.yml
+++ b/modules/nf-core/samtools/merge/meta.yml
@@ -79,3 +79,9 @@ authors:
   - "@maxulysse"
   - "@FriederikeHanssen"
   - "@ramprasadn"
+maintainers:
+  - "@drpatelh"
+  - "@yuukiiwa "
+  - "@maxulysse"
+  - "@FriederikeHanssen"
+  - "@ramprasadn"

From a76298fb06809c6202ebe45ddac6f2ca9b408186 Mon Sep 17 00:00:00 2001
From: Guoying Qi <gq2@sanger.ac.uk>
Date: Wed, 1 Nov 2023 19:22:10 +0000
Subject: [PATCH 24/27] Update samtools/merge module and remove its patch.
 Combine gzi and fai parameters as one.

---
 conf/test.config                              |  2 +-
 modules.json                                  |  2 +-
 modules/nf-core/samtools/merge/main.nf        |  3 +-
 modules/nf-core/samtools/merge/meta.yml       |  4 --
 .../samtools/merge/samtools-merge.diff        | 44 -------------------
 nextflow.config                               |  1 -
 nextflow_schema.json                          |  8 +---
 subworkflows/local/input_filter_split.nf      |  2 -
 subworkflows/local/input_merge.nf             |  7 +--
 workflows/variantcalling.nf                   | 34 +++++++++-----
 10 files changed, 31 insertions(+), 76 deletions(-)
 delete mode 100644 modules/nf-core/samtools/merge/samtools-merge.diff

diff --git a/conf/test.config b/conf/test.config
index ad1e731..49b740b 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -27,7 +27,7 @@ params {
 
     // Reference index file (optional)
     // fai = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.fai'
-    // gzi = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.gzi'
+     fai = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.gzi'
 
     // Interval bed file
     interval = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bed'
diff --git a/modules.json b/modules.json
index 9781734..956cd97 100644
--- a/modules.json
+++ b/modules.json
@@ -32,7 +32,7 @@
                     },
                     "samtools/merge": {
                         "branch": "master",
-                        "git_sha": "516189e968feb4ebdd9921806988b4c12b4ac2dc",
+                        "git_sha": "e7ce60acc8a33fa17429e966364657a63016e870",
                         "installed_by": ["modules"],
                         "patch": "modules/nf-core/samtools/merge/samtools-merge.diff"
                     },
diff --git a/modules/nf-core/samtools/merge/main.nf b/modules/nf-core/samtools/merge/main.nf
index 0affdbf..21f785c 100644
--- a/modules/nf-core/samtools/merge/main.nf
+++ b/modules/nf-core/samtools/merge/main.nf
@@ -2,7 +2,7 @@ process SAMTOOLS_MERGE {
     tag "$meta.id"
     label 'process_low'
 
-    conda 'modules/nf-core/samtools/merge/environment.yml'
+    conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' :
         'biocontainers/samtools:1.17--h00cdaf9_0' }"
@@ -11,7 +11,6 @@ process SAMTOOLS_MERGE {
     tuple val(meta), path(input_files, stageAs: "?/*")
     tuple val(meta2), path(fasta)
     tuple val(meta3), path(fai)
-    tuple val(meta4), path(gzi)
 
     output:
     tuple val(meta), path("${prefix}.bam") , optional:true, emit: bam
diff --git a/modules/nf-core/samtools/merge/meta.yml b/modules/nf-core/samtools/merge/meta.yml
index bf0da8b..2e8f3db 100644
--- a/modules/nf-core/samtools/merge/meta.yml
+++ b/modules/nf-core/samtools/merge/meta.yml
@@ -43,10 +43,6 @@ input:
       type: file
       description: Index of the reference file the CRAM was created with (optional)
       pattern: "*.fai"
-  - gzi:
-      type: file
-      description: Index of the reference file the CRAM was created with (optional)
-      pattern: "*.gzi"
 output:
   - meta:
       type: map
diff --git a/modules/nf-core/samtools/merge/samtools-merge.diff b/modules/nf-core/samtools/merge/samtools-merge.diff
deleted file mode 100644
index afe2536..0000000
--- a/modules/nf-core/samtools/merge/samtools-merge.diff
+++ /dev/null
@@ -1,44 +0,0 @@
-Changes in module 'nf-core/samtools/merge'
---- modules/nf-core/samtools/merge/meta.yml
-+++ modules/nf-core/samtools/merge/meta.yml
-@@ -43,6 +43,10 @@
-       type: file
-       description: Index of the reference file the CRAM was created with (optional)
-       pattern: "*.fai"
-+  - gzi:
-+      type: file
-+      description: Index of the reference file the CRAM was created with (optional)
-+      pattern: "*.gzi"
- output:
-   - meta:
-       type: map
-@@ -65,6 +69,10 @@
-       type: file
-       description: BAM index file (optional)
-       pattern: "*.csi"
-+  - crai:
-+      type: file
-+      description: CRAM index file (optional)
-+      pattern: "*.crai"
- authors:
-   - "@drpatelh"
-   - "@yuukiiwa "
-
---- modules/nf-core/samtools/merge/main.nf
-+++ modules/nf-core/samtools/merge/main.nf
-@@ -11,11 +11,13 @@
-     tuple val(meta), path(input_files, stageAs: "?/*")
-     tuple val(meta2), path(fasta)
-     tuple val(meta3), path(fai)
-+    tuple val(meta4), path(gzi)
- 
-     output:
-     tuple val(meta), path("${prefix}.bam") , optional:true, emit: bam
-     tuple val(meta), path("${prefix}.cram"), optional:true, emit: cram
-     tuple val(meta), path("*.csi")         , optional:true, emit: csi
-+    tuple val(meta), path("*.crai")        , optional:true, emit: crai
-     path  "versions.yml"                                  , emit: versions
- 
- 
-
-************************************************************
diff --git a/nextflow.config b/nextflow.config
index 4659669..c1e8ac3 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -14,7 +14,6 @@ params {
     sort_input                 = false
     fasta                      = null
     fai                        = null
-    gzi                        = null
     interval                   = null
     split_fasta_cutoff         = 100000
 
diff --git a/nextflow_schema.json b/nextflow_schema.json
index dc6428b..66cfeda 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -49,15 +49,11 @@
             "properties": {
                 "fasta": {
                     "type": "string",
-                    "description": "Path to FASTA genome file, either fasta or fast.gz"
+                    "description": "Path to FASTA genome file, either fasta or fasta.gz."
                 },
                 "fai": {
                     "type": "string",
-                    "description": "Path to the index file of the FASTA genome file."
-                },
-                "gzi": {
-                    "type": "string",
-                    "description": "Path to the gzi index file of  the FASTA genome file. Required if fasta in gz format."
+                    "description": "Path to the index file of the FASTA genome file, either fai or gzi."
                 },
                 "interval": {
                     "type": "string",
diff --git a/subworkflows/local/input_filter_split.nf b/subworkflows/local/input_filter_split.nf
index 2467b68..dc0710f 100644
--- a/subworkflows/local/input_filter_split.nf
+++ b/subworkflows/local/input_filter_split.nf
@@ -9,8 +9,6 @@ include { CAT_CAT        } from '../../modules/nf-core/cat/cat/main'
 workflow INPUT_FILTER_SPLIT {
     take:
     fasta              // file: /path/to/genome.fasta or /path/to/genome.fasta.gz
-    fai                // file: /path/to/genome.*.fai
-    gzi                // file: /path/to/genome.fasta.gz.gzi or null
     reads              // [ val(meta), data, index ]
     interval           // file: /path/to/intervals.bed
     split_fasta_cutoff // val(min_file_size)
diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf
index 58395d0..1c8a588 100644
--- a/subworkflows/local/input_merge.nf
+++ b/subworkflows/local/input_merge.nf
@@ -8,8 +8,7 @@ include { SAMTOOLS_SORT }  from '../../modules/nf-core/samtools/sort'
 workflow INPUT_MERGE {
     take:
     fasta              // file: /path/to/genome.fasta or /path/to/genome.fasta.gz
-    fai                // file: /path/to/genome.*.fai
-    gzi                // file: /path/to/genome.fasta.gz.gzi or null
+    fai                // file: /path/to/genome.*.fai or /path/to/genome.fasta.gz.gzi
     reads              // channel: [ val(meta), data ]
     sort_input         // bollean: true or false
 
@@ -53,12 +52,10 @@ workflow INPUT_MERGE {
     // call samtool merge
     ch_fasta = fasta.map { fasta -> [ [ 'id': fasta.baseName ], fasta ] }.first()
     ch_fai = fai.map { fai -> [ [ 'id': fai.baseName ], fai ] }.first()
-    ch_gzi = gzi.map { gzi -> [ [ 'id': gzi.baseName ], gzi ] }.first()
 
     SAMTOOLS_MERGE( merged_reads_with_meta, 
                     ch_fasta,
-                    ch_fai,
-                    ch_gzi
+                    ch_fai
     )
     ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions )
 
diff --git a/workflows/variantcalling.nf b/workflows/variantcalling.nf
index a4cf47c..68e2fef 100644
--- a/workflows/variantcalling.nf
+++ b/workflows/variantcalling.nf
@@ -10,7 +10,7 @@ def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params)
 WorkflowVariantcalling.initialise(params, log)
 
 // Check input path parameters to see if they exist
-def checkPathParamList = [ params.input, params.fasta, params.fai, params.gzi, params.interval ]
+def checkPathParamList = [ params.input, params.fasta, params.fai, params.interval ]
 for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
 
 // Check mandatory parameters
@@ -18,8 +18,17 @@ if (params.input) { ch_input = Channel.fromPath(params.input) } else { exit 1, '
 if (params.fasta) { ch_fasta = Channel.fromPath(params.fasta) } else { exit 1, 'Reference fasta not specified!'   }
 
 // Check optional parameters
-if (params.fai)     { ch_fai   = Channel.fromPath(params.fai)         } else { ch_fai      = Channel.empty() }
-if (params.gzi)     { ch_gzi   = Channel.fromPath(params.gzi)         } else { ch_gzi      = Channel.empty() }
+if (params.fai){
+    if( ( params.fasta.endsWith('.gz') && params.fai.endsWith('.fai') )
+        ||
+        ( !params.fasta.endsWith('.gz') && params.fai.endsWith('.gzi') )
+    ){
+      exit 1, 'Reference fasta and its index file format not matched!'
+    }
+    ch_fai   = Channel.fromPath(params.fai)
+} else { 
+    ch_fai      = Channel.empty() 
+}
 if (params.interval){ ch_interval = Channel.fromPath(params.interval) } else { ch_interval = Channel.empty() }
 
 if (params.sort_input)          { sort_input    = params.sort_input              } else { sort_input    = false       }
@@ -72,7 +81,7 @@ workflow VARIANTCALLING {
     //
     // check reference fasta index given or not
     //
-    if( params.fai == null || ( params.fasta.endsWith('fasta.gz') && params.gzi == null ) ){ 
+    if( params.fai == null ){ 
    
        ch_fasta
         .map { fasta -> [ [ 'id': fasta.baseName ], fasta ] }
@@ -80,15 +89,23 @@ workflow VARIANTCALLING {
 
        SAMTOOLS_FAIDX ( ch_genome,  [[], []] )
        ch_versions = ch_versions.mix( SAMTOOLS_FAIDX.out.versions )
-       
+
        SAMTOOLS_FAIDX.out.fai
         .map{ mata, fai -> fai }
         .set{ ch_fai }
-  
+
        SAMTOOLS_FAIDX.out.gzi
         .map{ meta, gzi -> gzi }
         .set{ ch_gzi }
 
+       if( params.fasta.endsWith('.gz') ){
+            ch_index = ch_gzi
+       }else{
+            ch_index = ch_fai
+       }
+
+    }else{
+       ch_index = ch_fai
     }
 
     //
@@ -104,8 +121,7 @@ workflow VARIANTCALLING {
     //
     INPUT_MERGE (
         ch_fasta,
-        ch_fai,
-        ch_gzi,
+        ch_index,
         INPUT_CHECK.out.reads,
         sort_input
     )
@@ -117,8 +133,6 @@ workflow VARIANTCALLING {
     //
     INPUT_FILTER_SPLIT (
         ch_fasta,
-        ch_fai,
-        ch_gzi,
         INPUT_MERGE.out.indexed_merged_reads,
         ch_interval,
         split_fasta_cutoff

From 3cf953d19c3f7c9fc40e5f7878d379364286bd14 Mon Sep 17 00:00:00 2001
From: Guoying Qi <gq2@sanger.ac.uk>
Date: Wed, 1 Nov 2023 20:42:24 +0000
Subject: [PATCH 25/27] remove sort_input params. Always sort the input before
 merging.

---
 conf/test.config                  |  2 +-
 docs/usage.md                     |  2 --
 nextflow.config                   |  1 -
 nextflow_schema.json              |  4 ----
 subworkflows/local/input_merge.nf | 35 +++++++++++++------------------
 workflows/variantcalling.nf       |  2 --
 6 files changed, 16 insertions(+), 30 deletions(-)

diff --git a/conf/test.config b/conf/test.config
index 49b740b..01515e9 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -27,7 +27,7 @@ params {
 
     // Reference index file (optional)
     // fai = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.fai'
-     fai = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.gzi'
+    // fai = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.gzi'
 
     // Interval bed file
     interval = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bed'
diff --git a/docs/usage.md b/docs/usage.md
index 1764889..46b74b3 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -23,8 +23,6 @@ sample1,pacbio,sample1_2.cram
 sample1,pacbio,sample1_3.cram
 ```
 
-If the given BAM/CRAM files are not sorted, you need to add `--sort_input` in the run command to sort them before merging the files together from the same samples.
-
 ### Full samplesheet
 
 A final samplesheet file consisting of both BAM or CRAM will look like this. Currently this pipeline only supports Pacbio aligned data.
diff --git a/nextflow.config b/nextflow.config
index c1e8ac3..399b382 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -11,7 +11,6 @@ params {
 
     // Input options
     input                      = null
-    sort_input                 = false
     fasta                      = null
     fai                        = null
     interval                   = null
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 66cfeda..d40c501 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -28,10 +28,6 @@
                     "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.",
                     "fa_icon": "fas fa-folder-open"
                 },
-                "sort_input": {
-                    "type": "boolean",
-                    "description": "Boolean whether to sort input reads files"
-                },
                 "email": {
                     "type": "string",
                     "description": "Email address for completion summary.",
diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf
index 1c8a588..90bb82f 100644
--- a/subworkflows/local/input_merge.nf
+++ b/subworkflows/local/input_merge.nf
@@ -10,50 +10,45 @@ workflow INPUT_MERGE {
     fasta              // file: /path/to/genome.fasta or /path/to/genome.fasta.gz
     fai                // file: /path/to/genome.*.fai or /path/to/genome.fasta.gz.gzi
     reads              // channel: [ val(meta), data ]
-    sort_input         // bollean: true or false
 
     main:
     ch_versions = Channel.empty()
-    
-    // sort input reads if asked
-    if ( sort_input ) {
-  
-      SAMTOOLS_SORT( reads )
-      ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions )
-      sorted_reads = SAMTOOLS_SORT.out.bam
 
-    } else {     
-
-      sorted_reads = reads
+    // group input meta data together by sample name
+    reads
+     .map{ meta, bam_cram -> [ meta.sample, meta ] }
+     .groupTuple()
+     .set{ grouped_reads_meta }
 
-    }
+    // sort input reads
+    SAMTOOLS_SORT( reads )
+    ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions )
+    sorted_reads = SAMTOOLS_SORT.out.bam
 
     // group input reads file by sample name
     sorted_reads
      .map{ meta, bam_cram -> [ meta.sample, bam_cram ] }
      .groupTuple()
-     .set{ merged_reads } 
+     .set{ grouped_reads } 
 
-    // group input meta data together by sample name as well
+    // join grouped reads and meta
     // use the first meta data for the combined reads
-    reads
-     .map{ meta, bam_cram -> [ meta.sample, meta ] }
-     .groupTuple()
+    grouped_reads_meta 
      .map { sample, meta_list -> [sample, meta_list[0]] }
-     .join( merged_reads )
+     .join( grouped_reads )
      .map { sample, meta, bam_cram_list -> [ 
           [ id: ( bam_cram_list.size() == 1 ) ? sample : sample + '_combined',
             type: meta.type 
           ], 
             bam_cram_list 
           ]}
-     .set { merged_reads_with_meta }
+     .set { grouped_reads_with_meta }
 
     // call samtool merge
     ch_fasta = fasta.map { fasta -> [ [ 'id': fasta.baseName ], fasta ] }.first()
     ch_fai = fai.map { fai -> [ [ 'id': fai.baseName ], fai ] }.first()
 
-    SAMTOOLS_MERGE( merged_reads_with_meta, 
+    SAMTOOLS_MERGE( grouped_reads_with_meta, 
                     ch_fasta,
                     ch_fai
     )
diff --git a/workflows/variantcalling.nf b/workflows/variantcalling.nf
index 68e2fef..99c35e5 100644
--- a/workflows/variantcalling.nf
+++ b/workflows/variantcalling.nf
@@ -31,7 +31,6 @@ if (params.fai){
 }
 if (params.interval){ ch_interval = Channel.fromPath(params.interval) } else { ch_interval = Channel.empty() }
 
-if (params.sort_input)          { sort_input    = params.sort_input              } else { sort_input    = false       }
 if (params.split_fasta_cutoff ) { split_fasta_cutoff = params.split_fasta_cutoff } else { split_fasta_cutoff = 100000 }
 
 /*
@@ -123,7 +122,6 @@ workflow VARIANTCALLING {
         ch_fasta,
         ch_index,
         INPUT_CHECK.out.reads,
-        sort_input
     )
     ch_versions = ch_versions.mix(INPUT_MERGE.out.versions)
 

From d6fa00f37eb2729b6c67b3d0a4d36f02e1536b35 Mon Sep 17 00:00:00 2001
From: Guoying Qi <gq2@sanger.ac.uk>
Date: Thu, 2 Nov 2023 11:41:59 +0000
Subject: [PATCH 26/27] only validate the sample sheet, do not transform the
 sample names

---
 bin/check_samplesheet.py          | 22 +++++++---------------
 subworkflows/local/input_check.nf |  2 +-
 2 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index 52af146..d088e65 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-"""Provide a command line tool to validate and transform tabular samplesheets."""
+"""Provide a command line tool to validate tabular samplesheets."""
 
 
 import argparse
@@ -55,9 +55,9 @@ def __init__(
         self._type_col = type_col
         self._file_col = file_col
         self._seen = set()
-        self.modified = []
+        self.validated = []
 
-    def validate_and_transform(self, row):
+    def validate(self, row):
         """
         Perform all validations on the given row.
 
@@ -70,7 +70,7 @@ def validate_and_transform(self, row):
         self._validate_type(row)
         self._validate_data_file(row)
         self._seen.add((row[self._sample_col], row[self._file_col]))
-        self.modified.append(row)
+        self.validated.append(row)
 
     def _validate_sample(self, row):
         """Assert that the sample name exists and convert spaces to underscores."""
@@ -105,17 +105,9 @@ def validate_unique_samples(self):
         """
         Assert that the combination of sample name and data filename is unique.
 
-        In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the
-        number of times the same sample exist, but with different files, e.g., multiple runs per experiment.
-
         """
-        if len(self._seen) != len(self.modified):
+        if len(self._seen) != len(self.validated):
             raise AssertionError("The combination of sample name and data file must be unique.")
-        seen = Counter()
-        for row in self.modified:
-            sample = row[self._sample_col]
-            seen[sample] += 1
-            row[self._sample_col] = f"{sample}_T{seen[sample]}"
 
 
 def read_head(handle, num_lines=10):
@@ -195,7 +187,7 @@ def check_samplesheet(file_in, file_out):
         checker = RowChecker()
         for i, row in enumerate(reader):
             try:
-                checker.validate_and_transform(row)
+                checker.validate(row)
             except AssertionError as error:
                 logger.critical(f"{str(error)} On line {i + 2}.")
                 sys.exit(1)
@@ -205,7 +197,7 @@ def check_samplesheet(file_in, file_out):
     with file_out.open(mode="w", newline="") as out_handle:
         writer = csv.DictWriter(out_handle, header, delimiter=",")
         writer.writeheader()
-        for row in checker.modified:
+        for row in checker.validated:
             writer.writerow(row)
 
 
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index c3e0e49..7e9f667 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -13,7 +13,7 @@ workflow INPUT_CHECK {
         .csv
         .splitCsv ( header:true, sep:',' )
         .map { [
-            [ id: it.sample, sample: it.sample.replaceAll(/_T\d+$/, ''), type: it.datatype ], 
+            [ id: it.sample, sample: it.sample, type: it.datatype ], 
             file(it.datafile)
             ] }
         .set { reads }

From 3e8705cdee3e4adb71309ab4c40ba246c3cf9a73 Mon Sep 17 00:00:00 2001
From: Guoying Qi <gq2@sanger.ac.uk>
Date: Thu, 2 Nov 2023 11:42:32 +0000
Subject: [PATCH 27/27] update fai file for the full test and formatting

---
 conf/test_full.config       | 3 +--
 workflows/variantcalling.nf | 5 +++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/conf/test_full.config b/conf/test_full.config
index 8532e0d..3a2d38e 100644
--- a/conf/test_full.config
+++ b/conf/test_full.config
@@ -23,6 +23,5 @@ params {
     fasta = '/lustre/scratch124/tol/projects/darwin/data/insects/Cantharis_rufa/assembly/release/icCanRufa1.1/insdc/GCA_947369205.1.fasta.gz'
 
     // Reference index file
-    fai = '/lustre/scratch124/tol/projects/darwin/data/insects/Cantharis_rufa/assembly/release/icCanRufa1.1/insdc/GCA_947369205.1.fasta.gz.fai'
-    gzi = '/lustre/scratch124/tol/projects/darwin/data/insects/Cantharis_rufa/assembly/release/icCanRufa1.1/insdc/GCA_947369205.1.fasta.gz.gzi'
+    fai = '/lustre/scratch124/tol/projects/darwin/data/insects/Cantharis_rufa/assembly/release/icCanRufa1.1/insdc/GCA_947369205.1.fasta.gz.gzi'
 }
diff --git a/workflows/variantcalling.nf b/workflows/variantcalling.nf
index 99c35e5..82267f7 100644
--- a/workflows/variantcalling.nf
+++ b/workflows/variantcalling.nf
@@ -25,10 +25,11 @@ if (params.fai){
     ){
       exit 1, 'Reference fasta and its index file format not matched!'
     }
-    ch_fai   = Channel.fromPath(params.fai)
+    ch_fai = Channel.fromPath(params.fai)
 } else { 
-    ch_fai      = Channel.empty() 
+    ch_fai = Channel.empty()
 }
+
 if (params.interval){ ch_interval = Channel.fromPath(params.interval) } else { ch_interval = Channel.empty() }
 
 if (params.split_fasta_cutoff ) { split_fasta_cutoff = params.split_fasta_cutoff } else { split_fasta_cutoff = 100000 }