Merge pull request #330 from sanger-tol/dp24_steps

Steps into DEV not into MAIN
sanger-tol · Nov 20, 2024 · 52f5bf9 · 52f5bf9
2 parents f7d9db2 + 35d9f23
commit 52f5bf9
Show file tree

Hide file tree

Showing 7 changed files with 259 additions and 158 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -40,11 +40,20 @@ This builds on the initial release by adding subworkflows which generate kmer ba
 - Fix a bug in build_alignment_blocks.py to avoid indexing errors happening in large genomes.
 - Change output BEDGRAPH from EXTRACT_TELO module.
 
+#### Hot Fix 1
+
+- Adding support for multi-library cram input.
+
+#### Hot Fix 2
+
+- Adding support for skipping selected subworkflows in a pipeline run (`--steps`).
+
 ### Parameters
 
 | Old Parameter | New Parameter |
 | ------------- | ------------- |
 | -             | --juicer      |
+| -             | --steps       |
 
 ### Software dependencies
 

diff --git a/CITATIONS.md b/CITATIONS.md
@@ -34,6 +34,10 @@
 
   > Durand, N.C. et al. 2016. ‘Juicer provides a one-click system for analyzing loop-resolution hi-C experiments’, Cell Systems, 3(1), pp. 95–98. doi:10.1016/j.cels.2016.07.002.
 
+- [Merqury_FK](https://github.com/thegenemyers/MERQURY.FK)
+
+  > Myers, G., Rhie, A. (2024). MerquryFK & KatFK. [online]. https://github.com/thegenemyers/MERQURY.FK. (Accessed on 20 September 2024).
+
 - [Minimap2](https://pubmed.ncbi.nlm.nih.gov/34623391/)
 
   > Li, H. 2021. ‘New strategies to improve MINIMAP2 alignment accuracy’, Bioinformatics, 37(23), pp. 4572–4574. doi:10.1093/bioinformatics/btab705.
@@ -72,7 +76,7 @@
 
 - [Samtools](https://pubmed.ncbi.nlm.nih.gov/33590861/)
 
-  > Di Tommaso, Paolo, et al. 2017. “Nextflow Enables Reproducible Computational Workflows.” Nature Biotechnology, 35(4), pp. 316–19, https://doi.org/10.1038/nbt.3820.
+  > Danecek P, Bonfield JK, Liddle J, Marshall J, Ohan V, Pollard MO, Whitwham A, Keane T, McCarthy SA, Davies RM, Li H. Twelve years of SAMtools and BCFtools. Gigascience. 2021 Feb 16;10(2):giab008. doi: 10.1093/gigascience/giab008. PMID: 33590861; PMCID: PMC7931819.
 
 - [SeqTK](https://github.com/lh3/seqtk)
 

diff --git a/nextflow.config b/nextflow.config
@@ -14,6 +14,7 @@ params {
     input                      = null
     outdir                     = "./results"
     juicer                     = false
+    steps                      = "NONE"
     tracedir                   = "${params.outdir}/pipeline_info"
     publish_dir_mode           = 'copy'
     email                      = null

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -35,6 +35,11 @@
                     "default": false,
                     "fa_icon": "fas fa-check"
                 },
+                "steps": {
+                    "type": "string",
+                    "description": "Comma-separated list of subworkflow steps to skip, e.g. `busco,kmer`. Use `NONE` to run every step.",
+                    "fa_icon": "fas fa-folder-open"
+                },
                 "email": {
                     "type": "string",
                     "description": "Email address for completion summary.",

diff --git a/workflows/treeval.nf b/workflows/treeval.nf
@@ -63,6 +63,14 @@ workflow TREEVAL {
     //
     ch_versions     = Channel.empty()
 
+    // Steps the user asked to skip, as a list of trimmed names; an unset --steps behaves like "NONE" (skip nothing).
+    exclude_workflow_steps  = params.steps ? params.steps.toString().split(",").collect { it.trim() } : ["NONE"]
+
+    // Every name accepted by --steps; "NONE" is the sentinel meaning that no subworkflow is skipped.
+    full_list       = ["insilico_digest", "gene_alignments", "repeat_density", "gap_finder", "selfcomp", "synteny", "read_coverage", "telo_finder", "busco", "kmer", "hic_mapping", "NONE"]
+
+    // Reject unknown names up front so a typo cannot silently leave a step enabled.
+    if (!full_list.containsAll(exclude_workflow_steps)) {
+        exit 1, "There is an extra argument given on Command Line: \n Check contents of --steps: $exclude_workflow_steps\nMaster list is: $full_list"
+    }
+
     params.entry    = 'FULL'
     input_ch        = Channel.fromPath(params.input, checkIfExists: true)
 
@@ -111,15 +119,17 @@ workflow TREEVAL {
     // SUBWORKFLOW: Takes reference, channel of enzymes, my.genome, assembly_id and as file to generate
     //              file with enzymatic digest sites.
     //
-    ch_enzyme       = Channel.of( "bspq1","bsss1","DLE1" )
+    if ( !exclude_workflow_steps.contains("insilico_digest")) {
+        ch_enzyme       = Channel.of( "bspq1","bsss1","DLE1" )
 
-    INSILICO_DIGEST (
-        GENERATE_GENOME.out.dot_genome,
-        YAML_INPUT.out.reference_ch,
-        ch_enzyme,
-        digest_asfile
-    )
-    ch_versions     = ch_versions.mix( INSILICO_DIGEST.out.versions )
+        INSILICO_DIGEST (
+            GENERATE_GENOME.out.dot_genome,
+            YAML_INPUT.out.reference_ch,
+            ch_enzyme,
+            digest_asfile
+        )
+        ch_versions     = ch_versions.mix( INSILICO_DIGEST.out.versions )
+    }
 
     //
     // SUBWORKFLOW: FOR SPLITTING THE REF GENOME INTO SCAFFOLD CHUNKS AND RUNNING SOME SUBWORKFLOWS
@@ -135,115 +145,141 @@ workflow TREEVAL {
     //
     // SUBWORKFLOW: Takes input fasta to generate BB files containing alignment data
     //
-    GENE_ALIGNMENT (
-        GENERATE_GENOME.out.dot_genome,
-        YAML_INPUT.out.reference_ch,
-        GENERATE_GENOME.out.ref_index,
-        YAML_INPUT.out.align_data_dir,
-        YAML_INPUT.out.align_geneset,
-        YAML_INPUT.out.align_common,
-        YAML_INPUT.out.intron_size,
-        gene_alignment_asfiles
-    )
-    ch_versions     = ch_versions.mix(GENE_ALIGNMENT.out.versions)
+    // The key must be "gene_alignments": that is the only spelling the --steps validation (full_list) accepts,
+    // so testing for any other spelling would make this subworkflow impossible to skip.
+    if ( !exclude_workflow_steps.contains("gene_alignments")) {
+        GENE_ALIGNMENT (
+            GENERATE_GENOME.out.dot_genome,
+            YAML_INPUT.out.reference_ch,
+            GENERATE_GENOME.out.ref_index,
+            YAML_INPUT.out.align_data_dir,
+            YAML_INPUT.out.align_geneset,
+            YAML_INPUT.out.align_common,
+            YAML_INPUT.out.intron_size,
+            gene_alignment_asfiles
+        )
+        ch_versions     = ch_versions.mix(GENE_ALIGNMENT.out.versions)
+    }
 
     //
     // SUBWORKFLOW: GENERATES A BIGWIG FOR A REPEAT DENSITY TRACK
     //
-    REPEAT_DENSITY (
-        YAML_INPUT.out.reference_ch,
-        GENERATE_GENOME.out.dot_genome
-    )
-    ch_versions     = ch_versions.mix( REPEAT_DENSITY.out.versions )
+    if ( !exclude_workflow_steps.contains("repeat_density")) {
+        REPEAT_DENSITY (
+            YAML_INPUT.out.reference_ch,
+            GENERATE_GENOME.out.dot_genome
+        )
+        ch_versions     = ch_versions.mix( REPEAT_DENSITY.out.versions )
+    }
 
     //
     // SUBWORKFLOW: GENERATES A GAP.BED FILE TO ID THE LOCATIONS OF GAPS
     //
-    GAP_FINDER (
-        YAML_INPUT.out.reference_ch
-    )
-    ch_versions     = ch_versions.mix( GAP_FINDER.out.versions )
+    if ( !exclude_workflow_steps.contains("gap_finder")) {
+        GAP_FINDER (
+            YAML_INPUT.out.reference_ch
+        )
+        ch_versions     = ch_versions.mix( GAP_FINDER.out.versions )
+    }
 
     //
     // SUBWORKFLOW: Takes reference file, .genome file, mummer variables, motif length variable and as
     //              file to generate a file containing sites of self-complementary sequence.
     //
-    SELFCOMP (
-        YAML_INPUT.out.reference_ch,
-        GENERATE_GENOME.out.dot_genome,
-        YAML_INPUT.out.mummer_chunk,
-        YAML_INPUT.out.motif_len,
-        selfcomp_asfile
-    )
-    ch_versions     = ch_versions.mix( SELFCOMP.out.versions )
+    if ( !exclude_workflow_steps.contains("selfcomp")) {
+        SELFCOMP (
+            YAML_INPUT.out.reference_ch,
+            GENERATE_GENOME.out.dot_genome,
+            YAML_INPUT.out.mummer_chunk,
+            YAML_INPUT.out.motif_len,
+            selfcomp_asfile
+        )
+        ch_versions     = ch_versions.mix( SELFCOMP.out.versions )
+    }
 
     //
     // SUBWORKFLOW: Takes reference, the directory of syntenic genomes and order/clade of sequence
     //              and generates a file of syntenic blocks.
     //
-    SYNTENY (
-        YAML_INPUT.out.reference_ch,
-        YAML_INPUT.out.synteny_path
-    )
-    ch_versions     = ch_versions.mix( SYNTENY.out.versions )
+    if ( !exclude_workflow_steps.contains("synteny")) {
+        SYNTENY (
+            YAML_INPUT.out.reference_ch,
+            YAML_INPUT.out.synteny_path
+        )
+        ch_versions     = ch_versions.mix( SYNTENY.out.versions )
+    }
 
     //
     // SUBWORKFLOW: Takes reference, pacbio reads
     //
-    READ_COVERAGE (
-        YAML_INPUT.out.reference_ch,
-        GENERATE_GENOME.out.dot_genome,
-        YAML_INPUT.out.read_ch
-    )
-    ch_versions     = ch_versions.mix( READ_COVERAGE.out.versions )
+    if ( !exclude_workflow_steps.contains("read_coverage")) {
+        READ_COVERAGE (
+            YAML_INPUT.out.reference_ch,
+            GENERATE_GENOME.out.dot_genome,
+            YAML_INPUT.out.read_ch
+        )
+        coverage_report = READ_COVERAGE.out.ch_reporting
+        ch_versions     = ch_versions.mix(READ_COVERAGE.out.versions)
+    } else {
+        coverage_report = []
+    }
 
     //
     // SUBWORKFLOW: GENERATE TELOMERE WINDOW FILES WITH PACBIO READS AND REFERENCE
     //
-    TELO_FINDER (   YAML_INPUT.out.reference_ch,
-                    YAML_INPUT.out.teloseq
-    )
-    ch_versions     = ch_versions.mix( TELO_FINDER.out.versions )
+    if ( !exclude_workflow_steps.contains("telo_finder")) {
+        TELO_FINDER (   YAML_INPUT.out.reference_ch,
+                        YAML_INPUT.out.teloseq
+        )
+        ch_versions     = ch_versions.mix( TELO_FINDER.out.versions )
+    }
 
     //
     // SUBWORKFLOW: GENERATE BUSCO ANNOTATION FOR ANCESTRAL UNITS
     //
-    BUSCO_ANNOTATION (
-        GENERATE_GENOME.out.dot_genome,
-        YAML_INPUT.out.reference_ch,
-        YAML_INPUT.out.lineageinfo,
-        YAML_INPUT.out.lineagespath,
-        buscogene_asfile,
-        ancestral_table
-    )
-    ch_versions = ch_versions.mix( BUSCO_ANNOTATION.out.versions )
+    if ( !exclude_workflow_steps.contains("busco")) {
+        BUSCO_ANNOTATION (
+            GENERATE_GENOME.out.dot_genome,
+            YAML_INPUT.out.reference_ch,
+            YAML_INPUT.out.lineageinfo,
+            YAML_INPUT.out.lineagespath,
+            buscogene_asfile,
+            ancestral_table
+        )
+        ch_versions = ch_versions.mix( BUSCO_ANNOTATION.out.versions )
+    }
 
     //
     // SUBWORKFLOW: Takes reads and assembly, produces kmer plot
     //
-    KMER (
-        YAML_INPUT.out.reference_ch,
-        YAML_INPUT.out.read_ch
-    )
-    ch_versions     = ch_versions.mix( KMER.out.versions )
+    if ( !exclude_workflow_steps.contains("kmer")) {
+        KMER (
+            YAML_INPUT.out.reference_ch,
+            YAML_INPUT.out.read_ch
+        )
+        ch_versions     = ch_versions.mix( KMER.out.versions )
+    }
 
     //
     // SUBWORKFLOW: GENERATE HIC MAPPING TO GENERATE PRETEXT FILES AND JUICEBOX
     //
-    HIC_MAPPING (
-        YAML_INPUT.out.reference_ch,
-        GENERATE_GENOME.out.ref_index,
-        GENERATE_GENOME.out.dot_genome,
-        YAML_INPUT.out.hic_reads_ch,
-        YAML_INPUT.out.assembly_id,
-        GAP_FINDER.out.gap_file,
-        READ_COVERAGE.out.ch_covbw_nor,
-        READ_COVERAGE.out.ch_covbw_avg,
-        TELO_FINDER.out.bedgraph_file,
-        REPEAT_DENSITY.out.repeat_density,
-        params.entry
-    )
-    ch_versions     = ch_versions.mix( HIC_MAPPING.out.versions )
+    if ( !exclude_workflow_steps.contains("hic_mapping")) {
+        // HIC_MAPPING reads the outputs of GAP_FINDER, READ_COVERAGE, TELO_FINDER and REPEAT_DENSITY below.
+        // Those subworkflows are only invoked when their step is not skipped, so stop early with a clear
+        // message instead of failing on access to the `.out` of a subworkflow that never ran.
+        hic_required_steps = ["gap_finder", "read_coverage", "telo_finder", "repeat_density"]
+        hic_missing_steps  = hic_required_steps.findAll { exclude_workflow_steps.contains(it) }
+        if (hic_missing_steps) {
+            exit 1, "hic_mapping needs the output of skipped step(s): $hic_missing_steps\nEither keep these steps or add hic_mapping to --steps"
+        }
+
+        HIC_MAPPING (
+            YAML_INPUT.out.reference_ch,
+            GENERATE_GENOME.out.ref_index,
+            GENERATE_GENOME.out.dot_genome,
+            YAML_INPUT.out.hic_reads_ch,
+            YAML_INPUT.out.assembly_id,
+            GAP_FINDER.out.gap_file,
+            READ_COVERAGE.out.ch_covbw_nor,
+            READ_COVERAGE.out.ch_covbw_avg,
+            TELO_FINDER.out.bedgraph_file,
+            REPEAT_DENSITY.out.repeat_density,
+            params.entry
+        )
+        ch_versions     = ch_versions.mix( HIC_MAPPING.out.versions )
+        // Reporting data is only available when the subworkflow actually ran.
+        hic_report = HIC_MAPPING.out.ch_reporting
+    } else {
+        // Empty placeholder used by the reporting logic when hic_mapping is skipped.
+        hic_report = []
+    }
 
     //
     // SUBWORKFLOW: Collates version data from prior subworkflows
@@ -256,8 +292,8 @@ workflow TREEVAL {
     // LOGIC: GENERATE SOME CHANNELS FOR REPORTING
     //
     YAML_INPUT.out.reference_ch
-        .combine( READ_COVERAGE.out.ch_reporting )
-        .combine( HIC_MAPPING.out.ch_reporting )
+        .combine( coverage_report )
+        .combine( hic_report )
         .combine( CUSTOM_DUMPSOFTWAREVERSIONS.out.versions )
         .map { meta, reference, read_meta, read_files, hic_meta, hic_files, custom_file -> [
             rf_data: tuple(