Skip to content

Commit

Permalink
Merge pull request #330 from sanger-tol/dp24_steps
Browse files Browse the repository at this point in the history
Steps into DEV not into MAIN
  • Loading branch information
DLBPointon authored Nov 20, 2024
2 parents f7d9db2 + 35d9f23 commit 52f5bf9
Show file tree
Hide file tree
Showing 7 changed files with 259 additions and 158 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,20 @@ This builds on the initial release by adding subworkflows which generate kmer ba
- Fix a bug in build_alignment_blocks.py to avoid indexing errors happening in large genomes.
- Change output BEDGRAPH from EXTRACT_TELO module.

#### Hot Fix 1

- Adding support for multi-library cram input.

#### Hot Fix 2

- Adding support to select which subworkflows to skip in a pipeline run (`--steps`).

### Parameters

| Old Parameter | New Parameter |
| ------------- | ------------- |
| - | --juicer |
| - | --steps |

### Software dependencies

Expand Down
6 changes: 5 additions & 1 deletion CITATIONS.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@

> Durand, N.C. et al. 2016. ‘Juicer provides a one-click system for analyzing loop-resolution hi-C experiments’, Cell Systems, 3(1), pp. 95–98. doi:10.1016/j.cels.2016.07.002.
- [Merqury_FK](https://github.com/thegenemyers/MERQURY.FK)

> Myers, G., Rhie, A. (2024). MerquryFK & KatFK. [online]. https://github.com/thegenemyers/MERQURY.FK. (Accessed on 20 September 2024).
- [Minimap2](https://pubmed.ncbi.nlm.nih.gov/34623391/)

> Li, H. 2021. ‘New strategies to improve MINIMAP2 alignment accuracy’, Bioinformatics, 37(23), pp. 4572–4574. doi:10.1093/bioinformatics/btab705.
Expand Down Expand Up @@ -72,7 +76,7 @@
- [Samtools](https://pubmed.ncbi.nlm.nih.gov/33590861/)

> Di Tommaso, Paolo, et al. 2017. “Nextflow Enables Reproducible Computational Workflows.” Nature Biotechnology, 35(4), pp. 316–19, https://doi.org/10.1038/nbt.3820.
> Danecek P, Bonfield JK, Liddle J, Marshall J, Ohan V, Pollard MO, Whitwham A, Keane T, McCarthy SA, Davies RM, Li H. Twelve years of SAMtools and BCFtools. Gigascience. 2021 Feb 16;10(2):giab008. doi: 10.1093/gigascience/giab008. PMID: 33590861; PMCID: PMC7931819.
- [SeqTK](https://github.com/lh3/seqtk)

Expand Down
1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ params {
input = null
outdir = "./results"
juicer = false
steps = "NONE"
tracedir = "${params.outdir}/pipeline_info"
publish_dir_mode = 'copy'
email = null
Expand Down
5 changes: 5 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@
"default": false,
"fa_icon": "fas fa-check"
},
"steps": {
"type": "string",
"description": "A comma-separated list of subworkflow steps to skip, e.g. synteny,busco (NONE skips nothing)",
"fa_icon": "fas fa-folder-open"
},
"email": {
"type": "string",
"description": "Email address for completion summary.",
Expand Down
198 changes: 117 additions & 81 deletions workflows/treeval.nf
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,14 @@ workflow TREEVAL {
//
ch_versions = Channel.empty()

//
// LOGIC: PARSE --steps INTO THE LIST OF SUBWORKFLOWS TO SKIP
//        --steps is a comma-separated string; whitespace around each name is ignored.
//        An unset/empty value falls back to ["NONE"] (a list, so the membership checks
//        below and further down behave the same for both branches).
//
exclude_workflow_steps = params.steps ? params.steps.split(",").collect { it.trim() } : ["NONE"]

// Every name that may legally appear in --steps; "NONE" is the skip-nothing sentinel
full_list = ["insilico_digest", "gene_alignments", "repeat_density", "gap_finder", "selfcomp", "synteny", "read_coverage", "telo_finder", "busco", "kmer", "hic_mapping", "NONE"]

// Fail fast on unknown names so a typo does not silently run a step the user meant to skip
if (!full_list.containsAll(exclude_workflow_steps)) {
    exit 1, "There is an extra argument given on Command Line: \n Check contents of --steps: $exclude_workflow_steps\nMaster list is: $full_list"
}

params.entry = 'FULL'
input_ch = Channel.fromPath(params.input, checkIfExists: true)

Expand Down Expand Up @@ -111,15 +119,17 @@ workflow TREEVAL {
// SUBWORKFLOW: Takes reference, channel of enzymes, my.genome, assembly_id and as file to generate
// file with enzymatic digest sites.
//
ch_enzyme = Channel.of( "bspq1","bsss1","DLE1" )
if ( !exclude_workflow_steps.contains("insilico_digest")) {
ch_enzyme = Channel.of( "bspq1","bsss1","DLE1" )

INSILICO_DIGEST (
GENERATE_GENOME.out.dot_genome,
YAML_INPUT.out.reference_ch,
ch_enzyme,
digest_asfile
)
ch_versions = ch_versions.mix( INSILICO_DIGEST.out.versions )
INSILICO_DIGEST (
GENERATE_GENOME.out.dot_genome,
YAML_INPUT.out.reference_ch,
ch_enzyme,
digest_asfile
)
ch_versions = ch_versions.mix( INSILICO_DIGEST.out.versions )
}

//
// SUBWORKFLOW: FOR SPLITTING THE REF GENOME INTO SCAFFOLD CHUNKS AND RUNNING SOME SUBWORKFLOWS
Expand All @@ -135,115 +145,141 @@ workflow TREEVAL {
//
// SUBWORKFLOW: Takes input fasta to generate BB files containing alignment data
//
GENE_ALIGNMENT (
GENERATE_GENOME.out.dot_genome,
YAML_INPUT.out.reference_ch,
GENERATE_GENOME.out.ref_index,
YAML_INPUT.out.align_data_dir,
YAML_INPUT.out.align_geneset,
YAML_INPUT.out.align_common,
YAML_INPUT.out.intron_size,
gene_alignment_asfiles
)
ch_versions = ch_versions.mix(GENE_ALIGNMENT.out.versions)
// The skip key must be "gene_alignments": that is the only spelling accepted by the
// --steps validation list (full_list), so any other key here could never match.
if ( !exclude_workflow_steps.contains("gene_alignments")) {
    GENE_ALIGNMENT (
        GENERATE_GENOME.out.dot_genome,
        YAML_INPUT.out.reference_ch,
        GENERATE_GENOME.out.ref_index,
        YAML_INPUT.out.align_data_dir,
        YAML_INPUT.out.align_geneset,
        YAML_INPUT.out.align_common,
        YAML_INPUT.out.intron_size,
        gene_alignment_asfiles
    )
    ch_versions = ch_versions.mix(GENE_ALIGNMENT.out.versions)
}

//
// SUBWORKFLOW: GENERATES A BIGWIG FOR A REPEAT DENSITY TRACK
//
REPEAT_DENSITY (
YAML_INPUT.out.reference_ch,
GENERATE_GENOME.out.dot_genome
)
ch_versions = ch_versions.mix( REPEAT_DENSITY.out.versions )
if ( !exclude_workflow_steps.contains("repeat_density")) {
REPEAT_DENSITY (
YAML_INPUT.out.reference_ch,
GENERATE_GENOME.out.dot_genome
)
ch_versions = ch_versions.mix( REPEAT_DENSITY.out.versions )
}

//
// SUBWORKFLOW: GENERATES A GAP.BED FILE TO ID THE LOCATIONS OF GAPS
//
GAP_FINDER (
YAML_INPUT.out.reference_ch
)
ch_versions = ch_versions.mix( GAP_FINDER.out.versions )
if ( !exclude_workflow_steps.contains("gap_finder")) {
GAP_FINDER (
YAML_INPUT.out.reference_ch
)
ch_versions = ch_versions.mix( GAP_FINDER.out.versions )
}

//
// SUBWORKFLOW: Takes reference file, .genome file, mummer variables, motif length variable and as
// file to generate a file containing sites of self-complementary sequence.
//
SELFCOMP (
YAML_INPUT.out.reference_ch,
GENERATE_GENOME.out.dot_genome,
YAML_INPUT.out.mummer_chunk,
YAML_INPUT.out.motif_len,
selfcomp_asfile
)
ch_versions = ch_versions.mix( SELFCOMP.out.versions )
if ( !exclude_workflow_steps.contains("selfcomp")) {
SELFCOMP (
YAML_INPUT.out.reference_ch,
GENERATE_GENOME.out.dot_genome,
YAML_INPUT.out.mummer_chunk,
YAML_INPUT.out.motif_len,
selfcomp_asfile
)
ch_versions = ch_versions.mix( SELFCOMP.out.versions )
}

//
// SUBWORKFLOW: Takes reference, the directory of syntenic genomes and order/clade of sequence
// and generates a file of syntenic blocks.
//
SYNTENY (
YAML_INPUT.out.reference_ch,
YAML_INPUT.out.synteny_path
)
ch_versions = ch_versions.mix( SYNTENY.out.versions )
if ( !exclude_workflow_steps.contains("synteny")) {
SYNTENY (
YAML_INPUT.out.reference_ch,
YAML_INPUT.out.synteny_path
)
ch_versions = ch_versions.mix( SYNTENY.out.versions )
}

//
// SUBWORKFLOW: Takes reference, pacbio reads
//
READ_COVERAGE (
YAML_INPUT.out.reference_ch,
GENERATE_GENOME.out.dot_genome,
YAML_INPUT.out.read_ch
)
ch_versions = ch_versions.mix( READ_COVERAGE.out.versions )
if ( !exclude_workflow_steps.contains("read_coverage")) {
READ_COVERAGE (
YAML_INPUT.out.reference_ch,
GENERATE_GENOME.out.dot_genome,
YAML_INPUT.out.read_ch
)
coverage_report = READ_COVERAGE.out.ch_reporting
ch_versions = ch_versions.mix(READ_COVERAGE.out.versions)
} else {
coverage_report = []
}

//
// SUBWORKFLOW: GENERATE TELOMERE WINDOW FILES WITH PACBIO READS AND REFERENCE
//
TELO_FINDER ( YAML_INPUT.out.reference_ch,
YAML_INPUT.out.teloseq
)
ch_versions = ch_versions.mix( TELO_FINDER.out.versions )
if ( !exclude_workflow_steps.contains("telo_finder")) {
TELO_FINDER ( YAML_INPUT.out.reference_ch,
YAML_INPUT.out.teloseq
)
ch_versions = ch_versions.mix( TELO_FINDER.out.versions )
}

//
// SUBWORKFLOW: GENERATE BUSCO ANNOTATION FOR ANCESTRAL UNITS
//
BUSCO_ANNOTATION (
GENERATE_GENOME.out.dot_genome,
YAML_INPUT.out.reference_ch,
YAML_INPUT.out.lineageinfo,
YAML_INPUT.out.lineagespath,
buscogene_asfile,
ancestral_table
)
ch_versions = ch_versions.mix( BUSCO_ANNOTATION.out.versions )
if ( !exclude_workflow_steps.contains("busco")) {
BUSCO_ANNOTATION (
GENERATE_GENOME.out.dot_genome,
YAML_INPUT.out.reference_ch,
YAML_INPUT.out.lineageinfo,
YAML_INPUT.out.lineagespath,
buscogene_asfile,
ancestral_table
)
ch_versions = ch_versions.mix( BUSCO_ANNOTATION.out.versions )
}

//
// SUBWORKFLOW: Takes reads and assembly, produces kmer plot
//
KMER (
YAML_INPUT.out.reference_ch,
YAML_INPUT.out.read_ch
)
ch_versions = ch_versions.mix( KMER.out.versions )
if ( !exclude_workflow_steps.contains("kmer")) {
KMER (
YAML_INPUT.out.reference_ch,
YAML_INPUT.out.read_ch
)
ch_versions = ch_versions.mix( KMER.out.versions )
}

//
// SUBWORKFLOW: GENERATE HIC MAPPING TO GENERATE PRETEXT FILES AND JUICEBOX
//
HIC_MAPPING (
YAML_INPUT.out.reference_ch,
GENERATE_GENOME.out.ref_index,
GENERATE_GENOME.out.dot_genome,
YAML_INPUT.out.hic_reads_ch,
YAML_INPUT.out.assembly_id,
GAP_FINDER.out.gap_file,
READ_COVERAGE.out.ch_covbw_nor,
READ_COVERAGE.out.ch_covbw_avg,
TELO_FINDER.out.bedgraph_file,
REPEAT_DENSITY.out.repeat_density,
params.entry
)
ch_versions = ch_versions.mix( HIC_MAPPING.out.versions )
// Run HIC_MAPPING unless "hic_mapping" was listed in --steps.
// NOTE(review): this call reads outputs of GAP_FINDER, READ_COVERAGE, TELO_FINDER and
// REPEAT_DENSITY, each of which is itself only invoked when its own step is not excluded.
// Skipping any of those while still running hic_mapping presumably fails on the
// un-invoked subworkflow's `.out` - confirm, and either validate the combination
// up front or supply fallback channels.
if ( !exclude_workflow_steps.contains("hic_mapping")) {
HIC_MAPPING (
YAML_INPUT.out.reference_ch,
GENERATE_GENOME.out.ref_index,
GENERATE_GENOME.out.dot_genome,
YAML_INPUT.out.hic_reads_ch,
YAML_INPUT.out.assembly_id,
GAP_FINDER.out.gap_file,
READ_COVERAGE.out.ch_covbw_nor,
READ_COVERAGE.out.ch_covbw_avg,
TELO_FINDER.out.bedgraph_file,
REPEAT_DENSITY.out.repeat_density,
params.entry
)
ch_versions = ch_versions.mix( HIC_MAPPING.out.versions )
// Reporting data consumed by the reporting channel built later in the workflow
hic_report = HIC_MAPPING.out.ch_reporting
} else {
// Step skipped: hand the reporting stage an empty placeholder instead
hic_report = []
}

//
// SUBWORKFLOW: Collates version data from prior subworkflows
Expand All @@ -256,8 +292,8 @@ workflow TREEVAL {
// LOGIC: GENERATE SOME CHANNELS FOR REPORTING
//
YAML_INPUT.out.reference_ch
.combine( READ_COVERAGE.out.ch_reporting )
.combine( HIC_MAPPING.out.ch_reporting )
.combine( coverage_report )
.combine( hic_report )
.combine( CUSTOM_DUMPSOFTWAREVERSIONS.out.versions )
.map { meta, reference, read_meta, read_files, hic_meta, hic_files, custom_file -> [
rf_data: tuple(
Expand Down
Loading

0 comments on commit 52f5bf9

Please sign in to comment.