From 41eefc4d9f968e6fad6a9086ce06d6e61649d2bc Mon Sep 17 00:00:00 2001
From: DLBPointon
Date: Tue, 21 Nov 2023 11:56:32 +0000
Subject: [PATCH] Updating channels and assignments, removing unused channels

---
 subworkflows/local/busco_annotation.nf  |  1 -
 subworkflows/local/gene_alignment.nf    |  7 +-
 subworkflows/local/generate_genome.nf   | 17 +----
 subworkflows/local/insilico_digest.nf   |  1 -
 subworkflows/local/kmer.nf              | 18 +++---
 subworkflows/local/longread_coverage.nf | 56 ++++++++--------
 subworkflows/local/synteny.nf           |  9 ++-
 subworkflows/local/yaml_input.nf        | 85 +++++++++++++++++--------
 8 files changed, 108 insertions(+), 86 deletions(-)

diff --git a/subworkflows/local/busco_annotation.nf b/subworkflows/local/busco_annotation.nf
index 09dbfe42..8e20abb0 100755
--- a/subworkflows/local/busco_annotation.nf
+++ b/subworkflows/local/busco_annotation.nf
@@ -23,7 +23,6 @@ workflow BUSCO_ANNOTATION {
     take:
     dot_genome          // channel: [val(meta), [ datafile ]]
     reference_tuple     // channel: [val(meta), [ datafile ]]
-    assembly_classT     // channel: val(class)
    lineageinfo         // channel: val(lineage_db)
     lineagespath        // channel: val(/path/to/buscoDB)
     buscogene_as        // channel: val(dot_as location)
diff --git a/subworkflows/local/gene_alignment.nf b/subworkflows/local/gene_alignment.nf
index 0d081af8..e59e5eef 100755
--- a/subworkflows/local/gene_alignment.nf
+++ b/subworkflows/local/gene_alignment.nf
@@ -19,7 +19,6 @@ workflow GENE_ALIGNMENT {
     reference_tuple     // Channel [ val(meta), path(file) ]
     reference_index     // Channel [ val(meta), path(file) ]
     max_scaff_size      // Channel val(size of largest scaffold in bp)
-    assembly_classT     // Channel val(clade_id)
     alignment_datadir   // Channel val(geneset_dir)
     alignment_genesets  // Channel val(geneset_id)
     alignment_common    // Channel val(common_name) // Not yet in use
@@ -46,10 +45,10 @@
     //
     ch_data
         .combine( alignment_datadir )
-        .combine( assembly_classT )
+        .combine( reference_tuple )
         .map {
-            ch_org, data_dir, classT ->
-            file("${data_dir}${classT}/csv_data/${ch_org}-data.csv")
+            ch_org, data_dir, meta, ref ->
+            file("${data_dir}${meta.class}/csv_data/${ch_org}-data.csv")
         }
         .splitCsv( header: true, sep:',')
         .map( row ->
diff --git a/subworkflows/local/generate_genome.nf b/subworkflows/local/generate_genome.nf
index 60791a40..d88776a8 100755
--- a/subworkflows/local/generate_genome.nf
+++ b/subworkflows/local/generate_genome.nf
@@ -9,29 +9,17 @@ include { GET_LARGEST_SCAFF } from '../../modules/local/get_largest_scaff'

 workflow GENERATE_GENOME {
     take:
-    assembly_id         // Channel val(assembly_id)
-    reference_file      // Channel path(file)
+    reference_ch        // tuple( [id: name], file )

     main:
     ch_versions     = Channel.empty()

-    //
-    // LOGIC: GENERATES A REFERENCE DATA TUPLE
-    //
-    reference_file
-        .combine( assembly_id )
-        .map { file, sample_id ->
-            tuple ( [id: sample_id],
-                    file)
-        }
-        .set { to_chromsize }
-
     //
     // MODULE: GENERATE INDEX OF REFERENCE
     //         EMITS REFERENCE INDEX FILE MODIFIED FOR SCAFF SIZES
     //
     CUSTOM_GETCHROMSIZES (
-        to_chromsize,
+        reference_ch,
         "temp.genome"
     )
     ch_versions     = ch_versions.mix( CUSTOM_GETCHROMSIZES.out.versions )
@@ -56,6 +44,5 @@
     max_scaff_size  = GET_LARGEST_SCAFF.out.scaff_size.toInteger()
     dot_genome      = GNU_SORT.out.sorted
     ref_index       = CUSTOM_GETCHROMSIZES.out.fai
-    reference_tuple = to_chromsize
     versions        = ch_versions.ifEmpty(null)
 }
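GENERATE_GENOME now receives the [ meta, file ] tuple ready-made instead of zipping two value channels itself, so each caller builds the reference channel up front. A minimal sketch of the new calling convention, assuming hypothetical id, class and path values (none of these names appear in the patch):

    include { GENERATE_GENOME } from './subworkflows/local/generate_genome'

    workflow {
        // Hypothetical meta map and assembly path, for illustration only.
        ch_reference = Channel.of(
            [ [ id: 'sample_1', class: 'insecta' ], file( './assembly.fa' ) ]
        )

        GENERATE_GENOME ( ch_reference )
    }

Building the tuple once upstream means every subworkflow that takes reference_tuple sees the same meta map, rather than re-deriving ids from separate value channels.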
diff --git a/subworkflows/local/insilico_digest.nf b/subworkflows/local/insilico_digest.nf
index 2e2989dc..737a52d6 100755
--- a/subworkflows/local/insilico_digest.nf
+++ b/subworkflows/local/insilico_digest.nf
@@ -14,7 +14,6 @@ include { UCSC_BEDTOBIGBED } from '../../modules/nf-core/ucsc/bedto

 workflow INSILICO_DIGEST {
     take:
-    myid        // Channel val(sample_id)
     sizefile    // Channel [ val(meta), path(my.genome_file) ]
     sample      // Channel [ val(meta), path(reference_file) ]
     ch_enzyme   // Channel val( "bspq1","bsss1","DLE1" )
diff --git a/subworkflows/local/kmer.nf b/subworkflows/local/kmer.nf
index 2ad695e7..7d8800d0 100755
--- a/subworkflows/local/kmer.nf
+++ b/subworkflows/local/kmer.nf
@@ -16,8 +16,8 @@ include { MERQURYFK_MERQURYFK } from '../../modules/nf-core/merquryfk/merquryfk/

 workflow KMER {
     take:
-    reference_tuple     // Channel [ val(meta), path(file) ]
-    reads_path          // Channel: [ val(meta), val( str ) ]
+    reference_tuple     // Channel: [ val( meta ), path( file ) ]
+    reads_path          // Channel: [ val( meta ), val( str ) ]

     main:
     ch_versions = Channel.empty()
@@ -52,7 +52,7 @@
     //
     CAT_CAT.out.file_out
         .map{ meta, reads ->
-            reads.getName().endsWith('gz') ? [meta, reads.getParent().toString() + '/' + reads.getBaseName().toString() + '.fa.gz'] : [meta, reads.getParent().toString() + '/' + reads.getBaseName().toString() + '.fa']
+            reads.getName().endsWith('gz') ? [ meta, reads.getParent().toString() + '/' + reads.getBaseName().toString() + '.fa.gz' ] : [ meta, reads.getParent().toString() + '/' + reads.getBaseName().toString() + '.fa' ]
         }
         .set{ ch_reads_merged }
@@ -60,23 +60,23 @@
     // LOGIC: PREPARE FASTK INPUT
     //
     CAT_CAT.out.file_out
-        .join(ch_reads_merged)
+        .join( ch_reads_merged )
         .map{ meta, reads_old, reads_new ->
-            reads_old.renameTo(reads_new);
+            reads_old.renameTo( reads_new );
         }
     //
     // MODULE: COUNT KMERS
     //
     FASTK_FASTK( ch_reads_merged )
-    ch_versions = ch_versions.mix(FASTK_FASTK.out.versions.first())
+    ch_versions = ch_versions.mix( FASTK_FASTK.out.versions.first() )

     //
     // LOGIC: PREPARE MERQURYFK INPUT
     //
     FASTK_FASTK.out.hist
-        .combine(FASTK_FASTK.out.ktab)
-        .combine(reference_tuple)
+        .combine( FASTK_FASTK.out.ktab )
+        .combine( reference_tuple )
         .map{ meta_hist, hist, meta_ktab, ktab, meta_ref, primary ->
            tuple( meta_hist, hist, ktab, primary, [])
         }
         .set{ ch_merq }
@@ -86,7 +86,7 @@
     // MODULE: USE KMER HISTOGRAM TO PRODUCE SPECTRA
     //
     MERQURYFK_MERQURYFK ( ch_merq )
-    ch_versions = ch_versions.mix(MERQURYFK_MERQURYFK.out.versions.first())
+    ch_versions = ch_versions.mix( MERQURYFK_MERQURYFK.out.versions.first() )

     emit:
     merquryk_completeness = MERQURYFK_MERQURYFK.out.stats    // meta, stats
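The extension-swapping ternary in KMER is easiest to sanity-check in isolation. A standalone sketch of the same mapping on hypothetical file names; note that getBaseName() strips only the final extension, so a .fastq.gz input maps to .fastq.fa.gz:

    workflow {
        Channel
            .of( [ [ id: 'sample_1' ], file( './merged.fastq.gz' ) ],
                 [ [ id: 'sample_2' ], file( './merged.fastq' ) ] )
            .map { meta, reads ->
                // Keep the parent directory, append .fa to the base name,
                // and preserve .gz when the input was compressed.
                reads.getName().endsWith('gz')
                    ? [ meta, reads.getParent().toString() + '/' + reads.getBaseName().toString() + '.fa.gz' ]
                    : [ meta, reads.getParent().toString() + '/' + reads.getBaseName().toString() + '.fa' ]
            }
            .view()   // sample_1 -> .../merged.fastq.fa.gz, sample_2 -> .../merged.fa
    }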
diff --git a/subworkflows/local/longread_coverage.nf b/subworkflows/local/longread_coverage.nf
index bb0fd718..23a0b118 100755
--- a/subworkflows/local/longread_coverage.nf
+++ b/subworkflows/local/longread_coverage.nf
@@ -39,7 +39,7 @@ workflow LONGREAD_COVERAGE {
     MINIMAP2_INDEX(
         reference_tuple
     )
-    ch_versions = ch_versions.mix( MINIMAP2_INDEX.out.versions )
+    ch_versions     = ch_versions.mix( MINIMAP2_INDEX.out.versions )
     //
     // LOGIC: PREPARE GET_READS_FROM_DIRECTORY INPUT
     //
@@ -58,7 +58,7 @@
     //
     // MODULE: GETS PACBIO READ PATHS FROM READS_PATH
     //
-    ch_grabbed_read_paths = GrabFiles( get_reads_input )
+    ch_grabbed_read_paths   = GrabFiles( get_reads_input )

     //
     // LOGIC: PACBIO READS FILES TO CHANNEL
@@ -126,8 +126,8 @@
         small.bool_cigar_paf,
         small.bool_cigar_bam
     )
-    ch_versions = ch_versions.mix(MINIMAP2_ALIGN.out.versions)
-    ch_align_bams = MINIMAP2_ALIGN.out.bam
+    ch_versions     = ch_versions.mix( MINIMAP2_ALIGN.out.versions )
+    ch_align_bams   = MINIMAP2_ALIGN.out.bam

     //
     // MODULE: ALIGN READS TO REFERENCE WHEN REFERENCE >5GB PER SCAFFOLD
@@ -139,7 +139,7 @@
         large.bool_cigar_paf,
         large.bool_cigar_bam
     )
-    ch_versions = ch_versions.mix(MINIMAP2_ALIGN_SPLIT.out.versions)
+    ch_versions     = ch_versions.mix( MINIMAP2_ALIGN_SPLIT.out.versions )

     //
     // LOGIC: COLLECT OUTPUTTED BAM FILES FROM BOTH PROCESSES
@@ -172,7 +172,7 @@
         reference_tuple,
         [[],[]]
     )
-    ch_versions = ch_versions.mix(SAMTOOLS_MERGE.out.versions)
+    ch_versions     = ch_versions.mix( SAMTOOLS_MERGE.out.versions )
     //
     // MODULE: SORT THE MERGED BAM BEFORE CONVERSION
     //
@@ -180,7 +180,7 @@
     SAMTOOLS_SORT (
         SAMTOOLS_MERGE.out.bam
     )
-    ch_versions = ch_versions.mix( SAMTOOLS_MERGE.out.versions )
+    ch_versions     = ch_versions.mix( SAMTOOLS_SORT.out.versions )
     //
     // LOGIC: PREPARING MERGE INPUT WITH REFERENCE GENOME AND REFERENCE INDEX
     //
@@ -209,7 +209,7 @@
         view_input.ref_input,
         []
     )
-    ch_versions = ch_versions.mix(SAMTOOLS_VIEW.out.versions)
+    ch_versions     = ch_versions.mix( SAMTOOLS_VIEW.out.versions )
     //
     // MODULE: BAM TO PRIMARY BED
     //
@@ -217,7 +217,7 @@
     BEDTOOLS_BAMTOBED(
         SAMTOOLS_VIEW.out.bam
     )
-    ch_versions = ch_versions.mix(BEDTOOLS_BAMTOBED.out.versions)
+    ch_versions     = ch_versions.mix( BEDTOOLS_BAMTOBED.out.versions )
     //
     // LOGIC: PREPARING Genome2Cov INPUT
     //
@@ -244,7 +244,7 @@
         genomecov_input.dot_genome,
         genomecov_input.file_suffix
     )
-    ch_versions = ch_versions.mix(BEDTOOLS_GENOMECOV.out.versions)
+    ch_versions     = ch_versions.mix( BEDTOOLS_GENOMECOV.out.versions )
     //
     // MODULE: SORT THE PRIMARY BED FILE
     //
@@ -252,7 +252,7 @@
     GNU_SORT(
         BEDTOOLS_GENOMECOV.out.genomecov
     )
-    ch_versions = ch_versions.mix(GNU_SORT.out.versions)
+    ch_versions     = ch_versions.mix( GNU_SORT.out.versions )
     //
     // MODULE: get_minmax_punches
     //
@@ -260,7 +260,7 @@
     GETMINMAXPUNCHES(
         GNU_SORT.out.sorted
     )
-    ch_versions = ch_versions.mix(GETMINMAXPUNCHES.out.versions)
+    ch_versions     = ch_versions.mix( GETMINMAXPUNCHES.out.versions )
     //
     // MODULE: get_minmax_punches
     //
@@ -268,7 +268,7 @@
     BEDTOOLS_MERGE_MAX(
         GETMINMAXPUNCHES.out.max
     )
-    ch_versions = ch_versions.mix(BEDTOOLS_MERGE_MAX.out.versions)
+    ch_versions     = ch_versions.mix( BEDTOOLS_MERGE_MAX.out.versions )
     //
     // MODULE: get_minmax_punches
     //
@@ -276,7 +276,7 @@
     BEDTOOLS_MERGE_MIN(
         GETMINMAXPUNCHES.out.min
     )
-    ch_versions = ch_versions.mix(BEDTOOLS_MERGE_MIN.out.versions)
+    ch_versions     = ch_versions.mix( BEDTOOLS_MERGE_MIN.out.versions )
     //
     // MODULE: GENERATE DEPTHGRAPH
     //
@@ -284,8 +284,8 @@
     GRAPHOVERALLCOVERAGE(
         GNU_SORT.out.sorted
     )
-    ch_versions = ch_versions.mix(GRAPHOVERALLCOVERAGE.out.versions)
-    ch_depthgraph = GRAPHOVERALLCOVERAGE.out.part
+    ch_versions     = ch_versions.mix( GRAPHOVERALLCOVERAGE.out.versions )
+    ch_depthgraph   = GRAPHOVERALLCOVERAGE.out.part
     //
     // LOGIC: PREPARING FINDHALFCOVERAGE INPUT
     //
@@ -308,7 +308,7 @@
         halfcov_input.genome_file,
         halfcov_input.depthgraph_file
     )
-    ch_versions = ch_versions.mix(FINDHALFCOVERAGE.out.versions)
+    ch_versions     = ch_versions.mix( FINDHALFCOVERAGE.out.versions )
     //
     // LOGIC: PREPARING NORMAL COVERAGE INPUT
     //
@@ -329,7 +329,7 @@
         bed2bw_normal_input.ch_coverage_bed,
         bed2bw_normal_input.genome_file
     )
-    ch_versions = ch_versions.mix(BED2BW_NORMAL.out.versions)
+    ch_versions     = ch_versions.mix( BED2BW_NORMAL.out.versions )
     //
     // MODULE: CONVERT COVERAGE TO LOG2
     //
@@ -337,7 +337,7 @@
     LONGREADCOVERAGESCALELOG2(
         GNU_SORT.out.sorted
     )
-    ch_versions = ch_versions.mix(LONGREADCOVERAGESCALELOG2.out.versions)
+    ch_versions     = ch_versions.mix( LONGREADCOVERAGESCALELOG2.out.versions )
     //
     // LOGIC: PREPARING LOG2 COVERAGE INPUT
    //
@@ -358,13 +358,11 @@
         bed2bw_log2_input.ch_coverage_bed,
         bed2bw_log2_input.genome_file
     )
-    ch_versions = ch_versions.mix(BED2BW_LOG2.out.versions)
+    ch_versions     = ch_versions.mix( BED2BW_LOG2.out.versions )

     //
     // LOGIC: GENERATE A SUMMARY TUPLE FOR OUTPUT
     //
-    ch_grabbed_read_paths.map{ it }
-
     ch_grabbed_read_paths
         .collect()
         .map { meta, fasta ->
             tuple( [ id: 'pacbio',
                      single_end: true],
                      fasta.toList() )
         }
         .set { ch_reporting_pacbio }

     emit:
-    ch_minbed = BEDTOOLS_MERGE_MIN.out.bed
-    ch_halfbed = FINDHALFCOVERAGE.out.bed
-    ch_maxbed = BEDTOOLS_MERGE_MAX.out.bed
-    ch_reporting = ch_reporting_pacbio.collect()
-    ch_covbw_nor = BED2BW_NORMAL.out.bigwig
-    ch_covbw_log = BED2BW_LOG2.out.bigwig
-    versions = ch_versions
+    ch_minbed       = BEDTOOLS_MERGE_MIN.out.bed
+    ch_halfbed      = FINDHALFCOVERAGE.out.bed
+    ch_maxbed       = BEDTOOLS_MERGE_MAX.out.bed
+    ch_reporting    = ch_reporting_pacbio.collect()
+    ch_covbw_nor    = BED2BW_NORMAL.out.bigwig
+    ch_covbw_log    = BED2BW_LOG2.out.bigwig
+    versions        = ch_versions
 }

 process GrabFiles {
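The only functional change in this file is dropping the unassigned ch_grabbed_read_paths.map{ it } line: .map{ it } is an identity transform, and its result was never consumed, so the line did no work. A minimal sketch with hypothetical values showing the equivalence:

    workflow {
        ch_paths = Channel.of( 'a.fasta.gz', 'b.fasta.gz' )   // hypothetical read paths
        ch_paths.map { it }.view { "mapped:   ${it}" }        // identity transform
        ch_paths.view { "original: ${it}" }                   // same elements either way
    }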
diff --git a/subworkflows/local/synteny.nf b/subworkflows/local/synteny.nf
index 3c631887..2c228821 100755
--- a/subworkflows/local/synteny.nf
+++ b/subworkflows/local/synteny.nf
@@ -10,7 +10,6 @@ workflow SYNTENY {
     take:
     reference_tuple     // Channel [ val(meta), path(file) ]
     synteny_path        // Channel val(meta)
-    assembly_classT     // Channel val(meta)

     main:
     ch_versions = Channel.empty()
@@ -19,9 +18,15 @@
     // MODULE: SEARCHES PREDETERMINED PATH FOR SYNTENIC GENOME FILES BASED ON CLASS
     //         EMITS PATH LIST
     //
+    reference_tuple
+        .map{ meta, file ->
+            meta.class
+        }
+        .set { defined_class }
+
     GET_SYNTENY_GENOMES(
         synteny_path,
-        assembly_classT
+        defined_class
     )
     ch_versions = ch_versions.mix( GET_SYNTENY_GENOMES.out.versions )
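With the class stored on the meta map, SYNTENY derives the lookup key locally instead of taking a dedicated assembly_classT channel. The same pattern in a self-contained sketch (meta values are hypothetical); note that for Groovy maps, meta.class is plain key access, not getClass():

    workflow {
        Channel
            .of( [ [ id: 'sample_1_1', class: 'lepidoptera' ],   // hypothetical meta
                   file( './assembly.fa' ) ] )
            .map { meta, file -> meta.class }                    // keep only the clade/class field
            .set { defined_class }

        defined_class.view()   // prints: lepidoptera
    }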
diff --git a/subworkflows/local/yaml_input.nf b/subworkflows/local/yaml_input.nf
index 50101f8d..6bfb50a0 100755
--- a/subworkflows/local/yaml_input.nf
+++ b/subworkflows/local/yaml_input.nf
@@ -37,21 +37,22 @@ workflow YAML_INPUT {
     group
         .assembly
         .multiMap { data ->
-            level:          data.level
+            level:          data.assem_level
+            asmVersion:     data.assem_version
             sample_id:      data.sample_id
-            classT:         data.classT
-            asmVersion:     data.asmVersion
-            dbVersion:      data.dbVersion
-            gevalType:      data.gevalType
+            latin_name:     data.latin_name
+            defined_class:  data.defined_class
+            project_id:     data.project_id
         }
         .set { assembly_data }

     group
         .assembly_reads
         .multiMap { data ->
-            pacbio:         data.pacbio
-            hic:            data.hic
-            supplement:     data.supplementary
+            longread_type:  data.longread_type
+            longread_data:  data.longread_data
+            hic:            data.hic_data
+            supplement:     data.supplementary_data
         }
         .set { assem_reads }
@@ -60,7 +61,7 @@
         .multiMap { data ->
             data_dir:       data.data_dir
             common_name:    data.common_name
-            geneset:        data.geneset
+            geneset:        data.geneset_id
         }
         .set{ alignment_data }
@@ -105,24 +106,58 @@
     //
     // LOGIC: COMBINE SOME CHANNELS INTO VALUES REQUIRED DOWNSTREAM
     //
     assembly_data.sample_id
         .combine( assembly_data.asmVersion )
-        .map { it1, it2 ->
-            ("${it1}_${it2}")}
-        .set { tolid_version}
+        .combine( group.reference )
+        .combine( assembly_data.defined_class )
+        .combine( assembly_data.project_id )
+        .map { it1, it2, ref_file, defined_class, project ->
+            tuple( [ id:           "${it1}_${it2}",
+                     class:        defined_class,
+                     project_type: project
+                   ],
+                   ref_file
+            )
+        }
+        .set { ref_ch }
+
+    assembly_data.sample_id
+        .combine( assem_reads.longread_type )
+        .combine( assem_reads.longread_data )
+        .map{ sample, type, data ->
+            tuple( [ id            : sample,
+                     single_end    : true,
+                     longread_type : type
+                   ],
+                   data
+            )
+        }
+        .set { longread_ch }
+
+    assembly_data.sample_id
+        .combine( assem_reads.hic )
+        .map { sample, data ->
+            tuple( [ id: sample ],
+                   data
+            )
+        }
+        .set { hic_ch }
+
+    assembly_data.sample_id
+        .combine( assem_reads.supplement )
+        .map { sample, data ->
+            tuple( [ id: sample ],
+                   data
+            )
+        }
+        .set { supplement_ch }

     emit:
-    assembly_id     = tolid_version
-    assembly_classT = assembly_data.classT
-    assembly_level  = assembly_data.level
-    assembly_asmVer = assembly_data.asmVersion
-    assembly_dbVer  = assembly_data.dbVersion
-    assembly_ttype  = assembly_data.gevalType
-
-    pacbio_reads    = assem_reads.pacbio
-    hic_reads       = assem_reads.hic
-    supp_reads      = assem_reads.supplement
-
-    reference       = group.reference
+    reference       = ref_ch
+    assembly_level  = assembly_data.level
+
+    longreads       = longread_ch
+    hic_reads       = hic_ch
+    supp_reads      = supplement_ch

     align_data_dir  = alignment_data.data_dir
     align_geneset   = alignment_data.geneset
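The reworked YAML_INPUT collapses several value channels into tuple channels keyed by a meta map. A sketch of the shapes the new reference and longreads emissions carry, with hypothetical values throughout (the 'hifi' read type and 'genomic' project are illustrative only, not values taken from the patch):

    workflow {
        // reference: one [ meta, file ] tuple replaces the old assembly_id,
        // assembly_classT and bare reference value channels.
        Channel
            .of( [ [ id: 'sample_1_1', class: 'insecta', project_type: 'genomic' ],
                   file( './assembly.fa' ) ] )
            .view()

        // longreads: the meta now records single_end and the read type, so
        // downstream aligners can switch options on meta.longread_type alone.
        Channel
            .of( [ [ id: 'sample_1', single_end: true, longread_type: 'hifi' ],
                   file( './pacbio/' ) ] )
            .view()
    }

Carrying these fields in the meta map, rather than as parallel value channels, keeps the id, class and project bound to the data they describe as tuples flow through combine, join and map operations.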