diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index dc3a1d64..7140b7b6 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -29,8 +29,11 @@ jobs:
           - "22.10.1"
           - "latest-everything"
     steps:
-      - name: Check out pipeline code
-        uses: actions/checkout@v3
+      - name: Get branch names
+        # Pulls the names of the current branches in the repo
+        # steps.branch-names.outputs.current_branch is used later and returns the name of the branch the PR is made FROM, not the branch it targets
+        id: branch-names
+        uses: tj-actions/branch-names@v8

       - name: Install Nextflow
         uses: nf-core/setup-nextflow@v1
@@ -45,6 +48,23 @@ jobs:
           mkdir -p $NXF_SINGULARITY_CACHEDIR
           mkdir -p $NXF_SINGULARITY_LIBRARYDIR

+      - name: Install Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+
+      - name: Install nf-core
+        run: |
+          pip install nf-core
+
+      - name: NF-Core Download - download singularity containers
+        # Forcibly download the repo on the active branch and pull SINGULARITY containers into the CACHE dir if not found
+        # Must occur after the Singularity install or it will crash trying to download containers
+        # The run step below then executes this fresh download rather than the checked-out copy
+        run: |
+          nf-core download sanger-tol/treeval --revision ${{ steps.branch-names.outputs.current_branch }} --compress none -d --force --outdir sanger-treeval --container-cache-utilisation amend --container-system singularity
+          tree *
+
       - name: Download Tiny test data
         # Download a fungal test data set that is full enough to show some real output.
         run: |
@@ -53,4 +73,4 @@ jobs:
       - name: Singularity - Run FULL pipeline with test data
         # Remember that you can parallelise this by using strategy.matrix
         run: |
-          nextflow run ${GITHUB_WORKSPACE} -profile test_github,singularity --outdir ./Sing-Full
+          nextflow run sanger-treeval/${{ steps.branch-names.outputs.current_branch }}/main.nf -profile test_github,singularity --outdir ./Sing-Full
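The new CI steps amount to a download-then-run sequence that can be reproduced locally. A minimal sketch, assuming nf-core/tools and Singularity are installed and using `my-branch` as a stand-in for the branch name that `tj-actions/branch-names` would report:

    # Cache dir mirrors the NXF_SINGULARITY_CACHEDIR setup earlier in the workflow.
    export NXF_SINGULARITY_CACHEDIR=$PWD/singularity-cache
    mkdir -p "$NXF_SINGULARITY_CACHEDIR"

    # Download the pipeline at the given revision and pull its Singularity
    # images into the cache (flags copied from the CI step above).
    nf-core download sanger-tol/treeval \
        --revision my-branch --compress none -d --force \
        --outdir sanger-treeval \
        --container-cache-utilisation amend --container-system singularity

    # Run the freshly downloaded copy, not the git checkout.
    nextflow run sanger-treeval/my-branch/main.nf \
        -profile test_github,singularity --outdir ./Sing-Full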
diff --git a/assets/github_testing/TreeValTinyFullTest.yaml b/assets/github_testing/TreeValTinyFullTest.yaml
index 65009b6d..013d1d4b 100755
--- a/assets/github_testing/TreeValTinyFullTest.yaml
+++ b/assets/github_testing/TreeValTinyFullTest.yaml
@@ -8,9 +8,11 @@ assembly:
 reference_file: /home/runner/work/treeval/treeval/TreeValTinyData/assembly/draft/grTriPseu1.fa
 assem_reads:
   read_type: hifi
-  read_data: /home/runner/work/treeval/treeval/TreeValTinyData/genomic_data/pacbio/
-  hic_data: /home/runner/work/treeval/treeval/TreeValTinyData/genomic_data/hic-arima/
+  read_data: /home/runner/work/treeval/treeval/TreeValTinyData/genomic_data/pacbio
   supplementary_data: path
+hic_data:
+  hic_cram: /home/runner/work/treeval/treeval/TreeValTinyData/genomic_data/hic-arima/
+  hic_aligner: bwamem2
 kmer_profile:
   # kmer_length will act as input for kmer_read_cov fastk and as the name of folder in profile_dir
   kmer_length: 31
@@ -28,7 +30,7 @@ intron:
 telomere:
   teloseq: TTAGGG
 synteny:
-  synteny_path: /nfs/treeoflife-01/teams/tola/users/dp24/treeval/TreeValTinyData/synteny/
+  synteny_path: /home/runner/work/treeval/treeval/treeval/TreeValTinyData/synteny
   synteny_genomes: "LaetiporusSulphureus"
 busco:
   lineages_path: /home/runner/work/treeval/treeval/TreeValTinyData/busco/subset/
diff --git a/assets/local_testing/nxOscDF5033.yaml b/assets/local_testing/nxOscDF5033.yaml
index a8eee89f..8189a1ee 100755
--- a/assets/local_testing/nxOscDF5033.yaml
+++ b/assets/local_testing/nxOscDF5033.yaml
@@ -9,8 +9,10 @@ reference_file: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeVa
 assem_reads:
   read_type: hifi
   read_data: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/genomic_data/nxOscSpes1/pacbio/fasta/
-  hic_data: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/genomic_data/nxOscSpes1/hic-arima2/full/
   supplementary_data: path
+hic_data:
+  hic_cram: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/genomic_data/nxOscSpes1/hic-arima2/full/
+  hic_aligner: minimap2
 kmer_profile:
   # kmer_length will act as input for kmer_read_cov fastk and as the name of folder in profile_dir
   kmer_length: 31
diff --git a/assets/local_testing/nxOscSUBSET.yaml b/assets/local_testing/nxOscSUBSET.yaml
index bad13196..2e0d590f 100755
--- a/assets/local_testing/nxOscSUBSET.yaml
+++ b/assets/local_testing/nxOscSUBSET.yaml
@@ -1,16 +1,18 @@
 assembly:
   assem_level: scaffold
+  assem_version: 1
   sample_id: OscheiusSUBSET
   latin_name: to_provide_taxonomic_rank
  defined_class: nematode
-  assem_version: 1
   project_id: DTOL
 reference_file: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_SUBSET/assembly/draft/SUBSET_genome/Oscheius_SUBSET.fasta
 assem_reads:
-  longread_type: hifi
-  longread_data: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_SUBSET/genomic_data/pacbio/
-  hic_data: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/genomic_data/nxOscSpes1/hic-arima2/subset/
+  read_type: hifi
+  read_data: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_SUBSET/genomic_data/pacbio/
   supplementary_data: path
+hic_data:
+  hic_cram: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/genomic_data/nxOscSpes1/hic-arima2/subset/
+  hic_aligner: minimap2
 kmer_profile:
   # kmer_length will act as input for kmer_read_cov fastk and as the name of folder in profile_dir
   kmer_length: 31
@@ -18,11 +20,11 @@ kmer_profile:
 alignment:
   data_dir: /lustre/scratch123/tol/resources/treeval/gene_alignment_data/
   common_name: "" # For future implementation (adding bee, wasp, ant etc)
-  geneset: "Gae_host.Gae"
+  geneset_id: "Gae_host.Gae"
   #Path should end up looking like "{data_dir}{classT}/{common_name}/csv_data/{geneset}-data.csv"
 self_comp:
   motif_len: 0
-  mummer_chunk: 4
+  mummer_chunk: 10
 intron:
   size: "50k"
 telomere:
diff --git a/bin/awk_filter_reads.sh b/bin/awk_filter_reads.sh
new file mode 100755
index 00000000..d50aa9ec
--- /dev/null
+++ b/bin/awk_filter_reads.sh
@@ -0,0 +1 @@
+awk 'BEGIN{OFS="\t"}{if($1 ~ /^\@/) {print($0)} else {$2=and($2,compl(2048)); print(substr($0,2))}}'
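For reference, 2048 (0x800) is the SAM "supplementary alignment" FLAG bit, so `and($2,compl(2048))` clears that bit, and `substr($0,2)` drops the leading `1`/`2` that the companion grep_pg.sh below prepends to read names. A toy check of the arithmetic (note that `and()` and `compl()` are GNU awk built-ins, so this assumes gawk):

    # FLAG 2113 = 2048 (supplementary) + 64 (first in pair) + 1 (paired);
    # clearing the supplementary bit should leave 65.
    gawk 'BEGIN { print and(2113, compl(2048)) }'    # prints 65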
diff --git a/bin/filter_five_end.pl b/bin/filter_five_end.pl
new file mode 100755
index 00000000..6f9b4d58
--- /dev/null
+++ b/bin/filter_five_end.pl
@@ -0,0 +1,109 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+
+my $prev_id = "";
+my @five;
+my @three;
+my @unmap;
+my @mid;
+my @all;
+my $counter = 0;
+
+while (<STDIN>){
+    chomp;
+    if (/^@/){
+        print $_."\n";
+        next;
+    }
+    my ($id, $flag, $chr_from, $loc_from, $mapq, $cigar, $d1, $d2, $d3, $read, $read_qual, @rest) = split /\t/;
+    my $bin = reverse(dec2bin($flag));
+    my @binary = split(//,$bin);
+    if ($prev_id ne $id && $prev_id ne ""){
+        if ($counter == 1){
+            if (@five == 1){
+                print $five[0]."\n";
+            }
+            else{
+                my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0];
+                my $bin_1 = reverse(dec2bin($flag_1));
+                my @binary_1 = split(//,$bin_1);
+                $binary_1[2] = 1;
+                my $bin_1_new = reverse(join("",@binary_1));
+                my $flag_1_new = bin2dec($bin_1_new);
+                print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n");
+            }
+        }
+        elsif ($counter == 2 && @five == 1){
+            print $five[0]."\n";
+        }
+        else{
+            my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0];
+            my $bin_1 = reverse(dec2bin($flag_1));
+            my @binary_1 = split(//,$bin_1);
+            $binary_1[2] = 1;
+            my $bin_1_new = reverse(join("",@binary_1));
+            my $flag_1_new = bin2dec($bin_1_new);
+            print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n");
+        }
+
+        $counter = 0;
+        undef @unmap;
+        undef @five;
+        undef @three;
+        undef @mid;
+        undef @all;
+    }
+
+    $counter++;
+    $prev_id = $id;
+    push @all,$_;
+    if ($binary[2]==1){
+        push @unmap,$_;
+    }
+    elsif ($binary[4]==0 && $cigar =~ m/^[0-9]*M/ || $binary[4]==1 && $cigar =~ m/.*M$/){
+        push @five, $_;
+    }
+    elsif ($binary[4]==1 && $cigar =~ m/^[0-9]*M/ || $binary[4]==0 && $cigar =~ m/.*M$/){
+        push @three, $_;
+    }
+    elsif ($cigar =~ m/^[0-9]*[HS].*M.*[HS]$/){
+        push @mid, $_;
+    }
+}
+
+if ($counter == 1){
+    if (@five == 1){
+        print $five[0]."\n";
+    }
+    else{
+        my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0];
+        my $bin_1 = reverse(dec2bin($flag_1));
+        my @binary_1 = split(//,$bin_1);
+        $binary_1[2] = 1;
+        my $bin_1_new = reverse(join("",@binary_1));
+        my $flag_1_new = bin2dec($bin_1_new);
+        print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n");
+    }
+}
+elsif ($counter == 2 && @five == 1){
+    print $five[0]."\n";
+}
+else{
+    my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0];
+    my $bin_1 = reverse(dec2bin($flag_1));
+    my @binary_1 = split(//,$bin_1);
+    $binary_1[2] = 1;
+    my $bin_1_new = reverse(join("",@binary_1));
+    my $flag_1_new = bin2dec($bin_1_new);
+    print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n");
+}
+
+sub dec2bin {
+    my $str = unpack("B32", pack("N", shift));
+    return $str;
+}
+
+sub bin2dec {
+    return unpack("N", pack("B32", substr("0" x 32 . shift, -32)));
+}
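The repeated `dec2bin`/`bin2dec` block is a round-trip that sets a single bit of the FLAG: with the bit string reversed, index 2 is the 0x4 "segment unmapped" bit (the same bit the `$binary[2]==1` test reads), so the block is equivalent to `flag | 4`. A one-line sanity check of that equivalence in shell arithmetic, with a hypothetical FLAG value:

    flag=99                # paired + proper pair + mate reverse + first in pair
    echo $(( flag | 4 ))   # prints 103, i.e. the same alignment marked unmapped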
diff --git a/bin/grep_pg.sh b/bin/grep_pg.sh
new file mode 100755
index 00000000..680b5ec2
--- /dev/null
+++ b/bin/grep_pg.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# grep_pg.sh
+# -------------------
+# A shell script to exclude @PG lines and label read 1 and read 2 from CRAM containers
+#
+# -------------------
+# Author = yy5
+
+grep -v "^\@PG" | awk '{if($1 ~ /^\@/) {print($0)} else {if(and($2,64)>0) {print(1$0)} else {print(2$0)}}}'
diff --git a/conf/base.config b/conf/base.config
index 2f7e330a..962f5ceb 100755
--- a/conf/base.config
+++ b/conf/base.config
@@ -105,6 +105,7 @@ process {

     // RESOURCES: CHANGES TO FREQUENT FAILURES BELOW THIS MEM POINT
     withName: '.*:.*:GENE_ALIGNMENT:.*:(MINIPROT_ALIGN|MINIMAP2_ALIGN)' {
+        cpus = { check_max( 6 * task.attempt, 'cpus' ) }
         memory = { check_max( 50.GB * Math.ceil( task.attempt * 1.5 ) , 'memory' ) }
         time = { check_max( 10.h * task.attempt, 'time' ) }
     }
@@ -137,6 +138,11 @@ process {
         memory = { check_max( 130.GB * task.attempt, 'memory' ) }
     }

+    withName: CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT {
+        cpus = { check_max( 16 * 1, 'cpus' ) }
+        memory = { check_max( 130.GB * task.attempt, 'memory' ) }
+    }
+
     withName: SNAPSHOT_SRES {
         cpus = { check_max( 6 * 1, 'cpus' ) }
         memory = { check_max( 4.GB * task.attempt, 'memory' ) }
@@ -181,6 +187,11 @@ process {
         memory = { check_max( 1.GB * Math.ceil( 28 * fasta.size() / 1e+9 ) * task.attempt, 'memory' ) }
     }

+    withName: MINIMAP2_INDEX {
+        cpus = { check_max( 2 * task.attempt, 'cpus' ) }
+        memory = { check_max( 1.GB * Math.ceil( 30 * fasta.size() / 1e+9 ) * task.attempt, 'memory' ) }
+    }
+
     // add a cpus 16 if bam.size() >= 50GB
     withName: '(SAMTOOLS_MARKDUP|BAMTOBED_SORT)' {
         cpus = { check_max( 12 * 1, 'cpus' ) }
@@ -192,6 +203,11 @@ process {
         memory = { check_max( 6.GB * task.attempt, 'memory' ) }
     }

+    withName: MERQURYFK_MERQURYFK {
+        cpus = { check_max( 16 * 1, 'cpus' ) }
+        memory = { check_max( 100.GB * task.attempt, 'memory' ) }
+    }
+
     withName: BUSCO {
         cpus = { check_max( 16 * task.attempt, 'cpus' ) }
         memory = { check_max( 50.GB * task.attempt, 'memory' ) }
diff --git a/conf/modules.config b/conf/modules.config
index 88b6d4bf..d4c0912c 100755
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -327,7 +327,7 @@ process {
         ext.prefix = { "${meta.id}_mkdup" }
     }

-    withName: CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT {
+    withName: ".*:.*:HIC_BWAMEM2:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" {
         ext.args = ""
         ext.args1 = "-F0xB00 -nt"
         ext.args2 = { "-5SPCp -H'${rglines}'" }
@@ -335,12 +335,23 @@ process {
         ext.args4 = { "--write-index -l1" }
     }

+    withName: ".*:.*:HIC_MINIMAP2:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" {
+        ext.args = ""
+        ext.args1 = ""
+        ext.args2 = { "-ax sr" }
+        ext.args3 = "-mpu"
+        ext.args4 = { "--write-index -l1" }
+    }
+
     withName: ".*:.*:GENERATE_GENOME:GNU_SORT" {
         ext.prefix = { "${meta.id}" }
         ext.suffix = { "genome" }
         ext.args = { "-k2,2 -nr -S${task.memory.mega - 100}M -T ." }
     }

+    withName: ".*:.*:HIC_MINIMAP2:MINIMAP2_INDEX" {
+        ext.args = { "${reference.size() > 2.5e9 ? (" -I " + Math.ceil(reference.size()/1e9)+"G") : ""} "}
+    }
+
     //
     // SUBWORKFLOW: KMER
     //
@@ -349,7 +360,11 @@ process {
         ext.prefix = { "${meta.id}_merged.fasta.gz" }
     }

-    withName: FASTK_FASTK {
+    withName: ".*:.*:KMER_READ_COVERAGE:FASTK_FASTK" {
+        ext.args = "-k31 -t -P."
+    }
+
+    withName: ".*:.*:KMER:FASTK_FASTK" {
         ext.args = "-k31 -t -P."
     }
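The `MINIMAP2_INDEX` override above raises minimap2's `-I` index batch size to the reference size, rounded up to whole gigabases, whenever the file exceeds 2.5e9 bytes, presumably so the whole genome lands in a single index batch during mapping. A shell sketch of the same calculation, with a hypothetical reference path:

    REF=reference.fa
    bytes=$(stat -c%s "$REF")
    if [ "$bytes" -gt 2500000000 ]; then
        batch="-I $(( (bytes + 999999999) / 1000000000 ))G"   # ceil(bytes / 1e9)
    else
        batch=""
    fi
    minimap2 $batch -d reference.mmi "$REF"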
diff --git a/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf b/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf
index 3c639b5b..91533fe3 100755
--- a/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf
+++ b/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf
@@ -2,9 +2,7 @@ process CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT {
     tag "$meta.id"
     label "process_high"

-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/mulled-v2-50d89b457e04ed90fa0cbf8ebc3ae1b9ffbc836b:caf993da1689e8d42f5e4c113ffc9ef81d26df96-0' :
-        'biocontainers/mulled-v2-50d89b457e04ed90fa0cbf8ebc3ae1b9ffbc836b:caf993da1689e8d42f5e4c113ffc9ef81d26df96-0' }"
+    container 'quay.io/sanger-tol/cramfilter_bwamem2_minimap2_samtools_perl:0.001-c1'

     input:
     tuple val(meta), path(cramfile), path(cramindex), val(from), val(to), val(base), val(chunkid), val(rglines), val(bwaprefix)
diff --git a/modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf b/modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf
new file mode 100755
index 00000000..b7043726
--- /dev/null
+++ b/modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf
@@ -0,0 +1,55 @@
+process CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT {
+    tag "$meta.id"
+    label "process_high"
+
+    container 'quay.io/sanger-tol/cramfilter_bwamem2_minimap2_samtools_perl:0.001-c1'
+
+    input:
+    tuple val(meta), path(cramfile), path(cramindex), val(from), val(to), val(base), val(chunkid), val(rglines), val(ref)
+
+    output:
+    tuple val(meta), path("*.bam"), emit: mappedbam
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def args1 = task.ext.args1 ?: ''
+    def args2 = task.ext.args2 ?: ''
+    def args3 = task.ext.args3 ?: ''
+    def args4 = task.ext.args4 ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    cram_filter -n ${from}-${to} ${cramfile} - | \\
+        samtools fastq ${args1} - | \\
+        minimap2 -t${task.cpus} -R '${rglines}' ${args2} ${ref} - | \\
+        grep_pg.sh | \\
+        filter_five_end.pl | \\
+        awk_filter_reads.sh | \\
+        samtools fixmate ${args3} - - | \\
+        samtools sort ${args4} -@${task.cpus} -T ${base}_${chunkid}_sort_tmp -o ${prefix}_${base}_${chunkid}_mm.bam -
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )
+        minimap2: \$(minimap2 --version | sed 's/minimap2 //g')
+    END_VERSIONS
+    """
+    // temp removal staden_io_lib: \$(echo \$(staden_io_lib --version 2>&1) | sed 's/^.*staden_io_lib //; s/Using.*\$//') CAUSES ERROR
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def base = "45022_3#2"
+    def chunkid = "1"
+    """
+    touch ${prefix}_${base}_${chunkid}_mm.bam
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )
+        minimap2: \$(echo \$(minimap2 version 2>&1) | sed 's/.* //')
+    END_VERSIONS
+    """
+}
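Written out with hypothetical inputs, the script block above chains CRAM chunk slicing, mapping and the Arima-style 5'-end filtering scripts into one stream. The argument values are taken from the matching `ext.args*` entries in conf/modules.config; file names, the read-group string and thread counts are placeholders:

    # Slice containers 0-9999 out of the CRAM, convert to FASTQ, map with
    # minimap2's short-read preset, filter, then fixmate and coordinate-sort.
    cram_filter -n 0-9999 input.cram - \
        | samtools fastq - \
        | minimap2 -t16 -R '@RG\tID:sample_1\tSM:sample' -ax sr reference.mmi - \
        | grep_pg.sh \
        | filter_five_end.pl \
        | awk_filter_reads.sh \
        | samtools fixmate -mpu - - \
        | samtools sort --write-index -l1 -@16 -T chunk_0_sort_tmp -o sample_0_mm.bam -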
-        'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' :
-        'biocontainers/samtools:1.17--h00cdaf9_0' }"
+    container 'quay.io/sanger-tol/cramfilter_bwamem2_minimap2_samtools_perl:0.001-c1'

     input:
     tuple val(meta), path(crampath)
diff --git a/modules/local/graphoverallcoverage.nf b/modules/local/graphoverallcoverage.nf
index 10c0a112..b8cc8777 100755
--- a/modules/local/graphoverallcoverage.nf
+++ b/modules/local/graphoverallcoverage.nf
@@ -2,10 +2,7 @@ process GRAPHOVERALLCOVERAGE {
     tag "$meta.id"
     label "process_single"

-    conda "conda-forge::perl=5.26.2"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/perl:5.26.2' :
-        'biocontainers/perl:5.26.2' }"
+    container 'quay.io/sanger-tol/cramfilter_bwamem2_minimap2_samtools_perl:0.001-c1'

     input:
     tuple val(meta), path(bed)
diff --git a/subworkflows/local/hic_bwamem2.nf b/subworkflows/local/hic_bwamem2.nf
new file mode 100755
index 00000000..06ad7cfe
--- /dev/null
+++ b/subworkflows/local/hic_bwamem2.nf
@@ -0,0 +1,92 @@
+#!/usr/bin/env nextflow
+
+// This subworkflow takes an input fasta sequence and a CSV-style list of HiC CRAM files and returns
+// alignment files, including .mcool, pretext and .hic.
+// Input - Assembled genomic fasta file, cram file directory
+// Output - .mcool, .pretext, .hic
+
+//
+// MODULE IMPORT BLOCK
+//
+include { BWAMEM2_INDEX                          } from '../../modules/nf-core/bwamem2/index/main'
+include { CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT } from '../../modules/local/cram_filter_align_bwamem2_fixmate_sort'
+include { SAMTOOLS_MERGE                         } from '../../modules/nf-core/samtools/merge/main'
+
+workflow HIC_BWAMEM2 {
+    take:
+    reference_tuple     // Channel: tuple [ val(meta), path( file ) ]
+    csv_ch
+    reference_index
+
+    main:
+    ch_versions     = Channel.empty()
+    mappedbam_ch    = Channel.empty()
+
+    BWAMEM2_INDEX (
+        reference_tuple
+    )
+    ch_versions = ch_versions.mix( BWAMEM2_INDEX.out.versions )
+
+    csv_ch
+        .splitCsv()
+        .combine ( reference_tuple )
+        .combine ( BWAMEM2_INDEX.out.index )
+        .map{ cram_id, cram_info, ref_id, ref_dir, bwa_id, bwa_path ->
+            tuple([
+                    id: cram_id.id
+                ],
+                file(cram_info[0]),
+                cram_info[1],
+                cram_info[2],
+                cram_info[3],
+                cram_info[4],
+                cram_info[5],
+                cram_info[6],
+                bwa_path.toString() + '/' + ref_dir.toString().split('/')[-1]
+            )
+        }
+        .set { ch_filtering_input }
+
+    //
+    // MODULE: map HiC reads in chunks of 10,000 CRAM containers at a time using bwa-mem2
+    //
+    CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT (
+        ch_filtering_input
+    )
+    ch_versions = ch_versions.mix( CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT.out.versions )
+    mappedbam_ch = CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT.out.mappedbam
+
+    //
+    // LOGIC: PREPARING BAMS FOR MERGE
+    //
+    mappedbam_ch
+        .map{ meta, file ->
+            tuple( file )
+        }
+        .collect()
+        .map { file ->
+            tuple (
+                [
+                    id: file[0].toString().split('/')[-1].split('_')[0] + '_' + file[0].toString().split('/')[-1].split('_')[1]
+                ],
+                file
+            )
+        }
+        .set { collected_files_for_merge }
+
+    //
+    // MODULE: MERGE POSITION SORTED BAM FILES AND MARK DUPLICATES
+    //
+    SAMTOOLS_MERGE (
+        collected_files_for_merge,
+        reference_tuple,
+        reference_index
+    )
+    ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() )
+
+    emit:
+    mergedbam = SAMTOOLS_MERGE.out.bam
+    versions = ch_versions.ifEmpty(null)
+}
\ No newline at end of file
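The collect/map block names the merged BAM after the first two underscore-separated fields of the first chunk file it sees. Roughly the same logic in shell, with a hypothetical chunk name of the `${prefix}_${base}_${chunkid}_mm.bam` form the filtering modules emit:

    first=OscheiusSUBSET_1_45022_3#2_0_mm.bam
    merged_id=$(echo "$first" | cut -d_ -f1-2)    # -> OscheiusSUBSET_1
    samtools merge -f "${merged_id}.bam" ./*_mm.bam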
diff --git a/subworkflows/local/hic_mapping.nf b/subworkflows/local/hic_mapping.nf
index 8db69902..57ae79b4 100755
--- a/subworkflows/local/hic_mapping.nf
+++ b/subworkflows/local/hic_mapping.nf
@@ -8,23 +8,26 @@
 //
 // MODULE IMPORT BLOCK
 //
-include { BWAMEM2_INDEX                             } from '../../modules/nf-core/bwamem2/index/main'
-include { COOLER_CLOAD                              } from '../../modules/nf-core/cooler/cload/main'
-include { COOLER_ZOOMIFY                            } from '../../modules/nf-core/cooler/zoomify/main'
-include { PRETEXTMAP as PRETEXTMAP_STANDRD          } from '../../modules/nf-core/pretextmap/main'
-include { PRETEXTMAP as PRETEXTMAP_HIGHRES          } from '../../modules/nf-core/pretextmap/main'
-include { PRETEXTSNAPSHOT as SNAPSHOT_SRES          } from '../../modules/nf-core/pretextsnapshot/main'
-include { PRETEXTSNAPSHOT as SNAPSHOT_HRES          } from '../../modules/nf-core/pretextsnapshot/main'
-include { SAMTOOLS_MERGE                            } from '../../modules/nf-core/samtools/merge/main'
-include { GENERATE_CRAM_CSV                         } from '../../modules/local/generate_cram_csv'
-include { CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT    } from '../../modules/local/cram_filter_align_bwamem2_fixmate_sort'
-include { JUICER_TOOLS_PRE                          } from '../../modules/local/juicer_tools_pre'
-include { SUBSAMPLE_BAM                             } from '../../modules/local/subsample_bam.nf'
-include { PRETEXT_INGESTION as PRETEXT_INGEST_SNDRD } from '../../subworkflows/local/pretext_ingestion'
-include { PRETEXT_INGESTION as PRETEXT_INGEST_HIRES } from '../../subworkflows/local/pretext_ingestion'
-include { HIC_BAMTOBED as HIC_BAMTOBED_COOLER       } from '../../subworkflows/local/hic_bamtobed'
-include { HIC_BAMTOBED as HIC_BAMTOBED_JUICER       } from '../../subworkflows/local/hic_bamtobed'
-
+include { BWAMEM2_INDEX                                } from '../../modules/nf-core/bwamem2/index/main'
+include { COOLER_CLOAD                                 } from '../../modules/nf-core/cooler/cload/main'
+include { COOLER_ZOOMIFY                               } from '../../modules/nf-core/cooler/zoomify/main'
+include { PRETEXTMAP as PRETEXTMAP_STANDRD             } from '../../modules/nf-core/pretextmap/main'
+include { PRETEXTMAP as PRETEXTMAP_HIGHRES             } from '../../modules/nf-core/pretextmap/main'
+include { PRETEXTSNAPSHOT as SNAPSHOT_SRES             } from '../../modules/nf-core/pretextsnapshot/main'
+include { PRETEXTSNAPSHOT as SNAPSHOT_HRES             } from '../../modules/nf-core/pretextsnapshot/main'
+include { SAMTOOLS_MERGE                               } from '../../modules/nf-core/samtools/merge/main'
+include { GENERATE_CRAM_CSV                            } from '../../modules/local/generate_cram_csv'
+include { CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT       } from '../../modules/local/cram_filter_align_bwamem2_fixmate_sort'
+include { CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT } from '../../modules/local/cram_filter_minimap2_filter5end_fixmate_sort'
+include { JUICER_TOOLS_PRE                             } from '../../modules/local/juicer_tools_pre'
+include { SUBSAMPLE_BAM                                } from '../../modules/local/subsample_bam.nf'
+include { PRETEXT_INGESTION as PRETEXT_INGEST_SNDRD    } from '../../subworkflows/local/pretext_ingestion'
+include { PRETEXT_INGESTION as PRETEXT_INGEST_HIRES    } from '../../subworkflows/local/pretext_ingestion'
+include { HIC_BAMTOBED as HIC_BAMTOBED_COOLER          } from '../../subworkflows/local/hic_bamtobed'
+include { HIC_BAMTOBED as HIC_BAMTOBED_JUICER          } from '../../subworkflows/local/hic_bamtobed'
+include { MINIMAP2_INDEX                               } from '../../modules/nf-core/minimap2/index/main'
+include { HIC_MINIMAP2                                 } from '../../subworkflows/local/hic_minimap2'
+include { HIC_BWAMEM2                                  } from '../../subworkflows/local/hic_bwamem2'

 workflow HIC_MAPPING {
     take:
@@ -46,14 +49,6 @@ workflow HIC_MAPPING {
     // COMMENT: 1000bp BIN SIZE INTERVALS FOR CLOAD
     ch_cool_bin = Channel.of( 1000 )

-    //
-    // MODULE: Indexing on reference output the folder of indexing files
-    //
-    BWAMEM2_INDEX (
-        reference_tuple
-    )
-    ch_versions = ch_versions.mix( BWAMEM2_INDEX.out.versions )
-
     //
     // LOGIC: make channel of hic reads as input for GENERATE_CRAM_CSV
     //
@@ -76,68 +71,50 @@ workflow HIC_MAPPING {
     ch_versions = ch_versions.mix( GENERATE_CRAM_CSV.out.versions )

     //
-    // LOGIC: organise all parametres into a channel for CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT
-    //
-    GENERATE_CRAM_CSV.out.csv
-        .splitCsv()
-        .combine ( reference_tuple )
-        .combine ( BWAMEM2_INDEX.out.index )
-        .map{ cram_id, cram_info, ref_id, ref_dir, bwa_id, bwa_path ->
-            tuple([
-                    id: cram_id.id
-                ],
-                file(cram_info[0]),
-                cram_info[1],
-                cram_info[2],
-                cram_info[3],
-                cram_info[4],
-                cram_info[5],
-                cram_info[6],
-                bwa_path.toString() + '/' + ref_dir.toString().split('/')[-1]
-            )
+    // LOGIC: branch on the hic aligner chosen for each sample
+    //
+    hic_reads_path
+        .combine(reference_tuple)
+        .map{ meta, hic_read_path, ref_meta, ref ->
+            tuple(
+                [ id      : ref_meta,
+                  aligner : meta.aligner
+                ],
+                ref
+            )
+        }
+        .branch{
+            minimap2 : it[0].aligner == "minimap2"
+            bwamem2  : it[0].aligner == "bwamem2"
         }
-        .set { ch_filtering_input }
+        .set{ch_aligner}

     //
-    // MODULE: parallel proccessing bwa-mem2 alignment by given interval of containers from cram files
+    // SUBWORKFLOW: mapping hic reads using minimap2
     //
-    CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT (
-        ch_filtering_input
+    HIC_MINIMAP2 (
+        ch_aligner.minimap2,
+        GENERATE_CRAM_CSV.out.csv,
+        reference_index
     )
-    ch_versions = ch_versions.mix( CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT.out.versions )
-
-    //
-    // LOGIC: PREPARING BAMS FOR MERGE
-    //
-    CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT.out.mappedbam
-        .map{ meta, file ->
-            tuple( file )
-        }
-        .collect()
-        .map { file ->
-            tuple (
-                [
-                    id: file[0].toString().split('/')[-1].split('_')[0] + '_' + file[0].toString().split('/')[-1].split('_')[1]
-                ],
-                file
-            )
-        }
-        .set { collected_files_for_merge }
+    ch_versions = ch_versions.mix( HIC_MINIMAP2.out.versions )
+    mergedbam   = HIC_MINIMAP2.out.mergedbam

     //
-    // MODULE: MERGE POSITION SORTED BAM FILES AND MARK DUPLICATES
+    // SUBWORKFLOW: mapping hic reads using bwamem2
     //
-    SAMTOOLS_MERGE (
-        collected_files_for_merge,
-        reference_tuple,
+    HIC_BWAMEM2 (
+        ch_aligner.bwamem2,
+        GENERATE_CRAM_CSV.out.csv,
         reference_index
     )
-    ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() )
+    ch_versions = ch_versions.mix( HIC_BWAMEM2.out.versions )
+    mergedbam   = mergedbam.mix(HIC_BWAMEM2.out.mergedbam)

     //
     // LOGIC: PREPARING PRETEXT MAP INPUT
     //
-    SAMTOOLS_MERGE.out.bam
+    mergedbam
         .combine( reference_tuple )
         .multiMap { bam_meta, bam, ref_meta, ref_fa ->
             input_bam: tuple( [ id: bam_meta.id,
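The two branches run the same chunk-map-merge recipe and differ only in the mapper call. Under the `ext.args` values set in conf/modules.config, the underlying commands look roughly like the sketch below; inputs and the read-group string are hypothetical, `-5SPCp` is a bwa flag set commonly used for Hi-C data, and the minimap2 route relies on the downstream filter_five_end.pl for 5'-end selection instead:

    # bwamem2 branch (ext.args2 = "-5SPCp -H'${rglines}'")
    bwa-mem2 mem -t16 -5SPCp -H'@RG\tID:sample' ref.fa reads.fq > aln_bwa.sam
    # minimap2 branch (ext.args2 = "-ax sr")
    minimap2 -t16 -ax sr reference.mmi reads.fq > aln_mm2.sam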
if (workflow_setting == "FULL") { // // LOGIC: BRANCH TO SUBSAMPLE BAM IF LARGER THAN 50G // - SAMTOOLS_MERGE.out.bam + mergedbam .map{ meta, bam -> tuple( [ id : meta.id, @@ -288,7 +284,7 @@ workflow HIC_MAPPING { // // LOGIC: PREPARE BAMTOBED COOLER INPUT // - SAMTOOLS_MERGE.out.bam + mergedbam .combine( reference_tuple ) .multiMap { meta, merged_bam, meta_ref, ref -> bam : tuple(meta, merged_bam ) diff --git a/subworkflows/local/hic_minimap2.nf b/subworkflows/local/hic_minimap2.nf new file mode 100755 index 00000000..53de3205 --- /dev/null +++ b/subworkflows/local/hic_minimap2.nf @@ -0,0 +1,101 @@ +#!/usr/bin/env nextflow + +// This subworkflow takes an input fasta sequence and csv style list of hic cram file to return +// alignment files including .mcool, pretext and .hic. +// Input - Assembled genomic fasta file, cram file directory +// Output - .mcool, .pretext, .hic + +// +// MODULE IMPORT BLOCK +// +include { CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT } from '../../modules/local/cram_filter_minimap2_filter5end_fixmate_sort' +include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' +include { MINIMAP2_INDEX } from '../../modules/nf-core/minimap2/index/main' + + +workflow HIC_MINIMAP2 { + + take: + reference_tuple // Channel: tuple [ val(meta), path( file ) ] + csv_ch + reference_index + + main: + ch_versions = Channel.empty() + mappedbam_ch = Channel.empty() + + // + // MODULE: generate minimap2 mmi file + // + MINIMAP2_INDEX ( + reference_tuple + ) + ch_versions = ch_versions.mix( MINIMAP2_INDEX.out.versions ) + + // + // LOGIC: generate input channel for mapping + // + csv_ch + .splitCsv() + .combine ( reference_tuple ) + .combine ( MINIMAP2_INDEX.out.index ) + .map{ cram_id, cram_info, ref_id, ref_dir, mmi_id, mmi_path-> + tuple([ + id: cram_id.id + ], + file(cram_info[0]), + cram_info[1], + cram_info[2], + cram_info[3], + cram_info[4], + cram_info[5], + cram_info[6], + mmi_path.toString() + ) + } + .set { ch_filtering_input } + + // + // MODULE: map hic reads by 10,000 container per time + // + CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT ( + ch_filtering_input + + ) + ch_versions = ch_versions.mix( CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT.out.versions ) + mappedbam_ch = CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT.out.mappedbam + + + // + // LOGIC: PREPARING BAMS FOR MERGE + // + mappedbam_ch + .map{ meta, file -> + tuple( file ) + } + .collect() + .map { file -> + tuple ( + [ + id: file[0].toString().split('/')[-1].split('_')[0] + '_' + file[0].toString().split('/')[-1].split('_')[1] + ], + file + ) + } + .set { collected_files_for_merge } + + // + // MODULE: MERGE POSITION SORTED BAM FILES AND MARK DUPLICATES + // + SAMTOOLS_MERGE ( + collected_files_for_merge, + reference_tuple, + reference_index + ) + ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) + + + emit: + mergedbam = SAMTOOLS_MERGE.out.bam + versions = ch_versions.ifEmpty(null) +} diff --git a/subworkflows/local/read_coverage.nf b/subworkflows/local/read_coverage.nf index 977fa45a..561bbf75 100755 --- a/subworkflows/local/read_coverage.nf +++ b/subworkflows/local/read_coverage.nf @@ -121,10 +121,10 @@ workflow READ_COVERAGE { // // MODULE: INDEXING SORTED MAPPED BAM // - SAMTOOLS_INDEX ( - SAMTOOLS_SORT.out.bam - ) - ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions ) + //SAMTOOLS_INDEX ( + // SAMTOOLS_SORT.out.bam + //) + //ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions ) // // LOGIC: PREPARING MERGE INPUT WITH REFERENCE GENOME AND REFERENCE INDEX 
diff --git a/subworkflows/local/selfcomp.nf b/subworkflows/local/selfcomp.nf
index d36070c5..a258ce85 100755
--- a/subworkflows/local/selfcomp.nf
+++ b/subworkflows/local/selfcomp.nf
@@ -51,7 +51,7 @@ workflow SELFCOMP {

     file_size
         .sum{it / 1e9}
-        .collect { new BigDecimal (it).setScale(0, RoundingMode.UP) }
+        .collect { new java.math.BigDecimal (it).setScale(0, RoundingMode.UP) }
         .flatten()
         .set { chunk_number }

@@ -113,7 +113,7 @@ workflow SELFCOMP {
         .set{ mummer_input }

     //
-    // MODULE: ALIGNS 1GB CHUNKS TO 50KB CHUNKS
+    // MODULE: ALIGNS 1GB CHUNKS TO 500KB CHUNKS
     //  EMITS MUMMER ALIGNMENT FILE
     //
     MUMMER(
diff --git a/subworkflows/local/yaml_input.nf b/subworkflows/local/yaml_input.nf
index d3be9a8c..9c1280ad 100755
--- a/subworkflows/local/yaml_input.nf
+++ b/subworkflows/local/yaml_input.nf
@@ -26,6 +26,7 @@ workflow YAML_INPUT {
         .multiMap { data, id ->
             assembly:       ( data.assembly )
             assembly_reads: ( data.assem_reads )
+            hic_data:       ( data.hic_data )
             kmer_profile:   ( data.kmer_profile )
             reference:      ( file(data.reference_file, checkIfExists: true) )
             alignment:      ( id == "FULL" ? data.alignment : "" )
@@ -57,11 +58,18 @@ workflow YAML_INPUT {
         .multiMap { data ->
             read_type:  data.read_type
             read_data:  data.read_data
-            hic:        data.hic_data
             supplement: data.supplementary_data
         }
        .set { assem_reads }

+    group
+        .hic_data
+        .multiMap { data ->
+            hic_cram:    data.hic_cram
+            hic_aligner: data.hic_aligner
+        }
+        .set { hic }
+
     group
         .kmer_profile
         .multiMap { data ->
@@ -173,9 +181,11 @@ workflow YAML_INPUT {
     }

     tolid_version
-        .combine( assem_reads.hic )
-        .map { sample, data ->
-            tuple( [ id: sample ],
+        .combine( hic.hic_cram )
+        .combine( hic.hic_aligner )
+        .map { sample, data, aligner ->
+            tuple( [ id: sample,
+                     aligner: aligner ],
                    data
            )
        }
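Taken together with the assets/* changes above, YAML_INPUT now expects `hic_data` as a top-level block carrying both the CRAM directory and the aligner choice, which it folds into the channel meta as `aligner`. A minimal sketch of the relevant part of an input YAML, written from the shell (placeholder paths; `hic_aligner` must be `bwamem2` or `minimap2` to match the branch in hic_mapping.nf):

    printf '%s\n' \
        'assem_reads:' \
        '  read_type: hifi' \
        '  read_data: /path/to/pacbio/' \
        '  supplementary_data: path' \
        'hic_data:' \
        '  hic_cram: /path/to/hic-arima/' \
        '  hic_aligner: minimap2' > sample_input.yaml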