From 9734e661df5bb8129bd0ca52e0e0be09e3a9cec4 Mon Sep 17 00:00:00 2001 From: reichan1998 Date: Fri, 30 Aug 2024 20:46:47 +0700 Subject: [PATCH 01/16] Adapting hic_mapping.nf from TreeVal to implement Minimap2 for HiC data and chunking --- bin/awk_filter_reads.sh | 1 + bin/filter_five_end.pl | 109 ++++++++++++++++ bin/generate_cram_csv.sh | 86 +++++++++++++ bin/grep_pg.sh | 10 ++ conf/base.config | 21 ++++ conf/modules.config | 28 +++++ conf/test.config | 3 + modules.json | 5 + .../cram_filter_align_bwamem2_fixmate_sort.nf | 55 +++++++++ ...filter_minimap2_filter5end_fixmate_sort.nf | 59 +++++++++ modules/local/generate_cram_csv.nf | 35 ++++++ .../nf-core/minimap2/index/environment.yml | 7 ++ modules/nf-core/minimap2/index/main.nf | 44 +++++++ modules/nf-core/minimap2/index/meta.yml | 43 +++++++ .../nf-core/minimap2/index/tests/main.nf.test | 32 +++++ .../minimap2/index/tests/main.nf.test.snap | 68 ++++++++++ modules/nf-core/minimap2/index/tests/tags.yml | 2 + nextflow.config | 3 +- seq_cache_populate.pl | 0 subworkflows/local/align_short_hic.nf | 116 ++++++++++++++++++ subworkflows/local/hic_bwamem2.nf | 90 ++++++++++++++ subworkflows/local/hic_minimap2.nf | 101 +++++++++++++++ workflows/readmapping.nf | 2 +- 23 files changed, 918 insertions(+), 2 deletions(-) create mode 100755 bin/awk_filter_reads.sh create mode 100644 bin/filter_five_end.pl create mode 100755 bin/generate_cram_csv.sh create mode 100755 bin/grep_pg.sh create mode 100644 modules/local/cram_filter_align_bwamem2_fixmate_sort.nf create mode 100644 modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf create mode 100644 modules/local/generate_cram_csv.nf create mode 100644 modules/nf-core/minimap2/index/environment.yml create mode 100644 modules/nf-core/minimap2/index/main.nf create mode 100644 modules/nf-core/minimap2/index/meta.yml create mode 100644 modules/nf-core/minimap2/index/tests/main.nf.test create mode 100644 modules/nf-core/minimap2/index/tests/main.nf.test.snap create mode 100644 modules/nf-core/minimap2/index/tests/tags.yml create mode 100644 seq_cache_populate.pl create mode 100644 subworkflows/local/align_short_hic.nf create mode 100644 subworkflows/local/hic_bwamem2.nf create mode 100644 subworkflows/local/hic_minimap2.nf diff --git a/bin/awk_filter_reads.sh b/bin/awk_filter_reads.sh new file mode 100755 index 0000000..3066f11 --- /dev/null +++ b/bin/awk_filter_reads.sh @@ -0,0 +1 @@ +awk 'BEGIN{OFS="\t"}{if($1 ~ /^\@/) {print($0)} else {$2=and($2,compl(2048)); print(substr($0,2))}}' \ No newline at end of file diff --git a/bin/filter_five_end.pl b/bin/filter_five_end.pl new file mode 100644 index 0000000..0beb2ac --- /dev/null +++ b/bin/filter_five_end.pl @@ -0,0 +1,109 @@ +#!/usr/bin/perl +use strict; +use warnings; + +my $prev_id = ""; +my @five; +my @three; +my @unmap; +my @mid; +my @all; +my $counter = 0; + +while (){ + chomp; + if (/^@/){ + print $_."\n"; + next; + } + my ($id, $flag, $chr_from, $loc_from, $mapq, $cigar, $d1, $d2, $d3, $read, $read_qual, @rest) = split /\t/; + my $bin = reverse(dec2bin($flag)); + my @binary = split(//,$bin); + if ($prev_id ne $id && $prev_id ne ""){ + if ($counter == 1){ + if (@five == 1){ + print $five[0]."\n"; + } + else{ + my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0]; + my $bin_1 = reverse(dec2bin($flag_1)); + my @binary_1 = split(//,$bin_1); + $binary_1[2] = 1; + my $bin_1_new = reverse(join("",@binary_1)); + my $flag_1_new = bin2dec($bin_1_new); + print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n"); + } + } + elsif ($counter == 2 && @five == 1){ + print $five[0]."\n"; + } + else{ + my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0]; + my $bin_1 = reverse(dec2bin($flag_1)); + my @binary_1 = split(//,$bin_1); + $binary_1[2] = 1; + my $bin_1_new = reverse(join("",@binary_1)); + my $flag_1_new = bin2dec($bin_1_new); + print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n"); + } + + $counter = 0; + undef @unmap; + undef @five; + undef @three; + undef @mid; + undef @all; + } + + $counter++; + $prev_id = $id; + push @all,$_; + if ($binary[2]==1){ + push @unmap,$_; + } + elsif ($binary[4]==0 && $cigar =~ m/^[0-9]*M/ || $binary[4]==1 && $cigar =~ m/.*M$/){ + push @five, $_; + } + elsif ($binary[4]==1 && $cigar =~ m/^[0-9]*M/ || $binary[4]==0 && $cigar =~ m/.*M$/){ + push @three, $_; + } + elsif ($cigar =~ m/^[0-9]*[HS].*M.*[HS]$/){ + push @mid, $_; + } +} + +if ($counter == 1){ + if (@five == 1){ + print $five[0]."\n"; + } + else{ + my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0]; + my $bin_1 = reverse(dec2bin($flag_1)); + my @binary_1 = split(//,$bin_1); + $binary_1[2] = 1; + my $bin_1_new = reverse(join("",@binary_1)); + my $flag_1_new = bin2dec($bin_1_new); + print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n"); + } +} +elsif ($counter == 2 && @five == 1){ + print $five[0]."\n"; +} +else{ + my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0]; + my $bin_1 = reverse(dec2bin($flag_1)); + my @binary_1 = split(//,$bin_1); + $binary_1[2] = 1; + my $bin_1_new = reverse(join("",@binary_1)); + my $flag_1_new = bin2dec($bin_1_new); + print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n"); +} + +sub dec2bin { + my $str = unpack("B32", pack("N", shift)); + return $str; +} + +sub bin2dec { + return unpack("N", pack("B32", substr("0" x 32 . shift, -32))); +} \ No newline at end of file diff --git a/bin/generate_cram_csv.sh b/bin/generate_cram_csv.sh new file mode 100755 index 0000000..1a951da --- /dev/null +++ b/bin/generate_cram_csv.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# generate_cram_csv.sh +# ------------------- +# Generate a csv file describing the CRAM folder +# ><((((°> Y ><((((°> U ><((((°> M ><((((°> I ><((((°> +# Author = yy5 +# ><((((°> Y ><((((°> U ><((((°> M ><((((°> I ><((((°> + +# Function to process chunking of a CRAM file +chunk_cram() { + local cram=$1 + local chunkn=$2 + local outcsv=$3 + realcram=$(readlink -f ${cram}) + realcrai=$(readlink -f ${cram}.crai) + local rgline=$(samtools view -H "${realcram}" | grep "@RG" | sed 's/\t/\\t/g' | sed "s/'//g") + # local ncontainers=$(zcat "${realcrai}" | wc -l) + local ncontainers=$(cat "${realcram}" | wc -l) + local base=$(basename "${realcram}" .cram) + local from=0 + local to=10000 + + + while [ $to -lt $ncontainers ]; do + echo "${realcram},${realcrai},${from},${to},${base},${chunkn},${rgline}" >> $outcsv + from=$((to + 1)) + ((to += 10000)) + ((chunkn++)) + done + + if [ $from -le $ncontainers ]; then + echo "${realcram},${realcrai},${from},${ncontainers},${base},${chunkn},${rgline}" >> $outcsv + ((chunkn++)) + fi + + echo $chunkn +} + +# Function to process a CRAM file +process_cram_file() { + local cram=$1 + local chunkn=$2 + local outcsv=$3 + + local read_groups=$(samtools view -H "$cram" | grep '@RG' | awk '{for(i=1;i<=NF;i++){if($i ~ /^ID:/){print substr($i,4)}}}') + local num_read_groups=$(echo "$read_groups" | wc -w) + + if [ "$num_read_groups" -gt 1 ]; then + # Multiple read groups: process each separately + for rg in $read_groups; do + local output_cram="$(basename "${cram%.cram}")_output_${rg}.cram" + samtools view -h -r "$rg" -o "$output_cram" "$cram" + samtools index "$output_cram" + chunkn=$(chunk_cram "$output_cram" "$chunkn" "$outcsv") + done + else + # Single read group or no read groups + chunkn=$(chunk_cram "$cram" "$chunkn" "$outcsv") + fi + + echo $chunkn +} + +# /\_/\ /\_/\ +# ( o.o ) main ( o.o ) +# > ^ < > ^ < + +# Check if cram_path is provided +if [ -z "$1" ]; then + echo "Usage: $0 " + exit 1 +fi + +# cram_path=$1 +cram=$1 +chunkn=0 +outcsv=$2 + +# # Loop through each CRAM file in the specified directory. cram cannot be the synlinked cram +# for cram in ${cram_path}/*.cram; do +# realcram=$(readlink -f $cram) +# chunkn=$(process_cram_file $realcram $chunkn $outcsv) +# done +realcram=$(readlink -f $cram) +chunkn=$(process_cram_file $realcram $chunkn $outcsv) diff --git a/bin/grep_pg.sh b/bin/grep_pg.sh new file mode 100755 index 0000000..8f76e23 --- /dev/null +++ b/bin/grep_pg.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +# grep_pg.sh +# ------------------- +# A shell script to exclude pg lines and label read 1 and read 2 from cram containers +# +# ------------------- +# Author = yy5 + +grep -v "^\@PG" | awk '{if($1 ~ /^\@/) {print($0)} else {if(and($2,64)>0) {print(1$0)} else {print(2$0)}}}' \ No newline at end of file diff --git a/conf/base.config b/conf/base.config index ca753f3..33cc764 100644 --- a/conf/base.config +++ b/conf/base.config @@ -52,6 +52,12 @@ process { time = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) } } + withName: SAMTOOLS_INDEX { + cpus = { log_increase_cpus(2, 6*task.attempt, 1, 2) } + memory = { check_max( 4.GB + 850.MB * log_increase_cpus(2, 6*task.attempt, 1, 2) * task.attempt + 0.6.GB * Math.ceil( meta.read_count / 100000000 ), 'memory' ) } + time = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) } + } + withName: BLAST_BLASTN { time = { check_max( 2.hour * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } memory = { check_max( 100.MB + 20.MB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) } @@ -88,6 +94,21 @@ process { time = { check_max( 1.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } } + withName: CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT { + cpus = { check_max( 16 * 1 , 'cpus' ) } + memory = { check_max( 1.GB * ( reference.size() < 2e9 ? 50 : Math.ceil( ( reference.size() / 1e+9 ) * 20 ) * Math.ceil( task.attempt * 1 ) ) , 'memory') } + } + + withName: CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT { + cpus = { check_max( 16 * 1 , 'cpus' ) } + memory = { check_max( 1.GB * ( reference.size() < 2e9 ? 50 : Math.ceil( ( reference.size() / 1e+9 ) * 20 ) * Math.ceil( task.attempt * 1 ) ) , 'memory') } + } + + withName: MINIMAP2_INDEX { + cpus = { check_max( 2 * task.attempt, 'cpus' ) } + memory = { check_max( 1.GB * Math.ceil( 30 * fasta.size() / 1e+9 ) * task.attempt, 'memory' ) } + } + withName: CRUMBLE { // No correlation between memory usage and the number of reads or the genome size. // Most genomes seem happy with 1 GB, then some with 2 GB, then some with 5 GB. diff --git a/conf/modules.config b/conf/modules.config index a2d4464..88180ed 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -46,6 +46,34 @@ process { ext.args = "-be '[rq]>=0.99' -x fi -x fp -x ri -x rp --write-index" } + withName: CONVERT_CRAM { + ext.args = "--output-fmt cram" + } + + withName: SAMTOOLS_INDEX { + ext.args = "-C --output-fmt cram --write-index" + } + + withName: ".*:.*:HIC_BWAMEM2:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" { + ext.args = "" + ext.args1 = "-F0xB00 -nt" + ext.args2 = { "-5SPCp -H'${rglines}'" } + ext.args3 = "-mpu" + ext.args4 = { "--write-index -l1" } + } + + withName: ".*:.*:HIC_MINIMAP2:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" { + ext.args = "" + ext.args1 = "" + ext.args2 = { "-ax sr" } + ext.args3 = "-mpu" + ext.args4 = { "--write-index -l1" } + } + + withName: ".*:.*:HIC_MINIMAP2:MINIMAP2_INDEX" { + ext.args = { "${fasta.size() > 2.5e9 ? (" -I " + Math.ceil(fasta.size()/1e9)+"G") : ""} "} + } + // minimap2 2.24 can only work with genomes up to 4 Gbp. For larger genomes, add the -I option with the genome size in Gbp. // In fact, we can also use -I to *decrease* the memory requirements for smaller genomes // NOTE: minimap2 uses the decimal system ! 1G = 1,000,000,000 bp diff --git a/conf/test.config b/conf/test.config index d78e008..e59b389 100644 --- a/conf/test.config +++ b/conf/test.config @@ -25,6 +25,9 @@ params { // Output directory outdir = "${projectDir}/results" + // Aligner + hic_aligner = "minimap2" + // Fasta references fasta = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz" header = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.header.sam" diff --git a/modules.json b/modules.json index b7bb93d..8896416 100644 --- a/modules.json +++ b/modules.json @@ -40,6 +40,11 @@ "git_sha": "a33ef9475558c6b8da08c5f522ddaca1ec810306", "installed_by": ["modules"] }, + "minimap2/index": { + "branch": "master", + "git_sha": "72e277acfd9e61a9f1368eafb4a9e83f5bcaa9f5", + "installed_by": ["modules"] + }, "samtools/faidx": { "branch": "master", "git_sha": "04fbbc7c43cebc0b95d5b126f6d9fe4effa33519", diff --git a/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf b/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf new file mode 100644 index 0000000..666b8bd --- /dev/null +++ b/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf @@ -0,0 +1,55 @@ +process CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT { + tag "$meta.id" + label "process_high" + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-1a6fe65bd6674daba65066aa796ed8f5e8b4687b:688e175eb0db54de17822ba7810cc9e20fa06dd5-0' : + 'biocontainers/mulled-v2-1a6fe65bd6674daba65066aa796ed8f5e8b4687b:688e175eb0db54de17822ba7810cc9e20fa06dd5-0' }" + + input: + tuple val(meta), path(cramfile), path(cramindex), val(from), val(to), val(base), val(chunkid), val(rglines), val(bwaprefix), path(reference) + + output: + tuple val(meta), path("*.bam"), emit: mappedbam + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args1 = task.ext.args1 ?: '' + def args2 = task.ext.args2 ?: '' + def args3 = task.ext.args3 ?: '' + def args4 = task.ext.args4 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + // Please be aware one of the tools here required mem = 28 * reference size!!! + """ + cram_filter -n ${from}-${to} ${cramfile} - | \\ + samtools fastq ${args1} | \\ + bwa-mem2 mem -p ${bwaprefix} -t${task.cpus} -5SPCp -H'${rglines}' - | \\ + samtools fixmate ${args3} - - | \\ + samtools sort ${args4} -@${task.cpus} -T ${base}_${chunkid}_sort_tmp -o ${prefix}_${base}_${chunkid}_mem.bam - + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' ) + bwa-mem2: \$(bwa-mem2 --version | sed 's/bwa-mem2 //g') + END_VERSIONS + """ + // temp removal staden_io_lib: \$(echo \$(staden_io_lib --version 2>&1) | sed 's/^.*staden_io_lib //; s/Using.*\$//') CAUSES ERROR + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def base = "45022_3#2" + def chunkid = "1" + """ + touch ${prefix}_${base}_${chunkid}_mem.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' ) + bwamem2: \$(echo \$(bwa-mem2 version 2>&1) | sed 's/.* //') + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf b/modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf new file mode 100644 index 0000000..2127880 --- /dev/null +++ b/modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf @@ -0,0 +1,59 @@ +process CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT { + tag "$meta.id" + label "process_high" + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-1a6fe65bd6674daba65066aa796ed8f5e8b4687b:688e175eb0db54de17822ba7810cc9e20fa06dd5-0' : + 'biocontainers/mulled-v2-1a6fe65bd6674daba65066aa796ed8f5e8b4687b:688e175eb0db54de17822ba7810cc9e20fa06dd5-0' }" + + input: + tuple val(meta), path(cramfile), path(cramindex), val(from), val(to), val(base), val(chunkid), val(rglines), val(ref), path(reference) + + output: + tuple val(meta), path("*.bam"), emit: mappedbam + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args1 = task.ext.args1 ?: '' + def args2 = task.ext.args2 ?: '' + def args3 = task.ext.args3 ?: '' + def args4 = task.ext.args4 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = "1.15" // Staden_io versions break the pipeline + """ + cram_filter -n ${from}-${to} ${cramfile} - | \\ + samtools fastq ${args1} - | \\ + minimap2 -t${task.cpus} -R '${rglines}' ${args2} ${ref} - | \\ + ${projectDir}/bin/grep_pg.sh | \\ + perl ${projectDir}/bin/filter_five_end.pl | \\ + ${projectDir}/bin/awk_filter_reads.sh | \\ + samtools fixmate ${args3} - - | \\ + samtools sort ${args4} -@${task.cpus} -T ${base}_${chunkid}_sort_tmp -o ${prefix}_${base}_${chunkid}_mm.bam - + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' ) + minimap2: \$(minimap2 --version | sed 's/minimap2 //g') + staden_io: $VERSION + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def base = "45022_3#2" + def chunkid = "1" + """ + touch ${prefix}_${base}_${chunkid}_mm.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' ) + minimap2: \$(echo \$(minimap2 version 2>&1) | sed 's/.* //') + staden_io: $VERSION + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/local/generate_cram_csv.nf b/modules/local/generate_cram_csv.nf new file mode 100644 index 0000000..3e6e224 --- /dev/null +++ b/modules/local/generate_cram_csv.nf @@ -0,0 +1,35 @@ +process GENERATE_CRAM_CSV { + tag "${meta.id}" + // label 'process_tiny' + + container 'quay.io/sanger-tol/cramfilter_bwamem2_minimap2_samtools_perl:0.001-c1' + + input: + tuple val(meta), path(crampath) + + + output: + tuple val(meta), path('*.csv'), emit: csv + path "versions.yml", emit: versions + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + generate_cram_csv.sh $crampath ${prefix}_cram.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' ) + END_VERSIONS + """ + + stub: + """ + touch ${meta.id}.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' ) + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/nf-core/minimap2/index/environment.yml b/modules/nf-core/minimap2/index/environment.yml new file mode 100644 index 0000000..8a912a1 --- /dev/null +++ b/modules/nf-core/minimap2/index/environment.yml @@ -0,0 +1,7 @@ +name: minimap2_index +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::minimap2=2.28 diff --git a/modules/nf-core/minimap2/index/main.nf b/modules/nf-core/minimap2/index/main.nf new file mode 100644 index 0000000..3832021 --- /dev/null +++ b/modules/nf-core/minimap2/index/main.nf @@ -0,0 +1,44 @@ +process MINIMAP2_INDEX { + label 'process_low' + + // Note: the versions here need to match the versions used in minimap2/align + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/minimap2:2.28--he4a0461_0' : + 'biocontainers/minimap2:2.28--he4a0461_0' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("*.mmi"), emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + minimap2 \\ + -t $task.cpus \\ + -d ${fasta.baseName}.mmi \\ + $args \\ + $fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + minimap2: \$(minimap2 --version 2>&1) + END_VERSIONS + """ + + stub: + """ + touch ${fasta.baseName}.mmi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + minimap2: \$(minimap2 --version 2>&1) + END_VERSIONS + """ +} diff --git a/modules/nf-core/minimap2/index/meta.yml b/modules/nf-core/minimap2/index/meta.yml new file mode 100644 index 0000000..1d29e3f --- /dev/null +++ b/modules/nf-core/minimap2/index/meta.yml @@ -0,0 +1,43 @@ +name: minimap2_index +description: Provides fasta index required by minimap2 alignment. +keywords: + - index + - fasta + - reference +tools: + - minimap2: + description: | + A versatile pairwise aligner for genomic and spliced nucleotide sequences. + homepage: https://github.com/lh3/minimap2 + documentation: https://github.com/lh3/minimap2#uguide + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: | + Reference database in FASTA format. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - index: + type: file + description: Minimap2 fasta index. + pattern: "*.mmi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@yuukiiwa" + - "@drpatelh" +maintainers: + - "@yuukiiwa" + - "@drpatelh" diff --git a/modules/nf-core/minimap2/index/tests/main.nf.test b/modules/nf-core/minimap2/index/tests/main.nf.test new file mode 100644 index 0000000..97840ff --- /dev/null +++ b/modules/nf-core/minimap2/index/tests/main.nf.test @@ -0,0 +1,32 @@ +nextflow_process { + + name "Test Process MINIMAP2_INDEX" + script "../main.nf" + process "MINIMAP2_INDEX" + + tag "modules" + tag "modules_nfcore" + tag "minimap2" + tag "minimap2/index" + + test("minimap2 index") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } + +} \ No newline at end of file diff --git a/modules/nf-core/minimap2/index/tests/main.nf.test.snap b/modules/nf-core/minimap2/index/tests/main.nf.test.snap new file mode 100644 index 0000000..0b09882 --- /dev/null +++ b/modules/nf-core/minimap2/index/tests/main.nf.test.snap @@ -0,0 +1,68 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + [ + { + "id": "test_ref" + }, + "genome.mmi:md5,72e450f12dc691e763c697463bdb1571" + ] + ], + "1": [ + "versions.yml:md5,0fced0ee8015e7f50b82566e3db8f7b0" + ], + "index": [ + [ + { + "id": "test_ref" + }, + "genome.mmi:md5,72e450f12dc691e763c697463bdb1571" + ] + ], + "versions": [ + "versions.yml:md5,0fced0ee8015e7f50b82566e3db8f7b0" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-18T11:46:30.000058092" + }, + "minimap2 index": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "genome.mmi:md5,72e450f12dc691e763c697463bdb1571" + ] + ], + "1": [ + "versions.yml:md5,2f8340380c6741e9261a284262a90bde" + ], + "index": [ + [ + { + "id": "test" + }, + "genome.mmi:md5,72e450f12dc691e763c697463bdb1571" + ] + ], + "versions": [ + "versions.yml:md5,2f8340380c6741e9261a284262a90bde" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-05T10:58:29.828187662" + } +} \ No newline at end of file diff --git a/modules/nf-core/minimap2/index/tests/tags.yml b/modules/nf-core/minimap2/index/tests/tags.yml new file mode 100644 index 0000000..e5ef8e1 --- /dev/null +++ b/modules/nf-core/minimap2/index/tests/tags.yml @@ -0,0 +1,2 @@ +minimap2/index: + - modules/nf-core/minimap2/index/** diff --git a/nextflow.config b/nextflow.config index 437ff82..8c3c4d0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -15,7 +15,8 @@ params { bwamem2_index = null fasta = null header = null - + // Aligner option + hic_aligner = "minimap2" // Can choose minimap2 and bwamem2 // Output format options outfmt = "cram" compression = "crumble" diff --git a/seq_cache_populate.pl b/seq_cache_populate.pl new file mode 100644 index 0000000..e69de29 diff --git a/subworkflows/local/align_short_hic.nf b/subworkflows/local/align_short_hic.nf new file mode 100644 index 0000000..4a4a31d --- /dev/null +++ b/subworkflows/local/align_short_hic.nf @@ -0,0 +1,116 @@ +// +// Align short read (HiC and Illumina) data against the genome +// + +include { SAMTOOLS_FASTQ } from '../../modules/nf-core/samtools/fastq/main' +include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' +include { SAMTOOLS_SORMADUP } from '../../modules/local/samtools_sormadup' +include { SAMTOOLS_VIEW as SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/view/main' +include { GENERATE_CRAM_CSV } from '../../modules/local/generate_cram_csv' +include { SAMTOOLS_SORMADUP as CONVERT_CRAM } from '../../modules/local/samtools_sormadup' +include { HIC_MINIMAP2 } from '../../subworkflows/local/hic_minimap2' +include { HIC_BWAMEM2 } from '../../subworkflows/local/hic_bwamem2' + +workflow ALIGN_SHORT_HIC { + take: + fasta // channel: [ val(meta), /path/to/fasta ] reference_tuple + index // channel: [ val(meta), /path/to/bwamem2/ ] + reads // channel: [ val(meta), /path/to/datafile ] hic_reads_path + + + main: + ch_versions = Channel.empty() + ch_merged_bam = Channel.empty() + + // Check file types and branch + reads + | branch { + meta, reads -> + fastq : reads.findAll { it.getName().toLowerCase() =~ /.*f.*\.gz/ } + cram : true + } + | set { ch_reads } + + + // Convert FASTQ to CRAM only if FASTQ were provided as input + CONVERT_CRAM ( ch_reads.fastq, fasta ) + ch_versions = ch_versions.mix ( CONVERT_CRAM.out.versions.first() ) + + CONVERT_CRAM.out.bam + | mix ( ch_reads.cram ) + | map { meta, cram -> [ meta, cram, [] ] } + | set { ch_reads_cram } + + // Index the CRAM file + SAMTOOLS_INDEX ( ch_reads_cram, fasta, [] ) + ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions ) + + SAMTOOLS_INDEX.out.cram + | join ( SAMTOOLS_INDEX.out.crai ) + | set { ch_reads_cram } + + + // + // MODULE: generate a CRAM CSV file containing the required parametres for CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT + // + GENERATE_CRAM_CSV( ch_reads_cram ) + ch_versions = ch_versions.mix( GENERATE_CRAM_CSV.out.versions ) + + + // + // SUBWORKFLOW: mapping hic reads using minimap2 or bwamem2 + // + if (params.hic_aligner == 'minimap2') { + HIC_MINIMAP2 ( + fasta, + GENERATE_CRAM_CSV.out.csv, + index + ) + ch_versions = ch_versions.mix( HIC_MINIMAP2.out.versions ) + ch_merged_bam = ch_merged_bam.mix(HIC_MINIMAP2.out.mergedbam) + } else { + HIC_BWAMEM2 ( + fasta, + GENERATE_CRAM_CSV.out.csv, + index + ) + ch_versions = ch_versions.mix( HIC_BWAMEM2.out.versions ) + ch_merged_bam = ch_merged_bam.mix(HIC_BWAMEM2.out.mergedbam) + } + + ch_merged_bam + | combine( ch_reads_cram ) + | map { meta_bam, bam, meta_cram, cram, crai -> [ meta_cram, bam ] } + | set { ch_merged_bam } + + // // Collect all BWAMEM2 output by sample name + ch_merged_bam + | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } + | groupTuple( by: [0] ) + | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } + | branch { + meta, bams -> + single_bam: bams.size() == 1 + multi_bams: true + } + | set { ch_bams } + + + // Merge, but only if there is more than 1 file + SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] ) + ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) + + + SAMTOOLS_MERGE.out.bam + | mix ( ch_bams.single_bam ) + | set { ch_bam } + + + // Mark duplicates + SAMTOOLS_SORMADUP ( ch_bam, fasta ) + ch_versions = ch_versions.mix ( SAMTOOLS_SORMADUP.out.versions ) + + emit: + bam = SAMTOOLS_SORMADUP.out.bam // channel: [ val(meta), /path/to/bam ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/hic_bwamem2.nf b/subworkflows/local/hic_bwamem2.nf new file mode 100644 index 0000000..8a06f44 --- /dev/null +++ b/subworkflows/local/hic_bwamem2.nf @@ -0,0 +1,90 @@ +#!/usr/bin/env nextflow + +// +// MODULE IMPORT BLOCK +// +include { BWAMEM2_INDEX } from '../../modules/nf-core/bwamem2/index/main' +include { CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT } from '../../modules/local/cram_filter_align_bwamem2_fixmate_sort' +include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' + +workflow HIC_BWAMEM2 { + take: + fasta // Channel: tuple [ val(meta), path( file ) ] + csv_ch + index + + + main: + ch_versions = Channel.empty() + mappedbam_ch = Channel.empty() + + BWAMEM2_INDEX ( + fasta + ) + ch_versions = ch_versions.mix( BWAMEM2_INDEX.out.versions ) + + csv_ch + .splitCsv() + .combine ( fasta ) + .combine ( BWAMEM2_INDEX.out.index ) + .map{ cram_id, cram_info, ref_id, ref_dir, bwa_id, bwa_path -> + tuple([ + id: cram_id.id + ], + file(cram_info[0]), + cram_info[1], + cram_info[2], + cram_info[3], + cram_info[4], + cram_info[5], + cram_info[6], + bwa_path.toString() + '/' + ref_dir.toString().split('/')[-1], + ref_dir + ) + } + .set { ch_filtering_input } + + // + // MODULE: map hic reads by 10,000 container per time using bwamem2 + // + CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT ( + ch_filtering_input + + ) + ch_versions = ch_versions.mix( CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT.out.versions ) + mappedbam_ch = CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT.out.mappedbam + + // + // LOGIC: PREPARING BAMS FOR MERGE + // + mappedbam_ch + .map{ meta, file -> + tuple( file ) + } + .collect() + .map { file -> + tuple ( + [ + id: file[0].toString().split('/')[-1].split('_')[0] + '_' + file[0].toString().split('/')[-1].split('_')[1] + ], + file + ) + } + .set { collected_files_for_merge } + + + // + // MODULE: MERGE POSITION SORTED BAM FILES AND MARK DUPLICATES + // + SAMTOOLS_MERGE ( + collected_files_for_merge, + fasta, + index + ) + ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) + + + emit: + mergedbam = SAMTOOLS_MERGE.out.bam + versions = ch_versions.ifEmpty(null) +} \ No newline at end of file diff --git a/subworkflows/local/hic_minimap2.nf b/subworkflows/local/hic_minimap2.nf new file mode 100644 index 0000000..ddda9ac --- /dev/null +++ b/subworkflows/local/hic_minimap2.nf @@ -0,0 +1,101 @@ +#!/usr/bin/env nextflow + +// +// MODULE IMPORT BLOCK +// +include { CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT } from '../../modules/local/cram_filter_minimap2_filter5end_fixmate_sort' +include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' +include { MINIMAP2_INDEX } from '../../modules/nf-core/minimap2/index/main' + + +workflow HIC_MINIMAP2 { + take: + fasta // Channel: tuple [ val(meta), path( file ) ] + csv_ch + index + + + main: + ch_versions = Channel.empty() + mappedbam_ch = Channel.empty() + + + // + // MODULE: generate minimap2 mmi file + // + MINIMAP2_INDEX ( + fasta + ) + ch_versions = ch_versions.mix( MINIMAP2_INDEX.out.versions ) + + + // + // LOGIC: generate input channel for mapping + // + csv_ch + .splitCsv() + .combine ( fasta ) + .combine ( MINIMAP2_INDEX.out.index ) + .map{ cram_id, cram_info, ref_id, ref_dir, mmi_id, mmi_path-> + tuple([ + id: cram_id.id + ], + file(cram_info[0]), + cram_info[1], + cram_info[2], + cram_info[3], + cram_info[4], + cram_info[5], + cram_info[6], + mmi_path.toString(), + ref_dir + ) + } + .set { ch_filtering_input } + + + // + // MODULE: map hic reads by 10,000 container per time + // + CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT ( + ch_filtering_input + + ) + ch_versions = ch_versions.mix( CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT.out.versions ) + mappedbam_ch = CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT.out.mappedbam + + + // + // LOGIC: PREPARING BAMS FOR MERGE + // + mappedbam_ch + .map{ meta, file -> + tuple( file ) + } + .collect() + .map { file -> + tuple ( + [ + id: file[0].toString().split('/')[-1].split('_')[0] + '_' + file[0].toString().split('/')[-1].split('_')[1] + ], + file + ) + } + .set { collected_files_for_merge } + + + // + // MODULE: MERGE POSITION SORTED BAM FILES AND MARK DUPLICATES + // + SAMTOOLS_MERGE ( + collected_files_for_merge, + fasta, + index + ) + ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) + + + emit: + mergedbam = SAMTOOLS_MERGE.out.bam + versions = ch_versions.ifEmpty(null) +} \ No newline at end of file diff --git a/workflows/readmapping.nf b/workflows/readmapping.nf index c70a406..bbc0d23 100644 --- a/workflows/readmapping.nf +++ b/workflows/readmapping.nf @@ -18,7 +18,7 @@ include { SAMTOOLS_REHEADER } from '../modules/local/samtools_replaceh include { INPUT_CHECK } from '../subworkflows/local/input_check' include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome' -include { ALIGN_SHORT as ALIGN_HIC } from '../subworkflows/local/align_short' +include { ALIGN_SHORT_HIC as ALIGN_HIC } from '../subworkflows/local/align_short_hic' include { ALIGN_SHORT as ALIGN_ILLUMINA } from '../subworkflows/local/align_short' include { ALIGN_PACBIO as ALIGN_HIFI } from '../subworkflows/local/align_pacbio' include { ALIGN_PACBIO as ALIGN_CLR } from '../subworkflows/local/align_pacbio' From 168566fc2e276c567b642c6c85e3a611f7f32189 Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Mon, 2 Sep 2024 16:24:24 +0100 Subject: [PATCH 02/16] generate csv seems to be working now? --- assets/samplesheet_s3.csv | 3 - conf/modules.config | 4 - modules.json | 93 +++++-- modules/local/generate_cram_csv.nf | 4 +- .../nf-core/samtools/index/environment.yml | 8 + modules/nf-core/samtools/index/main.nf | 49 ++++ modules/nf-core/samtools/index/meta.yml | 57 ++++ .../samtools/index/tests/csi.nextflow.config | 7 + .../nf-core/samtools/index/tests/main.nf.test | 140 ++++++++++ .../samtools/index/tests/main.nf.test.snap | 250 ++++++++++++++++++ modules/nf-core/samtools/index/tests/tags.yml | 2 + subworkflows/local/align_short_hic.nf | 148 ++++++----- 12 files changed, 668 insertions(+), 97 deletions(-) create mode 100644 modules/nf-core/samtools/index/environment.yml create mode 100644 modules/nf-core/samtools/index/main.nf create mode 100644 modules/nf-core/samtools/index/meta.yml create mode 100644 modules/nf-core/samtools/index/tests/csi.nextflow.config create mode 100644 modules/nf-core/samtools/index/tests/main.nf.test create mode 100644 modules/nf-core/samtools/index/tests/main.nf.test.snap create mode 100644 modules/nf-core/samtools/index/tests/tags.yml diff --git a/assets/samplesheet_s3.csv b/assets/samplesheet_s3.csv index ff9641b..c059c97 100644 --- a/assets/samplesheet_s3.csv +++ b/assets/samplesheet_s3.csv @@ -2,6 +2,3 @@ sample,datatype,datafile,library mMelMel1,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel1/illumina/31231_3%231.subset.cram, mMelMel2,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel2/illumina/31231_4%231.subset.fastq.gz, mMelMel3,hic,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/hic/35528_2%231.subset.cram, -mMelMel3,ont,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/ont/PAE35587_pass_1f1f0707_115.subset.fastq.gz, -mMelMel3,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/pacbio/m64094_200910_173211.ccs.bc1022_BAK8B_OA--bc1022_BAK8B_OA.subset.bam, -mMelMel3,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/pacbio/m64094_200911_174739.ccs.bc1022_BAK8B_OA--bc1022_BAK8B_OA.subset.fastq.gz, diff --git a/conf/modules.config b/conf/modules.config index 88180ed..e7ed388 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -50,10 +50,6 @@ process { ext.args = "--output-fmt cram" } - withName: SAMTOOLS_INDEX { - ext.args = "-C --output-fmt cram --write-index" - } - withName: ".*:.*:HIC_BWAMEM2:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" { ext.args = "" ext.args1 = "-F0xB00 -nt" diff --git a/modules.json b/modules.json index 8896416..05aa1cc 100644 --- a/modules.json +++ b/modules.json @@ -8,93 +8,136 @@ "blast/blastn": { "branch": "master", "git_sha": "583edaf97c9373a20df05a3b7be5a6677f9cd719", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bwamem2/index": { "branch": "master", "git_sha": "7081e04c18de9480948d34513a1c1e2d0fa9126d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bwamem2/mem": { "branch": "master", "git_sha": "2201e21b09213f083832ac58e33353d410a6fde7", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "crumble": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "82024cf6325d2ee194e7f056d841ecad2f6856e9", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "gunzip": { "branch": "master", "git_sha": "4e5f4687318f24ba944a13609d3ea6ebd890737d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "minimap2/align": { "branch": "master", "git_sha": "a33ef9475558c6b8da08c5f522ddaca1ec810306", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "minimap2/index": { "branch": "master", "git_sha": "72e277acfd9e61a9f1368eafb4a9e83f5bcaa9f5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/faidx": { "branch": "master", "git_sha": "04fbbc7c43cebc0b95d5b126f6d9fe4effa33519", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/fastq": { "branch": "master", "git_sha": "04fbbc7c43cebc0b95d5b126f6d9fe4effa33519", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/flagstat": { "branch": "master", "git_sha": "46eca555142d6e597729fcb682adcc791796f514", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/idxstats": { "branch": "master", "git_sha": "46eca555142d6e597729fcb682adcc791796f514", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] + }, + "samtools/index": { + "branch": "master", + "git_sha": "46eca555142d6e597729fcb682adcc791796f514", + "installed_by": [ + "modules" + ] }, "samtools/merge": { "branch": "master", "git_sha": "04fbbc7c43cebc0b95d5b126f6d9fe4effa33519", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/samtools/merge/samtools-merge.diff" }, "samtools/stats": { "branch": "master", "git_sha": "1fe379cf6e6c1e6fa5e32bcbeefea0f1e874dac6", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/view": { "branch": "master", "git_sha": "6c2309aaec566c0d44a6cf14d4b2d0c51afe2e91", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "seqkit/fq2fa": { "branch": "master", "git_sha": "03fbf6c89e551bd8d77f3b751fb5c955f75b34c5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "seqtk/subseq": { "branch": "master", "git_sha": "730f3aee80d5f8d0b5fc532202ac59361414d006", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "untar": { "branch": "master", "git_sha": "4e5f4687318f24ba944a13609d3ea6ebd890737d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] } } }, @@ -103,20 +146,26 @@ "utils_nextflow_pipeline": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nfcore_pipeline": { "branch": "master", "git_sha": "92de218a329bfc9a9033116eb5f65fd270e72ba3", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nfvalidation_plugin": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] } } } } } -} +} \ No newline at end of file diff --git a/modules/local/generate_cram_csv.nf b/modules/local/generate_cram_csv.nf index 3e6e224..c1d8965 100644 --- a/modules/local/generate_cram_csv.nf +++ b/modules/local/generate_cram_csv.nf @@ -5,7 +5,7 @@ process GENERATE_CRAM_CSV { container 'quay.io/sanger-tol/cramfilter_bwamem2_minimap2_samtools_perl:0.001-c1' input: - tuple val(meta), path(crampath) + tuple val(meta), path(crampath), path(crai) output: @@ -32,4 +32,4 @@ process GENERATE_CRAM_CSV { samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' ) END_VERSIONS """ -} \ No newline at end of file +} diff --git a/modules/nf-core/samtools/index/environment.yml b/modules/nf-core/samtools/index/environment.yml new file mode 100644 index 0000000..260d516 --- /dev/null +++ b/modules/nf-core/samtools/index/environment.yml @@ -0,0 +1,8 @@ +name: samtools_index +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.20 + - bioconda::htslib=1.20 diff --git a/modules/nf-core/samtools/index/main.nf b/modules/nf-core/samtools/index/main.nf new file mode 100644 index 0000000..e002585 --- /dev/null +++ b/modules/nf-core/samtools/index/main.nf @@ -0,0 +1,49 @@ +process SAMTOOLS_INDEX { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.20--h50ea8bc_0' : + 'biocontainers/samtools:1.20--h50ea8bc_0' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), path("*.bai") , optional:true, emit: bai + tuple val(meta), path("*.csi") , optional:true, emit: csi + tuple val(meta), path("*.crai"), optional:true, emit: crai + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + samtools \\ + index \\ + -@ ${task.cpus-1} \\ + $args \\ + $input + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def extension = file(input).getExtension() == 'cram' ? + "crai" : args.contains("-c") ? "csi" : "bai" + """ + touch ${input}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/index/meta.yml b/modules/nf-core/samtools/index/meta.yml new file mode 100644 index 0000000..01a4ee0 --- /dev/null +++ b/modules/nf-core/samtools/index/meta.yml @@ -0,0 +1,57 @@ +name: samtools_index +description: Index SAM/BAM/CRAM file +keywords: + - index + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - crai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - csi: + type: file + description: CSI index file + pattern: "*.{csi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@maxulysse" +maintainers: + - "@drpatelh" + - "@ewels" + - "@maxulysse" diff --git a/modules/nf-core/samtools/index/tests/csi.nextflow.config b/modules/nf-core/samtools/index/tests/csi.nextflow.config new file mode 100644 index 0000000..0ed260e --- /dev/null +++ b/modules/nf-core/samtools/index/tests/csi.nextflow.config @@ -0,0 +1,7 @@ +process { + + withName: SAMTOOLS_INDEX { + ext.args = '-c' + } + +} diff --git a/modules/nf-core/samtools/index/tests/main.nf.test b/modules/nf-core/samtools/index/tests/main.nf.test new file mode 100644 index 0000000..ca34fb5 --- /dev/null +++ b/modules/nf-core/samtools/index/tests/main.nf.test @@ -0,0 +1,140 @@ +nextflow_process { + + name "Test Process SAMTOOLS_INDEX" + script "../main.nf" + process "SAMTOOLS_INDEX" + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/index" + + test("bai") { + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("crai") { + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("csi") { + config "./csi.nextflow.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + file(process.out.csi[0][1]).name, + process.out.versions + ).match() } + ) + } + } + + test("bai - stub") { + options "-stub" + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("crai - stub") { + options "-stub" + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("csi - stub") { + options "-stub" + config "./csi.nextflow.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/samtools/index/tests/main.nf.test.snap b/modules/nf-core/samtools/index/tests/main.nf.test.snap new file mode 100644 index 0000000..799d199 --- /dev/null +++ b/modules/nf-core/samtools/index/tests/main.nf.test.snap @@ -0,0 +1,250 @@ +{ + "csi - stub": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.sorted.bam.csi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77" + ], + "bai": [ + + ], + "crai": [ + + ], + "csi": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.sorted.bam.csi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T16:51:53.9057" + }, + "crai - stub": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.recalibrated.sorted.cram.crai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77" + ], + "bai": [ + + ], + "crai": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.recalibrated.sorted.cram.crai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "csi": [ + + ], + "versions": [ + "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T16:51:45.931558" + }, + "bai - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.sorted.bam.bai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77" + ], + "bai": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.sorted.bam.bai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "crai": [ + + ], + "csi": [ + + ], + "versions": [ + "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T16:51:34.807525" + }, + "csi": { + "content": [ + "test.paired_end.sorted.bam.csi", + [ + "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T16:52:55.688799" + }, + "crai": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.recalibrated.sorted.cram.crai:md5,14bc3bd5c89cacc8f4541f9062429029" + ] + ], + "3": [ + "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77" + ], + "bai": [ + + ], + "crai": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.recalibrated.sorted.cram.crai:md5,14bc3bd5c89cacc8f4541f9062429029" + ] + ], + "csi": [ + + ], + "versions": [ + "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T16:51:17.609533" + }, + "bai": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.sorted.bam.bai:md5,704c10dd1326482448ca3073fdebc2f4" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77" + ], + "bai": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.sorted.bam.bai:md5,704c10dd1326482448ca3073fdebc2f4" + ] + ], + "crai": [ + + ], + "csi": [ + + ], + "versions": [ + "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T16:51:04.16585" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/index/tests/tags.yml b/modules/nf-core/samtools/index/tests/tags.yml new file mode 100644 index 0000000..e0f58a7 --- /dev/null +++ b/modules/nf-core/samtools/index/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/index: + - modules/nf-core/samtools/index/** diff --git a/subworkflows/local/align_short_hic.nf b/subworkflows/local/align_short_hic.nf index 4a4a31d..0675236 100644 --- a/subworkflows/local/align_short_hic.nf +++ b/subworkflows/local/align_short_hic.nf @@ -5,14 +5,14 @@ include { SAMTOOLS_FASTQ } from '../../modules/nf-core/samtools/fastq/main' include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' include { SAMTOOLS_SORMADUP } from '../../modules/local/samtools_sormadup' -include { SAMTOOLS_VIEW as SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/view/main' +include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main' include { GENERATE_CRAM_CSV } from '../../modules/local/generate_cram_csv' include { SAMTOOLS_SORMADUP as CONVERT_CRAM } from '../../modules/local/samtools_sormadup' include { HIC_MINIMAP2 } from '../../subworkflows/local/hic_minimap2' include { HIC_BWAMEM2 } from '../../subworkflows/local/hic_bwamem2' workflow ALIGN_SHORT_HIC { - take: + take: fasta // channel: [ val(meta), /path/to/fasta ] reference_tuple index // channel: [ val(meta), /path/to/bwamem2/ ] reads // channel: [ val(meta), /path/to/datafile ] hic_reads_path @@ -31,86 +31,102 @@ workflow ALIGN_SHORT_HIC { } | set { ch_reads } - // Convert FASTQ to CRAM only if FASTQ were provided as input CONVERT_CRAM ( ch_reads.fastq, fasta ) - ch_versions = ch_versions.mix ( CONVERT_CRAM.out.versions.first() ) + ch_versions = ch_versions.mix ( CONVERT_CRAM.out.versions ) CONVERT_CRAM.out.bam | mix ( ch_reads.cram ) - | map { meta, cram -> [ meta, cram, [] ] } | set { ch_reads_cram } + ch_reads_cram.view() + + // fasta + // | combine( ch_reads.cram ) + // | map { meta, ref, hic_meta, hic_reads_path -> + // tuple( + // [ id: meta.id, single_end: true], + // hic_reads_path + // ) + // } + // .set { get_reads_input } + + // get_reads_input.view() + // Index the CRAM file - SAMTOOLS_INDEX ( ch_reads_cram, fasta, [] ) - ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions ) + SAMTOOLS_INDEX ( ch_reads_cram ) + ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions ) + + SAMTOOLS_INDEX.out.crai.view() - SAMTOOLS_INDEX.out.cram + ch_reads_cram | join ( SAMTOOLS_INDEX.out.crai ) - | set { ch_reads_cram } + | set { ch_reads_cram_crai } + ch_reads_cram_crai.view() // // MODULE: generate a CRAM CSV file containing the required parametres for CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT // - GENERATE_CRAM_CSV( ch_reads_cram ) - ch_versions = ch_versions.mix( GENERATE_CRAM_CSV.out.versions ) - - - // - // SUBWORKFLOW: mapping hic reads using minimap2 or bwamem2 - // - if (params.hic_aligner == 'minimap2') { - HIC_MINIMAP2 ( - fasta, - GENERATE_CRAM_CSV.out.csv, - index - ) - ch_versions = ch_versions.mix( HIC_MINIMAP2.out.versions ) - ch_merged_bam = ch_merged_bam.mix(HIC_MINIMAP2.out.mergedbam) - } else { - HIC_BWAMEM2 ( - fasta, - GENERATE_CRAM_CSV.out.csv, - index - ) - ch_versions = ch_versions.mix( HIC_BWAMEM2.out.versions ) - ch_merged_bam = ch_merged_bam.mix(HIC_BWAMEM2.out.mergedbam) - } - - ch_merged_bam - | combine( ch_reads_cram ) - | map { meta_bam, bam, meta_cram, cram, crai -> [ meta_cram, bam ] } - | set { ch_merged_bam } - - // // Collect all BWAMEM2 output by sample name - ch_merged_bam - | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } - | groupTuple( by: [0] ) - | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } - | branch { - meta, bams -> - single_bam: bams.size() == 1 - multi_bams: true - } - | set { ch_bams } - - - // Merge, but only if there is more than 1 file - SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] ) - ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) - - - SAMTOOLS_MERGE.out.bam - | mix ( ch_bams.single_bam ) - | set { ch_bam } - - - // Mark duplicates - SAMTOOLS_SORMADUP ( ch_bam, fasta ) - ch_versions = ch_versions.mix ( SAMTOOLS_SORMADUP.out.versions ) + GENERATE_CRAM_CSV( ch_reads_cram_crai ) + ch_versions = ch_versions.mix( GENERATE_CRAM_CSV.out.versions ) + + // GENERATE_CRAM_CSV.out.csv.view() + + // // + // // SUBWORKFLOW: mapping hic reads using minimap2 or bwamem2 + // // + // if (params.hic_aligner == 'minimap2') { + // HIC_MINIMAP2 ( + // fasta, + // GENERATE_CRAM_CSV.out.csv, + // index + // ) + // ch_versions = ch_versions.mix( HIC_MINIMAP2.out.versions ) + // ch_merged_bam = ch_merged_bam.mix(HIC_MINIMAP2.out.mergedbam) + // } else { + // HIC_BWAMEM2 ( + // fasta, + // GENERATE_CRAM_CSV.out.csv, + // index + // ) + // ch_versions = ch_versions.mix( HIC_BWAMEM2.out.versions ) + // ch_merged_bam = ch_merged_bam.mix(HIC_BWAMEM2.out.mergedbam) + // } + + // ch_merged_bam + // | combine( ch_reads_cram ) + // | map { meta_bam, bam, meta_cram, cram, crai -> [ meta_cram, bam ] } + // | set { ch_merged_bam } + + // // // Collect all BWAMEM2 output by sample name + // ch_merged_bam + // | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } + // | groupTuple( by: [0] ) + // | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } + // | branch { + // meta, bams -> + // single_bam: bams.size() == 1 + // multi_bams: true + // } + // | set { ch_bams } + + + // // Merge, but only if there is more than 1 file + // SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] ) + // ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) + + + // SAMTOOLS_MERGE.out.bam + // | mix ( ch_bams.single_bam ) + // | set { ch_bam } + + + // // Mark duplicates + // SAMTOOLS_SORMADUP ( ch_bam, fasta ) + // ch_versions = ch_versions.mix ( SAMTOOLS_SORMADUP.out.versions ) emit: - bam = SAMTOOLS_SORMADUP.out.bam // channel: [ val(meta), /path/to/bam ] + bam = ch_reads_cram_crai // channel: [ val(meta), /path/to/bam ] versions = ch_versions // channel: [ versions.yml ] } From d7e2761bacd136538838b819f826daf380ff85da Mon Sep 17 00:00:00 2001 From: reichan1998 Date: Tue, 3 Sep 2024 00:47:55 +0700 Subject: [PATCH 03/16] adapt generate_cram_csv.sh to handle crai file --- bin/generate_cram_csv.sh | 17 ++-- modules/local/generate_cram_csv.nf | 2 +- subworkflows/local/align_short_hic.nf | 127 ++++++++++++-------------- workflows/readmapping.nf | 2 +- 4 files changed, 70 insertions(+), 78 deletions(-) diff --git a/bin/generate_cram_csv.sh b/bin/generate_cram_csv.sh index 1a951da..b36fa5d 100755 --- a/bin/generate_cram_csv.sh +++ b/bin/generate_cram_csv.sh @@ -12,11 +12,13 @@ chunk_cram() { local cram=$1 local chunkn=$2 local outcsv=$3 + local crai=$4 realcram=$(readlink -f ${cram}) - realcrai=$(readlink -f ${cram}.crai) + # realcrai=$(readlink -f ${cram}.crai) + realcrai=$(readlink -f ${crai}) local rgline=$(samtools view -H "${realcram}" | grep "@RG" | sed 's/\t/\\t/g' | sed "s/'//g") - # local ncontainers=$(zcat "${realcrai}" | wc -l) - local ncontainers=$(cat "${realcram}" | wc -l) + local ncontainers=$(zcat "${realcrai}" | wc -l) + # local ncontainers=$(cat "${realcram}" | wc -l) local base=$(basename "${realcram}" .cram) local from=0 local to=10000 @@ -42,6 +44,7 @@ process_cram_file() { local cram=$1 local chunkn=$2 local outcsv=$3 + local crai=$4 local read_groups=$(samtools view -H "$cram" | grep '@RG' | awk '{for(i=1;i<=NF;i++){if($i ~ /^ID:/){print substr($i,4)}}}') local num_read_groups=$(echo "$read_groups" | wc -w) @@ -52,11 +55,11 @@ process_cram_file() { local output_cram="$(basename "${cram%.cram}")_output_${rg}.cram" samtools view -h -r "$rg" -o "$output_cram" "$cram" samtools index "$output_cram" - chunkn=$(chunk_cram "$output_cram" "$chunkn" "$outcsv") + chunkn=$(chunk_cram "$output_cram" "$chunkn" "$outcsv" "$crai") done else # Single read group or no read groups - chunkn=$(chunk_cram "$cram" "$chunkn" "$outcsv") + chunkn=$(chunk_cram "$cram" "$chunkn" "$outcsv" "$crai") fi echo $chunkn @@ -76,6 +79,7 @@ fi cram=$1 chunkn=0 outcsv=$2 +crai=$3 # # Loop through each CRAM file in the specified directory. cram cannot be the synlinked cram # for cram in ${cram_path}/*.cram; do @@ -83,4 +87,5 @@ outcsv=$2 # chunkn=$(process_cram_file $realcram $chunkn $outcsv) # done realcram=$(readlink -f $cram) -chunkn=$(process_cram_file $realcram $chunkn $outcsv) +realcrai=$(readlink -f $crai) +chunkn=$(process_cram_file $realcram $chunkn $outcsv $realcrai) diff --git a/modules/local/generate_cram_csv.nf b/modules/local/generate_cram_csv.nf index c1d8965..6a3b120 100644 --- a/modules/local/generate_cram_csv.nf +++ b/modules/local/generate_cram_csv.nf @@ -15,7 +15,7 @@ process GENERATE_CRAM_CSV { script: def prefix = task.ext.prefix ?: "${meta.id}" """ - generate_cram_csv.sh $crampath ${prefix}_cram.csv + generate_cram_csv.sh $crampath ${prefix}_cram.csv $crai cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/subworkflows/local/align_short_hic.nf b/subworkflows/local/align_short_hic.nf index 0675236..1055b61 100644 --- a/subworkflows/local/align_short_hic.nf +++ b/subworkflows/local/align_short_hic.nf @@ -31,6 +31,7 @@ workflow ALIGN_SHORT_HIC { } | set { ch_reads } + // Convert FASTQ to CRAM only if FASTQ were provided as input CONVERT_CRAM ( ch_reads.fastq, fasta ) ch_versions = ch_versions.mix ( CONVERT_CRAM.out.versions ) @@ -39,94 +40,80 @@ workflow ALIGN_SHORT_HIC { | mix ( ch_reads.cram ) | set { ch_reads_cram } - ch_reads_cram.view() - - // fasta - // | combine( ch_reads.cram ) - // | map { meta, ref, hic_meta, hic_reads_path -> - // tuple( - // [ id: meta.id, single_end: true], - // hic_reads_path - // ) - // } - // .set { get_reads_input } - - // get_reads_input.view() // Index the CRAM file SAMTOOLS_INDEX ( ch_reads_cram ) ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions ) - SAMTOOLS_INDEX.out.crai.view() - ch_reads_cram | join ( SAMTOOLS_INDEX.out.crai ) | set { ch_reads_cram_crai } ch_reads_cram_crai.view() + // // MODULE: generate a CRAM CSV file containing the required parametres for CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT // GENERATE_CRAM_CSV( ch_reads_cram_crai ) ch_versions = ch_versions.mix( GENERATE_CRAM_CSV.out.versions ) - // GENERATE_CRAM_CSV.out.csv.view() - - // // - // // SUBWORKFLOW: mapping hic reads using minimap2 or bwamem2 - // // - // if (params.hic_aligner == 'minimap2') { - // HIC_MINIMAP2 ( - // fasta, - // GENERATE_CRAM_CSV.out.csv, - // index - // ) - // ch_versions = ch_versions.mix( HIC_MINIMAP2.out.versions ) - // ch_merged_bam = ch_merged_bam.mix(HIC_MINIMAP2.out.mergedbam) - // } else { - // HIC_BWAMEM2 ( - // fasta, - // GENERATE_CRAM_CSV.out.csv, - // index - // ) - // ch_versions = ch_versions.mix( HIC_BWAMEM2.out.versions ) - // ch_merged_bam = ch_merged_bam.mix(HIC_BWAMEM2.out.mergedbam) - // } - - // ch_merged_bam - // | combine( ch_reads_cram ) - // | map { meta_bam, bam, meta_cram, cram, crai -> [ meta_cram, bam ] } - // | set { ch_merged_bam } - - // // // Collect all BWAMEM2 output by sample name - // ch_merged_bam - // | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } - // | groupTuple( by: [0] ) - // | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } - // | branch { - // meta, bams -> - // single_bam: bams.size() == 1 - // multi_bams: true - // } - // | set { ch_bams } - - - // // Merge, but only if there is more than 1 file - // SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] ) - // ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) - - - // SAMTOOLS_MERGE.out.bam - // | mix ( ch_bams.single_bam ) - // | set { ch_bam } - - - // // Mark duplicates - // SAMTOOLS_SORMADUP ( ch_bam, fasta ) - // ch_versions = ch_versions.mix ( SAMTOOLS_SORMADUP.out.versions ) + + // + // SUBWORKFLOW: mapping hic reads using minimap2 or bwamem2 + // + if (params.hic_aligner == 'minimap2') { + HIC_MINIMAP2 ( + fasta, + GENERATE_CRAM_CSV.out.csv, + index + ) + ch_versions = ch_versions.mix( HIC_MINIMAP2.out.versions ) + ch_merged_bam = ch_merged_bam.mix(HIC_MINIMAP2.out.mergedbam) + } else { + HIC_BWAMEM2 ( + fasta, + GENERATE_CRAM_CSV.out.csv, + index + ) + ch_versions = ch_versions.mix( HIC_BWAMEM2.out.versions ) + ch_merged_bam = ch_merged_bam.mix(HIC_BWAMEM2.out.mergedbam) + } + + ch_merged_bam + | combine( ch_reads_cram_crai ) + | map { meta_bam, bam, meta_cram, cram, crai -> [ meta_cram, bam ] } + | set { ch_merged_bam } + + + // Collect all BAM output by sample name + ch_merged_bam + | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } + | groupTuple( by: [0] ) + | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } + | branch { + meta, bams -> + single_bam: bams.size() == 1 + multi_bams: true + } + | set { ch_bams } + + + // Merge, but only if there is more than 1 file + SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] ) + ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) + + + SAMTOOLS_MERGE.out.bam + | mix ( ch_bams.single_bam ) + | set { ch_bam } + + + // Mark duplicates + SAMTOOLS_SORMADUP ( ch_bam, fasta ) + ch_versions = ch_versions.mix ( SAMTOOLS_SORMADUP.out.versions ) emit: - bam = ch_reads_cram_crai // channel: [ val(meta), /path/to/bam ] + bam = SAMTOOLS_SORMADUP.out.bam // channel: [ val(meta), /path/to/bam ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/workflows/readmapping.nf b/workflows/readmapping.nf index bbc0d23..1a06b5e 100644 --- a/workflows/readmapping.nf +++ b/workflows/readmapping.nf @@ -18,7 +18,7 @@ include { SAMTOOLS_REHEADER } from '../modules/local/samtools_replaceh include { INPUT_CHECK } from '../subworkflows/local/input_check' include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome' -include { ALIGN_SHORT_HIC as ALIGN_HIC } from '../subworkflows/local/align_short_hic' +include { ALIGN_SHORT_HIC as ALIGN_HIC } from '../subworkflows/local/align_short_hic' include { ALIGN_SHORT as ALIGN_ILLUMINA } from '../subworkflows/local/align_short' include { ALIGN_PACBIO as ALIGN_HIFI } from '../subworkflows/local/align_pacbio' include { ALIGN_PACBIO as ALIGN_CLR } from '../subworkflows/local/align_pacbio' From b67459c774762112ae21f8438b6268825a4edf16 Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Mon, 9 Sep 2024 16:02:26 +0100 Subject: [PATCH 04/16] spoof REF_PATH to disallow remote lookkup for cram --- conf/modules.config | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/conf/modules.config b/conf/modules.config index a2d4464..44d6cf0 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -12,6 +12,7 @@ process { withName: SAMTOOLS_FASTQ { + beforeScript = { "export REF_PATH=spoof"} ext.args = '-F 0x200 -nt' } @@ -24,6 +25,7 @@ process { } withName: SAMTOOLS_MERGE { + beforeScript = { "export REF_PATH=spoof"} ext.args = { "-c -p" } ext.prefix = { "${meta.id}.merge" } } @@ -35,6 +37,7 @@ process { } withName: SAMTOOLS_COLLATETOFASTA { + beforeScript = { "export REF_PATH=spoof"} ext.args = { (params.use_work_dir_as_temp ? "-T." : "") } } @@ -43,6 +46,7 @@ process { } withName: SAMTOOLS_CONVERT { + beforeScript = { "export REF_PATH=spoof"} ext.args = "-be '[rq]>=0.99' -x fi -x fp -x ri -x rp --write-index" } @@ -64,6 +68,7 @@ process { } withName: '.*:CONVERT_STATS:SAMTOOLS_CRAM' { + beforeScript = { "export REF_PATH=spoof"} ext.prefix = { "${fasta.baseName}.${meta.datatype}.${meta.id}" } ext.args = '--output-fmt cram --write-index' } @@ -82,6 +87,7 @@ process { } withName: SAMTOOLS_STATS { + beforeScript = { "export REF_PATH=spoof"} ext.prefix = { "${input.baseName}" } } @@ -91,6 +97,7 @@ process { } withName: '.*:CONVERT_STATS:SAMTOOLS_.*' { + beforeScript = { "export REF_PATH=spoof"} publishDir = [ path: { "${params.outdir}/read_mapping/${meta.datatype}" }, mode: params.publish_dir_mode, From 7c35dad4776d7ab5f32b82a70e11a40fc6837eef Mon Sep 17 00:00:00 2001 From: reichan1998 Date: Tue, 10 Sep 2024 16:11:16 +0700 Subject: [PATCH 05/16] ensure the sam cram output for HiC reads --- conf/modules.config | 6 +++--- .../cram_filter_align_bwamem2_fixmate_sort.nf | 9 +++++++-- subworkflows/local/align_short_hic.nf | 5 +---- subworkflows/local/hic_bwamem2.nf | 19 ++++++------------- subworkflows/local/hic_minimap2.nf | 3 +-- 5 files changed, 18 insertions(+), 24 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index e7ed388..a7a59cb 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -52,15 +52,15 @@ process { withName: ".*:.*:HIC_BWAMEM2:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" { ext.args = "" - ext.args1 = "-F0xB00 -nt" - ext.args2 = { "-5SPCp -H'${rglines}'" } + ext.args1 = { "-F 0x200 -nt -1 '${base}_${chunkid}_1.fastq.gz'" } + ext.args2 = { "-5SPCp -R '${rglines}'" } ext.args3 = "-mpu" ext.args4 = { "--write-index -l1" } } withName: ".*:.*:HIC_MINIMAP2:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" { ext.args = "" - ext.args1 = "" + ext.args1 = { "-F 0x200 -nt -1 '${base}_${chunkid}_1.fastq.gz'" } ext.args2 = { "-ax sr" } ext.args3 = "-mpu" ext.args4 = { "--write-index -l1" } diff --git a/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf b/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf index 666b8bd..cf5241f 100644 --- a/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf +++ b/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf @@ -7,7 +7,9 @@ process CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT { 'biocontainers/mulled-v2-1a6fe65bd6674daba65066aa796ed8f5e8b4687b:688e175eb0db54de17822ba7810cc9e20fa06dd5-0' }" input: - tuple val(meta), path(cramfile), path(cramindex), val(from), val(to), val(base), val(chunkid), val(rglines), val(bwaprefix), path(reference) + tuple val(meta), path(cramfile), path(cramindex), val(from), val(to), val(base), val(chunkid), val(rglines) + tuple val(meta2), path(fasta) + tuple val(meta3), path(index) output: tuple val(meta), path("*.bam"), emit: mappedbam @@ -24,10 +26,13 @@ process CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT { def args4 = task.ext.args4 ?: '' def prefix = task.ext.prefix ?: "${meta.id}" // Please be aware one of the tools here required mem = 28 * reference size!!! + """ + INDEX=`find -L ./ -name "*.amb" | sed 's/\\.amb\$//'` + cram_filter -n ${from}-${to} ${cramfile} - | \\ samtools fastq ${args1} | \\ - bwa-mem2 mem -p ${bwaprefix} -t${task.cpus} -5SPCp -H'${rglines}' - | \\ + bwa-mem2 mem \$INDEX -t ${task.cpus} ${args2} - | \\ samtools fixmate ${args3} - - | \\ samtools sort ${args4} -@${task.cpus} -T ${base}_${chunkid}_sort_tmp -o ${prefix}_${base}_${chunkid}_mem.bam - diff --git a/subworkflows/local/align_short_hic.nf b/subworkflows/local/align_short_hic.nf index 1055b61..ce50542 100644 --- a/subworkflows/local/align_short_hic.nf +++ b/subworkflows/local/align_short_hic.nf @@ -49,8 +49,6 @@ workflow ALIGN_SHORT_HIC { | join ( SAMTOOLS_INDEX.out.crai ) | set { ch_reads_cram_crai } - ch_reads_cram_crai.view() - // // MODULE: generate a CRAM CSV file containing the required parametres for CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT @@ -65,8 +63,7 @@ workflow ALIGN_SHORT_HIC { if (params.hic_aligner == 'minimap2') { HIC_MINIMAP2 ( fasta, - GENERATE_CRAM_CSV.out.csv, - index + GENERATE_CRAM_CSV.out.csv ) ch_versions = ch_versions.mix( HIC_MINIMAP2.out.versions ) ch_merged_bam = ch_merged_bam.mix(HIC_MINIMAP2.out.mergedbam) diff --git a/subworkflows/local/hic_bwamem2.nf b/subworkflows/local/hic_bwamem2.nf index 8a06f44..3c7176a 100644 --- a/subworkflows/local/hic_bwamem2.nf +++ b/subworkflows/local/hic_bwamem2.nf @@ -3,7 +3,6 @@ // // MODULE IMPORT BLOCK // -include { BWAMEM2_INDEX } from '../../modules/nf-core/bwamem2/index/main' include { CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT } from '../../modules/local/cram_filter_align_bwamem2_fixmate_sort' include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' @@ -18,16 +17,10 @@ workflow HIC_BWAMEM2 { ch_versions = Channel.empty() mappedbam_ch = Channel.empty() - BWAMEM2_INDEX ( - fasta - ) - ch_versions = ch_versions.mix( BWAMEM2_INDEX.out.versions ) csv_ch .splitCsv() - .combine ( fasta ) - .combine ( BWAMEM2_INDEX.out.index ) - .map{ cram_id, cram_info, ref_id, ref_dir, bwa_id, bwa_path -> + .map{ cram_id, cram_info -> tuple([ id: cram_id.id ], @@ -37,9 +30,7 @@ workflow HIC_BWAMEM2 { cram_info[3], cram_info[4], cram_info[5], - cram_info[6], - bwa_path.toString() + '/' + ref_dir.toString().split('/')[-1], - ref_dir + cram_info[6] ) } .set { ch_filtering_input } @@ -48,7 +39,9 @@ workflow HIC_BWAMEM2 { // MODULE: map hic reads by 10,000 container per time using bwamem2 // CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT ( - ch_filtering_input + ch_filtering_input, + fasta, + index ) ch_versions = ch_versions.mix( CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT.out.versions ) @@ -79,7 +72,7 @@ workflow HIC_BWAMEM2 { SAMTOOLS_MERGE ( collected_files_for_merge, fasta, - index + [ [], [] ] ) ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) diff --git a/subworkflows/local/hic_minimap2.nf b/subworkflows/local/hic_minimap2.nf index ddda9ac..8637c83 100644 --- a/subworkflows/local/hic_minimap2.nf +++ b/subworkflows/local/hic_minimap2.nf @@ -12,7 +12,6 @@ workflow HIC_MINIMAP2 { take: fasta // Channel: tuple [ val(meta), path( file ) ] csv_ch - index main: @@ -90,7 +89,7 @@ workflow HIC_MINIMAP2 { SAMTOOLS_MERGE ( collected_files_for_merge, fasta, - index + [ [], [] ] ) ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) From 842fb5d9bddb27f09df11999967acb90401f0157 Mon Sep 17 00:00:00 2001 From: reichan1998 Date: Tue, 10 Sep 2024 18:39:31 +0700 Subject: [PATCH 06/16] include hic_aligner in nextflow_schema.json --- assets/samplesheet_s3.csv | 3 +++ nextflow_schema.json | 16 ++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/assets/samplesheet_s3.csv b/assets/samplesheet_s3.csv index c059c97..ff9641b 100644 --- a/assets/samplesheet_s3.csv +++ b/assets/samplesheet_s3.csv @@ -2,3 +2,6 @@ sample,datatype,datafile,library mMelMel1,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel1/illumina/31231_3%231.subset.cram, mMelMel2,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel2/illumina/31231_4%231.subset.fastq.gz, mMelMel3,hic,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/hic/35528_2%231.subset.cram, +mMelMel3,ont,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/ont/PAE35587_pass_1f1f0707_115.subset.fastq.gz, +mMelMel3,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/pacbio/m64094_200910_173211.ccs.bc1022_BAK8B_OA--bc1022_BAK8B_OA.subset.bam, +mMelMel3,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/pacbio/m64094_200911_174739.ccs.bc1022_BAK8B_OA--bc1022_BAK8B_OA.subset.fastq.gz, diff --git a/nextflow_schema.json b/nextflow_schema.json index 2209d6b..5b5b4e5 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -57,6 +57,22 @@ } } }, + "aligner_options": { + "title": "Aligner options", + "type": "object", + "fa_icon": "fas fa-code", + "description": "Options for the aligner used in the pipeline.", + "required": ["hic_aligner"], + "properties": { + "hic_aligner": { + "type": "string", + "description": "The aligner to be used for HiC reads.", + "enum": ["bwamem2", "minimap2"], + "default": "minimap2", + "fa_icon": "fas fa-code" + } + } + }, "reference_genome_options": { "title": "Reference genome options", "type": "object", From 3fd90a66e1b3e616885d65a68274565557cc6b7a Mon Sep 17 00:00:00 2001 From: reichan1998 Date: Tue, 10 Sep 2024 22:21:43 +0700 Subject: [PATCH 07/16] fix linting --- bin/awk_filter_reads.sh | 2 +- bin/filter_five_end.pl | 2 +- bin/generate_cram_csv.sh | 2 +- bin/grep_pg.sh | 2 +- conf/base.config | 2 +- modules.json | 90 +++++-------------- .../cram_filter_align_bwamem2_fixmate_sort.nf | 4 +- ...filter_minimap2_filter5end_fixmate_sort.nf | 2 +- nextflow_schema.json | 11 +-- subworkflows/local/align_short.nf | 4 +- subworkflows/local/align_short_hic.nf | 2 +- subworkflows/local/hic_bwamem2.nf | 4 +- subworkflows/local/hic_minimap2.nf | 14 +-- workflows/readmapping.nf | 2 +- 14 files changed, 45 insertions(+), 98 deletions(-) diff --git a/bin/awk_filter_reads.sh b/bin/awk_filter_reads.sh index 3066f11..d50aa9e 100755 --- a/bin/awk_filter_reads.sh +++ b/bin/awk_filter_reads.sh @@ -1 +1 @@ -awk 'BEGIN{OFS="\t"}{if($1 ~ /^\@/) {print($0)} else {$2=and($2,compl(2048)); print(substr($0,2))}}' \ No newline at end of file +awk 'BEGIN{OFS="\t"}{if($1 ~ /^\@/) {print($0)} else {$2=and($2,compl(2048)); print(substr($0,2))}}' diff --git a/bin/filter_five_end.pl b/bin/filter_five_end.pl index 0beb2ac..6f9b4d5 100644 --- a/bin/filter_five_end.pl +++ b/bin/filter_five_end.pl @@ -106,4 +106,4 @@ sub dec2bin { sub bin2dec { return unpack("N", pack("B32", substr("0" x 32 . shift, -32))); -} \ No newline at end of file +} diff --git a/bin/generate_cram_csv.sh b/bin/generate_cram_csv.sh index b36fa5d..89920fd 100755 --- a/bin/generate_cram_csv.sh +++ b/bin/generate_cram_csv.sh @@ -88,4 +88,4 @@ crai=$3 # done realcram=$(readlink -f $cram) realcrai=$(readlink -f $crai) -chunkn=$(process_cram_file $realcram $chunkn $outcsv $realcrai) +chunkn=$(process_cram_file $realcram $chunkn $outcsv $realcrai) diff --git a/bin/grep_pg.sh b/bin/grep_pg.sh index 8f76e23..680b5ec 100755 --- a/bin/grep_pg.sh +++ b/bin/grep_pg.sh @@ -7,4 +7,4 @@ # ------------------- # Author = yy5 -grep -v "^\@PG" | awk '{if($1 ~ /^\@/) {print($0)} else {if(and($2,64)>0) {print(1$0)} else {print(2$0)}}}' \ No newline at end of file +grep -v "^\@PG" | awk '{if($1 ~ /^\@/) {print($0)} else {if(and($2,64)>0) {print(1$0)} else {print(2$0)}}}' diff --git a/conf/base.config b/conf/base.config index 33cc764..cd54c75 100644 --- a/conf/base.config +++ b/conf/base.config @@ -57,7 +57,7 @@ process { memory = { check_max( 4.GB + 850.MB * log_increase_cpus(2, 6*task.attempt, 1, 2) * task.attempt + 0.6.GB * Math.ceil( meta.read_count / 100000000 ), 'memory' ) } time = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) } } - + withName: BLAST_BLASTN { time = { check_max( 2.hour * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } memory = { check_max( 100.MB + 20.MB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) } diff --git a/modules.json b/modules.json index 05aa1cc..8e24d3e 100644 --- a/modules.json +++ b/modules.json @@ -8,136 +8,98 @@ "blast/blastn": { "branch": "master", "git_sha": "583edaf97c9373a20df05a3b7be5a6677f9cd719", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "bwamem2/index": { "branch": "master", "git_sha": "7081e04c18de9480948d34513a1c1e2d0fa9126d", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "bwamem2/mem": { "branch": "master", "git_sha": "2201e21b09213f083832ac58e33353d410a6fde7", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "crumble": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "82024cf6325d2ee194e7f056d841ecad2f6856e9", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "gunzip": { "branch": "master", "git_sha": "4e5f4687318f24ba944a13609d3ea6ebd890737d", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "minimap2/align": { "branch": "master", "git_sha": "a33ef9475558c6b8da08c5f522ddaca1ec810306", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "minimap2/index": { "branch": "master", "git_sha": "72e277acfd9e61a9f1368eafb4a9e83f5bcaa9f5", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/faidx": { "branch": "master", "git_sha": "04fbbc7c43cebc0b95d5b126f6d9fe4effa33519", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/fastq": { "branch": "master", "git_sha": "04fbbc7c43cebc0b95d5b126f6d9fe4effa33519", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/flagstat": { "branch": "master", "git_sha": "46eca555142d6e597729fcb682adcc791796f514", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/idxstats": { "branch": "master", "git_sha": "46eca555142d6e597729fcb682adcc791796f514", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/index": { "branch": "master", "git_sha": "46eca555142d6e597729fcb682adcc791796f514", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/merge": { "branch": "master", "git_sha": "04fbbc7c43cebc0b95d5b126f6d9fe4effa33519", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/samtools/merge/samtools-merge.diff" }, "samtools/stats": { "branch": "master", "git_sha": "1fe379cf6e6c1e6fa5e32bcbeefea0f1e874dac6", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/view": { "branch": "master", "git_sha": "6c2309aaec566c0d44a6cf14d4b2d0c51afe2e91", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "seqkit/fq2fa": { "branch": "master", "git_sha": "03fbf6c89e551bd8d77f3b751fb5c955f75b34c5", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "seqtk/subseq": { "branch": "master", "git_sha": "730f3aee80d5f8d0b5fc532202ac59361414d006", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "untar": { "branch": "master", "git_sha": "4e5f4687318f24ba944a13609d3ea6ebd890737d", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] } } }, @@ -146,26 +108,20 @@ "utils_nextflow_pipeline": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "utils_nfcore_pipeline": { "branch": "master", "git_sha": "92de218a329bfc9a9033116eb5f65fd270e72ba3", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "utils_nfvalidation_plugin": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] } } } } } -} \ No newline at end of file +} diff --git a/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf b/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf index cf5241f..9b5ca29 100644 --- a/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf +++ b/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf @@ -26,7 +26,7 @@ process CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT { def args4 = task.ext.args4 ?: '' def prefix = task.ext.prefix ?: "${meta.id}" // Please be aware one of the tools here required mem = 28 * reference size!!! - + """ INDEX=`find -L ./ -name "*.amb" | sed 's/\\.amb\$//'` @@ -57,4 +57,4 @@ process CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT { bwamem2: \$(echo \$(bwa-mem2 version 2>&1) | sed 's/.* //') END_VERSIONS """ -} \ No newline at end of file +} diff --git a/modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf b/modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf index 2127880..8d8d69e 100644 --- a/modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf +++ b/modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf @@ -56,4 +56,4 @@ process CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT { staden_io: $VERSION END_VERSIONS """ -} \ No newline at end of file +} diff --git a/nextflow_schema.json b/nextflow_schema.json index 5b5b4e5..63a6999 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -54,16 +54,7 @@ "fa_icon": "fas fa-envelope", "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" - } - } - }, - "aligner_options": { - "title": "Aligner options", - "type": "object", - "fa_icon": "fas fa-code", - "description": "Options for the aligner used in the pipeline.", - "required": ["hic_aligner"], - "properties": { + }, "hic_aligner": { "type": "string", "description": "The aligner to be used for HiC reads.", diff --git a/subworkflows/local/align_short.nf b/subworkflows/local/align_short.nf index e74b480..cebc194 100644 --- a/subworkflows/local/align_short.nf +++ b/subworkflows/local/align_short.nf @@ -31,8 +31,8 @@ workflow ALIGN_SHORT { // Convert from CRAM to FASTQ only if CRAM files were provided as input SAMTOOLS_FASTQ ( ch_reads.cram, false ) ch_versions = ch_versions.mix ( SAMTOOLS_FASTQ.out.versions.first() ) - - + + SAMTOOLS_FASTQ.out.fastq | mix ( ch_reads.fastq ) | set { ch_reads_fastq } diff --git a/subworkflows/local/align_short_hic.nf b/subworkflows/local/align_short_hic.nf index ce50542..39e0cab 100644 --- a/subworkflows/local/align_short_hic.nf +++ b/subworkflows/local/align_short_hic.nf @@ -82,7 +82,7 @@ workflow ALIGN_SHORT_HIC { | map { meta_bam, bam, meta_cram, cram, crai -> [ meta_cram, bam ] } | set { ch_merged_bam } - + // Collect all BAM output by sample name ch_merged_bam | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } diff --git a/subworkflows/local/hic_bwamem2.nf b/subworkflows/local/hic_bwamem2.nf index 3c7176a..edf4691 100644 --- a/subworkflows/local/hic_bwamem2.nf +++ b/subworkflows/local/hic_bwamem2.nf @@ -65,7 +65,7 @@ workflow HIC_BWAMEM2 { } .set { collected_files_for_merge } - + // // MODULE: MERGE POSITION SORTED BAM FILES AND MARK DUPLICATES // @@ -80,4 +80,4 @@ workflow HIC_BWAMEM2 { emit: mergedbam = SAMTOOLS_MERGE.out.bam versions = ch_versions.ifEmpty(null) -} \ No newline at end of file +} diff --git a/subworkflows/local/hic_minimap2.nf b/subworkflows/local/hic_minimap2.nf index 8637c83..23ab866 100644 --- a/subworkflows/local/hic_minimap2.nf +++ b/subworkflows/local/hic_minimap2.nf @@ -20,8 +20,8 @@ workflow HIC_MINIMAP2 { // - // MODULE: generate minimap2 mmi file - // + // MODULE: generate minimap2 mmi file + // MINIMAP2_INDEX ( fasta ) @@ -30,7 +30,7 @@ workflow HIC_MINIMAP2 { // // LOGIC: generate input channel for mapping - // + // csv_ch .splitCsv() .combine ( fasta ) @@ -55,7 +55,7 @@ workflow HIC_MINIMAP2 { // // MODULE: map hic reads by 10,000 container per time - // + // CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT ( ch_filtering_input @@ -92,9 +92,9 @@ workflow HIC_MINIMAP2 { [ [], [] ] ) ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) - - + + emit: mergedbam = SAMTOOLS_MERGE.out.bam versions = ch_versions.ifEmpty(null) -} \ No newline at end of file +} diff --git a/workflows/readmapping.nf b/workflows/readmapping.nf index 1a06b5e..a5e9575 100644 --- a/workflows/readmapping.nf +++ b/workflows/readmapping.nf @@ -104,7 +104,7 @@ workflow READMAPPING { // ALIGN_HIC ( PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.bwaidx, ch_reads.hic ) ch_versions = ch_versions.mix ( ALIGN_HIC.out.versions ) - + ALIGN_ILLUMINA ( PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.bwaidx, ch_reads.illumina ) ch_versions = ch_versions.mix ( ALIGN_ILLUMINA.out.versions ) From 69693036fc70779742071a061f6778cf4ebc96c8 Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Wed, 11 Sep 2024 15:12:23 +0100 Subject: [PATCH 08/16] schema update; hic_ to short_aligner and added chunksize --- nextflow.config | 4 +++- nextflow_schema.json | 30 +++++++++++++++++++++++------- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/nextflow.config b/nextflow.config index 8c3c4d0..9e1b306 100644 --- a/nextflow.config +++ b/nextflow.config @@ -16,7 +16,9 @@ params { fasta = null header = null // Aligner option - hic_aligner = "minimap2" // Can choose minimap2 and bwamem2 + short_aligner = "minimap2" // Can choose minimap2 and bwamem2 + chunk_size = 10000 + // Output format options outfmt = "cram" compression = "crumble" diff --git a/nextflow_schema.json b/nextflow_schema.json index 63a6999..91eb9f6 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -54,13 +54,6 @@ "fa_icon": "fas fa-envelope", "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" - }, - "hic_aligner": { - "type": "string", - "description": "The aligner to be used for HiC reads.", - "enum": ["bwamem2", "minimap2"], - "default": "minimap2", - "fa_icon": "fas fa-code" } } }, @@ -93,6 +86,26 @@ } } }, + "aligner_options": { + "title": "Aligner options", + "type": "object", + "description": "", + "default": "", + "properties": { + "short_aligner": { + "type": "string", + "default": "minimap2", + "description": "Alignment method to use for short-reads (options are: \"minimap2\" or \"bwamem2\"", + "enum": ["minimap2", "bwamem2"] + }, + "chunk_size": { + "type": "integer", + "default": 10000, + "description": "Chunk size for parallel processing of CRAM files, in number of containers", + "minimum": 1 + } + } + }, "execution": { "title": "Execution", "type": "object", @@ -292,6 +305,9 @@ { "$ref": "#/definitions/reference_genome_options" }, + { + "$ref": "#/definitions/aligner_options" + }, { "$ref": "#/definitions/execution" }, From 6ee5f51e5e953f552ad3a8a73d9d23b076d8cde5 Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Wed, 11 Sep 2024 15:12:42 +0100 Subject: [PATCH 09/16] made chunk_size an argument and fix off-by-1 error --- bin/generate_cram_csv.sh | 58 +++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/bin/generate_cram_csv.sh b/bin/generate_cram_csv.sh index 89920fd..71af968 100755 --- a/bin/generate_cram_csv.sh +++ b/bin/generate_cram_csv.sh @@ -7,36 +7,38 @@ # Author = yy5 # ><((((°> Y ><((((°> U ><((((°> M ><((((°> I ><((((°> +# NOTE: chunk_size is the number of containers per chunk (not records/reads) + # Function to process chunking of a CRAM file + chunk_cram() { local cram=$1 local chunkn=$2 local outcsv=$3 local crai=$4 - realcram=$(readlink -f ${cram}) - # realcrai=$(readlink -f ${cram}.crai) - realcrai=$(readlink -f ${crai}) + local chunk_size=$5 + local rgline=$(samtools view -H "${realcram}" | grep "@RG" | sed 's/\t/\\t/g' | sed "s/'//g") local ncontainers=$(zcat "${realcrai}" | wc -l) - # local ncontainers=$(cat "${realcram}" | wc -l) local base=$(basename "${realcram}" .cram) local from=0 - local to=10000 - + local to=$((chunk_size - 1)) while [ $to -lt $ncontainers ]; do + echo "chunk $chunkn: $from - $to" echo "${realcram},${realcrai},${from},${to},${base},${chunkn},${rgline}" >> $outcsv from=$((to + 1)) - ((to += 10000)) + to=$((to + chunk_size)) ((chunkn++)) done - if [ $from -le $ncontainers ]; then - echo "${realcram},${realcrai},${from},${ncontainers},${base},${chunkn},${rgline}" >> $outcsv + # Catch any remainder + if [ $from -lt $ncontainers ]; then + to=$((ncontainers - 1)) + echo "chunk $chunkn: $from - $to" + echo "${realcram},${realcrai},${from},${to},${base},${chunkn},${rgline}" >> $outcsv ((chunkn++)) fi - - echo $chunkn } # Function to process a CRAM file @@ -45,24 +47,23 @@ process_cram_file() { local chunkn=$2 local outcsv=$3 local crai=$4 + local chunk_size=$5 local read_groups=$(samtools view -H "$cram" | grep '@RG' | awk '{for(i=1;i<=NF;i++){if($i ~ /^ID:/){print substr($i,4)}}}') local num_read_groups=$(echo "$read_groups" | wc -w) - if [ "$num_read_groups" -gt 1 ]; then # Multiple read groups: process each separately for rg in $read_groups; do local output_cram="$(basename "${cram%.cram}")_output_${rg}.cram" samtools view -h -r "$rg" -o "$output_cram" "$cram" - samtools index "$output_cram" - chunkn=$(chunk_cram "$output_cram" "$chunkn" "$outcsv" "$crai") + #chunkn=$(chunk_cram "$output_cram" "$chunkn" "$outcsv" "$crai" "$chunk_size") + chunk_cram "$output_cram" "$chunkn" "$outcsv" "$crai" "$chunk_size" done else # Single read group or no read groups - chunkn=$(chunk_cram "$cram" "$chunkn" "$outcsv" "$crai") + #chunkn=$(chunk_cram "$cram" "$chunkn" "$outcsv" "$crai" "$chunk_size") + chunk_cram "$cram" "$chunkn" "$outcsv" "$crai" "$chunk_size" fi - - echo $chunkn } # /\_/\ /\_/\ @@ -70,22 +71,25 @@ process_cram_file() { # > ^ < > ^ < # Check if cram_path is provided -if [ -z "$1" ]; then - echo "Usage: $0 " +if [ $# -lt 3 ]; then + echo "Usage: $0 " exit 1 fi -# cram_path=$1 cram=$1 -chunkn=0 outcsv=$2 crai=$3 +if [ -z "$4" ]; then + chunk_size=10000 +else + chunk_size=$4 +fi +chunkn=0 -# # Loop through each CRAM file in the specified directory. cram cannot be the synlinked cram -# for cram in ${cram_path}/*.cram; do -# realcram=$(readlink -f $cram) -# chunkn=$(process_cram_file $realcram $chunkn $outcsv) -# done +# Operates on a single CRAM file realcram=$(readlink -f $cram) realcrai=$(readlink -f $crai) -chunkn=$(process_cram_file $realcram $chunkn $outcsv $realcrai) +if [ -f "$outcsv" ]; then + rm "$outcsv" +fi +process_cram_file $realcram $chunkn $outcsv $realcrai $chunk_size From 513229d22e413f6ebafdd337d66df5c3547e4964 Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Wed, 11 Sep 2024 15:14:32 +0100 Subject: [PATCH 10/16] change hic_ to _mapreduce subworkflows --- subworkflows/local/align_short_hic.nf | 6 +- subworkflows/local/bwamem2_mapreduce.nf | 84 +++++++++++++++++++ subworkflows/local/minimap2_mapreduce.nf | 101 +++++++++++++++++++++++ 3 files changed, 188 insertions(+), 3 deletions(-) create mode 100644 subworkflows/local/bwamem2_mapreduce.nf create mode 100644 subworkflows/local/minimap2_mapreduce.nf diff --git a/subworkflows/local/align_short_hic.nf b/subworkflows/local/align_short_hic.nf index 39e0cab..92c2010 100644 --- a/subworkflows/local/align_short_hic.nf +++ b/subworkflows/local/align_short_hic.nf @@ -8,8 +8,8 @@ include { SAMTOOLS_SORMADUP } from '../../modules/local/samt include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main' include { GENERATE_CRAM_CSV } from '../../modules/local/generate_cram_csv' include { SAMTOOLS_SORMADUP as CONVERT_CRAM } from '../../modules/local/samtools_sormadup' -include { HIC_MINIMAP2 } from '../../subworkflows/local/hic_minimap2' -include { HIC_BWAMEM2 } from '../../subworkflows/local/hic_bwamem2' +include { MINIMAP2_MAPREDUCE as HIC_MINIMAP2 } from '../../subworkflows/local/minimap2_mapreduce' +include { BWAMEM2_MAPREDUCE as HIC_BWAMEM2 } from '../../subworkflows/local/bwamem2_mapreduce' workflow ALIGN_SHORT_HIC { take: @@ -60,7 +60,7 @@ workflow ALIGN_SHORT_HIC { // // SUBWORKFLOW: mapping hic reads using minimap2 or bwamem2 // - if (params.hic_aligner == 'minimap2') { + if (params.short_aligner.startsWith("minimap")) { HIC_MINIMAP2 ( fasta, GENERATE_CRAM_CSV.out.csv diff --git a/subworkflows/local/bwamem2_mapreduce.nf b/subworkflows/local/bwamem2_mapreduce.nf new file mode 100644 index 0000000..ceaff1b --- /dev/null +++ b/subworkflows/local/bwamem2_mapreduce.nf @@ -0,0 +1,84 @@ +#!/usr/bin/env nextflow + +// +// MODULE IMPORT BLOCK +// +include { CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT } from '../../modules/local/cram_filter_align_bwamem2_fixmate_sort' +include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' + +workflow BWAMEM2_MAPREDUCE { + take: + fasta // Channel: tuple [ val(meta), path( file ) ] + csv_ch + index + + + main: + ch_versions = Channel.empty() + mappedbam_ch = Channel.empty() + + + csv_ch + .splitCsv() + .map{ cram_id, cram_info -> + tuple([ + id: cram_id.id, + chunk_id: cram_id.id + "_" + cram_info[5] + ], + file(cram_info[0]), + cram_info[1], + cram_info[2], + cram_info[3], + cram_info[4], + cram_info[5], + cram_info[6] + ) + } + .set { ch_filtering_input } + + // + // MODULE: map hic reads by 10,000 container per time using bwamem2 + // + CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT ( + ch_filtering_input, + fasta, + index + + ) + ch_versions = ch_versions.mix( CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT.out.versions ) + mappedbam_ch = CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT.out.mappedbam + + // + // LOGIC: PREPARING BAMS FOR MERGE + // + mappedbam_ch + .map{ meta, file -> + tuple( file ) + } + .collect() + .map { file -> + tuple ( + [ + id: file[0].toString().split('/')[-1].split('_')[0] + '_' + file[0].toString().split('/')[-1].split('_')[1] + ], + file + ) + } + .set { collected_files_for_merge } + + + // + // MODULE: MERGE POSITION SORTED BAM FILES AND MARK DUPLICATES + // + SAMTOOLS_MERGE ( + collected_files_for_merge, + fasta, + [ [], [] ] + ) + ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) + + + emit: + mergedbam = SAMTOOLS_MERGE.out.bam + versions = ch_versions.ifEmpty(null) +} diff --git a/subworkflows/local/minimap2_mapreduce.nf b/subworkflows/local/minimap2_mapreduce.nf new file mode 100644 index 0000000..35b5aae --- /dev/null +++ b/subworkflows/local/minimap2_mapreduce.nf @@ -0,0 +1,101 @@ +#!/usr/bin/env nextflow + +// +// MODULE IMPORT BLOCK +// +include { CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT } from '../../modules/local/cram_filter_minimap2_filter5end_fixmate_sort' +include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' +include { MINIMAP2_INDEX } from '../../modules/nf-core/minimap2/index/main' + + +workflow MINIMAP2_MAPREDUCE { + take: + fasta // Channel: tuple [ val(meta), path( file ) ] + csv_ch + + + main: + ch_versions = Channel.empty() + mappedbam_ch = Channel.empty() + + + // + // MODULE: generate minimap2 mmi file + // + MINIMAP2_INDEX ( + fasta + ) + ch_versions = ch_versions.mix( MINIMAP2_INDEX.out.versions ) + + + // + // LOGIC: generate input channel for mapping + // + csv_ch + .splitCsv() + .combine ( fasta ) + .combine ( MINIMAP2_INDEX.out.index ) + .map{ cram_id, cram_info, ref_id, ref_dir, mmi_id, mmi_path-> + tuple([ + id: cram_id.id, + chunk_id: cram_id.id + "_" + cram_info[5] + ], + file(cram_info[0]), + cram_info[1], + cram_info[2], + cram_info[3], + cram_info[4], + cram_info[5], + cram_info[6], + mmi_path.toString(), + ref_dir + ) + } + .set { ch_filtering_input } + + + // + // MODULE: map hic reads by 10,000 container per time + // + CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT ( + ch_filtering_input + + ) + ch_versions = ch_versions.mix( CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT.out.versions ) + mappedbam_ch = CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT.out.mappedbam + + + // + // LOGIC: PREPARING BAMS FOR MERGE + // + mappedbam_ch + .map{ meta, file -> + tuple( file ) + } + .collect() + .map { file -> + tuple ( + [ + id: file[0].toString().split('/')[-1].split('_')[0] + '_' + file[0].toString().split('/')[-1].split('_')[1] + ], + file + ) + } + .set { collected_files_for_merge } + + + // + // MODULE: MERGE POSITION SORTED BAM FILES AND MARK DUPLICATES + // + SAMTOOLS_MERGE ( + collected_files_for_merge, + fasta, + [ [], [] ] + ) + ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) + + + emit: + mergedbam = SAMTOOLS_MERGE.out.bam + versions = ch_versions.ifEmpty(null) +} From 11c3c314c1857f3648f5fe8cc51e8e599f6d0f1f Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Wed, 11 Sep 2024 15:15:23 +0100 Subject: [PATCH 11/16] change hic_ to _mapreduce subworkflows --- subworkflows/local/hic_bwamem2.nf | 83 ------------------------ subworkflows/local/hic_minimap2.nf | 100 ----------------------------- 2 files changed, 183 deletions(-) delete mode 100644 subworkflows/local/hic_bwamem2.nf delete mode 100644 subworkflows/local/hic_minimap2.nf diff --git a/subworkflows/local/hic_bwamem2.nf b/subworkflows/local/hic_bwamem2.nf deleted file mode 100644 index edf4691..0000000 --- a/subworkflows/local/hic_bwamem2.nf +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env nextflow - -// -// MODULE IMPORT BLOCK -// -include { CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT } from '../../modules/local/cram_filter_align_bwamem2_fixmate_sort' -include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' - -workflow HIC_BWAMEM2 { - take: - fasta // Channel: tuple [ val(meta), path( file ) ] - csv_ch - index - - - main: - ch_versions = Channel.empty() - mappedbam_ch = Channel.empty() - - - csv_ch - .splitCsv() - .map{ cram_id, cram_info -> - tuple([ - id: cram_id.id - ], - file(cram_info[0]), - cram_info[1], - cram_info[2], - cram_info[3], - cram_info[4], - cram_info[5], - cram_info[6] - ) - } - .set { ch_filtering_input } - - // - // MODULE: map hic reads by 10,000 container per time using bwamem2 - // - CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT ( - ch_filtering_input, - fasta, - index - - ) - ch_versions = ch_versions.mix( CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT.out.versions ) - mappedbam_ch = CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT.out.mappedbam - - // - // LOGIC: PREPARING BAMS FOR MERGE - // - mappedbam_ch - .map{ meta, file -> - tuple( file ) - } - .collect() - .map { file -> - tuple ( - [ - id: file[0].toString().split('/')[-1].split('_')[0] + '_' + file[0].toString().split('/')[-1].split('_')[1] - ], - file - ) - } - .set { collected_files_for_merge } - - - // - // MODULE: MERGE POSITION SORTED BAM FILES AND MARK DUPLICATES - // - SAMTOOLS_MERGE ( - collected_files_for_merge, - fasta, - [ [], [] ] - ) - ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) - - - emit: - mergedbam = SAMTOOLS_MERGE.out.bam - versions = ch_versions.ifEmpty(null) -} diff --git a/subworkflows/local/hic_minimap2.nf b/subworkflows/local/hic_minimap2.nf deleted file mode 100644 index 23ab866..0000000 --- a/subworkflows/local/hic_minimap2.nf +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env nextflow - -// -// MODULE IMPORT BLOCK -// -include { CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT } from '../../modules/local/cram_filter_minimap2_filter5end_fixmate_sort' -include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' -include { MINIMAP2_INDEX } from '../../modules/nf-core/minimap2/index/main' - - -workflow HIC_MINIMAP2 { - take: - fasta // Channel: tuple [ val(meta), path( file ) ] - csv_ch - - - main: - ch_versions = Channel.empty() - mappedbam_ch = Channel.empty() - - - // - // MODULE: generate minimap2 mmi file - // - MINIMAP2_INDEX ( - fasta - ) - ch_versions = ch_versions.mix( MINIMAP2_INDEX.out.versions ) - - - // - // LOGIC: generate input channel for mapping - // - csv_ch - .splitCsv() - .combine ( fasta ) - .combine ( MINIMAP2_INDEX.out.index ) - .map{ cram_id, cram_info, ref_id, ref_dir, mmi_id, mmi_path-> - tuple([ - id: cram_id.id - ], - file(cram_info[0]), - cram_info[1], - cram_info[2], - cram_info[3], - cram_info[4], - cram_info[5], - cram_info[6], - mmi_path.toString(), - ref_dir - ) - } - .set { ch_filtering_input } - - - // - // MODULE: map hic reads by 10,000 container per time - // - CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT ( - ch_filtering_input - - ) - ch_versions = ch_versions.mix( CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT.out.versions ) - mappedbam_ch = CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT.out.mappedbam - - - // - // LOGIC: PREPARING BAMS FOR MERGE - // - mappedbam_ch - .map{ meta, file -> - tuple( file ) - } - .collect() - .map { file -> - tuple ( - [ - id: file[0].toString().split('/')[-1].split('_')[0] + '_' + file[0].toString().split('/')[-1].split('_')[1] - ], - file - ) - } - .set { collected_files_for_merge } - - - // - // MODULE: MERGE POSITION SORTED BAM FILES AND MARK DUPLICATES - // - SAMTOOLS_MERGE ( - collected_files_for_merge, - fasta, - [ [], [] ] - ) - ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) - - - emit: - mergedbam = SAMTOOLS_MERGE.out.bam - versions = ch_versions.ifEmpty(null) -} From fe7dd7a9bffa9215e98fbc74fd3a8ab28e0a5854 Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Wed, 11 Sep 2024 15:19:45 +0100 Subject: [PATCH 12/16] now passes params.chunk_size --- modules/local/generate_cram_csv.nf | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/modules/local/generate_cram_csv.nf b/modules/local/generate_cram_csv.nf index 6a3b120..d64933f 100644 --- a/modules/local/generate_cram_csv.nf +++ b/modules/local/generate_cram_csv.nf @@ -1,8 +1,10 @@ process GENERATE_CRAM_CSV { tag "${meta.id}" - // label 'process_tiny' + label 'process_single' - container 'quay.io/sanger-tol/cramfilter_bwamem2_minimap2_samtools_perl:0.001-c1' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'quay.io/sanger-tol/cramfilter_bwamem2_minimap2_samtools_perl:0.001-c1' : + 'sanger-tol/cramfilter_bwamem2_minimap2_samtools_perl:0.001-c1' }" input: tuple val(meta), path(crampath), path(crai) @@ -15,7 +17,7 @@ process GENERATE_CRAM_CSV { script: def prefix = task.ext.prefix ?: "${meta.id}" """ - generate_cram_csv.sh $crampath ${prefix}_cram.csv $crai + generate_cram_csv.sh $crampath ${prefix}_cram.csv $crai ${params.chunk_size} cat <<-END_VERSIONS > versions.yml "${task.process}": From b26d3403daa79b87eb2a46255fee74e3c22c7fd5 Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Wed, 11 Sep 2024 15:20:10 +0100 Subject: [PATCH 13/16] tag with chunk_id so process tracking is accurate --- modules/local/cram_filter_align_bwamem2_fixmate_sort.nf | 2 +- modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf b/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf index 9b5ca29..667e49b 100644 --- a/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf +++ b/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf @@ -1,5 +1,5 @@ process CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT { - tag "$meta.id" + tag "$meta.chunk_id" label "process_high" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf b/modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf index 8d8d69e..ceafb86 100644 --- a/modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf +++ b/modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf @@ -1,5 +1,5 @@ process CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT { - tag "$meta.id" + tag "$meta.chunk_id" label "process_high" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? From fe4d66081acf993c6fcc2a11073c1c0ca95cede3 Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Wed, 11 Sep 2024 16:18:19 +0100 Subject: [PATCH 14/16] samtools/addreplacerg module --- conf/modules.config | 21 ++++++++++ modules/local/samtools_addreplacerg.nf | 56 ++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 modules/local/samtools_addreplacerg.nf diff --git a/conf/modules.config b/conf/modules.config index a7a59cb..3fea793 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -50,6 +50,22 @@ process { ext.args = "--output-fmt cram" } + withName: ".*:.*:ILLUMINA_BWAMEM2:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" { + ext.args = "" + ext.args1 = { "-F 0x200 -nt -1 '${base}_${chunkid}_1.fastq.gz'" } + ext.args2 = { "-R '${rglines}'" } + ext.args3 = "-mpu" + ext.args4 = { "--write-index -l1" } + } + + withName: ".*:.*:ILLUMINA_MINIMAP2:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" { + ext.args = "" + ext.args1 = { "-F 0x200 -nt -1 '${base}_${chunkid}_1.fastq.gz'" } + ext.args2 = { "-ax sr" } + ext.args3 = "-mpu" + ext.args4 = { "--write-index -l1" } + } + withName: ".*:.*:HIC_BWAMEM2:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" { ext.args = "" ext.args1 = { "-F 0x200 -nt -1 '${base}_${chunkid}_1.fastq.gz'" } @@ -105,6 +121,11 @@ process { ext.prefix = { "${bam.baseName}" } } + withName: SAMTOOLS_ADDREPLACERG { + ext.prefix = { "${input.baseName}_addRG" } + ext.args = { "-r ${meta.read_group}" } + } + withName: SAMTOOLS_STATS { ext.prefix = { "${input.baseName}" } } diff --git a/modules/local/samtools_addreplacerg.nf b/modules/local/samtools_addreplacerg.nf new file mode 100644 index 0000000..5b59500 --- /dev/null +++ b/modules/local/samtools_addreplacerg.nf @@ -0,0 +1,56 @@ +process SAMTOOLS_ADDREPLACERG { + tag "$meta.id" + label 'process_low' + + conda "bioconda::samtools=1.20" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.20--h50ea8bc_0' : + 'biocontainers/samtools:1.20--h50ea8bc_0' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), path("${prefix}.bam"), emit: bam, optional: true + tuple val(meta), path("${prefix}.cram"), emit: cram, optional: true + tuple val(meta), path("${prefix}.sam"), emit: sam, optional: true + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + file_type = input.getExtension() + if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + samtools \\ + addreplacerg \\ + --threads ${task.cpus-1} \\ + $args \\ + -o ${prefix}.${file_type} \\ + $input + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + file_type = input.getExtension() + if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + + """ + touch ${prefix}.${file_type} + ${index} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} From 17bbbc747974bb8a6831720ad62c8687b9efc959 Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Thu, 12 Sep 2024 15:05:56 +0100 Subject: [PATCH 15/16] minor changes... currently working --- bin/generate_cram_csv.sh | 8 ++++++-- conf/modules.config | 8 ++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/bin/generate_cram_csv.sh b/bin/generate_cram_csv.sh index 71af968..9e56831 100755 --- a/bin/generate_cram_csv.sh +++ b/bin/generate_cram_csv.sh @@ -21,11 +21,15 @@ chunk_cram() { local rgline=$(samtools view -H "${realcram}" | grep "@RG" | sed 's/\t/\\t/g' | sed "s/'//g") local ncontainers=$(zcat "${realcrai}" | wc -l) local base=$(basename "${realcram}" .cram) + + if [ $chunk_size -gt $ncontainers ]; then + chunk_size=$((ncontainers - 1)) + fi local from=0 local to=$((chunk_size - 1)) while [ $to -lt $ncontainers ]; do - echo "chunk $chunkn: $from - $to" + #echo "chunk $chunkn: $from - $to" echo "${realcram},${realcrai},${from},${to},${base},${chunkn},${rgline}" >> $outcsv from=$((to + 1)) to=$((to + chunk_size)) @@ -35,7 +39,7 @@ chunk_cram() { # Catch any remainder if [ $from -lt $ncontainers ]; then to=$((ncontainers - 1)) - echo "chunk $chunkn: $from - $to" + #echo "chunk $chunkn: $from - $to" echo "${realcram},${realcrai},${from},${to},${base},${chunkn},${rgline}" >> $outcsv ((chunkn++)) fi diff --git a/conf/modules.config b/conf/modules.config index 3fea793..028ef95 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -52,7 +52,7 @@ process { withName: ".*:.*:ILLUMINA_BWAMEM2:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" { ext.args = "" - ext.args1 = { "-F 0x200 -nt -1 '${base}_${chunkid}_1.fastq.gz'" } + ext.args1 = { "" } ext.args2 = { "-R '${rglines}'" } ext.args3 = "-mpu" ext.args4 = { "--write-index -l1" } @@ -60,7 +60,7 @@ process { withName: ".*:.*:ILLUMINA_MINIMAP2:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" { ext.args = "" - ext.args1 = { "-F 0x200 -nt -1 '${base}_${chunkid}_1.fastq.gz'" } + ext.args1 = { "" } ext.args2 = { "-ax sr" } ext.args3 = "-mpu" ext.args4 = { "--write-index -l1" } @@ -68,7 +68,7 @@ process { withName: ".*:.*:HIC_BWAMEM2:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" { ext.args = "" - ext.args1 = { "-F 0x200 -nt -1 '${base}_${chunkid}_1.fastq.gz'" } + ext.args1 = { "" } ext.args2 = { "-5SPCp -R '${rglines}'" } ext.args3 = "-mpu" ext.args4 = { "--write-index -l1" } @@ -76,7 +76,7 @@ process { withName: ".*:.*:HIC_MINIMAP2:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" { ext.args = "" - ext.args1 = { "-F 0x200 -nt -1 '${base}_${chunkid}_1.fastq.gz'" } + ext.args1 = { "" } ext.args2 = { "-ax sr" } ext.args3 = "-mpu" ext.args4 = { "--write-index -l1" } From a7db73770e216868862f856082bc028b9a0fdf72 Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Mon, 16 Sep 2024 12:02:56 +0100 Subject: [PATCH 16/16] generalise mapreduce to illumina --- conf/modules.config | 36 ++++---- conf/test.config | 2 +- modules/local/generate_cram_csv.nf | 2 +- modules/local/samtools_addreplacerg.nf | 2 +- nextflow.config | 1 + subworkflows/local/align_short.nf | 86 +++++++++++++----- subworkflows/local/align_short_hic.nf | 116 ------------------------ subworkflows/local/bwamem2_mapreduce.nf | 3 +- subworkflows/local/prepare_genome.nf | 6 +- workflows/readmapping.nf | 2 +- 10 files changed, 92 insertions(+), 164 deletions(-) delete mode 100644 subworkflows/local/align_short_hic.nf diff --git a/conf/modules.config b/conf/modules.config index 028ef95..ebf3fb0 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -15,14 +15,6 @@ process { ext.args = '-F 0x200 -nt' } - withName: '.*:.*:ALIGN_HIC:BWAMEM2_MEM' { - ext.args = { "-5SPCp -R ${meta.read_group}" } - } - - withName: '.*:.*:ALIGN_ILLUMINA:BWAMEM2_MEM' { - ext.args = { "-R ${meta.read_group}" } - } - withName: SAMTOOLS_MERGE { ext.args = { "-c -p" } ext.prefix = { "${meta.id}.merge" } @@ -50,31 +42,39 @@ process { ext.args = "--output-fmt cram" } - withName: ".*:.*:ILLUMINA_BWAMEM2:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" { + withName: '.*:.*:ALIGN_HIC:BWAMEM2_MEM' { + ext.args = { "-5SPCp -R ${meta.read_group}" } + } + + withName: '.*:.*:ALIGN_ILLUMINA:BWAMEM2_MEM' { + ext.args = { "-p -R ${meta.read_group}" } + } + + withName: ".*:ALIGN_ILLUMINA:.*:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" { ext.args = "" - ext.args1 = { "" } - ext.args2 = { "-R '${rglines}'" } + ext.args1 = { "-F 0x200 -nt" } + ext.args2 = { "-p -R '${rglines}'" } ext.args3 = "-mpu" ext.args4 = { "--write-index -l1" } } - withName: ".*:.*:ILLUMINA_MINIMAP2:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" { + withName: ".*:ALIGN_ILLUMINA:.*:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" { ext.args = "" - ext.args1 = { "" } + ext.args1 = { "-F 0x200 -nt" } ext.args2 = { "-ax sr" } ext.args3 = "-mpu" ext.args4 = { "--write-index -l1" } } - withName: ".*:.*:HIC_BWAMEM2:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" { + withName: ".*:ALIGN_HIC:.*:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" { ext.args = "" - ext.args1 = { "" } + ext.args1 = { "-F 0x200 -nt" } ext.args2 = { "-5SPCp -R '${rglines}'" } ext.args3 = "-mpu" ext.args4 = { "--write-index -l1" } } - withName: ".*:.*:HIC_MINIMAP2:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" { + withName: ".*:ALIGN_HIC:.*:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" { ext.args = "" ext.args1 = { "" } ext.args2 = { "-ax sr" } @@ -82,7 +82,7 @@ process { ext.args4 = { "--write-index -l1" } } - withName: ".*:.*:HIC_MINIMAP2:MINIMAP2_INDEX" { + withName: "MINIMAP2_INDEX" { ext.args = { "${fasta.size() > 2.5e9 ? (" -I " + Math.ceil(fasta.size()/1e9)+"G") : ""} "} } @@ -123,7 +123,7 @@ process { withName: SAMTOOLS_ADDREPLACERG { ext.prefix = { "${input.baseName}_addRG" } - ext.args = { "-r ${meta.read_group}" } + ext.args = { "-r ${meta.read_group} --no-PG" } } withName: SAMTOOLS_STATS { diff --git a/conf/test.config b/conf/test.config index e59b389..add9178 100644 --- a/conf/test.config +++ b/conf/test.config @@ -26,7 +26,7 @@ params { outdir = "${projectDir}/results" // Aligner - hic_aligner = "minimap2" + short_aligner = "minimap2" // Fasta references fasta = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz" diff --git a/modules/local/generate_cram_csv.nf b/modules/local/generate_cram_csv.nf index d64933f..f3cfe96 100644 --- a/modules/local/generate_cram_csv.nf +++ b/modules/local/generate_cram_csv.nf @@ -3,7 +3,7 @@ process GENERATE_CRAM_CSV { label 'process_single' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'quay.io/sanger-tol/cramfilter_bwamem2_minimap2_samtools_perl:0.001-c1' : + 'https://quay.io/sanger-tol/cramfilter_bwamem2_minimap2_samtools_perl:0.001-c1' : 'sanger-tol/cramfilter_bwamem2_minimap2_samtools_perl:0.001-c1' }" input: diff --git a/modules/local/samtools_addreplacerg.nf b/modules/local/samtools_addreplacerg.nf index 5b59500..6515c3e 100644 --- a/modules/local/samtools_addreplacerg.nf +++ b/modules/local/samtools_addreplacerg.nf @@ -27,7 +27,7 @@ process SAMTOOLS_ADDREPLACERG { """ samtools \\ addreplacerg \\ - --threads ${task.cpus-1} \\ + --threads $task.cpus \\ $args \\ -o ${prefix}.${file_type} \\ $input diff --git a/nextflow.config b/nextflow.config index 9e1b306..d143247 100644 --- a/nextflow.config +++ b/nextflow.config @@ -15,6 +15,7 @@ params { bwamem2_index = null fasta = null header = null + // Aligner option short_aligner = "minimap2" // Can choose minimap2 and bwamem2 chunk_size = 10000 diff --git a/subworkflows/local/align_short.nf b/subworkflows/local/align_short.nf index cebc194..4f49cdc 100644 --- a/subworkflows/local/align_short.nf +++ b/subworkflows/local/align_short.nf @@ -2,21 +2,26 @@ // Align short read (HiC and Illumina) data against the genome // -include { SAMTOOLS_FASTQ } from '../../modules/nf-core/samtools/fastq/main' -include { BWAMEM2_MEM } from '../../modules/nf-core/bwamem2/mem/main' -include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' -include { SAMTOOLS_SORMADUP } from '../../modules/local/samtools_sormadup' - +include { SAMTOOLS_FASTQ } from '../../modules/nf-core/samtools/fastq/main' +include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' +include { SAMTOOLS_SORMADUP } from '../../modules/local/samtools_sormadup' +include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main' +include { GENERATE_CRAM_CSV } from '../../modules/local/generate_cram_csv' +include { SAMTOOLS_SORMADUP as CONVERT_CRAM } from '../../modules/local/samtools_sormadup' +include { SAMTOOLS_ADDREPLACERG } from '../../modules/local/samtools_addreplacerg' +include { MINIMAP2_MAPREDUCE } from '../../subworkflows/local/minimap2_mapreduce' +include { BWAMEM2_MAPREDUCE } from '../../subworkflows/local/bwamem2_mapreduce' workflow ALIGN_SHORT { take: - fasta // channel: [ val(meta), /path/to/fasta ] + fasta // channel: [ val(meta), /path/to/fasta ] reference_tuple index // channel: [ val(meta), /path/to/bwamem2/ ] - reads // channel: [ val(meta), /path/to/datafile ] + reads // channel: [ val(meta), /path/to/datafile ] hic_reads_path main: ch_versions = Channel.empty() + ch_merged_bam = Channel.empty() // Check file types and branch reads @@ -28,23 +33,60 @@ workflow ALIGN_SHORT { | set { ch_reads } - // Convert from CRAM to FASTQ only if CRAM files were provided as input - SAMTOOLS_FASTQ ( ch_reads.cram, false ) - ch_versions = ch_versions.mix ( SAMTOOLS_FASTQ.out.versions.first() ) - - - SAMTOOLS_FASTQ.out.fastq - | mix ( ch_reads.fastq ) - | set { ch_reads_fastq } - + // Convert FASTQ to CRAM only if FASTQ were provided as input + CONVERT_CRAM ( ch_reads.fastq, fasta ) + ch_versions = ch_versions.mix ( CONVERT_CRAM.out.versions ) + + SAMTOOLS_ADDREPLACERG ( CONVERT_CRAM.out.bam ) + ch_versions = ch_versions.mix ( SAMTOOLS_ADDREPLACERG.out.versions ) + + SAMTOOLS_ADDREPLACERG.out.cram + | mix ( ch_reads.cram ) + | set { ch_reads_cram } + + // Index the CRAM file + SAMTOOLS_INDEX ( ch_reads_cram ) + ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions ) + + ch_reads_cram + | join ( SAMTOOLS_INDEX.out.crai ) + | set { ch_reads_cram_crai } + + + // + // MODULE: generate a CRAM CSV file containing the required parametres for CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT + // + GENERATE_CRAM_CSV( ch_reads_cram_crai ) + ch_versions = ch_versions.mix( GENERATE_CRAM_CSV.out.versions ) + + // + // SUBWORKFLOW: mapping hic reads using minimap2 or bwamem2 + // + if (params.short_aligner.startsWith("minimap")) { + MINIMAP2_MAPREDUCE ( + fasta, + GENERATE_CRAM_CSV.out.csv + ) + ch_versions = ch_versions.mix( MINIMAP2_MAPREDUCE.out.versions ) + ch_merged_bam = ch_merged_bam.mix(MINIMAP2_MAPREDUCE.out.mergedbam) + } else { + BWAMEM2_MAPREDUCE ( + fasta, + GENERATE_CRAM_CSV.out.csv, + index + ) + ch_versions = ch_versions.mix( BWAMEM2_MAPREDUCE.out.versions ) + ch_merged_bam = ch_merged_bam.mix(BWAMEM2_MAPREDUCE.out.mergedbam) + } - // Align Fastq to Genome and output sorted BAM - BWAMEM2_MEM ( ch_reads_fastq, index, fasta, true ) - ch_versions = ch_versions.mix ( BWAMEM2_MEM.out.versions.first() ) + ch_merged_bam + | combine( ch_reads_cram_crai ) + | map { meta_bam, bam, meta_cram, cram, crai -> [ meta_cram, bam ] } + | set { ch_merged_bam } - // Collect all BWAMEM2 output by sample name - BWAMEM2_MEM.out.bam + // Collect all BAM output by sample name + ch_merged_bam | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } | groupTuple( by: [0] ) | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } @@ -58,7 +100,7 @@ workflow ALIGN_SHORT { // Merge, but only if there is more than 1 file SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] ) - ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) + ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions ) SAMTOOLS_MERGE.out.bam diff --git a/subworkflows/local/align_short_hic.nf b/subworkflows/local/align_short_hic.nf deleted file mode 100644 index 92c2010..0000000 --- a/subworkflows/local/align_short_hic.nf +++ /dev/null @@ -1,116 +0,0 @@ -// -// Align short read (HiC and Illumina) data against the genome -// - -include { SAMTOOLS_FASTQ } from '../../modules/nf-core/samtools/fastq/main' -include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' -include { SAMTOOLS_SORMADUP } from '../../modules/local/samtools_sormadup' -include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main' -include { GENERATE_CRAM_CSV } from '../../modules/local/generate_cram_csv' -include { SAMTOOLS_SORMADUP as CONVERT_CRAM } from '../../modules/local/samtools_sormadup' -include { MINIMAP2_MAPREDUCE as HIC_MINIMAP2 } from '../../subworkflows/local/minimap2_mapreduce' -include { BWAMEM2_MAPREDUCE as HIC_BWAMEM2 } from '../../subworkflows/local/bwamem2_mapreduce' - -workflow ALIGN_SHORT_HIC { - take: - fasta // channel: [ val(meta), /path/to/fasta ] reference_tuple - index // channel: [ val(meta), /path/to/bwamem2/ ] - reads // channel: [ val(meta), /path/to/datafile ] hic_reads_path - - - main: - ch_versions = Channel.empty() - ch_merged_bam = Channel.empty() - - // Check file types and branch - reads - | branch { - meta, reads -> - fastq : reads.findAll { it.getName().toLowerCase() =~ /.*f.*\.gz/ } - cram : true - } - | set { ch_reads } - - - // Convert FASTQ to CRAM only if FASTQ were provided as input - CONVERT_CRAM ( ch_reads.fastq, fasta ) - ch_versions = ch_versions.mix ( CONVERT_CRAM.out.versions ) - - CONVERT_CRAM.out.bam - | mix ( ch_reads.cram ) - | set { ch_reads_cram } - - - // Index the CRAM file - SAMTOOLS_INDEX ( ch_reads_cram ) - ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions ) - - ch_reads_cram - | join ( SAMTOOLS_INDEX.out.crai ) - | set { ch_reads_cram_crai } - - - // - // MODULE: generate a CRAM CSV file containing the required parametres for CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT - // - GENERATE_CRAM_CSV( ch_reads_cram_crai ) - ch_versions = ch_versions.mix( GENERATE_CRAM_CSV.out.versions ) - - - // - // SUBWORKFLOW: mapping hic reads using minimap2 or bwamem2 - // - if (params.short_aligner.startsWith("minimap")) { - HIC_MINIMAP2 ( - fasta, - GENERATE_CRAM_CSV.out.csv - ) - ch_versions = ch_versions.mix( HIC_MINIMAP2.out.versions ) - ch_merged_bam = ch_merged_bam.mix(HIC_MINIMAP2.out.mergedbam) - } else { - HIC_BWAMEM2 ( - fasta, - GENERATE_CRAM_CSV.out.csv, - index - ) - ch_versions = ch_versions.mix( HIC_BWAMEM2.out.versions ) - ch_merged_bam = ch_merged_bam.mix(HIC_BWAMEM2.out.mergedbam) - } - - ch_merged_bam - | combine( ch_reads_cram_crai ) - | map { meta_bam, bam, meta_cram, cram, crai -> [ meta_cram, bam ] } - | set { ch_merged_bam } - - - // Collect all BAM output by sample name - ch_merged_bam - | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } - | groupTuple( by: [0] ) - | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } - | branch { - meta, bams -> - single_bam: bams.size() == 1 - multi_bams: true - } - | set { ch_bams } - - - // Merge, but only if there is more than 1 file - SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] ) - ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) - - - SAMTOOLS_MERGE.out.bam - | mix ( ch_bams.single_bam ) - | set { ch_bam } - - - // Mark duplicates - SAMTOOLS_SORMADUP ( ch_bam, fasta ) - ch_versions = ch_versions.mix ( SAMTOOLS_SORMADUP.out.versions ) - - emit: - bam = SAMTOOLS_SORMADUP.out.bam // channel: [ val(meta), /path/to/bam ] - versions = ch_versions // channel: [ versions.yml ] -} diff --git a/subworkflows/local/bwamem2_mapreduce.nf b/subworkflows/local/bwamem2_mapreduce.nf index ceaff1b..13711fb 100644 --- a/subworkflows/local/bwamem2_mapreduce.nf +++ b/subworkflows/local/bwamem2_mapreduce.nf @@ -36,8 +36,9 @@ workflow BWAMEM2_MAPREDUCE { } .set { ch_filtering_input } + // - // MODULE: map hic reads by 10,000 container per time using bwamem2 + // MODULE: map hic reads in each chunk using bwamem2 // CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT ( ch_filtering_input, diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf index aa951b6..f7a4754 100644 --- a/subworkflows/local/prepare_genome.nf +++ b/subworkflows/local/prepare_genome.nf @@ -30,7 +30,7 @@ workflow PREPARE_GENOME { // Unmask genome fasta UNMASK ( ch_fasta ) - ch_versions = ch_versions.mix ( UNMASK.out.versions.first() ) + ch_versions = ch_versions.mix ( UNMASK.out.versions ) // Generate BWA index if ( checkShortReads( params.input ) ) { @@ -42,14 +42,14 @@ workflow PREPARE_GENOME { if ( params.bwamem2_index.endsWith('.tar.gz') ) { ch_bwamem2_index = UNTAR ( ch_bwamem ).untar - ch_versions = ch_versions.mix ( UNTAR.out.versions.first() ) + ch_versions = ch_versions.mix ( UNTAR.out.versions ) } else { ch_bwamem2_index = ch_bwamem } } else { ch_bwamem2_index = BWAMEM2_INDEX ( UNMASK.out.fasta ).index - ch_versions = ch_versions.mix ( BWAMEM2_INDEX.out.versions.first() ) + ch_versions = ch_versions.mix ( BWAMEM2_INDEX.out.versions ) } } else { ch_bwamem2_index = Channel.empty() diff --git a/workflows/readmapping.nf b/workflows/readmapping.nf index a5e9575..a71c77e 100644 --- a/workflows/readmapping.nf +++ b/workflows/readmapping.nf @@ -18,7 +18,7 @@ include { SAMTOOLS_REHEADER } from '../modules/local/samtools_replaceh include { INPUT_CHECK } from '../subworkflows/local/input_check' include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome' -include { ALIGN_SHORT_HIC as ALIGN_HIC } from '../subworkflows/local/align_short_hic' +include { ALIGN_SHORT as ALIGN_HIC } from '../subworkflows/local/align_short' include { ALIGN_SHORT as ALIGN_ILLUMINA } from '../subworkflows/local/align_short' include { ALIGN_PACBIO as ALIGN_HIFI } from '../subworkflows/local/align_pacbio' include { ALIGN_PACBIO as ALIGN_CLR } from '../subworkflows/local/align_pacbio'