Add subsample and branch hic_mapping #192

Merged: 14 commits, Jan 12, 2024
Changes from 11 commits
7 changes: 4 additions & 3 deletions assets/local_testing/nxOscSUBSET.yaml
@@ -8,9 +8,10 @@ assembly:
reference_file: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_SUBSET/assembly/draft/SUBSET_genome/Oscheius_SUBSET.fasta
#/lustre/scratch123/tol/resources/treeval/nextflow_test_data/Oscheius_DF5033/assembly/draft/DF5033.hifiasm.noTelos.20211120/DF5033.noTelos.hifiasm.purged.noCont.noMito.fasta
assem_reads:
pacbio: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_SUBSET/genomic_data/pacbio/
hic: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/genomic_data/nxOscSpes1/hic-arima2/subset/
supplementary: path
longread_type: hifi
longread_data: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_SUBSET/genomic_data/pacbio/
hic_data: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/genomic_data/nxOscSpes1/hic-arima2/subset/
supplementary_data: path
alignment:
data_dir: /lustre/scratch123/tol/resources/treeval/gene_alignment_data/
common_name: "" # For future implementation (adding bee, wasp, ant etc)
4 changes: 2 additions & 2 deletions conf/modules.config
@@ -319,11 +319,11 @@ process {
ext.args = "pairs -0 -c1 3 -p1 4 -c2 7 -p2 8"
}

withName: ".*:.*:HIC_MAPPING:SAMTOOLS_MARKDUP" {
withName: ".*:.*:(HIC_BAMTOBED_COOLER|HIC_BAMTOBED_JUICER):SAMTOOLS_MARKDUP" {
ext.prefix = { "${meta.id}_mkdup" }
}

withName: ".*:.*:HIC_MAPPING:SAMTOOLS_MERGE" {
withName: ".*:.*:(HIC_BAMTOBED_COOLER|HIC_BAMTOBED_JUICER):SAMTOOLS_MERGE" {
ext.prefix = { "${meta.id}_merged" }
}

43 changes: 43 additions & 0 deletions modules/local/subsample_bam.nf
@@ -0,0 +1,43 @@
process SUBSAMPLE_BAM {
tag "${meta.id}"
label 'process_tiny'

conda "bioconda::samtools=1.17"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' :
'biocontainers/samtools:1.17--h00cdaf9_0' }"

input:
tuple val(meta), path(mergedbam)

output:
tuple val(meta), path('*.bam'), emit: subsampled_bam
path "versions.yml", emit: versions

shell:
def prefix = task.ext.prefix ?: "${meta.id}"
'''
percentage=`wc -c !{mergedbam} | cut -d$' ' -f1 | awk '{printf "%.2f\\n", 50000000000 / $0}'`

if awk "BEGIN {exit !($percentage <= 1 )}"; then
samtools view -s $percentage -b !{mergedbam} > !{meta.id}_subsampled.bam
Review comment (Contributor): ${percentage}

Reply (Contributor Author): Which one should change? The one in the if is being removed. The samtools parameter references a shell variable, whilst !{mergedbam} is a Nextflow variable.

else
mv !{mergedbam} !{meta.id}_subsampled.bam
fi

cat <<-END_VERSIONS > versions.yml
"!{task.process}":
samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )
END_VERSIONS
'''

stub:
"""
touch ${meta.id}_subsampled.bam

cat <<-END_VERSIONS > versions.yml
"${task.process}":
samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )
END_VERSIONS
"""
}
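
The reply in the comment thread above hinges on the difference between bash variables and Nextflow variables inside a shell: block. A minimal standalone sketch of that distinction (the process name and input file are illustrative, not part of this PR):

process INTERPOLATION_DEMO {
    input:
    path(infile)

    output:
    stdout

    shell:
    '''
    # threshold is a plain bash variable; bash expands ${threshold} at run time
    threshold=0.5

    # !{infile} is substituted by Nextflow before the script is handed to bash
    echo "nextflow path: !{infile} | bash value: ${threshold}"
    '''
}

workflow {
    // 'example.bam' is a placeholder input file
    INTERPOLATION_DEMO ( file('example.bam') )
    INTERPOLATION_DEMO.out.view()
}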
64 changes: 64 additions & 0 deletions subworkflows/local/hic_bamtobed.nf
@@ -0,0 +1,64 @@
#!/usr/bin/env nextflow

// This subworkflow converts .bam to .bed for the hic_mapping subworkflow.
// It runs markdup, sort and get paired contacts.
// Input - Assembled genomic fasta file, .bam file
// Output - sorted .bed and paired contact .bed

//
// MODULE IMPORT BLOCK
//
include { SAMTOOLS_MARKDUP } from '../../modules/nf-core/samtools/markdup/main'
include { BAMTOBED_SORT } from '../../modules/local/bamtobed_sort.nf'
include { GET_PAIRED_CONTACT_BED } from '../../modules/local/get_paired_contact_bed'


workflow HIC_BAMTOBED {
take:
bam_file // Channel: tuple [ val(meta), path( file ) ]
reference_tuple // Channel: tuple [ val(meta), path( file ) ]

main:
ch_versions = Channel.empty()

//
// LOGIC: PREPARE MARKDUP INPUT
//
bam_file
.combine( reference_tuple )
.multiMap { meta_bam, bam_file, meta_ref, ref ->
bam : tuple(meta_bam, bam_file )
reference : ref
}
.set { markdup_input }

//
// MODULE: MERGE POSITION SORTED BAM FILES AND MARK DUPLICATES
//
SAMTOOLS_MARKDUP (
markdup_input.bam,
markdup_input.reference
)
ch_versions = ch_versions.mix ( SAMTOOLS_MARKDUP.out.versions )

//
// MODULE: SAMTOOLS FILTER OUT DUPLICATE READS | BAMTOBED | SORT BED FILE
//
BAMTOBED_SORT(
SAMTOOLS_MARKDUP.out.bam
)
ch_versions = ch_versions.mix( BAMTOBED_SORT.out.versions )

//
// MODULE: GENERATE CONTACT PAIRS
//
GET_PAIRED_CONTACT_BED(
BAMTOBED_SORT.out.sorted_bed
)
ch_versions = ch_versions.mix( GET_PAIRED_CONTACT_BED.out.versions )

emit:
paired_contacts_bed = GET_PAIRED_CONTACT_BED.out.bed
sorted_bed = BAMTOBED_SORT.out.sorted_bed
versions = ch_versions.ifEmpty(null)
}
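
A minimal invocation sketch for this subworkflow, matching the take: block above (the meta map, file names, and include path are placeholders, not part of this PR):

include { HIC_BAMTOBED } from './subworkflows/local/hic_bamtobed'

workflow {
    // Value channels carrying [ meta, file ] tuples, as the take: block expects
    ch_bam = Channel.value( [ [ id: 'sample1' ], file('sample1_merged.bam') ] )
    ch_ref = Channel.value( [ [ id: 'sample1' ], file('sample1_reference.fasta') ] )

    HIC_BAMTOBED ( ch_bam, ch_ref )

    // Inspect the emitted paired-contact bed and sorted bed
    HIC_BAMTOBED.out.paired_contacts_bed.view()
    HIC_BAMTOBED.out.sorted_bed.view()
}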
63 changes: 44 additions & 19 deletions subworkflows/local/hic_mapping.nf
@@ -15,15 +15,15 @@ include { PRETEXTMAP as PRETEXTMAP_STANDRD } from '../../modules/nf-cor
include { PRETEXTMAP as PRETEXTMAP_HIGHRES } from '../../modules/nf-core/pretextmap/main'
include { PRETEXTSNAPSHOT as SNAPSHOT_SRES } from '../../modules/nf-core/pretextsnapshot/main'
include { PRETEXTSNAPSHOT as SNAPSHOT_HRES } from '../../modules/nf-core/pretextsnapshot/main'
include { SAMTOOLS_MARKDUP } from '../../modules/nf-core/samtools/markdup/main'
include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main'
include { BAMTOBED_SORT } from '../../modules/local/bamtobed_sort.nf'
include { GENERATE_CRAM_CSV } from '../../modules/local/generate_cram_csv'
include { CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT } from '../../modules/local/cram_filter_align_bwamem2_fixmate_sort'
include { JUICER_TOOLS_PRE } from '../../modules/local/juicer_tools_pre'
include { GET_PAIRED_CONTACT_BED } from '../../modules/local/get_paired_contact_bed'
include { SUBSAMPLE_BAM } from '../../modules/local/subsample_bam.nf'
include { PRETEXT_INGESTION as PRETEXT_INGEST_SNDRD } from '../../subworkflows/local/pretext_ingestion'
include { PRETEXT_INGESTION as PRETEXT_INGEST_HIRES } from '../../subworkflows/local/pretext_ingestion'
include { HIC_BAMTOBED as HIC_BAMTOBED_COOLER } from '../../subworkflows/local/hic_bamtobed'
include { HIC_BAMTOBED as HIC_BAMTOBED_JUICER } from '../../subworkflows/local/hic_bamtobed'


workflow HIC_MAPPING {
@@ -210,32 +210,37 @@ workflow HIC_MAPPING {
// ch_versions = ch_versions.mix ( SNAPSHOT_HRES.out.versions )

//
// MODULE: MERGE POSITION SORTED BAM FILES AND MARK DUPLICATES
// MODULE: SUBSAMPLE BAM IF OVER 50G
Review comment (@DLBPointon, Contributor, Jan 11, 2024): Would it be better to check the file size and use an if block (when the size is over 50G) prior to the module? This would then stop a container being spun up just to mv a file (in the case the file is < 50G). Then the module can focus on subsampling.

Something akin to:

file = input.map{ it -> it[1] }
if ( file.size() > 50G ) {
    SUB_SAMPLE(input)
    fixed_ch = SUB_SAMPLE.out.subsampled
} else {
    fixed_ch = input
}
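
A hedged sketch of that suggestion using a channel branch rather than an if block, so only oversized BAMs reach the subsample container (the 50 GB threshold mirrors the module's hard-coded value; the channel names here are illustrative, not part of this PR):

SAMTOOLS_MERGE.out.bam
    .branch { meta, bam ->
        large : bam.size() > 50_000_000_000   // Path.size() reports bytes
        small : true
    }
    .set { ch_merged_by_size }

// Subsample only the BAMs above the threshold
SUBSAMPLE_BAM ( ch_merged_by_size.large )

// Recombine so downstream steps receive a single channel either way
ch_subsampled_or_not = SUBSAMPLE_BAM.out.subsampled_bam
    .mix ( ch_merged_by_size.small )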

//
SAMTOOLS_MARKDUP (
pretext_input.input_bam,
pretext_input.reference
SUBSAMPLE_BAM (
SAMTOOLS_MERGE.out.bam
)
ch_versions = ch_versions.mix ( SAMTOOLS_MARKDUP.out.versions )
ch_versions = ch_versions.mix ( SUBSAMPLE_BAM.out.versions )

//
// MODULE: SAMTOOLS FILTER OUT DUPLICATE READS | BAMTOBED | SORT BED FILE
// LOGIC: PREPARE BAMTOBED JUICER INPUT
//
BAMTOBED_SORT(
SAMTOOLS_MARKDUP.out.bam
)
ch_versions = ch_versions.mix( BAMTOBED_SORT.out.versions )
SUBSAMPLE_BAM.out.subsampled_bam
.combine( reference_tuple )
.multiMap { meta, subsampled_bam, meta_ref, ref ->
bam : tuple(meta, subsampled_bam )
reference : tuple(meta_ref, ref)
}
.set { ch_bamtobed_juicer_input }

//
// MODULE: GENERATE CONTACT PAIRS
// SUBWORKFLOW: BAM TO BED FOR JUICER - USES THE SUBSAMPLED MERGED BAM
//
GET_PAIRED_CONTACT_BED( BAMTOBED_SORT.out.sorted_bed )
ch_versions = ch_versions.mix( GET_PAIRED_CONTACT_BED.out.versions )
HIC_BAMTOBED_JUICER(
ch_bamtobed_juicer_input.bam,
ch_bamtobed_juicer_input.reference
)
ch_versions = ch_versions.mix( HIC_BAMTOBED_JUICER.out.versions )

//
// LOGIC: PREPARE JUICER TOOLS INPUT
//
GET_PAIRED_CONTACT_BED.out.bed
HIC_BAMTOBED_JUICER.out.paired_contacts_bed
.combine( dot_genome )
.multiMap { meta, paired_contacts, meta_my_genome, my_genome ->
paired : tuple([ id: meta.id, single_end: true], paired_contacts )
@@ -254,11 +254,31 @@
)
ch_versions = ch_versions.mix( JUICER_TOOLS_PRE.out.versions )


//
// LOGIC: PREPARE BAMTOBED COOLER INPUT
//
SAMTOOLS_MERGE.out.bam
.combine( reference_tuple )
.multiMap { meta, merged_bam, meta_ref, ref ->
bam : tuple(meta, merged_bam )
reference : tuple(meta_ref, ref)
}
.set { ch_bamtobed_cooler_input }

//
// SUBWORKFLOW: BAM TO BED FOR COOLER
//
HIC_BAMTOBED_COOLER(
ch_bamtobed_cooler_input.bam,
ch_bamtobed_cooler_input.reference
)
ch_versions = ch_versions.mix( HIC_BAMTOBED_COOLER.out.versions )

//
// LOGIC: BIN CONTACT PAIRS
//
GET_PAIRED_CONTACT_BED.out.bed
.join( BAMTOBED_SORT.out.sorted_bed )
HIC_BAMTOBED_COOLER.out.paired_contacts_bed
.join( HIC_BAMTOBED_COOLER.out.sorted_bed )
.combine( ch_cool_bin )
.set { ch_binned_pairs }
