- version 0.10.0
- updated to nevermore 0.14.3
- updated to metaphlow 0.11.3
- updated samestr container to ghcr.io/danielpodlesny/samestr:v1.2024.09

- split samestr workflow into convert/post-convert
- reads are processed as phred33 during workflow execution
- increased robustness against garbage data
- added repair.sh step to preprocessing to catch corrupted paired-end samples
- fastq input is now parallelised
cschu committed Sep 24, 2024
1 parent ec4d56a commit 3ba2317
Showing 15 changed files with 169 additions and 98 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
metaphlow_old/
nevermore_old/
2 changes: 1 addition & 1 deletion main.nf
@@ -11,7 +11,7 @@ include { samestr } from "./metaphlow/workflows/samestr"
workflow {

fastq_input(
Channel.fromPath(params.input_dir + "/*", type: "dir"),
Channel.fromPath(input_dir + "/**[._]{fastq.gz,fq.gz,fastq.bz2,fq.bz2}"),
Channel.of(null)
)

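The input channel change above is what parallelises fastq input: instead of one item per sample directory, the channel now emits one item per compressed fastq file, matched recursively, so downstream processing starts as soon as each file is discovered. A minimal sketch of the globbing behaviour, following the diff (printed paths are hypothetical):

    // one channel item per compressed fastq file; [._] matches the "." or "_"
    // immediately before the extension, e.g. sampleA_R1.fastq.gz
    Channel
        .fromPath(input_dir + "/**[._]{fastq.gz,fq.gz,fastq.bz2,fq.bz2}")
        .view { f -> "found: ${f}" }  // e.g. found: /data/runA/sampleA_R1.fastq.gz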
17 changes: 6 additions & 11 deletions metaphlow/modules/profilers/humann3.nf
@@ -1,8 +1,7 @@
process reduce_metaphlan_profiles {
// container "quay.io/biocontainers/humann:3.7--pyh7cba7a3_1"
// container "quay.io/biocontainers/humann:3.8--pyh7cba7a3_0"
container = "registry.git.embl.de/schudoma/humann3-docker:latest"
label "default"
label "humann3"
label "medium"

input:
path(mp_collated_profiles)
@@ -27,10 +26,8 @@ process reduce_metaphlan_profiles


process generate_humann_joint_index {
// container "quay.io/biocontainers/humann:3.7--pyh7cba7a3_1"
// container "quay.io/biocontainers/humann:3.8--pyh7cba7a3_0"
container = "registry.git.embl.de/schudoma/humann3-docker:latest"

label "humann3"
label "process_high"

input:
@@ -53,11 +50,9 @@ process generate_humann_joint_index


process run_humann3 {
// container "quay.io/biocontainers/humann:3.7--pyh7cba7a3_1"
// container "quay.io/biocontainers/humann:3.8--pyh7cba7a3_0"
publishDir params.output_dir, mode: "copy"
container = "registry.git.embl.de/schudoma/humann3-docker:latest"

label "humann3"
label "process_high"

input:
@@ -105,9 +100,9 @@


process reformat_genefamily_table {
// container "quay.io/biocontainers/humann:3.7--pyh7cba7a3_1"
// container "quay.io/biocontainers/humann:3.8--pyh7cba7a3_0"
publishDir params.output_dir, mode: "copy"
label "humann3"
label "process_single"
container = "registry.git.embl.de/schudoma/humann3-docker:latest"

input:
28 changes: 17 additions & 11 deletions metaphlow/modules/profilers/samestr.nf
@@ -1,7 +1,8 @@
process run_samestr_convert {
container "registry.git.embl.de/schudoma/samestr-docker:latest"
container "ghcr.io/danielpodlesny/samestr:v1.2024.09"
tag "${sample.id}"
label "highmem_large"
label "large"
label "samestr"


input:
@@ -32,9 +33,10 @@ process run_samestr_convert

process run_samestr_merge {
publishDir params.output_dir, mode: "copy"
container "registry.git.embl.de/schudoma/samestr-docker:latest"
container "ghcr.io/danielpodlesny/samestr:v1.2024.09"
tag "${species}"
label "highmem_large"
label "large"
label "samestr"

input:
tuple val(species), path(sstr_npy)
@@ -60,9 +62,10 @@ process run_samestr_merge
}

process run_samestr_filter {
container "registry.git.embl.de/schudoma/samestr-docker:latest"
container "ghcr.io/danielpodlesny/samestr:v1.2024.09"
tag "${species}"
label "highmem_large"
label "large"
label "samestr"

input:
tuple val(species), path(sstr_npy), path(sstr_names)
@@ -99,9 +102,10 @@

process run_samestr_stats {
publishDir params.output_dir, mode: "copy"
container "registry.git.embl.de/schudoma/samestr-docker:latest"
container "ghcr.io/danielpodlesny/samestr:v1.2024.09"
tag "${species}"
label "large"
label "samestr"

input:
tuple val(species), path(sstr_npy), path(sstr_names)
@@ -124,13 +128,14 @@

process run_samestr_compare {
publishDir params.output_dir, mode: "copy"
container "registry.git.embl.de/schudoma/samestr-docker:latest"
container "ghcr.io/danielpodlesny/samestr:v1.2024.09"
tag "${species}"
label "highmem_large"
label "large"
label "samestr"

input:
tuple val(species), path(sstr_npy), path(sstr_names)
path(marker_db)
path(marker_db)

output:
tuple \
@@ -153,8 +158,9 @@ process run_samestr_compare

process run_samestr_summarize {
publishDir params.output_dir, mode: "copy"
container "registry.git.embl.de/schudoma/samestr-docker:latest"
container "ghcr.io/danielpodlesny/samestr:v1.2024.09"
label "large"
label "samestr"

input:
path(sstr_data)
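Throughout these samestr processes, the container is pinned to ghcr.io/danielpodlesny/samestr:v1.2024.09 and the single highmem_large label is replaced by the pair large + samestr, separating generic sizing from tool-specific settings. How those labels are bound is up to the pipeline config; a hypothetical nextflow.config fragment (values illustrative, not part of this commit) could look like:

    process {
        withLabel: large {
            cpus   = 16
            memory = 64.GB
        }
        withLabel: samestr {
            time = 24.h  // tool-specific overrides attach here
        }
    }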
3 changes: 3 additions & 0 deletions metaphlow/version.json
@@ -0,0 +1,3 @@
{
"version": "0.11.3"
}
66 changes: 53 additions & 13 deletions metaphlow/workflows/samestr.nf
@@ -1,7 +1,43 @@
include { run_samestr_convert; run_samestr_merge; run_samestr_filter; run_samestr_stats; run_samestr_compare; run_samestr_summarize } from "../modules/profilers/samestr"

params.samestr_marker_db = "/scratch/schudoma/databases/samestr/mpa_vOct22_CHOCOPhlAnSGB_202212/marker_db/"

workflow samestr {

workflow samestr_post_convert {
take:
ss_converted
mp4_tables
main:
// grouped_npy_ch = run_samestr_convert.out.sstr_npy
// .join(run_samestr_convert.out.convert_sentinel, by: 0)
// .map { sample, data, sentinel -> return data }
// .flatten()
// .map { file ->
// def species = file.name.replaceAll(/[.].*/, "")
// return tuple(species, file)
// }
// .groupTuple(sort: true)

// run_samestr_merge(grouped_npy_ch, params.samestr_marker_db)
run_samestr_merge(ss_converted, params.samestr_marker_db)
run_samestr_filter(
run_samestr_merge.out.sstr_npy,
params.samestr_marker_db
)
run_samestr_stats(run_samestr_filter.out.sstr_npy, params.samestr_marker_db)
run_samestr_compare(run_samestr_filter.out.sstr_npy, params.samestr_marker_db)

run_samestr_summarize(
run_samestr_compare.out.sstr_compare.collect(),
mp4_tables.map { sample, table -> return table }.collect(),
params.samestr_marker_db
)

}



workflow samestr_full {

take:
mp4_sam
@@ -22,19 +58,23 @@ workflow samestr
return tuple(species, file)
}
.groupTuple(sort: true)

if (!params.stop_after_convert) {
samestr_post_convert(grouped_npy_ch, mp4_tables)
}

run_samestr_merge(grouped_npy_ch, params.samestr_marker_db)
run_samestr_filter(
run_samestr_merge.out.sstr_npy,
params.samestr_marker_db
)
run_samestr_stats(run_samestr_filter.out.sstr_npy, params.samestr_marker_db)
run_samestr_compare(run_samestr_filter.out.sstr_npy, params.samestr_marker_db)
// run_samestr_merge(grouped_npy_ch, params.samestr_marker_db)
// run_samestr_filter(
// run_samestr_merge.out.sstr_npy,
// params.samestr_marker_db
// )
// run_samestr_stats(run_samestr_filter.out.sstr_npy, params.samestr_marker_db)
// run_samestr_compare(run_samestr_filter.out.sstr_npy, params.samestr_marker_db)

run_samestr_summarize(
run_samestr_compare.out.sstr_compare.collect(),
mp4_tables.map { sample, table -> return table }.collect(),
params.samestr_marker_db
)
// run_samestr_summarize(
// run_samestr_compare.out.sstr_compare.collect(),
// mp4_tables.map { sample, table -> return table }.collect(),
// params.samestr_marker_db
// )

}
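After the split, samestr_full keeps the per-sample SAM-to-npy conversion and species grouping, then hands off to samestr_post_convert (merge, filter, stats, compare, summarize) unless params.stop_after_convert is set. The post-convert stages can therefore be re-entered with previously converted data. A hypothetical entry point, not part of this commit (parameter names are illustrative):

    include { samestr_post_convert } from "./metaphlow/workflows/samestr"

    workflow resume_samestr {
        // regroup pre-converted npy files by species prefix, e.g. "s__SpeciesX.sampleA.npy"
        grouped_npy = Channel
            .fromPath(params.converted_npy_dir + "/*.npy")
            .map { file -> tuple(file.name.replaceAll(/[.].*/, ""), file) }
            .groupTuple(sort: true)
        // (sample, table) tuples, as samestr_post_convert expects for mp4_tables
        mp4_tables = Channel
            .fromPath(params.mp4_table_dir + "/*.txt")
            .map { table -> tuple(table.baseName, table) }
        samestr_post_convert(grouped_npy, mp4_tables)
    }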
3 changes: 2 additions & 1 deletion nevermore/modules/converters/merge_fastqs.nf
@@ -20,12 +20,13 @@ process merge_single_fastqs
} else {
fastq_in = "${fastqs[0]}"
}
def maxmem = task.memory.toGiga()

"""
set -e -o pipefail
mkdir -p merged/
${prefix} sortbyname.sh in=${fastq_in} out=merged/${sample.id}_R1.fastq.gz ${suffix}
${prefix} sortbyname.sh -Xmx${maxmem}g qin=33 in=${fastq_in} out=merged/${sample.id}_R1.fastq.gz ${suffix}
"""
// https://stackoverflow.com/questions/22464786/ignoring-bash-pipefail-for-error-code-141/72985727#72985727
}
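Two changes to the sortbyname.sh call: -Xmx${maxmem}g caps the JVM heap at the memory allocated to the task (via task.memory.toGiga()), and qin=33 forces the input quality encoding to phred33 instead of auto-detection, in line with the phred33 handling announced in the commit message. For a task granted 8 GB and a sample named sampleA (both hypothetical), the rendered command would look roughly like:

    sortbyname.sh -Xmx8g qin=33 in=sampleA_R1.fastq.gz out=merged/sampleA_R1.fastq.gz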
12 changes: 6 additions & 6 deletions nevermore/modules/profilers/gffquant.nf
@@ -93,11 +93,11 @@ process run_gffquant
path(gq_db)

output:
tuple val(sample), path("profiles/${sample}/*.txt.gz"), emit: results
tuple val(sample), path("logs/${sample}.log")
tuple val(sample), path("profiles/${sample.id}/*.txt.gz"), emit: results
tuple val(sample), path("logs/${sample.id}.log")

script:
def gq_output = "-o profiles/${sample}/${sample}"
def gq_output = "-o profiles/${sample.id}/${sample.id}"

def gq_params = "-m ${params.gq_mode} --ambig_mode ${params.gq_ambig_mode}"
gq_params += (params.gq_strand_specific) ? " --strand_specific" : ""
@@ -126,8 +126,8 @@ process run_gffquant

mk_aln_sam += "echo 'Making alignment stream...'\n"
if (alignments instanceof Collection && alignments.size() >= 2) {
mk_aln_sam += "cat ${sample}.sam > tmp/alignments.sam \n"
mk_aln_sam += "grep -v '^@' ${sample}.singles.sam >> tmp/alignments.sam"
mk_aln_sam += "cat ${sample.id}.sam > tmp/alignments.sam \n"
mk_aln_sam += "grep -v '^@' ${sample.id}.singles.sam >> tmp/alignments.sam"
} else {
mk_aln_sam += "ln -s ${alignments[0]} tmp/alignments.sam"
}
@@ -145,7 +145,7 @@
echo 'Copying database...'
cp -v ${gq_db} gq_db.sqlite3
${mk_aln_sam}
${gq_cmd} &> logs/${sample}.log
${gq_cmd} &> logs/${sample.id}.log
rm -rfv gq_db.sqlite3* tmp/
"""
}
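The ${sample} to ${sample.id} substitutions fix path construction: sample is a meta map here, so interpolating the whole object stringifies the map rather than the sample name. A minimal Groovy illustration (map fields are assumptions based on the surrounding code):

    def sample = [id: "sampleA", is_paired: true]
    println "profiles/${sample}/p.txt"     // profiles/[id:sampleA, is_paired:true]/p.txt
    println "profiles/${sample.id}/p.txt"  // profiles/sampleA/p.txt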
48 changes: 20 additions & 28 deletions nevermore/modules/qc/bbduk.nf
@@ -1,9 +1,11 @@
params.qc_params_shotgun = "qtrim=rl trimq=3 maq=25 ktrim=r k=23 mink=11 hdist=1 ftm=5 entropy=0.5 entropywindow=50 entropyk=5 tpe tbo"
params.qc_minlen = 45


process qc_bbduk {
container "quay.io/biocontainers/bbmap:39.06--h92535d8_0"
label 'bbduk'
label "medium"
tag "${sample.id}"


@@ -18,29 +20,26 @@ process qc_bbduk
tuple val(sample), path("qc_reads/${sample.id}/BBDUK_FINISHED"), emit: sentinel

script:
// def maxmem = task.memory.toGiga().intdiv(2)
def maxmem = task.memory.toGiga()
def compression = (reads[0].name.endsWith("gz")) ? "gz" : "bz2"

def read1 = ""
def read2 = ""
def orphans = ""
def orphan_check = ""

def bb_params = params.qc_params_shotgun //.replaceAll(/maq=([0-9]+)/, "")

def trim_params = "${bb_params} ref=${adapters} minlen=${params.qc_minlen}"

def orphan_filter = ""

def bb_params = params.qc_params_shotgun
def trim_params = "${bb_params} ref=${adapters} minlen=${params.qc_minlen} ibq qout=33 tossbrokenreads=t"

def r1_files = reads.findAll( { it.name.endsWith("_R1.fastq.${compression}") } )
def r2_files = reads.findAll( { it.name.endsWith("_R2.fastq.${compression}") } )

def qenc_str = (params.phred64 != null && params.phred64 != false) ? "qin=64" : ""
def qenc_str = (params.phred64 != null && params.phred64 != false) ? "qin=64" : "qin=33"


def read1 = ""
def orphans = ""
if (r1_files.size() != 0) {
read1 += "in1=${r1_files[0]} out1=qc_reads/${sample.id}/${sample.id}_R1.fastq.gz"
// read1 = "in1=${sample.id}_R1.fastq.${compression} out1=qc_reads/${sample.id}/${sample.id}_R1.fastq.gz"
if (r2_files.size() != 0) {
read2 += "in2=${r2_files[0]} out2=qc_reads/${sample.id}/${sample.id}_R2.fastq.gz outs=tmp_orphans.fq"
orphans += "qc_reads/${sample.id}/${sample.id}.orphans_R1.fastq.gz"
@@ -52,30 +51,23 @@
"""
}
}



// if (sample.is_paired) {
// def orphans = "qc_reads/${sample.id}/${sample.id}.orphans_R1.fastq.gz"
// // read2 = "in2=${sample.id}_R2.fastq.${compression} out2=qc_reads/${sample.id}/${sample.id}_R2.fastq.gz outs=tmp_orphans.fq"
// // orphan_filter = "bbduk.sh -Xmx${maxmem}g t=${task.cpus} ${trim_params} in=tmp_orphans.fq out=${orphans}"

// orphan_check = """
// if [[ -z "\$(gzip -dc ${orphans} | head -n 1)" ]]; then
// rm ${orphans}
// fi
// """
// }

// def read1 = "in1=${sample.id}_R1.fastq.${compression} out1=qc_reads/${sample.id}/${sample.id}_R1.fastq.gz"

def stats_out = "stats=stats/qc/bbduk/${sample.id}.bbduk_stats.txt"

"""
set -e -o pipefail
set -e
mkdir -p qc_reads/${sample.id}/ stats/qc/bbduk/
bbduk.sh -Xmx${maxmem}g t=${task.cpus} ${trim_params} ${qenc_str} ${stats_out} ${read1} ${read2}
bbduk.sh -Xmx${maxmem}g t=${task.cpus} ${trim_params} ${qenc_str} ${stats_out} ${read1} ${read2} 2>&1 | tee logfile
if [[ \$(grep -c 'There appear to be different numbers of reads in the paired input files.' logfile) -eq 1 ]]; then
repair.sh -Xmx${maxmem}g t=${task.cpus} in=${r1_files[0]} in2=${r2_files[0]} out=${sample.id}.repaired_R1.fastq.gz out2=${sample.id}.repaired_R2.fastq.gz outs=${sample.id}.repaired.orphans_R1.fastq ${qenc_str} qout=33 tossbrokenreads=t
bbduk.sh -Xmx${maxmem}g t=${task.cpus} ${trim_params} qin=33 ibq ${stats_out} overwrite=t \
in=${sample.id}.repaired_R1.fastq.gz in2=${sample.id}.repaired_R2.fastq.gz \
out=qc_reads/${sample.id}/${sample.id}_R1.fastq.gz out2=qc_reads/${sample.id}/${sample.id}_R2.fastq.gz outs=tmp_orphans.fq
cat ${sample.id}.repaired.orphans_R1.fastq >> tmp_orphans.fq
fi
${orphan_filter}
${orphan_check}
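The repair fallback works by teeing bbduk's console output to a logfile and grepping it for the paired-input mismatch message; only then are the raw reads re-paired with repair.sh and trimmed again. Dropping -o pipefail from the set line is what lets the script survive the first failing bbduk run long enough to reach that branch (with set -e alone, the pipeline's status is tee's). A stripped-down sketch of the detect-and-retry pattern (file names shortened for illustration):

    bbduk.sh in=R1.fq.gz in2=R2.fq.gz out=qc_R1.fq.gz out2=qc_R2.fq.gz 2>&1 | tee logfile
    if [[ $(grep -c 'There appear to be different numbers of reads in the paired input files.' logfile) -eq 1 ]]; then
        repair.sh in=R1.fq.gz in2=R2.fq.gz out=rep_R1.fq.gz out2=rep_R2.fq.gz outs=orphans.fq
        bbduk.sh in=rep_R1.fq.gz in2=rep_R2.fq.gz out=qc_R1.fq.gz out2=qc_R2.fq.gz overwrite=t
    fi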
2 changes: 1 addition & 1 deletion nevermore/version.json
@@ -1,3 +1,3 @@
{
"version": "0.13.0"
"version": "0.14.3"
}