-
Notifications
You must be signed in to change notification settings - Fork 4
/
align_pangenome.nf
112 lines (88 loc) · 4.64 KB
/
align_pangenome.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env nextflow
// Using DSL-2
nextflow.enable.dsl=2
// Import helpers
GroovyShell shell = new GroovyShell()
def helpers = shell.parse(new File("${workflow.projectDir}/helpers.gvy"))
// Import sub-workflows
include { bin_metagenomes } from './modules/bin_metagenomes'
include { align_reads } from './modules/align_reads'
include { find_reads } from './modules/find_reads'
// Standalone entrypoint
workflow {
// Show help message if the user specifies the --help flag at runtime
helpers.help_message(
"""
Align a set of metagenomic sequencing data to a binned pangenome in
which the genes have been grouped by similar occurrence across genomes
(using the build_pangenome.nf workflow).
Those binned gene groups will be used to summarize a batch of metagenomic
data, and the similarity of gene abundance across metagenomes will be
tested for concordence with the similarity across genomes.
By applying the results of gene binning to the metagenome alignment
results using the same gene catalog, the diversity of a collection of
organisms can be collapsed into units of those co-occurrent genetic
elements.
Parameters:
--genes Path of single deduplicated amino acid FASTA file to be used for alignment
--reads Folder containing the set of FASTQ files to align against those genes
--reads_suffix File ending for all reads (default: ${params.reads_suffix})
--paired Set to true if input folder contains paired-end FASTQ files (default: false)
--samplesheet Optional file listing the FASTQ inputs to process (headers: sample,R1,R2)
--min_score_reads Minimum alignment score (default: ${params.min_score_reads})
--min_identity Minimum percent identity of the amino acid alignment required to retain the alignment
(default: ${params.min_identity}, ranges 0-100)
--max_evalue Maximum E-value threshold used to filter all alignments
(default: ${params.max_evalue})
--aligner Algorithm used for alignment (default: ${params.aligner}, options: diamond, blast)
--query_gencode Genetic code used for conceptual translation of genome sequences
(default: ${params.query_gencode})
--max_overlap Any alignment which overlaps a higher-scoring alignment by more than this
amount will be filtered out (default: ${params.max_overlap}, range: 0-100)
--aln_fmt Column headings used for alignment outputs (see DIAMOND documentation for details)
(default: ${params.aln_fmt})
--gene_bins Grouping of genes into bins (e.g. gene_bins.csv)
--group_profile Gene content of genome groups (e.g. group_profile.csv)
--genome_groups Grouping of genomes by gene content (e.g. genome_groups.csv)
--output Folder where output files will be written
--metadata Optional: Metadata table used to compare samples (CSV)
--formula Optional: Column(s) from metadata table used for comparison
--min_n_reads Exclude any samples with fewer than this number of reads aligned (default: 0)
--min_n_genes Exclude any samples with fewer than this number of genes detected (default: 0)
""",
params.help
)
// Make sure that the required parameters were provided
helpers.require_param(params.genes, "genes")
helpers.require_param(params.gene_bins, "gene_bins")
helpers.require_param(params.group_profile, "group_profile")
helpers.require_param(params.output, "output")
// If there is no samplesheet
if ( "${params.samplesheet}" == "false" ){
// Requre the reads parameter
helpers.require_param(params.reads, "reads")
}
// Get the reads either from samplesheet or the input folder
find_reads()
// Align those reads against the centroids
align_reads(
file("${params.genes}"),
find_reads.out
)
// Get the gene bins
gene_bins = file(params.gene_bins, checkIfExists: true)
// Get the genome groups
genome_groups = file(params.genome_groups, checkIfExists: true)
// Get the genome groups
group_profile = file(params.group_profile, checkIfExists: true)
// Get the metadata table
metadata = file(params.metadata, checkIfExists: true)
// Bin the metagenomes
bin_metagenomes(
align_reads.out.csv,
gene_bins,
genome_groups,
group_profile,
metadata
)
}