// nextflow.config

// Project metadata (Nextflow `manifest` scope): version, entry script,
// short description and project home page.
manifest {
    version     = "0.2.0"
    mainScript  = "align_genomes.nf"
    description = "Build a map of genes present across a set of microbial genomes"
    homePage    = "https://github.com/FredHutch/gig-map"
}

// Execution profiles. Each profile assigns CPU and memory requests to the
// process labels 'io_limited', 'mem_medium' and 'mem_veryhigh'.
profiles {

    // standard: failed tasks are retried (maxRetries = 3). Resource requests
    // are closures multiplied by task.attempt, so each retry asks for more.
    standard {
        process {
            maxRetries = 3
            errorStrategy = "retry"

            withLabel: 'io_limited' {
                memory = { 6.GB * task.attempt }
                cpus = { task.attempt }
            }
            withLabel: 'mem_medium' {
                memory = { 30.GB * task.attempt }
                cpus = { task.attempt * 4 }
            }
            withLabel: 'mem_veryhigh' {
                memory = { 120.GB * task.attempt }
                cpus = { task.attempt * 16 }
            }
        }
    }

    // smallmem: failed tasks are retried (maxRetries = 5) with fixed resource
    // requests; no label asks for more than 30 GB of memory.
    smallmem {
        process {
            maxRetries = 5
            errorStrategy = "retry"

            withLabel: 'io_limited' {
                memory = 1.GB
                cpus = 1
            }
            withLabel: 'mem_medium' {
                memory = 30.GB
                cpus = 4
            }
            withLabel: 'mem_veryhigh' {
                memory = 30.GB
                cpus = 8
            }
        }
    }

    // testing: local executor, small fixed resource requests, and the
    // 'terminate' error strategy instead of retrying.
    testing {
        process {
            errorStrategy = "terminate"
            executor = "local"

            withLabel: 'io_limited' {
                memory = 2.GB
                cpus = 1
            }
            withLabel: 'mem_medium' {
                memory = 4.GB
                cpus = 2
            }
            withLabel: 'mem_veryhigh' {
                memory = 4.GB
                cpus = 2
            }
        }
    }
}

// Set default parameters
//
// Every key is declared exactly once in this scope. Earlier revisions of this
// block assigned `genes`, `genome_aln` and `sketch_size` twice; the effective
// values have been kept and a comment marks each section that shares a key
// declared elsewhere.
params {

    // Global
    help = false
    output = false

    // Download Genomes / Genes
    genome_tsv = ""
    ncbi_datasets_type = "genome"
    genome_csv = ""
    parse_genome_csv_suffix = "_genomic.fna.gz"
    ftp_threads = 25
    // NOTE(review): this is the string "false", not the boolean false. A
    // non-empty string is truthy in Groovy -- confirm how consumers of this
    // parameter interpret it before changing the type.
    skip_missing_ftp = "false"

    // Deduplicate genes
    // (`genes` is also used by the "Align Genes and Genomes" section)
    genes = false
    cluster_similarity = 0.9
    cluster_coverage = 0.9
    min_gene_length = 50

    // Align Genes and Genomes
    // (`genes` is declared once, under "Deduplicate genes")
    genomes = false
    collect_results = true
    min_coverage = 90
    min_identity = 90
    aligner = 'diamond'
    query_gencode = 11
    max_evalue = 0.001
    max_overlap = 50
    aln_fmt = "qseqid sseqid pident length qstart qend qlen sstart send slen"

    // Collect
    // (`genome_aln` is also used by the "Render" section)
    genome_aln = false
    metagenome_aln = false
    gene_order = false
    marker_genes = false

    // ANI
    // (`sketch_size` is declared once, under "Sketch genomes and search".)
    // NOTE(review): this section previously set sketch_size = 10000, but the
    // later assignment of 1000 in the same scope replaced it, so 1000 was the
    // effective default. Confirm which value is intended for ANI.
    ani_thresholds = "99,95,90,80,70,60,50"

    // Align Marker Genes
    min_marker_coverage = 90
    pick_marker_genes = 10
    raxml_model = "LG+G8+F"
    raxml_starting_trees = 10
    raxml_bs_trees = 10

    // Align Metagenomes
    // Folder containing reads of interest
    reads = false
    // Optional file containing a list of files to process (R1,R2)
    samplesheet = false
    // Suffix on all reads
    reads_suffix = "fastq.gz"
    // Pattern used to join paired reads
    read_pairing_pattern = "{1,2}"
    // Boolean flag used for paired-end input
    paired = false
    // Minimum alignment score
    min_score_reads = 50
    // Minimum proportion of reads which must align
    min_coverage_reads = 50
    // Minimum percent identity for aligning reads
    min_pctiden_reads = 80
    // Resolve reads aligning to any gene with a score
    // that is no less than this percentage lower than
    // the top hit
    top_pct_reads = 5
    // Filter out any samples which have fewer than this number
    // of raw alignments (prior to FAMLI filtering)
    min_alignments = 100
    // Number of reads to resolve per batch with FAMLI
    famli_batchsize = 10000000
    // Filter out any genes for which the standard deviation
    // of sequencing depth across the length of the gene
    // exceeds this value
    famli_sd_mean_cutoff = 3.0

    // Mapping gene catalogs
    map_batchsize = 100000

    // Render
    // (`genome_aln` is declared once, under "Collect")
    genome_distmat = false
    // Paths below are resolved relative to the pipeline's project directory
    render_options = "$projectDir/templates/render_params.json"
    render_mem_gbs = 4

    // Sketch genomes and search
    gencode = 11
    minsize = 100
    // Single declaration of sketch_size (see the note in the "ANI" section)
    sketch_size = 1000
    batchsize = 1000
    save_sketches = true
    kmer_size = 9
    search_results = "search_results"

    // Bin genes
    gene_annot = "$projectDir/templates/centroids.annot.csv.gz"
    genome_annot = "$projectDir/templates/genome_annotations.csv"
    min_genomes_per_gene = 1
    max_dist_genes = 0.05
    max_dist_genomes = 0.05
    min_bin_size = 5

    // Bin metagenomes
    read_alignments = false
    gene_bins = false
    genome_groups = false
    metadata = "$projectDir/templates/metadata.csv"
    formula = false
    incl_unaligned = false
    min_n_reads = 0
    min_n_genes = 0

    // Docker containers reused across processes
    container__pandas = "quay.io/fhcrc-microbiome/python-pandas:4fb7844"
    container__mashtree = "quay.io/hdc-workflows/mashtree:1.2.0"
    container__blast = "quay.io/biocontainers/blast:2.11.0--pl526he19e7b1_0"
    container__diamond = "quay.io/biocontainers/diamond:2.0.11--hdcc8f71_0"
    container__famli = "quay.io/fhcrc-microbiome/famli:v1.5"
    container__cdhit = "quay.io/biocontainers/cd-hit:4.8.1--h2e03b76_5"
    container__clustal = "quay.io/hdc-workflows/clustalo:main"
    container__emboss = "quay.io/biocontainers/emboss:6.6.0--hf657eab_5"
    container__raxml = "quay.io/biocontainers/raxml-ng:1.0.3--h32fcf60_0"
    container__gigmap = "quay.io/hdc-workflows/gig-map:c08eac9"
    container__mash = "staphb/mashtree:0.52.0"
    container__corncob = "quay.io/fhcrc-microbiome/corncob:84c8354"
    container__datasets = "biocontainers/ncbi-datasets-cli:15.12.0_cv23.1.0-4"

}