From 6a9908b7eee8cb5b3ac9130917b3e1d0865363fa Mon Sep 17 00:00:00 2001 From: msauria Date: Mon, 10 Jun 2024 16:39:35 -0400 Subject: [PATCH] Cleaned up workflow bits --- bin/pipeline.plotting.mod.R | 5 +- conf/rockfish.config | 1 + env/conda.yml | 3 +- input_data/all_species/rename_chromosomes | 14 +- main.nf | 32 ++-- modules/mapping.nf | 160 +++++++---------- modules/post-mapping.nf | 206 +++++++++++++++------- modules/setup.nf | 5 +- modules/simulations.nf | 41 +++-- nextflow.config | 11 +- 10 files changed, 267 insertions(+), 211 deletions(-) diff --git a/bin/pipeline.plotting.mod.R b/bin/pipeline.plotting.mod.R index c5e121a..af7b83e 100755 --- a/bin/pipeline.plotting.mod.R +++ b/bin/pipeline.plotting.mod.R @@ -93,7 +93,7 @@ for.plot <- combined.mappings %>% if(!("MtDNA" %in% mito_check$CHROM)) dplyr::filter(., CHROM != "MtDNA") else . } -BF <- combined.mappings %>% +BF <- for.plot %>% dplyr::group_by(trait) %>% dplyr::filter(log10p != 0) %>% dplyr::distinct(marker, log10p) %>% @@ -160,7 +160,8 @@ man.plot <- ggplot() + facet_grid(. ~ CHROM, scales = "free_x", space = facet_scales) + ggtitle(BF.frame$trait) -ggsave(man.plot, filename = paste0(BF.frame$trait,"_manhattan_", args[3], ".plot.png"), width = 8, height = 4) + + ggsave(man.plot, filename = paste0(BF.frame$trait,"_manhattan_", args[3], ".plot.png"), width = 8, height = 4) ## SWEPTNESS & EFFECTS SUMMARY ## diff --git a/conf/rockfish.config b/conf/rockfish.config index f3e336b..dc030b5 100644 --- a/conf/rockfish.config +++ b/conf/rockfish.config @@ -70,4 +70,5 @@ singularity { autoMounts = true cacheDir = "${params.baseDir}/singularity" pullTimeout = '20 min' + envWhitelist = "SLURMD_HOSTNAME,SLURM_JOB_NODELIST,SLURM_NODELIST,SLURM_STEP_NODELIST,SLURM_TOPOLOGY_ADDR" } diff --git a/env/conda.yml b/env/conda.yml index 05565ac..95f32bf 100644 --- a/env/conda.yml +++ b/env/conda.yml @@ -1,4 +1,4 @@ -name: nemascan +name: base channels: - defaults - bioconda @@ -6,7 +6,6 @@ channels: - biobuilds - conda-forge dependencies: - - bcftools=1.9 - pandoc=2.12 - plink=1.90b6.12 - bedtools=2.29.2 diff --git a/input_data/all_species/rename_chromosomes b/input_data/all_species/rename_chromosomes index 80434f0..88faab9 100644 --- a/input_data/all_species/rename_chromosomes +++ b/input_data/all_species/rename_chromosomes @@ -1,7 +1,7 @@ -I 1 -II 2 -III 3 -IV 4 -V 5 -X 6 -MtDNA 7 \ No newline at end of file +I 1 +II 2 +III 3 +IV 4 +V 5 +X 6 +MtDNA 7 \ No newline at end of file diff --git a/main.nf b/main.nf index a151bc0..505c138 100644 --- a/main.nf +++ b/main.nf @@ -175,13 +175,13 @@ if (params.matrix || params.mapping){ if (params.help) { log.info ''' -O~~~ O~~ O~~ ~~ +O~~~ O~~ O~ O~~ O~ O~~ O~~ O~~ O~~ -O~~ O~~ O~~ O~~ O~~~ O~~ O~~ O~~ O~~ O~~~ O~~ O~~ O~~ -O~~ O~~ O~~ O~ O~~ O~~ O~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ -O~~ O~ O~~ O~~~~~ O~~ O~~ O~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ -O~~ O~ ~~ O~ O~~ O~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ -O~~ O~~ O~~~~ O~~~ O~ O~~ O~~ O~~~ O~~ ~~ O~~~ O~~ O~~~ O~~~ O~~ +O~~ O~~ O~~ O~~ O~~~ O~~ O~~ O~~ O~~ O~~~ O~~ O~~ O~~ +O~~ O~~ O~~ O~ O~ O~~ O~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ +O~~ O~ O~~ O~~~~~ O~ O~~ O~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ +O~~ O~O~~ O~ O~~ O~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ +O~~ O~~ O~~~~ O~~~ O~ O~~ O~~ O~~~ O~ O~~ O~~~ O~~ O~~~ O~~~ O~~ ''' log.info "----------------------------------------------------------------" log.info " USAGE " @@ -246,13 +246,13 @@ O~~ O~~ O~~~~ O~~~ O~ O~~ O~~ O~~~ O~~ ~~ O~~~ O~~ O~~~ O exit 1 } else { log.info ''' -O~~~ O~~ O~~ ~~ +O~~~ O~~ O~ O~~ O~ O~~ O~~ O~~ O~~ -O~~ O~~ O~~ O~~ O~~~ O~~ O~~ O~~ O~~ O~~~ O~~ O~~ O~~ -O~~ O~~ O~~ O~ O~~ O~~ O~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ -O~~ O~ O~~ O~~~~~ O~~ O~~ O~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ -O~~ O~ ~~ O~ O~~ O~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ -O~~ O~~ O~~~~ O~~~ O~ O~~ O~~ O~~~ O~~ ~~ O~~~ O~~ O~~~ O~~~ O~~ +O~~ O~~ O~~ O~~ O~~~ O~~ O~~ O~~ O~~ O~~~ O~~ O~~ O~~ +O~~ O~~ O~~ O~ O~ O~~ O~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ +O~~ O~ O~~ O~~~~~ O~ O~~ O~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ +O~~ O~O~~ O~ O~~ O~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ O~~ +O~~ O~~ O~~~~ O~~~ O~ O~~ O~~ O~~~ O~ O~~ O~~~ O~~ O~~~ O~~~ O~~ ''' log.info "" log.info "Trait File = ${params.traitfile}" @@ -500,7 +500,7 @@ workflow { .join(gcta_fine_maps.out.finemap_html_inbred, remainder: true) // fine mapping data .join(gcta_fine_maps.out.finemap_html_loco, remainder: true) .join(prep_ld_files.out.finemap_LD_inbred, remainder: true) - .join(prep_ld_files.out.finemap_LD_loco, remainder: true) | html_report_main + .join(prep_ld_files.out.finemap_LD_loco, remainder: true).view { "$it" } | html_report_main } } else { // generate main html report @@ -548,14 +548,15 @@ workflow { if(simulation) { // for simulations - Channel.fromPath("${params.data_dir}/${params.simulate_strains}").collect { it.tokenize( ' ' ) } + Channel.fromPath("${params.data_dir}/${params.simulate_strains}") + .splitCsv(sep:" ") .map { SM, STRAINS -> [SM, STRAINS] } .combine(vcf_file.combine(vcf_index)) .combine(Channel.fromPath("${params.data_dir}/all_species/rename_chromosomes")) .combine(Channel.fromPath("${params.data_dir}/${params.simulate_maf}").splitCsv()) | prepare_simulation_files // eigen - contigs = Channel.from(["1", "2", "3", "4", "5", "6"]) + contigs = Channel.from([1, 2, 3, 4, 5, 6]) contigs.combine(prepare_simulation_files.out.sim_geno) .combine(Channel.fromPath("${params.bin_dir}/Get_GenoMatrix_Eigen.R")) | chrom_eigen_variants_sims @@ -645,7 +646,6 @@ workflow.onComplete { Mapping = ${params.mapping} Simulation = ${params.simulate} Simulate QTL effects = ${params.simulate_qtlloc} - Annotation = ${params.annotate} Result Directory = ${params.out} """ diff --git a/modules/mapping.nf b/modules/mapping.nf index 8e92821..34badca 100644 --- a/modules/mapping.nf +++ b/modules/mapping.nf @@ -58,28 +58,31 @@ process gcta_grm { label "lg" input: - tuple val(TRAIT), file(traits), file(bed), file(bim), file(fam), file(map), file(nosex), file(ped), file(log) + tuple val(TRAIT), file(traits), file(bed), file(bim), file(fam), file(map), file(nosex), \ + file(ped), file(log), val(algorithm) output: tuple val(TRAIT), file(traits), file(bed), file(bim), file(fam), file(map), file(nosex), \ - file(ped), file(log), file("${TRAIT}_gcta_grm.grm.bin"), file("${TRAIT}_gcta_grm.grm.id"), \ - file("${TRAIT}_gcta_grm.grm.N.bin"), file("${TRAIT}_gcta_grm_inbred.grm.bin"), file("${TRAIT}_gcta_grm_inbred.grm.id") + file(ped), file(log), file("${TRAIT}_gcta_grm_${algorithm}.grm.bin"), \ + file("${TRAIT}_gcta_grm_${algorithm}.grm.id"), \ + file("${TRAIT}_gcta_grm_${algorithm}.grm.N.bin"), val(algorithm) when: - params.maps + params.mapping """ + if [[ ${algorithm} == "inbred" ]]; + then + option="--make-grm-inbred" + else + options="--make-grm" + fi + gcta64 --bfile ${TRAIT} \\ --autosome \\ --maf ${params.maf} \\ - --make-grm \\ - --out ${TRAIT}_gcta_grm \\ - --thread-num 5 - gcta64 --bfile ${TRAIT} \\ - --autosome \\ - --maf ${params.maf} \\ - --make-grm-inbred \\ - --out ${TRAIT}_gcta_grm_inbred \\ + \${option} \\ + --out ${TRAIT}_gcta_grm_${algorithm} \\ --thread-num 5 """ } @@ -89,63 +92,44 @@ process gcta_lmm_exact_mapping { label "lg" - publishDir "${params.out}/INBRED/Mapping/Raw", pattern: "*pca.fastGWA", overwrite: true - publishDir "${params.out}/LOCO/Mapping/Raw", pattern: "*pca.loco.mlma", overwrite: true + publishDir "${params.out}/INBRED/Mapping/Raw", pattern: "*inbred_pca.fastGWA", overwrite: true + publishDir "${params.out}/LOCO/Mapping/Raw", pattern: "*loco_pca.mlma", overwrite: true input: tuple val(TRAIT), file(traits), file(bed), file(bim), file(fam), file(map), \ - file(nosex), file(ped), file(log), file(grm_bin), file(grm_id), file(grm_nbin), \ - file(grm_bin_inbred), file(grm_id_inbred) + file(nosex), file(ped), file(log), file(grm_bin), file(grm_id), file(grm_nbin), + val(algorithm) output: - tuple val(TRAIT), file("${TRAIT}_lmm-exact_inbred_pca.fastGWA"), file("${TRAIT}_lmm-exact_pca.loco.mlma") + tuple val(TRAIT), file("${TRAIT}_lmm-exact_${algorithm}_pca.*", arity=1) """ - gcta64 --grm ${TRAIT}_gcta_grm \\ - --make-bK-sparse ${params.sparse_cut} \\ - --out ${TRAIT}_sparse_grm \\ - --thread-num ${task.cpus} - gcta64 --grm ${TRAIT}_gcta_grm \\ - --pca 1 \\ - --out ${TRAIT}_sparse_grm \\ - --thread-num ${task.cpus} - gcta64 --mlma-loco \\ - --grm ${TRAIT}_sparse_grm \\ - --bfile ${TRAIT} \\ - --out ${TRAIT}_lmm-exact \\ - --pheno ${traits} \\ - --maf ${params.maf} \\ - --thread-num ${task.cpus} - gcta64 --mlma-loco \\ - --grm ${TRAIT}_sparse_grm \\ - --bfile ${TRAIT} \\ - --qcovar ${TRAIT}_sparse_grm.eigenvec \\ - --out ${TRAIT}_lmm-exact_pca \\ - --pheno ${traits} \\ - --maf ${params.maf} \\ - --thread-num ${task.cpus} - - gcta64 --grm ${TRAIT}_gcta_grm_inbred \\ + if [[ ${algorithm} == "inbred" ]]; + then + options="--fastqGWA-lmm-exact --grm-sparse" + else + options="--mlma-loco --grm" + fi + + gcta64 --grm ${TRAIT}_gcta_grm_${algorithm} \\ --make-bK-sparse ${params.sparse_cut} \\ - --out ${TRAIT}_sparse_grm_inbred \\ + --out ${TRAIT}_sparse_grm_${algorithm} \\ --thread-num ${task.cpus} - gcta64 --grm ${TRAIT}_gcta_grm_inbred \\ + gcta64 --grm ${TRAIT}_gcta_grm_${algorithm} \\ --pca 1 \\ - --out ${TRAIT}_sparse_grm_inbred \\ + --out ${TRAIT}_sparse_grm_${algorithm} \\ --thread-num ${task.cpus} - gcta64 --fastGWA-lmm-exact \\ - --grm-sparse ${TRAIT}_sparse_grm \\ + gcta64 \${options} ${TRAIT}_sparse_grm \\ --bfile ${TRAIT} \\ - --out ${TRAIT}_lmm-exact_inbred \\ + --out ${TRAIT}_lmm-exact_${algorithm} \\ --pheno ${traits} \\ --maf ${params.maf} \\ --thread-num ${task.cpus} - gcta64 --fastGWA-lmm-exact \\ - --grm-sparse ${TRAIT}_sparse_grm \\ + gcta64 \${options} ${TRAIT}_sparse_grm \\ --bfile ${TRAIT} \\ - --qcovar ${TRAIT}_sparse_grm_inbred.eigenvec \\ - --out ${TRAIT}_lmm-exact_inbred_pca \\ + --qcovar ${TRAIT}_sparse_grm_${algorithm}.eigenvec \\ + --out ${TRAIT}_lmm-exact_${algorithm}_pca \\ --pheno ${traits} \\ --maf ${params.maf} \\ --thread-num ${task.cpus} @@ -163,57 +147,38 @@ process gcta_lmm_exact_mapping_nopca { input: tuple val(TRAIT), file(traits), file(bed), file(bim), file(fam), file(map), \ file(nosex), file(ped), file(log), file(grm_bin), file(grm_id), file(grm_nbin), \ - file(grm_bin_inbred), file(grm_id_inbred) + val(algorithm) output: - tuple val(TRAIT), file("${TRAIT}_lmm-exact_inbred.fastGWA"), file("${TRAIT}_lmm-exact.loco.mlma") + tuple val(TRAIT), file("${TRAIT}_lmm-exact_${algorithm}.*", arity=1) """ - gcta64 --grm ${TRAIT}_gcta_grm \\ + if [[ ${algorithm} == "inbred" ]]; + then + options="--fastqGWA-lmm-exact --grm-sparse" + else + options="--mlma-loco --grm" + fi + + gcta64 --grm ${TRAIT}_gcta_grm_${algorithm} \\ --make-bK-sparse ${params.sparse_cut} \\ - --out ${TRAIT}_sparse_grm \\ + --out ${TRAIT}_sparse_grm_${algorithm} \\ --thread-num ${task.cpus} - gcta64 --grm ${TRAIT}_gcta_grm \\ + gcta64 --grm ${TRAIT}_gcta_grm_${algorithm} \\ --pca 1 \\ - --out ${TRAIT}_sparse_grm \\ + --out ${TRAIT}_sparse_grm_${algorithm} \\ --thread-num ${task.cpus} - gcta64 --mlma-loco \\ - --grm ${TRAIT}_sparse_grm \\ + gcta64 \${options} ${TRAIT}_sparse_grm \\ --bfile ${TRAIT} \\ - --out ${TRAIT}_lmm-exact \\ + --out ${TRAIT}_lmm-exact_${algorithm} \\ --pheno ${traits} \\ --maf ${params.maf} \\ --thread-num ${task.cpus} - gcta64 --mlma-loco \\ - --grm ${TRAIT}_sparse_grm \\ + gcta64 \${options} ${TRAIT}_sparse_grm \\ --bfile ${TRAIT} \\ - --qcovar ${TRAIT}_sparse_grm.eigenvec \\ - --out ${TRAIT}_lmm-exact_pca \\ - --pheno ${traits} \\ - --maf ${params.maf} \\ - --thread-num ${task.cpus} - - gcta64 --grm ${TRAIT}_gcta_grm_inbred \\ - --make-bK-sparse ${params.sparse_cut} \\ - --out ${TRAIT}_sparse_grm_inbred \\ - --thread-num ${task.cpus} - gcta64 --grm ${TRAIT}_gcta_grm_inbred \\ - --pca 1 \\ - --out ${TRAIT}_sparse_grm_inbred \\ - --thread-num ${task.cpus} - gcta64 --fastGWA-lmm-exact \\ - --grm-sparse ${TRAIT}_sparse_grm \\ - --bfile ${TRAIT} \\ - --out ${TRAIT}_lmm-exact_inbred \\ - --pheno ${traits} \\ - --maf ${params.maf} \\ - --thread-num ${task.cpus} - gcta64 --fastGWA-lmm-exact \\ - --grm-sparse ${TRAIT}_sparse_grm \\ - --bfile ${TRAIT} \\ - --qcovar ${TRAIT}_sparse_grm_inbred.eigenvec \\ - --out ${TRAIT}_lmm-exact_inbred_pca \\ + --qcovar ${TRAIT}_sparse_grm_${algorithm}.eigenvec \\ + --out ${TRAIT}_lmm-exact_${algorithm}_pca \\ --pheno ${traits} \\ --maf ${params.maf} \\ --thread-num ${task.cpus} @@ -230,19 +195,18 @@ process gcta_intervals_maps { input: tuple val(TRAIT), file(pheno), file(tests), file(geno), val(P3D), val(sig_thresh), \ - val(qtl_grouping_size), val(qtl_ci_size), file(lmmexact_inbred), file(lmmexact_loco), file(find_aggregate_intervals_maps) + val(qtl_grouping_size), val(qtl_ci_size), file(lmmexact), \ + file(find_aggregate_intervals_maps), val(algorithm) output: - tuple file(geno), file(pheno), val(TRAIT), file(tests), file("*AGGREGATE_mapping_inbred.tsv"), file("*AGGREGATE_mapping_loco.tsv"), emit: maps_to_plot - path "*AGGREGATE_qtl_region_inbred.tsv", emit: qtl_peaks_inbred - tuple file("*AGGREGATE_mapping_inbred.tsv"), file("*AGGREGATE_mapping_loco.tsv"), val(TRAIT), emit: for_html - - path "*AGGREGATE_qtl_region_loco.tsv", emit: qtl_peaks_loco - // tuple val('loco'), file("*AGGREGATE_mapping_loco.tsv"), val(TRAIT), emit: for_html_loco + tuple file(geno), file(pheno), val(TRAIT), file(tests), file("*AGGREGATE_mapping_${algorithm}.tsv"), emit: maps_to_plot + path "*AGGREGATE_qtl_region${algorithm}.tsv", emit: qtl_peaks + tuple file("*AGGREGATE_mapping_${algorithm}.tsv"), val(TRAIT), val(algorithm), emit: for_html """ - Rscript --vanilla ${find_aggregate_intervals_maps} ${geno} ${pheno} ${lmmexact_inbred} ${tests} ${qtl_grouping_size} ${qtl_ci_size} ${sig_thresh} ${TRAIT}_AGGREGATE inbred - Rscript --vanilla ${find_aggregate_intervals_maps} ${geno} ${pheno} ${lmmexact_loco} ${tests} ${qtl_grouping_size} ${qtl_ci_size} ${sig_thresh} ${TRAIT}_AGGREGATE loco + Rscript --vanilla ${find_aggregate_intervals_maps} ${geno} ${pheno} ${lmmexact} \\ + ${tests} ${qtl_grouping_size} ${qtl_ci_size} ${sig_thresh} \\ + ${TRAIT}_AGGREGATE ${algorithm} """ } diff --git a/modules/post-mapping.nf b/modules/post-mapping.nf index 7169312..f65c90c 100644 --- a/modules/post-mapping.nf +++ b/modules/post-mapping.nf @@ -6,14 +6,13 @@ process summarize_mapping { publishDir "${params.out}/LOCO/Plots", mode: 'copy', pattern: "*_loco.png" input: - tuple file(qtl_peaks_inbred), file(qtl_peaks_loco), file(chr_lens), file(summarize_mapping_file) + tuple file(qtl_peaks), file(chr_lens), file(summarize_mapping_file), val(algorithm) output: file("Summarized_mappings*.pdf") """ - Rscript --vanilla ${summarize_mapping_file} ${qtl_peaks_inbred} ${chr_lens} inbred - Rscript --vanilla ${summarize_mapping_file} ${qtl_peaks_loco} ${chr_lens} loco + Rscript --vanilla ${summarize_mapping_file} ${qtl_peaks} ${chr_lens} ${algorithm} """ } @@ -31,16 +30,15 @@ process generate_plots { publishDir "${params.out}/LOCO/Plots/ManhattanPlots", mode: 'copy', pattern: "*_manhattan_loco.plot.png" input: - tuple file(geno), file(pheno), val(TRAIT), file(tests), file(aggregate_mapping_inbred), file(aggregate_mapping_loco), file(pipeline_plotting_mod) + tuple file(geno), file(pheno), val(TRAIT), val(algorithm), file(tests), \ + file(aggregate_mapping), file(pipeline_plotting_mod) output: - tuple file(geno), file(pheno), file(aggregate_mapping_inbred), val(TRAIT), emit: maps_from_plot_inbred - tuple file(geno), file(pheno), file(aggregate_mapping_loco), val(TRAIT), emit: maps_from_plot_loco + tuple file(geno), file(pheno), file(aggregate_mapping_${algorithm}), val(TRAIT), emit: maps_from_plot file("*.png") """ - Rscript --vanilla ${pipeline_plotting_mod} ${aggregate_mapping_inbred} ${tests} inbred - Rscript --vanilla ${pipeline_plotting_mod} ${aggregate_mapping_loco} ${tests} loco + Rscript --vanilla ${pipeline_plotting_mod} ${aggregate_mapping} ${tests} ${algorithm} """ } @@ -53,14 +51,14 @@ process LD_between_regions { publishDir "${params.out}/LOCO/Mapping/Processed", mode: 'copy', pattern: "*LD_between_QTL_regions_loco.tsv" input: - tuple file(geno), file(pheno), val(TRAIT), file(tests), file(aggregate_mapping_inbred), file(aggregate_mapping_loco), file(ld_between_regions) + tuple file(geno), file(pheno), val(TRAIT), file(tests), file(aggregate_mapping), \ + file(ld_between_regions), val(algorithm) output: - tuple val(TRAIT), path("*LD_between_QTL_regions_inbred.tsv"), path("*LD_between_QTL_regions_loco.tsv") optional true + tuple val(TRAIT), path("*LD_between_QTL_regions_${algorithm}.tsv") optional true """ - Rscript --vanilla ${ld_between_regions} ${geno} ${aggregate_mapping_inbred} ${TRAIT} inbred - Rscript --vanilla ${ld_between_regions} ${geno} ${aggregate_mapping_loco} ${TRAIT} loco + Rscript --vanilla ${ld_between_regions} ${geno} ${aggregate_mapping} ${TRAIT} ${algorithm} """ } @@ -92,7 +90,10 @@ process prep_ld_files { publishDir "${params.out}/LOCO/Fine_Mappings/Data", mode: 'copy', pattern: "*LD_loco.tsv" input: - tuple val(TRAIT), val(CHROM), val(marker), val(log10p), val(start_pos), val(peak_pos), val(end_pos), val(peak_id), val(h2), val(algorithm), file(geno), file(pheno), file(aggregate_mapping), file(imputed_vcf), file(imputed_index), file(phenotype), file(num_chroms) + tuple val(TRAIT), val(CHROM), val(marker), val(log10p), val(start_pos), val(peak_pos), \ + val(end_pos), val(peak_id), val(h2), val(algorithm), file(geno), file(pheno), \ + file(aggregate_mapping), file(imputed_vcf), file(imputed_index), file(phenotype), \ + file(num_chroms) output: tuple val(TRAIT), file(pheno), file("*ROI_Genotype_Matrix*.tsv"), file("*LD*.tsv"), file("*.bim"), file("*.bed"), file("*.fam"), val(algorithm), emit: finemap_preps @@ -100,43 +101,44 @@ process prep_ld_files { tuple val(TRAIT), file("*ROI_Genotype_Matrix_loco.tsv"), file("*LD_loco.tsv"), emit: finemap_LD_loco, optional: true """ - echo "HELLO" - cat ${aggregate_mapping} |\\ - awk '\$0 !~ "\\tNA\\t" {print}' |\\ - awk '!seen[\$1,\$12,\$19,\$20,\$21]++' |\\ - awk 'NR>1{print \$1, \$11, \$18, \$19, \$20}' OFS="\\t" > ${TRAIT}_QTL_peaks.tsv - filename='${TRAIT}_QTL_peaks.tsv' - echo Start - while read p; do - chromosome=`echo \$p | cut -f1 -d' '` - trait=`echo \$p | cut -f2 -d' '` - start_pos=`echo \$p | cut -f3 -d' '` - peak_pos=`echo \$p | cut -f4 -d' '` - end_pos=`echo \$p | cut -f5 -d' '` - if [ \$chromosome == "MtDNA" ]; then - cat ${phenotype} | awk '\$0 !~ "strain" {print}' | cut -f1 > phenotyped_samples.txt - bcftools view --regions \$chromosome:\$start_pos-\$end_pos ${imputed_vcf} \ - -S phenotyped_samples.txt |\\ - bcftools filter -i N_MISSING=0 |\\ - bcftools annotate --rename-chrs ${num_chroms} |\\ - awk '\$0 !~ "#" {print \$1":"\$2}' > \$trait.\$chromosome.\$start_pos.\$end_pos.txt - bcftools view --regions \$chromosome:\$start_pos-\$end_pos ${imputed_vcf} \ - -S phenotyped_samples.txt |\\ - bcftools filter -i N_MISSING=0 -Oz --threads ${task.cpus} |\\ - bcftools annotate --rename-chrs ${num_chroms} -o finemap.vcf.gz - else - cat ${phenotype} | awk '\$0 !~ "strain" {print}' | cut -f1 > phenotyped_samples.txt - bcftools view --regions \$chromosome:\$start_pos-\$end_pos ${imputed_vcf} \ - -S phenotyped_samples.txt |\\ - bcftools filter -i N_MISSING=0 |\\ - bcftools filter -e 'GT="het"' |\\ - bcftools annotate --rename-chrs ${num_chroms} |\\ - awk '\$0 !~ "#" {print \$1":"\$2}' > \$trait.\$chromosome.\$start_pos.\$end_pos.txt - bcftools view --regions \$chromosome:\$start_pos-\$end_pos ${imputed_vcf} \ - -S phenotyped_samples.txt |\\ - bcftools filter -i N_MISSING=0 -Oz --threads ${task.cpus} |\\ - bcftools annotate --rename-chrs ${num_chroms} -o finemap.vcf.gz - fi + echo "HELLO" + cat ${aggregate_mapping} |\\ + awk '\$0 !~ "\\tNA\\t" {print}' |\\ + awk '!seen[\$1,\$12,\$19,\$20,\$21]++' |\\ + awk 'NR>1{print \$1, \$11, \$18, \$19, \$20}' OFS="\\t" > ${TRAIT}_QTL_peaks.tsv + filename='${TRAIT}_QTL_peaks.tsv' + echo Start + while read p; do + chromosome=`echo \$p | cut -f1 -d ' '` + trait=`echo \$p | cut -f2 -d ' '` + start_pos=`echo \$p | cut -f3 -d ' '` + peak_pos=`echo \$p | cut -f4 -d ' '` + end_pos=`echo \$p | cut -f5 -d ' '` + echo "\$chromosome" + if [ \$chromosome == "MtDNA" ]; then + cat ${phenotype} | awk '\$0 !~ "strain" {print}' | cut -f1 > phenotyped_samples.txt + bcftools view --regions \$chromosome:\$start_pos-\$end_pos ${imputed_vcf} \ + -S phenotyped_samples.txt |\\ + bcftools filter -i N_MISSING=0 |\\ + bcftools annotate --rename-chrs ${num_chroms} |\\ + awk '\$0 !~ "#" {print \$1":"\$2}' > \$trait.\$chromosome.\$start_pos.\$end_pos.txt + bcftools view --regions \$chromosome:\$start_pos-\$end_pos ${imputed_vcf} \ + -S phenotyped_samples.txt |\\ + bcftools filter -i N_MISSING=0 -Oz --threads ${task.cpus} |\\ + bcftools annotate --rename-chrs ${num_chroms} -o finemap.vcf.gz + else + cat ${phenotype} | awk '\$0 !~ "strain" {print}' | cut -f1 > phenotyped_samples.txt + bcftools view --regions \$chromosome:\$start_pos-\$end_pos ${imputed_vcf} \ + -S phenotyped_samples.txt |\\ + bcftools filter -i N_MISSING=0 |\\ + bcftools filter -e 'GT="het"' |\\ + bcftools annotate --rename-chrs ${num_chroms} |\\ + awk '\$0 !~ "#" {print \$1":"\$2}' > \$trait.\$chromosome.\$start_pos.\$end_pos.txt + bcftools view --regions \$chromosome:\$start_pos-\$end_pos ${imputed_vcf} \ + -S phenotyped_samples.txt |\\ + bcftools filter -i N_MISSING=0 -Oz --threads ${task.cpus} |\\ + bcftools annotate --rename-chrs ${num_chroms} -o finemap.vcf.gz + fi plink --vcf finemap.vcf.gz \\ --threads ${task.cpus} \\ --snps-only \\ @@ -149,8 +151,10 @@ process prep_ld_files { --recode vcf-iid bgz \\ --extract \$trait.\$chromosome.\$start_pos.\$end_pos.txt \\ --out \$trait.\$chromosome.\$start_pos.\$end_pos - nsnps=`wc -l \$trait.\$chromosome.\$start_pos.\$end_pos.txt | cut -f1 -d' '` - chrom_num=`cat ${num_chroms} | grep -w \$chromosome | cut -f2 -d' '` + nsnps=`wc -l \$trait.\$chromosome.\$start_pos.\$end_pos.txt | cut -f1 -d ' '` + echo `cat ${num_chroms} | grep -w \$chromosome` + chrom_num=`cat ${num_chroms} | grep -w \$chromosome | sed "s/\\t/ /" | cut -f2 -d ' '` + echo \$chrom_num plink --r2 with-freqs \\ --threads ${task.cpus} \\ --allow-extra-chr \\ @@ -196,15 +200,18 @@ process gcta_fine_maps { publishDir "${params.out}/LOCO/Fine_Mappings/Plots", mode: 'copy', pattern: "*loco.pdf" input: - tuple val(TRAIT), file(pheno), file(ROI_geno), file(ROI_LD), file(bim), file(bed), file(fam), val(algorithm), \ - file(annotation), file(genefile), file(finemap_qtl_intervals), file(plot_genes), file(traits), file(bed), file(bim), file(fam), file(map), file(nosex), file(ped), file(log), file(grm_bin), file(grm_id), file(grm_nbin), file(grm_bin_inbred), file(grm_id_inbred) + tuple val(TRAIT), file(pheno), file(ROI_geno), file(ROI_LD), file(bim), file(bed), \ + file(fam), val(algorithm), file(annotation), file(genefile), \ + file(finemap_qtl_intervals), file(plot_genes), file(traits), file(bed), file(bim), \ + file(fam), file(map), file(nosex), file(ped), file(log), file(grm_bin), file(grm_id), \ + file(grm_nbin) output: tuple file("*.fastGWA"), val(TRAIT), file("*.prLD_df*.tsv"), file("*.pdf"), file("*_genes*.tsv"), val(algorithm) tuple file("*_genes*.tsv"), val(TRAIT), val(algorithm), emit: finemap_done tuple val(TRAIT), file("*inbred.fastGWA"), file("*.prLD_df_inbred.tsv"), file("*_genes_inbred.tsv"), emit: finemap_html_inbred, optional: true tuple val(TRAIT), file("*loco.fastGWA"), file("*.prLD_df_loco.tsv"), file("*_genes_loco.tsv"), emit: finemap_html_loco, optional: true - + """ tail -n +2 ${pheno} | awk 'BEGIN {OFS="\\t"}; {print \$1, \$1, \$2}' > plink_finemap_traits.tsv for i in *ROI_Genotype_Matrix_${algorithm}.tsv; @@ -259,8 +266,7 @@ process divergent_and_haplotype { tuple file("QTL_peaks.tsv"), val(algorithm), file("divergent_bins"), file(divergent_df_isotype), file(haplotype_df_isotype), file(div_isotype_list) output: - tuple file("all_QTL_bins_inbred.bed"), file("all_QTL_div_inbred.bed"), file("haplotype_in_QTL_region_inbred.txt"), file("div_isotype_list2_inbred.txt"), emit: div_hap_table_inbred, optional: true - tuple file("all_QTL_bins_loco.bed"), val(algorithm), file("all_QTL_div_loco.bed"), file("haplotype_in_QTL_region_loco.txt"), file("div_isotype_list2_loco.txt"), emit: div_hap_table_loco, optional: true + tuple file("all_QTL_bins_inbred.bed"), file("all_QTL_div_${algorithm}.bed"), file("haplotype_in_QTL_region_${algorithm}.txt"), file("div_isotype_list2_${algorithm}.txt"), emit: div_hap_table """ awk NR\\>1 QTL_peaks.tsv | awk -v OFS='\t' '{print \$1,\$5,\$7}' > QTL_region_${algorithm}.bed @@ -282,14 +288,88 @@ process html_report_main { publishDir "${params.out}/Reports/scripts/", pattern: "*.Rmd", overwrite: true, mode: 'copy' publishDir "${params.out}/Reports", pattern: "*.html", overwrite: true, mode: 'copy' +/* +file "QTL_peaks_inbred.tsv", # qtl_peaks_inbred +path "*AGGREGATE_qtl_region_loco.tsv", # qtl_peaks_loco +path "pr_*.tsv", # fixed_strain_phenotypes +path "strain_issues.txt", # strain_issues +file("total_independent_tests.txt"), # into independent_tests +file("Genotype_Matrix.tsv"), # genoype matrix +path "${params.bin_dir}/NemaScan_Report_main.Rmd", # +path "${params.bin_dir}/NemaScan_Report_region_template.Rmd", # +path "${params.bin_dir}/render_markdown.R", # +path "${params.bin_dir}/NemaScan_Report_algorithm_template.Rmd", # +val mde, # plot mediation? +val "${params.species}", # target species +file "all_QTL_bins_inbred.bed", # (optional) +file "all_QTL_div_inbred.bed", # (optional) +file "haplotype_in_QTL_region_inbred.txt", # (optional) +file "div_isotype_list2_inbred.txt", # (optional) +file "all_QTL_bins_loco.bed", # (optional) +val algorithm, # (optional) +file "all_QTL_div_loco.bed", # (optional) +file "haplotype_in_QTL_region_loco.txt", # (optional) +file "div_isotype_list2_loco.txt", # (optional) +join by 2 - + file "*AGGREGATE_mapping_inbred.tsv", file "*AGGREGATE_mapping_loco.tsv" , val TRAIT +join w/remainder (optional) - + val TRAIT, file "*inbred.fastGWA", file "*.prLD_df_inbred.tsv", file "*_genes_inbred.tsv" +join w/remainder (optional) - + val TRAIT, file "*loco.fastGWA", file "*.prLD_df_loco.tsv", file "*_genes_loco.tsv" +join w/remainder (optional) - + val TRAIT, file "*ROI_Genotype_Matrix_inbred.tsv", file "*LD_inbred.tsv" +join w/remainder (optional) - + val TRAIT , file "*ROI_Genotype_Matrix_loco.tsv", file "*LD_loco.tsv" +join w/remainder (optional) - + val TRAIT, file "${TRAIT}_mediation_inbred.tsv" +join w/remainder (optional) - + val TRAIT, file "${TRAIT}_mediation_loco.tsv" +*/ input: // there is no way this will work for every trait... UGH - tuple val(TRAIT), file(qtl_peaks_inbred), file(qtl_peaks_loco), file(pheno), file(strain_issues), file(tests), file(geno), file(ns_report_md), \ - file(ns_report_template_md), file(render_markdown), file(ns_report_alg), val(mediate), val(species), file(qtl_bins_inbred), file(qtl_div_inbred), \ - file(haplotype_qtl_inbred), file(div_isotype_inbred), file(qtl_bins_loco), val(algorithm), file(qtl_div_loco),file(haplotype_qtl_loco), file(div_isotype_loco), \ - file(pmap_inbred), file(pmap_loco), file(fastGWA_inbred), file(prLD_inbred), file(bcsq_genes_inbred), file(fastGWA_loco), file(prLD_loco), file(bcsq_genes_loco), \ - file(roi_geno_inbred), file(roi_ld_inbred), file(roi_geno_loco), file(roi_ld_loco),\ - file(mediation_summary_inbred), file(mediation_summary_loco) + tuple val(TRAIT), \ + file(qtl_peaks_inbred), \ + file(qtl_peaks_loco), \ + file(pheno), \ + file(strain_issues), \ + file(tests), \ + file(geno), \ + file(ns_report_md), \ + file(ns_report_template_md), \ + file(render_markdown), \ + file(ns_report_alg), \ + val(mediate), \ + val(species), \ + file(qtl_bins_inbred), \ + file(qtl_div_inbred), \ + file(haplotype_qtl_inbred), \ + file(div_isotype_inbred), \ + file(qtl_bins_loco), \ + val(algorithm), \ + file(qtl_div_loco), \ + file(haplotype_qtl_loco), \ + file(div_isotype_loco), \ + \ + file(pmap_inbred), \ + file(pmap_loco), \ + file(fastGWA_inbred), \ + \ + file(prLD_inbred), \ + file(bcsq_genes_inbred), \ + file(fastGWA_loco), \ + \ + file(prLD_loco), \ + file(bcsq_genes_loco), \ + \ + file(roi_geno_inbred), \ + file(roi_ld_inbred), \ + \ + file(roi_geno_loco), \ + file(roi_ld_loco),\ + \ + file(mediation_summary_inbred), \ + \ + file(mediation_summary_loco) output: tuple file("NemaScan_Report_*.Rmd"), file("NemaScan_Report_*.html") diff --git a/modules/setup.nf b/modules/setup.nf index b12055b..8158af9 100644 --- a/modules/setup.nf +++ b/modules/setup.nf @@ -12,9 +12,6 @@ process pull_vcf { tag {"PULLING VCF FROM CeNDR"} - executor "local" - container null - output: path "*hard-filter.isotype.vcf.gz", emit: hard_vcf path "*hard-filter.isotype.vcf.gz.tbi", emit: hard_vcf_index @@ -143,7 +140,7 @@ process fix_strain_names_alt { process vcf_to_geno_matrix { label "ml" - label "plink" + label "nemascan" publishDir "${params.out}/Genotype_Matrix", mode: 'copy' diff --git a/modules/simulations.nf b/modules/simulations.nf index cbe783d..190e0af 100644 --- a/modules/simulations.nf +++ b/modules/simulations.nf @@ -23,7 +23,9 @@ process prepare_simulation_files { """ - bcftools view -s `echo ${strains} ${vcf} | tr -d '\\n'` |\\ + export > node_name.txt + bcftools view --force-samples -s ${strains} ${vcf} | \\ + bcftools annotate --rename-chrs ${num_chroms} ${vcf} |\\ bcftools filter -i N_MISSING=0 -Oz -o renamed_chroms.vcf.gz tabix -p vcf renamed_chroms.vcf.gz plink --vcf renamed_chroms.vcf.gz \\ @@ -90,10 +92,11 @@ process chrom_eigen_variants_sims { """ - cat ${geno} |\\ - awk -v chrom="${CHROM}" '{if(\$1 == "CHROM" || \$1 == chrom) print}' > ${CHROM}_gm.tsv - Rscript --vanilla ${get_genomatrix_eigen} ${CHROM}_gm.tsv ${CHROM} - mv ${CHROM}_independent_snvs.csv ${CHROM}_${strain_set}_${MAF}_independent_snvs.csv + export > node_name.txt + cat ${geno} |\\ + awk -v chrom="${CHROM}" '{if(\$1 == "CHROM" || \$1 == chrom) print}' > ${CHROM}_gm.tsv + Rscript --vanilla ${get_genomatrix_eigen} ${CHROM}_gm.tsv ${CHROM} + mv ${CHROM}_independent_snvs.csv ${CHROM}_${strain_set}_${MAF}_independent_snvs.csv """ } @@ -106,7 +109,6 @@ process collect_eigen_variants_sims { executor 'local' container null - label "md" publishDir "${params.out}/Genotype_Matrix", mode: 'copy' @@ -118,9 +120,9 @@ process collect_eigen_variants_sims { tuple val(strain_set), val(strains), file(bed), file(bim), file(fam), file(map), file(sex), file(ped), file(log), file(geno), val(MAF), file("${strain_set}_${MAF}_total_independent_tests.txt") """ - cat *independent_snvs.csv |\\ - grep -v inde |\\ - awk '{s+=\$1}END{print s}' > ${strain_set}_${MAF}_total_independent_tests.txt + cat *independent_snvs.csv |\\ + grep -v inde |\\ + awk '{s+=\$1}END{print s}' > ${strain_set}_${MAF}_total_independent_tests.txt """ } @@ -139,8 +141,9 @@ process simulate_effects_loc { tuple val(strain_set), val(strains), file(bed), file(bim), file(fam), file(map), file(nosex), file(ped), file(log), file(gm), val(MAF), file(n_indep_tests), val(NQTL), val(SIMREP), val(effect_range), file("causal.variants.sim.${NQTL}.${SIMREP}.txt") """ - Rscript --vanilla ${create_causal_qtls} ${bim} ${NQTL} ${effect_range} ${qtl_loc_bed} - mv causal.variants.sim.${NQTL}.txt causal.variants.sim.${NQTL}.${SIMREP}.txt + export > node_name.txt + Rscript --vanilla ${create_causal_qtls} ${bim} ${NQTL} ${effect_range} ${qtl_loc_bed} + mv causal.variants.sim.${NQTL}.txt causal.variants.sim.${NQTL}.${SIMREP}.txt """ } @@ -159,8 +162,9 @@ process simulate_effects_genome { """ - Rscript --vanilla ${create_causal_qtls} ${bim} ${NQTL} ${effect_range} - mv causal.variants.sim.${NQTL}.txt causal.variants.sim.${NQTL}.${SIMREP}.txt + export > node_name.txt + Rscript --vanilla ${create_causal_qtls} ${bim} ${NQTL} ${effect_range} + mv causal.variants.sim.${NQTL}.txt causal.variants.sim.${NQTL}.${SIMREP}.txt """ } @@ -170,6 +174,7 @@ process simulate_map_phenotypes { tag {"${NQTL} - ${SIMREP} - ${H2} - ${MAF}"} label "md" + label "prep_sims" publishDir "${params.out}/Simulations/${effect_range}/${NQTL}/Mappings", pattern: "*fastGWA", overwrite: true publishDir "${params.out}/Simulations/${effect_range}/${NQTL}/Mappings", pattern: "*loco.mlma", overwrite: true @@ -191,6 +196,7 @@ process simulate_map_phenotypes { tuple val(strain_set), val(strains), val(NQTL), val(SIMREP), val(H2), file(loci), file(gm), val(effect_range), file(n_indep_tests), val(MAF), file("${NQTL}_${SIMREP}_${H2}_${MAF}_${effect_range}_${strain_set}_lmm-exact_inbred_pca.fastGWA"), file("${NQTL}_${SIMREP}_${H2}_${MAF}_${effect_range}_${strain_set}_lmm-exact.loco.mlma"), file("${NQTL}_${SIMREP}_${H2}_${MAF}_${effect_range}_${strain_set}_sims.par"), file("${NQTL}_${SIMREP}_${H2}_${MAF}_${effect_range}_${strain_set}_sims.phen"), emit: gcta_intervals """ + export > node_name.txt gcta64 --bfile TO_SIMS \\ --simu-qt \\ --simu-causal-loci ${loci} \\ @@ -301,8 +307,9 @@ process get_gcta_intervals { script: """ - Rscript --vanilla ${find_gcta_intervals} ${gm} ${phenotypes} ${lmmexact_inbred} ${n_indep_tests} ${NQTL} ${SIMREP} ${QTL_GROUP_SIZE} ${QTL_CI_SIZE} ${H2} ${params.maf} ${THRESHOLD} ${strain_set} ${MAF} ${effect_range} LMM-EXACT-INBRED_PCA - Rscript --vanilla ${find_gcta_intervals_loco} ${gm} ${phenotypes} ${lmmexact_loco} ${n_indep_tests} ${NQTL} ${SIMREP} ${QTL_GROUP_SIZE} ${QTL_CI_SIZE} ${H2} ${params.maf} ${THRESHOLD} ${strain_set} ${MAF} ${effect_range} LMM-EXACT-LOCO_PCA + export > node_name.txt + Rscript --vanilla ${find_gcta_intervals} ${gm} ${phenotypes} ${lmmexact_inbred} ${n_indep_tests} ${NQTL} ${SIMREP} ${QTL_GROUP_SIZE} ${QTL_CI_SIZE} ${H2} ${params.maf} ${THRESHOLD} ${strain_set} ${MAF} ${effect_range} LMM-EXACT-INBRED_PCA + Rscript --vanilla ${find_gcta_intervals_loco} ${gm} ${phenotypes} ${lmmexact_loco} ${n_indep_tests} ${NQTL} ${SIMREP} ${QTL_GROUP_SIZE} ${QTL_CI_SIZE} ${H2} ${params.maf} ${THRESHOLD} ${strain_set} ${MAF} ${effect_range} LMM-EXACT-LOCO_PCA """ } @@ -310,7 +317,7 @@ process assess_sims_INBRED { label 'assess_sims' label 'sm' - + publishDir "${params.out}/scored_sims", mode: 'copy', pattern: "*_mapping.tsv" input: @@ -320,6 +327,7 @@ process assess_sims_INBRED { script: """ + export > node_name.txt Rscript --vanilla ${R_assess_sims} ${mapping_processed} ${gm} ${var_effects} ${phenotypes} ${NQTL} ${SIMREP} ${H2} ${MAF} ${effect_range} ${strain_set} ${algorithm_id} """ @@ -338,6 +346,7 @@ process assess_sims_LOCO { script: """ + export > node_name.txt Rscript --vanilla ${R_assess_sims} ${mapping_processed} ${gm} ${var_effects} ${phenotypes} ${NQTL} ${SIMREP} ${H2} ${MAF} ${effect_range} ${strain_set} ${algorithm_id} """ diff --git a/nextflow.config b/nextflow.config index 47f570d..5a596b7 100644 --- a/nextflow.config +++ b/nextflow.config @@ -56,10 +56,11 @@ timeline { } process { - container = 'andersenlab/nemascan:20220407173056db3227' + //container = 'andersenlab/nemascan:20220407173056db3227' + container = 'msauria/nemascan:060724' - withLabel: plink { - container = "andersenlab/postgatk:latest" + withLabel: R { + container = 'andersenlab/r_packages:v0.7' } withLabel: mediation { @@ -70,6 +71,10 @@ process { container = 'mckeowr1/prep_sims:1.1' } + withLabel: gcta { + container = 'msauria/gcta:060624' + } + withLabel: assess_sims { container = 'mckeowr1/asess_sims:1.1' }