diff --git a/bin/Fix_Isotype_names_bulk.R b/bin/Fix_Isotype_names_bulk.R index 6e72a04..c776e41 100755 --- a/bin/Fix_Isotype_names_bulk.R +++ b/bin/Fix_Isotype_names_bulk.R @@ -225,6 +225,9 @@ process_phenotypes <- function(df, # get strain isotypes strain_isotypes_db <- data.table::fread(args[3]) # identify strains that were phenotyped, but are not part of an isotype + strains_in <- df$strain %in% strain_isotypes_db$strain + message(glue::glue("{df$strain} ")) + message(glue::glue("{strains_in} ")) non_isotype_strains <- dplyr::filter(df, !(strain %in% c(strain_isotypes_db$strain, "PD1074")), !(strain %in% strain_isotypes_db$isotype)) @@ -233,7 +236,7 @@ process_phenotypes <- function(df, strains_to_remove <- unique(non_isotype_strains$strain) - message(glue::glue("WARNING: Removing strain(s) {strains_to_remove} because they do not fall into a defined isotype.")) + message(glue::glue("WARNING: Removing strain(s) {strains_to_remove} because they do not fall into a defined isotype.\n")) df_non_isotypes_removed <- dplyr::filter( df, !( strain %in% strains_to_remove) ) } else { @@ -271,7 +274,7 @@ process_phenotypes <- function(df, fix <- df %>% dplyr::mutate(strain = isotype) %>% dplyr::select(-ref_strain, -num) - message(glue::glue("WARNING: Non-isotype reference strain {df$strain[1]} renamed to isotype {i}.")) + message(glue::glue("WARNING: Non-isotype reference strain {df$strain[1]} renamed to isotype {i}.\n")) } else { # remove non-isotype strains fix <- df %>% @@ -280,10 +283,10 @@ process_phenotypes <- function(df, # warn the user if(sum(df$ref_strain) > 0) { - message(glue::glue("WARNING: Non-isotype reference strain(s) {paste(df %>% dplyr::filter(!ref_strain) %>% dplyr::pull(strain) %>% unique(), collapse = ', ')} from isotype group {i} removed.")) + message(glue::glue("WARNING: Non-isotype reference strain(s) {paste(df %>% dplyr::filter(!ref_strain) %>% dplyr::pull(strain) %>% unique(), collapse = ', ')} from isotype group {i} removed.\n")) } else { - message(glue::glue("WARNING: Non-isotype reference strain(s) {paste(df %>% dplyr::filter(!ref_strain) %>% dplyr::pull(strain) %>% unique(), collapse = ', ')} from isotype group {i} removed. To include this isotype in the analysis, you can (1) phenotype {i} or (2) evaluate the similarity of these strains and choose one representative for the group.")) + message(glue::glue("WARNING: Non-isotype reference strain(s) {paste(df %>% dplyr::filter(!ref_strain) %>% dplyr::pull(strain) %>% unique(), collapse = ', ')} from isotype group {i} removed. To include this isotype in the analysis, you can (1) phenotype {i} or (2) evaluate the similarity of these strains and choose one representative for the group.\n")) } } # add to data diff --git a/bin/plot_genes.R b/bin/plot_genes.R index 5d8d79b..88fb765 100644 --- a/bin/plot_genes.R +++ b/bin/plot_genes.R @@ -1,5 +1,6 @@ #!/usr/bin/env Rscript library(dplyr) +library(tibble) library(tidyr) library(ggplot2) library(stringr) @@ -12,6 +13,8 @@ library(purrr) # 2 = phenotype file # 3 = gene file # 4 = annotation file +# 5 = algorithm +# 6 = species args <- commandArgs(trailingOnly = TRUE) @@ -52,6 +55,8 @@ query_regions <- pr_trait_ld %>% query_regions +# query_regions == CHROM, start_pos, end_pos + # update 20210330 KSE: use impute vcf for genotypes and hard vcf annotation file for annotations # this could be bcsq or snpeff annotations <- data.table::fread(args[4]) @@ -63,6 +68,8 @@ if("CONSEQUENCE" %in% names(annotations)) { ann_type <- "snpeff" } +species = args[6] + # do this for each QTL separately, then combine annotation_out <- list() for(r in 1:nrow(query_regions)){ @@ -71,6 +78,8 @@ for(r in 1:nrow(query_regions)){ eq <- query_regions$end_pos[r] + + # a function to check how many strains in the mapping set are in the column "Strains" of the annotation file alt_strain_check=function(df){ @@ -93,6 +102,8 @@ for(r in 1:nrow(query_regions)){ dplyr::select(CHROM=CHR,POS,strains_used=strains) %>% dplyr::distinct() +# strain_list == CHROM, POS, strains_used + # count and filter annotations_region <- annotations %>% dplyr::filter(CHROM == cq, @@ -115,7 +126,6 @@ for(r in 1:nrow(query_regions)){ }%>% tidyr::unite(marker, CHROM, POS, sep = "_") - # pull variants from finemap impute -- don't need this? # impute_vcf <- data.table::fread(args[4]) %>% # dplyr::select(marker:POS, REF:strains) @@ -142,7 +152,12 @@ for(r in 1:nrow(query_regions)){ } # combine annotations for regions -annotation_df <- dplyr::bind_rows(annotation_out) %>% +annotation_out = dplyr::bind_rows(annotation_out) +if(ann_type == "bcsq" && species != "c_elegans") { + annotation_out = tibble::add_column(annotation_out, dplyr::select(annotation_out, GENE) %>% dplyr::rename(WORMBASE_ID = GENE)) +} + +annotation_df <- annotation_out %>% dplyr::left_join(pr_trait_ld, ., by = c("marker", "REF", "ALT")) %>% { if(ann_type == "bcsq") dplyr::rename(., gene_id = WORMBASE_ID) else dplyr::select(., -gene_name) } @@ -186,7 +201,7 @@ tidy_genes_in_region <- if(ann_type == "bcsq") { # path = glue::glue("{analysis_trait}_{cq}_{sq}-{eq}_{ann_type}_genes_{args[5]}.tsv")) write_tsv(tidy_genes_in_region, - path = glue::glue("{analysis_trait}_{cq}_{output_sq}-{output_eq}_{ann_type}_genes_{args[5]}.tsv")) + file = glue::glue("{analysis_trait}_{cq}_{output_sq}-{output_eq}_{ann_type}_genes_{args[5]}.tsv")) for(r in 1:length(unique(ugly_genes_in_region$start_pos))){ diff --git a/main.nf b/main.nf index 3be817c..5e32424 100644 --- a/main.nf +++ b/main.nf @@ -131,7 +131,7 @@ if(params.debug) { println "WARNING: Using snpeff annotation. To use BCSQ annotation, please use a newer vcf (2021 or later)" ann_file = Channel.fromPath("${params.dataDir}/${params.species}/WI/variation/${params.vcf}/vcf/WI.${params.vcf}.strain-annotation.snpeff.tsv") } else { - ann_file = Channel.fromPath("${parms.dataDir}/${params.species}/WI/variation/${params.vcf}/vcf/WI.${params.vcf}.strain-annotation.tsv") + ann_file = Channel.fromPath("${params.dataDir}/${params.species}/WI/variation/${params.vcf}/vcf/WI.${params.vcf}.strain-annotation.tsv") } } else { // check that vcf file exists, if it does, use it. If not, throw error @@ -433,7 +433,8 @@ workflow { .combine(ann_file) .combine(Channel.fromPath("${params.genes}")) .combine(Channel.fromPath("${params.bin_dir}/Finemap_QTL_Intervals.R")) - .combine(Channel.fromPath("${params.bin_dir}/plot_genes.R")) | gcta_fine_maps + .combine(Channel.fromPath("${params.bin_dir}/plot_genes.R")) + .combine(Channel.of("${params.species}")) | gcta_fine_maps // divergent regions and haplotypes diff --git a/modules/post-mapping.nf b/modules/post-mapping.nf index 22c6262..d223d07 100644 --- a/modules/post-mapping.nf +++ b/modules/post-mapping.nf @@ -204,7 +204,7 @@ process gcta_fine_maps { file(formatted_traits), file(trait_bed), file(trait_bim), file(trait_fam), \ file(trait_map), file(trait_nosex), file(trait_ped), file(grm_bin), file(grm_id), \ file(grm_nbin), file(grm_bin_inbred), file(grm_id_inbred), \ - file(annotation), file(genefile), file(finemap_qtl_intervals), file(plot_genes) + file(annotation), file(genefile), file(finemap_qtl_intervals), file(plot_genes), val(species) output: tuple file("*.fastGWA"), val(TRAIT), file("*.prLD_df*.tsv"), file("*.pdf"), file("*_genes*.tsv"), val(algorithm) @@ -248,7 +248,7 @@ process gcta_fine_maps { Rscript --vanilla ${finemap_qtl_intervals} ${TRAIT}.\$chr.\$start.\$stop.finemap_inbred.${algorithm}.fastGWA \$i ${TRAIT}.\$chr.\$start.\$stop.LD_${algorithm}.tsv ${algorithm} - Rscript --vanilla ${plot_genes} ${TRAIT}.\$chr.\$start.\$stop.prLD_df_${algorithm}.tsv ${pheno} ${genefile} ${annotation} ${algorithm} + Rscript --vanilla ${plot_genes} ${TRAIT}.\$chr.\$start.\$stop.prLD_df_${algorithm}.tsv ${pheno} ${genefile} ${annotation} ${algorithm} ${species} done """ }