Ensure filtering is same for mod/hist in load/burden calc

also remove unused code
zjnolen · Dec 4, 2024 · fd4bc20 · fd4bc20
1 parent bac7ab5
commit fd4bc20
Show file tree

Hide file tree

Showing 3 changed files with 19 additions and 221 deletions.
diff --git a/angsd/workflow/Snakefile b/angsd/workflow/Snakefile
@@ -21,6 +21,9 @@ include: "rules/vep.smk"
 include: "rules/theta_downsampling.smk"
 
 
+# include: "rules/gone.smk"
+
+
 wildcard_constraints:
     ref=config["reference"]["name"],
     dp=".{0}|.dp[1-9][0-9]*",
@@ -40,7 +43,6 @@ if config["vep"]:
     all_outputs.extend(
         expand(
             "results/datasets/{{dataset}}/analyses/vep/{{dataset}}.{{ref}}_all{dp}_{{sites}}-filts.filtered_mindp{mindp}-biallelic.{{trans}}.fmiss{{miss}}.varimpacts.tsv",
-            zip,
             dp=[".dp5"],
             mindp=[4],
         )
@@ -50,7 +52,6 @@ if config["bcftools_roh"]:
     all_outputs.extend(
         expand(
             "results/datasets/{{dataset}}/plots/inbreeding/{{dataset}}.{{ref}}_all{dp}_{{sites}}-filts.filtered_mindp{mindp}-biallelic.{{trans}}.fmiss{{miss}}.bcftools.froh_bins.svg",
-            zip,
             dp=[".dp5"],
             mindp=[4],
         )
@@ -60,7 +61,6 @@ if config["gerp_rel_load"]:
     all_outputs.extend(
         expand(
             "results/datasets/{{dataset}}/analyses/gerp/{{dataset}}.{{ref}}_all{dp}_{{sites}}-filts.filtered_mindp{mindp}-biallelic.{{trans}}.fmiss{{miss}}.relative-load-gerp.csv",
-            zip,
             dp=[".dp5"],
             mindp=[4],
         )

diff --git a/angsd/workflow/rules/vep.smk b/angsd/workflow/rules/vep.smk
@@ -58,9 +58,7 @@ rule calc_vep_load:
         pops="results/datasets/{dataset}/poplists/{dataset}_all.indiv.list",
     output:
         varcounts="results/datasets/{dataset}/analyses/vep/{dataset}.{ref}_all{dp}_{sites}-filts.filtered_mindp{mindp}-biallelic.{trans}.fmiss{miss}.varimpacts.tsv",
-        varcounts_boots="results/datasets/{dataset}/analyses/vep/{dataset}.{ref}_all{dp}_{sites}-filts.filtered_mindp{mindp}-biallelic.{trans}.fmiss{miss}.varimpacts_boots.tsv",
-        varcounts_nomiss="results/datasets/{dataset}/analyses/vep/{dataset}.{ref}_all{dp}_{sites}-filts.filtered_mindp{mindp}-biallelic.{trans}.fmiss{miss}.varimpacts_nomiss.tsv",
-    threads: lambda w, attempt: 2 * attempt
+    threads: 8
     conda:
         "../envs/r.yaml"
     resources:

diff --git a/angsd/workflow/scripts/genetic_load_vep.R b/angsd/workflow/scripts/genetic_load_vep.R
@@ -18,9 +18,6 @@ df <- merge(vars, vep, by = c("chr", "pos"))
 sum(df$alt == df$vepalt) == nrow(df) # Check that alt alleles match
 df$impact <- gsub("IMPACT=", "", df$impact)
 
-# Remove modifier variants
-# df <- df[df$impact != "MODIFIER", ]
-
 # Get sample list and metadata
 samples <- read.table(snakemake@input[["samples"]])
 colnames(samples) <- c("sample")
@@ -62,75 +59,13 @@ impacts <- df$impact
 # historical samples
 comp80both <- (
   rowSums(!is.na(histgt)) >= floor(ncol(histgt) * 0.8) &
-  rowSums(!is.na(modgt)) >= floor(ncol(modgt) * 0.8)
+    rowSums(!is.na(modgt)) >= floor(ncol(modgt) * 0.8)
 )
 
 histgt <- histgt[comp80both, ]
 modgt <- modgt[comp80both, ]
 impacts <- impacts[comp80both]
-
-# # Calculate Rxy for the dataset, treating historical samples and modern samples
-# # as single units
-
-# # set values for 'bootstraps'
-# nboot <- 1000 # number of 'bootstraps' to perform
-# nsample <- 8 # number of samples to sample per group
-
-# # initialize rxy data frame
-# rxy <- data.frame()
-
-# for (b in c(1:nboot)) {
-#   # subset historical genotypes by nsample
-#   subhistgt <- sample(histgt, nsample)
-#   # calculate allele frequencies for historical samples at all sites
-#   hist_af <- (rowSums(subhistgt, na.rm = TRUE)) /
-#     (rowSums(!is.na(subhistgt)) * 2)
-#   # subset modern genotypes by nsample
-#   submodgt <- sample(modgt, nsample)
-#   # calculate allele frequencies for modern samples at all sites
-#   mod_af <- (rowSums(submodgt, na.rm = TRUE)) /
-#     (rowSums(!is.na(submodgt)) * 2)
-#   # remove sites with missing individuals from subsampling to ensure subsamples
-#   # are even (only relevant if data wasn't already filtered for missingness)
-#   sitefilt <- (complete.cases(submodgt) * complete.cases(subhistgt)) == 1
-#   # Filter by missingness. Also only use sites that are segregating in the
-#   # subsample. This makes the subsample a filtered set with equivalent
-#   # processing to our original data
-#   sitefilt <- sitefilt & (
-#     (mod_af > 0 & mod_af < 1) | (hist_af > 0 & hist_af < 1)
-#   )
-#   hist_af <- hist_af[sitefilt]
-#   mod_af <- mod_af[sitefilt]
-#   bootimpacts <- impacts[sitefilt]
-#   usable_sites <- length(bootimpacts)
-#   # Prep Rxy dataframe
-#   rxy_df <- data.frame(matrix(nrow = length(bootimpacts), ncol = 0))
-#   rxy_df$hist_af <- hist_af
-#   rxy_df$mod_af <- mod_af
-#   rxy_df$impact <- bootimpacts
-#   usable_sites <- nrow(rxy_df)
-#   # calculate Rxy for historical-modern frequencies (<1 is increase in freq
-#   # over time)
-#   rxy_df <- rxy_df %>%
-#     group_by(impact) %>%
-#     summarise(
-#       rxy = ((sum(hist_af * (1 - mod_af))) / (sum(mod_af * (1 - hist_af))))
-#     )
-#   rxy_df$boot <- b
-#   rxy_df$usablesites <- usable_sites
-#   rxy_df$nsamples <- nsample
-#   rxy <- rbind(rxy, rxy_df)
-# }
-
-# # Column names update
-# colnames(rxy) <- c(
-#   "var.impact", "rxy.hist.mod", "boot", "usable.sites", "nsamples"
-# )
-
-# write.table(rxy,
-#   file = snakemake@output[["rxy"]], quote = FALSE,
-#   sep = "\t", row.names = FALSE, col.names = TRUE
-# )
+df <- df[comp80both, ]
 
 # Now count the totals per individual for alternate alleles. Also count the
 # total alt count per individual for normalization into 'relative' load
@@ -144,19 +79,27 @@ for (sam in samples$sample) {
   mod_hom <- sum(df[df$impact == "MODERATE", sam] == 2, na.rm = TRUE) * 2
   low <- sum(df[df$impact == "LOW", sam], na.rm = TRUE)
   low_hom <- sum(df[df$impact == "LOW", sam] == 2, na.rm = TRUE) * 2
+  inter <- sum(df[df$consequence == "intergenic_variant", sam], na.rm = TRUE)
+  inter_hom <- sum(
+    df[df$consequence == "intergenic_variant", sam] == 2,
+    na.rm = TRUE
+  ) * 2
   alts <- sum(df[, sam], na.rm = TRUE)
   nsites <- sum(!is.na(df[, sam]), na.rm = TRUE)
-  row <- c(sam, high, high_hom, mod, mod_hom, low, low_hom, alts, nsites)
+  row <- c(
+    sam, high, high_hom, mod, mod_hom, low, low_hom, inter, inter_hom,
+    alts, nsites
+  )
   varcounts <- rbind(varcounts, row)
 }
 
 colnames(varcounts) <- c(
-  "sample", "high", "high_hom", "mod", "mod_hom", "low", "low_hom", "alts",
-  "nsites"
+  "sample", "high", "high_hom", "mod", "mod_hom", "low", "low_hom", "inter",
+  "inter_hom", "alts", "nsites"
 )
 numcols <- c(
-  "high", "high_hom", "mod", "mod_hom", "low", "low_hom", "alts",
-  "nsites"
+  "high", "high_hom", "mod", "mod_hom", "low", "low_hom", "inter",
+  "inter_hom", "alts", "nsites"
 )
 varcounts[numcols] <- sapply(varcounts[numcols], as.numeric)
 varcounts <- merge(varcounts, samples, by = "sample")
@@ -165,146 +108,3 @@ write.table(varcounts,
   file = snakemake@output[["varcounts"]], quote = FALSE,
   sep = "\t", row.names = FALSE, col.names = TRUE
 )
-
-varcounts_boots <- data.frame()
-
-for (sam in samples$sample) {
-  sam_nomiss <- df[complete.cases(df[, sam]),]
-  for (b in c(1:300)) {
-    bdf <- sample_n(sam_nomiss, 500000)
-    high <- sum(bdf[bdf$impact == "HIGH", sam], na.rm = TRUE)
-    high_hom <- sum(bdf[bdf$impact == "HIGH", sam] == 2, na.rm = TRUE) * 2
-    mod <- sum(bdf[bdf$impact == "MODERATE", sam], na.rm = TRUE)
-    mod_hom <- sum(bdf[bdf$impact == "MODERATE", sam] == 2, na.rm = TRUE) * 2
-    low <- sum(bdf[bdf$impact == "LOW", sam], na.rm = TRUE)
-    low_hom <- sum(bdf[bdf$impact == "LOW", sam] == 2, na.rm = TRUE) * 2
-    alts <- sum(bdf[, sam], na.rm = TRUE)
-    nsites <- sum(!is.na(bdf[, sam]), na.rm = TRUE)
-    row <- c(sam, b, high, high_hom, mod, mod_hom, low, low_hom, alts, nsites)
-    varcounts_boots <- rbind(varcounts_boots, row)
-  }
-}
-
-colnames(varcounts_boots) <- c(
-  "sample", "boot", "high", "high_hom", "mod", "mod_hom", "low", "low_hom",
-  "alts", "nsites"
-)
-numcols <- c(
-  "boot", "high", "high_hom", "mod", "mod_hom", "low", "low_hom", "alts",
-  "nsites"
-)
-varcounts_boots[numcols] <- sapply(varcounts_boots[numcols], as.numeric)
-varcounts_boots <- merge(varcounts_boots, samples, by = "sample")
-
-write.table(varcounts_boots,
-  file = snakemake@output[["varcounts_boots"]], quote = FALSE,
-  sep = "\t", row.names = FALSE, col.names = TRUE
-)
-
-# Limit to only sites with data for all individuals and repeat both
-df <- df[complete.cases(df), ]
-
-# make data frames of historical and modern genotypes only, as well as list of
-# variant impacts
-histgt <- df[, which(
-  names(df) %in% samples$sample[sampmeta$time == "historical"]
-)]
-modgt <- df[, which(names(df) %in% samples$sample[sampmeta$time == "modern"])]
-impacts <- df$impact
-
-# # Calculate Rxy for the dataset, treating historical samples and modern samples
-# # as single units
-
-# # set values for 'bootstraps'
-# nboot <- 1000 # number of 'bootstraps' to perform
-# nsample <- 8 # number of samples to sample per group
-
-# # initialize rxy data frame
-# rxy <- data.frame()
-
-# for (b in c(1:nboot)) {
-#   # subset historical genotypes by nsample
-#   subhistgt <- sample(histgt, nsample)
-#   # calculate allele frequencies for historical samples at all sites
-#   hist_af <- (rowSums(subhistgt, na.rm = TRUE)) /
-#     (rowSums(!is.na(subhistgt)) * 2)
-#   # subset modern genotypes by nsample
-#   submodgt <- sample(modgt, nsample)
-#   # calculate allele frequencies for modern samples at all sites
-#   mod_af <- (rowSums(submodgt, na.rm = TRUE)) /
-#     (rowSums(!is.na(submodgt)) * 2)
-#   # remove sites with missing individuals from subsampling to ensure subsamples
-#   # are even (only relevant if data wasn't already filtered for missingness)
-#   sitefilt <- (complete.cases(submodgt) * complete.cases(subhistgt)) == 1
-#   # Filter by missingness. Also only use sites that are segregating in the
-#   # subsample. This makes the subsample a filtered set with equivalent
-#   # processing to our original data
-#   sitefilt <- sitefilt & (
-#     (mod_af > 0 & mod_af < 1) | (hist_af > 0 & hist_af < 1)
-#   )
-#   hist_af <- hist_af[sitefilt]
-#   mod_af <- mod_af[sitefilt]
-#   bootimpacts <- impacts[sitefilt]
-#   usable_sites <- length(bootimpacts)
-#   # Prep Rxy dataframe
-#   rxy_df <- data.frame(matrix(nrow = length(bootimpacts), ncol = 0))
-#   rxy_df$hist_af <- hist_af
-#   rxy_df$mod_af <- mod_af
-#   rxy_df$impact <- bootimpacts
-#   usable_sites <- nrow(rxy_df)
-#   # calculate Rxy for historical-modern frequencies (<1 is increase in freq
-#   # over time)
-#   rxy_df <- rxy_df %>%
-#     group_by(impact) %>%
-#     summarise(
-#       rxy = ((sum(hist_af * (1 - mod_af))) / (sum(mod_af * (1 - hist_af))))
-#     )
-#   rxy_df$boot <- b
-#   rxy_df$usablesites <- usable_sites
-#   rxy_df$nsamples <- nsample
-#   rxy <- rbind(rxy, rxy_df)
-# }
-
-# # Column names update
-# colnames(rxy) <- c(
-#   "var.impact", "rxy.hist.mod", "boot", "usable.sites", "nsamples"
-# )
-
-# write.table(rxy,
-#   file = snakemake@output[["rxy_nomiss"]], quote = FALSE,
-#   sep = "\t", row.names = FALSE, col.names = TRUE
-# )
-
-# Now count the totals per individual for alternate alleles. Also count the
-# total alt count per individual for normalization into 'relative' load
-
-varcounts <- data.frame()
-
-for (sam in samples$sample) {
-  high <- sum(df[df$impact == "HIGH", sam], na.rm = TRUE)
-  high_hom <- sum(df[df$impact == "HIGH", sam] == 2, na.rm = TRUE) * 2
-  mod <- sum(df[df$impact == "MODERATE", sam], na.rm = TRUE)
-  mod_hom <- sum(df[df$impact == "MODERATE", sam] == 2, na.rm = TRUE) * 2
-  low <- sum(df[df$impact == "LOW", sam], na.rm = TRUE)
-  low_hom <- sum(df[df$impact == "LOW", sam] == 2, na.rm = TRUE) * 2
-  alts <- sum(df[, sam], na.rm = TRUE)
-  nsites <- sum(!is.na(df[, sam]), na.rm = TRUE)
-  row <- c(sam, high, high_hom, mod, mod_hom, low, low_hom, alts, nsites)
-  varcounts <- rbind(varcounts, row)
-}
-
-colnames(varcounts) <- c(
-  "sample", "high", "high_hom", "mod", "mod_hom", "low", "low_hom", "alts",
-  "nsites"
-)
-numcols <- c(
-  "high", "high_hom", "mod", "mod_hom", "low", "low_hom", "alts",
-  "nsites"
-)
-varcounts[numcols] <- sapply(varcounts[numcols], as.numeric)
-varcounts <- merge(varcounts, samples, by = "sample")
-
-write.table(varcounts,
-  file = snakemake@output[["varcounts_nomiss"]], quote = FALSE,
-  sep = "\t", row.names = FALSE, col.names = TRUE
-)