Skip to content

Commit

Permalink
Ensure filtering is same for mod/hist in load/burden calc
Browse files Browse the repository at this point in the history
also remove unused code
  • Loading branch information
zjnolen committed Dec 4, 2024
1 parent bac7ab5 commit fd4bc20
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 221 deletions.
6 changes: 3 additions & 3 deletions angsd/workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ include: "rules/vep.smk"
include: "rules/theta_downsampling.smk"


# include: "rules/gone.smk"


wildcard_constraints:
ref=config["reference"]["name"],
dp=".{0}|.dp[1-9][0-9]*",
Expand All @@ -40,7 +43,6 @@ if config["vep"]:
all_outputs.extend(
expand(
"results/datasets/{{dataset}}/analyses/vep/{{dataset}}.{{ref}}_all{dp}_{{sites}}-filts.filtered_mindp{mindp}-biallelic.{{trans}}.fmiss{{miss}}.varimpacts.tsv",
zip,
dp=[".dp5"],
mindp=[4],
)
Expand All @@ -50,7 +52,6 @@ if config["bcftools_roh"]:
all_outputs.extend(
expand(
"results/datasets/{{dataset}}/plots/inbreeding/{{dataset}}.{{ref}}_all{dp}_{{sites}}-filts.filtered_mindp{mindp}-biallelic.{{trans}}.fmiss{{miss}}.bcftools.froh_bins.svg",
zip,
dp=[".dp5"],
mindp=[4],
)
Expand All @@ -60,7 +61,6 @@ if config["gerp_rel_load"]:
all_outputs.extend(
expand(
"results/datasets/{{dataset}}/analyses/gerp/{{dataset}}.{{ref}}_all{dp}_{{sites}}-filts.filtered_mindp{mindp}-biallelic.{{trans}}.fmiss{{miss}}.relative-load-gerp.csv",
zip,
dp=[".dp5"],
mindp=[4],
)
Expand Down
4 changes: 1 addition & 3 deletions angsd/workflow/rules/vep.smk
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,7 @@ rule calc_vep_load:
pops="results/datasets/{dataset}/poplists/{dataset}_all.indiv.list",
output:
varcounts="results/datasets/{dataset}/analyses/vep/{dataset}.{ref}_all{dp}_{sites}-filts.filtered_mindp{mindp}-biallelic.{trans}.fmiss{miss}.varimpacts.tsv",
varcounts_boots="results/datasets/{dataset}/analyses/vep/{dataset}.{ref}_all{dp}_{sites}-filts.filtered_mindp{mindp}-biallelic.{trans}.fmiss{miss}.varimpacts_boots.tsv",
varcounts_nomiss="results/datasets/{dataset}/analyses/vep/{dataset}.{ref}_all{dp}_{sites}-filts.filtered_mindp{mindp}-biallelic.{trans}.fmiss{miss}.varimpacts_nomiss.tsv",
threads: lambda w, attempt: 2 * attempt
threads: 8
conda:
"../envs/r.yaml"
resources:
Expand Down
230 changes: 15 additions & 215 deletions angsd/workflow/scripts/genetic_load_vep.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,6 @@ df <- merge(vars, vep, by = c("chr", "pos"))
sum(df$alt == df$vepalt) == nrow(df) # Check that alt alleles match
df$impact <- gsub("IMPACT=", "", df$impact)

# Remove modifier variants
# df <- df[df$impact != "MODIFIER", ]

# Get sample list and metadata
samples <- read.table(snakemake@input[["samples"]])
colnames(samples) <- c("sample")
Expand Down Expand Up @@ -62,75 +59,13 @@ impacts <- df$impact
# historical samples
comp80both <- (
rowSums(!is.na(histgt)) >= floor(ncol(histgt) * 0.8) &
rowSums(!is.na(modgt)) >= floor(ncol(modgt) * 0.8)
rowSums(!is.na(modgt)) >= floor(ncol(modgt) * 0.8)
)

histgt <- histgt[comp80both, ]
modgt <- modgt[comp80both, ]
impacts <- impacts[comp80both]

# # Calculate Rxy for the dataset, treating historical samples and modern samples
# # as single units

# # set values for 'bootstraps'
# nboot <- 1000 # number of 'bootstraps' to perform
# nsample <- 8 # number of samples to sample per group

# # initialize rxy data frame
# rxy <- data.frame()

# for (b in c(1:nboot)) {
# # subset historical genotypes by nsample
# subhistgt <- sample(histgt, nsample)
# # calculate allele frequencies for historical samples at all sites
# hist_af <- (rowSums(subhistgt, na.rm = TRUE)) /
# (rowSums(!is.na(subhistgt)) * 2)
# # subset modern genotypes by nsample
# submodgt <- sample(modgt, nsample)
# # calculate allele frequencies for modern samples at all sites
# mod_af <- (rowSums(submodgt, na.rm = TRUE)) /
# (rowSums(!is.na(submodgt)) * 2)
# # remove sites with missing individuals from subsampling to ensure subsamples
# # are even (only relevant if data wasn't already filtered for missingness)
# sitefilt <- (complete.cases(submodgt) * complete.cases(subhistgt)) == 1
# # Filter by missingness. Also only use sites that are segregating in the
# # subsample. This makes the subsample a filtered set with equivalent
# # processing to our original data
# sitefilt <- sitefilt & (
# (mod_af > 0 & mod_af < 1) | (hist_af > 0 & hist_af < 1)
# )
# hist_af <- hist_af[sitefilt]
# mod_af <- mod_af[sitefilt]
# bootimpacts <- impacts[sitefilt]
# usable_sites <- length(bootimpacts)
# # Prep Rxy dataframe
# rxy_df <- data.frame(matrix(nrow = length(bootimpacts), ncol = 0))
# rxy_df$hist_af <- hist_af
# rxy_df$mod_af <- mod_af
# rxy_df$impact <- bootimpacts
# usable_sites <- nrow(rxy_df)
# # calculate Rxy for historical-modern frequencies (<1 is increase in freq
# # over time)
# rxy_df <- rxy_df %>%
# group_by(impact) %>%
# summarise(
# rxy = ((sum(hist_af * (1 - mod_af))) / (sum(mod_af * (1 - hist_af))))
# )
# rxy_df$boot <- b
# rxy_df$usablesites <- usable_sites
# rxy_df$nsamples <- nsample
# rxy <- rbind(rxy, rxy_df)
# }

# # Column names update
# colnames(rxy) <- c(
# "var.impact", "rxy.hist.mod", "boot", "usable.sites", "nsamples"
# )

# write.table(rxy,
# file = snakemake@output[["rxy"]], quote = FALSE,
# sep = "\t", row.names = FALSE, col.names = TRUE
# )
df <- df[comp80both, ]

# Now count the totals per individual for alternate alleles. Also count the
# total alt count per individual for normalization into 'relative' load
Expand All @@ -144,19 +79,27 @@ for (sam in samples$sample) {
mod_hom <- sum(df[df$impact == "MODERATE", sam] == 2, na.rm = TRUE) * 2
low <- sum(df[df$impact == "LOW", sam], na.rm = TRUE)
low_hom <- sum(df[df$impact == "LOW", sam] == 2, na.rm = TRUE) * 2
inter <- sum(df[df$consequence == "intergenic_variant", sam], na.rm = TRUE)
inter_hom <- sum(
df[df$consequence == "intergenic_variant", sam] == 2,
na.rm = TRUE
) * 2
alts <- sum(df[, sam], na.rm = TRUE)
nsites <- sum(!is.na(df[, sam]), na.rm = TRUE)
row <- c(sam, high, high_hom, mod, mod_hom, low, low_hom, alts, nsites)
row <- c(
sam, high, high_hom, mod, mod_hom, low, low_hom, inter, inter_hom,
alts, nsites
)
varcounts <- rbind(varcounts, row)
}

colnames(varcounts) <- c(
"sample", "high", "high_hom", "mod", "mod_hom", "low", "low_hom", "alts",
"nsites"
"sample", "high", "high_hom", "mod", "mod_hom", "low", "low_hom", "inter",
"inter_hom", "alts", "nsites"
)
numcols <- c(
"high", "high_hom", "mod", "mod_hom", "low", "low_hom", "alts",
"nsites"
"high", "high_hom", "mod", "mod_hom", "low", "low_hom", "inter",
"inter_hom", "alts", "nsites"
)
varcounts[numcols] <- sapply(varcounts[numcols], as.numeric)
varcounts <- merge(varcounts, samples, by = "sample")
Expand All @@ -165,146 +108,3 @@ write.table(varcounts,
file = snakemake@output[["varcounts"]], quote = FALSE,
sep = "\t", row.names = FALSE, col.names = TRUE
)

varcounts_boots <- data.frame()

for (sam in samples$sample) {
sam_nomiss <- df[complete.cases(df[, sam]),]
for (b in c(1:300)) {
bdf <- sample_n(sam_nomiss, 500000)
high <- sum(bdf[bdf$impact == "HIGH", sam], na.rm = TRUE)
high_hom <- sum(bdf[bdf$impact == "HIGH", sam] == 2, na.rm = TRUE) * 2
mod <- sum(bdf[bdf$impact == "MODERATE", sam], na.rm = TRUE)
mod_hom <- sum(bdf[bdf$impact == "MODERATE", sam] == 2, na.rm = TRUE) * 2
low <- sum(bdf[bdf$impact == "LOW", sam], na.rm = TRUE)
low_hom <- sum(bdf[bdf$impact == "LOW", sam] == 2, na.rm = TRUE) * 2
alts <- sum(bdf[, sam], na.rm = TRUE)
nsites <- sum(!is.na(bdf[, sam]), na.rm = TRUE)
row <- c(sam, b, high, high_hom, mod, mod_hom, low, low_hom, alts, nsites)
varcounts_boots <- rbind(varcounts_boots, row)
}
}

colnames(varcounts_boots) <- c(
"sample", "boot", "high", "high_hom", "mod", "mod_hom", "low", "low_hom",
"alts", "nsites"
)
numcols <- c(
"boot", "high", "high_hom", "mod", "mod_hom", "low", "low_hom", "alts",
"nsites"
)
varcounts_boots[numcols] <- sapply(varcounts_boots[numcols], as.numeric)
varcounts_boots <- merge(varcounts_boots, samples, by = "sample")

write.table(varcounts_boots,
file = snakemake@output[["varcounts_boots"]], quote = FALSE,
sep = "\t", row.names = FALSE, col.names = TRUE
)

# Limit to only sites with data for all individuals and repeat both
df <- df[complete.cases(df), ]

# make data frames of historical and modern genotypes only, as well as list of
# variant impacts
histgt <- df[, which(
names(df) %in% samples$sample[sampmeta$time == "historical"]
)]
modgt <- df[, which(names(df) %in% samples$sample[sampmeta$time == "modern"])]
impacts <- df$impact

# # Calculate Rxy for the dataset, treating historical samples and modern samples
# # as single units

# # set values for 'bootstraps'
# nboot <- 1000 # number of 'bootstraps' to perform
# nsample <- 8 # number of samples to sample per group

# # initialize rxy data frame
# rxy <- data.frame()

# for (b in c(1:nboot)) {
# # subset historical genotypes by nsample
# subhistgt <- sample(histgt, nsample)
# # calculate allele frequencies for historical samples at all sites
# hist_af <- (rowSums(subhistgt, na.rm = TRUE)) /
# (rowSums(!is.na(subhistgt)) * 2)
# # subset modern genotypes by nsample
# submodgt <- sample(modgt, nsample)
# # calculate allele frequencies for modern samples at all sites
# mod_af <- (rowSums(submodgt, na.rm = TRUE)) /
# (rowSums(!is.na(submodgt)) * 2)
# # remove sites with missing individuals from subsampling to ensure subsamples
# # are even (only relevant if data wasn't already filtered for missingness)
# sitefilt <- (complete.cases(submodgt) * complete.cases(subhistgt)) == 1
# # Filter by missingness. Also only use sites that are segregating in the
# # subsample. This makes the subsample a filtered set with equivalent
# # processing to our original data
# sitefilt <- sitefilt & (
# (mod_af > 0 & mod_af < 1) | (hist_af > 0 & hist_af < 1)
# )
# hist_af <- hist_af[sitefilt]
# mod_af <- mod_af[sitefilt]
# bootimpacts <- impacts[sitefilt]
# usable_sites <- length(bootimpacts)
# # Prep Rxy dataframe
# rxy_df <- data.frame(matrix(nrow = length(bootimpacts), ncol = 0))
# rxy_df$hist_af <- hist_af
# rxy_df$mod_af <- mod_af
# rxy_df$impact <- bootimpacts
# usable_sites <- nrow(rxy_df)
# # calculate Rxy for historical-modern frequencies (<1 is increase in freq
# # over time)
# rxy_df <- rxy_df %>%
# group_by(impact) %>%
# summarise(
# rxy = ((sum(hist_af * (1 - mod_af))) / (sum(mod_af * (1 - hist_af))))
# )
# rxy_df$boot <- b
# rxy_df$usablesites <- usable_sites
# rxy_df$nsamples <- nsample
# rxy <- rbind(rxy, rxy_df)
# }

# # Column names update
# colnames(rxy) <- c(
# "var.impact", "rxy.hist.mod", "boot", "usable.sites", "nsamples"
# )

# write.table(rxy,
# file = snakemake@output[["rxy_nomiss"]], quote = FALSE,
# sep = "\t", row.names = FALSE, col.names = TRUE
# )

# Now count the totals per individual for alternate alleles. Also count the
# total alt count per individual for normalization into 'relative' load

varcounts <- data.frame()

for (sam in samples$sample) {
high <- sum(df[df$impact == "HIGH", sam], na.rm = TRUE)
high_hom <- sum(df[df$impact == "HIGH", sam] == 2, na.rm = TRUE) * 2
mod <- sum(df[df$impact == "MODERATE", sam], na.rm = TRUE)
mod_hom <- sum(df[df$impact == "MODERATE", sam] == 2, na.rm = TRUE) * 2
low <- sum(df[df$impact == "LOW", sam], na.rm = TRUE)
low_hom <- sum(df[df$impact == "LOW", sam] == 2, na.rm = TRUE) * 2
alts <- sum(df[, sam], na.rm = TRUE)
nsites <- sum(!is.na(df[, sam]), na.rm = TRUE)
row <- c(sam, high, high_hom, mod, mod_hom, low, low_hom, alts, nsites)
varcounts <- rbind(varcounts, row)
}

colnames(varcounts) <- c(
"sample", "high", "high_hom", "mod", "mod_hom", "low", "low_hom", "alts",
"nsites"
)
numcols <- c(
"high", "high_hom", "mod", "mod_hom", "low", "low_hom", "alts",
"nsites"
)
varcounts[numcols] <- sapply(varcounts[numcols], as.numeric)
varcounts <- merge(varcounts, samples, by = "sample")

write.table(varcounts,
file = snakemake@output[["varcounts_nomiss"]], quote = FALSE,
sep = "\t", row.names = FALSE, col.names = TRUE
)

0 comments on commit fd4bc20

Please sign in to comment.