-
Notifications
You must be signed in to change notification settings - Fork 0
/
plot_admixture.Rmd
264 lines (241 loc) · 10.4 KB
/
plot_admixture.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
---
title: "R Notebook"
output: html_notebook
---
NOTE: interaction/hardcoding required for 2 variables:
1) population_list, this variable should be set == to the values in auto_pop_list, comment the pops you want to omit
2) pub_table_order, this variable should == the formated names held in "pop_IDs", order it how you wish the pops to be plotted
#Dependencies
```{r}
library(tictoc)
library("devtools", lib.loc="~/R/win-library/3.5")
library("genepop", lib.loc="~/R/win-library/3.5")
library("diveRsity", lib.loc="~/R/win-library/3.5")
library("adegenet", lib.loc="~/R/win-library/3.5")
library("xlsx", lib.loc="~/R/win-library/3.5")
library("sendplot", lib.loc="~/R/win-library/3.5")
library("dplyr", lib.loc="~/R/win-library/3.5")
library(reshape2)
library(ggplot2)
library(egg)
```
#genepop metadata
```{r}
#read genepop that we derived the admixture data from
#this will allow us to generalize our metadata in the figure
#e.g. n samples per population
SNPs_gen = read.genepop('../../v6_novoseq2_snps_genomic.gen', ncode = 3)
beep()
auto_pop_list = NULL
for (i in 1:n_distinct(SNPs_gen$pop)) {
auto_pop_list = paste(auto_pop_list, "'",unique(SNPs_gen$pop)[i],"'",",", sep = "")
}
auto_pop_list
```
```{r}
#Paste your auto_pop_list strings here, they should be formatted as csv, then parse to different lines and comment out the ones you want to omit
population_list = c(
'Big_Arbor_Vitae-10-13698',
'Chippewa_Flowage-17-08347',
# 'Cutfoot_Sioux-17-15586',
'Delavan-15-01449',
'Eau_Claire_River-17-01449',
'Escanaba_Lake-17-02775',
'Kawaguesaga-11-00300',
# 'Lake_Koronis-17-15556',
'Lake_Millicent-07-00697',
'Lake_Wisconsin-15-05298',
'Manitowish_Lake-17-10943',
'Medicine_Lake-17-13247',
# 'Mille_Lacs-17-15428',
# 'Ottertail_Lake-17-15364',
# 'Pike_River-17-15524',
# 'Pine_River-17-15460',
# 'Red_Lake-17-15396',
'Sanford_Lake-17-02975',
# 'Sarah_Lake-17-15332',
# 'St_Louis_River-17-15492',
'Turtle_Flambeau_Flowage-17-08397',
'Willow_Flowage-11-00167',
'WolfR_17-03648'
)
snps_subset = SNPs_gen[which(SNPs_gen$pop %in% population_list)]
#extract n samples per population
n_by_pop = as.data.frame(matrix(NA,nrow = n_distinct(snps_subset$pop), ncol = 1))
for (i in 1:n_distinct(snps_subset$pop)) {
n_by_pop[i,1] = length(which(snps_subset$pop == unique(snps_subset$pop)[i]))
}
#expand the point data to a format that will fit our genind
n_by_pop$V1 = as.double(n_by_pop$V1)
n_by_pop_strata = NULL
for (i in 1:nrow(n_by_pop)) {
n_by_pop_strata = c(n_by_pop_strata,rep(x = n_by_pop[i,1],times = n_by_pop[i,1]))
}
n_by_pop_strata = as.data.frame(n_by_pop_strata)
#extract population names, removing the ID number that gets pulled in from the first sample
pop_IDs = snps_subset$pop
pop_IDs = gsub("\\d+","",pop_IDs)
pop_IDs = gsub("-","",pop_IDs)
pop_IDs = as.data.frame(pop_IDs)
#now merge the metadata with the genind
strat_dat = cbind.data.frame(n_by_pop_strata,pop_IDs)
strata(snps_subset) = strat_dat
#SNPs_gen$strata$n_by_pop_strata
#SNPs_gen@strata@pop_IDs
#initialize a palette of colors we can use downstream, 23 unique hexidecimal colors here
PCA23_palette = c("#000ffF","#0033CC","#006699","#009966","#006600",
"#66CC00","#CCFF00","#CC9900","#CC3300","#CC0033",
"#CC0099","#CC00FF","#6633FF","#CC66FF","#FF66CC",
"#FF3366","#FF6633","#FF9900","#6699ff","#33ccff","#00ffff","#00ff99")
```
#exploring ggplot option
this section requires a custom pallette for each level of K
the order of which K is which color moves from level to level so,
they have to be hard coded to facilitate the same cluster being represented with the same color across runs
```{r}
pdf("./SEMINAR_admixture_sorted.pdf", width = 11.5, height = 8, family = "Times")
#RAN 2-15
for (i in c(2,9,11)) {
read_name = paste(c("./admixture_all_pops/v6_novoseq2_snps_genomic.",as.character(i),".Q"),collapse = "")
tbl = read.table(read_name)
#requires ggplot2 and reshape pkgs
#transpose the results to have individuals in columns and K proportions in rows
tmp = as.data.frame(t(tbl))
#melt the data into column format, i.e., indv, K-proportion
tmp2 = melt(tmp)
rownames(tmp2) = c(1:nrow(tmp2))
#add name for each K value within an individual
tmp2$K_val = c(1:length(which(tmp2$variable == unique(tmp2$variable)[1])))
#add the pop ID data to each individual's K values
pop_IDs_for_admix = rep(pop_IDs$pop_IDs, each = i)
tmp2$pop = pop_IDs_for_admix
#making a list to sort the results on, this will organize the admixture plot by geography and not alphabetic
pub_table_order = c("Delavan","WolfR_","Lake_Wisconsin",
"Medicine_Lake","Willow_Flowage",
"Kawaguesaga","Big_Arbor_Vitae","Escanaba_Lake",
"Sanford_Lake","Manitowish_Lake","Turtle_Flambeau_Flowage",
"Chippewa_Flowage","Eau_Claire_River","Lake_Millicent",
"St_Louis_River",
"Sarah_Lake",
"Lake_Koronis",
"Mille_Lacs",
"Pine_River",
"Cutfoot_Sioux",
"Ottertail_Lake",
"Red_Lake",
"Pike_River"
)
#Loop through to subset populations in order of publish table. We're really just sorting here
tmp3 = NULL
for (j in 1:length(pub_table_order)) {
tmp3 = rbind.data.frame(tmp3, tmp2[which(tmp2$pop == pub_table_order[j]),])
}
#add/change some metadata to make sure the plot is built in the order I wanted
rownames(tmp3) = c(1:nrow(tmp3))
tmp3$variable = rep(c(1:n_distinct(tmp2$variable)), each = i)
#rebuilding the x values for labels and lines
x_lab_vec = tmp3 %>%
group_by(pop) %>%
summarise(X_max = max(variable))
x_lab_vec = x_lab_vec %>% arrange(X_max)
############################
# This section builds on the sorting performed above,
# in addtion to sorting based on the pop ID order (to match the order of tables)
# I'm sorting individuals within each pop based on the descending proportion of ancestry allocated to the major K for that pop
K_max = tmp3 %>% group_by(pop, K_val) %>% summarise(max_K = sum(value))
K_max2 = K_max %>% group_by(pop) %>% filter(max_K == max(max_K))
# need to update the variable ID as that is the Xval in the plot
# var_ID = as.integer(1)
admx_to_plot = NULL
for (p in pub_table_order) {
# pull out a pop in the order of table
test_sort = tmp3[which(tmp3$pop == p),]
# extract the K value that accounts for the majority of that pop's ancestry
K_iter = K_max2[which(K_max2$pop == p),"K_val"]
# extract each individuals ancestry proporiton at that K
test_sort2 = test_sort[which(test_sort$K_val == K_iter$K_val),]
# arrange the individuals based on their ancestry at that K
test_sort3 = test_sort2 %>% arrange(desc(value,variable))
# bind them back into the dataframe to plot
for (v in test_sort3$variable) {
admx_to_plot = rbind.data.frame(admx_to_plot,tmp3[which(tmp3$variable == v),])
# admx_to_plot[which(admx_to_plot$pop == p & admx_to_plot$variable == v),"variable"] = var_ID
# var_ID = var_ID + 1
}
}
xval = 1
var_ID_seq = seq(1,nrow(tmp3),by = i)
for (l in var_ID_seq) {
admx_to_plot[c(l:(l+i-1)),"variable"] = xval
xval = xval + 1
}
admx_to_plot
if (i == 9) {
# Sanford, Medicine, WISC/CHIP, CHIP, Red , Pike , Delavan , Minnesota, WISC
PCA23_palette = c("#990099","#0099FF","#6600CC","#CC00CC","#FFCC00","#00FF00","#F79646","#00B050","#0066cc")
#custom colors
plt9 = ggplot() + geom_bar(aes(y = value, x = variable), fill = rep(PCA23_palette[1:i], nrow(pop_IDs)), data = admx_to_plot, stat = "identity")+
geom_text(angle = 90, aes(x = x_lab_vec$X_max-15, y = 0.5, label = pub_table_order[which(pub_table_order %in% unique(tmp2$pop))]))+
geom_vline(xintercept = x_lab_vec$X_max, color = "black", size=.5)+
scale_fill_manual("legend", values = rep(PCA23_palette[1:i], nrow(pop_IDs)))+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
plot.margin=unit(c(0,0,0,0), "cm"),
axis.line = element_line(colour = "black"),
axis.text = element_text(color = "black"),
axis.text.x = element_text(""),
axis.title.x= element_blank(),
axis.ticks.x= element_blank()
# legend.position = "right"
)+
xlab("Individual")+
ylab("Ancestry proportion")
}
if (i == 11) {
# Mille lacs, Medicine, Delavan, St Louis, WISC, MINNESOTA, Sanford, RED , CHIP , PIKE , wisc/chip
PCA23_palette = c("#666600","#0099FF","#F79646","#9999FF","#0066cc","#00B050","#990099","#FFCC00","#CC00CC","#00FF00","#6600CC")
#custom colors
plt11 = ggplot() + geom_bar(aes(y = value, x = variable), fill = rep(PCA23_palette[1:i], nrow(pop_IDs)), data = admx_to_plot, stat = "identity")+
geom_text(angle = 90, aes(x = x_lab_vec$X_max-15, y = 0.5, label = pub_table_order[which(pub_table_order %in% unique(tmp2$pop))]))+
geom_vline(xintercept = x_lab_vec$X_max, color = "black", size=.5)+
scale_fill_manual("legend", values = rep(PCA23_palette[1:i], nrow(pop_IDs)))+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
plot.margin=unit(c(0,0,0,0), "cm"),
axis.line = element_line(colour = "black"),
axis.text = element_text(color = "black"),
axis.text.x = element_text(""),
axis.title.x= element_blank(),
axis.ticks.x= element_blank()
# legend.position = "right"
)+
xlab("Individual")+
ylab("Ancestry proportion")
}
if (i == 2) {
PCA23_palette = c("#666600","#0099FF")
plt2 = ggplot() + geom_bar(aes(y = value, x = variable), fill = rep(PCA23_palette[1:i], nrow(pop_IDs)), data = admx_to_plot, stat = "identity")+
geom_text(angle = 90, aes(x = x_lab_vec$X_max-15, y = 0.5, label = pub_table_order[which(pub_table_order %in% unique(tmp2$pop))]))+
geom_vline(xintercept = x_lab_vec$X_max, color = "black", size=.5)+
scale_fill_manual("legend", values = rep(PCA23_palette[1:i], nrow(pop_IDs)))+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
plot.margin=unit(c(0,0,0,0), "cm"),
axis.line = element_line(colour = "black"),
axis.text = element_text(color = "black"),
axis.text.x = element_text(""),
axis.title.x= element_blank(),
axis.ticks.x= element_blank()
# legend.position = "right"
)+
xlab("Individual")+
ylab("Ancestry proportion")
ggarrange(plt2)
}
}
ggarrange(plt9,plt11, nrow = 2, ncol = 1)
dev.off()
```