-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpwise_FST_heatmap.Rmd
115 lines (112 loc) · 4.47 KB
/
pwise_FST_heatmap.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
---
title: "pwise_FST_heatmap"
author: "Matthew Bootsma"
date: "April 22, 2019"
output: html_document
---
This script was used to build the pairwise FST heatmap of ALL populations
#Dependecies
```{r}
library(dplyr)
library(ggplot2)
```
#Format Raw data
The data should come in with names on the rows and columns that match the population they represent
```{r}
#read data
raw_pwise_dat = read.csv("./raw_pairwise_FST.csv",fileEncoding="UTF-8-BOM")
names = raw_pwise_dat$X
#removing names for the loop, will append after
raw_pwise_dat = raw_pwise_dat[,-1]
#this loop will copy the half matrix across the diagonal so we can reorder the values for the heatmap
#i is indexing from column i
for (i in 1:ncol(raw_pwise_dat)) {
#j is indexing from row j
for (j in 1:nrow(raw_pwise_dat)) {
raw_pwise_dat[i,j] = raw_pwise_dat[j,i]
}
}
raw_pwise_dat = cbind.data.frame(names,raw_pwise_dat)
#Melt data into column format
molten_pwise_dat = melt(raw_pwise_dat)
names(molten_pwise_dat) = c("pop_x","pop_y","FST")
```
#Organize data how you want to plot it
This section will be custom to your data, likely organized by some biological/geographic concept
It requires a whitelist of Population Names (the same ones you have in your matrix), and a sequence from 1-n_pops in the order you want them plotted
The data should have name in column 1 and sequence order value in column 2
```{r}
#Read Whitelist
whitelist = read.csv("./FST_XY_dat.csv", header = F)
#Vectorize xy data
molten_pwise_dat$X_val = NA
molten_pwise_dat$Y_val = NA
#Initialize xy data
for (i in 1:nrow(molten_pwise_dat)) {
#Assign X value
molten_pwise_dat[i,"X_val"] = whitelist[which(whitelist$V1 == molten_pwise_dat[i,"pop_x"]),"V2"]
#Assign Y value
molten_pwise_dat[i,"Y_val"] = whitelist[which(whitelist$V1 == molten_pwise_dat[i,"pop_y"]),"V2"]
}
#Set negative FST values to the effective value of 0
molten_pwise_dat[which(molten_pwise_dat$FST <0),"FST"] = 0.00000999
#Remove mirror by changing values to NA
#If you want the plot to be on the other half of the mirror, switch i and j in the if statements.
for (i in 1:nrow(molten_pwise_dat)) {
for (j in 1:23) {
if (molten_pwise_dat[i,"Y_val"] == j) {
if (molten_pwise_dat[i,"X_val"] > j) {
molten_pwise_dat[i,"FST"] = NA
}
}
}
}
```
#Plot heatmap
This section currently has hardcoding that is dependent upon the analyis I used to develop it.
I am looking to get this generalized.
TODO:
sort the data after giving it the xy values, remove duplicates, extract unique names. This should allow labels to be generalized.
make the breaks a sequence that is initialized outside the plot, simply from 1:npops by 1 or 0.5 depending on the x or y axis
```{r}
pdf("./TEST3_FST_heatmap.pdf", width = 11, height = 11)
ggplot(molten_pwise_dat, aes(x = X_val, y = Y_val, fill=FST))+
geom_tile(colour="white",size=0.25)+
scale_fill_gradientn(colors = c("#333333","#6699cc","#ffcc00","#ff6600","#ff3300"), name="FST")+
scale_y_continuous(
name = NULL,
expand = c(0,0),
breaks=c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23),
labels=c('Delavan Lake','Wolf River',
'Lake Wisconsin','Medicine Lake','Willow Flowage',
'Kawaguesaga','Big Arbor Vitae','Escanaba Lake',
'Sanford Lake','Manitowish Lake','Turtle Flambeau',
'Chippewa Flowage','Eau Claire River','Lake Millicent',
'St. Louis River','Pike River','Sarah Lake',
'Lake Koronis','Mille Lacs','Pine River',
'Cutfoot Sioux','Ottertail Lake','Red Lake')
)+
scale_x_continuous(
name = NULL,
expand = c(0,0),
breaks=c(0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5,12.5,13.5,14.5,15.5,16.5,17.5,18.5,19.5,20.5,21.5,22.5),
labels=c('Delavan Lake','Wolf River',
'Lake Wisconsin','Medicine Lake','Willow Flowage',
'Kawaguesaga','Big Arbor Vitae','Escanaba Lake',
'Sanford Lake','Manitowish Lake','Turtle Flambeau',
'Chippewa Flowage','Eau Claire River','Lake Millicent',
'St. Louis River','Pike River','Sarah Lake',
'Lake Koronis','Mille Lacs','Pine River',
'Cutfoot Sioux','Ottertail Lake','Red Lake')
)+
theme(
plot.title = element_text(hjust = 0.5, size = 28),
axis.text.x = element_text(angle = 90, hjust = 1, size = 16),
axis.text.y = element_text(size = 16),
axis.ticks.x=element_blank(),
axis.ticks.y=element_blank(),
panel.background = element_blank()
)+
ggtitle("Pairwise FST Heatmap")
dev.off()
```