forked from TomokazuKonishi/direct-PCA-for-sequences
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnucleotide.txt
128 lines (95 loc) · 4.2 KB
/
nucleotide.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
##### R scripts for calculations
# reading the aligned sequence data:
# the data have to be formatted in tab-separated text with two colums,
# (name of sequence) \t (aligned sequence)
## TableS2.txt: human, TableS3.txt: lion, TableS4.txt: bacteria
sites <- read.table(file="TableS4.txt", header=F, sep="\t")
sites<-as.matrix(sites)
dim(sites)
### finding the size of data
n_sample <- dim(sites)[1]
n_seq <- nchar(sites[2,2])
### translation of the sequence to bollean vectors
bool <- array(0, dim=c(n_sample, 5*n_seq))
colnames(bool) <- c(paste("A_", 1:n_seq, sep=""),paste("T_", 1:n_seq, sep=""),paste("G_", 1:n_seq, sep=""),paste("C_", 1:n_seq, sep=""),paste("N_", 1:n_seq, sep=""))
rownames(bool) <- sites[ ,1]
for (s in 1:n_sample){
se <- sites[s, 2]
se <- tolower(se)
for (le in 1:n_seq){
base <- substr(se, le,le)
if(base =="a") {
bool[s, le] <-1
} else {
if(base =="t") {
bool[s, le+n_seq] <-1
} else {
if(base =="g") {
bool[s, le+n_seq*2] <-1
} else {
if(base =="c") {
bool[s, le+n_seq*3] <-1
} else {
bool[s, le+n_seq*4] <-1
}}}}
}}
apply(bool, 1, sum) # here you can verify the translation
# they should show identical values same as n_seq
############ PCA
## centering : the center can be replaced to certain group
center<- apply(bool, 2, mean)
diffs<-sweep(bool, 2, center)
diffs <- diffs/(2^0.5)
# compensating the doubled counts in Euclidean distance metrics
# checking distribution of the distances
dist<- (apply(diffs^2, 1, sum))^0.5
qqnorm(dist)
hist(dist)
### PCA core
res_svd <- svd(diffs) #
str(res_svd)
Left <- res_svd$u # the left singular vector
Right <- res_svd$v # the right singular vector
sqL <- diag(res_svd$d) # diagonal matrix of the singular values
### calculatinf of pc's
sPC_nuc <- Right %*% sqL / (n_sample^0.5)
sPC_sample <- Left %*% sqL/ (n_seq^0.5)
rownames(sPC_nuc)<- colnames(bool)
rownames(sPC_sample)<- rownames(bool)
#### output to text files
write.table(sPC_sample, file="sPC_sample.txt", sep="\t")
write.table(sPC_nuc, file="sPC_nuc.txt", sep="\t")
#### output to png images
# sample
png(width=2100, height=2300, pointsize = 80, file="sPC_sample_12.png")
par(lwd=4, mex=0.6, mai=c(4,4,3,0.2))
plot( sPC_sample [,1], sPC_sample[,2], col="gray50" , pch=1, main="sample", xlab="", ylab="" , axes=T)
dev.off()
# sites
png(width=2100, height=2300, pointsize = 80, file="sPC_seq_1.png")
par(lwd=4, mex=0.6, mai=c(4,4,3,0.2))
colors <- c(rgb(red=10, green=100, blue=255, alpha=255, maxColorValue =255), rgb(red=140, green=255, blue=100, alpha=255, maxColorValue =255),
rgb(red=255, green=50, blue=10, alpha=255, maxColorValue =255), rgb(red=100, green=100, blue=100, alpha=255, maxColorValue =255))
# color of presentation: I hope this set is recognizable for colorblind persons.
plot(1:n_seq, sPC_nuc[1:n_seq,1], pch="", xlab="sites", ylab="sPC1", main="sites")
text(1:n_seq, sPC_nuc[1:n_seq,1], labels=1:n_seq , cex=0.8, col= colors[1] )
text(1:n_seq, sPC_nuc[1:n_seq+n_seq,1], labels=1:n_seq , cex=0.8, col=colors[2] )
text(1:n_seq, sPC_nuc[1:n_seq+n_seq+n_seq,1], labels=1:n_seq , cex=0.8, col= colors[3] )
text(1:n_seq, sPC_nuc[1:n_seq+n_seq+n_seq+n_seq,1], labels=1:n_seq , cex=0.8, col= colors[4] )
legend(x=10, y=0, legend=c( "A","T", "G", "C"), pch="1", text.col=colors, col=colors, border = "white", box.lwd = 1, box.lty = 1, cex=0.8, bg="white")
dev.off()
# contribution
png(width=2100, height=2300, pointsize = 80, file="contributions.png")
par(lwd=4, mex=0.6, mai=c(4,4,3,0.2))
plot(1:20, (res_svd$d/sum(res_svd$d)*100)[1:20], pch=1, type="b", lty=3, ylab="(%)", xlab="PC", main="Contribution", col="gray50")
dev.off()
# contribution tangent regression
theta <- ((length(res_svd$d)+1):1-1)/(length(res_svd$d)+1)*(0.5*pi)
tans <- tan(theta)
png(width=2100, height=2300, pointsize = 80, file="tangent.png")
par(lwd=4, mex=0.6, mai=c(4,4,3,0.2))
plot((tans), c( res_svd$d, 0)/sum(res_svd$d)*100, xlab="Tangent", ylab="Contribution (%)")
#abline(0, sum(res_svd$d)/sum(tans)/sum(res_svd$d)*100, lty=3)
z <- line ((res_svd$d/sum(res_svd$d)*100 )[1:10] ~ (tans )[1:10] )
abline(coef(z), lty=3)
dev.off()