-
Notifications
You must be signed in to change notification settings - Fork 0
/
HAC.R
102 lines (65 loc) · 2.55 KB
/
HAC.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# Hierarchical agglomerative clustering
# First, load a few libraries
library(tidyverse)
library(skimr)
library(dendextend) # for "cutree" function
library(ape) # for phylo trees
# Using data from the 1977 US census statistical abstract.
# `state.x77` ships with R (datasets package) as a numeric matrix with one row
# per state; store it as a data frame so the dplyr verbs below can be used.
s <- as.data.frame(state.x77)

# Take a look at the summary stats and distributions for each feature
skim(s)
# Select a few related features, standardize them, and calculate the Euclidean
# distance matrix that hclust() takes as input.
s_sub <- s %>%
  select(Income, Illiteracy, `Life Exp`, `HS Grad`) %>%
  scale() %>% # center and scale each column so no feature dominates
  dist() # default method is "euclidean"

# Inspect the pairwise distances (computed on the standardized features)
s_sub
# Fit the algorithm with different linkage methods and visualize each tree.
# A negative `hang` makes the leaf labels hang down from height 0.
hc_single <- hclust(s_sub, method = "single")
plot(hc_single, hang = -1)

hc_complete <- hclust(s_sub, method = "complete")
plot(hc_complete, hang = -1)

hc_average <- hclust(s_sub, method = "average")
plot(hc_average, hang = -1)

hc_centroid <- hclust(s_sub, method = "centroid")
plot(hc_centroid, hang = -1) # see the inversions?
# Now, see all four dendrograms in a single 2 x 2 pane.
# par() returns the previous settings, so keep them to restore afterwards.
old_par <- par(mfrow = c(2, 2))

hc_single <- hclust(s_sub, method = "single")
plot(hc_single, hang = -1)

hc_complete <- hclust(s_sub, method = "complete")
plot(hc_complete, hang = -1)

hc_average <- hclust(s_sub, method = "average")
plot(hc_average, hang = -1)

hc_centroid <- hclust(s_sub, method = "centroid")
plot(hc_centroid, hang = -1)

# Reset plot space to whatever layout was active before the 2 x 2 grid
par(old_par)
# Or a gimmicky triangular tree (sometimes called a "cladogram"):
# plot.dendrogram() draws straight diagonal branches with type = "triangle".
plot(
  as.dendrogram(hc_complete),
  main = "A Cladogram of Hierarchical Clustering\n(via Complete Linkage)",
  type = "triangle"
)
# And we can cut and compare trees if we aren't sure about 3 or 4 clusters.
# Passing a vector to `k` returns a matrix with one column of cluster
# assignments per requested number of clusters.
cuts <- cutree(hc_complete, k = c(3, 4))

### Inspect assignments for each iteration...
cuts

### Or, a simple cross-tabulation of assignments by iteration
table(
  `3 Clusters` = cuts[, 1],
  `4 Clusters` = cuts[, 2]
)
# what do you see?
## FOR FUN...
## The complete-linkage tree again, converted to a phylo object and drawn
## with two alternative layouts.

## unrooted phylo tree
hc_complete %>%
  as.phylo() %>%
  plot(
    type = "unrooted",
    main = "Unrooted Dendrogram",
    label.offset = 0.1
  )

## "fan" phylo tree
hc_complete %>%
  as.phylo() %>%
  plot(
    type = "fan",
    main = "Fan Dendrogram",
    label.offset = 0.1
  )
# A more interesting (though still dev) version from Romain Francois, "A2Rplot"
# NOTE(review): this executes remote code fetched over plain HTTP. Review the
# script (or vendor a local copy) before running it in a trusted environment.
a2r_url <- "http://addictedtor.free.fr/packages/A2R/lastVersion/R/code.R"

# The host may be unreachable; warn and skip the plot instead of aborting.
a2r_loaded <- tryCatch(
  {
    source(a2r_url)
    TRUE
  },
  error = function(e) {
    warning(
      "Could not load A2Rplot from ", a2r_url, ": ", conditionMessage(e),
      call. = FALSE
    )
    FALSE
  }
)

if (a2r_loaded) {
  # One colour per cluster below the cut (k = 4)
  A2Rplot(
    hc_complete,
    k = 4,
    col.down = c("darkgreen", "darkblue", "darkred", "black")
  )
}