-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathClustering_Hierarchical Cluster.R
109 lines (79 loc) · 2.64 KB
/
Clustering_Hierarchical Cluster.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# Library checker: make sure every package this script depends on is
# installed, then attach it.
# unique() guards against a package being listed (and processed) twice.
libs <- unique(c(
  "dplyr", "dendextend", "cluster", "readxl", "tidyverse", "factoextra"
))
for (i in libs) {
  # requireNamespace() tests whether this one package can really be loaded;
  # scanning .packages(all.available = TRUE) is slow and an incomplete test.
  if (!requireNamespace(i, quietly = TRUE)) {
    install.packages(i)
  }
  # library() raises an error if the package still cannot be attached, so a
  # failed install is reported here rather than further down the script.
  library(i, character.only = TRUE)
}
# Take the Chipo data and drop every incomplete row in one step.
# NOTE(review): Chipo is not created in this script; it must already exist
# in the session (e.g. imported beforehand) -- confirm.
df <- na.omit(Chipo)
head(df)
# Standardise everything except the first column.
# NOTE(review): presumably the first column is an identifier/label -- confirm.
df.scaled <- scale(df[-1])
head(df.scaled)
# Pairwise Euclidean distances between the scaled observations
d <- dist(x = df.scaled, method = "euclidean")
# Agglomerative clustering on those distances with complete linkage
hc1 <- hclust(d = d, method = "complete")
# Draw the dendrogram; a negative hang makes the labels hang down from 0
plot(x = hc1, cex = 0.6, hang = -1)
# Same linkage fitted through agnes(), which also reports a coefficient
hc2 <- agnes(x = df.scaled, method = "complete")
# Agglomerative coefficient of the agnes fit
hc2[["ac"]]
# Linkage methods to assess; each element is named after itself so that
# results mapped over this vector carry the method name as their label.
m <- c(
  average = "average",
  single = "single",
  complete = "complete",
  ward = "ward"
)
# Agglomerative coefficient for one linkage method.
#   x: linkage method name passed to agnes() as `method`.
# Fits agnes() on the script-level df.scaled and returns the fit's `ac`
# component.
ac <- function(x) {
  fit <- agnes(df.scaled, method = x)
  fit[["ac"]]
}
# Agglomerative coefficient for every candidate linkage, as a named numeric
map_dbl(.x = m, .f = ac)
# Fit agnes() with Ward linkage and draw its dendrogram
hc3 <- agnes(x = df.scaled, method = "ward")
pltree(x = hc3, cex = 0.6, hang = -1, main = "Dendrogram of agnes")
# Divisive hierarchical clustering of the scaled data
hc4 <- diana(x = df.scaled)
# Divisive coefficient: amount of clustering structure found
hc4[["dc"]]
# Dendrogram of the divisive fit
pltree(x = hc4, cex = 0.6, hang = -1, main = "Dendrogram of diana")
# Ward's method (ward.D2) on the Euclidean distance matrix d
hc5 <- hclust(d, method = "ward.D2")
# Cut the tree into 4 groups: one cluster label per clustered observation
sub_grp <- cutree(hc5, k = 4)
# Number of members in each cluster
table(sub_grp)
# Attach the labels to the rows that were actually clustered. sub_grp has one
# entry per row of df (Chipo after na.omit), so df is used here: raw Chipo
# would have a different row count whenever it contains incomplete rows and
# mutate() would fail on the length mismatch.
df %>%
  mutate(cluster = sub_grp) %>%
  head()
# Dendrogram with a coloured rectangle around each of the 4 clusters
plot(hc5, cex = 0.6)
rect.hclust(hc5, k = 4, border = 2:5)
# Cluster plot of the scaled data, coloured by the 4-group assignment
fviz_cluster(list(data = df.scaled, cluster = sub_grp))
# agnes() with Ward linkage, converted to hclust form and cut into 4 groups
hc_a <- agnes(x = df.scaled, method = "ward")
hc_a %>%
  as.hclust() %>%
  cutree(k = 4)
# diana() tree, converted the same way and cut into 4 groups
hc_d <- diana(x = df.scaled)
hc_d %>%
  as.hclust() %>%
  cutree(k = 4)
# Euclidean distance matrix of the scaled data
res.dist <- dist(x = df.scaled, method = "euclidean")
# Two hierarchical clusterings of the same distances.
# Note: this overwrites the hc1 and hc2 objects created earlier in the script.
hc1 <- hclust(d = res.dist, method = "complete")
hc2 <- hclust(d = res.dist, method = "ward.D2")
# Dendrogram form of each clustering
dend1 <- as.dendrogram(hc1)
dend2 <- as.dendrogram(hc2)
# Side-by-side comparison with default settings
tanglegram(dend1, dend2)
# Pair the dendrograms so their entanglement can be computed for the title
dend_list <- dendlist(dend1, dend2)
# Customised comparison, titled with the entanglement rounded to 2 decimals
tanglegram(
  dend1, dend2,
  highlight_distinct_edges = FALSE,       # no dashed lines
  common_subtrees_color_lines = FALSE,    # no line colours
  common_subtrees_color_branches = TRUE,  # colour the common branches
  main = paste("entanglement =", round(entanglement(dend_list), 2))
)
# Determining optimal clusters ----
# The clustering function is passed under its full argument name,
# FUNcluster, instead of relying on partial matching of `FUN`.
# Elbow method: total within-cluster sum of squares
fviz_nbclust(df.scaled, FUNcluster = hcut, method = "wss")
# Average silhouette width
fviz_nbclust(df.scaled, FUNcluster = hcut, method = "silhouette")
# Gap statistic for up to K.max = 10 clusters with B = 50 bootstrap samples.
# NOTE(review): nstart = 25 is forwarded through ... to hcut; it looks like a
# kmeans-style argument -- confirm hcut makes use of it.
gap_stat <- clusGap(
  df.scaled,
  FUNcluster = hcut, nstart = 25, K.max = 10, B = 50
)
fviz_gap_stat(gap_stat)