K-means
library(lattice)
library(latticeExtra)
library(labestData)
# labestDataView()
# Cães Pré-históricos da Tailândia.
ManlyTb1.4
## grup largm altm comppm largpm comppt comppq
## 1 Cao moderno 9.7 21.0 19.4 7.7 32.0 36.5
## 2 Chacal dourado 8.1 16.7 18.3 7.0 30.3 32.9
## 3 Lobo chines 13.5 27.3 26.8 10.6 41.9 48.1
## 4 Lobo indiano 11.5 24.3 24.5 9.3 40.0 44.6
## 5 Cuon 10.7 23.5 21.4 8.5 28.8 37.6
## 6 Dingo 9.6 22.6 21.1 8.3 34.4 43.1
## 7 Cao pre-historico 10.3 22.1 19.1 8.1 32.2 35.0
# Índices de Desenvolvimento de Países.
MingotiTb6.8
## pais expecvida educ pib estabpoli
## 1 Reino Unido 0.88 0.99 0.91 1.10
## 2 Australia 0.90 0.99 0.93 1.26
## 3 Canada 0.90 0.98 0.94 1.24
## 4 Estados Unidos 0.87 0.98 0.97 1.18
## 5 Japao 0.93 0.93 0.93 1.20
## 6 Franca 0.89 0.97 0.92 1.04
## 7 Cingapura 0.88 0.87 0.91 1.41
## 8 Argentina 0.81 0.92 0.80 0.55
## 9 Uruguai 0.82 0.92 0.75 1.05
## 10 Cuba 0.85 0.90 0.64 0.07
## 11 Colombia 0.77 0.85 0.69 -1.36
## 12 Brasil 0.71 0.83 0.72 0.47
## 13 Paraguai 0.75 0.83 0.63 -0.87
## 14 Egito 0.70 0.62 0.60 0.21
## 15 Nigeria 0.44 0.58 0.37 -1.36
## 16 Senegal 0.47 0.37 0.45 -0.68
## 17 Serra Leoa 0.23 0.33 0.27 -1.26
## 18 Angola 0.34 0.36 0.51 -1.98
## 19 Etiopia 0.31 0.35 0.32 -0.55
## 20 Mocambique 0.24 0.37 0.36 0.20
## 21 China 0.76 0.80 0.61 0.39
# Emprego em Paises Europeus.
ManlyTb1.5
## pais grup afp mep fab fea con ser fin
## 1 Belgica UE 2.6 0.2 20.8 0.8 6.3 16.9 8.7
## 2 Dinamarca UE 5.6 0.1 20.4 0.7 6.4 14.5 9.1
## 3 Franca UE 5.1 0.3 20.2 0.9 7.1 16.7 10.2
## 4 Alemanha UE 3.2 0.7 24.8 1.0 9.4 17.2 9.6
## 5 Grecia UE 22.2 0.5 19.2 1.0 6.8 18.2 5.3
## 6 Irlanda UE 13.8 0.6 19.8 1.2 7.1 17.8 8.4
## 7 Italia UE 8.4 1.1 21.9 0.0 9.1 21.6 4.6
## 8 Luxemburgo UE 3.3 0.1 19.6 0.7 9.9 21.2 8.7
## 9 Paises Baixos UE 4.2 0.1 19.2 0.7 0.6 18.5 11.5
## 10 Portugal UE 11.5 0.5 23.6 0.7 8.2 19.8 6.7
## 11 Espanha UE 9.9 0.5 21.1 0.6 9.5 20.1 5.9
## 12 Reino Unido UE 2.2 0.7 21.3 1.2 7.0 20.2 12.4
## 13 Austria AELC 7.4 0.3 26.9 1.2 8.5 19.1 6.7
## 14 Finlandia AELC 8.5 0.2 26.3 1.2 6.8 14.6 8.6
## 15 Islandia AELC 10.5 0.0 18.7 0.9 10.0 14.5 8.0
## 16 Norurega AELC 5.8 1.1 14.6 1.1 6.5 17.6 7.6
## 17 Suecia AELC 3.2 0.3 19.0 0.8 6.4 14.2 9.4
## 18 Suica AELC 5.6 0.0 24.7 0.0 9.2 20.5 10.7
## 19 Albania Leste 55.5 19.4 0.0 0.0 3.4 4.3 15.3
## 20 Bulgaria Leste 19.0 0.0 3.5 0.0 6.7 9.4 1.5
## 21 Republica Tcheca/Eslovaquia Leste 12.8 37.3 0.0 0.0 8.4 10.2 1.6
## 22 Hungria Leste 15.3 28.9 0.0 0.0 6.4 13.3 0.0
## 23 Polonia Leste 23.6 3.9 24.1 0.9 6.3 10.3 1.3
## 24 Romenia Leste 22.0 2.6 37.9 2.0 5.8 6.9 0.6
## 25 USSR Leste 18.5 0.0 28.8 0.0 10.2 7.9 0.6
## 26 Iugoslavia Leste 5.0 2.2 38.7 2.2 8.1 13.8 3.1
## 27 Chipre Outro 13.5 0.3 19.0 0.5 9.1 23.7 6.7
## 28 Gibraltar Outro 0.0 0.0 6.8 2.0 16.9 24.5 10.8
## 29 Malta Outro 2.6 0.6 27.9 1.5 4.6 10.2 3.9
## 30 Turquia Outro 44.8 0.9 15.3 0.2 5.2 12.4 2.4
## ssp tc
## 1 36.9 6.8
## 2 36.3 7.0
## 3 33.1 6.4
## 4 28.4 5.6
## 5 19.8 6.9
## 6 25.5 5.8
## 7 28.0 5.3
## 8 29.6 6.8
## 9 38.3 6.8
## 10 24.6 4.8
## 11 26.7 5.8
## 12 28.4 6.5
## 13 23.3 6.4
## 14 33.2 7.5
## 15 30.7 6.7
## 16 37.5 8.1
## 17 39.5 7.2
## 18 23.1 6.2
## 19 0.0 3.0
## 20 20.9 7.5
## 21 22.9 6.9
## 22 27.3 8.8
## 23 24.5 5.2
## 24 15.3 6.8
## 25 25.6 8.4
## 26 19.1 7.8
## 27 21.2 0.6
## 28 34.0 0.5
## 29 41.6 7.2
## 30 14.5 4.4
#-----------------------------------------------------------------------
# help(kmeans, help_type = "html")
# Variáveis métricas.
X <- ManlyTb1.4[, -(1)]
# Diagrama de pares de dispersão.
splom(X)

km <- kmeans(X, centers = 2, trace = TRUE)
## KMNS(*, k=2): iter= 1, indx=7
## [1] "kmeans"
methods(class = class(km))
## [1] fitted print
## see '?methods' for accessing help and source code
## K-means clustering with 2 clusters of sizes 5, 2
##
## Cluster means:
## largm altm comppm largpm comppt comppq
## 1 9.68 21.18 19.86 7.92 31.54 37.02
## 2 12.50 25.80 25.65 9.95 40.95 46.35
##
## Clustering vector:
## [1] 1 1 2 2 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 117.316 17.920
## (between_SS / total_SS = 71.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
splom(X, groups = km$cluster)

#-----------------------------------------------------------------------
# Função para guiar a escolha do número de grupos.
wssplot <- function(data, nc = 15, seed = 1234, ...) {
# Soma de quadrados interna aos cluters.
wss <- (nrow(data) - 1) *
sum(apply(data, MARGIN = 2, FUN = var))
for (i in 2:nc) {
set.seed(seed)
wss[i] <- sum(kmeans(data, centers = i, ...)$withinss)
}
plot(x = 1:nc,
y = wss,
type = "b",
xlab = "Número de agrupamentos",
ylab = "Soma de quadrados dentro de grupos")
}
# Variáveis métricas.
X <- ManlyTb1.5[, -(1:2)]
Z <- scale(X)
# Dados originais.
wssplot(X, nc = 15, trace = TRUE)
## KMNS(*, k=2): iter= 1, indx=8
## QTRAN(): istep=30, icoun=10
## KMNS(*, k=3): iter= 1, indx=5
## KMNS(*, k=3): iter= 2, indx=10
## KMNS(*, k=3): iter= 3, indx=30
## KMNS(*, k=4): iter= 1, indx=1
## QTRAN(): istep=30, icoun=10
## KMNS(*, k=4): iter= 2, indx=30
## KMNS(*, k=5): iter= 1, indx=0
## QTRAN(): istep=30, icoun=3
## QTRAN(): istep=60, icoun=2
## QTRAN(): istep=90, icoun=16
## KMNS(*, k=5): iter= 2, indx=0
## KMNS(*, k=5): iter= 3, indx=30
## KMNS(*, k=6): iter= 1, indx=0
## QTRAN(): istep=30, icoun=7
## QTRAN(): istep=60, icoun=17
## QTRAN(): istep=90, icoun=2
## KMNS(*, k=6): iter= 2, indx=3
## QTRAN(): istep=30, icoun=0
## KMNS(*, k=6): iter= 3, indx=30
## KMNS(*, k=7): iter= 1, indx=0
## QTRAN(): istep=30, icoun=16
## KMNS(*, k=7): iter= 2, indx=0
## QTRAN(): istep=30, icoun=7
## QTRAN(): istep=60, icoun=22
## KMNS(*, k=7): iter= 3, indx=15
## KMNS(*, k=7): iter= 4, indx=30
## KMNS(*, k=8): iter= 1, indx=5
## QTRAN(): istep=30, icoun=7
## KMNS(*, k=8): iter= 2, indx=10
## QTRAN(): istep=30, icoun=3
## QTRAN(): istep=60, icoun=24
## KMNS(*, k=8): iter= 3, indx=30
## KMNS(*, k=9): iter= 1, indx=0
## QTRAN(): istep=30, icoun=15
## QTRAN(): istep=60, icoun=16
## KMNS(*, k=9): iter= 2, indx=10
## QTRAN(): istep=30, icoun=16
## QTRAN(): istep=60, icoun=3
## KMNS(*, k=9): iter= 3, indx=30
## KMNS(*, k=10): iter= 1, indx=0
## QTRAN(): istep=30, icoun=17
## KMNS(*, k=10): iter= 2, indx=24
## KMNS(*, k=10): iter= 3, indx=12
## QTRAN(): istep=30, icoun=10
## QTRAN(): istep=60, icoun=19
## QTRAN(): istep=90, icoun=20
## QTRAN(): istep=120, icoun=3
## KMNS(*, k=10): iter= 4, indx=3
## QTRAN(): istep=30, icoun=15
## QTRAN(): istep=60, icoun=19
## QTRAN(): istep=90, icoun=20
## KMNS(*, k=10): iter= 5, indx=30
## KMNS(*, k=11): iter= 1, indx=0
## QTRAN(): istep=30, icoun=16
## KMNS(*, k=11): iter= 2, indx=5
## QTRAN(): istep=30, icoun=7
## KMNS(*, k=11): iter= 3, indx=17
## QTRAN(): istep=30, icoun=19
## KMNS(*, k=11): iter= 4, indx=30
## KMNS(*, k=12): iter= 1, indx=0
## QTRAN(): istep=30, icoun=16
## KMNS(*, k=12): iter= 2, indx=7
## QTRAN(): istep=30, icoun=17
## KMNS(*, k=12): iter= 3, indx=30
## KMNS(*, k=13): iter= 1, indx=0
## QTRAN(): istep=30, icoun=12
## QTRAN(): istep=60, icoun=17
## KMNS(*, k=13): iter= 2, indx=7
## QTRAN(): istep=30, icoun=17
## KMNS(*, k=13): iter= 3, indx=30
## KMNS(*, k=14): iter= 1, indx=0
## QTRAN(): istep=30, icoun=12
## QTRAN(): istep=60, icoun=17
## KMNS(*, k=14): iter= 2, indx=7
## QTRAN(): istep=30, icoun=24
## KMNS(*, k=14): iter= 3, indx=30
## KMNS(*, k=15): iter= 1, indx=10
## KMNS(*, k=15): iter= 2, indx=3
## QTRAN(): istep=30, icoun=7
## KMNS(*, k=15): iter= 3, indx=5
## KMNS(*, k=15): iter= 4, indx=30

# Dados padronizados.
wssplot(Z, nc = 15)

#-----------------------------------------------------------------------
# Determinar o melhor número de grupos.
# NbClust Package for determining the best number of clusters.
library(NbClust)
# help(package = "NbClust", help_type = "html")
set.seed(1234)
nc <- NbClust(Z, min.nc = 2, max.nc = 15, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 3 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 2 proposed 6 as the best number of clusters
## * 5 proposed 8 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 2 proposed 14 as the best number of clusters
## * 3 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
##
## 0 2 3 4 5 6 8 13 14 15
## 2 6 3 1 1 2 5 1 2 3
splom(Z, groups = nc$Best.partition)

##
## 1 2
## 22 8
split(x = ManlyTb1.5[, 1], f = nc$Best.partition)
## $`1`
## [1] "Belgica" "Dinamarca" "Franca" "Alemanha"
## [5] "Grecia" "Irlanda" "Italia" "Luxemburgo"
## [9] "Paises Baixos" "Portugal" "Espanha" "Reino Unido"
## [13] "Austria" "Finlandia" "Islandia" "Norurega"
## [17] "Suecia" "Suica" "Iugoslavia" "Chipre"
## [21] "Gibraltar" "Malta"
##
## $`2`
## [1] "Albania" "Bulgaria"
## [3] "Republica Tcheca/Eslovaquia" "Hungria"
## [5] "Polonia" "Romenia"
## [7] "USSR" "Turquia"
#-----------------------------------------------------------------------
# Os agrupamentos vistos com os primeiros scores.
pc <- princomp(Z)
summary(pc)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.737796 1.3444684 1.1511163 1.0160728 0.79149291
## Proportion of Variance 0.347119 0.2077696 0.1523067 0.1186671 0.07200701
## Cumulative Proportion 0.347119 0.5548885 0.7071953 0.8258624 0.89786940
## Comp.6 Comp.7 Comp.8 Comp.9
## Standard deviation 0.58277877 0.55904883 0.42721031 0.232079414
## Proportion of Variance 0.03903806 0.03592363 0.02097801 0.006190903
## Cumulative Proportion 0.93690746 0.97283109 0.99380910 1.000000000
screeplot(pc, type = "lines")
biplot(pc)

xyplot(pc$scores[, 2] ~ pc$scores[, 1],
groups = nc$Best.partition)

splom(pc$scores[, 1:4],
groups = nc$Best.partition)

Agrupamento hierárquico
## grup largm altm comppm largpm comppt comppq
## 1 Cao moderno 9.7 21.0 19.4 7.7 32.0 36.5
## 2 Chacal dourado 8.1 16.7 18.3 7.0 30.3 32.9
## 3 Lobo chines 13.5 27.3 26.8 10.6 41.9 48.1
## 4 Lobo indiano 11.5 24.3 24.5 9.3 40.0 44.6
## 5 Cuon 10.7 23.5 21.4 8.5 28.8 37.6
## 6 Dingo 9.6 22.6 21.1 8.3 34.4 43.1
## 7 Cao pre-historico 10.3 22.1 19.1 8.1 32.2 35.0
Z <- scale(ManlyTb1.4[, -1])
rownames(Z) <- as.character(ManlyTb1.4[, 1])
Z
## largm altm comppm largpm
## Cao moderno -0.4628718 -0.46166596 -0.68197696 -0.6868366
## Chacal dourado -1.4054471 -1.78510838 -1.03678930 -1.2878186
## Lobo chines 1.7757445 1.47733107 1.70494241 1.8029460
## Lobo indiano 0.5975254 0.55399915 0.96306206 0.6868366
## Cuon 0.1262378 0.30777731 -0.03686362 0.0000000
## Dingo -0.5217828 0.03077773 -0.13363062 -0.1717091
## Cao pre-historico -0.1094061 -0.12311092 -0.77874396 -0.3434183
## comppt comppq
## Cao moderno -0.45150929 -0.5674490
## Chacal dourado -0.79592984 -1.2086918
## Lobo chines 1.55423391 1.4987779
## Lobo indiano 1.16929330 0.8753473
## Cuon -1.09983033 -0.3715137
## Dingo 0.03473148 0.6081628
## Cao pre-historico -0.41098923 -0.8346335
## attr(,"scaled:center")
## largm altm comppm largpm comppt comppq
## 10.48571 22.50000 21.51429 8.50000 34.22857 39.68571
## attr(,"scaled:scale")
## largm altm comppm largpm comppt comppq
## 1.697477 3.249102 3.100230 1.164760 4.935826 5.614098
# Variáveis métricas.
X <- ManlyTb1.5[, -(1:2)]
Z <- scale(X)
rownames(Z) <- ManlyTb1.5[, 1]
Z
## afp mep fab fea
## Belgica -0.77896674 -0.36620404 0.13850687 0.0000000
## Dinamarca -0.53520107 -0.37748342 0.09685067 -0.1610564
## Franca -0.57582868 -0.35492465 0.07602257 0.1610564
## Alemanha -0.73021361 -0.30980711 0.55506889 0.3221129
## Grecia 0.81363563 -0.33236588 -0.02811794 0.3221129
## Irlanda 0.13109176 -0.32108650 0.03436637 0.6442258
## Italia -0.30768645 -0.26468957 0.25306143 -1.2884515
## Luxemburgo -0.72208808 -0.37748342 0.01353827 -0.1610564
## Paises Baixos -0.64895838 -0.37748342 -0.02811794 -0.1610564
## Portugal -0.05579525 -0.33236588 0.43010028 -0.1610564
## Espanha -0.18580361 -0.33236588 0.16974902 -0.3221129
## Reino Unido -0.81146883 -0.30980711 0.19057712 0.6442258
## Austria -0.38894167 -0.35492465 0.77376395 0.6442258
## Finlandia -0.29956092 -0.36620404 0.71127965 0.6442258
## Islandia -0.13705048 -0.38876281 -0.08018819 0.1610564
## Norurega -0.51895003 -0.26468957 -0.50716426 0.4831693
## Suecia -0.73021361 -0.35492465 -0.04894604 0.0000000
## Suica -0.53520107 -0.38876281 0.54465484 -1.2884515
## Albania 3.51943457 1.79943790 -2.02761563 -1.2884515
## Bulgaria 0.55361892 -0.38876281 -1.66312386 -1.2884515
## Republica Tcheca/Eslovaquia 0.04983654 3.81844784 -2.02761563 -1.2884515
## Hungria 0.25297459 2.87097949 -2.02761563 -1.2884515
## Polonia 0.92739295 0.05113321 0.48217054 0.1610564
## Romenia 0.79738459 -0.09549879 1.91930950 1.9326773
## USSR 0.51299131 -0.38876281 0.97163091 -1.2884515
## Iugoslavia -0.58395420 -0.14061633 2.00262191 2.2547902
## Chipre 0.10671519 -0.35492465 -0.04894604 -0.4831693
## Gibraltar -0.99023032 -0.38876281 -1.31946019 1.9326773
## Malta -0.77896674 -0.32108650 0.87790445 1.1273951
## Turquia 2.65000368 -0.28724834 -0.43426590 -0.9663386
## con ser fin ssp
## Belgica -0.4500407 0.2421102 0.510937277 1.13451620
## Dinamarca -0.4134521 -0.2303000 0.611285023 1.06580392
## Franca -0.1573313 0.2027427 0.887241326 0.69933839
## Alemanha 0.6842083 0.3011615 0.736719707 0.16109214
## Grecia -0.2670973 0.4979991 -0.342018570 -0.82378397
## Irlanda -0.1573313 0.4192641 0.435676467 -0.17101725
## Italia 0.5744422 1.1672469 -0.517627126 0.11528395
## Luxemburgo 0.8671517 1.0885118 0.510937277 0.29851671
## Paises Baixos -2.5355954 0.5570504 1.213371503 1.29484487
## Portugal 0.2451441 0.8129392 0.009198543 -0.27408568
## Espanha 0.7207970 0.8719905 -0.191496950 -0.03359267
## Reino Unido -0.1939200 0.8916743 1.439153933 0.16109214
## Austria 0.3549102 0.6751529 0.009198543 -0.42296230
## Finlandia -0.2670973 -0.2106162 0.485850340 0.71079043
## Islandia 0.9037403 -0.2303000 0.335328720 0.42448924
## Norurega -0.3768634 0.3798965 0.234980973 1.20322849
## Suecia -0.4134521 -0.2893513 0.686545833 1.43226945
## Suica 0.6110309 0.9507255 1.012676010 -0.44586640
## Albania -1.5111124 -2.2380433 2.166675096 -3.09128944
## Bulgaria -0.3036860 -1.2341717 -1.295322163 -0.69781145
## Republica Tcheca/Eslovaquia 0.3183215 -1.0767016 -1.270235226 -0.46877049
## Hungria -0.4134521 -0.4665051 -1.671626213 0.03511961
## Polonia -0.4500407 -1.0570178 -1.345496036 -0.28553773
## Romenia -0.6329841 -1.7262656 -1.521104593 -1.33912613
## USSR 0.9769177 -1.5294280 -1.521104593 -0.15956520
## Iugoslavia 0.2085555 -0.3680863 -0.893931176 -0.90394831
## Chipre 0.5744422 1.5806058 0.009198543 -0.66345530
## Gibraltar 3.4283591 1.7380759 1.037762946 0.80240682
## Malta -1.0720483 -1.0767016 -0.693235683 1.67276245
## Turquia -0.8525162 -0.6436589 -1.069539733 -1.43074251
## tc
## Belgica 0.35026166
## Dinamarca 0.45378727
## Franca 0.14321043
## Alemanha -0.27089202
## Grecia 0.40202446
## Irlanda -0.16736641
## Italia -0.42618044
## Luxemburgo 0.35026166
## Paises Baixos 0.35026166
## Portugal -0.68499447
## Espanha -0.16736641
## Reino Unido 0.19497324
## Austria 0.14321043
## Finlandia 0.71260130
## Islandia 0.29849885
## Norurega 1.02317814
## Suecia 0.55731288
## Suica 0.03968482
## Albania -1.61672498
## Bulgaria 0.71260130
## Republica Tcheca/Eslovaquia 0.40202446
## Hungria 1.38551778
## Polonia -0.47794324
## Romenia 0.35026166
## USSR 1.17846656
## Iugoslavia 0.86788972
## Chipre -2.85903233
## Gibraltar -2.91079514
## Malta 0.55731288
## Turquia -0.89204569
## attr(,"scaled:center")
## afp mep fab fea con ser fin
## 12.186667 3.446667 19.470000 0.800000 7.530000 15.670000 6.663333
## ssp tc
## 26.993333 6.123333
## attr(,"scaled:scale")
## afp mep fab fea con ser
## 12.3069012 8.8657315 9.6024117 0.6209003 2.7330859 5.0803306
## fin ssp tc
## 3.9861383 8.7320627 1.9318891
X <- MingotiTb6.8[, -1]
Z <- scale(X)
rownames(Z) <- MingotiTb6.8[, 1]
# help(dist, help_type = "html")
# method = {"euclidean", "maximum", "manhattan", "canberra", "binary",
# "minkowski"}
# Distâncias entre cada par de observações.
dis <- dist(as.matrix(Z), method = "euclidian")
dis
## Reino Unido Australia Canada Estados Unidos Japao
## Australia 0.19355717
## Canada 0.20779029 0.06214472
## Estados Unidos 0.27814083 0.23083305 0.18944523
## Japao 0.34306768 0.27686469 0.24307601 0.36501441
## Franca 0.11524943 0.23117298 0.21629167 0.27115746 0.27976169
## Cingapura 0.56346107 0.51578127 0.49426773 0.55841921 0.38504873
## Argentina 0.81503568 0.99573052 0.99835354 1.01201180 0.97486121
## Uruguai 0.79304403 0.91853110 0.94118345 1.01703855 0.91976319
## Cuba 1.57464604 1.74300711 1.75509726 1.81103978 1.69268529
## Colombia 2.62053376 2.80240051 2.79453633 2.77759859 2.74107540
## Brasil 1.39730464 1.55967812 1.56089377 1.56280255 1.52095232
## Paraguai 2.38117422 2.56418995 2.56221781 2.56335473 2.50400764
## Egito 2.30077327 2.43849365 2.43285246 2.45416892 2.32476330
## Nigeria 4.12434564 4.29743418 4.29634370 4.28781833 4.23679822
## Senegal 3.99268232 4.13795639 4.12793757 4.12067522 4.03017525
## Serra Leoa 5.20377422 5.36000779 5.35551180 5.33990817 5.28765187
## Angola 4.79232674 4.95631145 4.94060936 4.89963995 4.86422212
## Etiopia 4.61000337 4.75452588 4.75081624 4.74404744 4.67620299
## Mocambique 4.44243520 4.57073694 4.56886398 4.55770153 4.51054149
## China 1.72871565 1.88158340 1.89033659 1.93594702 1.81731811
## Franca Cingapura Argentina Uruguai Cuba
## Australia
## Canada
## Estados Unidos
## Japao
## Franca
## Cingapura 0.53572548
## Argentina 0.79962669 1.00873195
## Uruguai 0.82056450 0.83922181 0.52273547
## Cuba 1.56108474 1.73851071 0.85221230 1.05471980
## Colombia 2.57826852 2.83055887 1.89897272 2.32307348 1.42530436
## Brasil 1.38785238 1.41533081 0.65612362 0.81139431 0.82648468
## Paraguai 2.34942738 2.54271845 1.59630059 1.94746841 1.02269495
## Egito 2.27008853 2.16311141 1.58678209 1.65971252 1.30277924
## Nigeria 4.10520655 4.13691211 3.31754977 3.50771308 2.78800690
## Senegal 3.95998198 3.85456871 3.24687526 3.37222754 2.86413655
## Serra Leoa 5.18483894 5.11503352 4.43676407 4.55809423 4.00763735
## Angola 4.75208952 4.75042321 4.02465495 4.28389218 3.64392021
## Etiopia 4.59325154 4.47425463 3.87369095 3.94049664 3.49259593
## Mocambique 4.43662764 4.26173028 3.77759800 3.77095591 3.53002192
## China 1.72007417 1.72277325 0.99114196 1.02759921 0.64018451
## Colombia Brasil Paraguai Egito Nigeria
## Australia
## Canada
## Estados Unidos
## Japao
## Franca
## Cingapura
## Argentina
## Uruguai
## Cuba
## Colombia
## Brasil 1.75721371
## Paraguai 0.54485463 1.33825611
## Egito 1.81601873 1.02174755 1.34700726
## Nigeria 2.23607847 2.75545568 2.04116381 2.09993050
## Senegal 2.60147614 2.63923886 2.32477032 1.74851973 1.12248639
## Serra Leoa 3.57079502 3.81203313 3.36083930 3.02993261 1.40239103
## Angola 2.83188240 3.48389970 2.79973410 2.78997555 1.29118690
## Etiopia 3.29649696 3.23122080 2.99498604 2.41058540 1.33348354
## Mocambique 3.57976631 3.12247186 3.21159177 2.40049057 1.89335274
## China 1.70543417 0.54131547 1.20286998 0.78347250 2.52769902
## Senegal Serra Leoa Angola Etiopia Mocambique
## Australia
## Canada
## Estados Unidos
## Japao
## Franca
## Cingapura
## Argentina
## Uruguai
## Cuba
## Colombia
## Brasil
## Paraguai
## Egito
## Nigeria
## Senegal
## Serra Leoa 1.39271139
## Angola 1.37024188 1.33467034
## Etiopia 0.88634951 0.78523156 1.59216786
## Mocambique 1.32823508 1.44631896 2.20491693 0.79112688
## China 2.43706482 3.61222383 3.37501936 3.02465361 2.97869150
# help(hclust, help_type = "html")
# method = {"ward.D", "ward.D2", "single", "complete", "average"
# (=UPGMA), "mcquitty" (=WPGMA), "median" (=WPGMC), "centroid" (=
# UPGMC)}
layout(1)
hcl <- hclust(dis, method = "average")
# hcl <- hclust(dis, method = "ward.D")
plot(hcl, hang = -1)

# Cortando de 2 a 4 grupos.
cutree(hcl, k = 2:4)
## 2 3 4
## Reino Unido 1 1 1
## Australia 1 1 1
## Canada 1 1 1
## Estados Unidos 1 1 1
## Japao 1 1 1
## Franca 1 1 1
## Cingapura 1 1 1
## Argentina 1 1 2
## Uruguai 1 1 2
## Cuba 1 1 2
## Colombia 1 2 3
## Brasil 1 1 2
## Paraguai 1 2 3
## Egito 1 1 2
## Nigeria 2 3 4
## Senegal 2 3 4
## Serra Leoa 2 3 4
## Angola 2 3 4
## Etiopia 2 3 4
## Mocambique 2 3 4
## China 1 1 2
#-----------------------------------------------------------------------
# Aprimorando a exibição fazendo cortes.
library(dendextend)
den <- as.dendrogram(hcl)
k <- 2
plot(den, ylab = "Distância")
rect.hclust(hcl, k = 2, border = "red")

den <- color_branches(dend = den, k = k)
plot(den, ylab = "Distância")
rect.dendrogram(den,
k = k,
lty = 2,
border = "gray30")

# Determina o número ótimo de grupos.
den_k <- find_k(den)
plot(den_k)

plot(color_branches(den, k = den_k$nc))

#-----------------------------------------------------------------------
library(pvclust)
pvc <- pvclust(t(Z),
method.dist = "euclidean",
method.hclust = "average",
nboot = 2000)
## Bootstrap (r = 0.5)... Done.
## Bootstrap (r = 0.5)... Done.
## Bootstrap (r = 0.5)... Done.
## Bootstrap (r = 0.75)... Done.
## Bootstrap (r = 0.75)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.25)... Done.
## Bootstrap (r = 1.25)... Done.
##
## Cluster method: average
## Distance : euclidean
##
## Estimates on edges:
##
## au bp se.au se.bp v c pchi
## 1 0.975 0.954 0.005 0.002 -1.820 0.139 0.000
## 2 0.939 0.705 0.006 0.004 -1.044 0.504 0.007
## 3 0.848 0.243 0.014 0.004 -0.167 0.862 0.000
## 4 0.704 0.528 0.018 0.004 -0.304 0.233 0.000
## 5 0.989 0.793 0.002 0.003 -1.556 0.739 0.040
## 6 0.926 0.771 0.008 0.004 -1.095 0.354 0.000
## 7 0.936 0.686 0.007 0.004 -1.004 0.520 0.416
## 8 0.744 0.406 0.017 0.004 -0.209 0.446 0.000
## 9 0.948 0.714 0.006 0.004 -1.093 0.529 0.005
## 10 0.999 0.216 0.000 0.004 -1.232 2.016 0.004
## 11 0.716 0.189 0.021 0.003 0.155 0.725 0.000
## 12 0.471 0.094 0.060 0.002 0.695 0.622 0.834
## 13 0.938 0.439 0.007 0.004 -0.691 0.846 0.000
## 14 0.588 0.336 0.020 0.004 0.100 0.322 0.006
## 15 0.923 0.272 0.009 0.004 -0.407 1.015 0.010
## 16 0.957 0.285 0.006 0.004 -0.576 1.144 0.000
## 17 0.887 0.827 0.011 0.003 -1.076 0.135 0.000
## 18 0.564 0.386 0.020 0.004 0.063 0.225 0.000
## 19 0.938 0.688 0.006 0.004 -1.012 0.523 0.000
## 20 1.000 1.000 0.000 0.000 0.000 0.000 0.000
## [1] 21 4
plot(pvc)
pvrect(pvc, alpha = 0.9)

DBSCAN
# Dois pacotes com o algoritmo implementado.
# library(dbscan)
library(fpc)
# help(dbscan, help_type = "html")
# dbscan(data,
# eps,
# MinPts = 5,
# scale = FALSE,
# method = c("hybrid", "raw", "dist"))
# Usando o iris.
X <- scale(iris[, 1:2])
set.seed(123)
db <- fpc::dbscan(X, eps = 0.25, MinPts = 3)
class(db)
## [1] "dbscan"
methods(class = class(db))
## [1] plot predict print
## see '?methods' for accessing help and source code
## dbscan Pts=150 MinPts=3 eps=0.25
## 0 1 2 3 4 5 6 7 8 9
## border 33 2 1 1 2 1 0 1 0 2
## seed 0 17 11 5 1 61 4 4 3 1
## total 33 19 12 6 3 62 4 5 3 3
# Plot DBSCAN results.
plot(db, X, main = "DBSCAN", frame = FALSE)
box()

## [1] 1 2 2 2 1 0 1 1 4 2 3 1 2 4 0 0 0 1 0 3 1 3 0 1 1 2 1 1 1 2 2 1 0 0 2
## [36] 1 0 1 4 1 1 0 2 1 3 2 3 2 3 1 5 6 5 5 5 5 7 9 5 0 0 5 8 5 5 5 5 5 8 5
## [71] 0 5 0 5 5 5 0 5 5 5 5 5 5 5 5 7 5 0 5 5 5 5 5 0 5 5 5 5 9 5 7 5 0 5 5
## [106] 0 9 0 0 0 6 5 5 5 5 6 5 0 0 8 5 5 0 5 0 5 5 5 5 0 0 0 5 5 0 0 7 6 5 5
## [141] 5 5 5 5 0 5 0 5 7 5
#-----------------------------------------------------------------------
# Cria grid cruzando valores dos parâmetros de tunning.
grid <- expand.grid(eps = c(0.05, 0.1, 0.25, 0.5, 0.8),
MinPts = c(1, 2, 3, 5, 8, 13),
KEEP.OUT.ATTRS = FALSE)
# Aplica o algoritmo para cada condição de tunning.
m <- mapply(FUN = dbscan,
eps = grid$eps,
MinPts = grid$MinPts,
MoreArgs = list(data = X))
# Empilha as predições de cada condição gerando vetor.
pred <- c(sapply(m, predict))
# Junta as observações, as condições e a predição.
pred <- cbind(X,
grid[rep(seq_len(nrow(grid)),
each = nrow(X)), ],
clas = pred)
str(pred)
## 'data.frame': 4500 obs. of 5 variables:
## $ Sepal.Length: num -0.898 -1.139 -1.381 -1.501 -1.018 ...
## $ Sepal.Width : num 1.0156 -0.1315 0.3273 0.0979 1.245 ...
## $ eps : num 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 ...
## $ MinPts : num 1 1 1 1 1 1 1 1 1 1 ...
## $ clas : num 1 2 3 4 5 6 7 8 9 10 ...
# Converte para fator.
pred <- transform(pred,
eps = factor(eps),
MinPts = factor(MinPts))
library(grid)
useOuterStrips(
xyplot(Sepal.Length ~ Sepal.Width | eps + MinPts,
groups = clas,
# auto.key = list(columns = 5),
as.table = TRUE,
data = pred)) +
layer({
n <- nlevels(factor(groups[subscripts]))
grid.text(x = 0.9, y = 0.9, label = n)
})

#-----------------------------------------------------------------------
# Número ótimo do parâmetro de tunning.
MinPts <- 2
dbscan::kNNdistplot(X, k = MinPts)
eps <- 0.4
abline(h = eps, lty = 2)

set.seed(456)
db <- fpc::dbscan(X, eps = eps, MinPts = MinPts)
db
## dbscan Pts=150 MinPts=2 eps=0.4
## 0 1 2 3 4 5 6 7
## border 12 0 0 0 0 0 0 0
## seed 0 42 2 82 4 4 2 2
## total 12 42 2 82 4 4 2 2
p <- predict(db)
# Plot DBSCAN results.
plot(db, X, frame = FALSE, asp = 1, type = "n")
points(X, col = p, pch = 19)
points(X[p == 0, ], pch = 4)
for (i in seq_len(nrow(X))) {
plotrix::draw.circle(X[i, 1],
X[i, 2],
radius = eps,
border = "gray50")
}
box()

# Usando rpanel para controlar.
library(rpanel)
action <- function(panel) {
with(panel, {
EPS <- as.numeric(eps)
MINPTS <- as.numeric(MinPts)
db <- dbscan::dbscan(X,
eps = EPS,
minPts = MINPTS)
p <- predict(db)
plot(db, X, frame = FALSE, asp = 1, type = "n")
points(X, col = p, pch = 19)
points(X[p == 0, ], pch = 4)
if (circle) {
for (i in seq_len(nrow(X))) {
plotrix::draw.circle(X[i, 1],
X[i, 2],
radius = EPS,
border = "gray70")
}
}
box()
})
return(panel)
}
# apropos("^rp\\.")
# help(package = "rpanel", help_type = "html")
panel <- rp.control()
# rp.textentry(panel = panel,
# variable = eps,
# action = action,
# labels = c("eps"),
# initval = c(0.4))
# rp.textentry(panel = panel,
# variable = MinPts,
# action = action,
# labels = c("minPts"),
# initval = c(5))
rp.checkbox(panel = panel,
variable = circle,
action = action,
title = "Circulos?",
initval = FALSE)
rp.doublebutton(panel = panel,
variable = eps,
step = 0.05,
initval = 0.4,
range = c(0.05, 0.8),
action = action,
title = "eps",
showvalue = TRUE)
rp.doublebutton(panel = panel,
variable = MinPts,
step = 1,
initval = 4,
range = c(1, 8),
action = action,
title = "MinPts",
showvalue = TRUE)