1 K-means

library(lattice)
library(latticeExtra)
library(labestData)
# labestDataView()

# Cães Pré-históricos da Tailândia.
ManlyTb1.4

##                grup largm altm comppm largpm comppt comppq
## 1       Cao moderno   9.7 21.0   19.4    7.7   32.0   36.5
## 2    Chacal dourado   8.1 16.7   18.3    7.0   30.3   32.9
## 3       Lobo chines  13.5 27.3   26.8   10.6   41.9   48.1
## 4      Lobo indiano  11.5 24.3   24.5    9.3   40.0   44.6
## 5              Cuon  10.7 23.5   21.4    8.5   28.8   37.6
## 6             Dingo   9.6 22.6   21.1    8.3   34.4   43.1
## 7 Cao pre-historico  10.3 22.1   19.1    8.1   32.2   35.0

# Índices de Desenvolvimento de Países.
MingotiTb6.8

##              pais expecvida educ  pib estabpoli
## 1     Reino Unido      0.88 0.99 0.91      1.10
## 2       Australia      0.90 0.99 0.93      1.26
## 3          Canada      0.90 0.98 0.94      1.24
## 4  Estados Unidos      0.87 0.98 0.97      1.18
## 5           Japao      0.93 0.93 0.93      1.20
## 6          Franca      0.89 0.97 0.92      1.04
## 7       Cingapura      0.88 0.87 0.91      1.41
## 8       Argentina      0.81 0.92 0.80      0.55
## 9         Uruguai      0.82 0.92 0.75      1.05
## 10           Cuba      0.85 0.90 0.64      0.07
## 11       Colombia      0.77 0.85 0.69     -1.36
## 12         Brasil      0.71 0.83 0.72      0.47
## 13       Paraguai      0.75 0.83 0.63     -0.87
## 14          Egito      0.70 0.62 0.60      0.21
## 15        Nigeria      0.44 0.58 0.37     -1.36
## 16        Senegal      0.47 0.37 0.45     -0.68
## 17     Serra Leoa      0.23 0.33 0.27     -1.26
## 18         Angola      0.34 0.36 0.51     -1.98
## 19        Etiopia      0.31 0.35 0.32     -0.55
## 20     Mocambique      0.24 0.37 0.36      0.20
## 21          China      0.76 0.80 0.61      0.39

# Emprego em Paises Europeus.
ManlyTb1.5

##                           pais  grup  afp  mep  fab fea  con  ser  fin
## 1                      Belgica    UE  2.6  0.2 20.8 0.8  6.3 16.9  8.7
## 2                    Dinamarca    UE  5.6  0.1 20.4 0.7  6.4 14.5  9.1
## 3                       Franca    UE  5.1  0.3 20.2 0.9  7.1 16.7 10.2
## 4                     Alemanha    UE  3.2  0.7 24.8 1.0  9.4 17.2  9.6
## 5                       Grecia    UE 22.2  0.5 19.2 1.0  6.8 18.2  5.3
## 6                      Irlanda    UE 13.8  0.6 19.8 1.2  7.1 17.8  8.4
## 7                       Italia    UE  8.4  1.1 21.9 0.0  9.1 21.6  4.6
## 8                   Luxemburgo    UE  3.3  0.1 19.6 0.7  9.9 21.2  8.7
## 9                Paises Baixos    UE  4.2  0.1 19.2 0.7  0.6 18.5 11.5
## 10                    Portugal    UE 11.5  0.5 23.6 0.7  8.2 19.8  6.7
## 11                     Espanha    UE  9.9  0.5 21.1 0.6  9.5 20.1  5.9
## 12                 Reino Unido    UE  2.2  0.7 21.3 1.2  7.0 20.2 12.4
## 13                     Austria  AELC  7.4  0.3 26.9 1.2  8.5 19.1  6.7
## 14                   Finlandia  AELC  8.5  0.2 26.3 1.2  6.8 14.6  8.6
## 15                    Islandia  AELC 10.5  0.0 18.7 0.9 10.0 14.5  8.0
## 16                    Norurega  AELC  5.8  1.1 14.6 1.1  6.5 17.6  7.6
## 17                      Suecia  AELC  3.2  0.3 19.0 0.8  6.4 14.2  9.4
## 18                       Suica  AELC  5.6  0.0 24.7 0.0  9.2 20.5 10.7
## 19                     Albania Leste 55.5 19.4  0.0 0.0  3.4  4.3 15.3
## 20                    Bulgaria Leste 19.0  0.0  3.5 0.0  6.7  9.4  1.5
## 21 Republica Tcheca/Eslovaquia Leste 12.8 37.3  0.0 0.0  8.4 10.2  1.6
## 22                     Hungria Leste 15.3 28.9  0.0 0.0  6.4 13.3  0.0
## 23                     Polonia Leste 23.6  3.9 24.1 0.9  6.3 10.3  1.3
## 24                     Romenia Leste 22.0  2.6 37.9 2.0  5.8  6.9  0.6
## 25                        USSR Leste 18.5  0.0 28.8 0.0 10.2  7.9  0.6
## 26                  Iugoslavia Leste  5.0  2.2 38.7 2.2  8.1 13.8  3.1
## 27                      Chipre Outro 13.5  0.3 19.0 0.5  9.1 23.7  6.7
## 28                   Gibraltar Outro  0.0  0.0  6.8 2.0 16.9 24.5 10.8
## 29                       Malta Outro  2.6  0.6 27.9 1.5  4.6 10.2  3.9
## 30                     Turquia Outro 44.8  0.9 15.3 0.2  5.2 12.4  2.4
##     ssp  tc
## 1  36.9 6.8
## 2  36.3 7.0
## 3  33.1 6.4
## 4  28.4 5.6
## 5  19.8 6.9
## 6  25.5 5.8
## 7  28.0 5.3
## 8  29.6 6.8
## 9  38.3 6.8
## 10 24.6 4.8
## 11 26.7 5.8
## 12 28.4 6.5
## 13 23.3 6.4
## 14 33.2 7.5
## 15 30.7 6.7
## 16 37.5 8.1
## 17 39.5 7.2
## 18 23.1 6.2
## 19  0.0 3.0
## 20 20.9 7.5
## 21 22.9 6.9
## 22 27.3 8.8
## 23 24.5 5.2
## 24 15.3 6.8
## 25 25.6 8.4
## 26 19.1 7.8
## 27 21.2 0.6
## 28 34.0 0.5
## 29 41.6 7.2
## 30 14.5 4.4

#-----------------------------------------------------------------------

# help(kmeans, help_type = "html")

# Variáveis métricas.
X <- ManlyTb1.4[, -(1)]

# Diagrama de pares de dispersão.
splom(X)

km <- kmeans(X, centers = 2, trace = TRUE)

## KMNS(*, k=2): iter=  1, indx=7

class(km)

## [1] "kmeans"

methods(class = class(km))

## [1] fitted print 
## see '?methods' for accessing help and source code

km

## K-means clustering with 2 clusters of sizes 5, 2
## 
## Cluster means:
##   largm  altm comppm largpm comppt comppq
## 1  9.68 21.18  19.86   7.92  31.54  37.02
## 2 12.50 25.80  25.65   9.95  40.95  46.35
## 
## Clustering vector:
## [1] 1 1 2 2 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 117.316  17.920
##  (between_SS / total_SS =  71.9 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

splom(X, groups = km$cluster)

#-----------------------------------------------------------------------

# Função para guiar a escolha do número de grupos.
wssplot <- function(data, nc = 15, seed = 1234, ...) {
    # Soma de quadrados interna aos cluters.
    wss <- (nrow(data) - 1) *
        sum(apply(data, MARGIN = 2, FUN = var))
    for (i in 2:nc) {
        set.seed(seed)
        wss[i] <- sum(kmeans(data, centers = i, ...)$withinss)
    }
    plot(x = 1:nc,
         y = wss,
         type = "b",
         xlab = "Número de agrupamentos",
         ylab = "Soma de quadrados dentro de grupos")
}

# Variáveis métricas.
X <- ManlyTb1.5[, -(1:2)]
Z <- scale(X)

# Dados originais.
wssplot(X, nc = 15, trace = TRUE)

## KMNS(*, k=2): iter=  1, indx=8
##  QTRAN(): istep=30, icoun=10
## KMNS(*, k=3): iter=  1, indx=5
## KMNS(*, k=3): iter=  2, indx=10
## KMNS(*, k=3): iter=  3, indx=30
## KMNS(*, k=4): iter=  1, indx=1
##  QTRAN(): istep=30, icoun=10
## KMNS(*, k=4): iter=  2, indx=30
## KMNS(*, k=5): iter=  1, indx=0
##  QTRAN(): istep=30, icoun=3
##  QTRAN(): istep=60, icoun=2
##  QTRAN(): istep=90, icoun=16
## KMNS(*, k=5): iter=  2, indx=0
## KMNS(*, k=5): iter=  3, indx=30
## KMNS(*, k=6): iter=  1, indx=0
##  QTRAN(): istep=30, icoun=7
##  QTRAN(): istep=60, icoun=17
##  QTRAN(): istep=90, icoun=2
## KMNS(*, k=6): iter=  2, indx=3
##  QTRAN(): istep=30, icoun=0
## KMNS(*, k=6): iter=  3, indx=30
## KMNS(*, k=7): iter=  1, indx=0
##  QTRAN(): istep=30, icoun=16
## KMNS(*, k=7): iter=  2, indx=0
##  QTRAN(): istep=30, icoun=7
##  QTRAN(): istep=60, icoun=22
## KMNS(*, k=7): iter=  3, indx=15
## KMNS(*, k=7): iter=  4, indx=30
## KMNS(*, k=8): iter=  1, indx=5
##  QTRAN(): istep=30, icoun=7
## KMNS(*, k=8): iter=  2, indx=10
##  QTRAN(): istep=30, icoun=3
##  QTRAN(): istep=60, icoun=24
## KMNS(*, k=8): iter=  3, indx=30
## KMNS(*, k=9): iter=  1, indx=0
##  QTRAN(): istep=30, icoun=15
##  QTRAN(): istep=60, icoun=16
## KMNS(*, k=9): iter=  2, indx=10
##  QTRAN(): istep=30, icoun=16
##  QTRAN(): istep=60, icoun=3
## KMNS(*, k=9): iter=  3, indx=30
## KMNS(*, k=10): iter=  1, indx=0
##  QTRAN(): istep=30, icoun=17
## KMNS(*, k=10): iter=  2, indx=24
## KMNS(*, k=10): iter=  3, indx=12
##  QTRAN(): istep=30, icoun=10
##  QTRAN(): istep=60, icoun=19
##  QTRAN(): istep=90, icoun=20
##  QTRAN(): istep=120, icoun=3
## KMNS(*, k=10): iter=  4, indx=3
##  QTRAN(): istep=30, icoun=15
##  QTRAN(): istep=60, icoun=19
##  QTRAN(): istep=90, icoun=20
## KMNS(*, k=10): iter=  5, indx=30
## KMNS(*, k=11): iter=  1, indx=0
##  QTRAN(): istep=30, icoun=16
## KMNS(*, k=11): iter=  2, indx=5
##  QTRAN(): istep=30, icoun=7
## KMNS(*, k=11): iter=  3, indx=17
##  QTRAN(): istep=30, icoun=19
## KMNS(*, k=11): iter=  4, indx=30
## KMNS(*, k=12): iter=  1, indx=0
##  QTRAN(): istep=30, icoun=16
## KMNS(*, k=12): iter=  2, indx=7
##  QTRAN(): istep=30, icoun=17
## KMNS(*, k=12): iter=  3, indx=30
## KMNS(*, k=13): iter=  1, indx=0
##  QTRAN(): istep=30, icoun=12
##  QTRAN(): istep=60, icoun=17
## KMNS(*, k=13): iter=  2, indx=7
##  QTRAN(): istep=30, icoun=17
## KMNS(*, k=13): iter=  3, indx=30
## KMNS(*, k=14): iter=  1, indx=0
##  QTRAN(): istep=30, icoun=12
##  QTRAN(): istep=60, icoun=17
## KMNS(*, k=14): iter=  2, indx=7
##  QTRAN(): istep=30, icoun=24
## KMNS(*, k=14): iter=  3, indx=30
## KMNS(*, k=15): iter=  1, indx=10
## KMNS(*, k=15): iter=  2, indx=3
##  QTRAN(): istep=30, icoun=7
## KMNS(*, k=15): iter=  3, indx=5
## KMNS(*, k=15): iter=  4, indx=30

# Dados padronizados.
wssplot(Z, nc = 15)

#-----------------------------------------------------------------------
# Determinar o melhor número de grupos.

# NbClust Package for determining the best number of clusters.
library(NbClust)

# help(package = "NbClust", help_type = "html")

set.seed(1234)

nc <- NbClust(Z, min.nc = 2, max.nc = 15, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 3 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 2 proposed 6 as the best number of clusters 
## * 5 proposed 8 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 2 proposed 14 as the best number of clusters 
## * 3 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************

table(nc$Best.n[1, ])

## 
##  0  2  3  4  5  6  8 13 14 15 
##  2  6  3  1  1  2  5  1  2  3

splom(Z, groups = nc$Best.partition)

table(nc$Best.partition)

## 
##  1  2 
## 22  8

split(x = ManlyTb1.5[, 1], f = nc$Best.partition)

## $`1`
##  [1] "Belgica"       "Dinamarca"     "Franca"        "Alemanha"     
##  [5] "Grecia"        "Irlanda"       "Italia"        "Luxemburgo"   
##  [9] "Paises Baixos" "Portugal"      "Espanha"       "Reino Unido"  
## [13] "Austria"       "Finlandia"     "Islandia"      "Norurega"     
## [17] "Suecia"        "Suica"         "Iugoslavia"    "Chipre"       
## [21] "Gibraltar"     "Malta"        
## 
## $`2`
## [1] "Albania"                     "Bulgaria"                   
## [3] "Republica Tcheca/Eslovaquia" "Hungria"                    
## [5] "Polonia"                     "Romenia"                    
## [7] "USSR"                        "Turquia"

#-----------------------------------------------------------------------
# Os agrupamentos vistos com os primeiros scores.

pc <- princomp(Z)
summary(pc)

## Importance of components:
##                          Comp.1    Comp.2    Comp.3    Comp.4     Comp.5
## Standard deviation     1.737796 1.3444684 1.1511163 1.0160728 0.79149291
## Proportion of Variance 0.347119 0.2077696 0.1523067 0.1186671 0.07200701
## Cumulative Proportion  0.347119 0.5548885 0.7071953 0.8258624 0.89786940
##                            Comp.6     Comp.7     Comp.8      Comp.9
## Standard deviation     0.58277877 0.55904883 0.42721031 0.232079414
## Proportion of Variance 0.03903806 0.03592363 0.02097801 0.006190903
## Cumulative Proportion  0.93690746 0.97283109 0.99380910 1.000000000

screeplot(pc, type = "lines")

biplot(pc)

xyplot(pc$scores[, 2] ~ pc$scores[, 1],
       groups = nc$Best.partition)

splom(pc$scores[, 1:4],
      groups = nc$Best.partition)

2 Agrupamento hierárquico

ManlyTb1.4

##                grup largm altm comppm largpm comppt comppq
## 1       Cao moderno   9.7 21.0   19.4    7.7   32.0   36.5
## 2    Chacal dourado   8.1 16.7   18.3    7.0   30.3   32.9
## 3       Lobo chines  13.5 27.3   26.8   10.6   41.9   48.1
## 4      Lobo indiano  11.5 24.3   24.5    9.3   40.0   44.6
## 5              Cuon  10.7 23.5   21.4    8.5   28.8   37.6
## 6             Dingo   9.6 22.6   21.1    8.3   34.4   43.1
## 7 Cao pre-historico  10.3 22.1   19.1    8.1   32.2   35.0

Z <- scale(ManlyTb1.4[, -1])
rownames(Z) <- as.character(ManlyTb1.4[, 1])
Z

##                        largm        altm      comppm     largpm
## Cao moderno       -0.4628718 -0.46166596 -0.68197696 -0.6868366
## Chacal dourado    -1.4054471 -1.78510838 -1.03678930 -1.2878186
## Lobo chines        1.7757445  1.47733107  1.70494241  1.8029460
## Lobo indiano       0.5975254  0.55399915  0.96306206  0.6868366
## Cuon               0.1262378  0.30777731 -0.03686362  0.0000000
## Dingo             -0.5217828  0.03077773 -0.13363062 -0.1717091
## Cao pre-historico -0.1094061 -0.12311092 -0.77874396 -0.3434183
##                        comppt     comppq
## Cao moderno       -0.45150929 -0.5674490
## Chacal dourado    -0.79592984 -1.2086918
## Lobo chines        1.55423391  1.4987779
## Lobo indiano       1.16929330  0.8753473
## Cuon              -1.09983033 -0.3715137
## Dingo              0.03473148  0.6081628
## Cao pre-historico -0.41098923 -0.8346335
## attr(,"scaled:center")
##    largm     altm   comppm   largpm   comppt   comppq 
## 10.48571 22.50000 21.51429  8.50000 34.22857 39.68571 
## attr(,"scaled:scale")
##    largm     altm   comppm   largpm   comppt   comppq 
## 1.697477 3.249102 3.100230 1.164760 4.935826 5.614098

# Variáveis métricas.
X <- ManlyTb1.5[, -(1:2)]
Z <- scale(X)
rownames(Z) <- ManlyTb1.5[, 1]
Z

##                                     afp         mep         fab        fea
## Belgica                     -0.77896674 -0.36620404  0.13850687  0.0000000
## Dinamarca                   -0.53520107 -0.37748342  0.09685067 -0.1610564
## Franca                      -0.57582868 -0.35492465  0.07602257  0.1610564
## Alemanha                    -0.73021361 -0.30980711  0.55506889  0.3221129
## Grecia                       0.81363563 -0.33236588 -0.02811794  0.3221129
## Irlanda                      0.13109176 -0.32108650  0.03436637  0.6442258
## Italia                      -0.30768645 -0.26468957  0.25306143 -1.2884515
## Luxemburgo                  -0.72208808 -0.37748342  0.01353827 -0.1610564
## Paises Baixos               -0.64895838 -0.37748342 -0.02811794 -0.1610564
## Portugal                    -0.05579525 -0.33236588  0.43010028 -0.1610564
## Espanha                     -0.18580361 -0.33236588  0.16974902 -0.3221129
## Reino Unido                 -0.81146883 -0.30980711  0.19057712  0.6442258
## Austria                     -0.38894167 -0.35492465  0.77376395  0.6442258
## Finlandia                   -0.29956092 -0.36620404  0.71127965  0.6442258
## Islandia                    -0.13705048 -0.38876281 -0.08018819  0.1610564
## Norurega                    -0.51895003 -0.26468957 -0.50716426  0.4831693
## Suecia                      -0.73021361 -0.35492465 -0.04894604  0.0000000
## Suica                       -0.53520107 -0.38876281  0.54465484 -1.2884515
## Albania                      3.51943457  1.79943790 -2.02761563 -1.2884515
## Bulgaria                     0.55361892 -0.38876281 -1.66312386 -1.2884515
## Republica Tcheca/Eslovaquia  0.04983654  3.81844784 -2.02761563 -1.2884515
## Hungria                      0.25297459  2.87097949 -2.02761563 -1.2884515
## Polonia                      0.92739295  0.05113321  0.48217054  0.1610564
## Romenia                      0.79738459 -0.09549879  1.91930950  1.9326773
## USSR                         0.51299131 -0.38876281  0.97163091 -1.2884515
## Iugoslavia                  -0.58395420 -0.14061633  2.00262191  2.2547902
## Chipre                       0.10671519 -0.35492465 -0.04894604 -0.4831693
## Gibraltar                   -0.99023032 -0.38876281 -1.31946019  1.9326773
## Malta                       -0.77896674 -0.32108650  0.87790445  1.1273951
## Turquia                      2.65000368 -0.28724834 -0.43426590 -0.9663386
##                                    con        ser          fin         ssp
## Belgica                     -0.4500407  0.2421102  0.510937277  1.13451620
## Dinamarca                   -0.4134521 -0.2303000  0.611285023  1.06580392
## Franca                      -0.1573313  0.2027427  0.887241326  0.69933839
## Alemanha                     0.6842083  0.3011615  0.736719707  0.16109214
## Grecia                      -0.2670973  0.4979991 -0.342018570 -0.82378397
## Irlanda                     -0.1573313  0.4192641  0.435676467 -0.17101725
## Italia                       0.5744422  1.1672469 -0.517627126  0.11528395
## Luxemburgo                   0.8671517  1.0885118  0.510937277  0.29851671
## Paises Baixos               -2.5355954  0.5570504  1.213371503  1.29484487
## Portugal                     0.2451441  0.8129392  0.009198543 -0.27408568
## Espanha                      0.7207970  0.8719905 -0.191496950 -0.03359267
## Reino Unido                 -0.1939200  0.8916743  1.439153933  0.16109214
## Austria                      0.3549102  0.6751529  0.009198543 -0.42296230
## Finlandia                   -0.2670973 -0.2106162  0.485850340  0.71079043
## Islandia                     0.9037403 -0.2303000  0.335328720  0.42448924
## Norurega                    -0.3768634  0.3798965  0.234980973  1.20322849
## Suecia                      -0.4134521 -0.2893513  0.686545833  1.43226945
## Suica                        0.6110309  0.9507255  1.012676010 -0.44586640
## Albania                     -1.5111124 -2.2380433  2.166675096 -3.09128944
## Bulgaria                    -0.3036860 -1.2341717 -1.295322163 -0.69781145
## Republica Tcheca/Eslovaquia  0.3183215 -1.0767016 -1.270235226 -0.46877049
## Hungria                     -0.4134521 -0.4665051 -1.671626213  0.03511961
## Polonia                     -0.4500407 -1.0570178 -1.345496036 -0.28553773
## Romenia                     -0.6329841 -1.7262656 -1.521104593 -1.33912613
## USSR                         0.9769177 -1.5294280 -1.521104593 -0.15956520
## Iugoslavia                   0.2085555 -0.3680863 -0.893931176 -0.90394831
## Chipre                       0.5744422  1.5806058  0.009198543 -0.66345530
## Gibraltar                    3.4283591  1.7380759  1.037762946  0.80240682
## Malta                       -1.0720483 -1.0767016 -0.693235683  1.67276245
## Turquia                     -0.8525162 -0.6436589 -1.069539733 -1.43074251
##                                      tc
## Belgica                      0.35026166
## Dinamarca                    0.45378727
## Franca                       0.14321043
## Alemanha                    -0.27089202
## Grecia                       0.40202446
## Irlanda                     -0.16736641
## Italia                      -0.42618044
## Luxemburgo                   0.35026166
## Paises Baixos                0.35026166
## Portugal                    -0.68499447
## Espanha                     -0.16736641
## Reino Unido                  0.19497324
## Austria                      0.14321043
## Finlandia                    0.71260130
## Islandia                     0.29849885
## Norurega                     1.02317814
## Suecia                       0.55731288
## Suica                        0.03968482
## Albania                     -1.61672498
## Bulgaria                     0.71260130
## Republica Tcheca/Eslovaquia  0.40202446
## Hungria                      1.38551778
## Polonia                     -0.47794324
## Romenia                      0.35026166
## USSR                         1.17846656
## Iugoslavia                   0.86788972
## Chipre                      -2.85903233
## Gibraltar                   -2.91079514
## Malta                        0.55731288
## Turquia                     -0.89204569
## attr(,"scaled:center")
##       afp       mep       fab       fea       con       ser       fin 
## 12.186667  3.446667 19.470000  0.800000  7.530000 15.670000  6.663333 
##       ssp        tc 
## 26.993333  6.123333 
## attr(,"scaled:scale")
##        afp        mep        fab        fea        con        ser 
## 12.3069012  8.8657315  9.6024117  0.6209003  2.7330859  5.0803306 
##        fin        ssp         tc 
##  3.9861383  8.7320627  1.9318891

X <- MingotiTb6.8[, -1]
Z <- scale(X)
rownames(Z) <- MingotiTb6.8[, 1]

# help(dist, help_type = "html")
# method = {"euclidean", "maximum", "manhattan", "canberra", "binary",
#     "minkowski"}

# Distâncias entre cada par de observações.
dis <- dist(as.matrix(Z), method = "euclidian")
dis

##                Reino Unido  Australia     Canada Estados Unidos      Japao
## Australia       0.19355717                                                
## Canada          0.20779029 0.06214472                                     
## Estados Unidos  0.27814083 0.23083305 0.18944523                          
## Japao           0.34306768 0.27686469 0.24307601     0.36501441           
## Franca          0.11524943 0.23117298 0.21629167     0.27115746 0.27976169
## Cingapura       0.56346107 0.51578127 0.49426773     0.55841921 0.38504873
## Argentina       0.81503568 0.99573052 0.99835354     1.01201180 0.97486121
## Uruguai         0.79304403 0.91853110 0.94118345     1.01703855 0.91976319
## Cuba            1.57464604 1.74300711 1.75509726     1.81103978 1.69268529
## Colombia        2.62053376 2.80240051 2.79453633     2.77759859 2.74107540
## Brasil          1.39730464 1.55967812 1.56089377     1.56280255 1.52095232
## Paraguai        2.38117422 2.56418995 2.56221781     2.56335473 2.50400764
## Egito           2.30077327 2.43849365 2.43285246     2.45416892 2.32476330
## Nigeria         4.12434564 4.29743418 4.29634370     4.28781833 4.23679822
## Senegal         3.99268232 4.13795639 4.12793757     4.12067522 4.03017525
## Serra Leoa      5.20377422 5.36000779 5.35551180     5.33990817 5.28765187
## Angola          4.79232674 4.95631145 4.94060936     4.89963995 4.86422212
## Etiopia         4.61000337 4.75452588 4.75081624     4.74404744 4.67620299
## Mocambique      4.44243520 4.57073694 4.56886398     4.55770153 4.51054149
## China           1.72871565 1.88158340 1.89033659     1.93594702 1.81731811
##                    Franca  Cingapura  Argentina    Uruguai       Cuba
## Australia                                                            
## Canada                                                               
## Estados Unidos                                                       
## Japao                                                                
## Franca                                                               
## Cingapura      0.53572548                                            
## Argentina      0.79962669 1.00873195                                 
## Uruguai        0.82056450 0.83922181 0.52273547                      
## Cuba           1.56108474 1.73851071 0.85221230 1.05471980           
## Colombia       2.57826852 2.83055887 1.89897272 2.32307348 1.42530436
## Brasil         1.38785238 1.41533081 0.65612362 0.81139431 0.82648468
## Paraguai       2.34942738 2.54271845 1.59630059 1.94746841 1.02269495
## Egito          2.27008853 2.16311141 1.58678209 1.65971252 1.30277924
## Nigeria        4.10520655 4.13691211 3.31754977 3.50771308 2.78800690
## Senegal        3.95998198 3.85456871 3.24687526 3.37222754 2.86413655
## Serra Leoa     5.18483894 5.11503352 4.43676407 4.55809423 4.00763735
## Angola         4.75208952 4.75042321 4.02465495 4.28389218 3.64392021
## Etiopia        4.59325154 4.47425463 3.87369095 3.94049664 3.49259593
## Mocambique     4.43662764 4.26173028 3.77759800 3.77095591 3.53002192
## China          1.72007417 1.72277325 0.99114196 1.02759921 0.64018451
##                  Colombia     Brasil   Paraguai      Egito    Nigeria
## Australia                                                            
## Canada                                                               
## Estados Unidos                                                       
## Japao                                                                
## Franca                                                               
## Cingapura                                                            
## Argentina                                                            
## Uruguai                                                              
## Cuba                                                                 
## Colombia                                                             
## Brasil         1.75721371                                            
## Paraguai       0.54485463 1.33825611                                 
## Egito          1.81601873 1.02174755 1.34700726                      
## Nigeria        2.23607847 2.75545568 2.04116381 2.09993050           
## Senegal        2.60147614 2.63923886 2.32477032 1.74851973 1.12248639
## Serra Leoa     3.57079502 3.81203313 3.36083930 3.02993261 1.40239103
## Angola         2.83188240 3.48389970 2.79973410 2.78997555 1.29118690
## Etiopia        3.29649696 3.23122080 2.99498604 2.41058540 1.33348354
## Mocambique     3.57976631 3.12247186 3.21159177 2.40049057 1.89335274
## China          1.70543417 0.54131547 1.20286998 0.78347250 2.52769902
##                   Senegal Serra Leoa     Angola    Etiopia Mocambique
## Australia                                                            
## Canada                                                               
## Estados Unidos                                                       
## Japao                                                                
## Franca                                                               
## Cingapura                                                            
## Argentina                                                            
## Uruguai                                                              
## Cuba                                                                 
## Colombia                                                             
## Brasil                                                               
## Paraguai                                                             
## Egito                                                                
## Nigeria                                                              
## Senegal                                                              
## Serra Leoa     1.39271139                                            
## Angola         1.37024188 1.33467034                                 
## Etiopia        0.88634951 0.78523156 1.59216786                      
## Mocambique     1.32823508 1.44631896 2.20491693 0.79112688           
## China          2.43706482 3.61222383 3.37501936 3.02465361 2.97869150

# help(hclust, help_type = "html")
# method = {"ward.D", "ward.D2", "single", "complete", "average"
#   (=UPGMA), "mcquitty" (=WPGMA), "median" (=WPGMC), "centroid" (=
#   UPGMC)}

layout(1)
hcl <- hclust(dis, method = "average")
# hcl <- hclust(dis, method = "ward.D")
plot(hcl, hang = -1)

# Cortando de 2 a 4 grupos.
cutree(hcl, k = 2:4)

##                2 3 4
## Reino Unido    1 1 1
## Australia      1 1 1
## Canada         1 1 1
## Estados Unidos 1 1 1
## Japao          1 1 1
## Franca         1 1 1
## Cingapura      1 1 1
## Argentina      1 1 2
## Uruguai        1 1 2
## Cuba           1 1 2
## Colombia       1 2 3
## Brasil         1 1 2
## Paraguai       1 2 3
## Egito          1 1 2
## Nigeria        2 3 4
## Senegal        2 3 4
## Serra Leoa     2 3 4
## Angola         2 3 4
## Etiopia        2 3 4
## Mocambique     2 3 4
## China          1 1 2

#-----------------------------------------------------------------------
# Aprimorando a exibição fazendo cortes.

library(dendextend)

den <- as.dendrogram(hcl)
k <- 2

plot(den, ylab = "Distância")
rect.hclust(hcl, k = 2, border = "red")

den <- color_branches(dend = den, k = k)
plot(den, ylab = "Distância")
rect.dendrogram(den,
                k = k,
                lty = 2,
                border = "gray30")

# Determina o número ótimo de grupos.
den_k <- find_k(den)
plot(den_k)

plot(color_branches(den, k = den_k$nc))

#-----------------------------------------------------------------------

library(pvclust)

pvc <- pvclust(t(Z),
               method.dist = "euclidean",
               method.hclust = "average",
               nboot = 2000)

## Bootstrap (r = 0.5)... Done.
## Bootstrap (r = 0.5)... Done.
## Bootstrap (r = 0.5)... Done.
## Bootstrap (r = 0.75)... Done.
## Bootstrap (r = 0.75)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.25)... Done.
## Bootstrap (r = 1.25)... Done.

pvc

## 
## Cluster method: average
## Distance      : euclidean
## 
## Estimates on edges:
## 
##       au    bp se.au se.bp      v     c  pchi
## 1  0.975 0.954 0.005 0.002 -1.820 0.139 0.000
## 2  0.939 0.705 0.006 0.004 -1.044 0.504 0.007
## 3  0.848 0.243 0.014 0.004 -0.167 0.862 0.000
## 4  0.704 0.528 0.018 0.004 -0.304 0.233 0.000
## 5  0.989 0.793 0.002 0.003 -1.556 0.739 0.040
## 6  0.926 0.771 0.008 0.004 -1.095 0.354 0.000
## 7  0.936 0.686 0.007 0.004 -1.004 0.520 0.416
## 8  0.744 0.406 0.017 0.004 -0.209 0.446 0.000
## 9  0.948 0.714 0.006 0.004 -1.093 0.529 0.005
## 10 0.999 0.216 0.000 0.004 -1.232 2.016 0.004
## 11 0.716 0.189 0.021 0.003  0.155 0.725 0.000
## 12 0.471 0.094 0.060 0.002  0.695 0.622 0.834
## 13 0.938 0.439 0.007 0.004 -0.691 0.846 0.000
## 14 0.588 0.336 0.020 0.004  0.100 0.322 0.006
## 15 0.923 0.272 0.009 0.004 -0.407 1.015 0.010
## 16 0.957 0.285 0.006 0.004 -0.576 1.144 0.000
## 17 0.887 0.827 0.011 0.003 -1.076 0.135 0.000
## 18 0.564 0.386 0.020 0.004  0.063 0.225 0.000
## 19 0.938 0.688 0.006 0.004 -1.012 0.523 0.000
## 20 1.000 1.000 0.000 0.000  0.000 0.000 0.000

dim(Z)

## [1] 21  4

plot(pvc)
pvrect(pvc, alpha = 0.9)

3 DBSCAN

Density Based Clustering of Applications with Noise (DBSCAN);
https://cran.r-project.org/web/packages/dbscan/vignettes/dbscan.pdf;
https://github.com/mhahsler/dbscan;
http://www.sthda.com/english/wiki/print.php?id=246;
http://en.proft.me/2017/02/3/density-based-clustering-r/;

# Dois pacotes com o algoritmo implementado.
# library(dbscan)
library(fpc)

# help(dbscan, help_type = "html")

# dbscan(data,
#        eps,
#        MinPts = 5,
#        scale = FALSE,
#        method = c("hybrid", "raw", "dist"))

# Usando o iris.
X <- scale(iris[, 1:2])

set.seed(123)
db <- fpc::dbscan(X, eps = 0.25, MinPts = 3)

class(db)

## [1] "dbscan"

methods(class = class(db))

## [1] plot    predict print  
## see '?methods' for accessing help and source code

db

## dbscan Pts=150 MinPts=3 eps=0.25
##         0  1  2 3 4  5 6 7 8 9
## border 33  2  1 1 2  1 0 1 0 2
## seed    0 17 11 5 1 61 4 4 3 1
## total  33 19 12 6 3 62 4 5 3 3

# Plot DBSCAN results.
plot(db, X, main = "DBSCAN", frame = FALSE)
box()

# Predição.
predict(db)

##   [1] 1 2 2 2 1 0 1 1 4 2 3 1 2 4 0 0 0 1 0 3 1 3 0 1 1 2 1 1 1 2 2 1 0 0 2
##  [36] 1 0 1 4 1 1 0 2 1 3 2 3 2 3 1 5 6 5 5 5 5 7 9 5 0 0 5 8 5 5 5 5 5 8 5
##  [71] 0 5 0 5 5 5 0 5 5 5 5 5 5 5 5 7 5 0 5 5 5 5 5 0 5 5 5 5 9 5 7 5 0 5 5
## [106] 0 9 0 0 0 6 5 5 5 5 6 5 0 0 8 5 5 0 5 0 5 5 5 5 0 0 0 5 5 0 0 7 6 5 5
## [141] 5 5 5 5 0 5 0 5 7 5

#-----------------------------------------------------------------------

# Cria grid cruzando valores dos parâmetros de tunning.
grid <- expand.grid(eps = c(0.05, 0.1, 0.25, 0.5, 0.8),
                    MinPts = c(1, 2, 3, 5, 8, 13),
                    KEEP.OUT.ATTRS = FALSE)

# Aplica o algoritmo para cada condição de tunning.
m <- mapply(FUN = dbscan,
            eps = grid$eps,
            MinPts = grid$MinPts,
            MoreArgs = list(data = X))

# Empilha as predições de cada condição gerando vetor.
pred <- c(sapply(m, predict))

# Junta as observações, as condições e a predição.
pred <- cbind(X,
              grid[rep(seq_len(nrow(grid)),
                       each = nrow(X)), ],
              clas = pred)
str(pred)

## 'data.frame':    4500 obs. of  5 variables:
##  $ Sepal.Length: num  -0.898 -1.139 -1.381 -1.501 -1.018 ...
##  $ Sepal.Width : num  1.0156 -0.1315 0.3273 0.0979 1.245 ...
##  $ eps         : num  0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 ...
##  $ MinPts      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ clas        : num  1 2 3 4 5 6 7 8 9 10 ...

# Converte para fator.
pred <- transform(pred,
                  eps = factor(eps),
                  MinPts = factor(MinPts))

library(grid)

useOuterStrips(
    xyplot(Sepal.Length ~ Sepal.Width | eps + MinPts,
           groups = clas,
           # auto.key = list(columns = 5),
           as.table = TRUE,
           data = pred)) +
    layer({
        n <- nlevels(factor(groups[subscripts]))
        grid.text(x = 0.9, y = 0.9, label = n)
    })

#-----------------------------------------------------------------------

# Número ótimo do parâmetro de tunning.
MinPts <- 2
dbscan::kNNdistplot(X, k = MinPts)

eps <- 0.4
abline(h = eps, lty = 2)

set.seed(456)
db <- fpc::dbscan(X, eps = eps, MinPts = MinPts)
db

## dbscan Pts=150 MinPts=2 eps=0.4
##         0  1 2  3 4 5 6 7
## border 12  0 0  0 0 0 0 0
## seed    0 42 2 82 4 4 2 2
## total  12 42 2 82 4 4 2 2

p <- predict(db)

# Plot DBSCAN results.
plot(db, X, frame = FALSE, asp = 1, type = "n")
points(X, col = p, pch = 19)
points(X[p == 0, ], pch = 4)
for (i in seq_len(nrow(X))) {
    plotrix::draw.circle(X[i, 1],
                         X[i, 2],
                         radius = eps,
                         border = "gray50")
}
box()

# Usando rpanel para controlar.
library(rpanel)

action <- function(panel) {
    with(panel, {
        EPS <- as.numeric(eps)
        MINPTS <- as.numeric(MinPts)
        db <- dbscan::dbscan(X,
                             eps = EPS,
                             minPts = MINPTS)
        p <- predict(db)
        plot(db, X, frame = FALSE, asp = 1, type = "n")
        points(X, col = p, pch = 19)
        points(X[p == 0, ], pch = 4)
        if (circle) {
            for (i in seq_len(nrow(X))) {
                plotrix::draw.circle(X[i, 1],
                                     X[i, 2],
                                     radius = EPS,
                                     border = "gray70")
            }
        }
        box()
    })
    return(panel)
}

# apropos("^rp\\.")
# help(package = "rpanel", help_type = "html")

panel <- rp.control()
# rp.textentry(panel = panel,
#              variable = eps,
#              action = action,
#              labels = c("eps"),
#              initval = c(0.4))
# rp.textentry(panel = panel,
#              variable = MinPts,
#              action = action,
#              labels = c("minPts"),
#              initval = c(5))
rp.checkbox(panel = panel,
            variable = circle,
            action = action,
            title = "Circulos?",
            initval = FALSE)
rp.doublebutton(panel = panel,
                variable = eps,
                step = 0.05,
                initval = 0.4,
                range = c(0.05, 0.8),
                action = action,
                title = "eps",
                showvalue = TRUE)
rp.doublebutton(panel = panel,
                variable = MinPts,
                step = 1,
                initval = 4,
                range = c(1, 8),
                action = action,
                title = "MinPts",
                showvalue = TRUE)

Machine Learning

`leg.ufpr.br/~walmes/ensino/ML`

Análise de Agrupamento

Prof. Walmes M. Zeviani & Prof. Eduardo V. Ferreira

2017-10-26

1 K-means

2 Agrupamento hierárquico

3 DBSCAN