1 Classificador de k vizinhos mais próximos

1.1 Preparando os dados

rm(list = objects())
library(lattice)
library(latticeExtra)

#-----------------------------------------------------------------------
# Aquisição dos dados.

url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
da <- read.csv(url)

# x <- "radius
# texture
# perimeter
# area
# smoothness
# compactness
# concavity
# concave
# symmetry
# fractal"
# dput(readLines(textConnection(x)))

# Criando o nome das variáveis.
nms <- c("radius", "texture", "perimeter", "area", "smoothness",
         "compactness", "concavity", "concave", "symmetry",
         "fractal")
n <- outer(Y = c("mn", "sd", "lg"),
           X = nms,
           FUN = paste, sep = "_")
names(da) <- c("id", "diagnosis", n)
str(da)

## 'data.frame':    568 obs. of  32 variables:
##  $ id            : int  842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 845636 ...
##  $ diagnosis     : Factor w/ 2 levels "B","M": 2 2 2 2 2 2 2 2 2 2 ...
##  $ radius_mn     : num  20.6 19.7 11.4 20.3 12.4 ...
##  $ texture_mn    : num  17.8 21.2 20.4 14.3 15.7 ...
##  $ perimeter_mn  : num  132.9 130 77.6 135.1 82.6 ...
##  $ area_mn       : num  1326 1203 386 1297 477 ...
##  $ smoothness_mn : num  0.0847 0.1096 0.1425 0.1003 0.1278 ...
##  $ compactness_mn: num  0.0786 0.1599 0.2839 0.1328 0.17 ...
##  $ concavity_mn  : num  0.0869 0.1974 0.2414 0.198 0.1578 ...
##  $ concave_mn    : num  0.0702 0.1279 0.1052 0.1043 0.0809 ...
##  $ symmetry_mn   : num  0.181 0.207 0.26 0.181 0.209 ...
##  $ fractal_mn    : num  0.0567 0.06 0.0974 0.0588 0.0761 ...
##  $ radius_sd     : num  0.543 0.746 0.496 0.757 0.335 ...
##  $ texture_sd    : num  0.734 0.787 1.156 0.781 0.89 ...
##  $ perimeter_sd  : num  3.4 4.58 3.44 5.44 2.22 ...
##  $ area_sd       : num  74.1 94 27.2 94.4 27.2 ...
##  $ smoothness_sd : num  0.00522 0.00615 0.00911 0.01149 0.00751 ...
##  $ compactness_sd: num  0.0131 0.0401 0.0746 0.0246 0.0335 ...
##  $ concavity_sd  : num  0.0186 0.0383 0.0566 0.0569 0.0367 ...
##  $ concave_sd    : num  0.0134 0.0206 0.0187 0.0188 0.0114 ...
##  $ symmetry_sd   : num  0.0139 0.0225 0.0596 0.0176 0.0216 ...
##  $ fractal_sd    : num  0.00353 0.00457 0.00921 0.00511 0.00508 ...
##  $ radius_lg     : num  25 23.6 14.9 22.5 15.5 ...
##  $ texture_lg    : num  23.4 25.5 26.5 16.7 23.8 ...
##  $ perimeter_lg  : num  158.8 152.5 98.9 152.2 103.4 ...
##  $ area_lg       : num  1956 1709 568 1575 742 ...
##  $ smoothness_lg : num  0.124 0.144 0.21 0.137 0.179 ...
##  $ compactness_lg: num  0.187 0.424 0.866 0.205 0.525 ...
##  $ concavity_lg  : num  0.242 0.45 0.687 0.4 0.535 ...
##  $ concave_lg    : num  0.186 0.243 0.258 0.163 0.174 ...
##  $ symmetry_lg   : num  0.275 0.361 0.664 0.236 0.399 ...
##  $ fractal_lg    : num  0.089 0.0876 0.173 0.0768 0.1244 ...

# head(da[, c(3, 13, 23)])
head(da[, grepl("radius", names(da))])

##   radius_mn radius_sd radius_lg
## 1     20.57    0.5435     24.99
## 2     19.69    0.7456     23.57
## 3     11.42    0.4956     14.91
## 4     20.29    0.7572     22.54
## 5     12.45    0.3345     15.47
## 6     18.25    0.4467     22.88

#-----------------------------------------------------------------------
# Análise exploratória.

# Tabela de frequência.
xtabs(~diagnosis, data = da)

## diagnosis
##   B   M 
## 357 211

# Criando a formula com todas as medidas.
f <- sprintf("%s ~ diagnosis",
             paste(names(da)[-(1:2)], collapse = " + "))

# Opções estéticas do gráfico de caixas.
grep(x = names(trellis.par.get()), pattern = "box", value = TRUE)

## [1] "box.dot"       "box.rectangle" "box.umbrella"  "box.3d"

# Gráfico de caixas.
bwplot(as.formula(f),
       data = da,
       outer = TRUE,
       as.table = TRUE,
       pch = "|",
       fill = "gray",
       scales = "free",
       par.settings = list(
           box.umbrella = list(lty = 1)))

# Só os valores médios.
splom(~da[, grepl(x = names(da), pattern = "_mn")],
      groups = da$diagnosis,
      as.matrix = TRUE,
      auto.key = TRUE)

# Só os desvios padrões.
splom(~da[, grepl(x = names(da), pattern = "_sd")],
      groups = da$diagnosis,
      as.matrix = TRUE,
      auto.key = TRUE)

# Só a média dos extremos.
splom(~da[, grepl(x = names(da), pattern = "_lg")],
      groups = da$diagnosis,
      as.matrix = TRUE,
      auto.key = TRUE)

#-----------------------------------------------------------------------
# Padronização de escala.

# Para verificar as escalas das medidas.
summary(da)

##        id            diagnosis   radius_mn        texture_mn   
##  Min.   :     8670   B:357     Min.   : 6.981   Min.   : 9.71  
##  1st Qu.:   869222   M:211     1st Qu.:11.697   1st Qu.:16.18  
##  Median :   906157             Median :13.355   Median :18.86  
##  Mean   : 30423820             Mean   :14.120   Mean   :19.31  
##  3rd Qu.:  8825022             3rd Qu.:15.780   3rd Qu.:21.80  
##  Max.   :911320502             Max.   :28.110   Max.   :39.28  
##   perimeter_mn       area_mn       smoothness_mn     compactness_mn   
##  Min.   : 43.79   Min.   : 143.5   Min.   :0.05263   Min.   :0.01938  
##  1st Qu.: 75.14   1st Qu.: 420.2   1st Qu.:0.08629   1st Qu.:0.06481  
##  Median : 86.21   Median : 548.8   Median :0.09587   Median :0.09252  
##  Mean   : 91.91   Mean   : 654.3   Mean   :0.09632   Mean   :0.10404  
##  3rd Qu.:103.88   3rd Qu.: 782.6   3rd Qu.:0.10530   3rd Qu.:0.13040  
##  Max.   :188.50   Max.   :2501.0   Max.   :0.16340   Max.   :0.34540  
##   concavity_mn       concave_mn       symmetry_mn       fractal_mn     
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.1060   Min.   :0.04996  
##  1st Qu.:0.02954   1st Qu.:0.02031   1st Qu.:0.1619   1st Qu.:0.05770  
##  Median :0.06140   Median :0.03345   Median :0.1792   Median :0.06152  
##  Mean   :0.08843   Mean   :0.04875   Mean   :0.1811   Mean   :0.06277  
##  3rd Qu.:0.12965   3rd Qu.:0.07373   3rd Qu.:0.1956   3rd Qu.:0.06612  
##  Max.   :0.42680   Max.   :0.20120   Max.   :0.3040   Max.   :0.09744  
##    radius_sd        texture_sd      perimeter_sd       area_sd       
##  Min.   :0.1115   Min.   :0.3602   Min.   : 0.757   Min.   :  6.802  
##  1st Qu.:0.2324   1st Qu.:0.8331   1st Qu.: 1.605   1st Qu.: 17.850  
##  Median :0.3240   Median :1.1095   Median : 2.285   Median : 24.485  
##  Mean   :0.4040   Mean   :1.2174   Mean   : 2.856   Mean   : 40.138  
##  3rd Qu.:0.4773   3rd Qu.:1.4743   3rd Qu.: 3.337   3rd Qu.: 45.017  
##  Max.   :2.8730   Max.   :4.8850   Max.   :21.980   Max.   :542.200  
##  smoothness_sd      compactness_sd      concavity_sd    
##  Min.   :0.001713   Min.   :0.002252   Min.   :0.00000  
##  1st Qu.:0.005166   1st Qu.:0.013048   1st Qu.:0.01506  
##  Median :0.006374   Median :0.020435   Median :0.02587  
##  Mean   :0.007042   Mean   :0.025437   Mean   :0.03186  
##  3rd Qu.:0.008151   3rd Qu.:0.032218   3rd Qu.:0.04176  
##  Max.   :0.031130   Max.   :0.135400   Max.   :0.39600  
##    concave_sd        symmetry_sd         fractal_sd          radius_lg    
##  Min.   :0.000000   Min.   :0.007882   Min.   :0.0008948   Min.   : 7.93  
##  1st Qu.:0.007634   1st Qu.:0.015128   1st Qu.:0.0022445   1st Qu.:13.01  
##  Median :0.010920   Median :0.018725   Median :0.0031615   Median :14.96  
##  Mean   :0.011789   Mean   :0.020526   Mean   :0.0037907   Mean   :16.25  
##  3rd Qu.:0.014710   3rd Qu.:0.023398   3rd Qu.:0.0045258   3rd Qu.:18.77  
##  Max.   :0.052790   Max.   :0.078950   Max.   :0.0298400   Max.   :36.04  
##    texture_lg     perimeter_lg       area_lg       smoothness_lg    
##  Min.   :12.02   Min.   : 50.41   Min.   : 185.2   Min.   :0.07117  
##  1st Qu.:21.09   1st Qu.: 84.10   1st Qu.: 515.0   1st Qu.:0.11660  
##  Median :25.43   Median : 97.66   Median : 685.5   Median :0.13130  
##  Mean   :25.69   Mean   :107.13   Mean   : 878.6   Mean   :0.13232  
##  3rd Qu.:29.76   3rd Qu.:125.17   3rd Qu.:1073.5   3rd Qu.:0.14600  
##  Max.   :49.54   Max.   :251.20   Max.   :4254.0   Max.   :0.22260  
##  compactness_lg     concavity_lg      concave_lg       symmetry_lg    
##  Min.   :0.02729   Min.   :0.0000   Min.   :0.00000   Min.   :0.1565  
##  1st Qu.:0.14690   1st Qu.:0.1145   1st Qu.:0.06473   1st Qu.:0.2504  
##  Median :0.21185   Median :0.2266   Median :0.09984   Median :0.2821  
##  Mean   :0.25354   Mean   :0.2714   Mean   :0.11434   Mean   :0.2898  
##  3rd Qu.:0.33760   3rd Qu.:0.3814   3rd Qu.:0.16132   3rd Qu.:0.3177  
##  Max.   :1.05800   Max.   :1.2520   Max.   :0.29100   Max.   :0.6638  
##    fractal_lg     
##  Min.   :0.05504  
##  1st Qu.:0.07141  
##  Median :0.08002  
##  Mean   :0.08388  
##  3rd Qu.:0.09206  
##  Max.   :0.20750

# Para padronizar para intervalo unitário.
to_unit <- function(x, ...) {
    z <- x - min(x, ...)
    return(z/max(z, ...))
}

# 1, 2, 3, testando...
sort(to_unit(rnorm(5)))

## [1] 0.0000000 0.1808968 0.3273864 0.8289552 1.0000000

# Criando uma versão com valores padronizados.
db <- da
db$id <- NULL
db[, -1] <- as.data.frame(lapply(da[, -(1:2)],
                                 FUN = to_unit))
summary(db)

##  diagnosis   radius_mn        texture_mn      perimeter_mn   
##  B:357     Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  M:211     1st Qu.:0.2232   1st Qu.:0.2187   1st Qu.:0.2166  
##            Median :0.3017   Median :0.3093   Median :0.2931  
##            Mean   :0.3379   Mean   :0.3245   Mean   :0.3326  
##            3rd Qu.:0.4164   3rd Qu.:0.4089   3rd Qu.:0.4152  
##            Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##     area_mn       smoothness_mn    compactness_mn    concavity_mn    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.1174   1st Qu.:0.3039   1st Qu.:0.1394   1st Qu.:0.06921  
##  Median :0.1719   Median :0.3903   Median :0.2244   Median :0.14386  
##  Mean   :0.2167   Mean   :0.3944   Mean   :0.2597   Mean   :0.20719  
##  3rd Qu.:0.2711   3rd Qu.:0.4755   3rd Qu.:0.3405   3rd Qu.:0.30377  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##    concave_mn      symmetry_mn       fractal_mn       radius_sd      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.1009   1st Qu.:0.2823   1st Qu.:0.1630   1st Qu.:0.04377  
##  Median :0.1663   Median :0.3697   Median :0.2434   Median :0.07693  
##  Mean   :0.2423   Mean   :0.3791   Mean   :0.2698   Mean   :0.10591  
##  3rd Qu.:0.3665   3rd Qu.:0.4527   3rd Qu.:0.3404   3rd Qu.:0.13247  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##    texture_sd      perimeter_sd        area_sd        smoothness_sd   
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.1045   1st Qu.:0.03996   1st Qu.:0.02064   1st Qu.:0.1174  
##  Median :0.1656   Median :0.07202   Median :0.03303   Median :0.1585  
##  Mean   :0.1894   Mean   :0.09890   Mean   :0.06226   Mean   :0.1812  
##  3rd Qu.:0.2462   3rd Qu.:0.12155   3rd Qu.:0.07138   3rd Qu.:0.2189  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.0000  
##  compactness_sd     concavity_sd       concave_sd      symmetry_sd    
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.08108   1st Qu.:0.03804   1st Qu.:0.1446   1st Qu.:0.1020  
##  Median :0.13656   Median :0.06534   Median :0.2069   Median :0.1526  
##  Mean   :0.17413   Mean   :0.08044   Mean   :0.2233   Mean   :0.1779  
##  3rd Qu.:0.22505   3rd Qu.:0.10547   3rd Qu.:0.2787   3rd Qu.:0.2183  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.0000  
##    fractal_sd        radius_lg        texture_lg      perimeter_lg   
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.04663   1st Qu.:0.1807   1st Qu.:0.2419   1st Qu.:0.1678  
##  Median :0.07831   Median :0.2503   Median :0.3573   Median :0.2353  
##  Mean   :0.10005   Mean   :0.2961   Mean   :0.3644   Mean   :0.2825  
##  3rd Qu.:0.12544   3rd Qu.:0.3855   3rd Qu.:0.4727   3rd Qu.:0.3724  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##     area_lg        smoothness_lg    compactness_lg    concavity_lg    
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.08105   1st Qu.:0.3000   1st Qu.:0.1160   1st Qu.:0.09143  
##  Median :0.12297   Median :0.3971   Median :0.1791   Median :0.18095  
##  Mean   :0.17041   Mean   :0.4038   Mean   :0.2195   Mean   :0.21678  
##  3rd Qu.:0.21832   3rd Qu.:0.4942   3rd Qu.:0.3011   3rd Qu.:0.30463  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##    concave_lg      symmetry_lg       fractal_lg    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.2224   1st Qu.:0.1850   1st Qu.:0.1074  
##  Median :0.3431   Median :0.2475   Median :0.1638  
##  Mean   :0.3929   Mean   :0.2627   Mean   :0.1892  
##  3rd Qu.:0.5544   3rd Qu.:0.3177   3rd Qu.:0.2429  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000

1.2 Aplicando o k-NN com o pacote `class`

#-----------------------------------------------------------------------
# Aplicando o k-nn.

library(class)

# Separando em treino e test.
set.seed(123)
n <- nrow(da)
# i <- sample(x = 1:n, size = floor(n * 0.75))
# db_train <- db[i, ]
# nrow(db_train)
# db_test <- db[-i, ]
# nrow(db_test)
i <- sample(x = c(TRUE, FALSE),
            size = n,
            replace = TRUE,
            prob = c(0.75, 1 - 0.75))
db_train <- db[i, ]
nrow(db_train)

## [1] 430

db_test <- db[!i, ]
nrow(db_test)

## [1] 138

c(nrow(db_train), nrow(db_test))/nrow(db)

## [1] 0.7570423 0.2429577

# Obtendo as predições para o conjunto de teste via conjunto de treino.
m0 <- knn(train = db_train[, -1],
          test = db_test[, -1],
          cl = db_train[, 1],
          k = 1)

# Tabela de confusão.
ct <- table(db_test[, 1], m0)
ct

##    m0
##      B  M
##   B 90  4
##   M  1 43

# Fração de acertos.
sum(diag(ct))/sum(ct)

## [1] 0.9637681

#-----------------------------------------------------------------------
# Simplificando para visualizar e entender.

# Formulas usando sd contra mn.
f <- sapply(nms,
            FUN = function(x) {
                as.formula(sprintf("%s_sd ~ %s_mn", x, x))
            })
xyplot.list(f,
            data = db,
            groups = diagnosis,
            type = c("p", "smooth"),
            as.table = TRUE,
            auto.key = TRUE,
            x.same = FALSE, y.same = FALSE)

# Dados de treino (aberto) e teste (fechado).
xyplot(radius_mn ~ radius_sd,
       data = db_train,
       groups = diagnosis,
       auto.key = TRUE,
       aspect = "iso") +
    as.layer(xyplot(radius_mn ~ radius_sd,
                    data = db_test,
                    groups = diagnosis,
                    pch = 19))

# Criando um grid fino de valores para traçar a fronteira do
# classificador.
grid <- expand.grid(seq(0, 1, length.out = 100),
                    seq(0, 1, length.out = 100),
                    KEEP.OUT.ATTRS = FALSE)
names(grid) <- c("radius_mn", "radius_sd")

# Usando apenas o vizinho mais próximo.
m0 <- knn(train = db_train[, names(grid)],
          test = grid,
          cl = db_train[, 1],
          k = 1)

# Gráfico da fronteira de classificação.
levelplot(-as.integer(m0) ~ radius_sd + radius_mn,
          data = grid,
          col.regions = gray.colors,
          colorkey = FALSE,
          aspect = "iso") +
    as.layer(xyplot(radius_mn ~ radius_sd,
                    data = db_train,
                    groups = diagnosis))

#-----------------------------------------------------------------------
# Serializar variando o número de vizinhos.

# Valores para o número de vizinhos.
k <- c(1:5, 7, 11, 19)

# Criando a predição das categorias variando o k.
pred <- lapply(k,
               FUN = knn,
               train = db_train[, names(grid)],
               test = grid,
               cl = db_train[, 1])
names(pred) <- paste0("k", k)
# Coerção.
pred <- as.data.frame(pred)
# Junção.
pred <- cbind(grid, pred)
# Empilha.
pred <- reshape2::melt(data = pred, id.vars = names(grid))
# Formata.
pred$value <- factor(pred$value, levels(db$diagnosis))
str(pred)

## 'data.frame':    80000 obs. of  4 variables:
##  $ radius_mn: num  0 0.0101 0.0202 0.0303 0.0404 ...
##  $ radius_sd: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ variable : Factor w/ 8 levels "k1","k2","k3",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ value    : Factor w/ 2 levels "B","M": 1 1 1 1 1 1 1 1 1 1 ...

# Suavização da fronteira como função do número de vizinhos.
levelplot(value ~ radius_sd + radius_mn | variable,
          data = pred,
          col.regions = gray.colors,
          colorkey = FALSE,
          as.table = TRUE,
          aspect = "iso") +
    as.layer(xyplot(radius_mn ~ radius_sd,
                    data = db_train,
                    groups = diagnosis))

1.3 Aplicando o k-NN com o pacote `caret`

#-----------------------------------------------------------------------
# Divisão dos dados em treino e teste.

library(caret)

# Proporções das classes no treino e teste usando sample().
rbind(train = prop.table(xtabs(~diagnosis, db_train)),
      test = prop.table(xtabs(~diagnosis, db_test)))

##               B         M
## train 0.6116279 0.3883721
## test  0.6811594 0.3188406

# Criando as partições.
set.seed(789)
intrain <- createDataPartition(y = db$diagnosis,
                               p = 0.75,
                               list = FALSE)

db_train <- db[intrain, ]
db_test <- db[-intrain, ]
nrow(db_train)

## [1] 427

nrow(db_test)

## [1] 141

# Proporções das classes no treino e teste usando createDataPartition().
rbind(train = prop.table(xtabs(~diagnosis, db_train)),
      test = prop.table(xtabs(~diagnosis, db_test)))

##               B         M
## train 0.6276347 0.3723653
## test  0.6312057 0.3687943

#-----------------------------------------------------------------------
# Submete para o método.

# Parametriza a valiação cruzada.
trctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

set.seed(159)
knn_fit <- train(diagnosis ~ .,
                 data = db_train,
                 method = "knn",
                 trControl = trctrl,
                 tuneLength = 15)

# Classe e métodos.
class(knn_fit)

## [1] "train"         "train.formula"

methods(class = class(knn_fit))

##  [1] confusionMatrix densityplot     fitted          ggplot         
##  [5] histogram       levels          plot            predictors     
##  [9] predict         print           residuals       stripplot      
## [13] summary         update          varImp          xyplot         
## see '?methods' for accessing help and source code

# Usa a função caret::knn3() como workhorse. Baseia-se no código em C do
# pacote class.
class(knn_fit$finalModel)

## [1] "knn3"

methods(class = class(knn_fit$finalModel))

## [1] predict print  
## see '?methods' for accessing help and source code

# help(knn3, help_type = "html")

# Resultado do procedimento.
knn_fit

## k-Nearest Neighbors 
## 
## 427 samples
##  30 predictors
##   2 classes: 'B', 'M' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 385, 384, 384, 384, 385, 384, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    5  0.9657438  0.9258467
##    7  0.9688446  0.9317329
##    9  0.9719454  0.9384056
##   11  0.9704319  0.9349688
##   13  0.9657623  0.9240747
##   15  0.9657438  0.9241964
##   17  0.9649871  0.9223949
##   19  0.9649686  0.9223829
##   21  0.9633813  0.9187670
##   23  0.9626061  0.9171135
##   25  0.9618309  0.9153436
##   27  0.9602621  0.9118553
##   29  0.9610557  0.9133645
##   31  0.9594869  0.9099370
##   33  0.9594869  0.9099370
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was k = 9.

# Gráfico para escolha do parâmetro de tunning.
plot(knn_fit)

# Predição nos dados deixados de fora.
m0 <- predict(knn_fit, newdata = db_test)

# Matriz de confusão.
confusionMatrix(m0, db_test$diagnosis)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  B  M
##          B 86  3
##          M  3 49
##                                           
##                Accuracy : 0.9574          
##                  95% CI : (0.9097, 0.9842)
##     No Information Rate : 0.6312          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9086          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9663          
##             Specificity : 0.9423          
##          Pos Pred Value : 0.9663          
##          Neg Pred Value : 0.9423          
##              Prevalence : 0.6312          
##          Detection Rate : 0.6099          
##    Detection Prevalence : 0.6312          
##       Balanced Accuracy : 0.9543          
##                                           
##        'Positive' Class : B               
##

2 k-NN para os dados de cultivares de uva

# Carrega os dados.
url <- "http://www.leg.ufpr.br/~walmes/data/areafoliarUva.txt"
uva <- read.table(url, header = TRUE, sep = "\t",
                  stringsAsFactors = FALSE)
uva$cult <- factor(uva$cult)
uva$id <- NULL

# Comprimento da nervura lateral: média dos lados direito e esquerdo.
uva$nl <- with(uva, apply(cbind(nld, nle), 1, mean))
uva <- subset(uva, select = -c(nld, nle))
str(uva)

## 'data.frame':    300 obs. of  7 variables:
##  $ cult: Factor w/ 3 levels "malbec","merlot",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ area: num  100.8 85.8 119.5 137 84.7 ...
##  $ mc  : num  12 11.5 12.5 15.5 10 12 15.5 17.5 13.5 13.3 ...
##  $ nc  : num  7.5 9 8.5 10 7 8.5 11 13 10 9.5 ...
##  $ ml  : num  12.8 10.5 13 14.4 11 12 14 14 12 15 ...
##  $ cll : num  9.5 9.5 10.2 12 7.5 8.9 13.5 10.8 9.7 10.3 ...
##  $ nl  : num  6.95 7.75 8.8 9.5 6.75 ...

# Especificação da validação cruzada.
trctrl <- trainControl(method = "repeatedcv",
                       number = 10,
                       repeats = 3)

# Opções de SVM.
grep(pattern = "^svm", x = names(getModelInfo()), value = TRUE)

##  [1] "svmBoundrangeString" "svmExpoString"       "svmLinear"          
##  [4] "svmLinear2"          "svmLinear3"          "svmLinearWeights"   
##  [7] "svmLinearWeights2"   "svmPoly"             "svmRadial"          
## [10] "svmRadialCost"       "svmRadialSigma"      "svmRadialWeights"   
## [13] "svmSpectrumString"

# Ajuste.
set.seed(1234)
svm_fit <- train(cult ~ .,
                 data = uva,
                 method = "svmRadial",
                 trControl = trctrl,
                 preProcess = c("center", "scale"),
                 tuneLength = 10)

knn_fit <- train(cult ~ .,
                 data = uva,
                 method = "knn",
                 trControl = trctrl,
                 preProcess = c("center", "scale"),
                 tuneLength = 10)

# Resultado dos métodos de classificação.
svm_fit

## Support Vector Machines with Radial Basis Function Kernel 
## 
## 300 samples
##   6 predictors
##   3 classes: 'malbec', 'merlot', 'sauvignonblanc' 
## 
## Pre-processing: centered (6), scaled (6) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 270, 270, 270, 270, 270, 270, ... 
## Resampling results across tuning parameters:
## 
##   C       Accuracy   Kappa    
##     0.25  0.5466667  0.3200000
##     0.50  0.5444444  0.3166667
##     1.00  0.5600000  0.3400000
##     2.00  0.5644444  0.3466667
##     4.00  0.5666667  0.3500000
##     8.00  0.5522222  0.3283333
##    16.00  0.5711111  0.3566667
##    32.00  0.5477778  0.3216667
##    64.00  0.5311111  0.2966667
##   128.00  0.5055556  0.2583333
## 
## Tuning parameter 'sigma' was held constant at a value of 0.4384058
## Accuracy was used to select the optimal model using  the largest value.
## The final values used for the model were sigma = 0.4384058 and C = 16.

knn_fit

## k-Nearest Neighbors 
## 
## 300 samples
##   6 predictors
##   3 classes: 'malbec', 'merlot', 'sauvignonblanc' 
## 
## Pre-processing: centered (6), scaled (6) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 270, 270, 270, 270, 270, 270, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    5  0.4855556  0.2283333
##    7  0.5111111  0.2666667
##    9  0.5155556  0.2733333
##   11  0.5288889  0.2933333
##   13  0.5266667  0.2900000
##   15  0.5144444  0.2716667
##   17  0.5100000  0.2650000
##   19  0.5133333  0.2700000
##   21  0.4933333  0.2400000
##   23  0.5088889  0.2633333
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was k = 11.

plot(svm_fit)

plot(knn_fit)

# Mostra que foi feito a chamada da kernlab::ksvm().
svm_fit$finalModel

## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 16 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  1 
## 
## Number of Support Vectors : 249 
## 
## Objective Function Value : -740.2169 -899.5419 -480.9223 
## Training error : 0.096667

knn_fit$finalModel

## 11-nearest neighbor classification model
## Training set class distribution:
## 
##         malbec         merlot sauvignonblanc 
##            100            100            100

# Matriz de confusão.
confusionMatrix(predict(svm_fit), uva$cult)

## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       malbec merlot sauvignonblanc
##   malbec             91      9              9
##   merlot              3     89              0
##   sauvignonblanc      6      2             91
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9033          
##                  95% CI : (0.8641, 0.9343)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.855           
##  Mcnemar's Test P-Value : 0.1328          
## 
## Statistics by Class:
## 
##                      Class: malbec Class: merlot Class: sauvignonblanc
## Sensitivity                 0.9100        0.8900                0.9100
## Specificity                 0.9100        0.9850                0.9600
## Pos Pred Value              0.8349        0.9674                0.9192
## Neg Pred Value              0.9529        0.9471                0.9552
## Prevalence                  0.3333        0.3333                0.3333
## Detection Rate              0.3033        0.2967                0.3033
## Detection Prevalence        0.3633        0.3067                0.3300
## Balanced Accuracy           0.9100        0.9375                0.9350

confusionMatrix(predict(knn_fit), uva$cult)

## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       malbec merlot sauvignonblanc
##   malbec             54     20             15
##   merlot             16     65             14
##   sauvignonblanc     30     15             71
## 
## Overall Statistics
##                                         
##                Accuracy : 0.6333        
##                  95% CI : (0.576, 0.688)
##     No Information Rate : 0.3333        
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.45          
##  Mcnemar's Test P-Value : 0.1399        
## 
## Statistics by Class:
## 
##                      Class: malbec Class: merlot Class: sauvignonblanc
## Sensitivity                 0.5400        0.6500                0.7100
## Specificity                 0.8250        0.8500                0.7750
## Pos Pred Value              0.6067        0.6842                0.6121
## Neg Pred Value              0.7820        0.8293                0.8424
## Prevalence                  0.3333        0.3333                0.3333
## Detection Rate              0.1800        0.2167                0.2367
## Detection Prevalence        0.2967        0.3167                0.3867
## Balanced Accuracy           0.6825        0.7500                0.7425

Machine Learning

`leg.ufpr.br/~walmes/ensino/ML`

Classificador de k vizinhos mais próximos

Prof. Walmes M. Zeviani & Prof. Eduardo V. Ferreira

2017-10-24

1 Classificador de k vizinhos mais próximos

1.1 Preparando os dados

1.2 Aplicando o k-NN com o pacote `class`

1.3 Aplicando o k-NN com o pacote `caret`

2 k-NN para os dados de cultivares de uva

3 Referências

Machine Learning

leg.ufpr.br/~walmes/ensino/ML

Classificador de k vizinhos mais próximos

Prof. Walmes M. Zeviani & Prof. Eduardo V. Ferreira

2017-10-24

1 Classificador de k vizinhos mais próximos

1.1 Preparando os dados

1.2 Aplicando o k-NN com o pacote class

1.3 Aplicando o k-NN com o pacote caret

2 k-NN para os dados de cultivares de uva

3 Referências

`leg.ufpr.br/~walmes/ensino/ML`

1.2 Aplicando o k-NN com o pacote `class`

1.3 Aplicando o k-NN com o pacote `caret`