Classificador de k vizinhos mais próximos
Preparando os dados
rm(list = objects())
library(lattice)
library(latticeExtra)
#-----------------------------------------------------------------------
# Aquisição dos dados.
url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
da <- read.csv(url)
# x <- "radius
# texture
# perimeter
# area
# smoothness
# compactness
# concavity
# concave
# symmetry
# fractal"
# dput(readLines(textConnection(x)))
# Criando o nome das variáveis.
nms <- c("radius", "texture", "perimeter", "area", "smoothness",
"compactness", "concavity", "concave", "symmetry",
"fractal")
n <- outer(Y = c("mn", "sd", "lg"),
X = nms,
FUN = paste, sep = "_")
names(da) <- c("id", "diagnosis", n)
str(da)
## 'data.frame': 568 obs. of 32 variables:
## $ id : int 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 845636 ...
## $ diagnosis : Factor w/ 2 levels "B","M": 2 2 2 2 2 2 2 2 2 2 ...
## $ radius_mn : num 20.6 19.7 11.4 20.3 12.4 ...
## $ texture_mn : num 17.8 21.2 20.4 14.3 15.7 ...
## $ perimeter_mn : num 132.9 130 77.6 135.1 82.6 ...
## $ area_mn : num 1326 1203 386 1297 477 ...
## $ smoothness_mn : num 0.0847 0.1096 0.1425 0.1003 0.1278 ...
## $ compactness_mn: num 0.0786 0.1599 0.2839 0.1328 0.17 ...
## $ concavity_mn : num 0.0869 0.1974 0.2414 0.198 0.1578 ...
## $ concave_mn : num 0.0702 0.1279 0.1052 0.1043 0.0809 ...
## $ symmetry_mn : num 0.181 0.207 0.26 0.181 0.209 ...
## $ fractal_mn : num 0.0567 0.06 0.0974 0.0588 0.0761 ...
## $ radius_sd : num 0.543 0.746 0.496 0.757 0.335 ...
## $ texture_sd : num 0.734 0.787 1.156 0.781 0.89 ...
## $ perimeter_sd : num 3.4 4.58 3.44 5.44 2.22 ...
## $ area_sd : num 74.1 94 27.2 94.4 27.2 ...
## $ smoothness_sd : num 0.00522 0.00615 0.00911 0.01149 0.00751 ...
## $ compactness_sd: num 0.0131 0.0401 0.0746 0.0246 0.0335 ...
## $ concavity_sd : num 0.0186 0.0383 0.0566 0.0569 0.0367 ...
## $ concave_sd : num 0.0134 0.0206 0.0187 0.0188 0.0114 ...
## $ symmetry_sd : num 0.0139 0.0225 0.0596 0.0176 0.0216 ...
## $ fractal_sd : num 0.00353 0.00457 0.00921 0.00511 0.00508 ...
## $ radius_lg : num 25 23.6 14.9 22.5 15.5 ...
## $ texture_lg : num 23.4 25.5 26.5 16.7 23.8 ...
## $ perimeter_lg : num 158.8 152.5 98.9 152.2 103.4 ...
## $ area_lg : num 1956 1709 568 1575 742 ...
## $ smoothness_lg : num 0.124 0.144 0.21 0.137 0.179 ...
## $ compactness_lg: num 0.187 0.424 0.866 0.205 0.525 ...
## $ concavity_lg : num 0.242 0.45 0.687 0.4 0.535 ...
## $ concave_lg : num 0.186 0.243 0.258 0.163 0.174 ...
## $ symmetry_lg : num 0.275 0.361 0.664 0.236 0.399 ...
## $ fractal_lg : num 0.089 0.0876 0.173 0.0768 0.1244 ...
# head(da[, c(3, 13, 23)])
head(da[, grepl("radius", names(da))])
## radius_mn radius_sd radius_lg
## 1 20.57 0.5435 24.99
## 2 19.69 0.7456 23.57
## 3 11.42 0.4956 14.91
## 4 20.29 0.7572 22.54
## 5 12.45 0.3345 15.47
## 6 18.25 0.4467 22.88
#-----------------------------------------------------------------------
# Análise exploratória.
# Tabela de frequência.
xtabs(~diagnosis, data = da)
## diagnosis
## B M
## 357 211
# Criando a formula com todas as medidas.
f <- sprintf("%s ~ diagnosis",
paste(names(da)[-(1:2)], collapse = " + "))
# Opções estéticas do gráfico de caixas.
grep(x = names(trellis.par.get()), pattern = "box", value = TRUE)
## [1] "box.dot" "box.rectangle" "box.umbrella" "box.3d"
# Gráfico de caixas.
bwplot(as.formula(f),
data = da,
outer = TRUE,
as.table = TRUE,
pch = "|",
fill = "gray",
scales = "free",
par.settings = list(
box.umbrella = list(lty = 1)))

# Só os valores médios.
splom(~da[, grepl(x = names(da), pattern = "_mn")],
groups = da$diagnosis,
as.matrix = TRUE,
auto.key = TRUE)

# Só os desvios padrões.
splom(~da[, grepl(x = names(da), pattern = "_sd")],
groups = da$diagnosis,
as.matrix = TRUE,
auto.key = TRUE)

# Só a média dos extremos.
splom(~da[, grepl(x = names(da), pattern = "_lg")],
groups = da$diagnosis,
as.matrix = TRUE,
auto.key = TRUE)

#-----------------------------------------------------------------------
# Padronização de escala.
# Para verificar as escalas das medidas.
summary(da)
## id diagnosis radius_mn texture_mn
## Min. : 8670 B:357 Min. : 6.981 Min. : 9.71
## 1st Qu.: 869222 M:211 1st Qu.:11.697 1st Qu.:16.18
## Median : 906157 Median :13.355 Median :18.86
## Mean : 30423820 Mean :14.120 Mean :19.31
## 3rd Qu.: 8825022 3rd Qu.:15.780 3rd Qu.:21.80
## Max. :911320502 Max. :28.110 Max. :39.28
## perimeter_mn area_mn smoothness_mn compactness_mn
## Min. : 43.79 Min. : 143.5 Min. :0.05263 Min. :0.01938
## 1st Qu.: 75.14 1st Qu.: 420.2 1st Qu.:0.08629 1st Qu.:0.06481
## Median : 86.21 Median : 548.8 Median :0.09587 Median :0.09252
## Mean : 91.91 Mean : 654.3 Mean :0.09632 Mean :0.10404
## 3rd Qu.:103.88 3rd Qu.: 782.6 3rd Qu.:0.10530 3rd Qu.:0.13040
## Max. :188.50 Max. :2501.0 Max. :0.16340 Max. :0.34540
## concavity_mn concave_mn symmetry_mn fractal_mn
## Min. :0.00000 Min. :0.00000 Min. :0.1060 Min. :0.04996
## 1st Qu.:0.02954 1st Qu.:0.02031 1st Qu.:0.1619 1st Qu.:0.05770
## Median :0.06140 Median :0.03345 Median :0.1792 Median :0.06152
## Mean :0.08843 Mean :0.04875 Mean :0.1811 Mean :0.06277
## 3rd Qu.:0.12965 3rd Qu.:0.07373 3rd Qu.:0.1956 3rd Qu.:0.06612
## Max. :0.42680 Max. :0.20120 Max. :0.3040 Max. :0.09744
## radius_sd texture_sd perimeter_sd area_sd
## Min. :0.1115 Min. :0.3602 Min. : 0.757 Min. : 6.802
## 1st Qu.:0.2324 1st Qu.:0.8331 1st Qu.: 1.605 1st Qu.: 17.850
## Median :0.3240 Median :1.1095 Median : 2.285 Median : 24.485
## Mean :0.4040 Mean :1.2174 Mean : 2.856 Mean : 40.138
## 3rd Qu.:0.4773 3rd Qu.:1.4743 3rd Qu.: 3.337 3rd Qu.: 45.017
## Max. :2.8730 Max. :4.8850 Max. :21.980 Max. :542.200
## smoothness_sd compactness_sd concavity_sd
## Min. :0.001713 Min. :0.002252 Min. :0.00000
## 1st Qu.:0.005166 1st Qu.:0.013048 1st Qu.:0.01506
## Median :0.006374 Median :0.020435 Median :0.02587
## Mean :0.007042 Mean :0.025437 Mean :0.03186
## 3rd Qu.:0.008151 3rd Qu.:0.032218 3rd Qu.:0.04176
## Max. :0.031130 Max. :0.135400 Max. :0.39600
## concave_sd symmetry_sd fractal_sd radius_lg
## Min. :0.000000 Min. :0.007882 Min. :0.0008948 Min. : 7.93
## 1st Qu.:0.007634 1st Qu.:0.015128 1st Qu.:0.0022445 1st Qu.:13.01
## Median :0.010920 Median :0.018725 Median :0.0031615 Median :14.96
## Mean :0.011789 Mean :0.020526 Mean :0.0037907 Mean :16.25
## 3rd Qu.:0.014710 3rd Qu.:0.023398 3rd Qu.:0.0045258 3rd Qu.:18.77
## Max. :0.052790 Max. :0.078950 Max. :0.0298400 Max. :36.04
## texture_lg perimeter_lg area_lg smoothness_lg
## Min. :12.02 Min. : 50.41 Min. : 185.2 Min. :0.07117
## 1st Qu.:21.09 1st Qu.: 84.10 1st Qu.: 515.0 1st Qu.:0.11660
## Median :25.43 Median : 97.66 Median : 685.5 Median :0.13130
## Mean :25.69 Mean :107.13 Mean : 878.6 Mean :0.13232
## 3rd Qu.:29.76 3rd Qu.:125.17 3rd Qu.:1073.5 3rd Qu.:0.14600
## Max. :49.54 Max. :251.20 Max. :4254.0 Max. :0.22260
## compactness_lg concavity_lg concave_lg symmetry_lg
## Min. :0.02729 Min. :0.0000 Min. :0.00000 Min. :0.1565
## 1st Qu.:0.14690 1st Qu.:0.1145 1st Qu.:0.06473 1st Qu.:0.2504
## Median :0.21185 Median :0.2266 Median :0.09984 Median :0.2821
## Mean :0.25354 Mean :0.2714 Mean :0.11434 Mean :0.2898
## 3rd Qu.:0.33760 3rd Qu.:0.3814 3rd Qu.:0.16132 3rd Qu.:0.3177
## Max. :1.05800 Max. :1.2520 Max. :0.29100 Max. :0.6638
## fractal_lg
## Min. :0.05504
## 1st Qu.:0.07141
## Median :0.08002
## Mean :0.08388
## 3rd Qu.:0.09206
## Max. :0.20750
# Para padronizar para intervalo unitário.
to_unit <- function(x, ...) {
z <- x - min(x, ...)
return(z/max(z, ...))
}
# 1, 2, 3, testando...
sort(to_unit(rnorm(5)))
## [1] 0.0000000 0.1808968 0.3273864 0.8289552 1.0000000
# Criando uma versão com valores padronizados.
db <- da
db$id <- NULL
db[, -1] <- as.data.frame(lapply(da[, -(1:2)],
FUN = to_unit))
summary(db)
## diagnosis radius_mn texture_mn perimeter_mn
## B:357 Min. :0.0000 Min. :0.0000 Min. :0.0000
## M:211 1st Qu.:0.2232 1st Qu.:0.2187 1st Qu.:0.2166
## Median :0.3017 Median :0.3093 Median :0.2931
## Mean :0.3379 Mean :0.3245 Mean :0.3326
## 3rd Qu.:0.4164 3rd Qu.:0.4089 3rd Qu.:0.4152
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## area_mn smoothness_mn compactness_mn concavity_mn
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.1174 1st Qu.:0.3039 1st Qu.:0.1394 1st Qu.:0.06921
## Median :0.1719 Median :0.3903 Median :0.2244 Median :0.14386
## Mean :0.2167 Mean :0.3944 Mean :0.2597 Mean :0.20719
## 3rd Qu.:0.2711 3rd Qu.:0.4755 3rd Qu.:0.3405 3rd Qu.:0.30377
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## concave_mn symmetry_mn fractal_mn radius_sd
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.1009 1st Qu.:0.2823 1st Qu.:0.1630 1st Qu.:0.04377
## Median :0.1663 Median :0.3697 Median :0.2434 Median :0.07693
## Mean :0.2423 Mean :0.3791 Mean :0.2698 Mean :0.10591
## 3rd Qu.:0.3665 3rd Qu.:0.4527 3rd Qu.:0.3404 3rd Qu.:0.13247
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## texture_sd perimeter_sd area_sd smoothness_sd
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.1045 1st Qu.:0.03996 1st Qu.:0.02064 1st Qu.:0.1174
## Median :0.1656 Median :0.07202 Median :0.03303 Median :0.1585
## Mean :0.1894 Mean :0.09890 Mean :0.06226 Mean :0.1812
## 3rd Qu.:0.2462 3rd Qu.:0.12155 3rd Qu.:0.07138 3rd Qu.:0.2189
## Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.0000
## compactness_sd concavity_sd concave_sd symmetry_sd
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.08108 1st Qu.:0.03804 1st Qu.:0.1446 1st Qu.:0.1020
## Median :0.13656 Median :0.06534 Median :0.2069 Median :0.1526
## Mean :0.17413 Mean :0.08044 Mean :0.2233 Mean :0.1779
## 3rd Qu.:0.22505 3rd Qu.:0.10547 3rd Qu.:0.2787 3rd Qu.:0.2183
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.0000
## fractal_sd radius_lg texture_lg perimeter_lg
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.04663 1st Qu.:0.1807 1st Qu.:0.2419 1st Qu.:0.1678
## Median :0.07831 Median :0.2503 Median :0.3573 Median :0.2353
## Mean :0.10005 Mean :0.2961 Mean :0.3644 Mean :0.2825
## 3rd Qu.:0.12544 3rd Qu.:0.3855 3rd Qu.:0.4727 3rd Qu.:0.3724
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## area_lg smoothness_lg compactness_lg concavity_lg
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.08105 1st Qu.:0.3000 1st Qu.:0.1160 1st Qu.:0.09143
## Median :0.12297 Median :0.3971 Median :0.1791 Median :0.18095
## Mean :0.17041 Mean :0.4038 Mean :0.2195 Mean :0.21678
## 3rd Qu.:0.21832 3rd Qu.:0.4942 3rd Qu.:0.3011 3rd Qu.:0.30463
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## concave_lg symmetry_lg fractal_lg
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2224 1st Qu.:0.1850 1st Qu.:0.1074
## Median :0.3431 Median :0.2475 Median :0.1638
## Mean :0.3929 Mean :0.2627 Mean :0.1892
## 3rd Qu.:0.5544 3rd Qu.:0.3177 3rd Qu.:0.2429
## Max. :1.0000 Max. :1.0000 Max. :1.0000
Aplicando o k-NN com o pacote class
#-----------------------------------------------------------------------
# Aplicando o k-nn.
library(class)
# Separando em treino e test.
set.seed(123)
n <- nrow(da)
# i <- sample(x = 1:n, size = floor(n * 0.75))
# db_train <- db[i, ]
# nrow(db_train)
# db_test <- db[-i, ]
# nrow(db_test)
i <- sample(x = c(TRUE, FALSE),
size = n,
replace = TRUE,
prob = c(0.75, 1 - 0.75))
db_train <- db[i, ]
nrow(db_train)
## [1] 430
db_test <- db[!i, ]
nrow(db_test)
## [1] 138
c(nrow(db_train), nrow(db_test))/nrow(db)
## [1] 0.7570423 0.2429577
# Obtendo as predições para o conjunto de teste via conjunto de treino.
m0 <- knn(train = db_train[, -1],
test = db_test[, -1],
cl = db_train[, 1],
k = 1)
# Tabela de confusão.
ct <- table(db_test[, 1], m0)
ct
## m0
## B M
## B 90 4
## M 1 43
# Fração de acertos.
sum(diag(ct))/sum(ct)
## [1] 0.9637681
#-----------------------------------------------------------------------
# Simplificando para visualizar e entender.
# Formulas usando sd contra mn.
f <- sapply(nms,
FUN = function(x) {
as.formula(sprintf("%s_sd ~ %s_mn", x, x))
})
xyplot.list(f,
data = db,
groups = diagnosis,
type = c("p", "smooth"),
as.table = TRUE,
auto.key = TRUE,
x.same = FALSE, y.same = FALSE)

# Dados de treino (aberto) e teste (fechado).
xyplot(radius_mn ~ radius_sd,
data = db_train,
groups = diagnosis,
auto.key = TRUE,
aspect = "iso") +
as.layer(xyplot(radius_mn ~ radius_sd,
data = db_test,
groups = diagnosis,
pch = 19))

# Criando um grid fino de valores para traçar a fronteira do
# classificador.
grid <- expand.grid(seq(0, 1, length.out = 100),
seq(0, 1, length.out = 100),
KEEP.OUT.ATTRS = FALSE)
names(grid) <- c("radius_mn", "radius_sd")
# Usando apenas o vizinho mais próximo.
m0 <- knn(train = db_train[, names(grid)],
test = grid,
cl = db_train[, 1],
k = 1)
# Gráfico da fronteira de classificação.
levelplot(-as.integer(m0) ~ radius_sd + radius_mn,
data = grid,
col.regions = gray.colors,
colorkey = FALSE,
aspect = "iso") +
as.layer(xyplot(radius_mn ~ radius_sd,
data = db_train,
groups = diagnosis))

#-----------------------------------------------------------------------
# Serializar variando o número de vizinhos.
# Valores para o número de vizinhos.
k <- c(1:5, 7, 11, 19)
# Criando a predição das categorias variando o k.
pred <- lapply(k,
FUN = knn,
train = db_train[, names(grid)],
test = grid,
cl = db_train[, 1])
names(pred) <- paste0("k", k)
# Coerção.
pred <- as.data.frame(pred)
# Junção.
pred <- cbind(grid, pred)
# Empilha.
pred <- reshape2::melt(data = pred, id.vars = names(grid))
# Formata.
pred$value <- factor(pred$value, levels(db$diagnosis))
str(pred)
## 'data.frame': 80000 obs. of 4 variables:
## $ radius_mn: num 0 0.0101 0.0202 0.0303 0.0404 ...
## $ radius_sd: num 0 0 0 0 0 0 0 0 0 0 ...
## $ variable : Factor w/ 8 levels "k1","k2","k3",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ value : Factor w/ 2 levels "B","M": 1 1 1 1 1 1 1 1 1 1 ...
# Suavização da fronteira como função do número de vizinhos.
levelplot(value ~ radius_sd + radius_mn | variable,
data = pred,
col.regions = gray.colors,
colorkey = FALSE,
as.table = TRUE,
aspect = "iso") +
as.layer(xyplot(radius_mn ~ radius_sd,
data = db_train,
groups = diagnosis))

Aplicando o k-NN com o pacote caret
#-----------------------------------------------------------------------
# Divisão dos dados em treino e teste.
library(caret)
# Proporções das classes no treino e teste usando sample().
rbind(train = prop.table(xtabs(~diagnosis, db_train)),
test = prop.table(xtabs(~diagnosis, db_test)))
## B M
## train 0.6116279 0.3883721
## test 0.6811594 0.3188406
# Criando as partições.
set.seed(789)
intrain <- createDataPartition(y = db$diagnosis,
p = 0.75,
list = FALSE)
db_train <- db[intrain, ]
db_test <- db[-intrain, ]
nrow(db_train)
## [1] 427
## [1] 141
# Proporções das classes no treino e teste usando createDataPartition().
rbind(train = prop.table(xtabs(~diagnosis, db_train)),
test = prop.table(xtabs(~diagnosis, db_test)))
## B M
## train 0.6276347 0.3723653
## test 0.6312057 0.3687943
#-----------------------------------------------------------------------
# Submete para o método.
# Parametriza a valiação cruzada.
trctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
set.seed(159)
knn_fit <- train(diagnosis ~ .,
data = db_train,
method = "knn",
trControl = trctrl,
tuneLength = 15)
# Classe e métodos.
class(knn_fit)
## [1] "train" "train.formula"
methods(class = class(knn_fit))
## [1] confusionMatrix densityplot fitted ggplot
## [5] histogram levels plot predictors
## [9] predict print residuals stripplot
## [13] summary update varImp xyplot
## see '?methods' for accessing help and source code
# Usa a função caret::knn3() como workhorse. Baseia-se no código em C do
# pacote class.
class(knn_fit$finalModel)
## [1] "knn3"
methods(class = class(knn_fit$finalModel))
## [1] predict print
## see '?methods' for accessing help and source code
# help(knn3, help_type = "html")
# Resultado do procedimento.
knn_fit
## k-Nearest Neighbors
##
## 427 samples
## 30 predictors
## 2 classes: 'B', 'M'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 385, 384, 384, 384, 385, 384, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.9657438 0.9258467
## 7 0.9688446 0.9317329
## 9 0.9719454 0.9384056
## 11 0.9704319 0.9349688
## 13 0.9657623 0.9240747
## 15 0.9657438 0.9241964
## 17 0.9649871 0.9223949
## 19 0.9649686 0.9223829
## 21 0.9633813 0.9187670
## 23 0.9626061 0.9171135
## 25 0.9618309 0.9153436
## 27 0.9602621 0.9118553
## 29 0.9610557 0.9133645
## 31 0.9594869 0.9099370
## 33 0.9594869 0.9099370
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
# Gráfico para escolha do parâmetro de tunning.
plot(knn_fit)

# Predição nos dados deixados de fora.
m0 <- predict(knn_fit, newdata = db_test)
# Matriz de confusão.
confusionMatrix(m0, db_test$diagnosis)
## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 86 3
## M 3 49
##
## Accuracy : 0.9574
## 95% CI : (0.9097, 0.9842)
## No Information Rate : 0.6312
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9086
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9663
## Specificity : 0.9423
## Pos Pred Value : 0.9663
## Neg Pred Value : 0.9423
## Prevalence : 0.6312
## Detection Rate : 0.6099
## Detection Prevalence : 0.6312
## Balanced Accuracy : 0.9543
##
## 'Positive' Class : B
##
k-NN para os dados de cultivares de uva
# Carrega os dados.
url <- "http://www.leg.ufpr.br/~walmes/data/areafoliarUva.txt"
uva <- read.table(url, header = TRUE, sep = "\t",
stringsAsFactors = FALSE)
uva$cult <- factor(uva$cult)
uva$id <- NULL
# Comprimento da nervura lateral: média dos lados direito e esquerdo.
uva$nl <- with(uva, apply(cbind(nld, nle), 1, mean))
uva <- subset(uva, select = -c(nld, nle))
str(uva)
## 'data.frame': 300 obs. of 7 variables:
## $ cult: Factor w/ 3 levels "malbec","merlot",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ area: num 100.8 85.8 119.5 137 84.7 ...
## $ mc : num 12 11.5 12.5 15.5 10 12 15.5 17.5 13.5 13.3 ...
## $ nc : num 7.5 9 8.5 10 7 8.5 11 13 10 9.5 ...
## $ ml : num 12.8 10.5 13 14.4 11 12 14 14 12 15 ...
## $ cll : num 9.5 9.5 10.2 12 7.5 8.9 13.5 10.8 9.7 10.3 ...
## $ nl : num 6.95 7.75 8.8 9.5 6.75 ...
# Especificação da validação cruzada.
trctrl <- trainControl(method = "repeatedcv",
number = 10,
repeats = 3)
# Opções de SVM.
grep(pattern = "^svm", x = names(getModelInfo()), value = TRUE)
## [1] "svmBoundrangeString" "svmExpoString" "svmLinear"
## [4] "svmLinear2" "svmLinear3" "svmLinearWeights"
## [7] "svmLinearWeights2" "svmPoly" "svmRadial"
## [10] "svmRadialCost" "svmRadialSigma" "svmRadialWeights"
## [13] "svmSpectrumString"
# Ajuste.
set.seed(1234)
svm_fit <- train(cult ~ .,
data = uva,
method = "svmRadial",
trControl = trctrl,
preProcess = c("center", "scale"),
tuneLength = 10)
knn_fit <- train(cult ~ .,
data = uva,
method = "knn",
trControl = trctrl,
preProcess = c("center", "scale"),
tuneLength = 10)
# Resultado dos métodos de classificação.
svm_fit
## Support Vector Machines with Radial Basis Function Kernel
##
## 300 samples
## 6 predictors
## 3 classes: 'malbec', 'merlot', 'sauvignonblanc'
##
## Pre-processing: centered (6), scaled (6)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 270, 270, 270, 270, 270, 270, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.5466667 0.3200000
## 0.50 0.5444444 0.3166667
## 1.00 0.5600000 0.3400000
## 2.00 0.5644444 0.3466667
## 4.00 0.5666667 0.3500000
## 8.00 0.5522222 0.3283333
## 16.00 0.5711111 0.3566667
## 32.00 0.5477778 0.3216667
## 64.00 0.5311111 0.2966667
## 128.00 0.5055556 0.2583333
##
## Tuning parameter 'sigma' was held constant at a value of 0.4384058
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.4384058 and C = 16.
## k-Nearest Neighbors
##
## 300 samples
## 6 predictors
## 3 classes: 'malbec', 'merlot', 'sauvignonblanc'
##
## Pre-processing: centered (6), scaled (6)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 270, 270, 270, 270, 270, 270, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.4855556 0.2283333
## 7 0.5111111 0.2666667
## 9 0.5155556 0.2733333
## 11 0.5288889 0.2933333
## 13 0.5266667 0.2900000
## 15 0.5144444 0.2716667
## 17 0.5100000 0.2650000
## 19 0.5133333 0.2700000
## 21 0.4933333 0.2400000
## 23 0.5088889 0.2633333
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 11.


# Mostra que foi feito a chamada da kernlab::ksvm().
svm_fit$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 16
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 1
##
## Number of Support Vectors : 249
##
## Objective Function Value : -740.2169 -899.5419 -480.9223
## Training error : 0.096667
## 11-nearest neighbor classification model
## Training set class distribution:
##
## malbec merlot sauvignonblanc
## 100 100 100
# Matriz de confusão.
confusionMatrix(predict(svm_fit), uva$cult)
## Confusion Matrix and Statistics
##
## Reference
## Prediction malbec merlot sauvignonblanc
## malbec 91 9 9
## merlot 3 89 0
## sauvignonblanc 6 2 91
##
## Overall Statistics
##
## Accuracy : 0.9033
## 95% CI : (0.8641, 0.9343)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.855
## Mcnemar's Test P-Value : 0.1328
##
## Statistics by Class:
##
## Class: malbec Class: merlot Class: sauvignonblanc
## Sensitivity 0.9100 0.8900 0.9100
## Specificity 0.9100 0.9850 0.9600
## Pos Pred Value 0.8349 0.9674 0.9192
## Neg Pred Value 0.9529 0.9471 0.9552
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3033 0.2967 0.3033
## Detection Prevalence 0.3633 0.3067 0.3300
## Balanced Accuracy 0.9100 0.9375 0.9350
confusionMatrix(predict(knn_fit), uva$cult)
## Confusion Matrix and Statistics
##
## Reference
## Prediction malbec merlot sauvignonblanc
## malbec 54 20 15
## merlot 16 65 14
## sauvignonblanc 30 15 71
##
## Overall Statistics
##
## Accuracy : 0.6333
## 95% CI : (0.576, 0.688)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.45
## Mcnemar's Test P-Value : 0.1399
##
## Statistics by Class:
##
## Class: malbec Class: merlot Class: sauvignonblanc
## Sensitivity 0.5400 0.6500 0.7100
## Specificity 0.8250 0.8500 0.7750
## Pos Pred Value 0.6067 0.6842 0.6121
## Neg Pred Value 0.7820 0.8293 0.8424
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.1800 0.2167 0.2367
## Detection Prevalence 0.2967 0.3167 0.3867
## Balanced Accuracy 0.6825 0.7500 0.7425