1 Análise discriminante linear

# Pacotes.
library(lattice)
library(latticeExtra)
library(MASS)
library(ellipse)
library(mvtnorm)

#-----------------------------------------------------------------------
# Usando um par de preditoras para visualizar a fronteira.

# Obtendo a matriz de covariância residual.
X <- as.matrix(subset(iris, select = c(Sepal.Length, Sepal.Width)))
Xs <- by(X,
         INDICES = iris$Species,
         FUN = scale,
         center = TRUE,
         scale = FALSE)
X <- do.call(rbind, Xs)
Sigma <- var(X)

# Dados centrados removendo efeito de Species, escala preservada.
xyplot(X[, 2] ~ X[, 1],
       groups = iris$Species,
       aspect = "iso") +
    layer(panel.ellipse(..., groups = NULL, col = 1))

# Elipses para cada espécie e comum.
xyplot(Sepal.Width ~ Sepal.Length,
       groups = Species,
       aspect = "iso",
       data = iris) +
    layer(panel.ellipse(...)) +
    glayer({
        ell <- ellipse(Sigma,
                       level = 0.68,
                       centre = c(mean(x), mean(y)))
        print(head(ell))
        panel.lines(ell, col = col.line, lty = 2)
    })

##      Sepal.Length Sepal.Width
## [1,]     5.681179    3.873522
## [2,]     5.656093    3.888281
## [3,]     5.628390    3.901187
## [4,]     5.598180    3.912187
## [5,]     5.565586    3.921238
## [6,]     5.530739    3.928303
##      Sepal.Length Sepal.Width
## [1,]     6.611179    3.215522
## [2,]     6.586093    3.230281
## [3,]     6.558390    3.243187
## [4,]     6.528180    3.254187
## [5,]     6.495586    3.263238
## [6,]     6.460739    3.270303
##      Sepal.Length Sepal.Width
## [1,]     7.263179    3.419522
## [2,]     7.238093    3.434281
## [3,]     7.210390    3.447187
## [4,]     7.180180    3.458187
## [5,]     7.147586    3.467238
## [6,]     7.112739    3.474303
# Ajuste do modelo.
fit <- lda(Species ~ Sepal.Length + Sepal.Width,
           data = iris)
fit
## Call:
## lda(Species ~ Sepal.Length + Sepal.Width, data = iris)
## 
## Prior probabilities of groups:
##     setosa versicolor  virginica 
##  0.3333333  0.3333333  0.3333333 
## 
## Group means:
##            Sepal.Length Sepal.Width
## setosa            5.006       3.428
## versicolor        5.936       2.770
## virginica         6.588       2.974
## 
## Coefficients of linear discriminants:
##                    LD1        LD2
## Sepal.Length -2.141178 -0.8152721
## Sepal.Width   2.768109 -2.0960764
## 
## Proportion of trace:
##    LD1    LD2 
## 0.9628 0.0372
# Para apresentar a fronteira.
grid <- with(iris,
             expand.grid(
                 Sepal.Length = seq(min(Sepal.Length),
                                    max(Sepal.Length),
                                    length.out = 41),
                 Sepal.Width = seq(min(Sepal.Width),
                                   max(Sepal.Width),
                                   length.out = 41),
                 KEEP.OUT.ATTRS = FALSE))
grid$pred <- predict(fit, newdata = grid)$class
str(grid)
## 'data.frame':    1681 obs. of  3 variables:
##  $ Sepal.Length: num  4.3 4.39 4.48 4.57 4.66 4.75 4.84 4.93 5.02 5.11 ...
##  $ Sepal.Width : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ pred        : Factor w/ 3 levels "setosa","versicolor",..: 2 2 2 2 2 2 2 2 2 2 ...
# Gráfico com pontos, classficações, fronteira e elipses de confiança.
xyplot(Sepal.Width ~ Sepal.Length,
       data = iris,
       groups = Species,
       pch = 19) +
    latticeExtra::layer(panel.xyplot(x = Sepal.Length,
                                     y = Sepal.Width,
                                     groups = pred,
                                     subscripts = seq_along(pred),
                                     pch = 4),
                        data = grid) +
    latticeExtra::glayer({
        ell <- ellipse(Sigma,
                       level = 0.68,
                       centre = c(mean(x), mean(y)))
        panel.lines(ell, col = 1)
    }) +
    latticeExtra::layer(panel.ellipse(..., lwd = 2))

# Avaliando a densidade das normais multivariadas.
dens <- by(iris[, 1:2],
           iris$Species,
           FUN = function(x) {
               m <- colMeans(x)
               p <- nrow(x)/nrow(iris)
               p * dmvnorm(grid[, 1:2],
                           mean = m,
                           sigma = Sigma)
           })
dens <- as.data.frame(do.call(cbind, dens[1:3]))
grid <- cbind(grid, dens)
str(grid)
## 'data.frame':    1681 obs. of  6 variables:
##  $ Sepal.Length: num  4.3 4.39 4.48 4.57 4.66 4.75 4.84 4.93 5.02 5.11 ...
##  $ Sepal.Width : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ pred        : Factor w/ 3 levels "setosa","versicolor",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ setosa      : num  2.78e-05 2.21e-05 1.67e-05 1.22e-05 8.46e-06 ...
##  $ versicolor  : num  0.00171 0.00272 0.00416 0.00607 0.0085 ...
##  $ virginica   : num  1.35e-05 2.73e-05 5.25e-05 9.70e-05 1.72e-04 ...
wp <- wireframe(setosa + versicolor + virginica ~
                    Sepal.Length + Sepal.Width,
                zlab = "Densidade",
                data = grid,
                par.settings =  simpleTheme(alpha = 0.7))
wp

# Vendo de cima.
update(wp,
       par.settings =  simpleTheme(alpha = 1),
       screen = list(x = 0, z = 0, y = 0))

# library(wzRfun)
# library(rpanel)
# rp.wire(wp)

# Predições.
yp <- predict(fit)$class

# Acurácia.
tb <- table(yp, iris$Species)
tb
##             
## yp           setosa versicolor virginica
##   setosa         49          0         0
##   versicolor      1         36        15
##   virginica       0         14        35
sum(diag(tb))/sum(tb)
## [1] 0.8
#-----------------------------------------------------------------------
# Usando todas as variáveis.

# Ajuste do modelo.
fit <- lda(Species ~ .,
           data = iris)
fit
## Call:
## lda(Species ~ ., data = iris)
## 
## Prior probabilities of groups:
##     setosa versicolor  virginica 
##  0.3333333  0.3333333  0.3333333 
## 
## Group means:
##            Sepal.Length Sepal.Width Petal.Length Petal.Width
## setosa            5.006       3.428        1.462       0.246
## versicolor        5.936       2.770        4.260       1.326
## virginica         6.588       2.974        5.552       2.026
## 
## Coefficients of linear discriminants:
##                     LD1         LD2
## Sepal.Length  0.8293776  0.02410215
## Sepal.Width   1.5344731  2.16452123
## Petal.Length -2.2012117 -0.93192121
## Petal.Width  -2.8104603  2.83918785
## 
## Proportion of trace:
##    LD1    LD2 
## 0.9912 0.0088
# Predições.
yp <- predict(fit)$class

# Acurácia.
tb <- table(yp, iris$Species)
tb
##             
## yp           setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         1
##   virginica       0          2        49
sum(diag(tb))/sum(tb)
## [1] 0.98

2 Análise discriminante quadrática

# Ajuste do modelo.
fit <- qda(Species ~ Sepal.Length + Sepal.Width,
           data = iris)
fit
## Call:
## qda(Species ~ Sepal.Length + Sepal.Width, data = iris)
## 
## Prior probabilities of groups:
##     setosa versicolor  virginica 
##  0.3333333  0.3333333  0.3333333 
## 
## Group means:
##            Sepal.Length Sepal.Width
## setosa            5.006       3.428
## versicolor        5.936       2.770
## virginica         6.588       2.974
# Para apresentar a fronteira.
grid$pred <- predict(fit, newdata = grid)$class
str(grid)
## 'data.frame':    1681 obs. of  6 variables:
##  $ Sepal.Length: num  4.3 4.39 4.48 4.57 4.66 4.75 4.84 4.93 5.02 5.11 ...
##  $ Sepal.Width : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ pred        : Factor w/ 3 levels "setosa","versicolor",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ setosa      : num  2.78e-05 2.21e-05 1.67e-05 1.22e-05 8.46e-06 ...
##  $ versicolor  : num  0.00171 0.00272 0.00416 0.00607 0.0085 ...
##  $ virginica   : num  1.35e-05 2.73e-05 5.25e-05 9.70e-05 1.72e-04 ...
# Gráfico com pontos, classficações, fronteira e elipses de confiança.
xyplot(Sepal.Width ~ Sepal.Length,
       data = iris,
       groups = Species,
       pch = 19) +
    layer(panel.xyplot(x = Sepal.Length,
                       y = Sepal.Width,
                       groups = pred,
                       subscripts = seq_along(pred),
                       pch = 4),
          data = grid,
          under = TRUE) +
    glayer({
        ell <- ellipse(Sigma,
                       level = 0.68,
                       centre = c(mean(x), mean(y)))
        print(head(ell))
        panel.lines(ell, col = 1)
    }) +
    layer(panel.ellipse(..., lwd = 2))

##      Sepal.Length Sepal.Width
## [1,]     5.681179    3.873522
## [2,]     5.656093    3.888281
## [3,]     5.628390    3.901187
## [4,]     5.598180    3.912187
## [5,]     5.565586    3.921238
## [6,]     5.530739    3.928303
##      Sepal.Length Sepal.Width
## [1,]     6.611179    3.215522
## [2,]     6.586093    3.230281
## [3,]     6.558390    3.243187
## [4,]     6.528180    3.254187
## [5,]     6.495586    3.263238
## [6,]     6.460739    3.270303
##      Sepal.Length Sepal.Width
## [1,]     7.263179    3.419522
## [2,]     7.238093    3.434281
## [3,]     7.210390    3.447187
## [4,]     7.180180    3.458187
## [5,]     7.147586    3.467238
## [6,]     7.112739    3.474303
# Avaliando a densidade das normais multivariadas.
grid <- subset(grid, select = setdiff(names(grid), names(dens)))
dens <- by(iris[, 1:2],
           iris$Species,
           FUN = function(x) {
               m <- colMeans(x)
               p <- nrow(x)/nrow(iris)
               p * dmvnorm(grid[, 1:2],
                           mean = m,
                           sigma = var(x))
           })
dens <- as.data.frame(do.call(cbind, dens[1:3]))
grid <- cbind(grid, dens)
str(grid)
## 'data.frame':    1681 obs. of  6 variables:
##  $ Sepal.Length: num  4.3 4.39 4.48 4.57 4.66 4.75 4.84 4.93 5.02 5.11 ...
##  $ Sepal.Width : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ pred        : Factor w/ 3 levels "setosa","versicolor",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ setosa      : num  2.43e-04 1.44e-04 7.36e-05 3.26e-05 1.25e-05 ...
##  $ versicolor  : num  0.00165 0.00255 0.00376 0.00532 0.00723 ...
##  $ virginica   : num  0.000136 0.0002 0.000286 0.000399 0.000543 ...
wp <- wireframe(setosa + versicolor + virginica ~
                    Sepal.Length + Sepal.Width,
                zlab = "Densidade",
                data = grid,
                par.settings =  simpleTheme(alpha = 0.5))
wp

# Vendo de cima.
update(wp,
       par.settings =  simpleTheme(alpha = 1),
       screen = list(x = 0, z = 0, y = 0))

# Predições.
yp <- predict(fit)$class

# Acurácia.
tb <- table(yp, iris$Species)
tb
##             
## yp           setosa versicolor virginica
##   setosa         49          0         0
##   versicolor      1         37        16
##   virginica       0         13        34
sum(diag(tb))/sum(tb)
## [1] 0.8
#-----------------------------------------------------------------------
# Usando todas as variáveis.

# Ajuste do modelo.
fit <- qda(Species ~ .,
           data = iris)
fit
## Call:
## qda(Species ~ ., data = iris)
## 
## Prior probabilities of groups:
##     setosa versicolor  virginica 
##  0.3333333  0.3333333  0.3333333 
## 
## Group means:
##            Sepal.Length Sepal.Width Petal.Length Petal.Width
## setosa            5.006       3.428        1.462       0.246
## versicolor        5.936       2.770        4.260       1.326
## virginica         6.588       2.974        5.552       2.026
# Predições.
yp <- predict(fit)$class

# Acurácia.
tb <- table(yp, iris$Species)
tb
##             
## yp           setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         1
##   virginica       0          2        49
sum(diag(tb))/sum(tb)
## [1] 0.98

3 Dados de uva

#-----------------------------------------------------------------------
# Dados hospedados na web.

url <- "http://www.leg.ufpr.br/~walmes/data/areafoliarUva.txt"
uva <- read.table(url, header = TRUE, sep = "\t",
                  stringsAsFactors = FALSE)
uva$cult <- factor(uva$cult)
uva$id <- NULL

# Comprimento da nervura lateral: média dos lados direito e esquerdo.
uva$nl <- with(uva, apply(cbind(nld, nle), 1, mean))
uva <- subset(uva, select = -c(nld, nle))
str(uva)
## 'data.frame':    300 obs. of  7 variables:
##  $ cult: Factor w/ 3 levels "malbec","merlot",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ area: num  100.8 85.8 119.5 137 84.7 ...
##  $ mc  : num  12 11.5 12.5 15.5 10 12 15.5 17.5 13.5 13.3 ...
##  $ nc  : num  7.5 9 8.5 10 7 8.5 11 13 10 9.5 ...
##  $ ml  : num  12.8 10.5 13 14.4 11 12 14 14 12 15 ...
##  $ cll : num  9.5 9.5 10.2 12 7.5 8.9 13.5 10.8 9.7 10.3 ...
##  $ nl  : num  6.95 7.75 8.8 9.5 6.75 ...
splom(uva[-(1:2)],
      groups = uva$cult,
      auto.key = TRUE,
      cex = 0.2)

m1 <- lda(cult ~ ., data = uva)
m1
## Call:
## lda(cult ~ ., data = uva)
## 
## Prior probabilities of groups:
##         malbec         merlot sauvignonblanc 
##      0.3333333      0.3333333      0.3333333 
## 
## Group means:
##                     area     mc     nc     ml    cll     nl
## malbec         116.44748 12.941  9.410 12.263 10.483 8.3540
## merlot         144.40203 15.403 10.851 14.190 12.056 9.3560
## sauvignonblanc  99.61412 11.641  8.235 11.536  9.213 7.3405
## 
## Coefficients of linear discriminants:
##              LD1          LD2
## area  0.03488656 -0.006443026
## mc   -0.77691545 -0.762481480
## nc    0.34141328  0.485986328
## ml   -0.15452055 -0.485121884
## cll  -0.14263063  0.068195514
## nl   -0.01744802  1.593497512
## 
## Proportion of trace:
##    LD1    LD2 
## 0.7243 0.2757
m2 <- qda(cult ~ ., data = uva)
m2
## Call:
## qda(cult ~ ., data = uva)
## 
## Prior probabilities of groups:
##         malbec         merlot sauvignonblanc 
##      0.3333333      0.3333333      0.3333333 
## 
## Group means:
##                     area     mc     nc     ml    cll     nl
## malbec         116.44748 12.941  9.410 12.263 10.483 8.3540
## merlot         144.40203 15.403 10.851 14.190 12.056 9.3560
## sauvignonblanc  99.61412 11.641  8.235 11.536  9.213 7.3405
y1 <- predict(m1)$class
y2 <- predict(m2)$class

tb1 <- table(y1, uva$cult)
tb1
##                 
## y1               malbec merlot sauvignonblanc
##   malbec             50     18             20
##   merlot             21     68             25
##   sauvignonblanc     29     14             55
sum(diag(tb1))/sum(tb1)
## [1] 0.5766667
tb2 <- table(y2, uva$cult)
tb2
##                 
## y2               malbec merlot sauvignonblanc
##   malbec             46     14             13
##   merlot             24     72             21
##   sauvignonblanc     30     14             66
sum(diag(tb2))/sum(tb2)
## [1] 0.6133333

4 Usando o pacote caret

4.1 Exemplo com o iris

#-----------------------------------------------------------------------

library(caret)

# Criando as partições de treino e validação.
set.seed(135)
intrain <- createDataPartition(y = iris$Species,
                               p = 0.75,
                               list = FALSE)
data_train <- iris[intrain, ]
data_test <- iris[-intrain, ]

# Parametriza a valiação cruzada.
trctrl <- trainControl(method = "repeatedcv",
                       number = 10,
                       repeats = 3)

fit1 <- train(Species ~ .,
              data = data_train,
              method = "lda",
              trControl = trctrl)
fit1
## Linear Discriminant Analysis 
## 
## 114 samples
##   4 predictors
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 102, 102, 102, 102, 102, 104, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.9767677  0.9649406
# Predição e matriz de confusão.
yp1 <- predict(fit1, newdata = data_test)
confusionMatrix(yp1, data_test$Species)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         12          0         0
##   versicolor      0         11         0
##   virginica       0          1        12
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9722          
##                  95% CI : (0.8547, 0.9993)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : 4.864e-16       
##                                           
##                   Kappa : 0.9583          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9167           1.0000
## Specificity                 1.0000            1.0000           0.9583
## Pos Pred Value              1.0000            1.0000           0.9231
## Neg Pred Value              1.0000            0.9600           1.0000
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3056           0.3333
## Detection Prevalence        0.3333            0.3056           0.3611
## Balanced Accuracy           1.0000            0.9583           0.9792
fit2 <- train(Species ~ .,
              data = data_train,
              method = "qda",
              trControl = trctrl)
fit2
## Quadratic Discriminant Analysis 
## 
## 114 samples
##   4 predictors
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 103, 102, 103, 103, 102, 102, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.9741919  0.9611995
# Predição e matriz de confusão.
yp2 <- predict(fit2, newdata = data_test)
confusionMatrix(yp2, data_test$Species)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         12          0         0
##   versicolor      0         11         0
##   virginica       0          1        12
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9722          
##                  95% CI : (0.8547, 0.9993)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : 4.864e-16       
##                                           
##                   Kappa : 0.9583          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9167           1.0000
## Specificity                 1.0000            1.0000           0.9583
## Pos Pred Value              1.0000            1.0000           0.9231
## Neg Pred Value              1.0000            0.9600           1.0000
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3056           0.3333
## Detection Prevalence        0.3333            0.3056           0.3611
## Balanced Accuracy           1.0000            0.9583           0.9792

4.2 Exemplo com o conjunto de dados de folhas de uva

#-----------------------------------------------------------------------

# Criando as partições de treino e validação.
set.seed(135)
intrain <- createDataPartition(y = uva$cult,
                               p = 0.75,
                               list = FALSE)
data_train <- uva[intrain, ]
data_test <- uva[-intrain, ]

fit1 <- train(cult ~ .,
              data = data_train,
              method = "lda",
              trControl = trctrl)
fit1
## Linear Discriminant Analysis 
## 
## 225 samples
##   6 predictors
##   3 classes: 'malbec', 'merlot', 'sauvignonblanc' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 202, 204, 202, 203, 203, 203, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.5872585  0.3803179
# Predição e matriz de confusão.
yp1 <- predict(fit1, newdata = data_test)
confusionMatrix(yp1, data_test$cult)
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       malbec merlot sauvignonblanc
##   malbec             11      3              7
##   merlot              5     16             10
##   sauvignonblanc      9      6              8
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4667          
##                  95% CI : (0.3505, 0.5855)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : 0.01126         
##                                           
##                   Kappa : 0.2             
##  Mcnemar's Test P-Value : 0.62588         
## 
## Statistics by Class:
## 
##                      Class: malbec Class: merlot Class: sauvignonblanc
## Sensitivity                 0.4400        0.6400                0.3200
## Specificity                 0.8000        0.7000                0.7000
## Pos Pred Value              0.5238        0.5161                0.3478
## Neg Pred Value              0.7407        0.7955                0.6731
## Prevalence                  0.3333        0.3333                0.3333
## Detection Rate              0.1467        0.2133                0.1067
## Detection Prevalence        0.2800        0.4133                0.3067
## Balanced Accuracy           0.6200        0.6700                0.5100
fit2 <- train(cult ~ .,
              data = data_train,
              method = "qda",
              trControl = trctrl)
fit2
## Quadratic Discriminant Analysis 
## 
## 225 samples
##   6 predictors
##   3 classes: 'malbec', 'merlot', 'sauvignonblanc' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 203, 202, 203, 203, 203, 203, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.5963423  0.3939673
# Predição e matriz de confusão.
yp2 <- predict(fit2, newdata = data_test)
confusionMatrix(yp2, data_test$cult)
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       malbec merlot sauvignonblanc
##   malbec              9      4              7
##   merlot              7     17              9
##   sauvignonblanc      9      4              9
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4667          
##                  95% CI : (0.3505, 0.5855)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : 0.01126         
##                                           
##                   Kappa : 0.2             
##  Mcnemar's Test P-Value : 0.39297         
## 
## Statistics by Class:
## 
##                      Class: malbec Class: merlot Class: sauvignonblanc
## Sensitivity                 0.3600        0.6800                0.3600
## Specificity                 0.7800        0.6800                0.7400
## Pos Pred Value              0.4500        0.5152                0.4091
## Neg Pred Value              0.7091        0.8095                0.6981
## Prevalence                  0.3333        0.3333                0.3333
## Detection Rate              0.1200        0.2267                0.1200
## Detection Prevalence        0.2667        0.4400                0.2933
## Balanced Accuracy           0.5700        0.6800                0.5500
25px