1 Métodos para árvores de regressão

1.1 Carregar os dados

rm(list = objects())
library(lattice)
library(latticeExtra)

#-----------------------------------------------------------------------
# Produção de teca.

# Endereço das tabelas.
pre <- "https://raw.githubusercontent.com/walmes/EACS/master/data-raw/"
files <- c(hidric = "teca_crapar.csv",
           quimic = "teca_qui.csv",
           prod = "teca_arv.csv")
urls <- paste0(pre, files)
names(urls) <- names(files)

# Lista com as tabelas.
da <- sapply(urls,
             FUN = read.table,
             header = TRUE,
             sep = ";",
             simplify = FALSE)
str(da)

## List of 3
##  $ hidric:'data.frame':  141 obs. of  10 variables:
##   ..$ loc: int [1:141] 1 1 1 2 2 2 3 3 3 4 ...
##   ..$ cam: Factor w/ 3 levels "[0, 5)","[40, 80)",..: 3 2 1 1 3 2 1 3 2 3 ...
##   ..$ Ur : num [1:141] 0.178 0.188 0.223 0.131 0.165 ...
##   ..$ Us : num [1:141] 0.58 0.647 0.648 0.697 0.573 ...
##   ..$ alp: num [1:141] -0.833 -0.722 -0.724 -0.548 -0.547 ...
##   ..$ n  : num [1:141] 3.11 3.25 3.59 4.01 2.92 ...
##   ..$ I  : num [1:141] 0.957 0.835 0.815 0.62 0.691 ...
##   ..$ Ui : num [1:141] 0.396 0.435 0.45 0.431 0.387 ...
##   ..$ S  : num [1:141] -0.273 -0.329 -0.341 -0.515 -0.257 ...
##   ..$ cad: num [1:141] 0.217 0.247 0.227 0.3 0.222 ...
##  $ quimic:'data.frame':  150 obs. of  15 variables:
##   ..$ loc: int [1:150] 1 1 1 2 2 2 3 3 3 4 ...
##   ..$ cam: Factor w/ 3 levels "[0, 5)","[40, 80)",..: 1 3 2 1 3 2 1 3 2 1 ...
##   ..$ ph : num [1:150] 6.8 6.7 6.7 4.7 4.7 4.9 7.6 6.8 6.9 6.6 ...
##   ..$ p  : num [1:150] 22.51 0.83 0.01 3.89 0.69 ...
##   ..$ k  : num [1:150] 72.24 13.42 7.23 48.13 12.34 ...
##   ..$ ca : num [1:150] 8.27 2.91 2.33 0.97 0.76 0.21 7.63 3.22 2.22 5.54 ...
##   ..$ mg : num [1:150] 1.7 1.77 0.51 0.16 0.14 0 1.53 1.31 1.09 1.77 ...
##   ..$ al : num [1:150] 0 0 0 0.3 0.6 0.6 0 0 0 0 ...
##   ..$ ctc: num [1:150] 12.47 6.57 4.52 5.3 4.17 ...
##   ..$ sat: num [1:150] 81.4 71.7 63.2 23.7 22.4 ...
##   ..$ mo : num [1:150] 72.2 25.6 9.7 34.4 8.7 9.7 50.4 15.6 9.5 50.2 ...
##   ..$ arg: num [1:150] 184 215 286 232 213 ...
##   ..$ are: num [1:150] 770 750 674 741 775 ...
##   ..$ cas: num [1:150] 1.8 2.2 3.7 0.4 1.1 1.7 8.4 19 14.3 5.5 ...
##   ..$ acc: num [1:150] 770 750 676 741 775 ...
##  $ prod  :'data.frame':  50 obs. of  5 variables:
##   ..$ loc : int [1:50] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ alt : num [1:50] 12.9 11.9 22.5 20.4 15.6 ...
##   ..$ dap : num [1:50] 0.194 0.182 0.348 0.304 0.227 ...
##   ..$ vol : num [1:50] 0.2707 0.0963 0.4974 0.4074 0.2165 ...
##   ..$ prod: num [1:50] 68.2 24.3 125.3 102.7 54.6 ...

# Manipular as tabelas para fazer o merge.
da$quimic <- subset(da$quimic, cam == "[0, 5)", select = -cam)
da$hidric <- subset(da$hidric, cam == "[0, 5)", select = -c(cam, cad))
da$prod <- subset(da$prod, select = c(loc, prod))
str(da)

## List of 3
##  $ hidric:'data.frame':  48 obs. of  8 variables:
##   ..$ loc: int [1:48] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ Ur : num [1:48] 0.223 0.131 0.225 0.233 0.172 ...
##   ..$ Us : num [1:48] 0.648 0.697 0.614 0.647 0.844 ...
##   ..$ alp: num [1:48] -0.724 -0.548 -0.707 -0.555 -0.25 ...
##   ..$ n  : num [1:48] 3.59 4.01 2.53 2.91 2.1 ...
##   ..$ I  : num [1:48] 0.815 0.62 0.905 0.7 0.557 ...
##   ..$ Ui : num [1:48] 0.45 0.431 0.44 0.459 0.556 ...
##   ..$ S  : num [1:48] -0.341 -0.515 -0.206 -0.26 -0.278 ...
##  $ quimic:'data.frame':  50 obs. of  14 variables:
##   ..$ loc: int [1:50] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ ph : num [1:50] 6.8 4.7 7.6 6.6 6 5.8 5.9 6.5 5.5 5.8 ...
##   ..$ p  : num [1:50] 22.51 3.89 11.52 26.43 3.79 ...
##   ..$ k  : num [1:50] 72.2 48.1 114.5 54.3 56.8 ...
##   ..$ ca : num [1:50] 8.27 0.97 7.63 5.54 2.99 1.88 1.6 5.82 1.91 1.73 ...
##   ..$ mg : num [1:50] 1.7 0.16 1.53 1.77 2.12 1.53 1.42 1.67 0.21 0.69 ...
##   ..$ al : num [1:50] 0 0.3 0 0 0 0 0 0 0 0.1 ...
##   ..$ ctc: num [1:50] 12.47 5.3 12.41 9.11 7.57 ...
##   ..$ sat: num [1:50] 81.4 23.7 76.1 81.8 69.4 ...
##   ..$ mo : num [1:50] 72.2 34.4 50.4 50.2 39.1 35 28.6 42.3 19.9 24.3 ...
##   ..$ arg: num [1:50] 184 232 193 142 236 ...
##   ..$ are: num [1:50] 770 741 716 790 733 ...
##   ..$ cas: num [1:50] 1.8 0.4 8.4 5.5 5.5 20.9 12.4 2 1.4 0.8 ...
##   ..$ acc: num [1:50] 770 741 718 791 734 ...
##  $ prod  :'data.frame':  50 obs. of  2 variables:
##   ..$ loc : int [1:50] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ prod: num [1:50] 68.2 24.3 125.3 102.7 54.6 ...

# Aplica o merge recursivamente.
teca <- Reduce(f = merge, x = da)

# Elimina a variável identificadora (agora desnecessária).
teca$loc <- NULL

# Estrutura da tabela.
str(teca)

## 'data.frame':    48 obs. of  21 variables:
##  $ Ur  : num  0.223 0.131 0.225 0.233 0.172 ...
##  $ Us  : num  0.648 0.697 0.614 0.647 0.844 ...
##  $ alp : num  -0.724 -0.548 -0.707 -0.555 -0.25 ...
##  $ n   : num  3.59 4.01 2.53 2.91 2.1 ...
##  $ I   : num  0.815 0.62 0.905 0.7 0.557 ...
##  $ Ui  : num  0.45 0.431 0.44 0.459 0.556 ...
##  $ S   : num  -0.341 -0.515 -0.206 -0.26 -0.278 ...
##  $ ph  : num  6.8 4.7 7.6 6.6 6 5.8 5.9 6.5 5.5 5.8 ...
##  $ p   : num  22.51 3.89 11.52 26.43 3.79 ...
##  $ k   : num  72.2 48.1 114.5 54.3 56.8 ...
##  $ ca  : num  8.27 0.97 7.63 5.54 2.99 1.88 1.6 5.82 1.91 1.73 ...
##  $ mg  : num  1.7 0.16 1.53 1.77 2.12 1.53 1.42 1.67 0.21 0.69 ...
##  $ al  : num  0 0.3 0 0 0 0 0 0 0 0.1 ...
##  $ ctc : num  12.47 5.3 12.41 9.11 7.57 ...
##  $ sat : num  81.4 23.7 76.1 81.8 69.4 ...
##  $ mo  : num  72.2 34.4 50.4 50.2 39.1 35 28.6 42.3 19.9 24.3 ...
##  $ arg : num  184 232 193 142 236 ...
##  $ are : num  770 741 716 790 733 ...
##  $ cas : num  1.8 0.4 8.4 5.5 5.5 20.9 12.4 2 1.4 0.8 ...
##  $ acc : num  770 741 718 791 734 ...
##  $ prod: num  68.2 24.3 125.3 102.7 54.6 ...

#-----------------------------------------------------------------------
# Preço de imóveis para 7 bairros em Curitiba.

u <- "http://www.leg.ufpr.br/~walmes/data/ap_venda7bairros_cwb_210314.txt"
ap <- read.table(file = u, header = TRUE, sep = "\t")
# str(ap)

# Usar o log do preço e da metragem.
ap <- transform(ap,
                larea = log10(area),
                lpreco = log10(preco),
                preco = NULL,
                area = NULL)

# Exclui outliers.
# plot(lpreco ~ larea, data = ap)
# dput(with(ap, identify(larea, lpreco)))

ap <- ap[-c(1966L, 2696L, 3267L), ]
ap <- subset(ap, lpreco > 4)
rownames(ap) <- NULL
str(ap)

## 'data.frame':    4467 obs. of  7 variables:
##  $ quartos  : int  2 3 2 2 3 3 3 3 3 3 ...
##  $ banheiros: int  2 2 2 2 2 2 2 5 1 1 ...
##  $ vagas    : int  1 1 2 1 1 1 1 2 1 1 ...
##  $ suites   : int  1 1 1 1 1 1 1 3 1 1 ...
##  $ bairro   : Factor w/ 7 levels "agua-verde","batel",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ larea    : num  1.8 1.88 1.8 1.8 1.88 ...
##  $ lpreco   : num  5.46 5.52 5.57 5.47 5.41 ...

#-----------------------------------------------------------------------
# Análise exploratória.

xyplot(lpreco ~ larea | cut(vagas, c(0:3, Inf)),
       data = ap,
       as.table = TRUE)

xyplot(lpreco ~ larea | bairro,
       data = ap,
       as.table = TRUE) +
    layer(panel.smoother(...))

1.2 Árvore de regressão

#-----------------------------------------------------------------------
# Ajuste de árvore de regressão para os dados de teca.

library(rpart)
library(rpart.plot)
# help(rpart, help_type = "html")

# Ajuste do modelo.
m0 <- rpart(prod ~ ., data = teca)

# Resumo do ajuste.
summary(m0)

## Call:
## rpart(formula = prod ~ ., data = teca)
##   n= 48 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.62742947      0 1.0000000 1.0876071 0.18680894
## 2 0.07594851      1 0.3725705 0.3962829 0.08804663
## 3 0.01000000      2 0.2966220 0.4585990 0.10622713
## 
## Variable importance
##  ca  ph sat ctc are  mo   I   k acc   n 
##  21  18  18  18  11  10   1   1   1   1 
## 
## Node number 1: 48 observations,    complexity param=0.6274295
##   mean=82.81385, MSE=1317.39 
##   left son=2 (19 obs) right son=3 (29 obs)
##   Primary splits:
##       ca  < 4.14      to the left,  improve=0.6274295, (0 missing)
##       ctc < 8.97      to the left,  improve=0.5302188, (0 missing)
##       sat < 72.875    to the left,  improve=0.5157282, (0 missing)
##       ph  < 6.25      to the left,  improve=0.4864565, (0 missing)
##       k   < 85.805    to the left,  improve=0.3785542, (0 missing)
##   Surrogate splits:
##       ctc < 8.97      to the left,  agree=0.938, adj=0.842, (0 split)
##       sat < 72.875    to the left,  agree=0.917, adj=0.789, (0 split)
##       ph  < 6.25      to the left,  agree=0.896, adj=0.737, (0 split)
##       are < 631.25    to the right, agree=0.812, adj=0.526, (0 split)
##       mo  < 37.05     to the left,  agree=0.792, adj=0.474, (0 split)
## 
## Node number 2: 19 observations
##   mean=47.29476, MSE=442.1655 
## 
## Node number 3: 29 observations,    complexity param=0.07594851
##   mean=106.085, MSE=522.6983 
##   left son=6 (18 obs) right son=7 (11 obs)
##   Primary splits:
##       ph  < 6.95      to the left,  improve=0.3168297, (0 missing)
##       cas < 41.4      to the left,  improve=0.3025569, (0 missing)
##       ca  < 8.58      to the left,  improve=0.2635895, (0 missing)
##       ctc < 12.715    to the left,  improve=0.2252505, (0 missing)
##       k   < 101.29    to the left,  improve=0.1968285, (0 missing)
##   Surrogate splits:
##       I   < 0.6788505 to the right, agree=0.793, adj=0.455, (0 split)
##       sat < 86.18     to the left,  agree=0.793, adj=0.455, (0 split)
##       k   < 96.6      to the left,  agree=0.759, adj=0.364, (0 split)
##       n   < 3.908742  to the left,  agree=0.724, adj=0.273, (0 split)
##       acc < 632.105   to the left,  agree=0.724, adj=0.273, (0 split)
## 
## Node number 6: 18 observations
##   mean=96.02497, MSE=160.483 
## 
## Node number 7: 11 observations
##   mean=122.5468, MSE=678.8158

# Visualização da árvore de regressão.
rpart.plot(m0)

# Valores preditos.
predict(m0)

##         1         2         3         4         5         6         7 
##  96.02497  47.29476 122.54682  96.02497  47.29476  47.29476  47.29476 
##         8         9        10        11        12        13        14 
##  96.02497  47.29476  47.29476 122.54682 122.54682  96.02497  47.29476 
##        15        16        17        18        19        20        21 
##  47.29476  47.29476 122.54682  96.02497 122.54682  47.29476  96.02497 
##        22        23        24        25        26        27        28 
##  47.29476 122.54682  47.29476  96.02497 122.54682 122.54682 122.54682 
##        29        30        31        32        33        34        35 
##  96.02497  47.29476  96.02497 122.54682  47.29476  47.29476  47.29476 
##        36        37        38        39        40        41        42 
##  96.02497  47.29476  47.29476  96.02497  96.02497  96.02497  96.02497 
##        43        44        45        46        47        48 
##  96.02497  47.29476 122.54682  96.02497  96.02497  96.02497

# Valores preditos (médias em cada região).
unique(sort(predict(m0)))

## [1]  47.29476  96.02497 122.54682

table(sort(predict(m0)))

## 
## 47.2947619118874 96.0249682912905 122.546820129111 
##               19               18               11

#-----------------------------------------------------------------------
# Deixar a árvore crescer mais.

m1 <- rpart(prod ~ .,
            data = teca,
            control = list(minsplit = 5,
                           cp = 0.001))

rpart.plot(m1)

# Valores preditos (médias em cada região).
unique(sort(predict(m1)))

## [1]  31.66904  58.65893  90.26256 101.78738 122.54682

#-----------------------------------------------------------------------
# Ajuste para os dados de imóveis.

m0 <- rpart(lpreco ~ ., data = ap)

# Resumo do ajuste.
summary(m0)

## Call:
## rpart(formula = lpreco ~ ., data = ap)
##   n= 4467 
## 
##           CP nsplit rel error    xerror        xstd
## 1 0.55273200      0 1.0000000 1.0006471 0.022309188
## 2 0.10528202      1 0.4472680 0.4543668 0.010557382
## 3 0.08675949      2 0.3419860 0.3451494 0.006988209
## 4 0.02481863      3 0.2552265 0.2598557 0.005719558
## 5 0.02041587      4 0.2304079 0.2285848 0.005312121
## 6 0.01628417      5 0.2099920 0.2126618 0.005200752
## 7 0.01526750      6 0.1937078 0.1943037 0.004587582
## 8 0.01073530      7 0.1784403 0.1809174 0.004398348
## 9 0.01000000      8 0.1677050 0.1755710 0.004332548
## 
## Variable importance
##   larea quartos   vagas  bairro  suites 
##      54      21      14      10       1 
## 
## Node number 1: 4467 observations,    complexity param=0.552732
##   mean=5.755241, MSE=0.09402792 
##   left son=2 (2831 obs) right son=3 (1636 obs)
##   Primary splits:
##       larea     < 2.093071 to the left,  improve=0.5527320, (0 missing)
##       vagas     < 2.5      to the left,  improve=0.4074442, (532 missing)
##       quartos   < 3.5      to the left,  improve=0.3332465, (48 missing)
##       suites    < 1.5      to the left,  improve=0.3233594, (1159 missing)
##       banheiros < 2.5      to the left,  improve=0.2814297, (1029 missing)
##   Surrogate splits:
##       quartos < 3.5      to the left,  agree=0.793, adj=0.435, (0 split)
##       bairro  splits as  LRLLLRL,      agree=0.710, adj=0.208, (0 split)
##       vagas   < 2.5      to the left,  agree=0.694, adj=0.163, (0 split)
## 
## Node number 2: 2831 observations,    complexity param=0.08675949
##   mean=5.581937, MSE=0.03292292 
##   left son=4 (1497 obs) right son=5 (1334 obs)
##   Primary splits:
##       larea     < 1.876247 to the left,  improve=0.3909773, (0 missing)
##       vagas     < 1.5      to the left,  improve=0.2576053, (464 missing)
##       quartos   < 1.5      to the left,  improve=0.1730252, (25 missing)
##       banheiros < 1.5      to the left,  improve=0.1709035, (643 missing)
##       bairro    splits as  RRRLLRL,      improve=0.1645266, (0 missing)
##   Surrogate splits:
##       quartos   < 2.5      to the left,  agree=0.786, adj=0.546, (0 split)
##       bairro    splits as  RRRLLRL,      agree=0.636, adj=0.228, (0 split)
##       vagas     < 1.5      to the left,  agree=0.612, adj=0.176, (0 split)
##       banheiros < 1.5      to the left,  agree=0.539, adj=0.021, (0 split)
## 
## Node number 3: 1636 observations,    complexity param=0.105282
##   mean=6.055132, MSE=0.05785938 
##   left son=6 (1154 obs) right son=7 (482 obs)
##   Primary splits:
##       larea     < 2.358287 to the left,  improve=0.4671646, (0 missing)
##       vagas     < 2.5      to the left,  improve=0.4056675, (68 missing)
##       suites    < 3.5      to the left,  improve=0.3204377, (172 missing)
##       banheiros < 5.5      to the left,  improve=0.1420578, (386 missing)
##       quartos   < 3.5      to the left,  improve=0.1340611, (23 missing)
##   Surrogate splits:
##       vagas   < 3.5      to the left,  agree=0.796, adj=0.309, (0 split)
##       suites  < 3.5      to the left,  agree=0.718, adj=0.041, (0 split)
##       quartos < 4.5      to the left,  agree=0.711, adj=0.019, (0 split)
## 
## Node number 4: 1497 observations,    complexity param=0.02041587
##   mean=5.474836, MSE=0.02047765 
##   left son=8 (565 obs) right son=9 (932 obs)
##   Primary splits:
##       larea     < 1.73882  to the left,  improve=0.27972990, (0 missing)
##       quartos   < 1.5      to the left,  improve=0.11830160, (18 missing)
##       banheiros < 1.5      to the left,  improve=0.11754160, (384 missing)
##       bairro    splits as  RRRLLRL,      improve=0.09454728, (0 missing)
##       vagas     < 1.5      to the left,  improve=0.04037166, (375 missing)
##   Surrogate splits:
##       quartos < 1.5      to the left,  agree=0.868, adj=0.651, (0 split)
##       bairro  splits as  RRRLRRR,      agree=0.721, adj=0.262, (0 split)
## 
## Node number 5: 1334 observations,    complexity param=0.0152675
##   mean=5.702124, MSE=0.01957181 
##   left son=10 (527 obs) right son=11 (807 obs)
##   Primary splits:
##       vagas     < 1.5      to the left,  improve=0.27545610, (89 missing)
##       larea     < 1.963929 to the left,  improve=0.18560880, (0 missing)
##       bairro    splits as  RRRLLRL,      improve=0.10414990, (0 missing)
##       banheiros < 2.5      to the left,  improve=0.08565034, (259 missing)
##       suites    < 1.5      to the left,  improve=0.04342607, (218 missing)
##   Surrogate splits:
##       larea  < 1.906657 to the left,  agree=0.645, adj=0.14, (89 split)
##       bairro splits as  RRRLLRR,      agree=0.612, adj=0.06, (0 split)
## 
## Node number 6: 1154 observations,    complexity param=0.02481863
##   mean=5.948879, MSE=0.02474197 
##   left son=12 (777 obs) right son=13 (377 obs)
##   Primary splits:
##       vagas     < 2.5      to the left,  improve=0.33614730, (54 missing)
##       larea     < 2.211788 to the left,  improve=0.28684050, (0 missing)
##       suites    < 1.5      to the left,  improve=0.25779510, (128 missing)
##       bairro    splits as  RRRLLRL,      improve=0.11106200, (0 missing)
##       banheiros < 3.5      to the left,  improve=0.08901843, (253 missing)
##   Surrogate splits:
##       larea  < 2.239298 to the left,  agree=0.73, adj=0.175, (54 split)
##       bairro splits as  LLLLLRL,      agree=0.68, adj=0.022, (0 split)
## 
## Node number 7: 482 observations,    complexity param=0.01628417
##   mean=6.309523, MSE=0.04540429 
##   left son=14 (238 obs) right son=15 (244 obs)
##   Primary splits:
##       vagas     < 3.5      to the left,  improve=0.31065010, (14 missing)
##       suites    < 3.5      to the left,  improve=0.30449720, (44 missing)
##       larea     < 2.54512  to the left,  improve=0.25779660, (0 missing)
##       bairro    splits as  LRRLLRL,      improve=0.08727943, (0 missing)
##       banheiros < 5.5      to the left,  improve=0.08022878, (133 missing)
##   Surrogate splits:
##       larea   < 2.529554 to the left,  agree=0.686, adj=0.352, (14 split)
##       suites  < 3.5      to the left,  agree=0.667, adj=0.313, (0 split)
##       bairro  splits as  LLRLLRL,      agree=0.656, adj=0.291, (0 split)
##       quartos < 3.5      to the left,  agree=0.588, adj=0.150, (0 split)
## 
## Node number 8: 565 observations
##   mean=5.37763, MSE=0.01696386 
## 
## Node number 9: 932 observations
##   mean=5.533765, MSE=0.013407 
## 
## Node number 10: 527 observations
##   mean=5.616327, MSE=0.01477494 
## 
## Node number 11: 807 observations
##   mean=5.758153, MSE=0.01475799 
## 
## Node number 12: 777 observations,    complexity param=0.0107353
##   mean=5.882675, MSE=0.01808852 
##   left son=24 (126 obs) right son=25 (651 obs)
##   Primary splits:
##       vagas     < 1.5      to the left,  improve=0.23951310, (37 missing)
##       larea     < 2.209783 to the left,  improve=0.14406520, (0 missing)
##       suites    < 1.5      to the left,  improve=0.11933820, (90 missing)
##       bairro    splits as  RRRLRRR,      improve=0.11233650, (0 missing)
##       banheiros < 3.5      to the left,  improve=0.05954215, (167 missing)
##   Surrogate splits:
##       bairro splits as  RRRLRRR, agree=0.865, adj=0.099, (37 split)
## 
## Node number 13: 377 observations
##   mean=6.085325, MSE=0.0108039 
## 
## Node number 14: 238 observations
##   mean=6.188907, MSE=0.03177745 
## 
## Node number 15: 244 observations
##   mean=6.427172, MSE=0.03066439 
## 
## Node number 24: 126 observations
##   mean=5.709519, MSE=0.01694193 
## 
## Node number 25: 651 observations
##   mean=5.916189, MSE=0.01138406

# Visualização da árvore de regressão.
rpart.plot(m0)

# Visualização alternativa.
plot(m0)
text(m0)

# Valores preditos (médias em cada região).
unique(sort(predict(m0)))

## [1] 5.377630 5.533765 5.616327 5.709519 5.758153 5.916189 6.085325 6.188907
## [9] 6.427172

# Importância das variáveis.
cbind(m0$variable.importance)

##                  [,1]
## larea     326.4799195
## quartos   128.2119647
## vagas      84.9330547
## bairro     61.7563236
## suites      3.9613041
## banheiros   0.7648777

# Soma de quadrados residual do modelo nulo.
m0$frame$dev[1]                      # SQres do ~1.

## [1] 420.0227

(m0$frame$n[1] - 1) * var(ap$lpreco) # SQres do ~1.

## [1] 420.0227

# R².
1 - sum(residuals(m0)^2)/(m0$frame$dev[1])

## [1] 0.832295

# Criando um grid nas variáveis consideradas pela árvore.
grid <- with(ap,
             expand.grid(larea = seq(min(larea, na.rm = TRUE),
                                     max(larea, na.rm = TRUE),
                                     length.out = 50),
                         vagas = seq(min(vagas, na.rm = TRUE),
                                     max(vagas, na.rm = TRUE),
                                     length.out = 50),
                         bairro = levels(bairro)[1],
                         quartos = median(quartos, na.rm = TRUE),
                         banheiros = median(banheiros, na.rm = TRUE),
                         suites = median(suites, na.rm = TRUE),
                         KEEP.OUT.ATTRS = FALSE))
str(grid)

## 'data.frame':    2500 obs. of  6 variables:
##  $ larea    : num  1.24 1.27 1.31 1.34 1.38 ...
##  $ vagas    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ bairro   : Factor w/ 1 level "agua-verde": 1 1 1 1 1 1 1 1 1 1 ...
##  $ quartos  : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ banheiros: num  2 2 2 2 2 2 2 2 2 2 ...
##  $ suites   : num  1 1 1 1 1 1 1 1 1 1 ...

# Predição para os valores no grid.
grid$y <- predict(m0, newdata = grid)

# Um fator para indicar as diferentes regiões.
yp <- predict(m0, newdata = ap)
yp <- rank(yp, ties.method = "min")
yp <- as.integer(factor(yp))

# Visualização das regiões criadas pelos cortes perpendiculares aos
# eixos.
levelplot(y ~ larea + vagas, data = grid, contour = TRUE) +
    layer(panel.points(larea, vagas, col = yp), data = ap)

# Diagrama de dispersão 3D.
cloud(lpreco ~ larea + vagas, data = ap, col = yp)

# Os patamares.
wireframe(y ~ larea + vagas, data = grid, drape = TRUE)

# Deixar a árvore crescer mais.
m0 <- rpart(lpreco ~ .,
            data = ap,
            control = list(cp = 0.0025))

# Visualização da árvore de regressão.
rpart.plot(m0)

# Visualização alternativa.
plot(m0)
text(m0)

# Valores preditos (médias em cada região).
unique(sort(predict(m0)))

##  [1] 5.302384 5.417912 5.455249 5.563499 5.599947 5.681727 5.709519
##  [8] 5.743817 5.789740 5.886212 5.974917 6.085325 6.124560 6.280833
## [15] 6.344578 6.515365

# Importância das variáveis.
cbind(m0$variable.importance)

##                  [,1]
## larea     333.3563170
## quartos   128.7666179
## vagas      85.4301102
## bairro     65.8767544
## suites      6.2421718
## banheiros   0.7648777

# R².
1 - sum(residuals(m0)^2)/(m0$frame$dev[1])

## [1] 0.8591228

1.3 Árvores de regressão com bagging

# Sempre fazer a predição com esses inputs.
pred <- subset(ap, select = -lpreco)
names(pred)

## [1] "quartos"   "banheiros" "vagas"     "suites"    "bairro"    "larea"

# ID de cada registro.
n <- nrow(ap)
s <- 1:n

# Índices para amostragem com reposição (bootstrap).
i <- sample(s, size = n, replace = TRUE)

# Qual a proporção de registros únicos tomados?
u <- unique(i)
length(u)/n

## [1] 0.6330871

# Quais as observações que ficaram de fora?
out <- which(!(s %in% u))
head(out)

## [1]  3  4  5  6  7 16

tail(out)

## [1] 4446 4447 4456 4458 4463 4465

# Amostra boostrap da tabela.
ap_bs <- ap[i, ]

# Ajuste do modelo.
m_bs <- rpart(lpreco ~ ., data = ap_bs)

# Valores preditos.
y_bs <- predict(m_bs, newdata = pred)
head(y_bs)

##        1        2        3        4        5        6 
## 5.533305 5.533305 5.533305 5.533305 5.533305 5.533305

# Out of bag mean square error.
sum((ap[out, ]$lpreco -
     predict(m_bs, newdata = pred[out, ]))^2)/length(out)

## [1] 0.01614876

#-----------------------------------------------------------------------
# Repetir B vezes.

set.seed(102030)
B <- 200
j <- 1
frac <- numeric(B)
fits <- replicate(B,
                  simplify = FALSE,
                  expr = {
                      # Reamostra com reposição.
                      i <- sample(s, size = n, replace = TRUE)
                      frac[j] <<- length(unique(i))/n
                      j <<- j + 1
                      ap_bs <- ap[i, ]
                      # Ajuste da árvore aos dados de treino.
                      m_bs <- rpart(lpreco ~ ., data = ap_bs)
                      return(m_bs)
                  })

# A proporção de valores usados nas amostras bootstrap.
mean(frac)

## [1] 0.6322084

# Os preditos em cada "ensacamento" dos dados.
pred$y <- sapply(fits, FUN = predict, newdata = pred)
str(pred)

## 'data.frame':    4467 obs. of  7 variables:
##  $ quartos  : int  2 3 2 2 3 3 3 3 3 3 ...
##  $ banheiros: int  2 2 2 2 2 2 2 5 1 1 ...
##  $ vagas    : int  1 1 2 1 1 1 1 2 1 1 ...
##  $ suites   : int  1 1 1 1 1 1 1 3 1 1 ...
##  $ bairro   : Factor w/ 7 levels "agua-verde","batel",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ larea    : num  1.8 1.88 1.8 1.8 1.88 ...
##  $ y        : num [1:4467, 1:200] 5.51 5.6 5.51 5.51 5.6 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr  "1" "2" "3" "4" ...
##   .. ..$ : NULL

# Predição para os primeiros casos.
head(pred)

##   quartos banheiros vagas suites bairro    larea      y.1      y.2
## 1       2         2     1      1 portao 1.799341 5.511545 5.482001
## 2       3         2     1      1 portao 1.875061 5.603164 5.603398
## 3       2         2     2      1 portao 1.797752 5.511545 5.482001
## 4       2         2     1      1 portao 1.798513 5.511545 5.482001
## 5       3         2     1      1 portao 1.875061 5.603164 5.603398
## 6       3         2     1      1 portao 1.875061 5.603164 5.603398
##        y.3      y.4      y.5      y.6      y.7      y.8      y.9     y.10
## 1 5.517266 5.525659 5.417492 5.483622 5.516842 5.421217 5.539943 5.476510
## 2 5.517266 5.601769 5.592200 5.596915 5.516842 5.580467 5.539943 5.588948
## 3 5.517266 5.525659 5.417492 5.483622 5.516842 5.421217 5.539943 5.476510
## 4 5.517266 5.525659 5.417492 5.483622 5.516842 5.421217 5.539943 5.476510
## 5 5.517266 5.601769 5.592200 5.596915 5.516842 5.580467 5.539943 5.588948
## 6 5.517266 5.601769 5.592200 5.596915 5.516842 5.580467 5.539943 5.588948
##       y.11     y.12     y.13     y.14     y.15     y.16     y.17     y.18
## 1 5.530034 5.506683 5.424198 5.517418 5.487971 5.514049 5.525802 5.417721
## 2 5.614376 5.596937 5.589198 5.517418 5.583679 5.514049 5.525802 5.582104
## 3 5.530034 5.506683 5.424198 5.517418 5.487971 5.514049 5.525802 5.417721
## 4 5.530034 5.506683 5.424198 5.517418 5.487971 5.514049 5.525802 5.417721
## 5 5.614376 5.596937 5.589198 5.517418 5.583679 5.514049 5.525802 5.582104
## 6 5.614376 5.596937 5.589198 5.517418 5.583679 5.514049 5.525802 5.582104
##       y.19     y.20     y.21     y.22     y.23     y.24     y.25     y.26
## 1 5.545297 5.423683 5.523571 5.423676 5.418806 5.534867 5.519412 5.531598
## 2 5.545297 5.583326 5.611178 5.589373 5.588978 5.534867 5.609472 5.531598
## 3 5.545297 5.423683 5.523571 5.423676 5.418806 5.534867 5.519412 5.531598
## 4 5.545297 5.423683 5.523571 5.423676 5.418806 5.534867 5.519412 5.531598
## 5 5.545297 5.583326 5.611178 5.589373 5.588978 5.534867 5.609472 5.531598
## 6 5.545297 5.583326 5.611178 5.589373 5.588978 5.534867 5.609472 5.531598
##       y.27     y.28     y.29     y.30     y.31     y.32     y.33     y.34
## 1 5.551241 5.523415 5.491542 5.544313 5.553416 5.481691 5.417659 5.541571
## 2 5.551241 5.598014 5.581981 5.544313 5.553416 5.580614 5.576820 5.541571
## 3 5.551241 5.523415 5.491542 5.544313 5.553416 5.481691 5.417659 5.541571
## 4 5.551241 5.523415 5.491542 5.544313 5.553416 5.481691 5.417659 5.541571
## 5 5.551241 5.598014 5.581981 5.544313 5.553416 5.580614 5.576820 5.541571
## 6 5.551241 5.598014 5.581981 5.544313 5.553416 5.580614 5.576820 5.541571
##       y.35     y.36     y.37     y.38     y.39     y.40     y.41     y.42
## 1 5.414450 5.486812 5.529638 5.489836 5.409920 5.415181 5.544689 5.490350
## 2 5.584425 5.586810 5.529638 5.592012 5.587857 5.583763 5.544689 5.595722
## 3 5.414450 5.486812 5.529638 5.489836 5.409920 5.415181 5.544689 5.490350
## 4 5.414450 5.486812 5.529638 5.489836 5.409920 5.415181 5.544689 5.490350
## 5 5.584425 5.586810 5.529638 5.592012 5.587857 5.583763 5.544689 5.595722
## 6 5.584425 5.586810 5.529638 5.592012 5.587857 5.583763 5.544689 5.595722
##       y.43     y.44     y.45     y.46     y.47     y.48     y.49     y.50
## 1 5.524937 5.549808 5.483643 5.416401 5.520968 5.490118 5.508316 5.491341
## 2 5.524937 5.549808 5.589977 5.590423 5.520968 5.592157 5.619047 5.591869
## 3 5.524937 5.549808 5.483643 5.416401 5.520968 5.490118 5.508316 5.491341
## 4 5.524937 5.549808 5.483643 5.416401 5.520968 5.490118 5.508316 5.491341
## 5 5.524937 5.549808 5.589977 5.590423 5.520968 5.592157 5.619047 5.591869
## 6 5.524937 5.549808 5.589977 5.590423 5.520968 5.592157 5.619047 5.591869
##       y.51     y.52     y.53     y.54     y.55     y.56     y.57     y.58
## 1 5.425375 5.533985 5.417148 5.547536 5.535831 5.477805 5.527240 5.536265
## 2 5.586143 5.533985 5.589006 5.547536 5.599650 5.591776 5.618650 5.536265
## 3 5.425375 5.533985 5.417148 5.547536 5.535831 5.477805 5.527240 5.536265
## 4 5.425375 5.533985 5.417148 5.547536 5.535831 5.477805 5.527240 5.536265
## 5 5.586143 5.533985 5.589006 5.547536 5.599650 5.591776 5.618650 5.536265
## 6 5.586143 5.533985 5.589006 5.547536 5.599650 5.591776 5.618650 5.536265
##       y.59     y.60     y.61     y.62     y.63     y.64     y.65     y.66
## 1 5.502107 5.412507 5.414077 5.530113 5.470907 5.415142 5.464601 5.531353
## 2 5.594393 5.581721 5.575740 5.611105 5.587769 5.590499 5.590026 5.613648
## 3 5.502107 5.412507 5.414077 5.530113 5.470907 5.415142 5.464601 5.531353
## 4 5.502107 5.412507 5.414077 5.530113 5.470907 5.415142 5.464601 5.531353
## 5 5.594393 5.581721 5.575740 5.611105 5.587769 5.590499 5.590026 5.613648
## 6 5.594393 5.581721 5.575740 5.611105 5.587769 5.590499 5.590026 5.613648
##       y.67     y.68     y.69     y.70     y.71     y.72     y.73     y.74
## 1 5.504601 5.535521 5.463580 5.467870 5.427182 5.417166 5.546890 5.542992
## 2 5.595779 5.535521 5.589690 5.580569 5.583744 5.580957 5.546890 5.542992
## 3 5.504601 5.535521 5.463580 5.467870 5.427182 5.417166 5.546890 5.542992
## 4 5.504601 5.535521 5.463580 5.467870 5.427182 5.417166 5.546890 5.542992
## 5 5.595779 5.535521 5.589690 5.580569 5.583744 5.580957 5.546890 5.542992
## 6 5.595779 5.535521 5.589690 5.580569 5.583744 5.580957 5.546890 5.542992
##       y.75     y.76     y.77     y.78     y.79     y.80     y.81     y.82
## 1 5.421072 5.536914 5.412019 5.544779 5.537239 5.522028 5.418187 5.522203
## 2 5.587025 5.536914 5.583175 5.544779 5.537239 5.522028 5.580907 5.617147
## 3 5.421072 5.536914 5.412019 5.544779 5.537239 5.522028 5.418187 5.522203
## 4 5.421072 5.536914 5.412019 5.544779 5.537239 5.522028 5.418187 5.522203
## 5 5.587025 5.536914 5.583175 5.544779 5.537239 5.522028 5.580907 5.617147
## 6 5.587025 5.536914 5.583175 5.544779 5.537239 5.522028 5.580907 5.617147
##       y.83     y.84     y.85     y.86     y.87     y.88     y.89     y.90
## 1 5.555053 5.542352 5.550614 5.533137 5.529942 5.477303 5.410050 5.527185
## 2 5.555053 5.617270 5.550614 5.533137 5.613547 5.596330 5.582510 5.612262
## 3 5.555053 5.542352 5.550614 5.533137 5.529942 5.477303 5.410050 5.527185
## 4 5.555053 5.542352 5.550614 5.533137 5.529942 5.477303 5.410050 5.527185
## 5 5.555053 5.617270 5.550614 5.533137 5.613547 5.596330 5.582510 5.612262
## 6 5.555053 5.617270 5.550614 5.533137 5.613547 5.596330 5.582510 5.612262
##       y.91     y.92     y.93     y.94     y.95     y.96     y.97     y.98
## 1 5.478174 5.547971 5.467239 5.487656 5.413169 5.544515 5.545433 5.426759
## 2 5.583784 5.547971 5.589848 5.589980 5.576590 5.544515 5.545433 5.575836
## 3 5.478174 5.547971 5.467239 5.487656 5.413169 5.544515 5.545433 5.426759
## 4 5.478174 5.547971 5.467239 5.487656 5.413169 5.544515 5.545433 5.426759
## 5 5.583784 5.547971 5.589848 5.589980 5.576590 5.544515 5.545433 5.575836
## 6 5.583784 5.547971 5.589848 5.589980 5.576590 5.544515 5.545433 5.575836
##       y.99    y.100    y.101    y.102    y.103    y.104    y.105    y.106
## 1 5.525721 5.415982 5.463120 5.532691 5.412937 5.417542 5.544046 5.533738
## 2 5.614626 5.578604 5.575322 5.532691 5.577159 5.578380 5.544046 5.533738
## 3 5.525721 5.415982 5.463120 5.532691 5.412937 5.417542 5.544046 5.533738
## 4 5.525721 5.415982 5.463120 5.532691 5.412937 5.417542 5.544046 5.533738
## 5 5.614626 5.578604 5.575322 5.532691 5.577159 5.578380 5.544046 5.533738
## 6 5.614626 5.578604 5.575322 5.532691 5.577159 5.578380 5.544046 5.533738
##      y.107    y.108    y.109    y.110    y.111    y.112    y.113    y.114
## 1 5.506219 5.532807 5.537179 5.537178 5.506948 5.487108 5.416805 5.535733
## 2 5.595802 5.532807 5.623284 5.537178 5.613605 5.589655 5.574140 5.535733
## 3 5.506219 5.532807 5.537179 5.537178 5.506948 5.487108 5.416805 5.535733
## 4 5.506219 5.532807 5.537179 5.537178 5.506948 5.487108 5.416805 5.535733
## 5 5.595802 5.532807 5.623284 5.537178 5.613605 5.589655 5.574140 5.535733
## 6 5.595802 5.532807 5.623284 5.537178 5.613605 5.589655 5.574140 5.535733
##      y.115    y.116    y.117    y.118    y.119    y.120    y.121    y.122
## 1 5.423623 5.541694 5.515342 5.535200 5.535836 5.418830 5.489185 5.540095
## 2 5.582237 5.614404 5.612191 5.535200 5.535836 5.575769 5.589099 5.540095
## 3 5.423623 5.541694 5.515342 5.535200 5.535836 5.418830 5.489185 5.540095
## 4 5.423623 5.541694 5.515342 5.535200 5.535836 5.418830 5.489185 5.540095
## 5 5.582237 5.614404 5.612191 5.535200 5.535836 5.575769 5.589099 5.540095
## 6 5.582237 5.614404 5.612191 5.535200 5.535836 5.575769 5.589099 5.540095
##      y.123    y.124    y.125    y.126    y.127    y.128    y.129    y.130
## 1 5.517462 5.545816 5.412221 5.515359 5.425266 5.489404 5.509599 5.566150
## 2 5.611375 5.545816 5.590053 5.515359 5.582281 5.587766 5.611086 5.566150
## 3 5.517462 5.545816 5.412221 5.515359 5.425266 5.489404 5.509599 5.416422
## 4 5.517462 5.545816 5.412221 5.515359 5.425266 5.489404 5.509599 5.416422
## 5 5.611375 5.545816 5.590053 5.515359 5.582281 5.587766 5.611086 5.566150
## 6 5.611375 5.545816 5.590053 5.515359 5.582281 5.587766 5.611086 5.566150
##      y.131    y.132    y.133    y.134    y.135    y.136    y.137    y.138
## 1 5.488369 5.467402 5.512339 5.455007 5.492202 5.459637 5.507670 5.420021
## 2 5.599230 5.576309 5.594607 5.578229 5.581169 5.579355 5.588433 5.589116
## 3 5.488369 5.467402 5.512339 5.455007 5.492202 5.459637 5.507670 5.420021
## 4 5.488369 5.467402 5.512339 5.455007 5.492202 5.459637 5.507670 5.420021
## 5 5.599230 5.576309 5.594607 5.578229 5.581169 5.579355 5.588433 5.589116
## 6 5.599230 5.576309 5.594607 5.578229 5.581169 5.579355 5.588433 5.589116
##      y.139    y.140    y.141    y.142    y.143    y.144    y.145    y.146
## 1 5.417134 5.474799 5.534011 5.524157 5.546098 5.534172 5.506131 5.482230
## 2 5.577193 5.585907 5.534011 5.524157 5.546098 5.534172 5.588870 5.591275
## 3 5.417134 5.474799 5.534011 5.524157 5.546098 5.534172 5.506131 5.482230
## 4 5.417134 5.474799 5.534011 5.524157 5.546098 5.534172 5.506131 5.482230
## 5 5.577193 5.585907 5.534011 5.524157 5.546098 5.534172 5.588870 5.591275
## 6 5.577193 5.585907 5.534011 5.524157 5.546098 5.534172 5.588870 5.591275
##      y.147    y.148    y.149    y.150    y.151    y.152    y.153    y.154
## 1 5.508418 5.535394 5.531539 5.516210 5.424957 5.539019 5.541125 5.423453
## 2 5.508418 5.535394 5.531539 5.516210 5.579407 5.539019 5.541125 5.583511
## 3 5.508418 5.535394 5.531539 5.516210 5.424957 5.539019 5.541125 5.423453
## 4 5.508418 5.535394 5.531539 5.516210 5.424957 5.539019 5.541125 5.423453
## 5 5.508418 5.535394 5.531539 5.516210 5.579407 5.539019 5.541125 5.583511
## 6 5.508418 5.535394 5.531539 5.516210 5.579407 5.539019 5.541125 5.583511
##      y.155    y.156    y.157    y.158    y.159    y.160    y.161    y.162
## 1 5.532995 5.483591 5.528017 5.483539 5.519586 5.534374 5.427006 5.467505
## 2 5.532995 5.599904 5.528017 5.591774 5.519586 5.534374 5.578182 5.579520
## 3 5.532995 5.483591 5.528017 5.483539 5.519586 5.534374 5.427006 5.467505
## 4 5.532995 5.483591 5.528017 5.483539 5.519586 5.534374 5.427006 5.467505
## 5 5.532995 5.599904 5.528017 5.591774 5.519586 5.534374 5.578182 5.579520
## 6 5.532995 5.599904 5.528017 5.591774 5.519586 5.534374 5.578182 5.579520
##      y.163    y.164    y.165    y.166    y.167    y.168    y.169    y.170
## 1 5.468895 5.485674 5.542385 5.535081 5.554859 5.414541 5.415862 5.485898
## 2 5.586162 5.605892 5.542385 5.535081 5.554859 5.582213 5.580096 5.588619
## 3 5.468895 5.485674 5.542385 5.535081 5.554859 5.414541 5.415862 5.485898
## 4 5.468895 5.485674 5.542385 5.535081 5.554859 5.414541 5.415862 5.485898
## 5 5.586162 5.605892 5.542385 5.535081 5.554859 5.582213 5.580096 5.588619
## 6 5.586162 5.605892 5.542385 5.535081 5.554859 5.582213 5.580096 5.588619
##      y.171    y.172    y.173    y.174    y.175    y.176    y.177    y.178
## 1 5.523244 5.519855 5.418982 5.534619 5.532754 5.549795 5.501383 5.488853
## 2 5.523244 5.519855 5.579163 5.534619 5.532754 5.549795 5.603655 5.585438
## 3 5.523244 5.519855 5.418982 5.534619 5.532754 5.549795 5.501383 5.488853
## 4 5.523244 5.519855 5.418982 5.534619 5.532754 5.549795 5.501383 5.488853
## 5 5.523244 5.519855 5.579163 5.534619 5.532754 5.549795 5.603655 5.585438
## 6 5.523244 5.519855 5.579163 5.534619 5.532754 5.549795 5.603655 5.585438
##      y.179    y.180    y.181    y.182    y.183    y.184    y.185    y.186
## 1 5.547167 5.535195 5.468392 5.525500 5.420115 5.549078 5.495446 5.537746
## 2 5.547167 5.535195 5.580059 5.525500 5.590836 5.549078 5.586831 5.537746
## 3 5.547167 5.535195 5.468392 5.525500 5.420115 5.549078 5.495446 5.537746
## 4 5.547167 5.535195 5.468392 5.525500 5.420115 5.549078 5.495446 5.537746
## 5 5.547167 5.535195 5.580059 5.525500 5.590836 5.549078 5.586831 5.537746
## 6 5.547167 5.535195 5.580059 5.525500 5.590836 5.549078 5.586831 5.537746
##      y.187    y.188    y.189    y.190    y.191    y.192    y.193    y.194
## 1 5.552061 5.532770 5.538103 5.492900 5.529299 5.538471 5.539225 5.547166
## 2 5.552061 5.532770 5.538103 5.599009 5.609372 5.538471 5.539225 5.547166
## 3 5.552061 5.532770 5.538103 5.492900 5.529299 5.538471 5.539225 5.547166
## 4 5.552061 5.532770 5.538103 5.492900 5.529299 5.538471 5.539225 5.547166
## 5 5.552061 5.532770 5.538103 5.599009 5.609372 5.538471 5.539225 5.547166
## 6 5.552061 5.532770 5.538103 5.599009 5.609372 5.538471 5.539225 5.547166
##      y.195    y.196    y.197    y.198    y.199    y.200
## 1 5.421518 5.540621 5.487981 5.525663 5.415265 5.539568
## 2 5.582236 5.606974 5.600313 5.525663 5.585280 5.539568
## 3 5.421518 5.540621 5.487981 5.525663 5.415265 5.539568
## 4 5.421518 5.540621 5.487981 5.525663 5.415265 5.539568
## 5 5.582236 5.606974 5.600313 5.525663 5.585280 5.539568
## 6 5.582236 5.606974 5.600313 5.525663 5.585280 5.539568

# Estatísticas para o B valores preditos para alguns casos.
mean(as.vector(pred[1, "y"]))

## [1] 5.495819

var(as.vector(pred[1, "y"]))

## [1] 0.002245619

# O predito médio ("a sabedoria das multidões").
pred$ym <- rowMeans(pred$y)

# Predito contra observado.
# x11()
xyplot(pred$ym ~ ap$lpreco, aspect = "iso") +
    layer(panel.abline(a = 0, b = 1))

# Qual o predito para o imóvel mediano?
new <- lapply(pred,
              FUN = function(x) {
                  if (is.numeric(x)) {
                      median(x, na.rm = TRUE)
                  } else {
                      levels(x)[1]
                  }
              })
new

## $quartos
## [1] 3
## 
## $banheiros
## [1] 2
## 
## $vagas
## [1] 2
## 
## $suites
## [1] 1
## 
## $bairro
## [1] "agua-verde"
## 
## $larea
## [1] 2
## 
## $y
## [1] 5.734044
## 
## $ym
## [1] 5.750674

# Predito por cada árvore.
y <- sapply(fits, FUN = predict, newdata = new)

# Distribuição dos B valores preditos e valor médio.
plot(density(y))
rug(y)
m <- mean(y)
abline(v = m, col = 2)

## [1] 5.752909

library(ipred)

# help(package = "ipred", help_type = "html")

# Fazendo bagging.
bg <- bagging(lpreco ~ ., data = ap, nbagg = 200, coob = TRUE)
bg

## 
## Bagging regression trees with 200 bootstrap replications 
## 
## Call: bagging.data.frame(formula = lpreco ~ ., data = ap, nbagg = 200, 
##     coob = TRUE)
## 
## Out-of-bag estimate of root mean squared error:  0.1048

# Predito contra observado.
xyplot(predict(bg, newdata = ap) ~ ap$lpreco, aspect = "iso")

1.4 Árvores de regressão com random forests

#-----------------------------------------------------------------------
# Prototipando.

xvars <- names(teca)[1:20]
nv <- floor(length(xvars)/3)

# Seleciona variáveis preditoras.
v <- sample(xvars, size = nv, replace = FALSE)
v

## [1] "p"   "al"  "n"   "Ur"  "ca"  "acc"

# Seleciona registros.
n <- nrow(teca)
i <- sample(1:n, size = n, replace = TRUE)

# Ajusta o modelo.
m0 <- rpart(prod ~ .,
            data = teca[i, c(v, "prod")],
            control = list(minsplit = 3,
                           cp = 0.001))
m0

## n= 48 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 48 52391.7900  79.30677  
##    2) ca< 3.21 12  3050.7400  33.67443 *
##    3) ca>=3.21 36 16024.0800  94.51754  
##      6) ca< 8.58 27  8981.3940  87.90780  
##       12) n>=2.316487 19  2654.9530  81.34304  
##         24) n< 2.700025 9   630.5857  76.51017 *
##         25) n>=2.700025 10  1624.9690  85.69262 *
##       13) n< 2.316487 8  3562.9050 103.49910 *
##      7) ca>=8.58 9  2324.3050 114.34680 *

# Visualiza.
rpart.plot(m0)

# Replicar.
set.seed(302010)
B <- 1000
rf <- replicate(B,
                simplify = FALSE,
                expr = {
                    v <- sample(xvars, size = nv, replace = FALSE)
                    i <- sample(1:n, size = n, replace = TRUE)
                    m0 <- rpart(prod ~ .,
                                data = teca[i, c(v, "prod")],
                                control = list(minsplit = 3,
                                               cp = 0.01))
                    return(m0)
                })

# ATTENTION: na árvore de regressão, o sorteio das variáveis é feito
# após cada split e não uma única vez como o que está neste código.

# Obtenção dos preditos.
y_rf <- sapply(rf, FUN = predict, newdata = teca)
head(y_rf[, 1:6])

##        [,1]      [,2]      [,3]      [,4]      [,5]      [,6]
## 1 120.42047  86.03186 148.18260  88.03894 125.74756  89.45850
## 2  27.36223  45.04491  42.23671  33.10333  31.73443  36.10144
## 3 120.42047 132.64916 120.09817  88.03894  60.18041 126.74866
## 4  63.93345  97.72206  92.24409 102.20512  99.91257  89.45850
## 5  63.93345  45.04491  42.23671  49.02340  60.18041  71.36626
## 6  27.36223  45.04491  42.23671  33.10333  31.73443  36.10144

# Cálculo da média.
ym <- rowMeans(y_rf)

xyplot(ym ~ teca$prod,
       aspect = "iso",
       type = c("p", "smooth")) +
    layer(panel.abline(a = 0, b = 1))

# Correlação entre observado e predito.
cor(ym, teca$prod)

## [1] 0.8953036

#-----------------------------------------------------------------------

library(randomForest)

rf <- randomForest(prod ~ .,
                   data = teca,
                   ntree = 3,
                   mtry = 2,
                   keep.inbag = TRUE,
                   keep.forest = TRUE)
rf

## 
## Call:
##  randomForest(formula = prod ~ ., data = teca, ntree = 3, mtry = 2,      keep.inbag = TRUE, keep.forest = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 3
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 1161.826
##                     % Var explained: 11.81

1 - sum(rf$oob.times == 0)/length(rf$oob.times)

## [1] 0.7083333

str(rf)

## List of 18
##  $ call           : language randomForest(formula = prod ~ ., data = teca, ntree = 3, mtry = 2,      keep.inbag = TRUE, keep.forest = TRUE)
##  $ type           : chr "regression"
##  $ predicted      : Named num [1:48] 92.1 28.9 84.3 NA 59.1 ...
##   ..- attr(*, "names")= chr [1:48] "1" "2" "3" "4" ...
##  $ mse            : num [1:3] 1166 1270 1162
##  $ rsq            : num [1:3] 0.115 0.036 0.118
##  $ oob.times      : int [1:48] 1 1 1 0 1 1 0 0 1 0 ...
##  $ importance     : num [1:20, 1] 866.7 55.2 2243.9 0 368.8 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:20] "Ur" "Us" "alp" "n" ...
##   .. ..$ : chr "IncNodePurity"
##  $ importanceSD   : NULL
##  $ localImportance: NULL
##  $ proximity      : NULL
##  $ ntree          : num 3
##  $ mtry           : num 2
##  $ forest         :List of 11
##   ..$ ndbigtree    : int [1:3] 31 33 35
##   ..$ nodestatus   : int [1:35, 1:3] -3 -3 -3 -3 -3 -1 -3 -1 -1 -1 ...
##   ..$ leftDaughter : int [1:35, 1:3] 2 4 6 8 10 0 12 0 0 0 ...
##   ..$ rightDaughter: int [1:35, 1:3] 3 5 7 9 11 0 13 0 0 0 ...
##   ..$ nodepred     : num [1:35, 1:3] 88.6 41.5 108 48.2 36.4 ...
##   ..$ bestvar      : int [1:35, 1:3] 11 5 20 6 5 0 8 0 0 0 ...
##   ..$ xbestsplit   : num [1:35, 1:3] 3.145 0.604 438.77 0.477 0.667 ...
##   ..$ ncat         : Named int [1:20] 1 1 1 1 1 1 1 1 1 1 ...
##   .. ..- attr(*, "names")= chr [1:20] "Ur" "Us" "alp" "n" ...
##   ..$ nrnodes      : int 35
##   ..$ ntree        : num 3
##   ..$ xlevels      :List of 20
##   .. ..$ Ur : num 0
##   .. ..$ Us : num 0
##   .. ..$ alp: num 0
##   .. ..$ n  : num 0
##   .. ..$ I  : num 0
##   .. ..$ Ui : num 0
##   .. ..$ S  : num 0
##   .. ..$ ph : num 0
##   .. ..$ p  : num 0
##   .. ..$ k  : num 0
##   .. ..$ ca : num 0
##   .. ..$ mg : num 0
##   .. ..$ al : num 0
##   .. ..$ ctc: num 0
##   .. ..$ sat: num 0
##   .. ..$ mo : num 0
##   .. ..$ arg: num 0
##   .. ..$ are: num 0
##   .. ..$ cas: num 0
##   .. ..$ acc: num 0
##  $ coefs          : NULL
##  $ y              : Named num [1:48] 68.2 24.3 125.3 102.7 54.6 ...
##   ..- attr(*, "names")= chr [1:48] "1" "2" "3" "4" ...
##  $ test           : NULL
##  $ inbag          : int [1:48, 1:3] 2 1 2 1 1 1 1 1 2 1 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:48] "1" "2" "3" "4" ...
##   .. ..$ : NULL
##  $ terms          :Classes 'terms', 'formula'  language prod ~ Ur + Us + alp + n + I + Ui + S + ph + p + k + ca + mg + al +      ctc + sat + mo + arg + are + cas + acc
##   .. ..- attr(*, "variables")= language list(prod, Ur, Us, alp, n, I, Ui, S, ph, p, k, ca, mg, al, ctc, sat,      mo, arg, are, cas, acc)
##   .. ..- attr(*, "factors")= int [1:21, 1:20] 0 1 0 0 0 0 0 0 0 0 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:21] "prod" "Ur" "Us" "alp" ...
##   .. .. .. ..$ : chr [1:20] "Ur" "Us" "alp" "n" ...
##   .. ..- attr(*, "term.labels")= chr [1:20] "Ur" "Us" "alp" "n" ...
##   .. ..- attr(*, "order")= int [1:20] 1 1 1 1 1 1 1 1 1 1 ...
##   .. ..- attr(*, "intercept")= num 0
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(prod, Ur, Us, alp, n, I, Ui, S, ph, p, k, ca, mg, al, ctc, sat,      mo, arg, are, cas, acc)
##   .. ..- attr(*, "dataClasses")= Named chr [1:21] "numeric" "numeric" "numeric" "numeric" ...
##   .. .. ..- attr(*, "names")= chr [1:21] "prod" "Ur" "Us" "alp" ...
##  - attr(*, "class")= chr [1:2] "randomForest.formula" "randomForest"

# Número de vezes que cada variável foi usada em cada bag.
head(rf$inbag)

##   [,1] [,2] [,3]
## 1    2    0    1
## 2    1    0    1
## 3    2    0    2
## 4    1    1    3
## 5    1    1    0
## 6    1    1    0

# Valores preditos.
sort(unique(predict(rf)))

##  [1]  25.48482  28.89996  44.40381  56.02661  59.11806  62.78211  63.90061
##  [8]  68.94633  71.36430  71.57095  79.79300  84.33689  91.26764  92.09106
## [15]  98.10957  99.46530 100.46795 105.44807 108.87186 121.28472 121.47778
## [22] 123.38324 130.45181 136.58047

# Inspecionando uma das árvores da floresta.
t <- 1
one_tree <- getTree(rf, k = t, labelVar = TRUE)
nrow(one_tree)

## [1] 31

rf$forest$ndbigtree[t]

## [1] 31

one_tree

##    left daughter right daughter split var split point status prediction
## 1              2              3        ca   3.1450000     -3   88.61266
## 2              4              5         I   0.6043793     -3   41.46031
## 3              6              7       acc 438.7700000     -3  108.02833
## 4              8              9        Ui   0.4772272     -3   48.19479
## 5             10             11         I   0.6669049     -3   36.40944
## 6              0              0      <NA>   0.0000000     -1  170.76465
## 7             12             13        ph   6.8500000     -3   99.66349
## 8              0              0      <NA>   0.0000000     -1   29.94343
## 9              0              0      <NA>   0.0000000     -1   66.44616
## 10             0              0      <NA>   0.0000000     -1   22.58625
## 11            14             15        Ui   0.4738361     -3   41.01717
## 12            16             17       are 711.5000000     -3   90.98453
## 13            18             19         p  52.9000000     -3  114.65443
## 14             0              0      <NA>   0.0000000     -1   47.97021
## 15             0              0      <NA>   0.0000000     -1   37.54065
## 16            20             21         p   2.5350000     -3   94.54145
## 17            22             23       alp  -0.7142416     -3   83.27785
## 18            24             25        Ur   0.1850338     -3  117.94994
## 19             0              0      <NA>   0.0000000     -1   81.69934
## 20             0              0      <NA>   0.0000000     -1  109.64509
## 21            26             27        Us   0.7230838     -3   91.79534
## 22             0              0      <NA>   0.0000000     -1   68.22631
## 23             0              0      <NA>   0.0000000     -1   90.80361
## 24             0              0      <NA>   0.0000000     -1  105.44807
## 25             0              0      <NA>   0.0000000     -1  130.45181
## 26            28             29       cas  32.0500000     -3   93.02273
## 27             0              0      <NA>   0.0000000     -1   79.52139
## 28            30             31        ph   6.4500000     -3   88.72769
## 29             0              0      <NA>   0.0000000     -1   99.46530
## 30             0              0      <NA>   0.0000000     -1   70.95099
## 31             0              0      <NA>   0.0000000     -1   92.28303

# Outra forma de acessar as variáveis usadas.
rf$forest$bestvar[, t]

##  [1] 11  5 20  6  5  0  8  0  0  0  6 18  9  0  0  9  3  1  0  0  2  0  0
## [24]  0  0 19  0  8  0  0  0  0  0  0  0

names(rf$forest$xlevels)[rf$forest$bestvar[, t]]

##  [1] "ca"  "I"   "acc" "Ui"  "I"   "ph"  "Ui"  "are" "p"   "p"   "alp"
## [12] "Ur"  "Us"  "cas" "ph"

#-----------------------------------------------------------------------
# help(package = "randomForest", help_type = "html")

rf <- randomForest(prod ~ .,
                   data = teca,
                   ntree = B,
                   mtry = nv)
rf

## 
## Call:
##  randomForest(formula = prod ~ ., data = teca, ntree = B, mtry = nv) 
##                Type of random forest: regression
##                      Number of trees: 1000
## No. of variables tried at each split: 6
## 
##           Mean of squared residuals: 632.4727
##                     % Var explained: 51.99

# Valores preditos.
yp <- predict(rf)

# Correlação entre observado e predito.
cor(yp, teca$prod)

## [1] 0.7210519

xyplot(ym + yp ~ teca$prod,
       aspect = "iso",
       auto.key = TRUE,
       type = c("p", "smooth")) +
    layer(panel.abline(a = 0, b = 1))

# ATTENTION: a randomForest() criou árvores mais profundas que o código
# didático feito algumas linhas acima. Além do mais, as `m` variáveis
# são sorteadas a cada split e não uma vez apenas.

# o <- order(yp)
# xyplot(ym[o] + yp[o] ~ seq_along(ym), auto.key = TRUE)

Para construir o gráfico de uma árvore, leia esse post: https://stats.stackexchange.com/questions/41443/how-to-actually-plot-a-sample-tree-from-randomforestgettree.

#-----------------------------------------------------------------------
# Dados de imóveis.

nv <- floor(sqrt(ncol(ap) - 1))
nv

## [1] 2

ap2 <- na.omit(ap)

rf <- randomForest(lpreco ~ .,
                   data = ap2,
                   ntree = 100,
                   mtry = nv)
rf

## 
## Call:
##  randomForest(formula = lpreco ~ ., data = ap2, ntree = 100, mtry = nv) 
##                Type of random forest: regression
##                      Number of trees: 100
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 0.007040021
##                     % Var explained: 90.87

# Valores preditos.
yp <- predict(rf)

cor(yp, ap2$lpreco)

## [1] 0.9533648

# Correlação entre observado e predito.
xyplot(yp ~ ap2$lpreco,
       aspect = "iso",
       auto.key = TRUE,
       type = c("p", "smooth")) +
    layer(panel.abline(a = 0, b = 1))

importance(rf, type = 2)

##           IncNodePurity
## quartos        7.321537
## banheiros     12.829150
## vagas         50.273498
## suites        29.342710
## bairro         6.591223
## larea         86.955501

im <- importance(rf, type = 2)
100 * im/sum(im)

##           IncNodePurity
## quartos        3.787388
## banheiros      6.636444
## vagas         26.006185
## suites        15.178812
## bairro         3.409601
## larea         44.981570

varImpPlot(rf)

1.5 Árvore de regressão com boosting

Veja as ilustrações aqui para entender: https://medium.com/mlreview/gradient-boosting-from-scratch-1e317ae4587d.

Visite também gradiente descendente boosting.

library(gbm)

# help(package = "gbm", help_type = "html")

rb <- gbm(lpreco ~ .,
          data = ap,
          distribution = "gaussian",
          n.trees = 100,
          shrinkage = 0.05,
          interaction.depth = 1,
          train.fraction = 0.7,
          n.minobsinnode = 10,
          cv.folds = 3,
          keep.data = TRUE,
          verbose = FALSE,
          n.cores = 1)

rb

## gbm(formula = lpreco ~ ., distribution = "gaussian", data = ap, 
##     n.trees = 100, interaction.depth = 1, n.minobsinnode = 10, 
##     shrinkage = 0.05, train.fraction = 0.7, cv.folds = 3, keep.data = TRUE, 
##     verbose = FALSE, n.cores = 1)
## A gradient boosted model with gaussian loss function.
## 100 iterations were performed.
## The best cross-validation iteration was 100.
## The best test-set iteration was 100.
## There were 6 predictors of which 5 had non-zero influence.

summary(rb)

##                 var    rel.inf
## larea         larea 54.0322474
## vagas         vagas 27.3236736
## suites       suites 17.0719486
## bairro       bairro  1.2286939
## banheiros banheiros  0.3434365
## quartos     quartos  0.0000000

# Valores preditos.
yp <- predict(rb)

## Using 100 trees...

# Correlação entre observado e predito.
cor(yp, ap$lpreco)

## [1] 0.930845

xyplot(yp ~ ap$lpreco,
       aspect = "iso",
       auto.key = TRUE,
       type = c("p", "smooth")) +
    latticeExtra::layer(panel.abline(a = 0, b = 1))

Machine Learning

`leg.ufpr.br/~walmes/ensino/ML`

Métodos baseados em árvores

Prof. Walmes M. Zeviani & Prof. Eduardo V. Ferreira

2017-09-28

1 Métodos para árvores de regressão

1.1 Carregar os dados

1.2 Árvore de regressão

1.3 Árvores de regressão com bagging

1.4 Árvores de regressão com random forests

1.5 Árvore de regressão com boosting

2 Métodos para árvores de classificação