Machine Learning
|
rm(list = objects())
library(lattice)
library(latticeExtra)
#-----------------------------------------------------------------------
# Produção de teca.
# Endereço das tabelas.
pre <- "https://raw.githubusercontent.com/walmes/EACS/master/data-raw/"
files <- c(hidric = "teca_crapar.csv",
quimic = "teca_qui.csv",
prod = "teca_arv.csv")
urls <- paste0(pre, files)
names(urls) <- names(files)
# Lista com as tabelas.
da <- sapply(urls,
FUN = read.table,
header = TRUE,
sep = ";",
simplify = FALSE)
str(da)
## List of 3
## $ hidric:'data.frame': 141 obs. of 10 variables:
## ..$ loc: int [1:141] 1 1 1 2 2 2 3 3 3 4 ...
## ..$ cam: Factor w/ 3 levels "[0, 5)","[40, 80)",..: 3 2 1 1 3 2 1 3 2 3 ...
## ..$ Ur : num [1:141] 0.178 0.188 0.223 0.131 0.165 ...
## ..$ Us : num [1:141] 0.58 0.647 0.648 0.697 0.573 ...
## ..$ alp: num [1:141] -0.833 -0.722 -0.724 -0.548 -0.547 ...
## ..$ n : num [1:141] 3.11 3.25 3.59 4.01 2.92 ...
## ..$ I : num [1:141] 0.957 0.835 0.815 0.62 0.691 ...
## ..$ Ui : num [1:141] 0.396 0.435 0.45 0.431 0.387 ...
## ..$ S : num [1:141] -0.273 -0.329 -0.341 -0.515 -0.257 ...
## ..$ cad: num [1:141] 0.217 0.247 0.227 0.3 0.222 ...
## $ quimic:'data.frame': 150 obs. of 15 variables:
## ..$ loc: int [1:150] 1 1 1 2 2 2 3 3 3 4 ...
## ..$ cam: Factor w/ 3 levels "[0, 5)","[40, 80)",..: 1 3 2 1 3 2 1 3 2 1 ...
## ..$ ph : num [1:150] 6.8 6.7 6.7 4.7 4.7 4.9 7.6 6.8 6.9 6.6 ...
## ..$ p : num [1:150] 22.51 0.83 0.01 3.89 0.69 ...
## ..$ k : num [1:150] 72.24 13.42 7.23 48.13 12.34 ...
## ..$ ca : num [1:150] 8.27 2.91 2.33 0.97 0.76 0.21 7.63 3.22 2.22 5.54 ...
## ..$ mg : num [1:150] 1.7 1.77 0.51 0.16 0.14 0 1.53 1.31 1.09 1.77 ...
## ..$ al : num [1:150] 0 0 0 0.3 0.6 0.6 0 0 0 0 ...
## ..$ ctc: num [1:150] 12.47 6.57 4.52 5.3 4.17 ...
## ..$ sat: num [1:150] 81.4 71.7 63.2 23.7 22.4 ...
## ..$ mo : num [1:150] 72.2 25.6 9.7 34.4 8.7 9.7 50.4 15.6 9.5 50.2 ...
## ..$ arg: num [1:150] 184 215 286 232 213 ...
## ..$ are: num [1:150] 770 750 674 741 775 ...
## ..$ cas: num [1:150] 1.8 2.2 3.7 0.4 1.1 1.7 8.4 19 14.3 5.5 ...
## ..$ acc: num [1:150] 770 750 676 741 775 ...
## $ prod :'data.frame': 50 obs. of 5 variables:
## ..$ loc : int [1:50] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ alt : num [1:50] 12.9 11.9 22.5 20.4 15.6 ...
## ..$ dap : num [1:50] 0.194 0.182 0.348 0.304 0.227 ...
## ..$ vol : num [1:50] 0.2707 0.0963 0.4974 0.4074 0.2165 ...
## ..$ prod: num [1:50] 68.2 24.3 125.3 102.7 54.6 ...
# Manipular as tabelas para fazer o merge.
da$quimic <- subset(da$quimic, cam == "[0, 5)", select = -cam)
da$hidric <- subset(da$hidric, cam == "[0, 5)", select = -c(cam, cad))
da$prod <- subset(da$prod, select = c(loc, prod))
str(da)
## List of 3
## $ hidric:'data.frame': 48 obs. of 8 variables:
## ..$ loc: int [1:48] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ Ur : num [1:48] 0.223 0.131 0.225 0.233 0.172 ...
## ..$ Us : num [1:48] 0.648 0.697 0.614 0.647 0.844 ...
## ..$ alp: num [1:48] -0.724 -0.548 -0.707 -0.555 -0.25 ...
## ..$ n : num [1:48] 3.59 4.01 2.53 2.91 2.1 ...
## ..$ I : num [1:48] 0.815 0.62 0.905 0.7 0.557 ...
## ..$ Ui : num [1:48] 0.45 0.431 0.44 0.459 0.556 ...
## ..$ S : num [1:48] -0.341 -0.515 -0.206 -0.26 -0.278 ...
## $ quimic:'data.frame': 50 obs. of 14 variables:
## ..$ loc: int [1:50] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ ph : num [1:50] 6.8 4.7 7.6 6.6 6 5.8 5.9 6.5 5.5 5.8 ...
## ..$ p : num [1:50] 22.51 3.89 11.52 26.43 3.79 ...
## ..$ k : num [1:50] 72.2 48.1 114.5 54.3 56.8 ...
## ..$ ca : num [1:50] 8.27 0.97 7.63 5.54 2.99 1.88 1.6 5.82 1.91 1.73 ...
## ..$ mg : num [1:50] 1.7 0.16 1.53 1.77 2.12 1.53 1.42 1.67 0.21 0.69 ...
## ..$ al : num [1:50] 0 0.3 0 0 0 0 0 0 0 0.1 ...
## ..$ ctc: num [1:50] 12.47 5.3 12.41 9.11 7.57 ...
## ..$ sat: num [1:50] 81.4 23.7 76.1 81.8 69.4 ...
## ..$ mo : num [1:50] 72.2 34.4 50.4 50.2 39.1 35 28.6 42.3 19.9 24.3 ...
## ..$ arg: num [1:50] 184 232 193 142 236 ...
## ..$ are: num [1:50] 770 741 716 790 733 ...
## ..$ cas: num [1:50] 1.8 0.4 8.4 5.5 5.5 20.9 12.4 2 1.4 0.8 ...
## ..$ acc: num [1:50] 770 741 718 791 734 ...
## $ prod :'data.frame': 50 obs. of 2 variables:
## ..$ loc : int [1:50] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ prod: num [1:50] 68.2 24.3 125.3 102.7 54.6 ...
# Aplica o merge recursivamente.
teca <- Reduce(f = merge, x = da)
# Elimina a variável identificadora (agora desnecessária).
teca$loc <- NULL
# Estrutura da tabela.
str(teca)
## 'data.frame': 48 obs. of 21 variables:
## $ Ur : num 0.223 0.131 0.225 0.233 0.172 ...
## $ Us : num 0.648 0.697 0.614 0.647 0.844 ...
## $ alp : num -0.724 -0.548 -0.707 -0.555 -0.25 ...
## $ n : num 3.59 4.01 2.53 2.91 2.1 ...
## $ I : num 0.815 0.62 0.905 0.7 0.557 ...
## $ Ui : num 0.45 0.431 0.44 0.459 0.556 ...
## $ S : num -0.341 -0.515 -0.206 -0.26 -0.278 ...
## $ ph : num 6.8 4.7 7.6 6.6 6 5.8 5.9 6.5 5.5 5.8 ...
## $ p : num 22.51 3.89 11.52 26.43 3.79 ...
## $ k : num 72.2 48.1 114.5 54.3 56.8 ...
## $ ca : num 8.27 0.97 7.63 5.54 2.99 1.88 1.6 5.82 1.91 1.73 ...
## $ mg : num 1.7 0.16 1.53 1.77 2.12 1.53 1.42 1.67 0.21 0.69 ...
## $ al : num 0 0.3 0 0 0 0 0 0 0 0.1 ...
## $ ctc : num 12.47 5.3 12.41 9.11 7.57 ...
## $ sat : num 81.4 23.7 76.1 81.8 69.4 ...
## $ mo : num 72.2 34.4 50.4 50.2 39.1 35 28.6 42.3 19.9 24.3 ...
## $ arg : num 184 232 193 142 236 ...
## $ are : num 770 741 716 790 733 ...
## $ cas : num 1.8 0.4 8.4 5.5 5.5 20.9 12.4 2 1.4 0.8 ...
## $ acc : num 770 741 718 791 734 ...
## $ prod: num 68.2 24.3 125.3 102.7 54.6 ...
#-----------------------------------------------------------------------
# Preço de imóveis para 7 bairros em Curitiba.
u <- "http://www.leg.ufpr.br/~walmes/data/ap_venda7bairros_cwb_210314.txt"
ap <- read.table(file = u, header = TRUE, sep = "\t")
# str(ap)
# Usar o log do preço e da metragem.
ap <- transform(ap,
larea = log10(area),
lpreco = log10(preco),
preco = NULL,
area = NULL)
# Exclui outliers.
# plot(lpreco ~ larea, data = ap)
# dput(with(ap, identify(larea, lpreco)))
ap <- ap[-c(1966L, 2696L, 3267L), ]
ap <- subset(ap, lpreco > 4)
rownames(ap) <- NULL
str(ap)
## 'data.frame': 4467 obs. of 7 variables:
## $ quartos : int 2 3 2 2 3 3 3 3 3 3 ...
## $ banheiros: int 2 2 2 2 2 2 2 5 1 1 ...
## $ vagas : int 1 1 2 1 1 1 1 2 1 1 ...
## $ suites : int 1 1 1 1 1 1 1 3 1 1 ...
## $ bairro : Factor w/ 7 levels "agua-verde","batel",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ larea : num 1.8 1.88 1.8 1.8 1.88 ...
## $ lpreco : num 5.46 5.52 5.57 5.47 5.41 ...
#-----------------------------------------------------------------------
# Análise exploratória.
xyplot(lpreco ~ larea | cut(vagas, c(0:3, Inf)),
data = ap,
as.table = TRUE)
xyplot(lpreco ~ larea | bairro,
data = ap,
as.table = TRUE) +
layer(panel.smoother(...))
#-----------------------------------------------------------------------
# Ajuste de árvore de regressão para os dados de teca.
library(rpart)
library(rpart.plot)
# help(rpart, help_type = "html")
# Ajuste do modelo.
m0 <- rpart(prod ~ ., data = teca)
# Resumo do ajuste.
summary(m0)
## Call:
## rpart(formula = prod ~ ., data = teca)
## n= 48
##
## CP nsplit rel error xerror xstd
## 1 0.62742947 0 1.0000000 1.0876071 0.18680894
## 2 0.07594851 1 0.3725705 0.3962829 0.08804663
## 3 0.01000000 2 0.2966220 0.4585990 0.10622713
##
## Variable importance
## ca ph sat ctc are mo I k acc n
## 21 18 18 18 11 10 1 1 1 1
##
## Node number 1: 48 observations, complexity param=0.6274295
## mean=82.81385, MSE=1317.39
## left son=2 (19 obs) right son=3 (29 obs)
## Primary splits:
## ca < 4.14 to the left, improve=0.6274295, (0 missing)
## ctc < 8.97 to the left, improve=0.5302188, (0 missing)
## sat < 72.875 to the left, improve=0.5157282, (0 missing)
## ph < 6.25 to the left, improve=0.4864565, (0 missing)
## k < 85.805 to the left, improve=0.3785542, (0 missing)
## Surrogate splits:
## ctc < 8.97 to the left, agree=0.938, adj=0.842, (0 split)
## sat < 72.875 to the left, agree=0.917, adj=0.789, (0 split)
## ph < 6.25 to the left, agree=0.896, adj=0.737, (0 split)
## are < 631.25 to the right, agree=0.812, adj=0.526, (0 split)
## mo < 37.05 to the left, agree=0.792, adj=0.474, (0 split)
##
## Node number 2: 19 observations
## mean=47.29476, MSE=442.1655
##
## Node number 3: 29 observations, complexity param=0.07594851
## mean=106.085, MSE=522.6983
## left son=6 (18 obs) right son=7 (11 obs)
## Primary splits:
## ph < 6.95 to the left, improve=0.3168297, (0 missing)
## cas < 41.4 to the left, improve=0.3025569, (0 missing)
## ca < 8.58 to the left, improve=0.2635895, (0 missing)
## ctc < 12.715 to the left, improve=0.2252505, (0 missing)
## k < 101.29 to the left, improve=0.1968285, (0 missing)
## Surrogate splits:
## I < 0.6788505 to the right, agree=0.793, adj=0.455, (0 split)
## sat < 86.18 to the left, agree=0.793, adj=0.455, (0 split)
## k < 96.6 to the left, agree=0.759, adj=0.364, (0 split)
## n < 3.908742 to the left, agree=0.724, adj=0.273, (0 split)
## acc < 632.105 to the left, agree=0.724, adj=0.273, (0 split)
##
## Node number 6: 18 observations
## mean=96.02497, MSE=160.483
##
## Node number 7: 11 observations
## mean=122.5468, MSE=678.8158
# Visualização da árvore de regressão.
rpart.plot(m0)
# Valores preditos.
predict(m0)
## 1 2 3 4 5 6 7
## 96.02497 47.29476 122.54682 96.02497 47.29476 47.29476 47.29476
## 8 9 10 11 12 13 14
## 96.02497 47.29476 47.29476 122.54682 122.54682 96.02497 47.29476
## 15 16 17 18 19 20 21
## 47.29476 47.29476 122.54682 96.02497 122.54682 47.29476 96.02497
## 22 23 24 25 26 27 28
## 47.29476 122.54682 47.29476 96.02497 122.54682 122.54682 122.54682
## 29 30 31 32 33 34 35
## 96.02497 47.29476 96.02497 122.54682 47.29476 47.29476 47.29476
## 36 37 38 39 40 41 42
## 96.02497 47.29476 47.29476 96.02497 96.02497 96.02497 96.02497
## 43 44 45 46 47 48
## 96.02497 47.29476 122.54682 96.02497 96.02497 96.02497
# Valores preditos (médias em cada região).
unique(sort(predict(m0)))
## [1] 47.29476 96.02497 122.54682
table(sort(predict(m0)))
##
## 47.2947619118874 96.0249682912905 122.546820129111
## 19 18 11
#-----------------------------------------------------------------------
# Deixar a árvore crescer mais.
m1 <- rpart(prod ~ .,
data = teca,
control = list(minsplit = 5,
cp = 0.001))
rpart.plot(m1)
# Valores preditos (médias em cada região).
unique(sort(predict(m1)))
## [1] 31.66904 58.65893 90.26256 101.78738 122.54682
#-----------------------------------------------------------------------
# Ajuste para os dados de imóveis.
m0 <- rpart(lpreco ~ ., data = ap)
# Resumo do ajuste.
summary(m0)
## Call:
## rpart(formula = lpreco ~ ., data = ap)
## n= 4467
##
## CP nsplit rel error xerror xstd
## 1 0.55273200 0 1.0000000 1.0006471 0.022309188
## 2 0.10528202 1 0.4472680 0.4543668 0.010557382
## 3 0.08675949 2 0.3419860 0.3451494 0.006988209
## 4 0.02481863 3 0.2552265 0.2598557 0.005719558
## 5 0.02041587 4 0.2304079 0.2285848 0.005312121
## 6 0.01628417 5 0.2099920 0.2126618 0.005200752
## 7 0.01526750 6 0.1937078 0.1943037 0.004587582
## 8 0.01073530 7 0.1784403 0.1809174 0.004398348
## 9 0.01000000 8 0.1677050 0.1755710 0.004332548
##
## Variable importance
## larea quartos vagas bairro suites
## 54 21 14 10 1
##
## Node number 1: 4467 observations, complexity param=0.552732
## mean=5.755241, MSE=0.09402792
## left son=2 (2831 obs) right son=3 (1636 obs)
## Primary splits:
## larea < 2.093071 to the left, improve=0.5527320, (0 missing)
## vagas < 2.5 to the left, improve=0.4074442, (532 missing)
## quartos < 3.5 to the left, improve=0.3332465, (48 missing)
## suites < 1.5 to the left, improve=0.3233594, (1159 missing)
## banheiros < 2.5 to the left, improve=0.2814297, (1029 missing)
## Surrogate splits:
## quartos < 3.5 to the left, agree=0.793, adj=0.435, (0 split)
## bairro splits as LRLLLRL, agree=0.710, adj=0.208, (0 split)
## vagas < 2.5 to the left, agree=0.694, adj=0.163, (0 split)
##
## Node number 2: 2831 observations, complexity param=0.08675949
## mean=5.581937, MSE=0.03292292
## left son=4 (1497 obs) right son=5 (1334 obs)
## Primary splits:
## larea < 1.876247 to the left, improve=0.3909773, (0 missing)
## vagas < 1.5 to the left, improve=0.2576053, (464 missing)
## quartos < 1.5 to the left, improve=0.1730252, (25 missing)
## banheiros < 1.5 to the left, improve=0.1709035, (643 missing)
## bairro splits as RRRLLRL, improve=0.1645266, (0 missing)
## Surrogate splits:
## quartos < 2.5 to the left, agree=0.786, adj=0.546, (0 split)
## bairro splits as RRRLLRL, agree=0.636, adj=0.228, (0 split)
## vagas < 1.5 to the left, agree=0.612, adj=0.176, (0 split)
## banheiros < 1.5 to the left, agree=0.539, adj=0.021, (0 split)
##
## Node number 3: 1636 observations, complexity param=0.105282
## mean=6.055132, MSE=0.05785938
## left son=6 (1154 obs) right son=7 (482 obs)
## Primary splits:
## larea < 2.358287 to the left, improve=0.4671646, (0 missing)
## vagas < 2.5 to the left, improve=0.4056675, (68 missing)
## suites < 3.5 to the left, improve=0.3204377, (172 missing)
## banheiros < 5.5 to the left, improve=0.1420578, (386 missing)
## quartos < 3.5 to the left, improve=0.1340611, (23 missing)
## Surrogate splits:
## vagas < 3.5 to the left, agree=0.796, adj=0.309, (0 split)
## suites < 3.5 to the left, agree=0.718, adj=0.041, (0 split)
## quartos < 4.5 to the left, agree=0.711, adj=0.019, (0 split)
##
## Node number 4: 1497 observations, complexity param=0.02041587
## mean=5.474836, MSE=0.02047765
## left son=8 (565 obs) right son=9 (932 obs)
## Primary splits:
## larea < 1.73882 to the left, improve=0.27972990, (0 missing)
## quartos < 1.5 to the left, improve=0.11830160, (18 missing)
## banheiros < 1.5 to the left, improve=0.11754160, (384 missing)
## bairro splits as RRRLLRL, improve=0.09454728, (0 missing)
## vagas < 1.5 to the left, improve=0.04037166, (375 missing)
## Surrogate splits:
## quartos < 1.5 to the left, agree=0.868, adj=0.651, (0 split)
## bairro splits as RRRLRRR, agree=0.721, adj=0.262, (0 split)
##
## Node number 5: 1334 observations, complexity param=0.0152675
## mean=5.702124, MSE=0.01957181
## left son=10 (527 obs) right son=11 (807 obs)
## Primary splits:
## vagas < 1.5 to the left, improve=0.27545610, (89 missing)
## larea < 1.963929 to the left, improve=0.18560880, (0 missing)
## bairro splits as RRRLLRL, improve=0.10414990, (0 missing)
## banheiros < 2.5 to the left, improve=0.08565034, (259 missing)
## suites < 1.5 to the left, improve=0.04342607, (218 missing)
## Surrogate splits:
## larea < 1.906657 to the left, agree=0.645, adj=0.14, (89 split)
## bairro splits as RRRLLRR, agree=0.612, adj=0.06, (0 split)
##
## Node number 6: 1154 observations, complexity param=0.02481863
## mean=5.948879, MSE=0.02474197
## left son=12 (777 obs) right son=13 (377 obs)
## Primary splits:
## vagas < 2.5 to the left, improve=0.33614730, (54 missing)
## larea < 2.211788 to the left, improve=0.28684050, (0 missing)
## suites < 1.5 to the left, improve=0.25779510, (128 missing)
## bairro splits as RRRLLRL, improve=0.11106200, (0 missing)
## banheiros < 3.5 to the left, improve=0.08901843, (253 missing)
## Surrogate splits:
## larea < 2.239298 to the left, agree=0.73, adj=0.175, (54 split)
## bairro splits as LLLLLRL, agree=0.68, adj=0.022, (0 split)
##
## Node number 7: 482 observations, complexity param=0.01628417
## mean=6.309523, MSE=0.04540429
## left son=14 (238 obs) right son=15 (244 obs)
## Primary splits:
## vagas < 3.5 to the left, improve=0.31065010, (14 missing)
## suites < 3.5 to the left, improve=0.30449720, (44 missing)
## larea < 2.54512 to the left, improve=0.25779660, (0 missing)
## bairro splits as LRRLLRL, improve=0.08727943, (0 missing)
## banheiros < 5.5 to the left, improve=0.08022878, (133 missing)
## Surrogate splits:
## larea < 2.529554 to the left, agree=0.686, adj=0.352, (14 split)
## suites < 3.5 to the left, agree=0.667, adj=0.313, (0 split)
## bairro splits as LLRLLRL, agree=0.656, adj=0.291, (0 split)
## quartos < 3.5 to the left, agree=0.588, adj=0.150, (0 split)
##
## Node number 8: 565 observations
## mean=5.37763, MSE=0.01696386
##
## Node number 9: 932 observations
## mean=5.533765, MSE=0.013407
##
## Node number 10: 527 observations
## mean=5.616327, MSE=0.01477494
##
## Node number 11: 807 observations
## mean=5.758153, MSE=0.01475799
##
## Node number 12: 777 observations, complexity param=0.0107353
## mean=5.882675, MSE=0.01808852
## left son=24 (126 obs) right son=25 (651 obs)
## Primary splits:
## vagas < 1.5 to the left, improve=0.23951310, (37 missing)
## larea < 2.209783 to the left, improve=0.14406520, (0 missing)
## suites < 1.5 to the left, improve=0.11933820, (90 missing)
## bairro splits as RRRLRRR, improve=0.11233650, (0 missing)
## banheiros < 3.5 to the left, improve=0.05954215, (167 missing)
## Surrogate splits:
## bairro splits as RRRLRRR, agree=0.865, adj=0.099, (37 split)
##
## Node number 13: 377 observations
## mean=6.085325, MSE=0.0108039
##
## Node number 14: 238 observations
## mean=6.188907, MSE=0.03177745
##
## Node number 15: 244 observations
## mean=6.427172, MSE=0.03066439
##
## Node number 24: 126 observations
## mean=5.709519, MSE=0.01694193
##
## Node number 25: 651 observations
## mean=5.916189, MSE=0.01138406
# Visualização da árvore de regressão.
rpart.plot(m0)
# Visualização alternativa.
plot(m0)
text(m0)
# Valores preditos (médias em cada região).
unique(sort(predict(m0)))
## [1] 5.377630 5.533765 5.616327 5.709519 5.758153 5.916189 6.085325 6.188907
## [9] 6.427172
# Importância das variáveis.
cbind(m0$variable.importance)
## [,1]
## larea 326.4799195
## quartos 128.2119647
## vagas 84.9330547
## bairro 61.7563236
## suites 3.9613041
## banheiros 0.7648777
# Soma de quadrados residual do modelo nulo.
m0$frame$dev[1] # SQres do ~1.
## [1] 420.0227
(m0$frame$n[1] - 1) * var(ap$lpreco) # SQres do ~1.
## [1] 420.0227
# R².
1 - sum(residuals(m0)^2)/(m0$frame$dev[1])
## [1] 0.832295
# Criando um grid nas variáveis consideradas pela árvore.
grid <- with(ap,
expand.grid(larea = seq(min(larea, na.rm = TRUE),
max(larea, na.rm = TRUE),
length.out = 50),
vagas = seq(min(vagas, na.rm = TRUE),
max(vagas, na.rm = TRUE),
length.out = 50),
bairro = levels(bairro)[1],
quartos = median(quartos, na.rm = TRUE),
banheiros = median(banheiros, na.rm = TRUE),
suites = median(suites, na.rm = TRUE),
KEEP.OUT.ATTRS = FALSE))
str(grid)
## 'data.frame': 2500 obs. of 6 variables:
## $ larea : num 1.24 1.27 1.31 1.34 1.38 ...
## $ vagas : num 1 1 1 1 1 1 1 1 1 1 ...
## $ bairro : Factor w/ 1 level "agua-verde": 1 1 1 1 1 1 1 1 1 1 ...
## $ quartos : int 3 3 3 3 3 3 3 3 3 3 ...
## $ banheiros: num 2 2 2 2 2 2 2 2 2 2 ...
## $ suites : num 1 1 1 1 1 1 1 1 1 1 ...
# Predição para os valores no grid.
grid$y <- predict(m0, newdata = grid)
# Um fator para indicar as diferentes regiões.
yp <- predict(m0, newdata = ap)
yp <- rank(yp, ties.method = "min")
yp <- as.integer(factor(yp))
# Visualização das regiões criadas pelos cortes perpendiculares aos
# eixos.
levelplot(y ~ larea + vagas, data = grid, contour = TRUE) +
layer(panel.points(larea, vagas, col = yp), data = ap)
# Diagrama de dispersão 3D.
cloud(lpreco ~ larea + vagas, data = ap, col = yp)
# Os patamares.
wireframe(y ~ larea + vagas, data = grid, drape = TRUE)
# Deixar a árvore crescer mais.
m0 <- rpart(lpreco ~ .,
data = ap,
control = list(cp = 0.0025))
# Visualização da árvore de regressão.
rpart.plot(m0)
# Visualização alternativa.
plot(m0)
text(m0)
# Valores preditos (médias em cada região).
unique(sort(predict(m0)))
## [1] 5.302384 5.417912 5.455249 5.563499 5.599947 5.681727 5.709519
## [8] 5.743817 5.789740 5.886212 5.974917 6.085325 6.124560 6.280833
## [15] 6.344578 6.515365
# Importância das variáveis.
cbind(m0$variable.importance)
## [,1]
## larea 333.3563170
## quartos 128.7666179
## vagas 85.4301102
## bairro 65.8767544
## suites 6.2421718
## banheiros 0.7648777
# R².
1 - sum(residuals(m0)^2)/(m0$frame$dev[1])
## [1] 0.8591228
# Sempre fazer a predição com esses inputs.
pred <- subset(ap, select = -lpreco)
names(pred)
## [1] "quartos" "banheiros" "vagas" "suites" "bairro" "larea"
# ID de cada registro.
n <- nrow(ap)
s <- 1:n
# Índices para amostragem com reposição (bootstrap).
i <- sample(s, size = n, replace = TRUE)
# Qual a proporção de registros únicos tomados?
u <- unique(i)
length(u)/n
## [1] 0.6330871
# Quais as observações que ficaram de fora?
out <- which(!(s %in% u))
head(out)
## [1] 3 4 5 6 7 16
tail(out)
## [1] 4446 4447 4456 4458 4463 4465
# Amostra boostrap da tabela.
ap_bs <- ap[i, ]
# Ajuste do modelo.
m_bs <- rpart(lpreco ~ ., data = ap_bs)
# Valores preditos.
y_bs <- predict(m_bs, newdata = pred)
head(y_bs)
## 1 2 3 4 5 6
## 5.533305 5.533305 5.533305 5.533305 5.533305 5.533305
# Out of bag mean square error.
sum((ap[out, ]$lpreco -
predict(m_bs, newdata = pred[out, ]))^2)/length(out)
## [1] 0.01614876
#-----------------------------------------------------------------------
# Repetir B vezes.
set.seed(102030)
B <- 200
j <- 1
frac <- numeric(B)
fits <- replicate(B,
simplify = FALSE,
expr = {
# Reamostra com reposição.
i <- sample(s, size = n, replace = TRUE)
frac[j] <<- length(unique(i))/n
j <<- j + 1
ap_bs <- ap[i, ]
# Ajuste da árvore aos dados de treino.
m_bs <- rpart(lpreco ~ ., data = ap_bs)
return(m_bs)
})
# A proporção de valores usados nas amostras bootstrap.
mean(frac)
## [1] 0.6322084
# Os preditos em cada "ensacamento" dos dados.
pred$y <- sapply(fits, FUN = predict, newdata = pred)
str(pred)
## 'data.frame': 4467 obs. of 7 variables:
## $ quartos : int 2 3 2 2 3 3 3 3 3 3 ...
## $ banheiros: int 2 2 2 2 2 2 2 5 1 1 ...
## $ vagas : int 1 1 2 1 1 1 1 2 1 1 ...
## $ suites : int 1 1 1 1 1 1 1 3 1 1 ...
## $ bairro : Factor w/ 7 levels "agua-verde","batel",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ larea : num 1.8 1.88 1.8 1.8 1.88 ...
## $ y : num [1:4467, 1:200] 5.51 5.6 5.51 5.51 5.6 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr "1" "2" "3" "4" ...
## .. ..$ : NULL
# Predição para os primeiros casos.
head(pred)
## quartos banheiros vagas suites bairro larea y.1 y.2
## 1 2 2 1 1 portao 1.799341 5.511545 5.482001
## 2 3 2 1 1 portao 1.875061 5.603164 5.603398
## 3 2 2 2 1 portao 1.797752 5.511545 5.482001
## 4 2 2 1 1 portao 1.798513 5.511545 5.482001
## 5 3 2 1 1 portao 1.875061 5.603164 5.603398
## 6 3 2 1 1 portao 1.875061 5.603164 5.603398
## y.3 y.4 y.5 y.6 y.7 y.8 y.9 y.10
## 1 5.517266 5.525659 5.417492 5.483622 5.516842 5.421217 5.539943 5.476510
## 2 5.517266 5.601769 5.592200 5.596915 5.516842 5.580467 5.539943 5.588948
## 3 5.517266 5.525659 5.417492 5.483622 5.516842 5.421217 5.539943 5.476510
## 4 5.517266 5.525659 5.417492 5.483622 5.516842 5.421217 5.539943 5.476510
## 5 5.517266 5.601769 5.592200 5.596915 5.516842 5.580467 5.539943 5.588948
## 6 5.517266 5.601769 5.592200 5.596915 5.516842 5.580467 5.539943 5.588948
## y.11 y.12 y.13 y.14 y.15 y.16 y.17 y.18
## 1 5.530034 5.506683 5.424198 5.517418 5.487971 5.514049 5.525802 5.417721
## 2 5.614376 5.596937 5.589198 5.517418 5.583679 5.514049 5.525802 5.582104
## 3 5.530034 5.506683 5.424198 5.517418 5.487971 5.514049 5.525802 5.417721
## 4 5.530034 5.506683 5.424198 5.517418 5.487971 5.514049 5.525802 5.417721
## 5 5.614376 5.596937 5.589198 5.517418 5.583679 5.514049 5.525802 5.582104
## 6 5.614376 5.596937 5.589198 5.517418 5.583679 5.514049 5.525802 5.582104
## y.19 y.20 y.21 y.22 y.23 y.24 y.25 y.26
## 1 5.545297 5.423683 5.523571 5.423676 5.418806 5.534867 5.519412 5.531598
## 2 5.545297 5.583326 5.611178 5.589373 5.588978 5.534867 5.609472 5.531598
## 3 5.545297 5.423683 5.523571 5.423676 5.418806 5.534867 5.519412 5.531598
## 4 5.545297 5.423683 5.523571 5.423676 5.418806 5.534867 5.519412 5.531598
## 5 5.545297 5.583326 5.611178 5.589373 5.588978 5.534867 5.609472 5.531598
## 6 5.545297 5.583326 5.611178 5.589373 5.588978 5.534867 5.609472 5.531598
## y.27 y.28 y.29 y.30 y.31 y.32 y.33 y.34
## 1 5.551241 5.523415 5.491542 5.544313 5.553416 5.481691 5.417659 5.541571
## 2 5.551241 5.598014 5.581981 5.544313 5.553416 5.580614 5.576820 5.541571
## 3 5.551241 5.523415 5.491542 5.544313 5.553416 5.481691 5.417659 5.541571
## 4 5.551241 5.523415 5.491542 5.544313 5.553416 5.481691 5.417659 5.541571
## 5 5.551241 5.598014 5.581981 5.544313 5.553416 5.580614 5.576820 5.541571
## 6 5.551241 5.598014 5.581981 5.544313 5.553416 5.580614 5.576820 5.541571
## y.35 y.36 y.37 y.38 y.39 y.40 y.41 y.42
## 1 5.414450 5.486812 5.529638 5.489836 5.409920 5.415181 5.544689 5.490350
## 2 5.584425 5.586810 5.529638 5.592012 5.587857 5.583763 5.544689 5.595722
## 3 5.414450 5.486812 5.529638 5.489836 5.409920 5.415181 5.544689 5.490350
## 4 5.414450 5.486812 5.529638 5.489836 5.409920 5.415181 5.544689 5.490350
## 5 5.584425 5.586810 5.529638 5.592012 5.587857 5.583763 5.544689 5.595722
## 6 5.584425 5.586810 5.529638 5.592012 5.587857 5.583763 5.544689 5.595722
## y.43 y.44 y.45 y.46 y.47 y.48 y.49 y.50
## 1 5.524937 5.549808 5.483643 5.416401 5.520968 5.490118 5.508316 5.491341
## 2 5.524937 5.549808 5.589977 5.590423 5.520968 5.592157 5.619047 5.591869
## 3 5.524937 5.549808 5.483643 5.416401 5.520968 5.490118 5.508316 5.491341
## 4 5.524937 5.549808 5.483643 5.416401 5.520968 5.490118 5.508316 5.491341
## 5 5.524937 5.549808 5.589977 5.590423 5.520968 5.592157 5.619047 5.591869
## 6 5.524937 5.549808 5.589977 5.590423 5.520968 5.592157 5.619047 5.591869
## y.51 y.52 y.53 y.54 y.55 y.56 y.57 y.58
## 1 5.425375 5.533985 5.417148 5.547536 5.535831 5.477805 5.527240 5.536265
## 2 5.586143 5.533985 5.589006 5.547536 5.599650 5.591776 5.618650 5.536265
## 3 5.425375 5.533985 5.417148 5.547536 5.535831 5.477805 5.527240 5.536265
## 4 5.425375 5.533985 5.417148 5.547536 5.535831 5.477805 5.527240 5.536265
## 5 5.586143 5.533985 5.589006 5.547536 5.599650 5.591776 5.618650 5.536265
## 6 5.586143 5.533985 5.589006 5.547536 5.599650 5.591776 5.618650 5.536265
## y.59 y.60 y.61 y.62 y.63 y.64 y.65 y.66
## 1 5.502107 5.412507 5.414077 5.530113 5.470907 5.415142 5.464601 5.531353
## 2 5.594393 5.581721 5.575740 5.611105 5.587769 5.590499 5.590026 5.613648
## 3 5.502107 5.412507 5.414077 5.530113 5.470907 5.415142 5.464601 5.531353
## 4 5.502107 5.412507 5.414077 5.530113 5.470907 5.415142 5.464601 5.531353
## 5 5.594393 5.581721 5.575740 5.611105 5.587769 5.590499 5.590026 5.613648
## 6 5.594393 5.581721 5.575740 5.611105 5.587769 5.590499 5.590026 5.613648
## y.67 y.68 y.69 y.70 y.71 y.72 y.73 y.74
## 1 5.504601 5.535521 5.463580 5.467870 5.427182 5.417166 5.546890 5.542992
## 2 5.595779 5.535521 5.589690 5.580569 5.583744 5.580957 5.546890 5.542992
## 3 5.504601 5.535521 5.463580 5.467870 5.427182 5.417166 5.546890 5.542992
## 4 5.504601 5.535521 5.463580 5.467870 5.427182 5.417166 5.546890 5.542992
## 5 5.595779 5.535521 5.589690 5.580569 5.583744 5.580957 5.546890 5.542992
## 6 5.595779 5.535521 5.589690 5.580569 5.583744 5.580957 5.546890 5.542992
## y.75 y.76 y.77 y.78 y.79 y.80 y.81 y.82
## 1 5.421072 5.536914 5.412019 5.544779 5.537239 5.522028 5.418187 5.522203
## 2 5.587025 5.536914 5.583175 5.544779 5.537239 5.522028 5.580907 5.617147
## 3 5.421072 5.536914 5.412019 5.544779 5.537239 5.522028 5.418187 5.522203
## 4 5.421072 5.536914 5.412019 5.544779 5.537239 5.522028 5.418187 5.522203
## 5 5.587025 5.536914 5.583175 5.544779 5.537239 5.522028 5.580907 5.617147
## 6 5.587025 5.536914 5.583175 5.544779 5.537239 5.522028 5.580907 5.617147
## y.83 y.84 y.85 y.86 y.87 y.88 y.89 y.90
## 1 5.555053 5.542352 5.550614 5.533137 5.529942 5.477303 5.410050 5.527185
## 2 5.555053 5.617270 5.550614 5.533137 5.613547 5.596330 5.582510 5.612262
## 3 5.555053 5.542352 5.550614 5.533137 5.529942 5.477303 5.410050 5.527185
## 4 5.555053 5.542352 5.550614 5.533137 5.529942 5.477303 5.410050 5.527185
## 5 5.555053 5.617270 5.550614 5.533137 5.613547 5.596330 5.582510 5.612262
## 6 5.555053 5.617270 5.550614 5.533137 5.613547 5.596330 5.582510 5.612262
## y.91 y.92 y.93 y.94 y.95 y.96 y.97 y.98
## 1 5.478174 5.547971 5.467239 5.487656 5.413169 5.544515 5.545433 5.426759
## 2 5.583784 5.547971 5.589848 5.589980 5.576590 5.544515 5.545433 5.575836
## 3 5.478174 5.547971 5.467239 5.487656 5.413169 5.544515 5.545433 5.426759
## 4 5.478174 5.547971 5.467239 5.487656 5.413169 5.544515 5.545433 5.426759
## 5 5.583784 5.547971 5.589848 5.589980 5.576590 5.544515 5.545433 5.575836
## 6 5.583784 5.547971 5.589848 5.589980 5.576590 5.544515 5.545433 5.575836
## y.99 y.100 y.101 y.102 y.103 y.104 y.105 y.106
## 1 5.525721 5.415982 5.463120 5.532691 5.412937 5.417542 5.544046 5.533738
## 2 5.614626 5.578604 5.575322 5.532691 5.577159 5.578380 5.544046 5.533738
## 3 5.525721 5.415982 5.463120 5.532691 5.412937 5.417542 5.544046 5.533738
## 4 5.525721 5.415982 5.463120 5.532691 5.412937 5.417542 5.544046 5.533738
## 5 5.614626 5.578604 5.575322 5.532691 5.577159 5.578380 5.544046 5.533738
## 6 5.614626 5.578604 5.575322 5.532691 5.577159 5.578380 5.544046 5.533738
## y.107 y.108 y.109 y.110 y.111 y.112 y.113 y.114
## 1 5.506219 5.532807 5.537179 5.537178 5.506948 5.487108 5.416805 5.535733
## 2 5.595802 5.532807 5.623284 5.537178 5.613605 5.589655 5.574140 5.535733
## 3 5.506219 5.532807 5.537179 5.537178 5.506948 5.487108 5.416805 5.535733
## 4 5.506219 5.532807 5.537179 5.537178 5.506948 5.487108 5.416805 5.535733
## 5 5.595802 5.532807 5.623284 5.537178 5.613605 5.589655 5.574140 5.535733
## 6 5.595802 5.532807 5.623284 5.537178 5.613605 5.589655 5.574140 5.535733
## y.115 y.116 y.117 y.118 y.119 y.120 y.121 y.122
## 1 5.423623 5.541694 5.515342 5.535200 5.535836 5.418830 5.489185 5.540095
## 2 5.582237 5.614404 5.612191 5.535200 5.535836 5.575769 5.589099 5.540095
## 3 5.423623 5.541694 5.515342 5.535200 5.535836 5.418830 5.489185 5.540095
## 4 5.423623 5.541694 5.515342 5.535200 5.535836 5.418830 5.489185 5.540095
## 5 5.582237 5.614404 5.612191 5.535200 5.535836 5.575769 5.589099 5.540095
## 6 5.582237 5.614404 5.612191 5.535200 5.535836 5.575769 5.589099 5.540095
## y.123 y.124 y.125 y.126 y.127 y.128 y.129 y.130
## 1 5.517462 5.545816 5.412221 5.515359 5.425266 5.489404 5.509599 5.566150
## 2 5.611375 5.545816 5.590053 5.515359 5.582281 5.587766 5.611086 5.566150
## 3 5.517462 5.545816 5.412221 5.515359 5.425266 5.489404 5.509599 5.416422
## 4 5.517462 5.545816 5.412221 5.515359 5.425266 5.489404 5.509599 5.416422
## 5 5.611375 5.545816 5.590053 5.515359 5.582281 5.587766 5.611086 5.566150
## 6 5.611375 5.545816 5.590053 5.515359 5.582281 5.587766 5.611086 5.566150
## y.131 y.132 y.133 y.134 y.135 y.136 y.137 y.138
## 1 5.488369 5.467402 5.512339 5.455007 5.492202 5.459637 5.507670 5.420021
## 2 5.599230 5.576309 5.594607 5.578229 5.581169 5.579355 5.588433 5.589116
## 3 5.488369 5.467402 5.512339 5.455007 5.492202 5.459637 5.507670 5.420021
## 4 5.488369 5.467402 5.512339 5.455007 5.492202 5.459637 5.507670 5.420021
## 5 5.599230 5.576309 5.594607 5.578229 5.581169 5.579355 5.588433 5.589116
## 6 5.599230 5.576309 5.594607 5.578229 5.581169 5.579355 5.588433 5.589116
## y.139 y.140 y.141 y.142 y.143 y.144 y.145 y.146
## 1 5.417134 5.474799 5.534011 5.524157 5.546098 5.534172 5.506131 5.482230
## 2 5.577193 5.585907 5.534011 5.524157 5.546098 5.534172 5.588870 5.591275
## 3 5.417134 5.474799 5.534011 5.524157 5.546098 5.534172 5.506131 5.482230
## 4 5.417134 5.474799 5.534011 5.524157 5.546098 5.534172 5.506131 5.482230
## 5 5.577193 5.585907 5.534011 5.524157 5.546098 5.534172 5.588870 5.591275
## 6 5.577193 5.585907 5.534011 5.524157 5.546098 5.534172 5.588870 5.591275
## y.147 y.148 y.149 y.150 y.151 y.152 y.153 y.154
## 1 5.508418 5.535394 5.531539 5.516210 5.424957 5.539019 5.541125 5.423453
## 2 5.508418 5.535394 5.531539 5.516210 5.579407 5.539019 5.541125 5.583511
## 3 5.508418 5.535394 5.531539 5.516210 5.424957 5.539019 5.541125 5.423453
## 4 5.508418 5.535394 5.531539 5.516210 5.424957 5.539019 5.541125 5.423453
## 5 5.508418 5.535394 5.531539 5.516210 5.579407 5.539019 5.541125 5.583511
## 6 5.508418 5.535394 5.531539 5.516210 5.579407 5.539019 5.541125 5.583511
## y.155 y.156 y.157 y.158 y.159 y.160 y.161 y.162
## 1 5.532995 5.483591 5.528017 5.483539 5.519586 5.534374 5.427006 5.467505
## 2 5.532995 5.599904 5.528017 5.591774 5.519586 5.534374 5.578182 5.579520
## 3 5.532995 5.483591 5.528017 5.483539 5.519586 5.534374 5.427006 5.467505
## 4 5.532995 5.483591 5.528017 5.483539 5.519586 5.534374 5.427006 5.467505
## 5 5.532995 5.599904 5.528017 5.591774 5.519586 5.534374 5.578182 5.579520
## 6 5.532995 5.599904 5.528017 5.591774 5.519586 5.534374 5.578182 5.579520
## y.163 y.164 y.165 y.166 y.167 y.168 y.169 y.170
## 1 5.468895 5.485674 5.542385 5.535081 5.554859 5.414541 5.415862 5.485898
## 2 5.586162 5.605892 5.542385 5.535081 5.554859 5.582213 5.580096 5.588619
## 3 5.468895 5.485674 5.542385 5.535081 5.554859 5.414541 5.415862 5.485898
## 4 5.468895 5.485674 5.542385 5.535081 5.554859 5.414541 5.415862 5.485898
## 5 5.586162 5.605892 5.542385 5.535081 5.554859 5.582213 5.580096 5.588619
## 6 5.586162 5.605892 5.542385 5.535081 5.554859 5.582213 5.580096 5.588619
## y.171 y.172 y.173 y.174 y.175 y.176 y.177 y.178
## 1 5.523244 5.519855 5.418982 5.534619 5.532754 5.549795 5.501383 5.488853
## 2 5.523244 5.519855 5.579163 5.534619 5.532754 5.549795 5.603655 5.585438
## 3 5.523244 5.519855 5.418982 5.534619 5.532754 5.549795 5.501383 5.488853
## 4 5.523244 5.519855 5.418982 5.534619 5.532754 5.549795 5.501383 5.488853
## 5 5.523244 5.519855 5.579163 5.534619 5.532754 5.549795 5.603655 5.585438
## 6 5.523244 5.519855 5.579163 5.534619 5.532754 5.549795 5.603655 5.585438
## y.179 y.180 y.181 y.182 y.183 y.184 y.185 y.186
## 1 5.547167 5.535195 5.468392 5.525500 5.420115 5.549078 5.495446 5.537746
## 2 5.547167 5.535195 5.580059 5.525500 5.590836 5.549078 5.586831 5.537746
## 3 5.547167 5.535195 5.468392 5.525500 5.420115 5.549078 5.495446 5.537746
## 4 5.547167 5.535195 5.468392 5.525500 5.420115 5.549078 5.495446 5.537746
## 5 5.547167 5.535195 5.580059 5.525500 5.590836 5.549078 5.586831 5.537746
## 6 5.547167 5.535195 5.580059 5.525500 5.590836 5.549078 5.586831 5.537746
## y.187 y.188 y.189 y.190 y.191 y.192 y.193 y.194
## 1 5.552061 5.532770 5.538103 5.492900 5.529299 5.538471 5.539225 5.547166
## 2 5.552061 5.532770 5.538103 5.599009 5.609372 5.538471 5.539225 5.547166
## 3 5.552061 5.532770 5.538103 5.492900 5.529299 5.538471 5.539225 5.547166
## 4 5.552061 5.532770 5.538103 5.492900 5.529299 5.538471 5.539225 5.547166
## 5 5.552061 5.532770 5.538103 5.599009 5.609372 5.538471 5.539225 5.547166
## 6 5.552061 5.532770 5.538103 5.599009 5.609372 5.538471 5.539225 5.547166
## y.195 y.196 y.197 y.198 y.199 y.200
## 1 5.421518 5.540621 5.487981 5.525663 5.415265 5.539568
## 2 5.582236 5.606974 5.600313 5.525663 5.585280 5.539568
## 3 5.421518 5.540621 5.487981 5.525663 5.415265 5.539568
## 4 5.421518 5.540621 5.487981 5.525663 5.415265 5.539568
## 5 5.582236 5.606974 5.600313 5.525663 5.585280 5.539568
## 6 5.582236 5.606974 5.600313 5.525663 5.585280 5.539568
# Estatísticas para o B valores preditos para alguns casos.
mean(as.vector(pred[1, "y"]))
## [1] 5.495819
var(as.vector(pred[1, "y"]))
## [1] 0.002245619
# O predito médio ("a sabedoria das multidões").
pred$ym <- rowMeans(pred$y)
# Predito contra observado.
# x11()
xyplot(pred$ym ~ ap$lpreco, aspect = "iso") +
layer(panel.abline(a = 0, b = 1))
# Qual o predito para o imóvel mediano?
new <- lapply(pred,
FUN = function(x) {
if (is.numeric(x)) {
median(x, na.rm = TRUE)
} else {
levels(x)[1]
}
})
new
## $quartos
## [1] 3
##
## $banheiros
## [1] 2
##
## $vagas
## [1] 2
##
## $suites
## [1] 1
##
## $bairro
## [1] "agua-verde"
##
## $larea
## [1] 2
##
## $y
## [1] 5.734044
##
## $ym
## [1] 5.750674
# Predito por cada árvore.
y <- sapply(fits, FUN = predict, newdata = new)
# Distribuição dos B valores preditos e valor médio.
plot(density(y))
rug(y)
m <- mean(y)
abline(v = m, col = 2)
m
## [1] 5.752909
library(ipred)
# help(package = "ipred", help_type = "html")
# Fazendo bagging.
bg <- bagging(lpreco ~ ., data = ap, nbagg = 200, coob = TRUE)
bg
##
## Bagging regression trees with 200 bootstrap replications
##
## Call: bagging.data.frame(formula = lpreco ~ ., data = ap, nbagg = 200,
## coob = TRUE)
##
## Out-of-bag estimate of root mean squared error: 0.1048
# Predito contra observado.
xyplot(predict(bg, newdata = ap) ~ ap$lpreco, aspect = "iso")
#-----------------------------------------------------------------------
# Prototipando.
xvars <- names(teca)[1:20]
nv <- floor(length(xvars)/3)
# Seleciona variáveis preditoras.
v <- sample(xvars, size = nv, replace = FALSE)
v
## [1] "p" "al" "n" "Ur" "ca" "acc"
# Seleciona registros.
n <- nrow(teca)
i <- sample(1:n, size = n, replace = TRUE)
# Ajusta o modelo.
m0 <- rpart(prod ~ .,
data = teca[i, c(v, "prod")],
control = list(minsplit = 3,
cp = 0.001))
m0
## n= 48
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 48 52391.7900 79.30677
## 2) ca< 3.21 12 3050.7400 33.67443 *
## 3) ca>=3.21 36 16024.0800 94.51754
## 6) ca< 8.58 27 8981.3940 87.90780
## 12) n>=2.316487 19 2654.9530 81.34304
## 24) n< 2.700025 9 630.5857 76.51017 *
## 25) n>=2.700025 10 1624.9690 85.69262 *
## 13) n< 2.316487 8 3562.9050 103.49910 *
## 7) ca>=8.58 9 2324.3050 114.34680 *
# Visualiza.
rpart.plot(m0)
# Replicar.
set.seed(302010)
B <- 1000
rf <- replicate(B,
simplify = FALSE,
expr = {
v <- sample(xvars, size = nv, replace = FALSE)
i <- sample(1:n, size = n, replace = TRUE)
m0 <- rpart(prod ~ .,
data = teca[i, c(v, "prod")],
control = list(minsplit = 3,
cp = 0.01))
return(m0)
})
# ATTENTION: na árvore de regressão, o sorteio das variáveis é feito
# após cada split e não uma única vez como o que está neste código.
# Obtenção dos preditos.
y_rf <- sapply(rf, FUN = predict, newdata = teca)
head(y_rf[, 1:6])
## [,1] [,2] [,3] [,4] [,5] [,6]
## 1 120.42047 86.03186 148.18260 88.03894 125.74756 89.45850
## 2 27.36223 45.04491 42.23671 33.10333 31.73443 36.10144
## 3 120.42047 132.64916 120.09817 88.03894 60.18041 126.74866
## 4 63.93345 97.72206 92.24409 102.20512 99.91257 89.45850
## 5 63.93345 45.04491 42.23671 49.02340 60.18041 71.36626
## 6 27.36223 45.04491 42.23671 33.10333 31.73443 36.10144
# Cálculo da média.
ym <- rowMeans(y_rf)
xyplot(ym ~ teca$prod,
aspect = "iso",
type = c("p", "smooth")) +
layer(panel.abline(a = 0, b = 1))
# Correlação entre observado e predito.
cor(ym, teca$prod)
## [1] 0.8953036
#-----------------------------------------------------------------------
library(randomForest)
rf <- randomForest(prod ~ .,
data = teca,
ntree = 3,
mtry = 2,
keep.inbag = TRUE,
keep.forest = TRUE)
rf
##
## Call:
## randomForest(formula = prod ~ ., data = teca, ntree = 3, mtry = 2, keep.inbag = TRUE, keep.forest = TRUE)
## Type of random forest: regression
## Number of trees: 3
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 1161.826
## % Var explained: 11.81
1 - sum(rf$oob.times == 0)/length(rf$oob.times)
## [1] 0.7083333
str(rf)
## List of 18
## $ call : language randomForest(formula = prod ~ ., data = teca, ntree = 3, mtry = 2, keep.inbag = TRUE, keep.forest = TRUE)
## $ type : chr "regression"
## $ predicted : Named num [1:48] 92.1 28.9 84.3 NA 59.1 ...
## ..- attr(*, "names")= chr [1:48] "1" "2" "3" "4" ...
## $ mse : num [1:3] 1166 1270 1162
## $ rsq : num [1:3] 0.115 0.036 0.118
## $ oob.times : int [1:48] 1 1 1 0 1 1 0 0 1 0 ...
## $ importance : num [1:20, 1] 866.7 55.2 2243.9 0 368.8 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:20] "Ur" "Us" "alp" "n" ...
## .. ..$ : chr "IncNodePurity"
## $ importanceSD : NULL
## $ localImportance: NULL
## $ proximity : NULL
## $ ntree : num 3
## $ mtry : num 2
## $ forest :List of 11
## ..$ ndbigtree : int [1:3] 31 33 35
## ..$ nodestatus : int [1:35, 1:3] -3 -3 -3 -3 -3 -1 -3 -1 -1 -1 ...
## ..$ leftDaughter : int [1:35, 1:3] 2 4 6 8 10 0 12 0 0 0 ...
## ..$ rightDaughter: int [1:35, 1:3] 3 5 7 9 11 0 13 0 0 0 ...
## ..$ nodepred : num [1:35, 1:3] 88.6 41.5 108 48.2 36.4 ...
## ..$ bestvar : int [1:35, 1:3] 11 5 20 6 5 0 8 0 0 0 ...
## ..$ xbestsplit : num [1:35, 1:3] 3.145 0.604 438.77 0.477 0.667 ...
## ..$ ncat : Named int [1:20] 1 1 1 1 1 1 1 1 1 1 ...
## .. ..- attr(*, "names")= chr [1:20] "Ur" "Us" "alp" "n" ...
## ..$ nrnodes : int 35
## ..$ ntree : num 3
## ..$ xlevels :List of 20
## .. ..$ Ur : num 0
## .. ..$ Us : num 0
## .. ..$ alp: num 0
## .. ..$ n : num 0
## .. ..$ I : num 0
## .. ..$ Ui : num 0
## .. ..$ S : num 0
## .. ..$ ph : num 0
## .. ..$ p : num 0
## .. ..$ k : num 0
## .. ..$ ca : num 0
## .. ..$ mg : num 0
## .. ..$ al : num 0
## .. ..$ ctc: num 0
## .. ..$ sat: num 0
## .. ..$ mo : num 0
## .. ..$ arg: num 0
## .. ..$ are: num 0
## .. ..$ cas: num 0
## .. ..$ acc: num 0
## $ coefs : NULL
## $ y : Named num [1:48] 68.2 24.3 125.3 102.7 54.6 ...
## ..- attr(*, "names")= chr [1:48] "1" "2" "3" "4" ...
## $ test : NULL
## $ inbag : int [1:48, 1:3] 2 1 2 1 1 1 1 1 2 1 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:48] "1" "2" "3" "4" ...
## .. ..$ : NULL
## $ terms :Classes 'terms', 'formula' language prod ~ Ur + Us + alp + n + I + Ui + S + ph + p + k + ca + mg + al + ctc + sat + mo + arg + are + cas + acc
## .. ..- attr(*, "variables")= language list(prod, Ur, Us, alp, n, I, Ui, S, ph, p, k, ca, mg, al, ctc, sat, mo, arg, are, cas, acc)
## .. ..- attr(*, "factors")= int [1:21, 1:20] 0 1 0 0 0 0 0 0 0 0 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:21] "prod" "Ur" "Us" "alp" ...
## .. .. .. ..$ : chr [1:20] "Ur" "Us" "alp" "n" ...
## .. ..- attr(*, "term.labels")= chr [1:20] "Ur" "Us" "alp" "n" ...
## .. ..- attr(*, "order")= int [1:20] 1 1 1 1 1 1 1 1 1 1 ...
## .. ..- attr(*, "intercept")= num 0
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(prod, Ur, Us, alp, n, I, Ui, S, ph, p, k, ca, mg, al, ctc, sat, mo, arg, are, cas, acc)
## .. ..- attr(*, "dataClasses")= Named chr [1:21] "numeric" "numeric" "numeric" "numeric" ...
## .. .. ..- attr(*, "names")= chr [1:21] "prod" "Ur" "Us" "alp" ...
## - attr(*, "class")= chr [1:2] "randomForest.formula" "randomForest"
# Número de vezes que cada variável foi usada em cada bag.
head(rf$inbag)
## [,1] [,2] [,3]
## 1 2 0 1
## 2 1 0 1
## 3 2 0 2
## 4 1 1 3
## 5 1 1 0
## 6 1 1 0
# Valores preditos.
sort(unique(predict(rf)))
## [1] 25.48482 28.89996 44.40381 56.02661 59.11806 62.78211 63.90061
## [8] 68.94633 71.36430 71.57095 79.79300 84.33689 91.26764 92.09106
## [15] 98.10957 99.46530 100.46795 105.44807 108.87186 121.28472 121.47778
## [22] 123.38324 130.45181 136.58047
# Inspecionando uma das árvores da floresta.
t <- 1
one_tree <- getTree(rf, k = t, labelVar = TRUE)
nrow(one_tree)
## [1] 31
rf$forest$ndbigtree[t]
## [1] 31
one_tree
## left daughter right daughter split var split point status prediction
## 1 2 3 ca 3.1450000 -3 88.61266
## 2 4 5 I 0.6043793 -3 41.46031
## 3 6 7 acc 438.7700000 -3 108.02833
## 4 8 9 Ui 0.4772272 -3 48.19479
## 5 10 11 I 0.6669049 -3 36.40944
## 6 0 0 <NA> 0.0000000 -1 170.76465
## 7 12 13 ph 6.8500000 -3 99.66349
## 8 0 0 <NA> 0.0000000 -1 29.94343
## 9 0 0 <NA> 0.0000000 -1 66.44616
## 10 0 0 <NA> 0.0000000 -1 22.58625
## 11 14 15 Ui 0.4738361 -3 41.01717
## 12 16 17 are 711.5000000 -3 90.98453
## 13 18 19 p 52.9000000 -3 114.65443
## 14 0 0 <NA> 0.0000000 -1 47.97021
## 15 0 0 <NA> 0.0000000 -1 37.54065
## 16 20 21 p 2.5350000 -3 94.54145
## 17 22 23 alp -0.7142416 -3 83.27785
## 18 24 25 Ur 0.1850338 -3 117.94994
## 19 0 0 <NA> 0.0000000 -1 81.69934
## 20 0 0 <NA> 0.0000000 -1 109.64509
## 21 26 27 Us 0.7230838 -3 91.79534
## 22 0 0 <NA> 0.0000000 -1 68.22631
## 23 0 0 <NA> 0.0000000 -1 90.80361
## 24 0 0 <NA> 0.0000000 -1 105.44807
## 25 0 0 <NA> 0.0000000 -1 130.45181
## 26 28 29 cas 32.0500000 -3 93.02273
## 27 0 0 <NA> 0.0000000 -1 79.52139
## 28 30 31 ph 6.4500000 -3 88.72769
## 29 0 0 <NA> 0.0000000 -1 99.46530
## 30 0 0 <NA> 0.0000000 -1 70.95099
## 31 0 0 <NA> 0.0000000 -1 92.28303
# Outra forma de acessar as variáveis usadas.
rf$forest$bestvar[, t]
## [1] 11 5 20 6 5 0 8 0 0 0 6 18 9 0 0 9 3 1 0 0 2 0 0
## [24] 0 0 19 0 8 0 0 0 0 0 0 0
names(rf$forest$xlevels)[rf$forest$bestvar[, t]]
## [1] "ca" "I" "acc" "Ui" "I" "ph" "Ui" "are" "p" "p" "alp"
## [12] "Ur" "Us" "cas" "ph"
#-----------------------------------------------------------------------
# help(package = "randomForest", help_type = "html")
rf <- randomForest(prod ~ .,
data = teca,
ntree = B,
mtry = nv)
rf
##
## Call:
## randomForest(formula = prod ~ ., data = teca, ntree = B, mtry = nv)
## Type of random forest: regression
## Number of trees: 1000
## No. of variables tried at each split: 6
##
## Mean of squared residuals: 632.4727
## % Var explained: 51.99
# Valores preditos.
yp <- predict(rf)
# Correlação entre observado e predito.
cor(yp, teca$prod)
## [1] 0.7210519
xyplot(ym + yp ~ teca$prod,
aspect = "iso",
auto.key = TRUE,
type = c("p", "smooth")) +
layer(panel.abline(a = 0, b = 1))
# ATTENTION: a randomForest() criou árvores mais profundas que o código
# didático feito algumas linhas acima. Além do mais, as `m` variáveis
# são sorteadas a cada split e não uma vez apenas.
# o <- order(yp)
# xyplot(ym[o] + yp[o] ~ seq_along(ym), auto.key = TRUE)
Para construir o gráfico de uma árvore, leia esse post: https://stats.stackexchange.com/questions/41443/how-to-actually-plot-a-sample-tree-from-randomforestgettree.
#-----------------------------------------------------------------------
# Dados de imóveis.
nv <- floor(sqrt(ncol(ap) - 1))
nv
## [1] 2
ap2 <- na.omit(ap)
rf <- randomForest(lpreco ~ .,
data = ap2,
ntree = 100,
mtry = nv)
rf
##
## Call:
## randomForest(formula = lpreco ~ ., data = ap2, ntree = 100, mtry = nv)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 0.007040021
## % Var explained: 90.87
# Valores preditos.
yp <- predict(rf)
cor(yp, ap2$lpreco)
## [1] 0.9533648
# Correlação entre observado e predito.
xyplot(yp ~ ap2$lpreco,
aspect = "iso",
auto.key = TRUE,
type = c("p", "smooth")) +
layer(panel.abline(a = 0, b = 1))
importance(rf, type = 2)
## IncNodePurity
## quartos 7.321537
## banheiros 12.829150
## vagas 50.273498
## suites 29.342710
## bairro 6.591223
## larea 86.955501
im <- importance(rf, type = 2)
100 * im/sum(im)
## IncNodePurity
## quartos 3.787388
## banheiros 6.636444
## vagas 26.006185
## suites 15.178812
## bairro 3.409601
## larea 44.981570
varImpPlot(rf)
Veja as ilustrações aqui para entender: https://medium.com/mlreview/gradient-boosting-from-scratch-1e317ae4587d.
Visite também gradiente descendente boosting.
library(gbm)
# help(package = "gbm", help_type = "html")
rb <- gbm(lpreco ~ .,
data = ap,
distribution = "gaussian",
n.trees = 100,
shrinkage = 0.05,
interaction.depth = 1,
train.fraction = 0.7,
n.minobsinnode = 10,
cv.folds = 3,
keep.data = TRUE,
verbose = FALSE,
n.cores = 1)
rb
## gbm(formula = lpreco ~ ., distribution = "gaussian", data = ap,
## n.trees = 100, interaction.depth = 1, n.minobsinnode = 10,
## shrinkage = 0.05, train.fraction = 0.7, cv.folds = 3, keep.data = TRUE,
## verbose = FALSE, n.cores = 1)
## A gradient boosted model with gaussian loss function.
## 100 iterations were performed.
## The best cross-validation iteration was 100.
## The best test-set iteration was 100.
## There were 6 predictors of which 5 had non-zero influence.
summary(rb)
## var rel.inf
## larea larea 54.0322474
## vagas vagas 27.3236736
## suites suites 17.0719486
## bairro bairro 1.2286939
## banheiros banheiros 0.3434365
## quartos quartos 0.0000000
# Valores preditos.
yp <- predict(rb)
## Using 100 trees...
# Correlação entre observado e predito.
cor(yp, ap$lpreco)
## [1] 0.930845
xyplot(yp ~ ap$lpreco,
aspect = "iso",
auto.key = TRUE,
type = c("p", "smooth")) +
latticeExtra::layer(panel.abline(a = 0, b = 1))
Próxima aula.
Machine Learning | Prof. Eduardo V. Ferreira & Prof. Walmes M. Zeviani |