Ajuste de modelos de regressão linear

Walmes Zeviani


Definições da sessão

##-----------------------------------------------------------------------------
## Definições da sessão, pacotes a serem usados.

pkg <- c("lattice", "latticeExtra", "reshape", "car", "alr3",
         "plyr", "wzRfun")
sapply(pkg, library, character.only=TRUE, logical.return=TRUE)
##      lattice latticeExtra      reshape          car         alr3         plyr 
##         TRUE         TRUE         TRUE         TRUE         TRUE         TRUE 
##       wzRfun 
##         TRUE

source("lattice_setup.R")

##-----------------------------------------------------------------------------
## Informações sobre as versões dos pacotes.

sessionInfo()
## R version 3.1.1 (2014-07-10)
## Platform: i686-pc-linux-gnu (32-bit)
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=pt_BR.UTF-8       
##  [4] LC_COLLATE=en_US.UTF-8     LC_MONETARY=pt_BR.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=pt_BR.UTF-8       LC_NAME=C                  LC_ADDRESS=C              
## [10] LC_TELEPHONE=C             LC_MEASUREMENT=pt_BR.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  base     
## 
## other attached packages:
## [1] wzRfun_0.1          plyr_1.8.1          alr3_2.0.5          car_2.0-20         
## [5] reshape_0.8.5       latticeExtra_0.6-26 RColorBrewer_1.0-5  lattice_0.20-29    
## [9] knitr_1.5          
## 
## loaded via a namespace (and not attached):
## [1] evaluate_0.5.5 formatR_0.10   grid_3.1.1     MASS_7.3-33    methods_3.1.1 
## [6] nnet_7.3-8     Rcpp_0.11.1    stringr_0.6.2  tools_3.1.1

## obs: Para instalar um pacote faça:
## install.packages("nome_do_pacote", dependencies=TRUE)

Modelo para relação altura e diâmetro à altura do peito

##-----------------------------------------------------------------------------
## Dados de inventário florestal.

dap <- read.table("http://www.leg.ufpr.br/~walmes/data/dap.txt",
                  header=TRUE, sep="\t")
str(dap)
## 'data.frame':    991 obs. of  2 variables:
##  $ DAP: num  4.87 7.38 5.95 5.73 6.4 ...
##  $ HT : num  9.5 9.8 13 13.5 13.5 13.5 13.5 14.3 14.8 14.8 ...

## Renomeia.
names(dap) <- c("d","h")

##-----------------------------------------------------------------------------
## Visualização.

xyplot(h~d, data=dap, type=c("p","smooth"))

plot of chunk unnamed-chunk-3


##-----------------------------------------------------------------------------
## Frequência de obsevações ausentes.

sum(!is.na(dap$d))
## [1] 991
sum(!is.na(dap$h))
## [1] 223

head(dap)
##       d    h
## 1 4.870  9.5
## 2 7.385  9.8
## 3 5.952 13.0
## 4 5.730 13.5
## 5 6.398 13.5
## 6 6.748 13.5
tail(dap)
##          d  h
## 986  8.785 NA
## 987 19.481 NA
## 988 19.226 NA
## 989  9.040 NA
## 990 13.560 NA
## 991 10.154 NA

## Foram medidas 991 árvores, mas em apenas 223 que se tem h e d. Nas
## restantes só o d. Quer-se estimar a h a partir dos pares conhecidos.

##-----------------------------------------------------------------------------
## Criando novas variáveis para explorar um modelo com boa previsão de
## valores de h a partir de valores de d. Usar procedimento stepwise.

dap <- transform(dap,
                 d2=d^2,     ## quadrado
                 d3=d^3,     ## cubo
                 dr=sqrt(d), ## raíz
                 dl=log(d),  ## ln
                 di=1/d,     ## recíproco
                 di2=1/d^2)  ## recíproco ao quadrado
str(dap)
## 'data.frame':    991 obs. of  8 variables:
##  $ d  : num  4.87 7.38 5.95 5.73 6.4 ...
##  $ h  : num  9.5 9.8 13 13.5 13.5 13.5 13.5 14.3 14.8 14.8 ...
##  $ d2 : num  23.7 54.5 35.4 32.8 40.9 ...
##  $ d3 : num  116 403 211 188 262 ...
##  $ dr : num  2.21 2.72 2.44 2.39 2.53 ...
##  $ dl : num  1.58 2 1.78 1.75 1.86 ...
##  $ di : num  0.205 0.135 0.168 0.175 0.156 ...
##  $ di2: num  0.0422 0.0183 0.0282 0.0305 0.0244 ...

##-----------------------------------------------------------------------------
## As relações marginais.

## splom(dap)

par(mfrow=c(2,4))
for(i in c(1,3:8)){
    plot(dap[,"h"]~dap[,i], main=names(dap)[i])
}
layout(1)

plot of chunk unnamed-chunk-3


##-----------------------------------------------------------------------------
## Ordena os dados e deixa apenas os registros completos (linha cheia).

## Ordenar para evitar efeito espaguete quando plotar.
dap <- dap[order(dap$d),]

## Nova base que contém d e h observados.
dapcc <- dap[complete.cases(dap),]

## reseta o nome das linhas
rownames(dapcc) <- NULL
head(dapcc)
##       d    h    d2    d3    dr    dl     di     di2
## 1 4.870  9.5 23.72 115.5 2.207 1.583 0.2053 0.04216
## 2 5.730 13.5 32.83 188.1 2.394 1.746 0.1745 0.03046
## 3 5.952 13.0 35.43 210.9 2.440 1.784 0.1680 0.02822
## 4 6.334 15.5 40.12 254.2 2.517 1.846 0.1579 0.02492
## 5 6.398 13.5 40.93 261.9 2.529 1.856 0.1563 0.02443
## 6 6.748 13.5 45.54 307.3 2.598 1.909 0.1482 0.02196
## tail(dapcc)
## str(dapcc)

##-----------------------------------------------------------------------------
## Ajuste de modelos.

## Linear simples.
m0 <- lm(h~d, data=dapcc)
summary(m0)
## 
## Call:
## lm(formula = h ~ d, data = dapcc)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.454 -1.268  0.048  1.333 11.420 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  11.8998     0.5376    22.1   <2e-16 ***
## d             0.7424     0.0297    25.0   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.3 on 221 degrees of freedom
## Multiple R-squared:  0.739,  Adjusted R-squared:  0.738 
## F-statistic:  625 on 1 and 221 DF,  p-value: <2e-16

## Resultado.
layout(matrix(c(1,1,2,3,4,5),2,3))
plot(h~d, dapcc)
lines(fitted(m0)~d, dapcc, col=2)
plot(m0)

plot of chunk unnamed-chunk-3

layout(1)

## Apresenta falta de ajuste.

## Quadrático.
m1 <- lm(h~d+d2, data=dapcc)
summary(m1)
## 
## Call:
## lm(formula = h ~ d + d2, data = dapcc)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.311 -1.128  0.062  1.210 11.086 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.48684    1.31133    3.42  0.00074 ***
## d            1.76152    0.16902   10.42  < 2e-16 ***
## d2          -0.03130    0.00512   -6.11  4.4e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.14 on 220 degrees of freedom
## Multiple R-squared:  0.777,  Adjusted R-squared:  0.775 
## F-statistic:  383 on 2 and 220 DF,  p-value: <2e-16

## Resultado.
layout(matrix(c(1,1,2,3,4,5),2,3))
plot(h~d, dapcc)
lines(fitted(m1)~d, dapcc, col=2)
plot(m1)

plot of chunk unnamed-chunk-3

layout(1)

## Cúbico.
m2 <- lm(h~d+d2+d3, data=dapcc)
summary(m2)
## 
## Call:
## lm(formula = h ~ d + d2 + d3, data = dapcc)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.634 -1.118  0.041  1.189 10.613 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.044684   3.004680   -2.01  0.04547 *  
## d            4.093204   0.624615    6.55  4.0e-10 ***
## d2          -0.186500   0.040424   -4.61  6.7e-06 ***
## d3           0.003194   0.000826    3.87  0.00014 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.07 on 219 degrees of freedom
## Multiple R-squared:  0.791,  Adjusted R-squared:  0.788 
## F-statistic:  276 on 3 and 219 DF,  p-value: <2e-16
anova(m2)
## Analysis of Variance Table
## 
## Response: h
##            Df Sum Sq Mean Sq F value  Pr(>F)    
## d           1   3320    3320   774.1 < 2e-16 ***
## d2          1    170     170    39.7 1.6e-09 ***
## d3          1     64      64    15.0 0.00014 ***
## Residuals 219    939       4                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## Resultado.
layout(matrix(c(1,1,2,3,4,5),2,3))
plot(h~d, dapcc)
lines(fitted(m2)~d, dapcc, col=2)
plot(m2)

plot of chunk unnamed-chunk-3

layout(1)

##-----------------------------------------------------------------------------
## Cria a lista com as fórmulas dos modelos, 7 no total.

formulas <- list(linear=h~d,
                 quadratico=h~d+d2,
                 cubico=h~d+d2+d3,
                 reciproco=h~d+di,
                 reciproco2=h~d+di2,
                 raiz=h~d+dr,
                 log=h~d+dl)
formulas
## $linear
## h ~ d
## 
## $quadratico
## h ~ d + d2
## 
## $cubico
## h ~ d + d2 + d3
## 
## $reciproco
## h ~ d + di
## 
## $reciproco2
## h ~ d + di2
## 
## $raiz
## h ~ d + dr
## 
## $log
## h ~ d + dl

##-----------------------------------------------------------------------------
## Com a lapply(), o ajuste será feito usando cada uma das fórmulas da
## lista.

## Todos os ajustes.
ajustes <- lapply(formulas, lm, data=dapcc)
names(ajustes)
## [1] "linear"     "quadratico" "cubico"     "reciproco"  "reciproco2" "raiz"      
## [7] "log"

## Todas as tabelas com estimativas de medidas de ajuste.
lapply(ajustes, summary)
## $linear
## 
## Call:
## FUN(formula = X[[1L]], data = ..1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.454 -1.268  0.048  1.333 11.420 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  11.8998     0.5376    22.1   <2e-16 ***
## d             0.7424     0.0297    25.0   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.3 on 221 degrees of freedom
## Multiple R-squared:  0.739,  Adjusted R-squared:  0.738 
## F-statistic:  625 on 1 and 221 DF,  p-value: <2e-16
## 
## 
## $quadratico
## 
## Call:
## FUN(formula = X[[2L]], data = ..1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.311 -1.128  0.062  1.210 11.086 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.48684    1.31133    3.42  0.00074 ***
## d            1.76152    0.16902   10.42  < 2e-16 ***
## d2          -0.03130    0.00512   -6.11  4.4e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.14 on 220 degrees of freedom
## Multiple R-squared:  0.777,  Adjusted R-squared:  0.775 
## F-statistic:  383 on 2 and 220 DF,  p-value: <2e-16
## 
## 
## $cubico
## 
## Call:
## FUN(formula = X[[3L]], data = ..1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.634 -1.118  0.041  1.189 10.613 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.044684   3.004680   -2.01  0.04547 *  
## d            4.093204   0.624615    6.55  4.0e-10 ***
## d2          -0.186500   0.040424   -4.61  6.7e-06 ***
## d3           0.003194   0.000826    3.87  0.00014 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.07 on 219 degrees of freedom
## Multiple R-squared:  0.791,  Adjusted R-squared:  0.788 
## F-statistic:  276 on 3 and 219 DF,  p-value: <2e-16
## 
## 
## $reciproco
## 
## Call:
## FUN(formula = X[[4L]], data = ..1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.552 -1.138  0.007  1.226 10.648 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  24.8607     1.9183   12.96  < 2e-16 ***
## d             0.3162     0.0667    4.74  3.8e-06 ***
## di          -85.0770    12.1784   -6.99  3.3e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.09 on 220 degrees of freedom
## Multiple R-squared:  0.786,  Adjusted R-squared:  0.784 
## F-statistic:  405 on 2 and 220 DF,  p-value: <2e-16
## 
## 
## $reciproco2
## 
## Call:
## FUN(formula = X[[5L]], data = ..1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.722 -1.088  0.059  1.232 10.630 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   17.6752     0.9875   17.90  < 2e-16 ***
## d              0.4951     0.0456   10.85  < 2e-16 ***
## di2         -291.6728    43.2847   -6.74  1.4e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.1 on 220 degrees of freedom
## Multiple R-squared:  0.783,  Adjusted R-squared:  0.782 
## F-statistic:  398 on 2 and 220 DF,  p-value: <2e-16
## 
## 
## $raiz
## 
## Call:
## FUN(formula = X[[6L]], data = ..1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.360 -1.131  0.044  1.167 10.831 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -17.075      4.283   -3.99  9.1e-05 ***
## d             -1.216      0.289   -4.21  3.7e-05 ***
## dr            15.313      2.249    6.81  9.2e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.1 on 220 degrees of freedom
## Multiple R-squared:  0.784,  Adjusted R-squared:  0.782 
## F-statistic:  400 on 2 and 220 DF,  p-value: <2e-16
## 
## 
## $log
## 
## Call:
## FUN(formula = X[[7L]], data = ..1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.410 -1.115  0.008  1.190 10.754 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -11.138      3.355   -3.32   0.0011 ** 
## d             -0.203      0.139   -1.46   0.1454    
## dl            14.095      2.031    6.94  4.3e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.09 on 220 degrees of freedom
## Multiple R-squared:  0.786,  Adjusted R-squared:  0.784 
## F-statistic:  403 on 2 and 220 DF,  p-value: <2e-16

## Todos os gráficos de resíduos vs ajustados. 
par(mfrow=c(2,4))
lapply(ajustes, plot, which=1)
## $linear
## NULL
## 
## $quadratico
## NULL
## 
## $cubico
## NULL
## 
## $reciproco
## NULL
## 
## $reciproco2
## NULL
## 
## $raiz
## NULL
## 
## $log
## NULL
layout(1)

plot of chunk unnamed-chunk-3


## Extraindo o valor de R² ajustado de cada modelo.
r2 <- sapply(ajustes, function(a){
    summary(a)$adj.r.squared
})

## Valores do R².
cbind(r2)
##                r2
## linear     0.7376
## quadratico 0.7747
## cubico     0.7881
## reciproco  0.7843
## reciproco2 0.7815
## raiz       0.7823
## log        0.7838

## Gráficos.
## barplot(r2, ylim=c(0,1))
barplot(r2, ylim=c(0.7,0.8), xpd=FALSE)
box(bty="l")

plot of chunk unnamed-chunk-3


##-----------------------------------------------------------------------------
## Correndo um stepwise.

## Modelo com todas as variáveis.
m7 <- lm(h~., data=dapcc)
summary(m7)
## 
## Call:
## lm(formula = h ~ ., data = dapcc)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.148 -1.107  0.062  1.123 11.075 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  9.41e+03   7.91e+03    1.19    0.236  
## d           -8.32e+03   3.93e+03   -2.12    0.035 *
## d2           5.55e+01   2.52e+01    2.20    0.029 *
## d3          -3.35e-01   1.48e-01   -2.26    0.025 *
## dr           9.81e+04   4.74e+04    2.07    0.040 *
## dl          -9.76e+04   4.85e+04   -2.01    0.045 *
## di          -1.79e+05   9.43e+04   -1.90    0.059 .
## di2          1.11e+05   6.25e+04    1.78    0.077 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.05 on 215 degrees of freedom
## Multiple R-squared:  0.799,  Adjusted R-squared:  0.793 
## F-statistic:  122 on 7 and 215 DF,  p-value: <2e-16

## obs: o ponto "." indica "todas as variáveis restante do arquivo".

## Resultado.
layout(matrix(c(1,1,2,3,4,5),2,3))
plot(h~d, dapcc); lines(fitted(m7)~d, dapcc, col=2)
plot(m7)
## Warning: NaNs produced
## Warning: NaNs produced

plot of chunk unnamed-chunk-3

layout(1)

##-----------------------------------------------------------------------------
## Seleção de modelos/variáveis (both, forward, backward).

## Por padrão é o AIC: ll-2*p. Peso para os parâmetros é 2.
step(m7, direction="both")
## Start:  AIC=327.9
## h ~ d + d2 + d3 + dr + dl + di + di2
## 
##        Df Sum of Sq RSS AIC
## <none>              903 328
## - di2   1      13.3 916 329
## - di    1      15.1 918 330
## - dl    1      17.0 920 330
## - dr    1      18.0 921 330
## - d     1      18.9 922 330
## - d2    1      20.4 923 331
## - d3    1      21.5 924 331
## 
## Call:
## lm(formula = h ~ d + d2 + d3 + dr + dl + di + di2, data = dapcc)
## 
## Coefficients:
## (Intercept)            d           d2           d3           dr           dl  
##    9.41e+03    -8.32e+03     5.55e+01    -3.35e-01     9.81e+04    -9.76e+04  
##          di          di2  
##   -1.79e+05     1.11e+05

## logLik(m7)-2*length(coef(m7))
## Assim é o BIC: ll-log(n)*p. Peso para os parâmetros é log(n).
step(m7, direction="both", k=log(nrow(dapcc)))
## Start:  AIC=355.1
## h ~ d + d2 + d3 + dr + dl + di + di2
## 
##        Df Sum of Sq RSS AIC
## - di2   1      13.3 916 353
## - di    1      15.1 918 353
## - dl    1      17.0 920 354
## - dr    1      18.0 921 354
## - d     1      18.9 922 354
## - d2    1      20.4 923 355
## - d3    1      21.5 924 355
## <none>              903 355
## 
## Step:  AIC=353
## h ~ d + d2 + d3 + dr + dl + di
## 
##        Df Sum of Sq RSS AIC
## - d3    1      16.5 933 352
## - d2    1      18.2 934 352
## - d     1      19.4 936 352
## - dr    1      19.8 936 352
## - di    1      19.8 936 352
## - dl    1      19.9 936 352
## <none>              916 353
## + di2   1      13.3 903 355
## 
## Step:  AIC=351.5
## h ~ d + d2 + dr + dl + di
## 
##        Df Sum of Sq RSS AIC
## - di    1      3.73 936 347
## - dl    1      5.22 938 347
## - dr    1      6.27 939 348
## - d     1      7.31 940 348
## - d2    1      9.48 942 348
## <none>              933 352
## + d3    1     16.46 916 353
## + di2   1      8.26 924 355
## 
## Step:  AIC=347
## h ~ d + d2 + dr + dl
## 
##        Df Sum of Sq RSS AIC
## - dl    1     13.63 950 345
## - dr    1     16.95 953 346
## - d     1     18.54 955 346
## - d2    1     20.80 957 346
## <none>              936 347
## + di2   1      4.56 932 351
## + di    1      3.73 933 352
## + d3    1      0.43 936 352
## 
## Step:  AIC=344.8
## h ~ d + d2 + dr
## 
##        Df Sum of Sq  RSS AIC
## - d2    1      19.4  969 344
## <none>               950 345
## - d     1      30.8  981 347
## + d3    1      13.9  936 347
## + dl    1      13.6  936 347
## + di    1      12.1  938 347
## + di2   1      10.3  940 348
## - dr    1      53.3 1003 352
## 
## Step:  AIC=343.9
## h ~ d + dr
## 
##        Df Sum of Sq  RSS AIC
## <none>               969 344
## + d3    1      22.4  947 344
## + d2    1      19.4  950 345
## + dl    1      12.2  957 346
## + di    1       9.0  960 347
## + di2   1       6.4  963 348
## - d     1      78.1 1047 356
## - dr    1     204.3 1174 381
## 
## Call:
## lm(formula = h ~ d + dr, data = dapcc)
## 
## Coefficients:
## (Intercept)            d           dr  
##      -17.08        -1.22        15.31

## Uso um peso arbitrário.
step(m7, direction="both", k=12)
## Start:  AIC=407.9
## h ~ d + d2 + d3 + dr + dl + di + di2
## 
##        Df Sum of Sq RSS AIC
## - di2   1      13.3 916 399
## - di    1      15.1 918 400
## - dl    1      17.0 920 400
## - dr    1      18.0 921 400
## - d     1      18.9 922 400
## - d2    1      20.4 923 401
## - d3    1      21.5 924 401
## <none>              903 408
## 
## Step:  AIC=399.1
## h ~ d + d2 + d3 + dr + dl + di
## 
##        Df Sum of Sq RSS AIC
## - d3    1      16.5 933 391
## - d2    1      18.2 934 391
## - d     1      19.4 936 392
## - dr    1      19.8 936 392
## - di    1      19.8 936 392
## - dl    1      19.9 936 392
## <none>              916 399
## + di2   1      13.3 903 408
## 
## Step:  AIC=391.1
## h ~ d + d2 + dr + dl + di
## 
##        Df Sum of Sq RSS AIC
## - di    1      3.73 936 380
## - dl    1      5.22 938 380
## - dr    1      6.27 939 381
## - d     1      7.31 940 381
## - d2    1      9.48 942 381
## <none>              933 391
## + d3    1     16.46 916 399
## + di2   1      8.26 924 401
## 
## Step:  AIC=380
## h ~ d + d2 + dr + dl
## 
##        Df Sum of Sq RSS AIC
## - dl    1     13.63 950 371
## - dr    1     16.95 953 372
## - d     1     18.54 955 372
## - d2    1     20.80 957 373
## <none>              936 380
## + di2   1      4.56 932 391
## + di    1      3.73 933 391
## + d3    1      0.43 936 392
## 
## Step:  AIC=371.2
## h ~ d + d2 + dr
## 
##        Df Sum of Sq  RSS AIC
## - d2    1      19.4  969 364
## - d     1      30.8  981 366
## <none>               950 371
## - dr    1      53.3 1003 371
## + d3    1      13.9  936 380
## + dl    1      13.6  936 380
## + di    1      12.1  938 380
## + di2   1      10.3  940 381
## 
## Step:  AIC=363.7
## h ~ d + dr
## 
##        Df Sum of Sq  RSS AIC
## <none>               969 364
## - d     1      78.1 1047 369
## + d3    1      22.4  947 370
## + d2    1      19.4  950 371
## + dl    1      12.2  957 373
## + di    1       9.0  960 374
## + di2   1       6.4  963 374
## - dr    1     204.3 1174 394
## 
## Call:
## lm(formula = h ~ d + dr, data = dapcc)
## 
## Coefficients:
## (Intercept)            d           dr  
##      -17.08        -1.22        15.31

##-----------------------------------------------------------------------------
## Modelo final.

mfin <- step(m7, direction="both", k=log(nrow(dapcc)))
## Start:  AIC=355.1
## h ~ d + d2 + d3 + dr + dl + di + di2
## 
##        Df Sum of Sq RSS AIC
## - di2   1      13.3 916 353
## - di    1      15.1 918 353
## - dl    1      17.0 920 354
## - dr    1      18.0 921 354
## - d     1      18.9 922 354
## - d2    1      20.4 923 355
## - d3    1      21.5 924 355
## <none>              903 355
## 
## Step:  AIC=353
## h ~ d + d2 + d3 + dr + dl + di
## 
##        Df Sum of Sq RSS AIC
## - d3    1      16.5 933 352
## - d2    1      18.2 934 352
## - d     1      19.4 936 352
## - dr    1      19.8 936 352
## - di    1      19.8 936 352
## - dl    1      19.9 936 352
## <none>              916 353
## + di2   1      13.3 903 355
## 
## Step:  AIC=351.5
## h ~ d + d2 + dr + dl + di
## 
##        Df Sum of Sq RSS AIC
## - di    1      3.73 936 347
## - dl    1      5.22 938 347
## - dr    1      6.27 939 348
## - d     1      7.31 940 348
## - d2    1      9.48 942 348
## <none>              933 352
## + d3    1     16.46 916 353
## + di2   1      8.26 924 355
## 
## Step:  AIC=347
## h ~ d + d2 + dr + dl
## 
##        Df Sum of Sq RSS AIC
## - dl    1     13.63 950 345
## - dr    1     16.95 953 346
## - d     1     18.54 955 346
## - d2    1     20.80 957 346
## <none>              936 347
## + di2   1      4.56 932 351
## + di    1      3.73 933 352
## + d3    1      0.43 936 352
## 
## Step:  AIC=344.8
## h ~ d + d2 + dr
## 
##        Df Sum of Sq  RSS AIC
## - d2    1      19.4  969 344
## <none>               950 345
## - d     1      30.8  981 347
## + d3    1      13.9  936 347
## + dl    1      13.6  936 347
## + di    1      12.1  938 347
## + di2   1      10.3  940 348
## - dr    1      53.3 1003 352
## 
## Step:  AIC=343.9
## h ~ d + dr
## 
##        Df Sum of Sq  RSS AIC
## <none>               969 344
## + d3    1      22.4  947 344
## + d2    1      19.4  950 345
## + dl    1      12.2  957 346
## + di    1       9.0  960 347
## + di2   1       6.4  963 348
## - d     1      78.1 1047 356
## - dr    1     204.3 1174 381
summary(mfin)
## 
## Call:
## lm(formula = h ~ d + dr, data = dapcc)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.360 -1.131  0.044  1.167 10.831 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -17.075      4.283   -3.99  9.1e-05 ***
## d             -1.216      0.289   -4.21  3.7e-05 ***
## dr            15.313      2.249    6.81  9.2e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.1 on 220 degrees of freedom
## Multiple R-squared:  0.784,  Adjusted R-squared:  0.782 
## F-statistic:  400 on 2 and 220 DF,  p-value: <2e-16

## Resultado.
layout(matrix(c(1,1,2,3,4,5),2,3))
plot(h~d, dapcc)
lines(fitted(mfin)~d, dapcc, col=2)
plot(mfin)

plot of chunk unnamed-chunk-3

layout(1)

##-----------------------------------------------------------------------------
## Medidas de influência das observações.

inf <- influence.measures(mfin)
summary(inf)
## Potentially influential observations of
##   lm(formula = h ~ d + dr, data = dapcc) :
## 
##     dfb.1_ dfb.d dfb.dr dffit   cov.r   cook.d hat    
## 1   -0.25  -0.22  0.24  -0.27    1.17_*  0.02   0.14_*
## 2    0.12   0.11 -0.12   0.14    1.11_*  0.01   0.09_*
## 3   -0.01   0.00  0.01  -0.01    1.10_*  0.00   0.08_*
## 4    0.19   0.16 -0.18   0.22    1.07_*  0.02   0.06_*
## 5   -0.04  -0.03  0.04  -0.05    1.08_*  0.00   0.06_*
## 6   -0.09  -0.08  0.08  -0.11    1.07_*  0.00   0.05_*
## 7   -0.03  -0.03  0.03  -0.04    1.07_*  0.00   0.05_*
## 8   -0.12  -0.10  0.11  -0.15    1.06_*  0.01   0.05_*
## 9    0.10   0.08 -0.09   0.12    1.06_*  0.00   0.05_*
## 10  -0.41  -0.33  0.37  -0.56_*  0.94_*  0.10   0.04  
## 11  -0.04  -0.03  0.04  -0.05    1.05_*  0.00   0.04  
## 15   0.38   0.25 -0.31   0.78_*  0.71_*  0.18   0.02  
## 29  -0.02   0.03 -0.01  -0.25    0.96_*  0.02   0.01  
## 41  -0.17  -0.26  0.23   0.58_*  0.69_*  0.10   0.01  
## 160  0.02   0.00 -0.01  -0.20    0.94_*  0.01   0.01  
## 194 -0.08  -0.11  0.09  -0.27    0.93_*  0.02   0.01  
## 209 -0.25  -0.31  0.28  -0.50_*  0.87_*  0.08   0.02  
## 221  0.03   0.04 -0.04   0.05    1.05_*  0.00   0.03  
## 222  0.13   0.15 -0.14   0.19    1.04_*  0.01   0.04  
## 223 -0.19  -0.22  0.20  -0.26    1.05_*  0.02   0.05_*

## Sinalizando os pontos influentes.
str(inf)
## List of 3
##  $ infmat: num [1:223, 1:7] -0.24888 0.12312 -0.00557 0.19068 -0.04001 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:223] "1" "2" "3" "4" ...
##   .. ..$ : chr [1:7] "dfb.1_" "dfb.d" "dfb.dr" "dffit" ...
##  $ is.inf: logi [1:223, 1:7] FALSE FALSE FALSE FALSE FALSE FALSE ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:223] "1" "2" "3" "4" ...
##   .. ..$ : chr [1:7] "dfb.1_" "dfb.d" "dfb.dr" "dffit" ...
##  $ call  : language lm(formula = h ~ d + dr, data = dapcc)
##  - attr(*, "class")= chr "infl"

## Influentes pelo DFITS (influência sobre ajuste).
dfits <- inf$is.inf[,4]

plot(h~d, dapcc)
lines(fitted(mfin)~d, dapcc, col=2)
with(dapcc, points(d[dfits], h[dfits], col=2, pch=19))

plot of chunk unnamed-chunk-3


##-----------------------------------------------------------------------------
## Identificar/remover os pontos discrepantes/influentes manualmente
## (critério visual).

plot(residuals(mfin)~d, dapcc)

plot of chunk unnamed-chunk-3

## id <- identify(dapcc$d, residuals(mfin))
## id <- c(10L, 15L, 41L, 209L)
id <- which(dfits)

## Refazer a análise com os pontos removidos.
dapcc2 <- dapcc[-id,]
str(dapcc2)
## 'data.frame':    219 obs. of  8 variables:
##  $ d  : num  4.87 5.73 5.95 6.33 6.4 ...
##  $ h  : num  9.5 13.5 13 15.5 13.5 13.5 14.3 13.5 16 15 ...
##  $ d2 : num  23.7 32.8 35.4 40.1 40.9 ...
##  $ d3 : num  116 188 211 254 262 ...
##  $ dr : num  2.21 2.39 2.44 2.52 2.53 ...
##  $ dl : num  1.58 1.75 1.78 1.85 1.86 ...
##  $ di : num  0.205 0.175 0.168 0.158 0.156 ...
##  $ di2: num  0.0422 0.0305 0.0282 0.0249 0.0244 ...
mfinb <- lm(h~d+dr, data=dapcc2)
summary(mfinb)
## 
## Call:
## lm(formula = h ~ d + dr, data = dapcc2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.615 -1.032  0.118  1.232  3.994 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -15.210      3.585   -4.24  3.3e-05 ***
## d             -1.034      0.241   -4.29  2.7e-05 ***
## dr            14.081      1.878    7.50  1.7e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.73 on 216 degrees of freedom
## Multiple R-squared:  0.846,  Adjusted R-squared:  0.844 
## F-statistic:  593 on 2 and 216 DF,  p-value: <2e-16
summary(mfin)
## 
## Call:
## lm(formula = h ~ d + dr, data = dapcc)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.360 -1.131  0.044  1.167 10.831 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -17.075      4.283   -3.99  9.1e-05 ***
## d             -1.216      0.289   -4.21  3.7e-05 ***
## dr            15.313      2.249    6.81  9.2e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.1 on 220 degrees of freedom
## Multiple R-squared:  0.784,  Adjusted R-squared:  0.782 
## F-statistic:  400 on 2 and 220 DF,  p-value: <2e-16

## Resultado.
layout(matrix(c(1,1,2,3,4,5),2,3))
plot(h~d, dapcc2)
lines(fitted(mfinb)~d, dapcc2, col=2) ## modelo com as remoções
lines(fitted(mfin)~d, dapcc, col=3)   ## modelo com todas observações
plot(mfinb)

plot of chunk unnamed-chunk-3

layout(1)

##-----------------------------------------------------------------------------
## Denconfia da normalidade por causa do leve arqueamento dos resíduos
## no qqnorm? Devemos transformar? Qual transformação usar?

plot(mfinb, which=2)

plot of chunk unnamed-chunk-3

MASS::boxcox(mfinb, lambda=seq(0, 2, l=100)) # intervalo contém o 1?

plot of chunk unnamed-chunk-3


## Qual o resultado dos testes?
shapiro.test(residuals(mfinb))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(mfinb)
## W = 0.9893, p-value = 0.1028
shapiro.test(rstudent(mfinb))
## 
##  Shapiro-Wilk normality test
## 
## data:  rstudent(mfinb)
## W = 0.9889, p-value = 0.09033
ks.test(rstudent(mfinb), "pnorm", mean=0, sd=1)
## Warning: ties should not be present for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  rstudent(mfinb)
## D = 0.0426, p-value = 0.8226
## alternative hypothesis: two-sided
ks.test(rstudent(mfinb), "pt", df=nrow(dapcc)-length(coef(mfin)-1))
## Warning: ties should not be present for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  rstudent(mfinb)
## D = 0.0425, p-value = 0.8249
## alternative hypothesis: two-sided

##-----------------------------------------------------------------------------
## Tudo para encontrar o modelo, vamos predizer a artura das árvores e
## salvar num arquivo.

hpred <- predict(mfinb, newdata=dap)
str(hpred)
##  Named num [1:991] 2.4 3.74 5.15 6.15 6.15 ...
##  - attr(*, "names")= chr [1:991] "960" "870" "616" "261" ...
dap$hpred <- hpred
str(dap)
## 'data.frame':    991 obs. of  9 variables:
##  $ d    : num  1.94 2.29 2.71 3.02 3.02 ...
##  $ h    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ d2   : num  3.77 5.25 7.32 9.14 9.14 ...
##  $ d3   : num  7.32 12.04 19.81 27.65 27.65 ...
##  $ dr   : num  1.39 1.51 1.64 1.74 1.74 ...
##  $ dl   : num  0.664 0.829 0.995 1.107 1.107 ...
##  $ di   : num  0.515 0.436 0.37 0.331 0.331 ...
##  $ di2  : num  0.265 0.19 0.137 0.109 0.109 ...
##  $ hpred: num  2.4 3.74 5.15 6.15 6.15 ...

## write.table(dap, "dap.xls", sep="\t", quote=FALSE, row.names=FALSE,
##             dec=".")

##-----------------------------------------------------------------------------
## Predição.

## range(dapcc2$d)
d.new <- seq(4, 30, length=100)
d.new
##   [1]  4.000  4.263  4.525  4.788  5.051  5.313  5.576  5.838  6.101  6.364  6.626  6.889
##  [13]  7.152  7.414  7.677  7.939  8.202  8.465  8.727  8.990  9.253  9.515  9.778 10.040
##  [25] 10.303 10.566 10.828 11.091 11.354 11.616 11.879 12.141 12.404 12.667 12.929 13.192
##  [37] 13.455 13.717 13.980 14.242 14.505 14.768 15.030 15.293 15.556 15.818 16.081 16.343
##  [49] 16.606 16.869 17.131 17.394 17.657 17.919 18.182 18.444 18.707 18.970 19.232 19.495
##  [61] 19.758 20.020 20.283 20.545 20.808 21.071 21.333 21.596 21.859 22.121 22.384 22.646
##  [73] 22.909 23.172 23.434 23.697 23.960 24.222 24.485 24.747 25.010 25.273 25.535 25.798
##  [85] 26.061 26.323 26.586 26.848 27.111 27.374 27.636 27.899 28.162 28.424 28.687 28.949
##  [97] 29.212 29.475 29.737 30.000

pred <- data.frame(d=d.new, dr=sqrt(d.new))

## Fazendo predição com intervalo de confiança e predição futura.
Yp <- predict(mfinb, newdata=pred,
              interval="confidence")
Yf <- predict(mfinb, newdata=pred,
              interval="prediction")
colnames(Yf) <- toupper(colnames(Yf))
head(Yp)
##      fit    lwr   upr
## 1  8.816  7.177 10.45
## 2  9.454  7.925 10.98
## 3 10.064  8.637 11.49
## 4 10.650  9.318 11.98
## 5 11.212  9.969 12.45
## 6 11.753 10.593 12.91

pred <- cbind(pred, Yp, Yf)
str(pred)
## 'data.frame':    100 obs. of  8 variables:
##  $ d  : num  4 4.26 4.53 4.79 5.05 ...
##  $ dr : num  2 2.06 2.13 2.19 2.25 ...
##  $ fit: num  8.82 9.45 10.06 10.65 11.21 ...
##  $ lwr: num  7.18 7.92 8.64 9.32 9.97 ...
##  $ upr: num  10.5 11 11.5 12 12.5 ...
##  $ FIT: num  8.82 9.45 10.06 10.65 11.21 ...
##  $ LWR: num  5.03 5.72 6.37 6.99 7.58 ...
##  $ UPR: num  12.6 13.2 13.8 14.3 14.8 ...

## Inclusão da expressão do modelo.
c0 <- coef(mfinb)
co <- format(c(abs(c0), summary(mfinb)$r.squared),
             digits=3, trim=TRUE)
sinal <- ifelse(c0<0, "-", "+")
co[seq_along(sinal)] <- paste(sinal, co[seq_along(sinal)], sep="")

l <- c("Predito", "IC predito", "IC obs. futura")
key <- list(#corner=c(0.05,0.95),
            lines=list(lty=1),
            text=list(l[1]),
            rect=list(col=c("red"), alpha=0.1, lty=3),
            text=list(l[2]),
            rect=list(col=c("blue"), alpha=0.1, lty=3),
            text=list(l[3]))

xyplot(h~d, data=dapcc2, xlab="DAP (cm)", ylab="Altura (m)", key=key)+
    as.layer(xyplot(fit~d, data=pred, type="l",
                    ly=pred$lwr, uy=pred$upr, cty="bands", fill="red",
                    prepanel=prepanel.cbH, panel=panel.cbH))+
    as.layer(xyplot(FIT~d, data=pred, type="l",
                    ly=pred$LWR, uy=pred$UPR, cty="bands", fill="blue",
                    prepanel=prepanel.cbH, panel=panel.cbH))+
    layer(panel.text(
        x=20, y=15,
        label=substitute(hat(h)==b0~b1*d~b2*sqrt(d)~~~~(R^2==r2),
            list(b0=co[1], b1=co[2], b2=co[3], r2=co[4])), bty="n"))

plot of chunk unnamed-chunk-3