Ajuste de modelos de regressão linear

Walmes Zeviani


Definições da sessão

##-----------------------------------------------------------------------------
## Definições da sessão, pacotes a serem usados.

pkg <- c("lattice", "latticeExtra", "reshape", "car", "alr3",
         "plyr", "wzRfun")
sapply(pkg, library, character.only=TRUE, logical.return=TRUE)
##      lattice latticeExtra      reshape          car         alr3         plyr 
##         TRUE         TRUE         TRUE         TRUE         TRUE         TRUE 
##       wzRfun 
##         TRUE

source("lattice_setup.R")

##-----------------------------------------------------------------------------
## Informações sobre as versões dos pacotes.

sessionInfo()
## R version 3.1.1 (2014-07-10)
## Platform: i686-pc-linux-gnu (32-bit)
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=pt_BR.UTF-8       
##  [4] LC_COLLATE=en_US.UTF-8     LC_MONETARY=pt_BR.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=pt_BR.UTF-8       LC_NAME=C                  LC_ADDRESS=C              
## [10] LC_TELEPHONE=C             LC_MEASUREMENT=pt_BR.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  base     
## 
## other attached packages:
## [1] wzRfun_0.1          plyr_1.8.1          alr3_2.0.5          car_2.0-20         
## [5] reshape_0.8.5       latticeExtra_0.6-26 RColorBrewer_1.0-5  lattice_0.20-29    
## [9] knitr_1.5          
## 
## loaded via a namespace (and not attached):
## [1] evaluate_0.5.5 formatR_0.10   grid_3.1.1     MASS_7.3-33    methods_3.1.1 
## [6] nnet_7.3-8     Rcpp_0.11.1    stringr_0.6.2  tools_3.1.1

## obs: Para instalar um pacote faça:
## install.packages("nome_do_pacote", dependencies=TRUE)

Distância para parada em função da velocidade

##-----------------------------------------------------------------------------
## Explorar os dados.

## Estrutura do Arquivo.
str(cars)
## 'data.frame':    50 obs. of  2 variables:
##  $ speed: num  4 4 7 7 8 9 10 10 10 11 ...
##  $ dist : num  2 10 4 22 16 10 18 26 34 17 ...

## Visualização.
xyplot(dist~speed, data=cars, type=c("p","smooth"))

plot of chunk unnamed-chunk-3


##-----------------------------------------------------------------------------
## Estimação por mínimos quadrados via solução matricial.

X <- cbind(b0=1, b1=cars$speed)
head(X)
##      b0 b1
## [1,]  1  4
## [2,]  1  4
## [3,]  1  7
## [4,]  1  7
## [5,]  1  8
## [6,]  1  9

y <- cbind(cars$dist)

## Estimativa dos parâmetros.
solve(t(X)%*%X)%*%t(X)%*%y
##       [,1]
## b0 -17.579
## b1   3.932

##-----------------------------------------------------------------------------
## Usando a lm().

## y ~ Normal(modelo linear, sigma²)
## modelo linear: b0+b1*x.

m0 <- lm(dist~speed, data=cars)
## m0 <- lm(dist~0+speed, data=cars)

c0 <- coef(m0)
c0
## (Intercept)       speed 
##     -17.579       3.932

## Sobrepondo ajuste às observações.
xyplot(dist~speed, data=cars, xlim=c(0,NA), ylim=c(c0[1],NA))+
    layer(panel.abline(a=c0[1], b=c0[2], col=2))

plot of chunk unnamed-chunk-3


## Diagnóstico.
par(mfrow=c(2,2)); plot(m0); layout(1)

plot of chunk unnamed-chunk-3


## Falta de ajuste. Extender para um modelo que permita curvatura da
## função.

##-----------------------------------------------------------------------------
## Ajustar polinômio de segundo grau.

## modelo linear: b0+b1*x+b2*x^2.
m1 <- lm(dist~speed+I(speed^2), data=cars)

## Diagnóstico.
par(mfrow=c(2,2)); plot(m1); layout(1)

plot of chunk unnamed-chunk-3


## Mostra uma possível relação média~variância. 

## Testa o abandono do termo extra por meio da mudança na soma de
## quadrados.
anova(m1, m0)
## Analysis of Variance Table
## 
## Model 1: dist ~ speed + I(speed^2)
## Model 2: dist ~ speed
##   Res.Df   RSS Df Sum of Sq   F Pr(>F)
## 1     47 10825                        
## 2     48 11354 -1      -529 2.3   0.14
summary(m1)
## 
## Call:
## lm(formula = dist ~ speed + I(speed^2), data = cars)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -28.72  -9.18  -3.19   4.63  45.15 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    2.470     14.817    0.17     0.87
## speed          0.913      2.034    0.45     0.66
## I(speed^2)     0.100      0.066    1.52     0.14
## 
## Residual standard error: 15.2 on 47 degrees of freedom
## Multiple R-squared:  0.667,  Adjusted R-squared:  0.653 
## F-statistic: 47.1 on 2 and 47 DF,  p-value: 5.85e-12

## Verifica se cabe uma tranformação.
MASS::boxcox(m1); abline(v=0.5, col=2)

plot of chunk unnamed-chunk-3


## Usar lambda=0.5 como valor para transformar.
xyplot(sqrt(dist)~speed, data=cars)

plot of chunk unnamed-chunk-3


##-----------------------------------------------------------------------------
## Com a variável transformada.

## Modelo quadrático com transformação na resposta.
m2 <- lm(sqrt(dist)~speed+I(speed^2), data=cars)

## Diagnóstico.
par(mfrow=c(2,2)); plot(m2); layout(1)

plot of chunk unnamed-chunk-3


## Teste para as estimativas, h0: beta==0.
summary(m2)
## 
## Call:
## lm(formula = sqrt(dist) ~ speed + I(speed^2), data = cars)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.073 -0.726 -0.183  0.637  3.116 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  0.99034    1.08671    0.91    0.367  
## speed        0.36559    0.14919    2.45    0.018 *
## I(speed^2)  -0.00143    0.00484   -0.30    0.769  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.11 on 47 degrees of freedom
## Multiple R-squared:  0.71,   Adjusted R-squared:  0.698 
## F-statistic: 57.5 on 2 and 47 DF,  p-value: 2.33e-13

##-----------------------------------------------------------------------------
## Volta pro modelo mais simples.

## Modelo que abandona b2, ou seja, b2==0.
m3 <- lm(sqrt(dist)~speed, data=cars)

summary(m3)
## 
## Call:
## lm(formula = sqrt(dist) ~ speed, data = cars)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.068 -0.698 -0.180  0.591  3.153 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.2771     0.4844    2.64    0.011 *  
## speed         0.3224     0.0298   10.83  1.8e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.1 on 48 degrees of freedom
## Multiple R-squared:  0.709,  Adjusted R-squared:  0.703 
## F-statistic:  117 on 1 and 48 DF,  p-value: 1.77e-14

## Teste para o abandono de b2 (o mesmo que o t do summary).
anova(m3, m2)
## Analysis of Variance Table
## 
## Model 1: sqrt(dist) ~ speed
## Model 2: sqrt(dist) ~ speed + I(speed^2)
##   Res.Df  RSS Df Sum of Sq    F Pr(>F)
## 1     48 58.3                         
## 2     47 58.2  1     0.108 0.09   0.77

## Diagnóstico para o modelo final.
par(mfrow=c(2,2)); plot(m3); layout(1)

plot of chunk unnamed-chunk-3


##-----------------------------------------------------------------------------
## Representa os resultados.

## Gráfico do modelo final.
c3 <- coef(m3)
xyplot(sqrt(dist)~speed, data=cars)+
    layer(panel.abline(a=c3[1], b=c3[2], col=2))

plot of chunk unnamed-chunk-3


## Na escala natural.
xyplot(dist~speed, data=cars, xlim=c(0,NA), ylim=c(0,NA))+
    layer(panel.curve((c3[1]+c3[2]*x)^2, col=2))

plot of chunk unnamed-chunk-3


##-----------------------------------------------------------------------------
## Com bandas de confiança.

pred <- data.frame(speed=seq(0, 30, length.out=30))
pred <- cbind(pred, predict(m3, newdata=pred, interval="confidence"))
str(pred)
## 'data.frame':    30 obs. of  4 variables:
##  $ speed: num  0 1.03 2.07 3.1 4.14 ...
##  $ fit  : num  1.28 1.61 1.94 2.28 2.61 ...
##  $ lwr  : num  0.303 0.695 1.086 1.477 1.867 ...
##  $ upr  : num  2.25 2.53 2.8 3.08 3.35 ...

xyplot(sqrt(dist)~speed, data=cars)+
    as.layer(xyplot(fit~speed, data=pred, type="l",
                    ly=pred$lwr, uy=pred$upr, cty="bands",
                    prepanel=prepanel.cbH, panel=panel.cbH))

plot of chunk unnamed-chunk-3


##-----------------------------------------------------------------------------
## Na escala original.

i <- c("fit","lwr","upr")
pred[,i] <- pred[,i]^2

xyplot(dist~speed, data=cars, xlim=c(0,NA), ylim=c(0,NA))+
    as.layer(xyplot(fit~speed, data=pred, type="l",
                    ly=pred$lwr, uy=pred$upr, cty="bands",
                    prepanel=prepanel.cbH, panel=panel.cbH))

plot of chunk unnamed-chunk-3