Estatística Computacional II

leg.ufpr.br/~walmes/ensino/EC2

1 Exemplo com correlação

Neste exemplo, será usado Jackknife para fazer inferência sobre o coeficiente de correlação de Pearson.

# Fertilidade (y) vs percentual de homens ocupados na agricultura (x).
plot(Fertility ~ Agriculture, data = swiss)

# O coeficiente de correlação de Pearson possui método de
# inferência. Porém, depende do atendimento dos pressupostos, etc.
with(swiss, cor.test(x = Fertility, y = Agriculture))
## 
##  Pearson's product-moment correlation
## 
## data:  Fertility and Agriculture
## t = 2.5316, df = 45, p-value = 0.01492
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.07334947 0.58130587
## sample estimates:
##       cor 
## 0.3530792
# Estimativa com todas as observações.
rho <- with(swiss, cor(x = Fertility, y = Agriculture))
rho
## [1] 0.3530792
n <- nrow(swiss) # Número de observações.
i <- 1:n         # Índices.

# Estimativas parciais (partial estimates).
pe <- sapply(i,
             FUN = function(ii) {
                 with(swiss[-ii, ],
                      cor(x = Fertility, y = Agriculture))
             })
sort(pe)
##  [1] 0.2527003 0.3130895 0.3236550 0.3252307 0.3295657 0.3373877 0.3384854
##  [8] 0.3394874 0.3431902 0.3438933 0.3463836 0.3472831 0.3473921 0.3484359
## [15] 0.3515465 0.3525519 0.3526445 0.3529275 0.3532247 0.3544081 0.3544769
## [22] 0.3547668 0.3548660 0.3556879 0.3573136 0.3575933 0.3580389 0.3582101
## [29] 0.3584283 0.3598261 0.3600821 0.3603786 0.3607963 0.3622056 0.3629919
## [36] 0.3630053 0.3633260 0.3634777 0.3639524 0.3646809 0.3665158 0.3690244
## [43] 0.3693343 0.3700471 0.3787130 0.3872608 0.3920289
# Pseudo valores.
pv <- n * rho - (n - 1) * pe
sort(pv)
##  [1] -1.438609208 -1.219276002 -0.826074886 -0.427443555 -0.394654825
##  [6] -0.380399765 -0.265005713 -0.180598644 -0.147089195 -0.125250512
## [11] -0.118273658 -0.103522641 -0.102906932 -0.066738055 -0.001907453
## [16]  0.017305935  0.030944693  0.042721510  0.107019784  0.117054764
## [21]  0.124930800  0.145427596  0.158296835  0.233079893  0.270887611
## [26]  0.275450148  0.288782306  0.291947068  0.346384176  0.360058426
## [31]  0.373075948  0.377332330  0.423580770  0.566671387  0.614684885
## [36]  0.619697687  0.661076544  0.775631850  0.807971775  0.978302879
## [41]  1.024391458  1.074886874  1.434699932  1.634111551  1.706590742
## [46]  2.192606635  4.970505848
# ATENÇÃO: alguns pseudo valores tem valores fora do espaço paramétrico
# da correlação que é [-1, 1].

# Estimativa pontual Jackknife.
mean(pv)
## [1] 0.3669864
# Erro-padrão Jackknife.
sqrt(var(pv)/n)
## [1] 0.1407584
# Intervalo de confiança Jackknife (supõe independência e normalidade).
mean(pv) + c(-1, 1) * qt(p = 0.975, df = n - 1) * sqrt(var(pv)/n)
## [1] 0.0836545 0.6503182
# Intervalo de confiança para a correlação de Pearson.
with(swiss,
     cor.test(x = Fertility, y = Agriculture)$conf.int)
## [1] 0.07334947 0.58130587
## attr(,"conf.level")
## [1] 0.95

Em qual dos dois tipos de intervalo de confiança é mais seguro (para não usar o termo confiável)? O que é ser mais seguro?

2 Exemplo com regressão não linear

library(tidyverse)

# Carregando dados.
data(segreg, package = "alr3")

# Visualizando os dados.
ggplot(segreg, aes(x = Temp, y = C)) +
    geom_point() +
    geom_smooth(se = FALSE, span = 0.4)
## `geom_smooth()` using method = 'loess'

# Encontrando valores iniciais.
start <- list(th0 = 75,
              th1 = 0.5,
              th2 = 50)

ggplot(segreg, aes(x = Temp, y = C)) +
    geom_point() +
    geom_smooth(se = FALSE, span = 0.4)
## `geom_smooth()` using method = 'loess'

plot(C ~ Temp, data = segreg)
with(start, {
    curve(th0 + th1 * (x - th2) * (x >= th2) + 0 * (x < th2), add = TRUE)
})

# Ajuste do modelo aos dados.
n0 <- nls(C ~ th0 + th1 * (Temp - th2) * (Temp >= th2) +
              0 * (Temp < th2),
          data = segreg,
          start = start)
summary(n0)
## 
## Formula: C ~ th0 + th1 * (Temp - th2) * (Temp >= th2) + 0 * (Temp < th2)
## 
## Parameters:
##     Estimate Std. Error t value Pr(>|t|)    
## th0  74.6953     1.3433  55.607  < 2e-16 ***
## th1   0.5674     0.1006   5.641 2.10e-06 ***
## th2  41.9512     4.6583   9.006 9.43e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.373 on 36 degrees of freedom
## 
## Number of iterations to convergence: 4 
## Achieved convergence tolerance: 1.141e-08
confint.default(n0)
##          2.5 %     97.5 %
## th0 72.0625625 77.3281125
## th1  0.3702916  0.7645672
## th2 32.8212184 51.0812375
# Curva ajustada sobreposta aos dados.
plot(C ~ Temp, data = segreg)
with(as.list(coef(n0)), {
    curve(th0 + th1 * (x - th2) * (x >= th2) + 0 * (x < th2), add = TRUE)
})

#-----------------------------------------------------------------------
# Aplicando o método Jackknife.

# Pseudo valores.
n <- nrow(segreg)
i <- 1:n
pv <- sapply(i,
             FUN = function(ii) {
                 # Reajusta o modelo.
                 n1 <- update(n0,
                              data = segreg[-ii, ],
                              start = coef(n0))
                 # Obtém os pseudo valores.
                 n * coef(n0) - (n - 1) * coef(n1)
             })
pv
##            [,1]       [,2]       [,3]       [,4]       [,5]       [,6]
## th0 103.8767098 68.8478030 73.1790431 58.3058430 68.7659764 79.2263631
## th1   0.5674297  0.5674297  0.5674297  0.5674297  0.5674297  0.5674297
## th2  93.3785551 31.6459300 39.2790205 13.0674780 31.5017241 49.9364168
##           [,7]       [,8]       [,9]      [,10]      [,11]      [,12]
## th0 97.3624965 73.4240164 65.9876697 83.5912964 85.7694564 65.5790430
## th1  0.5674297  0.5674297  0.5674297  0.5674297  0.5674297  0.5674297
## th2 81.8983370 39.7107453 26.6054205 57.6288862 61.4675311 25.8852839
##          [,13]      [,14]       [,15]      [,16]     [,17]      [,18]
## th0 79.6349897 74.0778697   18.783115 87.7115098 74.695339 74.6953392
## th1  0.5674297  0.5674297   -0.838256  0.5674297 -0.512521  0.4687384
## th2 50.6565534 40.8630531 -112.931446 64.8900773 -6.461628 37.2292207
##          [,19]      [,20]      [,21]     [,22]      [,23]      [,24]
## th0 74.6953385 74.6953377 74.6953379 74.695338 74.6953395 74.6953390
## th1  0.9128637  0.4778908  0.1261413  1.433339  0.6711386  0.6726583
## th2 59.3790948 37.5142396 20.3100356 87.825906 47.3775618 51.1900718
##          [,25]      [,26]      [,27]      [,28]      [,29]    [,30]
## th0 74.6953391 74.6953385 74.6953381 74.6953373 74.6953376 74.69534
## th1  0.7105989  0.6166268  0.5711587  0.6031183  0.9077715  0.61858
## th2 59.8015242 76.2712899 39.0170175 32.4853470 26.0920991 40.71884
##          [,31]      [,32]      [,33]      [,34]     [,35]      [,36]
## th0 74.6953381 74.6953386 74.6953381 74.6953384 74.695339 74.6953381
## th1  0.7246327  0.6716795  0.5243044  0.8085346  1.483175  0.5039746
## th2 42.2766428 43.0727384 41.4490078 45.3822387 56.480970 40.8791466
##         [,37]      [,38]      [,39]
## th0 74.695338 74.6953379 74.6953382
## th1  1.137357 -0.4122876 -0.8823282
## th2 52.248344 23.3200786 12.9017403
# Estimativas pontuais de Jackknife e erros padrões.
jkest <- cbind("JK_Estimate" = apply(pv, MARGIN = 1, FUN = mean),
               "JK_StdError" = apply(pv, MARGIN = 1, FUN = sd)/sqrt(n))
jkest
##     JK_Estimate JK_StdError
## th0   74.413230  1.90986928
## th1    0.525906  0.07637956
## th2   40.057567  5.17480685
# Obtendo o IC.
me <- outer(X = c(lwr = -1, upr = 1) *
                qt(0.975, df = n - length(coef(n0))),
            Y = jkest[, 2],
            FUN = "*")
t(sweep(me, MARGIN = 2, STATS = jkest[, 1], FUN = "+"))
##           lwr        upr
## th0 70.539836 78.2866247
## th1  0.371001  0.6808109
## th2 29.562572 50.5525613
#-----------------------------------------------------------------------
# Usando a função nlstools::nlsJack().

library(nlstools)
## 
## 'nlstools' has been loaded.
## IMPORTANT NOTICE: Most nonlinear regression models and data set examples
## related to predictive microbiolgy have been moved to the package 'nlsMicrobio'
jk <- nlsJack(n0)
summary(jk)
## 
## ------
## Jackknife statistics
##     Estimates       Bias
## th0 74.413230 0.28210720
## th1  0.525906 0.04152343
## th2 40.057567 1.89366134
## 
## ------
## Jackknife confidence intervals
##           Low         Up
## th0 70.539836 78.2866247
## th1  0.371001  0.6808109
## th2 29.562572 50.5525613
## 
## ------
## Influential values
## * Observation 1 is influential on th0 
## * Observation 4 is influential on th0 
## * Observation 7 is influential on th0 
## * Observation 15 is influential on th0 
## * Observation 15 is influential on th1 
## * Observation 39 is influential on th1 
## * Observation 15 is influential on th2
ggplot(segreg, aes(x = Temp, y = C)) +
    # geom_point() +
    geom_text(label = 1:nrow(segreg))

3 Exemplo com regressão polinomial

# Carrega os dados da web.
url <- "https://web.stanford.edu/~hastie/CASI_files/DATA/kidney.txt"
kidney <- read.table(url, header = TRUE, sep = " ")
str(kidney)
## 'data.frame':    157 obs. of  2 variables:
##  $ age: int  18 19 19 20 21 21 21 22 22 22 ...
##  $ tot: num  2.44 3.86 -1.22 2.3 0.98 -0.5 2.74 -0.12 -1.21 0.99 ...
# Visualiza os dados.
plot(tot ~ age, data = kidney)

# Ajuste do modelo polinomial.
m0 <- lm(tot ~ poly(age, degree = 5), data = kidney)
summary(m0)
## 
## Call:
## lm(formula = tot ~ poly(age, degree = 5), data = kidney)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1758 -1.3138  0.0441  1.0551  4.5577 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -1.911e-04  1.454e-01  -0.001    0.999    
## poly(age, degree = 5)1 -1.563e+01  1.822e+00  -8.576 1.09e-14 ***
## poly(age, degree = 5)2 -4.758e-01  1.822e+00  -0.261    0.794    
## poly(age, degree = 5)3 -1.082e-01  1.822e+00  -0.059    0.953    
## poly(age, degree = 5)4 -4.028e-01  1.822e+00  -0.221    0.825    
## poly(age, degree = 5)5 -9.224e-01  1.822e+00  -0.506    0.614    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.822 on 151 degrees of freedom
## Multiple R-squared:  0.3287, Adjusted R-squared:  0.3064 
## F-statistic: 14.79 on 5 and 151 DF,  p-value: 8.509e-12
# Checando os pressupostos.
par(mfrow = c(2, 2))
plot(m0)

layout(1)

# Predição com banda de confiança.
pred <- with(kidney,
             data.frame(age = seq(min(age), max(age), by = 1)))
fit0 <- predict(m0, newdata = pred, interval = "confidence")
pred <- cbind(pred, fit0)

# Gráfico da curva ajustada com a banda de confiança.
plot(tot ~ age, data = kidney)
with(pred, matlines(age,
                    cbind(fit, lwr, upr),
                    lty = c(1, 2, 2),
                    col = 1))

# Fazendo Jackknife.
n <- nrow(kidney)
i <- 1:n

# Grid de valores para obter os preditos.
grid <- data.frame(age = seq(20, 85, by = 5))
grid$fit0 <- predict(m0, newdata = grid)

# Obtém os pseudo valores para cada ponto predito.
res <- lapply(i,
              FUN = function(ii) {
                  m1 <- update(m0, data = kidney[-ii, ])
                  fit1 <- predict(m1, newdata = grid)
                  pv <- n * grid$fit0 - (n - 1) * fit1
                  return(pv)
              })
# Desmonta lista para matriz.
res <- do.call(cbind, res)
# Obtém estimativa pontual e erro-padrão Jackknife.
res <- apply(res,
             MARGIN = 1,
             FUN = function(x) {
                 x <- na.omit(x)
                 m <- mean(x)
                 s <- sd(x)/sqrt(n)
                 c(m = m, s = s)
             })
str(res)
##  num [1:2, 1:14] 1.344 0.547 0.844 0.218 0.509 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:2] "m" "s"
##   ..$ : chr [1:14] "1" "2" "3" "4" ...
# Colocando todos os resultados juntos.
plot(tot ~ age, data = kidney)
with(pred, matlines(age,
                    cbind(fit, lwr, upr),
                    lty = c(1, 2, 2),
                    col = 1))
qtl <- qt(0.975, df = df.residual(m0))
for (i in 1:nrow(grid)) {
    points(grid$age[i], res["m", i], pch = 4, col = "red")
    arrows(grid$age[i],
           res["m", i] - qtl * res["s", i],
           grid$age[i],
           res["m", i] + qtl * res["s", i],
           code = 3,
           angle = 90,
           length = 0.05,
           col = "red")
}