Estatística Computacional II
Neste exemplo, será usado o Jackknife para fazer inferência sobre o coeficiente de correlação de Pearson.
# Jackknife inference for Pearson's correlation coefficient using the
# built-in `swiss` data set.
# Fertility (y) vs percentage of males employed in agriculture (x).
plot(Fertility ~ Agriculture, data = swiss)
# Pearson's correlation coefficient has a standard inference method.
# However, it depends on its assumptions being met, etc.  Shown here
# for later comparison with the jackknife interval.
with(swiss, cor.test(x = Fertility, y = Agriculture))
##
## Pearson's product-moment correlation
##
## data: Fertility and Agriculture
## t = 2.5316, df = 45, p-value = 0.01492
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.07334947 0.58130587
## sample estimates:
## cor
## 0.3530792
# Estimate using all observations.
rho <- with(swiss, cor(x = Fertility, y = Agriculture))
rho
## [1] 0.3530792
n <- nrow(swiss) # Number of observations.
i <- seq_len(n)  # Indices (safe even for n = 0, unlike 1:n).
# Partial (leave-one-out) estimates: the correlation recomputed with
# observation ii deleted.  vapply() instead of sapply() guarantees a
# numeric vector is returned; the values are unchanged.
pe <- vapply(i,
             FUN = function(ii) {
                 with(swiss[-ii, ],
                      cor(x = Fertility, y = Agriculture))
             },
             FUN.VALUE = numeric(1))
sort(pe)
## [1] 0.2527003 0.3130895 0.3236550 0.3252307 0.3295657 0.3373877 0.3384854
## [8] 0.3394874 0.3431902 0.3438933 0.3463836 0.3472831 0.3473921 0.3484359
## [15] 0.3515465 0.3525519 0.3526445 0.3529275 0.3532247 0.3544081 0.3544769
## [22] 0.3547668 0.3548660 0.3556879 0.3573136 0.3575933 0.3580389 0.3582101
## [29] 0.3584283 0.3598261 0.3600821 0.3603786 0.3607963 0.3622056 0.3629919
## [36] 0.3630053 0.3633260 0.3634777 0.3639524 0.3646809 0.3665158 0.3690244
## [43] 0.3693343 0.3700471 0.3787130 0.3872608 0.3920289
# Pseudo-values: pv_i = n * rho - (n - 1) * rho_(-i).
pv <- n * rho - (n - 1) * pe
sort(pv)
## [1] -1.438609208 -1.219276002 -0.826074886 -0.427443555 -0.394654825
## [6] -0.380399765 -0.265005713 -0.180598644 -0.147089195 -0.125250512
## [11] -0.118273658 -0.103522641 -0.102906932 -0.066738055 -0.001907453
## [16] 0.017305935 0.030944693 0.042721510 0.107019784 0.117054764
## [21] 0.124930800 0.145427596 0.158296835 0.233079893 0.270887611
## [26] 0.275450148 0.288782306 0.291947068 0.346384176 0.360058426
## [31] 0.373075948 0.377332330 0.423580770 0.566671387 0.614684885
## [36] 0.619697687 0.661076544 0.775631850 0.807971775 0.978302879
## [41] 1.024391458 1.074886874 1.434699932 1.634111551 1.706590742
## [46] 2.192606635 4.970505848
# CAUTION: some pseudo-values fall outside the parameter space of the
# correlation, which is [-1, 1].
# Jackknife point estimate (mean of the pseudo-values).
mean(pv)
## [1] 0.3669864
# Jackknife standard error.
sqrt(var(pv)/n)
## [1] 0.1407584
# Jackknife confidence interval (assumes the pseudo-values are
# independent and normally distributed).
mean(pv) + c(-1, 1) * qt(p = 0.975, df = n - 1) * sqrt(var(pv)/n)
## [1] 0.0836545 0.6503182
# Confidence interval for Pearson's correlation, for comparison with
# the jackknife interval (formula interface of cor.test()).
cor.test(~ Fertility + Agriculture, data = swiss)$conf.int
## [1] 0.07334947 0.58130587
## attr(,"conf.level")
## [1] 0.95
Qual dos dois tipos de intervalo de confiança é mais seguro (para não usar o termo confiável)? O que significa ser mais seguro?
# Segmented ("broken-stick") regression: consumption (C) as a function
# of temperature (Temp) -- flat at level th0 up to the breakpoint th2,
# then linear with slope th1.
library(tidyverse)
# Load the data.
data(segreg, package = "alr3")
# Visualize the data.
ggplot(segreg, aes(x = Temp, y = C)) +
geom_point() +
geom_smooth(se = FALSE, span = 0.4)
## `geom_smooth()` using method = 'loess'
# Finding starting values: th0 = plateau level, th1 = slope after the
# breakpoint, th2 = breakpoint temperature (read off the scatterplot).
start <- list(th0 = 75,
th1 = 0.5,
th2 = 50)
ggplot(segreg, aes(x = Temp, y = C)) +
geom_point() +
geom_smooth(se = FALSE, span = 0.4)
## `geom_smooth()` using method = 'loess'
# Overlay the curve implied by the starting values on the data.
plot(C ~ Temp, data = segreg)
with(start, {
curve(th0 + th1 * (x - th2) * (x >= th2) + 0 * (x < th2), add = TRUE)
})
# Fit the model to the data.
n0 <- nls(C ~ th0 + th1 * (Temp - th2) * (Temp >= th2) +
0 * (Temp < th2),
data = segreg,
start = start)
summary(n0)
##
## Formula: C ~ th0 + th1 * (Temp - th2) * (Temp >= th2) + 0 * (Temp < th2)
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## th0 74.6953 1.3433 55.607 < 2e-16 ***
## th1 0.5674 0.1006 5.641 2.10e-06 ***
## th2 41.9512 4.6583 9.006 9.43e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.373 on 36 degrees of freedom
##
## Number of iterations to convergence: 4
## Achieved convergence tolerance: 1.141e-08
# Wald (asymptotic normality) confidence intervals for the parameters.
confint.default(n0)
## 2.5 % 97.5 %
## th0 72.0625625 77.3281125
## th1 0.3702916 0.7645672
## th2 32.8212184 51.0812375
# Fitted curve overlaid on the data.
plot(C ~ Temp, data = segreg)
with(as.list(coef(n0)), {
curve(th0 + th1 * (x - th2) * (x >= th2) + 0 * (x < th2), add = TRUE)
})
#-----------------------------------------------------------------------
# Applying the Jackknife method to the nls fit.
# Pseudo-values of the parameter vector, one column per deleted
# observation.
n <- nrow(segreg)
i <- seq_len(n)
# vapply() with coef(n0) as the template returns a numeric matrix with
# the parameter names on the rows -- same result as sapply(), but with
# the return type guaranteed.
pv <- vapply(i,
             FUN = function(ii) {
                 # Refit the model without observation ii, starting
                 # from the full-data estimates.
                 n1 <- update(n0,
                              data = segreg[-ii, ],
                              start = coef(n0))
                 # Pseudo-values.
                 n * coef(n0) - (n - 1) * coef(n1)
             },
             FUN.VALUE = coef(n0))
pv
## [,1] [,2] [,3] [,4] [,5] [,6]
## th0 103.8767098 68.8478030 73.1790431 58.3058430 68.7659764 79.2263631
## th1 0.5674297 0.5674297 0.5674297 0.5674297 0.5674297 0.5674297
## th2 93.3785551 31.6459300 39.2790205 13.0674780 31.5017241 49.9364168
## [,7] [,8] [,9] [,10] [,11] [,12]
## th0 97.3624965 73.4240164 65.9876697 83.5912964 85.7694564 65.5790430
## th1 0.5674297 0.5674297 0.5674297 0.5674297 0.5674297 0.5674297
## th2 81.8983370 39.7107453 26.6054205 57.6288862 61.4675311 25.8852839
## [,13] [,14] [,15] [,16] [,17] [,18]
## th0 79.6349897 74.0778697 18.783115 87.7115098 74.695339 74.6953392
## th1 0.5674297 0.5674297 -0.838256 0.5674297 -0.512521 0.4687384
## th2 50.6565534 40.8630531 -112.931446 64.8900773 -6.461628 37.2292207
## [,19] [,20] [,21] [,22] [,23] [,24]
## th0 74.6953385 74.6953377 74.6953379 74.695338 74.6953395 74.6953390
## th1 0.9128637 0.4778908 0.1261413 1.433339 0.6711386 0.6726583
## th2 59.3790948 37.5142396 20.3100356 87.825906 47.3775618 51.1900718
## [,25] [,26] [,27] [,28] [,29] [,30]
## th0 74.6953391 74.6953385 74.6953381 74.6953373 74.6953376 74.69534
## th1 0.7105989 0.6166268 0.5711587 0.6031183 0.9077715 0.61858
## th2 59.8015242 76.2712899 39.0170175 32.4853470 26.0920991 40.71884
## [,31] [,32] [,33] [,34] [,35] [,36]
## th0 74.6953381 74.6953386 74.6953381 74.6953384 74.695339 74.6953381
## th1 0.7246327 0.6716795 0.5243044 0.8085346 1.483175 0.5039746
## th2 42.2766428 43.0727384 41.4490078 45.3822387 56.480970 40.8791466
## [,37] [,38] [,39]
## th0 74.695338 74.6953379 74.6953382
## th1 1.137357 -0.4122876 -0.8823282
## th2 52.248344 23.3200786 12.9017403
# Jackknife point estimates and standard errors for each parameter
# (row-wise mean and sd/sqrt(n) of the pseudo-value matrix).
jkest <- cbind("JK_Estimate" = apply(pv, MARGIN = 1, FUN = mean),
"JK_StdError" = apply(pv, MARGIN = 1, FUN = sd)/sqrt(n))
jkest
## JK_Estimate JK_StdError
## th0 74.413230 1.90986928
## th1 0.525906 0.07637956
## th2 40.057567 5.17480685
# Confidence intervals: estimate +/- t-quantile * std. error.
# `me` holds the margins of error (rows lwr/upr, one column per
# parameter); sweep() then adds the point estimates column-wise.
me <- outer(X = c(lwr = -1, upr = 1) *
qt(0.975, df = n - length(coef(n0))),
Y = jkest[, 2],
FUN = "*")
t(sweep(me, MARGIN = 2, STATS = jkest[, 1], FUN = "+"))
## lwr upr
## th0 70.539836 78.2866247
## th1 0.371001 0.6808109
## th2 29.562572 50.5525613
#-----------------------------------------------------------------------
# Using the nlstools::nlsJack() function -- it reproduces the manual
# jackknife results above and additionally flags influential points.
library(nlstools)
##
## 'nlstools' has been loaded.
## IMPORTANT NOTICE: Most nonlinear regression models and data set examples
## related to predictive microbiolgy have been moved to the package 'nlsMicrobio'
jk <- nlsJack(n0)
summary(jk)
##
## ------
## Jackknife statistics
## Estimates Bias
## th0 74.413230 0.28210720
## th1 0.525906 0.04152343
## th2 40.057567 1.89366134
##
## ------
## Jackknife confidence intervals
## Low Up
## th0 70.539836 78.2866247
## th1 0.371001 0.6808109
## th2 29.562572 50.5525613
##
## ------
## Influential values
## * Observation 1 is influential on th0
## * Observation 4 is influential on th0
## * Observation 7 is influential on th0
## * Observation 15 is influential on th0
## * Observation 15 is influential on th1
## * Observation 39 is influential on th1
## * Observation 15 is influential on th2
# Plot the observation numbers in place of points, to locate the
# influential observations flagged by nlsJack() (e.g. 1, 4, 7, 15, 39).
# seq_len() instead of 1:nrow() -- same labels, safer idiom.
ggplot(segreg, aes(x = Temp, y = C)) +
    # geom_point() +
    geom_text(label = seq_len(nrow(segreg)))
# Load the data from the web (Efron & Hastie, CASI data sets).
url <- "https://web.stanford.edu/~hastie/CASI_files/DATA/kidney.txt"
kidney <- read.table(url, header = TRUE, sep = " ")
str(kidney)
## 'data.frame': 157 obs. of 2 variables:
## $ age: int 18 19 19 20 21 21 21 22 22 22 ...
## $ tot: num 2.44 3.86 -1.22 2.3 0.98 -0.5 2.74 -0.12 -1.21 0.99 ...
# Visualize the data.
plot(tot ~ age, data = kidney)
# Fit the polynomial model (degree-5 orthogonal polynomial in age).
m0 <- lm(tot ~ poly(age, degree = 5), data = kidney)
summary(m0)
##
## Call:
## lm(formula = tot ~ poly(age, degree = 5), data = kidney)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1758 -1.3138 0.0441 1.0551 4.5577
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.911e-04 1.454e-01 -0.001 0.999
## poly(age, degree = 5)1 -1.563e+01 1.822e+00 -8.576 1.09e-14 ***
## poly(age, degree = 5)2 -4.758e-01 1.822e+00 -0.261 0.794
## poly(age, degree = 5)3 -1.082e-01 1.822e+00 -0.059 0.953
## poly(age, degree = 5)4 -4.028e-01 1.822e+00 -0.221 0.825
## poly(age, degree = 5)5 -9.224e-01 1.822e+00 -0.506 0.614
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.822 on 151 degrees of freedom
## Multiple R-squared: 0.3287, Adjusted R-squared: 0.3064
## F-statistic: 14.79 on 5 and 151 DF, p-value: 8.509e-12
# Check the model assumptions (residual diagnostic panels).
par(mfrow = c(2, 2))
plot(m0)
layout(1)
# Prediction with a confidence band, over the observed age range.
pred <- with(kidney,
data.frame(age = seq(min(age), max(age), by = 1)))
fit0 <- predict(m0, newdata = pred, interval = "confidence")
pred <- cbind(pred, fit0)
# Plot of the fitted curve with the confidence band.
plot(tot ~ age, data = kidney)
with(pred, matlines(age,
cbind(fit, lwr, upr),
lty = c(1, 2, 2),
col = 1))
# Jackknife for the fitted (predicted) values.
n <- nrow(kidney)
i <- seq_len(n)
# Grid of ages at which the predictions are evaluated.
grid <- data.frame(age = seq(20, 85, by = 5))
grid$fit0 <- predict(m0, newdata = grid)
# Pseudo-values of the predictions: one list element per deleted
# observation, each a vector over the grid points.
res <- lapply(i,
              FUN = function(ii) {
                  # Refit without observation ii and predict on the grid.
                  m1 <- update(m0, data = kidney[-ii, ])
                  fit1 <- predict(m1, newdata = grid)
                  n * grid$fit0 - (n - 1) * fit1
              })
# Collapse the list into a matrix (rows = grid points, cols = obs).
res <- do.call(cbind, res)
# Jackknife point estimate and standard error for each grid point.
res <- apply(res,
             MARGIN = 1,
             FUN = function(x) {
                 x <- na.omit(x)
                 # Divide by length(x), not n, so the SE stays
                 # consistent with the na.omit() guard above (they
                 # coincide here: lm predictions are never NA).
                 c(m = mean(x), s = sd(x)/sqrt(length(x)))
             })
str(res)
## num [1:2, 1:14] 1.344 0.547 0.844 0.218 0.509 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:2] "m" "s"
## ..$ : chr [1:14] "1" "2" "3" "4" ...
# Put all the results together: data, parametric fit with confidence
# band (black), and jackknife estimates with their intervals (red).
plot(tot ~ age, data = kidney)
with(pred, matlines(age,
                    cbind(fit, lwr, upr),
                    lty = c(1, 2, 2),
                    col = 1))
qtl <- qt(0.975, df = df.residual(m0))
# seq_len() instead of 1:nrow() -- safe even for an empty grid.
for (i in seq_len(nrow(grid))) {
    points(grid$age[i], res["m", i], pch = 4, col = "red")
    arrows(grid$age[i],
           res["m", i] - qtl * res["s", i],
           grid$age[i],
           res["m", i] + qtl * res["s", i],
           code = 3,     # heads at both ends -> error bar
           angle = 90,   # flat heads
           length = 0.05,
           col = "red")
}
Estatística Computacional II | Prof. Walmes Zeviani |