1 Definições da sessão
2 Importação e inspeção dos dados
3 Ajuste de modelo segmentado linear
4 Ajuste considerando dependência temporal das observações

1 Definições da sessão

#-----------------------------------------------------------------------
#                                            Prof. Dr. Walmes M. Zeviani
#                                leg.ufpr.br/~walmes · github.com/walmes
#                                        walmes@ufpr.br · @walmeszeviani
#                      Laboratory of Statistics and Geoinformation (LEG)
#                Department of Statistics · Federal University of Paraná
#                                       2020-jul-02 · Curitiba/PR/Brazil
#-----------------------------------------------------------------------

#-----------------------------------------------------------------------
# Pacotes.

library(nlme)
library(car)
library(rootSolve)

library(tidyverse)

2 Importação e inspeção dos dados

#-----------------------------------------------------------------------
# Importação.

# Importa.
# tb <- read_csv2("./Walmes.csv", locale = locale(decimal_mark = "."))
tb <- read.csv2("./Walmes.csv", dec = ".", check.names = FALSE)
attr(tb, "spec") <- NULL
str(tb)

## 'data.frame':    105 obs. of  15 variables:
##  $           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $           : int  43900 43901 43902 43903 43904 43905 43906 43907 43908 43909 ...
##  $ Tmin      : num  13.8 14.8 15.4 16.9 14.9 18.7 20.1 19 19.1 18 ...
##  $ T med     : num  20 20.5 20.7 22 21.5 ...
##  $ Tmax      : num  29.2 28.2 30.3 31.2 30.5 31.7 31.8 28.1 28.3 29.1 ...
##  $ Prec      : num  0 0 0 0 0 0 0 1.7 0 4.2 ...
##  $ URmin     : int  31 36 31 23 29 32 36 52 51 46 ...
##  $ URmed     : num  63 64 65 55 65 56 67 75 73 64 ...
##  $ URmax     : int  95 92 99 87 101 80 98 98 95 82 ...
##  $ UAmin     : num  2.81 3.25 2.79 2.08 2.61 ...
##  $ UAmed     : num  5.84 5.97 6.08 5.24 6.15 ...
##  $ Uamax     : num  10.5 9.9 11.2 10.1 11.5 ...
##  $ Incidencia: int  0 5 0 0 0 2 2 5 0 3 ...
##  $ Acumulado : int  0 5 5 5 5 7 9 14 14 17 ...
##  $ Taxa      : num  0 0.259 0.259 0.259 0.259 ...

# Renomeia variáveis.
names(tb)[1:2] <- c("dia", "data")
tb <- tb %>%
    rename(Tmed = `T med`, UAmax = Uamax) %>%
    as_tibble()

# Cria a variável cronológica.
tb <- tb %>%
    mutate(data = as.Date(data, origin = as.Date("1900-01-01")))

# Converte variáveis importadas como character para numérico.
tb <- tb %>%
    mutate_at(vars(Prec, Taxa), as.numeric)

#-----------------------------------------------------------------------
# Análise exploratória.

# Empilha para fazer o gráfico.
tb_long <- tb %>%
    gather(key = "var", value = "val", 3:last_col())
str(tb_long)

## tibble [1,365 × 4] (S3: tbl_df/tbl/data.frame)
##  $ dia : int [1:1365] 1 2 3 4 5 6 7 8 9 10 ...
##  $ data: Date[1:1365], format: "2020-03-12" ...
##  $ var : chr [1:1365] "Tmin" "Tmin" "Tmin" "Tmin" ...
##  $ val : num [1:1365] 13.8 14.8 15.4 16.9 14.9 18.7 20.1 19 19.1 18 ...

# Todas as variáveis.
ggplot(data = tb_long,
       mapping = aes(x = data, y = val)) +
    facet_wrap(facets = ~var, scale = "free_y") +
    geom_point() +
    geom_line()

# Temperatura.
tb %>%
    select(1:2, starts_with("Tm")) %>%
    gather(key = "var", value = "val", -(1:2)) %>%
    ggplot(mapping = aes(x = data, y = val, color = var)) +
    geom_point() +
    geom_line() +
    labs(color = "Value", x = "Date", y = "Temperature")

# Umidade Relativa.
tb %>%
    select(1:2, starts_with("URm")) %>%
    gather(key = "var", value = "val", -(1:2)) %>%
    ggplot(mapping = aes(x = data, y = val, color = var)) +
    geom_point() +
    geom_line() +
    labs(color = "Value", x = "Date", y = "Relative humidity")

# Umidade A???.
tb %>%
    select(1:2, starts_with("UAm")) %>%
    gather(key = "var", value = "val", -(1:2)) %>%
    ggplot(mapping = aes(x = data, y = val, color = var)) +
    geom_point() +
    geom_line() +
    labs(color = "Value", x = "Date", y = "???")

# As variáveis da epidêmia.
tb %>%
    select(1:2, Incidencia:Taxa) %>%
    gather(key = "var", value = "val", -(1:2)) %>%
    ggplot(mapping = aes(x = data, y = val)) +
    facet_wrap(facets = ~var, scale = "free_y", ncol = 1) +
    geom_point() +
    geom_line() +
    labs(x = "Date", y = "Value")

# Apenas a taxa.
ggplot(data = tb,
       mapping = aes(x = data, y = Taxa)) +
    geom_point() +
    geom_line() +
    # scale_y_log10() +
    labs(x = "Date", y = "Rate")

3 Ajuste de modelo segmentado linear

#-----------------------------------------------------------------------
# Ajuste do modelo linear segmentado.

# Gráfico da taxa com sinalização dos possíveis pontos de quebra. Eles
# representam mudanças de regime no fenômeno que alteram a taxa.
plot(Taxa ~ dia, data = tb)
abline(v = c(25, 90), col = "orange")

# Equação do modelo de 2 quebras e 3 segmentos livres conectados.
seg_model <- function(x, th0, th1, th2, th3, b0, b1) {
    th0 + th1 * x +
        th2 * (x - b0) * (x > b0) +
        th3 * (x - b1) * (x > b1)
}

# Afere valores iniciais por tentativa e erro.
plot(Taxa ~ dia, data = tb)
abline(v = c(25, 90), col = "orange")
curve(seg_model(x,
                th0 = 0,
                th1 = 0.2,
                th2 = 0.7,
                th3 = 5,
                b0 = 25,
                b1 = 90),
      add = TRUE, col = "red")

# Ajuste do modelo de 3 segmentos.
seg_3 <- nls(Taxa ~ seg_model(dia, th0, th1, th2, th3, b0, b1),
             data = tb,
             start = list(th0 = 0,
                          th1 = 0.2,
                          th2 = 0.7,
                          th3 = 5,
                          b0 = 25,
                          b1 = 90))
summary(seg_3)

## 
## Formula: Taxa ~ seg_model(dia, th0, th1, th2, th3, b0, b1)
## 
## Parameters:
##     Estimate Std. Error t value Pr(>|t|)    
## th0 -0.79270    1.15747  -0.685   0.4950    
## th1  0.22278    0.09218   2.417   0.0175 *  
## th2  0.63856    0.09347   6.832 6.89e-10 ***
## th3  5.64929    0.15364  36.769  < 2e-16 ***
## b0  21.28837    1.97709  10.768  < 2e-16 ***
## b1  90.86464    0.25104 361.951  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.558 on 99 degrees of freedom
## 
## Number of iterations to convergence: 4 
## Achieved convergence tolerance: 7.381e-07

# Simplificação para o modelo de 2 segmentos.
seg_2 <- nls(Taxa ~ seg_model(dia, th0, th1, th2, th3 = 0, b0, b1 = 90),
             data = tb,
             start = list(th0 = 0,
                          th1 = 0.2,
                          th2 = 0.7,
                          b0 = 25))
summary(seg_2)

## 
## Formula: Taxa ~ seg_model(dia, th0, th1, th2, th3 = 0, b0, b1 = 90)
## 
## Parameters:
##     Estimate Std. Error t value Pr(>|t|)    
## th0  -8.9199     0.7336  -12.16   <2e-16 ***
## th1   0.7749     0.0140   55.34   <2e-16 ***
## th2   5.7357     0.2067   27.75   <2e-16 ***
## b0   90.4481     0.3375  268.00   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.451 on 101 degrees of freedom
## 
## Number of iterations to convergence: 9 
## Achieved convergence tolerance: 1.331e-08

# Simplificação para o modelo de 1 segmentos (regressão linear simples).
seg_1 <- nls(Taxa ~ seg_model(dia, th0, th1, th2 = 0, th3 = 0, b0 = 20, b1 = 90),
             data = tb,
             start = list(th0 = 0,
                          th1 = 0.2))
summary(seg_1)

## 
## Formula: Taxa ~ seg_model(dia, th0, th1, th2 = 0, th3 = 0, b0 = 20, b1 = 90)
## 
## Parameters:
##      Estimate Std. Error t value Pr(>|t|)    
## th0 -19.67927    3.04265  -6.468  3.4e-09 ***
## th1   1.09464    0.04983  21.965  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.48 on 103 degrees of freedom
## 
## Number of iterations to convergence: 1 
## Achieved convergence tolerance: 1.251e-08

# Pontos de quebra.
bk <- c(coef(seg_3), coef(seg_2), coef(seg_1))
bk <- bk[names(bk) %in% c("b0", "b1")]

# Predição.
pred <- data.frame(dia = sort(c(bk,
                                seq(min(tb$dia),
                                    max(tb$dia),
                                    length.out = 71))))
pred$fit_3 <- predict(seg_3, newdata = pred)
pred$fit_2 <- predict(seg_2, newdata = pred)
pred$fit_1 <- predict(seg_1, newdata = pred)

# Gráfico dos ajustes.
plot(Taxa ~ dia, data = tb)
with(pred, lines(dia, fit_3, col = 1))
with(pred, lines(dia, fit_2, col = 2))
with(pred, lines(dia, fit_1, col = 3))

# Razão de verossimilhanças entre os 3 modelos encaixados.
anova(seg_3, seg_2, seg_1)

## Analysis of Variance Table
## 
## Model 1: Taxa ~ seg_model(dia, th0, th1, th2, th3, b0, b1)
## Model 2: Taxa ~ seg_model(dia, th0, th1, th2, th3 = 0, b0, b1 = 90)
## Model 3: Taxa ~ seg_model(dia, th0, th1, th2 = 0, th3 = 0, b0 = 20, b1 = 90)
##   Res.Df Res.Sum Sq Df   Sum Sq F value    Pr(>F)    
## 1     99      647.7                                  
## 2    101     1202.6 -2   -554.9  42.401 4.995e-14 ***
## 3    103    24674.7 -2 -23472.1 985.647 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Intervalos de confiança (método Wald) para os parâmetros.
confint.default(seg_3)

##           2.5 %     97.5 %
## th0 -3.06129472  1.4758971
## th1  0.04210512  0.4034461
## th2  0.45536240  0.8217509
## th3  5.34815218  5.9504256
## b0  17.41335025 25.1633938
## b1  90.37261107 91.3566758

# Interpreação do modelo de 3 segmentos.
summary(seg_3)

## 
## Formula: Taxa ~ seg_model(dia, th0, th1, th2, th3, b0, b1)
## 
## Parameters:
##     Estimate Std. Error t value Pr(>|t|)    
## th0 -0.79270    1.15747  -0.685   0.4950    
## th1  0.22278    0.09218   2.417   0.0175 *  
## th2  0.63856    0.09347   6.832 6.89e-10 ***
## th3  5.64929    0.15364  36.769  < 2e-16 ***
## b0  21.28837    1.97709  10.768  < 2e-16 ***
## b1  90.86464    0.25104 361.951  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.558 on 99 degrees of freedom
## 
## Number of iterations to convergence: 4 
## Achieved convergence tolerance: 7.381e-07

# Pontos de troca de taxa.
coef(seg_3)[c("b0", "b1")]

##       b0       b1 
## 21.28837 90.86464

# Taxa do 1º segmento.
coef(seg_3)["th1"]

##       th1 
## 0.2227756

# Taxa do 2º segmento.
coef(seg_3)["th1"] + coef(seg_3)["th2"]

##       th1 
## 0.8613322

# Taxa do 3º segmento.
coef(seg_3)["th1"] + coef(seg_3)["th2"] + coef(seg_3)["th3"]

##      th1 
## 6.510621

# Intervalos de confiança para as taxas (são funções dos parâmetros)
# usando o método delta.
deltaMethod(object = seg_3, g. = "th1")

##     Estimate       SE    2.5 % 97.5 %
## th1 0.222776 0.092181 0.042105 0.4034

deltaMethod(object = seg_3, g. = "th1 + th2")

##           Estimate       SE    2.5 % 97.5 %
## th1 + th2 0.861332 0.015461 0.831029 0.8916

deltaMethod(object = seg_3, g. = "th1 + th2 + th3")

##                 Estimate      SE   2.5 % 97.5 %
## th1 + th2 + th3  6.51062 0.15286 6.21101 6.8102

# Modelo escrito como função do vetor de parâmetros (lambda).
model_fun <- function(lambda, xx){
    with(as.list(lambda),
         th0 + th1 * xx +
         th2 * (xx - b0) * (xx > b0) +
         th3 * (xx - b1) * (xx > b1))
}

# Matriz de derivadas parciais em lambda (n x p).
# gradient(f = model_fun, x = coef(seg_3), xx = c(0, 0.2, 0.4))
X <- gradient(f = model_fun,
              x = coef(seg_3),
              xx = pred$dia)
str(X)

##  num [1:74, 1:6] 1 1 1 1 1 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:6] "th0" "th1" "th2" "th3" ...

# Etapas até o IC passando pelo erro padrão e margem de erro.
U <- chol(vcov(seg_3))
pred$se <- sqrt(apply(X %*% t(U), 1, function(x) sum(x^2)))
tval <- qt(p = c(lwr = 0.025, upr = 0.975), df = df.residual(seg_3))
pred$me <- outer(pred$se, tval, "*")
pred <- cbind(pred, sweep(pred$me, 1, pred$fit_3, "+"))
str(pred)

## 'data.frame':    74 obs. of  8 variables:
##  $ dia  : num  1 2.49 3.97 5.46 6.94 ...
##  $ fit_3: num  -0.57 -0.239 0.092 0.423 0.754 ...
##  $ fit_2: num  -8.15 -6.99 -5.84 -4.69 -3.54 ...
##  $ fit_1: num  -18.6 -17 -15.3 -13.7 -12.1 ...
##  $ se   : num  1.078 0.963 0.855 0.757 0.672 ...
##  $ me   : num [1:74, 1:2] -2.14 -1.91 -1.7 -1.5 -1.33 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:2] "lwr" "upr"
##  $ lwr  : num  -2.708 -2.15 -1.605 -1.078 -0.579 ...
##  $ upr  : num  1.57 1.67 1.79 1.92 2.09 ...

# Gráfico com bandas de confiança.
ggplot(data = pred,
       mapping = aes(x = dia)) +
    geom_ribbon(data = pred,
                mapping = aes(ymin = lwr, ymax = upr),
                fill = "orange") +
    geom_line(mapping = aes(y = fit_3)) +
    geom_point(data = tb,
               mapping = aes(x = dia, y = Taxa))

4 Ajuste considerando dependência temporal das observações

#-----------------------------------------------------------------------
# Ajuste do modelo linear segmentado com estrutura de dependência.

# Gráficos de autocorrelação dos resíduos.
par(mfrow = c(1, 2))
acf(residuals(seg_3))
abline(h = 0.5, col = "orange")
pacf(residuals(seg_3))

layout(1)

# Modelo que também estima a correlação com estrutura AR(1) -
# autoregressivo de primeira ordem.
seg_3c <- gnls(model = formula(seg_3),
               data = tb,
               start = coef(seg_3),
               correlation = corAR1(value = 0.6))

# Resultado do ajuste.
summary(seg_3c)

## Generalized nonlinear least squares fit
##   Model: formula(seg_3) 
##   Data: tb 
##        AIC      BIC   logLik
##   470.4779 491.7096 -227.239
## 
## Correlation Structure: AR(1)
##  Formula: ~1 
##  Parameter estimate(s):
##       Phi 
## 0.5290446 
## 
## Coefficients:
##        Value Std.Error   t-value p-value
## th0 -0.70867 1.8152670  -0.39039  0.6971
## th1  0.21891 0.1364404   1.60446  0.1118
## th2  0.64050 0.1420697   4.50834  0.0000
## th3  5.59868 0.2174281  25.74955  0.0000
## b0  21.24244 2.7738147   7.65821  0.0000
## b1  90.73963 0.3410998 266.02078  0.0000
## 
##  Correlation: 
##     th0    th1    th2    th3    b0    
## th1 -0.857                            
## th2  0.835 -0.983                     
## th3 -0.012  0.025 -0.070              
## b0  -0.334  0.668 -0.575 -0.074       
## b1   0.012 -0.025  0.081  0.625  0.074
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -5.51485032 -0.24302240 -0.02256651  0.26705159  2.28940200 
## 
## Residual standard error: 2.55301 
## Degrees of freedom: 105 total; 99 residual

# Testa se o parâmetro de correlação é 0 via razão de verossimilhanças.
anova(seg_3c, seg_3)

##        Model df      AIC      BIC    logLik   Test  L.Ratio p-value
## seg_3c     1  8 470.4779 491.7096 -227.2390                        
## seg_3      2  7 503.0285 521.6062 -244.5142 1 vs 2 34.55052  <.0001

# No modelo que acomoda a correlação, ocorre aumento dos erros padrões
# justamente porque o dado correlacionado é menos informativo, resultado
# da sobreposição/redundância de informação que as observações tem.
diag(vcov(seg_3c))/diag(vcov(seg_3))

##      th0      th1      th2      th3       b0       b1 
## 2.459591 2.190824 2.310338 2.002627 1.968352 1.846170

# Intervalos de confiança dos modelos.
cbind(confint(seg_3c), "|" = NA, confint.default(seg_3))

##           2.5 %     97.5 %  |       2.5 %     97.5 %
## th0 -4.26652398  2.8491920 NA -3.06129472  1.4758971
## th1 -0.04850539  0.4863310 NA  0.04210512  0.4034461
## th2  0.36204754  0.9189507 NA  0.45536240  0.8217509
## th3  5.17252496  6.0248275 NA  5.34815218  5.9504256
## b0  15.80586721 26.6790209 NA 17.41335025 25.1633938
## b1  90.07108888 91.4081755 NA 90.37261107 91.3566758

# Intervalos de confiança para as taxas (são funções dos parâmetros)
# usando o método delta. Como mudou a matriz de covariância, mudam-se as
# inferências.
deltaMethod(object = seg_3c, g. = "th1")

##      Estimate        SE     2.5 % 97.5 %
## th1  0.218913  0.136440 -0.048505 0.4863

deltaMethod(object = seg_3c, g. = "th1 + th2")

##           Estimate       SE    2.5 % 97.5 %
## th1 + th2 0.859412 0.025988 0.808476 0.9103

deltaMethod(object = seg_3c, g. = "th1 + th2 + th3")

##                 Estimate      SE   2.5 % 97.5 %
## th1 + th2 + th3  6.45809 0.21239 6.04180 6.8744

# Bandas de confiança.
pred$fit_3c <- predict(seg_3c, newdata = pred)
pred <- pred[, c("dia", "fit_3c")]
U <- chol(vcov(seg_3c))
pred$se <- sqrt(apply(X %*% t(U), 1, function(x) sum(x^2)))
pred$me <- outer(pred$se, tval, "*")
pred <- cbind(pred, sweep(pred$me, 1, pred$fit_3c, "+"))
str(pred)

## 'data.frame':    74 obs. of  6 variables:
##  $ dia   : num  1 2.49 3.97 5.46 6.94 ...
##  $ fit_3c: num  -0.49 -0.165 0.161 0.486 0.811 ...
##   ..- attr(*, "label")= chr "Predicted values"
##  $ se    : num  1.7 1.53 1.38 1.24 1.12 ...
##  $ me    : num [1:74, 1:2] -3.37 -3.04 -2.74 -2.46 -2.21 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:2] "lwr" "upr"
##  $ lwr   : num  -3.86 -3.21 -2.58 -1.97 -1.4 ...
##  $ upr   : num  2.88 2.88 2.9 2.94 3.02 ...

# Gráfico com bandas de confiança.
ggplot(data = pred,
       mapping = aes(x = dia)) +
    geom_ribbon(data = pred,
                mapping = aes(ymin = lwr, ymax = upr),
                fill = "orange") +
    geom_line(mapping = aes(y = fit_3c)) +
    geom_point(data = tb,
               mapping = aes(x = dia, y = Taxa))

# Verificação: gráficos de autocorrelação dos resíduos.
par(mfrow = c(1, 2))
acf(residuals(seg_3, type = "pearson"),
    main = "nls model")
acf(residuals(seg_3c, type = "normalized"),
    main = "gnls model") # Descorrelacionados.

layout(1)

#-----------------------------------------------------------------------

Análise de dados de Covid19 e associações climáticas

Ajuste de modelo segmentado

André Carlos Auler & Walmes Marques Zeviani

2020-07-02

1 Definições da sessão

2 Importação e inspeção dos dados

3 Ajuste de modelo segmentado linear

4 Ajuste considerando dependência temporal das observações