1 Loading extra packages

library(knitr)
library(glmmTMB)
library(TMB)
library(tmbstan)
library(ggplot2)
library(tidyverse)
library(cowplot)

2 Water quality index data set

This data set corresponds to a water quality index of hydroelectric power plants operated by the energy company COPEL in the State of Paran, Brazil. The response variable \(y \in (0,1)\) was measured 12 times (3 locations \(\times\) 4 quarters) in each of the 16 power plants. Thus, the dataset is composed of 190 observations (excluding two missing observations). The main goal of the data analysis was to investigate the relationship of the \(y\) with the locations (upstream, reservoir and downstream) controlled by the effect of the quarters and power plants.

data2 <- read.table("IQA_Data_Set.txt", h = T)
data2 <- na.exclude(data2)
head(data2, n = 12)
##    id    y   location quarter
## 1  U1 0.68   Upstream       1
## 2  U1 0.83  Reservoir       1
## 3  U1 0.76 Downstream       1
## 4  U1 0.53   Upstream       2
## 5  U1 0.75  Reservoir       2
## 6  U1 0.64 Downstream       2
## 7  U1 0.72   Upstream       3
## 8  U1 0.81  Reservoir       3
## 9  U1 0.84 Downstream       3
## 10 U1 0.78   Upstream       4
## 11 U1 0.85  Reservoir       4
## 12 U1 0.86 Downstream       4

3 Descriptive analysis

summary(data2)
##        id            y                location  quarter
##  U1     : 12   Min.   :0.5100   Upstream  :63   1:48   
##  U2     : 12   1st Qu.:0.7625   Reservoir :64   2:47   
##  U3     : 12   Median :0.8200   Downstream:63   3:48   
##  U5     : 12   Mean   :0.8022                   4:47   
##  U6     : 12   3rd Qu.:0.8600                          
##  U7     : 12   Max.   :0.9300                          
##  (Other):118
p1 <- ggplot(data2, aes(x = y)) +
        geom_histogram(binwidth = 0.05, colour = "black", 
                       fill = "gray80") + 
        ylab("Frequency") + xlab("y") 

p2 <- data2 %>% 
        ggplot(aes(x = quarter, y = y, fill = location)) +
        geom_boxplot() +
        xlab("Quarter")+ 
        ylab("y") +
        facet_wrap(~ location) +
        scale_fill_grey(start = 0.8, end = 0.8) + theme(legend.position = "none")

p3 <- data2 %>% 
        ggplot(aes(x = id, y = y)) +
        geom_boxplot(fill = "gray80") +
        xlab("Power plant") + 
        ylab("y") 

bottom_row <- plot_grid(p1, p2, labels = c('', '(B)'), align = 'h', rel_widths = c(1, 1.3))
plot_grid(bottom_row, p3, labels = c('(A)', '(C)'), ncol = 1, rel_heights = c(1, 1.2))

4 Beta mixed regression model

4.1 Beta distribution

Let \(Y_1, \ldots, Y_n\) be independent random variables with probability density function given by \[ f(y;\mu,\phi) = \dfrac{\Gamma(\phi)}{\Gamma(\mu \phi) \Gamma[(1 - \mu)\phi]} y^{\mu \phi - 1} (1 - y)^{(1 - \mu) \phi - 1}, \] where \(y, \mu \in (0,1)\) and \(\phi > 0\) denote the precision parameter. According to Ferrari and Cribari-Neto (2004) the expectation and variance of the beta distribution are given, respectively, by

\[\begin{equation} \label{betamoments} \mathrm{E}(Y) = \mu \quad~\mathrm{and}~\quad \mathrm{Var}(Y) = \dfrac{\mu(1-\mu)}{1 + \phi}. \end{equation}\]

4.2 Model specification

For specification of the beta mixed regression model, we assume that \(y\) was measured at the \(k^{th}\) location, \(j^{th}\) quarter and \(i^{th}\) power plant, for \(i = 1, \ldots, 16\). Thus, we considered the following structure

\[\begin{align} \label{eq:linearprediqa} & Y_{ijk} | u_i \sim \mathcal{B}(\mu_{ijk}, \phi), \nonumber \\ \mathrm{logit}(\mu_{ijk}) & = \beta_0 + \beta_{1j}\texttt{quarter}_{ij} + \beta_{2k}\texttt{location}_{ik} + u_{i}, \nonumber \\ & \quad u_i \sim \mathcal{N}(0, \sigma^2), \nonumber \end{align}\] where \(\beta_{1j}\) evaluates the differences from quarter 1 to up to 4, for \(j = 2,3,4\). The parameter \(\beta_{2k}\) measure the changes from the upstream to the reservoir and to the downstream, for \(k = 2, 3\).

5 glmmTMB package

fit_glmmTMB <- glmmTMB::glmmTMB(y ~ quarter + location + (1 | id), 
                                family = beta_family, data = data2)
summary(fit_glmmTMB)
##  Family: beta  ( logit )
## Formula:          y ~ quarter + location + (1 | id)
## Data: data2
## 
##      AIC      BIC   logLik deviance df.resid 
##   -446.1   -420.1    231.0   -462.1      182 
## 
## Random effects:
## 
## Conditional model:
##  Groups Name        Variance Std.Dev.
##  id     (Intercept) 0.03452  0.1858  
## Number of obs: 190, groups:  id, 16
## 
## Overdispersion parameter for beta family (): 30.5 
## 
## Conditional model:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.13561    0.08833  12.857  < 2e-16 ***
## quarter2            0.21885    0.09010   2.429 0.015136 *  
## quarter3            0.30926    0.09077   3.407 0.000657 ***
## quarter4            0.05210    0.08766   0.594 0.552297    
## locationReservoir   0.23701    0.07817   3.032 0.002428 ** 
## locationDownstream  0.16363    0.07776   2.104 0.035366 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Precision parameter
glmmTMB::sigma(fit_glmmTMB)
## [1] 30.47401

6 Template Model Builder (TMB) package

For details about TMB package see Kristensen et al. (2016).

6.1 Code written in C++ template

// Beta mixed regression model 
#include <TMB.hpp>

template<class Type>
Type objective_function<Type>::operator() ()
{
  DATA_VECTOR(Y);         // Observations
  DATA_SPARSE_MATRIX(X);  // Fixed effect design matrix
  PARAMETER_VECTOR(beta); // Fixed effects vector
  DATA_SPARSE_MATRIX(Z);  // Random effect design matrix
  PARAMETER_VECTOR(u);    // Random effects vector
  PARAMETER(logsigma);    // Random effect standard deviation
  PARAMETER(logphi);      // Precision parameter (Beta)

  // Preparing
  Type phi = exp(logphi);
  Type sigma = exp(logsigma);
  
  // Distribution of random effect (u):
  Type nll = 0;
  nll -= dnorm(u, Type(0), sigma, true).sum();

  // Linear predictor for mean
  vector<Type> mu = exp(X * beta + Z * u)/(1 + exp(X * beta + Z * u)); 
    
  // Log-likelihood
    for(int i=0; i < Y.size(); i++)
    nll -= dbeta(Y(i), mu(i)*phi, (1 - mu(i))*phi, true);
  
  // Delta method  
    ADREPORT(sigma);
    ADREPORT(phi);
  return nll;
}

6.2 Compiling and loading

TMB::compile("Beta_Mixed.cpp")
dyn.load(TMB::dynlib("Beta_Mixed"))

6.3 Data set

X <- model.matrix(~ quarter + location, data = data2)
Z <- model.matrix(~ factor(id)-1, data2)
X <- as(X,"dgTMatrix")
betas <- rnorm(ncol(X))
Z <- as(Z,"dgTMatrix")
u <- rnorm(ncol(Z))
# Response variable
Y <- data2$y
data <- list(Y = Y, X = X, Z = Z)

6.4 Initial values

parameters <- list(beta = betas, logsigma = 1, logphi = 1, u = u*0)

6.5 Log-likelihood function

Beta_TMB <- TMB::MakeADFun(data, parameters, DLL = "Beta_Mixed", 
                           random = "u",
                           hessian = TRUE, silent = TRUE)

6.6 Optimizing via nlminb()

fit_TMB <- nlminb(start = Beta_TMB$par, objective = Beta_TMB$fn, 
                  gradient = Beta_TMB$gr)
fit_TMB
## $par
##        beta        beta        beta        beta        beta        beta 
##  1.13561049  0.21885254  0.30925703  0.05209487  0.23700869  0.16362575 
##    logsigma      logphi 
## -1.68308174  3.41687498 
## 
## $objective
## [1] -231.0469
## 
## $convergence
## [1] 0
## 
## $iterations
## [1] 46
## 
## $evaluations
## function gradient 
##       59       47 
## 
## $message
## [1] "relative convergence (4)"

6.7 Getting parameter estimates

report <- TMB::sdreport(Beta_TMB)

6.8 Summary for the fixed effects

summary(report, "fixed", p.value = TRUE)
##             Estimate Std. Error    z value    Pr(>|z^2|)
## beta      1.13561049 0.08832735 12.8568385  7.873142e-38
## beta      0.21885254 0.09009649  2.4290906  1.513675e-02
## beta      0.30925703 0.09077102  3.4070019  6.568067e-04
## beta      0.05209487 0.08765509  0.5943165  5.523004e-01
## beta      0.23700869 0.07816587  3.0321251  2.428385e-03
## beta      0.16362575 0.07776378  2.1041384  3.536637e-02
## logsigma -1.68308174 0.26337707 -6.3903882  1.654651e-10
## logphi    3.41687498 0.10667377 32.0310712 4.028587e-225

6.9 Delta method for \(\sigma\) and \(\phi\)

summary(report, "report", p.value = TRUE)
##         Estimate Std. Error  z value   Pr(>|z^2|)
## sigma  0.1858005 0.04893559 3.796838 1.465537e-04
## phi   30.4740340 3.25077996 9.374376 6.958608e-21

6.10 Coefficients: glmmTMB versus TMB

tab <- rbind("glmmTMB" = fit_glmmTMB$fit$par[1:6], 
             "TMB Beta" = c(fit_TMB$par[1:6]))
colnames(tab) <- c("(Intercept)", "quarter2", "quarter3", 
                   "quarter4", "locationReservoir", "locationDownstream")
tab
##          (Intercept)  quarter2  quarter3   quarter4 locationReservoir
## glmmTMB      1.13561 0.2188536 0.3092573 0.05209532         0.2370077
## TMB Beta     1.13561 0.2188525 0.3092570 0.05209487         0.2370087
##          locationDownstream
## glmmTMB           0.1636258
## TMB Beta          0.1636258

6.11 Log-likelihood value

tab2 <- rbind("glmmTMB" = logLik(fit_glmmTMB), "TMB Beta" = abs(fit_TMB$objective))
colnames(tab2) <- "LogLik"
tab2
##            LogLik
## glmmTMB  231.0469
## TMB Beta 231.0469

7 Profile

# Data
data <- as.data.frame(TMB::tmbprofile(Beta_TMB, 1, trace = F))

# mle
mle = fit_TMB$par[1]

p_beta0 <- ggplot() +
        geom_line(data = data, aes(x = beta, y = -1*value), size = 1) +
        theme_minimal(base_size = 16) +
        labs(x = expression(beta[0]), y = "Log-lik") +
        geom_vline(xintercept = mle, linetype = "dashed")

# Profile beta12 (quarter2) ---------------------------------------------------
# Data
data <- as.data.frame(TMB::tmbprofile(Beta_TMB, 2, trace = F))

# mle
mle = fit_TMB$par[2]

p_beta12 <- ggplot() +
        geom_line(data = data, aes(x = beta, y = -1*value), size = 1) +
        theme_minimal(base_size = 16) +
        labs(x = expression(beta[12]), y = "Log-lik") +
        geom_vline(xintercept = mle, linetype = "dashed")

# Profile beta13 (quarter3) ---------------------------------------------------
# Data
data <- as.data.frame(TMB::tmbprofile(Beta_TMB, 3, trace = F))

# mle
mle = fit_TMB$par[3]

p_beta13 <- ggplot() +
        geom_line(data = data, aes(x = beta, y = -1*value), size = 1) +
        theme_minimal(base_size = 16) +
        labs(x = expression(beta[13]), y = "Log-lik") +
        geom_vline(xintercept = mle, linetype = "dashed")

# Profile beta14 (quarter4) ---------------------------------------------------
# Data
data <- as.data.frame(TMB::tmbprofile(Beta_TMB, 4, trace = F))

# mle
mle = fit_TMB$par[4]

p_beta14 <- ggplot() +
        geom_line(data = data, aes(x = beta, y = -1*value), size = 1) +
        theme_minimal(base_size = 16) +
        labs(x = expression(beta[14]), y = "Log-lik") +
        geom_vline(xintercept = mle, linetype = "dashed")

# Profile beta22 (locationReservoir) ------------------------------------------
# Data
data <- as.data.frame(TMB::tmbprofile(Beta_TMB, 5, trace = F))

# mle
mle = fit_TMB$par[5]

p_beta22 <- ggplot() +
        geom_line(data = data, aes(x = beta, y = -1*value), size = 1) +
        theme_minimal(base_size = 16) +
        labs(x = expression(beta[22]), y = "Log-lik") +
        geom_vline(xintercept = mle, linetype = "dashed")

# Profile beta23 (locationDownstream) -----------------------------------------
# Data
data <- as.data.frame(TMB::tmbprofile(Beta_TMB, 6, trace = F))

# mle
mle = fit_TMB$par[6]

p_beta23 <- ggplot() +
        geom_line(data = data, aes(x = beta, y = -1*value), size = 1) +
        theme_minimal(base_size = 16) +
        labs(x = expression(beta[23]), y = "Log-lik") +
        geom_vline(xintercept = mle, linetype = "dashed") 

# Profile logsigma ------------------------------------------------------------
# Data
data <- as.data.frame(TMB::tmbprofile(Beta_TMB, 7, trace = F))

# mle
mle = fit_TMB$par[7]

p_logsigma <- ggplot() +
        geom_line(data = data, aes(x = logsigma, y = -1*value), size = 1) +
        theme_minimal(base_size = 16) +
        labs(x = expression(log(sigma)), y = "Log-lik") +
        geom_vline(xintercept = mle, linetype = "dashed")

# Profile logphi --------------------------------------------------------------
# Data
data <- as.data.frame(TMB::tmbprofile(Beta_TMB, 8, trace = F))

# mle
mle = fit_TMB$par[8]

p_logphi <- ggplot() +
        geom_line(data = data, aes(x = logphi, y = -1*value), size = 1) +
        theme_minimal(base_size = 16) +
        labs(x = expression(log(phi)), y = "Log-lik") +
        geom_vline(xintercept = mle, linetype = "dashed")

# Figure
gridExtra::grid.arrange(p_beta0, p_beta12, p_beta13, p_beta14,
                        p_beta22, p_beta23, p_logsigma, p_logphi,
                        ncol = 4, nrow = 2)

8 Unit gamma mixed regression model

For details, see Petterle et al. (2021)

8.1 Unit gamma distribution

Recently, Mousa, El-Sheikh, and Abdel-Fattah (2016) proposed the unit gamma distribution parameterized on mean. Thus, let \(Y \sim \mathrm{UG}\big([\mu^{1/\phi}/(1 - \mu^{1/\phi}], \phi\big)\) denoting a unit gamma random variable with pdf given by \[\begin{equation} \label{eq:ug} f(y; \mu, \phi) = \dfrac{ \Big(\frac{\mu^{1/\phi}}{1 - \mu^{1/\phi}}\Big)^{\phi} }{\Gamma(\phi)} y^{\tfrac{\mu^{1/\phi}}{1 - \mu^{1/\phi}} - 1} \left[ \log\bigg( \dfrac{1}{y} \bigg) \right]^{\phi - 1}, \end{equation}\] where \(y, \mu \in (0,1)\) and \(\phi > 0\). In this notation, \(\mu\) is the mean and \(\phi\) is a precision parameter. Here, the expectation and variance are given by

\[\begin{equation} \label{eq:ugmoments} \nonumber \mathrm{E}(Y) = \mu \quad~\mathrm{and}~\quad \mathrm{Var}(Y) = \mu \left\{ \Bigg[ \dfrac{1}{\big(2 - \mu^{1/\phi}\big)^{\phi}} \Bigg] - \mu \right\}. \end{equation}\]

8.2 Code written in C++ template

// Unit gamma mixed regression model 
#include <TMB.hpp>

template<class Type>
Type objective_function<Type>::operator() ()
{
  DATA_VECTOR(Y);         // Observations
  DATA_SPARSE_MATRIX(X);  // Fixed effect design matrix
  PARAMETER_VECTOR(beta); // Fixed effects vector
  DATA_SPARSE_MATRIX(Z);  // Random effect design matrix
  PARAMETER_VECTOR(u);    // Random effects vector
  PARAMETER(logsigma);    // Random effect standard deviation
  PARAMETER(logphi);      // Precision parameter (Unit gamma)

  // Preparing
  Type phi = exp(logphi);
  Type sigma = exp(logsigma);
  
  // Distribution of random effect (u):
  Type nll = 0;
  nll -= dnorm(u, Type(0), sigma, true).sum();

  // Linear predictor for mean
  vector<Type> mu = exp(X * beta + Z * u)/(1 + exp(X * beta + Z * u)); 
  vector<Type> tau = pow(mu, 1/phi)/(1 - pow(mu, 1/phi));
    
  // Log-likelihood
    for(int i=0; i < Y.size(); i++)
    nll -= phi * log(tau(i)) - lgamma(phi) + (tau(i) - 1) * 
           log(Y(i)) + (phi - 1) * log(-log(Y(i)));
  
  // Delta method 
    ADREPORT(sigma);
    ADREPORT(phi);
  return nll;
}

8.3 Compiling and loading

TMB::compile("Unit_gamma_Mixed.cpp")
dyn.load(TMB::dynlib("Unit_gamma_Mixed"))

8.4 Data set

X <- model.matrix(~ quarter + location, data = data2)
Z <- model.matrix(~ factor(id)-1, data2)
X <- as(X,"dgTMatrix")
betas <- rnorm(ncol(X))
Z <- as(Z,"dgTMatrix")
u <- rnorm(ncol(Z))
# Response variable
Y <- data2$y
data <- list(Y = Y, X = X, Z = Z)

8.5 Initial values

parameters <- list(beta = betas, logsigma = 1, logphi = 1, u = u*0)

8.6 Log-likelihood function

UG_TMB <- TMB::MakeADFun(data, parameters, DLL = "Unit_gamma_Mixed", 
                         random = "u",
                         hessian = TRUE, silent = TRUE)

8.7 Optimizing via nlminb()

fit_UG <- nlminb(start = UG_TMB$par, objective = UG_TMB$fn, gradient = UG_TMB$gr)
fit_UG
## $par
##        beta        beta        beta        beta        beta        beta 
##  1.11639101  0.22936457  0.34524972  0.07421333  0.25877350  0.15479091 
##    logsigma      logphi 
## -1.73549538  1.78965516 
## 
## $objective
## [1] -233.2328
## 
## $convergence
## [1] 0
## 
## $iterations
## [1] 44
## 
## $evaluations
## function gradient 
##       54       45 
## 
## $message
## [1] "relative convergence (4)"

8.8 Getting parameter estimates

report_UG <- TMB::sdreport(UG_TMB)

8.9 Summary for the fixed effects

summary(report_UG, "fixed", p.value = TRUE)
##             Estimate Std. Error    z value   Pr(>|z^2|)
## beta      1.11639101 0.09271391 12.0412455 2.156745e-33
## beta      0.22936457 0.09357306  2.4511816 1.423881e-02
## beta      0.34524972 0.09245312  3.7343220 1.882216e-04
## beta      0.07421333 0.09369410  0.7920811 4.283134e-01
## beta      0.25877350 0.08024972  3.2246030 1.261475e-03
## beta      0.15479091 0.08092527  1.9127635 5.577834e-02
## logsigma -1.73549538 0.27425720 -6.3279848 2.483836e-10
## logphi    1.78965516 0.10411568 17.1891031 3.204512e-66

8.10 Delta method for \(\sigma\) and \(\phi\)

summary(report_UG, "report", p.value = TRUE)
##        Estimate Std. Error  z value   Pr(>|z^2|)
## sigma 0.1763128 0.04835506 3.646212 2.661340e-04
## phi   5.9873874 0.62338091 9.604701 7.637912e-22

9 tmbstan package

9.1 TMB Stan: Beta mixed regression model

options(mc.cores = parallel::detectCores())
rstan_options(auto_write = T)
fit_TMB_Stan <- tmbstan::tmbstan(Beta_TMB, silent = F, laplace = F, chains = 4)
fit_TMB_Stan

# Plots
plot(fit_TMB_Stan, pars = names(Beta_TMB$par))

# Traceplot
trace <- traceplot(fit_TMB_Stan, pars = names(Beta_TMB$par))
trace + scale_color_discrete() + theme(legend.position = "top")

9.2 TMB Stan: Unit gamma mixed regression model

options(mc.cores = parallel::detectCores())
rstan_options(auto_write = T)
fit_TMB_Stan_UG <- tmbstan::tmbstan(UG_TMB, silent = F, laplace = F, chains = 4)
fit_TMB_Stan_UG

# Plots
plot(fit_TMB_Stan_UG, pars = names(UG_TMB$par))

# Traceplot
trace <- traceplot(fit_TMB_Stan_UG, pars = names(UG_TMB$par))
trace + scale_color_discrete() + theme(legend.position = "top")

References

Ferrari, Silvia, and Francisco Cribari-Neto. 2004. “Beta Regression for Modelling Rates and Proportions.” Journal of Applied Statistics 31 (7): 799–815.

Kristensen, Kasper, Anders Nielsen, Casper W Berg, Hans Skaug, and Bradley M Bell. 2016. “TMB: Automatic Differentiation and Laplace Approximation.” Journal of Statistical Software 70 (5).

Mousa, Amany M, Ahmed A El-Sheikh, and Mahmoud A Abdel-Fattah. 2016. “A Gamma Regression for Bounded Continuous Variables.” Advances and Applications in Statistics 49 (4): 305.

Petterle, Ricardo Rasmussen, César Augusto Taconeli, José L P da Silva, Guilherme Parreira da Silva, Henrique Aparecido Laureano, and Wagner Hugo Bonat. 2021. “Unit Gamma Mixed Regression Models for Continuous Bounded Data.” Journal of Statistical Computation and Simulation (Online): 1–19.