Reproducible Data Analysis of Scientific Cooperations

github.com/walmes/wzCoop

Session Definition

# https://github.com/walmes/wzRfun
# devtools::install_github("walmes/wzRfun")

library(lattice)
library(latticeExtra)
library(plyr)
library(reshape)
library(doBy)
library(multcomp)
library(lme4)
library(lmerTest)
library(wzRfun)
library(wzCoop)

Exploratory data analysis

# Data strucuture.
str(quality_ake_b)
## 'data.frame':    288 obs. of  12 variables:
##  $ yr : int  2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
##  $ hed: Factor w/ 2 levels "heavy","normal": 1 1 1 1 1 1 1 1 1 1 ...
##  $ tra: Factor w/ 4 levels "cnt","fon2","mer2",..: 4 4 4 4 4 4 4 4 4 1 ...
##  $ plo: Factor w/ 12 levels "1A","1B","1C",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ tre: int  1 1 1 2 2 2 3 3 3 4 ...
##  $ rep: int  1 2 3 1 2 3 1 2 3 1 ...
##  $ c0 : int  2 4 7 4 0 0 0 0 2 NA ...
##  $ c1 : int  27 55 53 31 44 38 22 21 25 NA ...
##  $ c2 : int  18 12 12 26 40 37 25 32 36 NA ...
##  $ c3 : int  16 16 14 33 8 18 19 12 15 NA ...
##  $ c4 : int  37 13 14 6 8 7 34 35 22 NA ...
##  $ tot: int  100 100 100 100 100 100 100 100 100 NA ...
# Short names are easier to use.
da <- quality_ake_b
da$yr <- factor(da$yr)

intlev <- c(c0 = "[0]",
            c1 = "[1, 10]",
            c2 = "[11, 35]",
            c3 = "[36, 64]",
            c4 = "[65, 100]")

# Creating the blocks.
da$block <- with(da, {
    factor(ifelse(as.integer(gsub("\\D", "", plo)) > 2, "II", "I"))
})

# Creating unique levels for trees.
da$tre <- with(da,
               interaction(hed, tra, block, tre, drop = TRUE))

# Stack data to plot variables.
db <- melt(da,
           id.vars = 1:6,
           measure.vars = grep("c\\d", names(quality_ake_b)),
           na.rm = TRUE)
names(db)[ncol(db) - 1:0] <- c("categ", "freq")

useOuterStrips(
    xyplot(freq ~ categ | hed + yr,
           groups = tra,
           data = db,
           xlab = "Percentage of the shell surface discolored",
           ylab = "Number of fruits in each class (n = 100)",
           jitter.x = TRUE,
           scales = list(x = list(labels = intlev,
                                  alternating = 1)),
           auto.key = list(columns = 2,
                           title = "Fungicide",
                           cex.title = 1.1),
           type = c("p", "a")))

useOuterStrips(
    xyplot(freq ~ categ | tra + yr,
           groups = hed,
           data = db,
           xlab = "Percentage of the shell surface discolored",
           ylab = "Number of fruits in each class (n = 100)",
           jitter.x = TRUE,
           scales = list(x = list(labels = intlev,
                                  rot = 45,
                                  alternating = 1)),
           auto.key = list(columns = 2,
                           title = "Hed",
                           cex.title = 1.1),
           type = c("p", "a")))

i <- grep("^c\\d$", x = names(da))

# useOuterStrips(
#     splom(~da[, i] | hed + yr,
#           groups = tra,
#           type = c("p", "r"),
#           data = da))
# Acumulated proportion.
cml <- t(apply(da[, i], MARGIN = 1, FUN = cumsum))
cml <- cml[, -length(i)]
colnames(cml) <- paste0("p", 1:ncol(cml))

# Add acummulated proportions.
da <- cbind(da, as.data.frame(cml))

db <- melt(da,
           id.vars = 1:6,
           measure.vars = colnames(cml),
           na.rm = TRUE)
names(db)[ncol(db) - 1:0] <- c("categ", "prop")
str(db)
## 'data.frame':    1104 obs. of  8 variables:
##  $ yr   : Factor w/ 2 levels "2015","2016": 1 1 1 1 1 1 1 1 1 1 ...
##  $ hed  : Factor w/ 2 levels "heavy","normal": 1 1 1 1 1 1 1 1 1 2 ...
##  $ tra  : Factor w/ 4 levels "cnt","fon2","mer2",..: 4 4 4 4 4 4 4 4 4 2 ...
##  $ plo  : Factor w/ 12 levels "1A","1B","1C",..: 1 1 1 1 1 1 1 1 1 2 ...
##  $ tre  : Factor w/ 48 levels "heavy.mer3.I.1",..: 1 1 1 2 2 2 3 3 3 5 ...
##  $ rep  : int  1 2 3 1 2 3 1 2 3 1 ...
##  $ categ: Factor w/ 4 levels "p1","p2","p3",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ prop : int  2 4 7 4 0 0 0 0 2 0 ...
accumlev <- c(p1 = "0",
              p2 = "<=10",
              p3 = "<=35",
              p4 = "<=65")

useOuterStrips(
    xyplot(prop ~ categ | hed + yr,
           groups = tra,
           data = db,
           xlab = "Percentage of the shell surface discolored",
           ylab = "Number of fruits in each class (n = 100)",
           jitter.x = TRUE,
           auto.key = list(columns = 2,
                           title = "Fungicide",
                           cex.title = 1.1),
           scales = list(x = list(labels = accumlev,
                                  alternating = 1)),
           type = c("p", "a")))

useOuterStrips(
    xyplot(prop ~ categ | tra + yr,
           groups = hed,
           data = db,
           xlab = "Percentage of the shell surface discolored",
           ylab = "Number of fruits in each category (n = 100)",
           jitter.x = TRUE,
           auto.key = list(columns = 2,
                           title = "Hed",
                           cex.title = 1.1),
           scales = list(x = list(labels = accumlev,
                                  alternating = 1)),
           type = c("p", "a")))

Analysis for the accumulated proportion

The plots above showed that the greatest difference among levels of treaments, either hed or fungicides, occurs for the accumulated proportions at class \(\leq 35\) (p2). Because of this, the analysis will be for this response variable.

The total number of independent observations results from the calculus \[ \begin{aligned} 288 \text{ observations} &= 2 \text{ years } \times 2 \text{ blocks } \times \\ &\quad\,\, 2 \text{ heds } \times 4 \text{ fungicides } \times \\ &\quad\,\, 3 \text{ trees } \times 3 \text{ bags }.\\ \end{aligned} \]

da$y <- da$p2
da <- da[!is.na(da$y), ]

xyplot(p2 ~ tra | yr,
       groups = hed,
       data = da,
       type = c("p", "a"),
       xlab = "Fungicides",
       ylab = expression("Number of fruits with" <= "35% of the shell surface discolored"),
       jitter.x = TRUE,
       auto.key = list(columns = 2,
                       title = "Hed",
                       cex.title = 1.1))
Figure  1: Number of fruits with 35% or less of the shell surface discolored as a function of the fungicide and hed level for two years.

Figure 1: Number of fruits with 35% or less of the shell surface discolored as a function of the fungicide and hed level for two years.

# xyplot(p2 ~ yr | tre,
#        data = da,
#        xlab = "Years",
#        ylab = "Number of fruits with 35% or less damage",
#        as.table = TRUE,
#        jitter.x = TRUE)

The analysis are being separated for each year. First, although the choosen response variable is limited or bounded (because it is a sample proportion), it doesn’t show problematic departures from a linear (gaussian) model. Linear models are more flexible for analysing clutered data as such.

da15 <- subset(da, yr == "2015")
da16 <- subset(da, yr == "2016")

# Quasi-binomial model.
# m0 <- glm(cbind(p2, 100 - p2) ~ block + yr * hed * tra,
#           family = quasibinomial,
#           data = da)

# Just to check if the model assumptions are meet.
m0 <- lm(p2 ~ block + yr * hed * tra,
         data = da)
par(mfrow = c(2, 2))
plot(m0); layout(1)

# Normal approximation goes well.

A simple linear models is fitted only to check how the residuals behave. No evidence of departures from models assumptions were found.

# Year 2015.
m15 <- lmer(p2 ~ block + hed * tra + (1 | tre),
           data = da15)

# r <- residuals(m15)
# f <- fitted(m15)
# plot(r ~ f)
# qqnorm(r)

anova(m15)
## Analysis of Variance Table of type III  with  Satterthwaite 
## approximation for degrees of freedom
##          Sum Sq Mean Sq NumDF DenDF F.value Pr(>F)
## block    32.467  32.467     1    35 1.05247 0.3120
## hed      18.445  18.445     1    35 0.59794 0.4446
## tra      87.850  29.283     3    35 0.94926 0.4274
## hed:tra 189.774  63.258     3    35 2.05060 0.1246
summary(m15)
## Linear mixed model fit by REML t-tests use Satterthwaite
##   approximations to degrees of freedom [lmerMod]
## Formula: p2 ~ block + hed * tra + (1 | tre)
##    Data: da15
## 
## REML criterion at convergence: 896.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.4619 -0.4981 -0.1348  0.5054  2.6844 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  tre      (Intercept) 166.72   12.912  
##  Residual              30.85    5.554  
## Number of obs: 132, groups:  tre, 44
## 
## Fixed effects:
##                   Estimate Std. Error       df t value Pr(>|t|)   
## (Intercept)        20.7481     6.9546  35.0000   2.983  0.00517 **
## blockII            -4.1628     4.0577  35.0000  -1.026  0.31198   
## hednormal          -2.6667     8.5877  35.0000  -0.311  0.75801   
## trafon2            -0.2222     8.5877  35.0000  -0.026  0.97950   
## tramer2            10.5000     8.5877  35.0000   1.223  0.22962   
## tramer3             9.5556     8.5877  35.0000   1.113  0.27342   
## hednormal:trafon2  13.5000    11.5217  35.0000   1.172  0.24923   
## hednormal:tramer2  -2.1240    12.1872  35.0000  -0.174  0.86265   
## hednormal:tramer3 -13.3333    11.5217  35.0000  -1.157  0.25501   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) blckII hdnrml trafn2 tramr2 tramr3 hdnrml:trf2
## blockII     -0.292                                               
## hednormal   -0.741  0.000                                        
## trafon2     -0.741  0.000  0.600                                 
## tramer2     -0.741  0.000  0.600  0.600                          
## tramer3     -0.741  0.000  0.600  0.600  0.600                   
## hdnrml:trf2  0.552  0.000 -0.745 -0.745 -0.447 -0.447            
## hdnrml:trm2  0.498  0.083 -0.705 -0.423 -0.705 -0.423  0.525     
## hdnrml:trm3  0.552  0.000 -0.745 -0.447 -0.447 -0.745  0.556     
##             hdnrml:trm2
## blockII                
## hednormal              
## trafon2                
## tramer2                
## tramer3                
## hdnrml:trf2            
## hdnrml:trm2            
## hdnrml:trm3  0.525
# Year 2016.
m16 <- lmer(p2 ~ block + hed * tra + (1 | tre),
           data = da16)

# r <- residuals(m16)
# f <- fitted(m16)
# plot(r ~ f)
# qqnorm(r)

anova(m16)
## Analysis of Variance Table of type III  with  Satterthwaite 
## approximation for degrees of freedom
##         Sum Sq Mean Sq NumDF DenDF F.value Pr(>F)
## block    1.385   1.385     1    39 0.03743 0.8476
## hed     81.092  81.092     1    39 2.19127 0.1468
## tra     59.092  19.697     3    39 0.53226 0.6629
## hed:tra 55.633  18.544     3    39 0.50111 0.6837
summary(m16)
## Linear mixed model fit by REML t-tests use Satterthwaite
##   approximations to degrees of freedom [lmerMod]
## Formula: p2 ~ block + hed * tra + (1 | tre)
##    Data: da16
## 
## REML criterion at convergence: 984.2
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.06240 -0.61593 -0.05643  0.60353  1.87718 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  tre      (Intercept) 102.01   10.100  
##  Residual              37.01    6.083  
## Number of obs: 144, groups:  tre, 48
## 
## Fixed effects:
##                   Estimate Std. Error      df t value Pr(>|t|)    
## (Intercept)        44.8681     4.6303 39.0000   9.690 6.19e-12 ***
## blockII             0.5972     3.0869 39.0000   0.193   0.8476    
## hednormal          11.1111     6.1737 39.0000   1.800   0.0796 .  
## trafon2             9.6667     6.1737 39.0000   1.566   0.1255    
## tramer2             6.1111     6.1737 39.0000   0.990   0.3283    
## tramer3             6.5556     6.1737 39.0000   1.062   0.2948    
## hednormal:trafon2  -8.6111     8.7309 39.0000  -0.986   0.3301    
## hednormal:tramer2  -9.1111     8.7309 39.0000  -1.044   0.3031    
## hednormal:tramer3  -8.4444     8.7309 39.0000  -0.967   0.3394    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) blckII hdnrml trafn2 tramr2 tramr3 hdnrml:trf2
## blockII     -0.333                                               
## hednormal   -0.667  0.000                                        
## trafon2     -0.667  0.000  0.500                                 
## tramer2     -0.667  0.000  0.500  0.500                          
## tramer3     -0.667  0.000  0.500  0.500  0.500                   
## hdnrml:trf2  0.471  0.000 -0.707 -0.707 -0.354 -0.354            
## hdnrml:trm2  0.471  0.000 -0.707 -0.354 -0.707 -0.354  0.500     
## hdnrml:trm3  0.471  0.000 -0.707 -0.354 -0.354 -0.707  0.500     
##             hdnrml:trm2
## blockII                
## hednormal              
## trafon2                
## tramer2                
## tramer3                
## hdnrml:trf2            
## hdnrml:trm2            
## hdnrml:trm3  0.500

For both years, no effect were found for the experimental factors.

# Represent the results with interval confidence on ls means.
cmp <- list()

lsm <- LSmeans(m15, effect = "hed")
grid <- equallevels(lsm$grid, da)

L <- lsm$K
rownames(L) <- grid[, 1]

cmp$hed <-
    rbind(
        cbind(yr = "2015", apmc(L, model = m15, focus = "hed")),
        cbind(yr = "2016", apmc(L, model = m16, focus = "hed")))

lsm <- LSmeans(m15, effect = "tra")
grid <- equallevels(lsm$grid, da)

L <- lsm$K
rownames(L) <- grid[, 1]

cmp$tra <-
    rbind(
        cbind(yr = "2015", apmc(L, model = m15, focus = "tra")),
        cbind(yr = "2016", apmc(L, model = m16, focus = "tra")))

cmp <- ldply(cmp,
             .id = "effect",
             .fun = function(x) {
                 names(x)[2] <- "level"
                 return(x)
             })
cmp$level <- factor(cmp$level, levels = unique(cmp$level))
cmp
##    effect   yr  level      fit       lwr      upr cld
## 1     hed 2015  heavy 23.62500 17.979486 29.27051   a
## 2     hed 2015 normal 20.46899 14.801639 26.13635   a
## 3     hed 2016  heavy 50.75000 46.471923 55.02808   a
## 4     hed 2016 normal 55.31944 51.041367 59.59752   a
## 5     tra 2015    cnt 17.33333  8.917498 25.74917   a
## 6     tra 2015   fon2 23.86111 16.333759 31.38846   a
## 7     tra 2015   mer2 26.77132 18.296971 35.24566   a
## 8     tra 2015   mer3 20.22222 12.694870 27.74957   a
## 9     tra 2016    cnt 50.72222 44.672107 56.77234   a
## 10    tra 2016   fon2 56.08333 50.033218 62.13345   a
## 11    tra 2016   mer2 52.27778 46.227663 58.32789   a
## 12    tra 2016   mer3 53.05556 47.005441 59.10567   a
resizePanels(
    useOuterStrips(
        segplot(level ~ lwr + upr | yr + effect,
                centers = fit,
                data = cmp,
                draw = FALSE,
                xlab = "Proportion of fruits with 35% or less damage",
                ylab = "Levels of the factors",
                scales = list(y = list(relation = "free"),
                              alternating = 1),
                ann = sprintf("%0.2f %s", cmp$fit, cmp$cld)
                )
    ), h = c(2, 4)) +
    layer(panel.text(x = centers[subscripts],
                     y = z[subscripts],
                     labels = ann[subscripts],
                     pos = 3))
Figure  2: Mean proportion of fruits with 35% or less of the shell surface discolored as a function of the fungicide and hed level for two years.

Figure 2: Mean proportion of fruits with 35% or less of the shell surface discolored as a function of the fungicide and hed level for two years.


Session information

## domingo, 07 de maio de 2017, 22:17
## ----------------------------------------
## R version 3.3.3 (2017-03-06)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.2 LTS
## 
## locale:
##  [1] LC_CTYPE=pt_BR.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=pt_BR.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=pt_BR.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=pt_BR.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=pt_BR.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] methods   stats     graphics  grDevices utils     datasets 
## [7] base     
## 
## other attached packages:
##  [1] wzCoop_0.0-5        captioner_2.2.3     wzRfun_0.79        
##  [4] lmerTest_2.0-33     lme4_1.1-12         Matrix_1.2-8       
##  [7] multcomp_1.4-6      TH.data_1.0-8       MASS_7.3-45        
## [10] survival_2.40-1     mvtnorm_1.0-5       doBy_4.5-15        
## [13] reshape_0.8.6       plyr_1.8.4          latticeExtra_0.6-28
## [16] RColorBrewer_1.1-2  lattice_0.20-34     rmarkdown_1.3      
## [19] knitr_1.15.1       
## 
## loaded via a namespace (and not attached):
##  [1] zoo_1.7-14        splines_3.3.3     testthat_1.0.2   
##  [4] colorspace_1.3-2  htmltools_0.3.5   yaml_2.1.14      
##  [7] rpanel_1.1-3      base64enc_0.1-3   nloptr_1.0.4     
## [10] withr_1.0.2       foreign_0.8-67    stringr_1.1.0    
## [13] commonmark_1.1    munsell_0.4.3     gtable_0.2.0     
## [16] htmlwidgets_0.8   devtools_1.12.0   memoise_1.0.0    
## [19] codetools_0.2-15  evaluate_0.10     parallel_3.3.3   
## [22] pbkrtest_0.4-6    highr_0.6         htmlTable_1.9    
## [25] Rcpp_0.12.9       acepack_1.4.1     scales_0.4.1     
## [28] backports_1.0.5   checkmate_1.8.2   Hmisc_4.0-2      
## [31] gridExtra_2.2.1   ggplot2_2.2.1     digest_0.6.12    
## [34] stringi_1.1.2     grid_3.3.3        rprojroot_1.2    
## [37] tools_3.3.3       sandwich_2.3-4    magrittr_1.5     
## [40] lazyeval_0.2.0    tibble_1.2        Formula_1.2-1    
## [43] cluster_2.0.6     crayon_1.3.2      xml2_1.1.1       
## [46] data.table_1.10.4 roxygen2_6.0.1    assertthat_0.1   
## [49] minqa_1.2.4       R6_2.2.0          rpart_4.1-10     
## [52] nnet_7.3-12       nlme_3.1-131