Supplementary material of the paper

A dynamic rating for Brazilian football league 2011 season

Marcelo Leme de Arruda (I)
Walmes Marques Zeviani (II),
Júlio Sílvio de Sousa Bueno Filho (I)

I: DEX: Department of Exact Sciences, UFLA: Federal University of Lavras,
II: LEG: Statistics and Geoinformation Lab, DEST: Department of Statistics, UFPR: Federal University of Paraná.

Download the dataset: brasileiro2004-2011.txt
Download the R script: map.R

Session definitions

R source

##----------------------------------------------------------------------
## Required packages and session definition.

require(latticeExtra)
require(plyr)
require(reshape2)
require(gridExtra)

sessionInfo()
Sys.time()

##----------------------------------------------------------------------

rm(list=ls())

## Symbols.
## plot(1:30, pch=1:30)

## Color palette.
mycol <- brewer.pal(6, "Set1")

## Trellis graphical style.
ps <- list(
    box.rectangle=list(col=1, fill=c("gray70")),
    box.umbrella=list(col=1, lty=1),
    dot.symbol=list(col=1, pch=19),
    dot.line=list(col="gray50", lty=3),
    plot.symbol=list(col=1, cex=1.2),
    plot.line=list(col=1),
    plot.polygon=list(col="gray95"),
    superpose.line=list(col=mycol),
    superpose.symbol=list(col=mycol, pch=c(1:5)),
    superpose.polygon=list(col=mycol),
    strip.background=list(col=c("gray80","gray50"))
    )
trellis.par.set(ps)
## show.settings()

Load data

R source

##----------------------------------------------------------------------
## Load the data of the season 2011.

## UNIX command to get the charset/encoding of a file.
system("file -bi brasileiro2004-2011.txt")

## iso-8859-1 aka latin1.

## Importing the dataset with seasons 2004 to 2011.
da <- read.table(
    file="brasileiro2004-2011.txt",
    header=TRUE,
    sep="\t",
    encoding="latin1",
    colClasses=c("factor","character","integer","NULL")[
        c(2,4,3,4,4,4,1,1,2)])

## Short names are more manageable.
## man: home team, away: away team.
names(da) <- substr(names(da), 1, 3)
names(da)[3:4] <- c("home", "away")
str(da)

R output

'data.frame':  3294 obs. of  5 variables:
 $ dat : chr  "21/04/2004" "21/04/2004" "21/04/2004" "21/04/2004" ...
 $ rod : int  1 1 1 1 1 1 1 1 1 1 ...
 $ home: Factor w/ 40 levels "América/MG","América/RN",..: 9 16 36 15 28 21 31 29 13 39 ...
 $ away: Factor w/ 40 levels "América/MG","América/RN",..: 20 24 40 26 4 17 12 35 23 14 ...
 $ pla : chr  "1X4" "1X0" "1X0" "2X1" ...

R source

## Same order in the factor levels.
l <- union(levels(da$home), levels(da$away))
da$home <- factor(da$home, levels=l)
da$away <- factor(da$away, levels=l)

## Converting string to data (calendar format).
da$dat <- as.POSIXct(x=da$dat, format="%d/%m/%Y")
da$yea <- as.integer(format(da$dat, format="%Y"))
    
## Get the goals of each team in a match.
da$h <- as.integer(gsub("^(\\d+)x\\d+$", "\\1", tolower(da$pla)))
da$a <- as.integer(gsub("^\\d+x(\\d+)$", "\\1", tolower(da$pla)))
da <- transform(da, d=abs(h-a), s=h+a)
str(da)

R output

'data.frame':  3294 obs. of  10 variables:
 $ dat : POSIXct, format: "2004-04-21 00:00:00" ...
 $ rod : int  1 1 1 1 1 1 1 1 1 1 ...
 $ home: Factor w/ 40 levels "América/MG","América/RN",..: 9 16 36 15 28 21 31 29 13 39 ...
 $ away: Factor w/ 40 levels "América/MG","América/RN",..: 20 24 40 26 4 17 12 35 23 14 ...
 $ pla : chr  "1X4" "1X0" "1X0" "2X1" ...
 $ yea : int  2004 2004 2004 2004 2004 2004 2004 2004 2004 2004 ...
 $ h   : int  1 1 1 2 0 0 3 3 1 0 ...
 $ a   : int  4 0 0 1 0 0 2 2 0 1 ...
 $ d   : int  3 1 1 1 0 0 1 1 1 1 ...
 $ s   : int  5 1 1 3 0 0 5 5 1 1 ...

R source

## Marginal statistics in goals scored for each season.
aggregate(cbind(home=h, away=a, absdiff=d, total=s)~yea,
          data=da, FUN=mean, na.rm=TRUE)

R output

   yea     home      away  absdiff    total
1 2004 1.717391 1.0652174 1.362319 2.782609
2 2005 1.803030 1.3311688 1.307359 3.134199
3 2006 1.589474 1.1210526 1.289474 2.710526
4 2007 1.665789 1.0868421 1.384211 2.752632
5 2008 1.731579 0.9894737 1.389474 2.721053
6 2009 1.736842 1.1421053 1.257895 2.878947
7 2010 1.528947 1.0447368 1.147368 2.573684
8 2011 1.605263 1.0710526 1.239474 2.676316

Season 2010

R source

##----------------------------------------------------------------------

## 2010 season only.
db <- droplevels(subset(da, yea==2010,
                        select=c("rod","home","away","h","a")))
str(db)

R output

'data.frame':  380 obs. of  5 variables:
 $ rod : int  1 1 1 1 1 1 1 1 1 1 ...
 $ home: Factor w/ 20 levels "Atlético/GO",..: 5 1 16 9 2 15 7 6 14 4 ...
 $ away: Factor w/ 20 levels "Atlético/GO",..: 17 12 20 18 19 8 3 10 11 13 ...
 $ h   : int  3 0 1 1 2 1 2 1 1 6 ...
 $ a   : int  3 0 0 1 1 2 1 0 0 1 ...

R source

subset(db, home=="Corinthians")

R output

     rod        home            away h a
2541   1 Corinthians     Atlético/PR 2 1
2558   3 Corinthians      Fluminense 1 0
2578   5 Corinthians          Santos 4 2
2594   6 Corinthians   Internacional 2 0
2618   9 Corinthians     Atlético/MG 1 0
2643  11 Corinthians         Guarani 3 1
2658  13 Corinthians        Flamengo 1 0
2682  15 Corinthians       São Paulo 3 0
2698  17 Corinthians         Vitória 2 1
2717  19 Corinthians           Goiás 5 1
2735  21 Corinthians          Grêmio 0 1
2755  23 Corinthians Grêmio Prudente 3 0
2792  26 Corinthians        Botafogo 1 1
2796  27 Corinthians           Ceará 2 2
2820  29 Corinthians     Atlético/GO 3 4
2839  31 Corinthians       Palmeiras 1 0
2861  33 Corinthians            Avaí 4 0
2877  35 Corinthians        Cruzeiro 1 0
2895  37 Corinthians           Vasco 2 0

R source

## Are the levels in the same order?
all(levels(db$home)==levels(db$away))

R output

[1] TRUE

R source

##----------------------------------------------------------------------

## Matrices to identify teams matches (design matrices).
Xh <- model.matrix(~-1+home, db) ## h: home.
Xa <- model.matrix(~-1+away, db) ## a: away.

## Check.
Xha <- Xh-Xa
db[1:5,c("home","away")]

R output

            home      away
2535    Botafogo    Santos
2536 Atlético/GO    Grêmio
2537   Palmeiras   Vitória
2538    Flamengo São Paulo
2539 Atlético/MG     Vasco

R source

print(as.table(t(Xha[1:5,])), zero.print=".")

R output

                    2535 2536 2537 2538 2539
homeAtlético/GO        .    1    .    .    .
homeAtlético/MG        .    .    .    .    1
homeAtlético/PR        .    .    .    .    .
homeAvaí               .    .    .    .    .
homeBotafogo           1    .    .    .    .
homeCeará              .    .    .    .    .
homeCorinthians        .    .    .    .    .
homeCruzeiro           .    .    .    .    .
homeFlamengo           .    .    .    1    .
homeFluminense         .    .    .    .    .
homeGoiás              .    .    .    .    .
homeGrêmio             .   -1    .    .    .
homeGrêmio Prudente    .    .    .    .    .
homeGuarani            .    .    .    .    .
homeInternacional      .    .    .    .    .
homePalmeiras          .    .    1    .    .
homeSantos            -1    .    .    .    .
homeSão Paulo          .    .    .   -1    .
homeVasco              .    .    .    .   -1
homeVitória            .    .   -1    .    .

R source

##----------------------------------------------------------------------

## Place in 2010 season, from first to last.
cbind(scored=t(Xh)%*%db$h+t(Xa)%*%db$a,   ## goals scored
      conceded=t(Xa)%*%db$h+t(Xh)%*%db$a) ## goals conceded

R output

                    [,1] [,2]
homeAtlético/GO       51   57
homeAtlético/MG       52   64
homeAtlético/PR       43   45
homeAvaí              49   58
homeBotafogo          54   42
homeCeará             35   44
homeCorinthians       65   41
homeCruzeiro          53   38
homeFlamengo          41   44
homeFluminense        62   36
homeGoiás             41   68
homeGrêmio            68   43
homeGrêmio Prudente   39   64
homeGuarani           33   53
homeInternacional     48   41
homePalmeiras         42   43
homeSantos            63   50
homeSão Paulo         54   54
homeVasco             43   45
homeVitória           42   48

R source

## http://esportes.terra.com.br/futebol/brasileiro/2010/noticias/0,,OI4339585-EI15413,00-Classificacao+e+Jogos+Serie+A.html

aux <- with(db, {
    d <- h-a
    draw <- d==0
    winH <- d>0
    ## winA <- !draw & !winH
    winA <- !(draw | winH)
    x <- cbind(h=winH*3+draw, a=winA*3+draw)
    return(x)
})

## cbind(db, aux)[sample(1:nrow(db), size=10),]

pl10 <- t(Xh)%*%aux[,"h"]+t(Xa)%*%aux[,"a"]
pl10 <- cbind(team=levels(db$home), data.frame(pts=pl10))
## pl10$rk <- rank(pl10$pts, decreasing=TRUE)
pl10 <- arrange(pl10, -pts)
pl10$pos <- rank(-pl10$pts)
pl10

R output

              team pts  pos
1       Fluminense  71  1.0
2         Cruzeiro  69  2.0
3      Corinthians  68  3.0
4           Grêmio  63  4.0
5      Atlético/PR  60  5.0
6         Botafogo  59  6.0
7    Internacional  58  7.0
8           Santos  56  8.0
9        São Paulo  55  9.0
10       Palmeiras  50 10.0
11           Vasco  49 11.0
12           Ceará  47 12.0
13     Atlético/MG  45 13.0
14        Flamengo  44 14.0
15            Avaí  43 15.0
16     Atlético/GO  42 16.5
17         Vitória  42 16.5
18         Guarani  37 18.0
19           Goiás  33 19.0
20 Grêmio Prudente  31 20.0

R source

th16 <- pl10[16,]

rm(aux)

Maximum likelihood estimates for the 2010 season

R source

##----------------------------------------------------------------------
## Likelihood definition.

## log-likeligood funtion corresponding to a Poisson model for the goals
## scored as function of teams attacking streng (gamma), teams
## deffensive strength (delta), and home(away) field
## advantage(disadvatage) (omega,kappa), so theta=c(gamma, delta,
## omega, kappa).

## attack, defense and home field advantage.
ll1 <- function(theta, gh, ga, Xh, Xa){
    ## theta: parameter vector.
    ## gh, ga: goals for the home and away team.
    ## Xh, Xa: design matrices for home and away team.
    gamma <- 1:20; delta <- 21:40; omega <- 41
    eta.h <- (Xh%*%theta[gamma]-Xa%*%theta[delta])+theta[omega]
    eta.a <- (Xa%*%theta[gamma]-Xh%*%theta[delta])
    ll.h <- dpois(gh, lambda=exp(eta.h), log=TRUE)
    ll.a <- dpois(ga, lambda=exp(eta.a), log=TRUE)
    ll <- sum(ll.h+ll.a)
    return(ll)
}

## attack, defense, home field advantage and away field disadvantage.
ll2 <- function(theta, gh, ga, Xh, Xa){
    ## theta: parameter vector.
    ## gh, ga: goals for the home and away team.
    ## Xh, Xa: design matrices for home and away team.
    gamma <- 1:20; delta <- 21:40; omega <- 41; kappa <- 42
    eta.h <- (Xh%*%theta[gamma]-Xa%*%theta[delta])+theta[omega]
    eta.a <- (Xa%*%theta[gamma]-Xh%*%theta[delta])-theta[kappa]
    ll.h <- dpois(gh, lambda=exp(eta.h), log=TRUE)
    ll.a <- dpois(ga, lambda=exp(eta.a), log=TRUE)
    ll <- sum(ll.h+ll.a)
    return(ll)
}

##----------------------------------------------------------------------
## Optimization to get parameter estimates.

op1 <- optim(par=rep(1, 41),
             fn=ll1,
             gh=db$h, ga=db$a,
             Xh=Xh, Xa=Xa,
             control=list(fnscale=-1),
             hessian=TRUE)
str(op1)

R output

List of 6
 $ par        : num [1:41] 1.004 1.01 0.978 0.954 1.017 ...
 $ value      : num -1080
 $ counts     : Named int [1:2] 502 NA
  ..- attr(*, "names")= chr [1:2] "function" "gradient"
 $ convergence: int 1
 $ message    : NULL
 $ hessian    : num [1:41, 1:41] -45.9 0 0 0 0 ...

R source

op2 <- optim(par=rep(1, 42),
             fn=ll2,
             gh=db$h, ga=db$a,
             Xh=Xh, Xa=Xa,
             control=list(fnscale=-1),
             hessian=TRUE)
str(op2)

R output

List of 6
 $ par        : num [1:42] 1.014 1.112 0.963 1 1.04 ...
 $ value      : num -1192
 $ counts     : Named int [1:2] 501 NA
  ..- attr(*, "names")= chr [1:2] "function" "gradient"
 $ convergence: int 1
 $ message    : NULL
 $ hessian    : num [1:42, 1:42] -36.4 0 0 0 0 ...

R source

## Likelihoods.
l <- c(ll1=op1$value, ll2=op2$value)
l

R output

      ll1       ll2 
-1080.332 -1191.508

R source

pchisq(-2*diff(l), df=1, lower.tail=FALSE)

R output

         ll2 
2.775114e-50

R source

theta <- data.frame(th1=c(op1$par, NA), th2=op2$par)
theta

R output

         th1       th2
1  1.0043790 1.0138957
2  1.0096615 1.1117698
3  0.9779044 0.9633655
4  0.9544233 1.0002134
5  1.0170494 1.0400751
6  0.9278776 0.8330297
7  0.9979084 1.0503616
8  1.0137993 0.9966172
9  0.9484080 1.0328762
10 1.0273349 1.0418324
11 0.9517496 1.0327474
12 1.0151424 1.1340466
13 0.9393636 1.0324598
14 0.8571552 0.9896103
15 0.9910947 0.9905845
16 0.8105746 1.1835028
17 1.1063962 1.0430362
18 1.0145194 1.0437392
19 0.9851621 0.9784566
20 0.9690200 0.9447839
21 1.0514958 1.0337385
22 1.0143191 0.8626878
23 1.0655327 1.0153220
24 1.0491480 1.0346921
25 1.0891696 1.0420349
26 1.0725644 1.0255372
27 1.0938077 1.0348820
28 1.1031880 1.0371056
29 1.0764055 1.0273088
30 1.1043323 1.0388512
31 0.9994671 1.0039978
32 1.0804586 1.0308121
33 0.9359466 1.0338432
34 1.0565694 1.0191678
35 1.0094950 1.0455539
36 1.0847163 1.0132376
37 1.0593575 1.0387299
38 1.0539577 1.0322186
39 1.0689397 1.0194866
40 1.0623410 1.0163889
41 0.4359021 0.3909383
42        NA 0.8104569

R source

## plot(th2~th1, data=theta[1:40,])
## a <- with(theta, identify(th1, th2))
## dput(a)
a <- c(2L, 6L, 12L, 14L, 16L, 22L)
theta <- cbind(theta[1:40,],
               expand.grid(team=levels(db$home),
                           param=c("gamma","delta")))
theta[a,]

R output

         th1       th2        team param
2  1.0096615 1.1117698 Atlético/MG gamma
6  0.9278776 0.8330297       Ceará gamma
12 1.0151424 1.1340466      Grêmio gamma
14 0.8571552 0.9896103     Guarani gamma
16 0.8105746 1.1835028   Palmeiras gamma
22 1.0143191 0.8626878 Atlético/MG delta

R source

## Legend for gamma and delta.
key.gd <- list(columns=2,
               title="Parameter", cex.title=1.1,
               points=list(
                   col=ps$superpose.line$col[1:2],
                   pch=ps$superpose.symbol$pch[1:2]),
               text=list(c(expression(gamma), expression(delta))))

xyplot(th2~th1, data=theta, groups=param,
       key=key.gd, aspect="iso",
       xlab="Parameter estimates in the 1 field parameter model",
       ylab="Parameter estimates in the 2 field parameter model")+
    layer(panel.abline(a=0, b=1))+
             layer(panel.text(x=theta$th1[a], y=theta$th2[a], pos=4,
                              labels=as.character(theta$team[a])))

R source

##----------------------------------------------------------------------

## Precision matrix.
## levelplot(-op$hessian[-41,-41])

## Variace-covariance matrix.
## levelplot(-solve(op$hessian)[-41,-41])

## Standard error of parameter estimates (so big).
## sqrt(diag(-solve(op$hessian)))

priors1 <- expand.grid(teams=levels(db$home),
                       par=c("gamma", "delta"),
                       stringsAsFactors=FALSE)
priors1 <- rbind(priors1, c("homefield", "omega"))

priors1$est <- sqrt(op1$par)        ## Estimate.
priors1$info <- diag(-op1$hessian)  ## Precision.
priors1$var <- 1/priors1$info       ## Inverse precision.
priors1$sd <- sqrt(priors1$var)

## Spliting the table of priors1.
L <- split(subset(priors1, select=c("teams","est","var")),
           f=priors1$par)

## Using better names.
names(L$gamma)[-1] <- c("mu","sigma2")
names(L$delta)[-1] <- c("theta","psi2")
names(L$omega)[-1] <- c("mu0","sigma20")
priors1 <- merge(L$gamma, L$delta)
priors1$teams <- factor(priors1$teams, levels=levels(db$home))
attr(priors1, "hf") <- list(omega=L$omega)

arrange(priors1, -mu)

R output

             teams        mu     sigma2     theta       psi2
1           Santos 1.0518537 0.01964525 1.0292509 0.02258880
2       Fluminense 1.0135753 0.02121257 1.0508722 0.02352013
3         Botafogo 1.0084887 0.02144826 1.0436329 0.02315306
4           Grêmio 1.0075428 0.02149876 1.0394511 0.02294985
5        São Paulo 1.0072335 0.02154183 1.0266244 0.02234888
6         Cruzeiro 1.0068760 0.02150287 1.0503276 0.02347573
7      Atlético/MG 1.0048192 0.02169298 1.0071341 0.02147464
8      Atlético/GO 1.0021871 0.02176421 1.0254247 0.02228163
9      Corinthians 0.9989536 0.02185762 1.0458526 0.02323653
10   Internacional 0.9955374 0.02210539 1.0047363 0.02134991
11           Vasco 0.9925533 0.02216623 1.0338954 0.02265038
12     Atlético/PR 0.9888905 0.02233165 1.0322464 0.02256469
13         Vitória 0.9843881 0.02253469 1.0306993 0.02248232
14            Avaí 0.9769459 0.02288191 1.0242793 0.02217090
15           Goiás 0.9755765 0.02300528 0.9997335 0.02109345
16        Flamengo 0.9738624 0.02298721 1.0374997 0.02277652
17 Grêmio Prudente 0.9692077 0.02337762 0.9674433 0.01978278
18           Ceará 0.9632640 0.02346868 1.0356469 0.02266570
19         Guarani 0.9258268 0.02520956 1.0278956 0.02223024
20       Palmeiras 0.9003192 0.02637312 1.0414972 0.02281669

R source

attr(priors1, "hf")

R output

$omega
       teams       mu0     sigma20
41 homefield 0.6602288 0.001839751

R source

##----------------------------------------------------------------------

priors2 <- expand.grid(teams=levels(db$home),
                       par=c("gamma", "delta"),
                       stringsAsFactors=FALSE)
priors2 <- rbind(priors2,
                 c("homefield", "omega"),
                 c("awayfield", "kappa"))

priors2$est <- sqrt(op2$par)        ## Estimate.
priors2$info <- diag(-op2$hessian)  ## Precision.
priors2$var <- 1/priors2$info       ## Inverse precision.
priors2$sd <- sqrt(priors2$var)

## Spliting the table of priors2.
M <- split(subset(priors2, select=c("teams","est","var")),
           f=priors2$par)

## Using better names.
names(M$gamma)[-1] <- c("mu","sigma2")
names(M$delta)[-1] <- c("theta","psi2")
names(M$omega)[-1] <- c("mu0","sigma20")
names(M$kappa)[-1] <- c("theta0","psi20")
priors2 <- merge(M$gamma, M$delta)
priors2$teams <- factor(priors2$teams, levels=levels(db$home))
attr(priors2, "hf") <- list(omega=M$omega, kappa=M$kappa)

arrange(priors2, -mu)

R output

             teams        mu     sigma2     theta       psi2
1        Palmeiras 1.0878891 0.02323790 1.0065971 0.02728627
2           Grêmio 1.0649162 0.02439354 1.0152892 0.02768694
3      Atlético/MG 1.0544050 0.02518285 0.9288099 0.02337196
4      Corinthians 1.0248715 0.02651716 1.0172915 0.02766873
5        São Paulo 1.0216355 0.02669704 1.0159816 0.02758529
6           Santos 1.0212914 0.02670681 1.0191810 0.02776445
7       Fluminense 1.0207019 0.02673881 1.0192405 0.02776603
8         Botafogo 1.0198407 0.02678145 1.0208011 0.02785195
9         Flamengo 1.0163052 0.02699553 1.0135624 0.02743428
10           Goiás 1.0162418 0.02703229 1.0019969 0.02680198
11 Grêmio Prudente 1.0161003 0.02699759 1.0167808 0.02761353
12     Atlético/GO 1.0069239 0.02750361 1.0167293 0.02758373
13            Avaí 1.0001067 0.02788113 1.0171982 0.02759055
14        Cruzeiro 0.9983071 0.02797809 1.0183838 0.02765213
15   Internacional 0.9952811 0.02813515 1.0225233 0.02787817
16         Guarani 0.9947916 0.02820119 1.0095384 0.02715085
17           Vasco 0.9891697 0.02851701 1.0096963 0.02714425
18     Atlético/PR 0.9815119 0.02895699 1.0076319 0.02701117
19         Vitória 0.9720000 0.02949843 1.0081611 0.02701548
20           Ceará 0.9127046 0.03297040 1.0126881 0.02712505

R source

attr(priors2, "hf")

R output

$omega
       teams       mu0     sigma20
41 homefield 0.6252506 0.001769845

$kappa
       teams    theta0       psi20
42 awayfield 0.9002538 0.005884297

Classification in 2011 season

R source

##----------------------------------------------------------------------

## Just the 2011 season.
dc <- droplevels(subset(da, yea==2011))
rownames(dc) <- NULL
str(dc)

R output

'data.frame':  380 obs. of  10 variables:
 $ dat : POSIXct, format: "2011-05-21 00:00:00" ...
 $ rod : int  1 1 1 1 1 1 1 1 1 1 ...
 $ home: Factor w/ 20 levels "América/MG","Atlético/GO",..: 3 8 13 18 12 15 10 17 14 1 ...
 $ away: Factor w/ 20 levels "América/MG","Atlético/GO",..: 4 20 5 16 11 9 2 7 19 6 ...
 $ pla : chr  "3x0" "1x3" "4x0" "1x1" ...
 $ yea : int  2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
 $ h   : int  3 1 4 1 1 1 0 1 0 2 ...
 $ a   : int  0 3 0 1 0 2 1 0 2 1 ...
 $ d   : int  3 2 4 0 1 1 1 1 2 1 ...
 $ s   : int  3 4 4 2 1 3 1 1 2 3 ...

R source

##----------------------------------------------------------------------

## http://esportes.terra.com.br/futebol/brasileiro/2011/seriea/classificacao_jogos/

## Rating and position in 2011 season.
Xh <- model.matrix(~-1+home, dc) ## h: home.
Xa <- model.matrix(~-1+away, dc) ## a: away.

cbind(scored=t(Xh)%*%dc$h+t(Xa)%*%dc$a,   ## goals scored
      conceded=t(Xa)%*%dc$h+t(Xh)%*%dc$a) ## goals conceded

R output

                  [,1] [,2]
homeAmérica/MG      51   69
homeAtlético/GO     50   45
homeAtlético/MG     50   60
homeAtlético/PR     38   55
homeAvaí            45   75
homeBahia           43   49
homeBotafogo        52   49
homeCeará           47   64
homeCorinthians     53   36
homeCoritiba        57   41
homeCruzeiro        48   51
homeFigueirense     46   45
homeFlamengo        59   47
homeFluminense      60   51
homeGrêmio          49   57
homeInternacional   57   43
homePalmeiras       43   39
homeSantos          55   55
homeSão Paulo       57   46
homeVasco           57   40

R source

aux <- with(dc, {
    d <- h-a
    draw <- d==0
    winH <- d>0
    ## winA <- !draw & !winH
    winA <- !(draw | winH)
    x <- cbind(h=winH*3+draw, a=winA*3+draw)
    return(x)
})

pl11 <- t(Xh)%*%aux[,"h"]+t(Xa)%*%aux[,"a"]
pl11 <- cbind(team=levels(dc$home), data.frame(pts=pl11))
pl11 <- arrange(pl11, -pts)
pl11$pos <- rank(-pl11$pts)
## pl11

arrange(merge(pl10, pl11, by="team", all=TRUE), -pts.y)

R output

              team pts.x pos.x pts.y pos.y
1      Corinthians    68   3.0    71   1.0
2            Vasco    49  11.0    69   2.0
3       Fluminense    71   1.0    63   3.0
4         Flamengo    44  14.0    61   4.0
5    Internacional    58   7.0    60   5.0
6        São Paulo    55   9.0    59   6.0
7      Figueirense    NA    NA    58   7.0
8         Coritiba    NA    NA    57   8.0
9         Botafogo    59   6.0    56   9.0
10          Santos    56   8.0    53  10.0
11       Palmeiras    50  10.0    50  11.0
12     Atlético/GO    42  16.5    48  12.5
13          Grêmio    63   4.0    48  12.5
14           Bahia    NA    NA    46  14.0
15     Atlético/MG    45  13.0    45  15.0
16        Cruzeiro    69   2.0    43  16.0
17     Atlético/PR    60   5.0    41  17.0
18           Ceará    47  12.0    39  18.0
19      América/MG    NA    NA    37  19.0
20            Avaí    43  15.0    31  20.0
21           Goiás    33  19.0    NA    NA
22 Grêmio Prudente    31  20.0    NA    NA
23         Guarani    37  18.0    NA    NA
24         Vitória    42  16.5    NA    NA

Exploratory analysis to the 2011 season

R source

##----------------------------------------------------------------------

## Distribution of goals.
histogram(~h+a+d+s, data=dc,
          breaks=with(dc, seq(min(d), max(s)+1, by=1)-0.5),
          xlab="Goals",
          ylab="Relative frequency (%)",
          strip=strip.custom(
              factor.levels=c("Goals of Home","Goals of Away",
                  "Absolute difference","Total of goals")))

R source

## Classes of goals: 0, 1, ..., 3, >3.
cutin <- function(z){
    cut(z, breaks=c(0:4, Inf),
        right=FALSE, include.lowest=TRUE)
}

## Crossed frequency.
xt <- xtabs(~cutin(h)+cutin(a), data=dc)
addmargins(xt)

R output

         cutin(a)
cutin(h)  [0,1) [1,2) [2,3) [3,4) [4,Inf] Sum
  [0,1)      26    26    13     6       2  73
  [1,2)      38    52    25     8       1 124
  [2,3)      32    40    24     6       2 104
  [3,4)      16    15    13     3       1  48
  [4,Inf]    13    11     4     1       2  31
  Sum       125   144    79    24       8 380

R source

## Remove objects.
rm(xt, cutin)

Prior from 2010 to the 2011 season

R source

##----------------------------------------------------------------------
## Set operation.

## Teams in 2010 that aren't in 2011 (the worst in seria A).
wA <- setdiff(levels(db$home), levels(dc$home))
wA

R output

[1] "Goiás"              "Grêmio Prudente"    "Guarani"           
[4] "Vitória"

R source

## The bests of serie B, came to 2011 season.
bB <- setdiff(levels(dc$home), levels(db$home))
bB

R output

[1] "América/MG"    "Bahia"         "Coritiba"      "Figueirense"

R source

##----------------------------------------------------------------------
## Prior for the new teams, those in serie B that went to seria A.

priors1 <- droplevels(subset(priors1, !teams%in%wA))

## The team at 16th position.
t16 <- subset(priors1, teams==as.character(th16$team))

t16 <- t16[rep(1, 4),]
t16$teams <- bB

## Priors1 for all teams in the 2011 season.
priors1 <- rbind(priors1, t16)
rownames(priors1) <- NULL
priors1

R output

           teams        mu     sigma2    theta       psi2
1    Atlético/GO 1.0021871 0.02176421 1.025425 0.02228163
2    Atlético/MG 1.0048192 0.02169298 1.007134 0.02147464
3    Atlético/PR 0.9888905 0.02233165 1.032246 0.02256469
4           Avaí 0.9769459 0.02288191 1.024279 0.02217090
5       Botafogo 1.0084887 0.02144826 1.043633 0.02315306
6          Ceará 0.9632640 0.02346868 1.035647 0.02266570
7    Corinthians 0.9989536 0.02185762 1.045853 0.02323653
8       Cruzeiro 1.0068760 0.02150287 1.050328 0.02347573
9       Flamengo 0.9738624 0.02298721 1.037500 0.02277652
10    Fluminense 1.0135753 0.02121257 1.050872 0.02352013
11        Grêmio 1.0075428 0.02149876 1.039451 0.02294985
12 Internacional 0.9955374 0.02210539 1.004736 0.02134991
13     Palmeiras 0.9003192 0.02637312 1.041497 0.02281669
14        Santos 1.0518537 0.01964525 1.029251 0.02258880
15     São Paulo 1.0072335 0.02154183 1.026624 0.02234888
16         Vasco 0.9925533 0.02216623 1.033895 0.02265038
17    América/MG 1.0021871 0.02176421 1.025425 0.02228163
18         Bahia 1.0021871 0.02176421 1.025425 0.02228163
19      Coritiba 1.0021871 0.02176421 1.025425 0.02228163
20   Figueirense 1.0021871 0.02176421 1.025425 0.02228163

R source

## Prior mean and variance for the home-field advantage parameter.
attr(priors1, "hf") <- list(omega=L$omega)
attr(priors1, "hf")

R output

$omega
       teams       mu0     sigma20
41 homefield 0.6602288 0.001839751

R source

## Same order in the factor levels.
priors1$teams <- factor(as.character(priors1$teams),
                        levels=levels(dc$home))

##----------------------------------------------------------------------

priors2 <- droplevels(subset(priors2, !teams%in%wA))

## Priors2 for all teams in the 2011 season.
priors2 <- rbind(priors2, t16)
rownames(priors2) <- NULL
priors2

R output

           teams        mu     sigma2     theta       psi2
1    Atlético/GO 1.0069239 0.02750361 1.0167293 0.02758373
2    Atlético/MG 1.0544050 0.02518285 0.9288099 0.02337196
3    Atlético/PR 0.9815119 0.02895699 1.0076319 0.02701117
4           Avaí 1.0001067 0.02788113 1.0171982 0.02759055
5       Botafogo 1.0198407 0.02678145 1.0208011 0.02785195
6          Ceará 0.9127046 0.03297040 1.0126881 0.02712505
7    Corinthians 1.0248715 0.02651716 1.0172915 0.02766873
8       Cruzeiro 0.9983071 0.02797809 1.0183838 0.02765213
9       Flamengo 1.0163052 0.02699553 1.0135624 0.02743428
10    Fluminense 1.0207019 0.02673881 1.0192405 0.02776603
11        Grêmio 1.0649162 0.02439354 1.0152892 0.02768694
12 Internacional 0.9952811 0.02813515 1.0225233 0.02787817
13     Palmeiras 1.0878891 0.02323790 1.0065971 0.02728627
14        Santos 1.0212914 0.02670681 1.0191810 0.02776445
15     São Paulo 1.0216355 0.02669704 1.0159816 0.02758529
16         Vasco 0.9891697 0.02851701 1.0096963 0.02714425
17    América/MG 1.0021871 0.02176421 1.0254247 0.02228163
18         Bahia 1.0021871 0.02176421 1.0254247 0.02228163
19      Coritiba 1.0021871 0.02176421 1.0254247 0.02228163
20   Figueirense 1.0021871 0.02176421 1.0254247 0.02228163

R source

## Prior mean and variance for the home-field advantage parameter.
attr(priors2, "hf") <- list(omega=M$omega, kappa=M$kappa)
attr(priors2, "hf")

R output

$omega
       teams       mu0     sigma20
41 homefield 0.6252506 0.001769845

$kappa
       teams    theta0       psi20
42 awayfield 0.9002538 0.005884297

R source

## Same order in the factor levels.
priors2$teams <- factor(as.character(priors2$teams),
                        levels=levels(dc$home))

## Remove objects.
rm(wA, bB, t16, L, M)

## Check.
## all.equal(levels(db$home), levels(db$away))
## cbind(levels(dc$home), levels(dc$away), levels(priors1$teams))

Visualize the priors

R source

##----------------------------------------------------------------------

priors12 <- rbind(cbind(priors1, pr=1),
                  cbind(priors2, pr=2))

key.gd <- list(columns=1,
               title="Model", cex.title=1.1,
               points=list(
                   col=ps$superpose.line$col[1:2],
                   pch=c(1,5)),
               text=list(c("1 field parameter",
                   "2 field parameter")))

xyplot(mu+sigma2+theta+psi2~teams, outer=TRUE,
       data=priors12, groups=pr, pch=c(1,5), key=key.gd,
       xlab="Teams", ylab="Prior", as.table=TRUE,
       scales=list(y="free", x=list(rot=90)),
       strip=strip.custom(
           factor.levels=expression(
               mu*": prior mean for"~gamma,
               sigma^2*": prior variance for"~gamma,
               theta*": prior mean for"~delta,
               psi^2*": prior variance for"~delta)))

R source

##----------------------------------------------------------------------
## Prior densities.

## str(priors12)
## xtabs(~teams+pr, priors12)

L <- apply(priors12[,-1], MARGIN=1,
           FUN=function(pars){
               x <- seq(0.2, 1.8, length.out=50)
               ygamma <- dnorm(x, mean=pars[1], sd=sqrt(pars[2]))
               ydelta <- dnorm(x, mean=pars[3], sd=sqrt(pars[4]))
               data.frame(x=x, ygamma=ygamma, ydelta=ydelta, pr=pars[5])
           })

names(L) <- rep(levels(priors12$teams), times=2)
L <- ldply(L, .id="teams")
str(L)

R output

'data.frame':  2000 obs. of  5 variables:
 $ teams : Factor w/ 20 levels "América/MG","Atlético/GO",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ x     : num  0.2 0.233 0.265 0.298 0.331 ...
 $ ygamma: num  1.03e-06 3.34e-06 1.03e-05 3.05e-05 8.55e-05 ...
 $ ydelta: num  6.12e-07 2.00e-06 6.25e-06 1.86e-05 5.27e-05 ...
 $ pr    : num  1 1 1 1 1 1 1 1 1 1 ...

R source

## Legend for gamma and delta.
key.gd <- list(columns=1,
               title="Parameter", cex.title=1.1,
               lines=list(
                   col=ps$superpose.line$col[1:2]),
               text=list(c(expression(gamma), expression(delta))))

## Legend for 1 and 2 field parameter models.
key.12 <- list(columns=1,
               title="Model", cex.title=1.1,
               lines=list(
                   lty=1:2),
               text=list(c("1 field parameter",
                   "2 field parameter")))

## Create a grob by merging the two keys (latticeExtra).
keys <- mergedTrellisLegendGrob(
    a=list(fun=draw.key, args=list(key=key.gd)),
    b=list(fun=draw.key, args=list(key=key.12)),
    vertical=FALSE)

xyplot(ygamma+ydelta~x|teams, data=subset(L, pr==1),
       type="l", as.table=TRUE,
       xlab="Parameter space",
       ylab="Prior density",
       legend=list(top=list(fun=keys))
       )+
    as.layer(xyplot(ygamma+ydelta~x|teams, data=subset(L, pr==2),
                    type="l", lty=2))

R source

## About multiple legends:
## https://procomun.wordpress.com/2012/02/18/maps_with_r_1/

MAP-1 model fitting and results

Parameter estimates

R source

##----------------------------------------------------------------------

## Number of teams and number of rounds.
nteams <- nlevels(db$home)
nrounds <- length(unique(db$rod))
c(nteams, nrounds)

R output

[1] 20 38

R source

## gamma: teams attacking strength.
## delta: teams deffensive strength.
## omega: home-field advantage parameter.
gamma <- matrix(numeric(), nrow=nteams, ncol=nrounds+1)
delta <- gamma
omega <- numeric(nrounds)

## To store values in each match inside a round.
lambdaH <- matrix(numeric(), nrow=nteams/2, ncol=nrounds)
lambdaA <- lambdaH

i <- 1
gamma[,i] <- priors1$mu
delta[,i] <- priors1$theta
omega[i] <- attr(priors1, "hf")$omega$mu0

## Fitting for each round.
for(i in 1:nrounds){
    ##-------------------------------------------
    ## Subset each round (10 matches).
    ## 
    dbi <- subset(dc, rod==i)
    ##
    ##-------------------------------------------
    ## Model matrices, which teams confronted each other.
    ## 
    Xh <- model.matrix(~-1+home, data=dbi)
    Xa <- model.matrix(~-1+away, data=dbi)
    ##
    ##-------------------------------------------
    ## Columns of Home and Away teams.
    ##
    posh <- apply(apply(Xh, MARGIN=1, FUN="==", 1), MARGIN=2, FUN=which)
    posa <- apply(apply(Xa, MARGIN=1, FUN="==", 1), MARGIN=2, FUN=which)
    ##
    ##-------------------------------------------
    ## Lambda values.
    ##
    lambdaH[,i] <- as.vector(
        exp(Xh%*%gamma[,i]-Xa%*%delta[,i]+omega[i])
        )
    lambdaA[,i] <- as.vector(
        exp(Xa%*%gamma[,i]-Xh%*%delta[,i])
        )
    ##
    ##-------------------------------------------
    ## Goals expected and goals scored.
    ## 
    ## cbind(goalRateHome=lambdaA[,i], goalRateAway=lambdaB[,i],
    ##       goalScorHome=dbi$x, goalScorAway=dbi$y)
    ##
    ##-------------------------------------------
    ## Updating priors for the next round.
    ##
    gamma[posh,i+1] <- 
        Xh%*%gamma[,i]+Xh%*%priors1$sigma2*(dbi$h-lambdaH[,i])
    gamma[posa,i+1] <- 
        Xa%*%gamma[,i]+Xa%*%priors1$sigma2*(dbi$a-lambdaA[,i])
    delta[posh,i+1] <- 
        Xh%*%delta[,i]-Xh%*%priors1$psi2*(dbi$h-lambdaA[,i])
    delta[posa,i+1] <- 
        Xa%*%delta[,i]-Xa%*%priors1$psi2*(dbi$h-lambdaH[,i])
    omega[i+1] <- 
        omega[i]+attr(priors1, "hf")$omega$sigma20*
            sum(dbi$h-lambdaH[,i])
}

gamma <- gamma[,-1]
delta <- delta[,-1]
omega <- omega[-1]

##----------------------------------------------------------------------

## Series for all teams parameters along the rounds.
dd <- melt(data=as.data.frame(delta), value.name="delta")
dd <- cbind(
    melt(data=cbind(team=levels(dc$home), as.data.frame(gamma)),
         id.vars="team", variable.name="rod", value.name ="gamma"),
    delta=dd$delta)
dd$rod <- as.integer(dd$rod)
str(dd)

R output

'data.frame':  760 obs. of  4 variables:
 $ team : Factor w/ 20 levels "América/MG","Atlético/GO",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ rod  : int  1 1 1 1 1 1 1 1 1 1 ...
 $ gamma: num  1.005 1.006 1.014 0.955 0.988 ...
 $ delta: num  1.002 1.049 0.986 0.999 0.99 ...

R source

sc <- list(x=list(at=c(1,10,20,30,38)))

## Legend for gamma and delta.
key.gd <- list(columns=2,
               title="Parameter", cex.title=1.1,
               type="o", divide=1,
               lines=list(
                   col=ps$superpose.line$col[1:2],
                   pch=ps$superpose.symbol$pch[1:2]),
               text=list(c(expression(gamma), expression(delta))))

## All teams.
xyplot(gamma+delta~rod|team, data=dd, type="l",
       as.table=TRUE,
       scales=sc,
       key=key.gd[c(-4,-5)],
       xlab="Championship round",
       ylab="Parameter value")

R source

## Home field parameter.
xyplot(omega~seq_along(omega), type="b",
       scales=sc,
       xlab="Championship round",
       ylab="Home field advantage parameter")

R source

##----------------------------------------------------------------------

## Select som teams to a depper view, 2 bests and 2 worsts.
sel <- levels(dd$team)[c(1,5,9,20)]
de <- droplevels(subset(dd, team%in%sel))

## Selected teams.
xyplot(gamma+delta~rod|team,
       data=de, type="o",
       as.table=TRUE,
       scales=sc,
       key=key.gd,
       xlab="Championship round",
       ylab="Parameter value")

R source

key.tm <- list(columns=2,
               title="Teams", cex.title=1.1,
               type="o", divide=1,
               lines=list(
                   col=ps$superpose.line$col[1:4],
                   pch=ps$superpose.symbol$pch[1:4]),
               text=list(levels(de$team)))

xyplot(gamma+delta~rod, outer=TRUE, groups=team,
       data=de, type="o",
       as.table=TRUE,
       scales=sc,
       key=key.tm,
       strip=strip.custom(factor.levels=key.gd$text[[1]]),
       xlab="Championship round",
       ylab="Parameter value")

Ratings

R source

##----------------------------------------------------------------------
## Rating series.

## Rating along the matches.
dd$rating <- with(dd, gamma+delta)
de$rating <- with(de, gamma+delta)

xyplot(rating~rod|team,
       data=dd, type="o", cex=0.8,
       as.table=TRUE,
       scales=sc,
       xlab="Championship round",
       ylab=expression("Rating"~(gamma+delta)))

R source

xyplot(rating~rod, groups=team,
       data=de, type="o", cex=0.8,
       as.table=TRUE,
       scales=sc,
       key=key.tm,
       strip=strip.custom(factor.levels=key.gd$text[[1]]),
       xlab="Championship round",
       ylab=expression("Rating"~(gamma+delta)))

R source

##----------------------------------------------------------------------

## Average rating.
rtm <- arrange(aggregate(rating~team, data=dd, FUN=mean), -rating)
names(rtm)[2] <- "averating"

## Final rating.
frat <- subset(dd, rod==max(dd$rod))
names(frat)[5] <- "finrating"

## Final, average rating and rank.
fr <- arrange(merge(merge(pl11, frat, by="team"),
                    rtm, by="team"), -finrating)
fr

R output

            team pts  pos rod     gamma     delta finrating averating
1    Corinthians  71  1.0  38 0.9126470 1.0562906  1.968938  2.033645
2    Figueirense  58  7.0  38 0.8638266 1.0982144  1.962041  1.910578
3     Fluminense  63  3.0  38 1.1214535 0.8325354  1.953989  1.936301
4  Internacional  60  5.0  38 0.9632204 0.9715182  1.934739  2.038818
5          Vasco  69  2.0  38 1.0051561 0.8989380  1.904094  1.969309
6      Palmeiras  50 11.0  38 0.7946274 1.1034982  1.898126  1.929364
7       Flamengo  61  4.0  38 0.9685208 0.9193115  1.887832  1.971458
8    Atlético/GO  48 12.5  38 0.9454246 0.8925307  1.837955  1.953604
9      São Paulo  59  6.0  38 0.9929939 0.8430417  1.836036  2.026626
10         Bahia  46 14.0  38 0.8069065 1.0079671  1.814874  1.897562
11      Coritiba  57  8.0  38 0.9673316 0.8385369  1.805868  1.906328
12        Santos  53 10.0  38 1.0149936 0.7887464  1.803740  1.890617
13      Cruzeiro  43 16.0  38 0.9073711 0.8478226  1.755194  1.948746
14        Grêmio  48 12.5  38 0.9542689 0.7577612  1.712030  1.842851
15      Botafogo  56  9.0  38 0.9142141 0.7732264  1.687441  1.928350
16         Ceará  39 18.0  38 0.8872581 0.7888507  1.676109  1.798350
17   Atlético/MG  45 15.0  38 0.9412810 0.7102926  1.651574  1.847031
18          Avaí  31 20.0  38 0.8626881 0.7620425  1.624731  1.798487
19   Atlético/PR  41 17.0  38 0.7590013 0.8656251  1.624626  1.801613
20    América/MG  37 19.0  38 0.9733991 0.6462534  1.619653  1.799950

R source

xyplot(finrating+averating~pts, data=fr,
       auto.key=TRUE, type=c("p","smooth"))

R source

splom(fr[,c("pts","finrating","averating")],
      type=c("p","smooth"))

R source

cor(fr[,c("pts","finrating","averating")],
    method="spearman")

R output

                pts finrating averating
pts       1.0000000 0.8559609 0.8078225
finrating 0.8559609 1.0000000 0.7879699
averating 0.8078225 0.7879699 1.0000000

Estimated probabilities and DeFinetti measure

R source

## Como calcular as probabilidades de ganhar, empatar e perder?

##-----------------------------------------------------------------------------
## Who have more chance to gain, Palmeiras or Corinthians?

prob <- function(gh, ga, theta, hf=FALSE){
    ## 1: attack home
    ## 2: defense home
    ## 3: attack away
    ## 4: defense away
    ## 5: home field advantage
    if(hf){
        pa <- dpois(gh, lambda=exp(theta[5]+theta[1]-theta[4]))
    } else {
        pa <- dpois(gh, lambda=exp(theta[1]-theta[4]))
    }
    pb <- dpois(ga, lambda=exp(theta[3]-theta[2]))
    pa*pb
}
Prob <- Vectorize(prob, c("gh", "ga"))
rm("prob")

i <- c("Avaí", "Corinthians")
i <- rev(i)

sel <- subset(fr, team%in%i,
              select=c("team", "gamma","delta"))
sel <- split(sel, f=as.character(sel$team))
sel <- sel[i]
theta <- unlist(lapply(sel, "[", -1))
theta <- c(theta, omega=tail(omega, 1))
theta

R output

Corinthians.gamma Corinthians.delta        Avaí.gamma 
        0.9126470         1.0562906         0.8626881 
       Avaí.delta             omega 
        0.7620425         0.5051326

R source

g <- 0:10
## pla <- outer(g, g, function(x, y) Prob(gh=x, ga=y, theta))
pla <- outer(g, g, function(x, y) Prob(gh=x, ga=y, theta, hf=TRUE))
sum(pla) ## must be equal to 1.

R output

[1] 0.9999941

R source

res <- c(homeWin=sum(pla[lower.tri(pla)]),
         draw=sum(diag(pla)),
         awayWin=sum(pla[upper.tri(pla)]))

barchart(res~names(res), main=paste(i, collapse=" vs "),
         horizontal=FALSE, stack=FALSE, origin=0,
         xlab="Outcome of the match",
         ylab="Predicted probability")

R source

## The outcome with highest probability.
m <- which(pla==max(pla), arr.ind=TRUE); m

R output

     row col
[1,]   2   1

R source

##----------------------------------------------------------------------

## Combines team and round in a new variable.
dg <- dc
dg <- transform(dg,
                hr=paste(home, rod, sep="."),
                ar=paste(away, rod, sep="."))
dd$tr <- with(dd, paste(team, rod, sep="."))

dg <- merge(dg,
            subset(dd, select=c("gamma","delta","tr")),
            by.x="hr", by.y="tr")
names(dg)[ncol(dg)-1:0] <- c("hgamma", "hdelta")

dg <- merge(dg,
            subset(dd, select=c("gamma","delta","tr")),
            by.x="ar", by.y="tr")
names(dg)[ncol(dg)-1:0] <- c("agamma", "adelta")

omg <- data.frame(omega=omega, rod=seq_along(omega))
dg <- merge(dg, omg, by="rod")

dg <- arrange(dg, dat)
str(dg)

R output

'data.frame':  380 obs. of  17 variables:
 $ rod   : int  1 1 1 1 1 1 1 1 1 1 ...
 $ ar    : Factor w/ 380 levels "América/MG.10",..: 77 58 286 362 343 20 96 191 115 153 ...
 $ hr    : Factor w/ 380 levels "América/MG.1",..: 229 39 324 134 248 172 1 210 305 267 ...
 $ dat   : POSIXct, format: "2011-05-21 00:00:00" ...
 $ home  : Factor w/ 20 levels "América/MG","Atlético/GO",..: 13 3 18 8 14 10 1 12 17 15 ...
 $ away  : Factor w/ 20 levels "América/MG","Atlético/GO",..: 5 4 16 20 19 2 6 11 7 9 ...
 $ pla   : chr  "4x0" "3x0" "1x1" "1x3" ...
 $ yea   : int  2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
 $ h     : int  4 3 1 1 0 0 2 1 1 1 ...
 $ a     : int  0 0 1 3 2 1 1 0 0 2 ...
 $ d     : int  4 3 0 2 2 1 1 1 1 1 ...
 $ s     : int  4 3 2 4 2 1 3 1 1 3 ...
 $ hgamma: num  0.962 1.014 0.983 0.988 1.013 ...
 $ hdelta: num  0.972 0.986 1.025 1.049 1.051 ...
 $ agamma: num  0.988 0.955 0.993 1.047 1.025 ...
 $ adelta: num  0.99 0.999 1.054 1.045 1.07 ...
 $ omega : num  0.652 0.652 0.652 0.652 0.652 ...

R source

##----------------------------------------------------------------------

## HomeWin Draw AwayWin matrix.
hda <- with(dg, {
    d <- h-a
    draw <- d==0
    winH <- d>0
    ## winA <- !draw & !winH
    winA <- !(draw | winH)
    x <- cbind(winH, draw, winA)
    return(1*x)
})

## Estimated probability of HomeWin Draw AwayWin matrix.
g <- 0:10
ephda <- t(apply(dg[,13:17], MARGIN=1,
                 FUN=function(i){
                     pla <- outer(g, g,
                                  function(x, y){
                                      Prob(gh=x, ga=y, i, hf=TRUE)
                                  })
                     res <- c(homeWin=sum(pla[lower.tri(pla)]),
                              draw=sum(diag(pla)),
                              awayWin=sum(pla[upper.tri(pla)]))
                     return(res)
                 }))
## head(ephda)

## Per match DeFinetti Index.
dfi <- rowSums((ephda-hda)^2)

histogram(~dfi|factor(dg$rod),
          as.table=TRUE,
          xlab="DeFinetti Index")

R source

xyplot(dfi~rod, data=dg,
       xlab="Round", ylab="DeFinetti Index")

R source

## Average DeFinetti Index.
mean(dfi)

R output

[1] 0.6162193

R source

##----------------------------------------------------------------------
## Saving the results of MAP-1 approach.

map1 <- list(dd=dd, fr=fr, omega=omega, dg=dg, dfi=dfi)

MAP-2 model fitting and results

Parameter estimates

R source

##----------------------------------------------------------------------

## Number of teams and number of rounds.
nteams <- nlevels(db$home)
nrounds <- length(unique(db$rod))
c(nteams, nrounds)

R output

[1] 20 38

R source

## gamma: teams attacking strength.
## delta: teams deffensive strength.
## omega: home-field advantage parameter.
gamma <- matrix(numeric(), nrow=nteams, ncol=nrounds+1)
delta <- gamma
omega <- numeric(nrounds)
kappa <- omega

## To store values in each match inside a round.
lambdaH <- matrix(numeric(), nrow=nteams/2, ncol=nrounds)
lambdaA <- lambdaH

i <- 1
gamma[,i] <- priors2$mu
delta[,i] <- priors2$theta
omega[i] <- attr(priors2, "hf")$omega$mu0
kappa[i] <- attr(priors2, "hf")$kappa$theta0

## Fitting for each round.
for(i in 1:nrounds){
    ##-------------------------------------------
    ## Subset each round (10 matches).
    ## 
    dbi <- subset(dc, rod==i)
    ##
    ##-------------------------------------------
    ## Model matrices, which teams confronted each other.
    ## 
    Xh <- model.matrix(~-1+home, data=dbi)
    Xa <- model.matrix(~-1+away, data=dbi)
    ##
    ##-------------------------------------------
    ## Columns of Home and Away teams.
    ##
    posh <- apply(apply(Xh, MARGIN=1, FUN="==", 1), MARGIN=2, FUN=which)
    posa <- apply(apply(Xa, MARGIN=1, FUN="==", 1), MARGIN=2, FUN=which)
    ##
    ##-------------------------------------------
    ## Lambda values.
    ##
    lambdaH[,i] <- as.vector(
        exp(Xh%*%gamma[,i]-Xa%*%delta[,i]+omega[i])
        )
    lambdaA[,i] <- as.vector(
        exp(Xa%*%gamma[,i]-Xh%*%delta[,i])
        )
    ##
    ##-------------------------------------------
    ## Goals expected and goals scored.
    ## 
    ## cbind(goalRateHome=lambdaA[,i], goalRateAway=lambdaB[,i],
    ##       goalScorHome=dbi$x, goalScorAway=dbi$y)
    ##
    ##-------------------------------------------
    ## Updating priors for the next round.
    ##
    gamma[posh,i+1] <- 
        Xh%*%gamma[,i]+Xh%*%priors2$sigma2*(dbi$h-lambdaH[,i])
    gamma[posa,i+1] <- 
        Xa%*%gamma[,i]+Xa%*%priors2$sigma2*(dbi$a-lambdaA[,i])
    delta[posh,i+1] <- 
        Xh%*%delta[,i]-Xh%*%priors2$psi2*(dbi$h-lambdaA[,i])
    delta[posa,i+1] <- 
        Xa%*%delta[,i]-Xa%*%priors2$psi2*(dbi$h-lambdaH[,i])
    omega[i+1] <- 
        omega[i]+attr(priors2, "hf")$omega$sigma20*
            sum(dbi$h-lambdaH[,i])
    kappa[i+1] <- 
        kappa[i]-attr(priors2, "hf")$kappa$psi20*
            sum(dbi$a-lambdaA[,i])    
}

gamma <- gamma[,-1]
delta <- delta[,-1]
omega <- omega[-1]
kappa <- kappa[-1]

##----------------------------------------------------------------------

## Series for all teams parameters along the rounds.
dd <- melt(data=as.data.frame(delta), value.name="delta")
dd <- cbind(
    melt(data=cbind(team=levels(dc$home), as.data.frame(gamma)),
         id.vars="team", variable.name="rod", value.name ="gamma"),
    delta=dd$delta)
dd$rod <- as.integer(dd$rod)
str(dd)

R output

'data.frame':  760 obs. of  4 variables:
 $ team : Factor w/ 20 levels "América/MG","Atlético/GO",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ rod  : int  1 1 1 1 1 1 1 1 1 1 ...
 $ gamma: num  1.011 1.054 1.016 0.972 0.993 ...
 $ delta: num  0.986 0.977 0.953 0.984 0.965 ...

R source

sc <- list(x=list(at=c(1,10,20,30,38)))

## Legend for gamma and delta.
key.gd <- list(columns=2,
               title="Parameter", cex.title=1.1,
               type="o", divide=1,
               lines=list(
                   col=ps$superpose.line$col[1:2],
                   pch=ps$superpose.symbol$pch[1:2]),
               text=list(c(expression(gamma), expression(delta))))

## All teams.
xyplot(gamma+delta~rod|team, data=dd, type="l",
       as.table=TRUE,
       scales=sc,
       key=key.gd[c(-4,-5)],
       xlab="Championship round",
       ylab="Parameter value")

R source

## Legend for omega and kappa.
key.ok <- list(columns=2,
               title="Parameter", cex.title=1.1,
               type="o", divide=1,
               lines=list(
                   col=ps$superpose.line$col[1:2],
                   pch=ps$superpose.symbol$pch[1:2]),
               text=list(c(expression(omega), expression(kappa))))

## Field parameters.
xyplot(omega+kappa~seq_along(omega), type="b",
       scales=sc, key=key.ok,
       xlab="Championship round",
       ylab="Field parameters")

R source

##----------------------------------------------------------------------

## Select som teams to a depper view, 2 bests and 2 worsts.
sel <- levels(dd$team)[c(1,5,9,20)]
de <- droplevels(subset(dd, team%in%sel))

## Selected teams.
xyplot(gamma+delta~rod|team,
       data=de, type="o",
       as.table=TRUE,
       scales=sc,
       key=key.gd,
       xlab="Championship round",
       ylab="Parameter value")

R source

key.tm <- list(columns=2,
               title="Teams", cex.title=1.1,
               type="o", divide=1,
               lines=list(
                   col=ps$superpose.line$col[1:4],
                   pch=ps$superpose.symbol$pch[1:4]),
               text=list(levels(de$team)))

xyplot(gamma+delta~rod, outer=TRUE, groups=team,
       data=de, type="o",
       as.table=TRUE,
       scales=sc,
       key=key.tm,
       strip=strip.custom(factor.levels=key.gd$text[[1]]),
       xlab="Championship round",
       ylab="Parameter value")

Ratings

R source

##----------------------------------------------------------------------
## Rating series.

## Rating along the matches.
dd$rating <- with(dd, gamma+delta)
de$rating <- with(de, gamma+delta)

xyplot(rating~rod|team,
       data=dd, type="o", cex=0.8,
       as.table=TRUE,
       scales=sc,
       xlab="Championship round",
       ylab=expression("Rating"~(gamma+delta)))

R source

xyplot(rating~rod, groups=team,
       data=de, type="o", cex=0.8,
       as.table=TRUE,
       scales=sc,
       key=key.tm,
       strip=strip.custom(factor.levels=key.gd$text[[1]]),
       xlab="Championship round",
       ylab=expression("Rating"~(gamma+delta)))

R source

##----------------------------------------------------------------------

## Average rating.
rtm <- arrange(aggregate(rating~team, data=dd, FUN=mean), -rating)
names(rtm)[2] <- "averating"

## Final rating.
frat <- subset(dd, rod==max(dd$rod))
names(frat)[5] <- "finrating"

## Final, average rating and rank.
fr <- arrange(merge(merge(pl11, frat, by="team"),
                    rtm, by="team"), -finrating)
fr

R output

            team pts  pos rod     gamma     delta finrating averating
1    Figueirense  58  7.0  38 0.8331871 1.1189569  1.952144  1.897312
2    Corinthians  71  1.0  38 0.8998578 1.0412194  1.941077  2.034678
3     Fluminense  63  3.0  38 1.1398870 0.7955653  1.935452  1.885016
4       Flamengo  61  4.0  38 1.0109584 0.8846871  1.895646  2.020284
5          Vasco  69  2.0  38 0.9993262 0.8873846  1.886711  1.961439
6  Internacional  60  5.0  38 0.9331674 0.9481169  1.881284  2.013915
7      Palmeiras  50 11.0  38 0.7885027 1.0905794  1.879082  1.920158
8      São Paulo  59  6.0  38 0.9846371 0.8316986  1.816336  2.019588
9    Atlético/GO  48 12.5  38 0.9490610 0.8464453  1.795506  1.920805
10        Santos  53 10.0  38 1.0097980 0.7725788  1.782377  1.881937
11      Coritiba  57  8.0  38 0.9390714 0.8111666  1.750238  1.871157
12         Bahia  46 14.0  38 0.7610293 0.9880016  1.749031  1.828892
13      Cruzeiro  43 16.0  38 0.9100550 0.7926209  1.702676  1.945772
14        Grêmio  48 12.5  38 0.9556129 0.7005182  1.656131  1.807381
15      Botafogo  56  9.0  38 0.8913755 0.7290012  1.620377  1.902378
16         Ceará  39 18.0  38 0.8608879 0.7560238  1.616912  1.737877
17   Atlético/MG  45 15.0  38 0.9363727 0.6484578  1.584830  1.798111
18   Atlético/PR  41 17.0  38 0.7363706 0.8243455  1.560716  1.771267
19          Avaí  31 20.0  38 0.8382000 0.7189453  1.557145  1.753607
20    América/MG  37 19.0  38 0.9738472 0.5753004  1.549148  1.755183

R source

xyplot(finrating+averating~pts, data=fr,
       auto.key=TRUE, type=c("p","smooth"))

R source

splom(fr[,c("pts","finrating","averating")],
      type=c("p","smooth"))

R source

cor(fr[,c("pts","finrating","averating")],
    method="spearman")

R output

                pts finrating averating
pts       1.0000000 0.8867996 0.7912750
finrating 0.8867996 1.0000000 0.7714286
averating 0.7912750 0.7714286 1.0000000

Estimated probabilities and DeFinetti measure

R source

## Como calcular as probabilidades de ganhar, empatar e perder?

##-----------------------------------------------------------------------------
## Who have more chance to gain, Palmeiras or Corinthians?

prob <- function(gh, ga, theta, hf=FALSE){
    ## 1: attack home
    ## 2: defense home
    ## 3: attack away
    ## 4: defense away
    ## 5: home field advantage
    ## 6: away field disadvantage
    if(hf){
        pa <- dpois(gh, lambda=exp(theta[1]-theta[4]+theta[5]))
        pb <- dpois(ga, lambda=exp(theta[3]-theta[2]-theta[6]))
    } else {
        pa <- dpois(gh, lambda=exp(theta[1]-theta[4]))
        pb <- dpois(ga, lambda=exp(theta[3]-theta[2]))
    }
    pa*pb
}
Prob <- Vectorize(prob, c("gh", "ga"))
rm("prob")

i <- c("Avaí", "Corinthians")
i <- rev(i)

sel <- subset(fr, team%in%i,
              select=c("team", "gamma","delta"))
sel <- split(sel, f=as.character(sel$team))
sel <- sel[i]
theta <- unlist(lapply(sel, "[", -1))
theta <- c(theta, omega=tail(omega, 1), kappa=tail(kappa, 1))
theta

R output

Corinthians.gamma Corinthians.delta        Avaí.gamma 
        0.8998578         1.0412194         0.8382000 
       Avaí.delta             omega             kappa 
        0.7189453         0.4777623         0.8272170

R source

g <- 0:10
## pla <- outer(g, g, function(x, y) Prob(gh=x, ga=y, theta))
pla <- outer(g, g, function(x, y) Prob(gh=x, ga=y, theta, hf=TRUE))
sum(pla) ## must be equal to 1.

R output

[1] 0.999994

R source

res <- c(homeWin=sum(pla[lower.tri(pla)]),
         draw=sum(diag(pla)),
         awayWin=sum(pla[upper.tri(pla)]))

barchart(res~names(res), main=paste(i, collapse=" vs "),
         horizontal=FALSE, stack=FALSE, origin=0,
         xlab="Outcome of the match",
         ylab="Predicted probability")

R source

## The outcome with highest probability.
m <- which(pla==max(pla), arr.ind=TRUE); m

R output

     row col
[1,]   2   1

R source

##----------------------------------------------------------------------

## Combines team and round in a new variable.
dg <- dc
dg <- transform(dg,
                hr=paste(home, rod, sep="."),
                ar=paste(away, rod, sep="."))
dd$tr <- with(dd, paste(team, rod, sep="."))

dg <- merge(dg,
            subset(dd, select=c("gamma","delta","tr")),
            by.x="hr", by.y="tr")
names(dg)[ncol(dg)-1:0] <- c("hgamma", "hdelta")

dg <- merge(dg,
            subset(dd, select=c("gamma","delta","tr")),
            by.x="ar", by.y="tr")
names(dg)[ncol(dg)-1:0] <- c("agamma", "adelta")

omg <- data.frame(omega=omega, kappa=kappa, rod=seq_along(omega))
dg <- merge(dg, omg, by="rod")

dg <- arrange(dg, dat)
str(dg)

R output

'data.frame':  380 obs. of  18 variables:
 $ rod   : int  1 1 1 1 1 1 1 1 1 1 ...
 $ ar    : Factor w/ 380 levels "América/MG.10",..: 77 58 286 362 343 20 96 191 115 153 ...
 $ hr    : Factor w/ 380 levels "América/MG.1",..: 229 39 324 134 248 172 1 210 305 267 ...
 $ dat   : POSIXct, format: "2011-05-21 00:00:00" ...
 $ home  : Factor w/ 20 levels "América/MG","Atlético/GO",..: 13 3 18 8 14 10 1 12 17 15 ...
 $ away  : Factor w/ 20 levels "América/MG","Atlético/GO",..: 5 4 16 20 19 2 6 11 7 9 ...
 $ pla   : chr  "4x0" "3x0" "1x1" "1x3" ...
 $ yea   : int  2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
 $ h     : int  4 3 1 1 0 0 2 1 1 1 ...
 $ a     : int  0 0 1 3 2 1 1 0 0 2 ...
 $ d     : int  4 3 0 2 2 1 1 1 1 1 ...
 $ s     : int  4 3 2 4 2 1 3 1 1 3 ...
 $ hgamma: num  1.134 1.016 0.984 0.975 0.972 ...
 $ hdelta: num  0.925 0.953 1.025 1.018 1.046 ...
 $ agamma: num  0.993 0.972 0.99 1.046 1.024 ...
 $ adelta: num  0.965 0.984 1.033 1.044 1.067 ...
 $ omega : num  0.617 0.617 0.617 0.617 0.617 ...
 $ kappa : num  0.9 0.9 0.9 0.9 0.9 ...

R source

##----------------------------------------------------------------------

## HomeWin Draw AwayWin matrix.
hda <- with(dg, {
    d <- h-a
    draw <- d==0
    winH <- d>0
    ## winA <- !draw & !winH
    winA <- !(draw | winH)
    x <- cbind(winH, draw, winA)
    return(1*x)
})

## Estimated probability of HomeWin Draw AwayWin matrix.
g <- 0:10
ephda <- t(apply(dg[,13:18], MARGIN=1,
                 FUN=function(i){
                     pla <- outer(g, g,
                                  function(x, y){
                                      Prob(gh=x, ga=y, i, hf=TRUE)
                                  })
                     res <- c(homeWin=sum(pla[lower.tri(pla)]),
                              draw=sum(diag(pla)),
                              awayWin=sum(pla[upper.tri(pla)]))
                     return(res)
                 }))
## head(ephda)

## Per match DeFinetti Index.
dfi <- rowSums((ephda-hda)^2)

histogram(~dfi|factor(dg$rod),
          as.table=TRUE,
          xlab="DeFinetti Index")

R source

xyplot(dfi~rod, data=dg,
       xlab="Round", ylab="DeFinetti Index")

R source

## Average DeFinetti Index.
mean(dfi)

R output

[1] 0.682659

R source

##----------------------------------------------------------------------
## Saving the results of MAP-2 approach.

map2 <- list(dd=dd, fr=fr, omega=omega, kappa=kappa, dg=dg, dfi=dfi)

Comparing MAP-1 and MAP-2

R source

##----------------------------------------------------------------------

## fp: number of field parameters.
map12 <- rbind(cbind(map1$dd, fp=1),
               cbind(map2$dd, fp=2))

## Select som teams to a depper view, 2 bests and 2 worsts.
sel <- levels(map12$team)[c(1,5,9,20)]
de <- droplevels(subset(map12, team%in%sel))

key.map <- list(columns=2,
               title="Model", cex.title=1.1,
               type="o", divide=1,
               lines=list(
                   col=ps$superpose.line$col[1:2],
                   pch=ps$superpose.symbol$pch[1:2]),
               text=list(c("MAP-1","MAP-2")))

sc <- list(x=list(at=c(1,10,20,30,38)),
           y=list(relation="free"))

useOuterStrips(combineLimits(
    xyplot(gamma+delta~rod|team, outer=TRUE, groups=fp,
           data=de, type="o",
           as.table=TRUE,
           scales=sc,
           key=key.map,
           xlab="Championship round",
           ylab="Parameter value")),
    strip.left=strip.custom(factor.levels=key.gd$text[[1]])
    )