Supplementary material of the paper
Marcelo Leme de Arruda (I)
Walmes Marques Zeviani (II),
Júlio Sílvio de Sousa Bueno Filho (I)
I: DEX: Department of Exact Sciences, UFLA: Federal University of Lavras,
II: LEG: Statistics and Geoinformation Lab, DEST: Department of Statistics, UFPR: Federal University of Paraná.
Download the dataset: brasileiro2004-2011.txt
Download the R script: map.R
##----------------------------------------------------------------------
## Required packages and session definition.
require(latticeExtra)
require(plyr)
require(reshape2)
require(gridExtra)
sessionInfo()
Sys.time()
##----------------------------------------------------------------------
rm(list=ls())
## Symbols.
## plot(1:30, pch=1:30)
## Color palette.
mycol <- brewer.pal(6, "Set1")
## Trellis graphical style.
ps <- list(
box.rectangle=list(col=1, fill=c("gray70")),
box.umbrella=list(col=1, lty=1),
dot.symbol=list(col=1, pch=19),
dot.line=list(col="gray50", lty=3),
plot.symbol=list(col=1, cex=1.2),
plot.line=list(col=1),
plot.polygon=list(col="gray95"),
superpose.line=list(col=mycol),
superpose.symbol=list(col=mycol, pch=c(1:5)),
superpose.polygon=list(col=mycol),
strip.background=list(col=c("gray80","gray50"))
)
trellis.par.set(ps)
## show.settings()
##----------------------------------------------------------------------
## Load the data of the season 2011.
## UNIX command to get the charset/encoding of a file.
system("file -bi brasileiro2004-2011.txt")
## iso-8859-1 aka latin1.
## Importing the dataset with seasons 2004 to 2011.
da <- read.table(
file="brasileiro2004-2011.txt",
header=TRUE,
sep="\t",
encoding="latin1",
colClasses=c("factor","character","integer","NULL")[
c(2,4,3,4,4,4,1,1,2)])
## Short names are more manageable.
## man: home team, away: away team.
names(da) <- substr(names(da), 1, 3)
names(da)[3:4] <- c("home", "away")
str(da)
'data.frame': 3294 obs. of 5 variables:
$ dat : chr "21/04/2004" "21/04/2004" "21/04/2004" "21/04/2004" ...
$ rod : int 1 1 1 1 1 1 1 1 1 1 ...
$ home: Factor w/ 40 levels "América/MG","América/RN",..: 9 16 36 15 28 21 31 29 13 39 ...
$ away: Factor w/ 40 levels "América/MG","América/RN",..: 20 24 40 26 4 17 12 35 23 14 ...
$ pla : chr "1X4" "1X0" "1X0" "2X1" ...
## Same order in the factor levels.
l <- union(levels(da$home), levels(da$away))
da$home <- factor(da$home, levels=l)
da$away <- factor(da$away, levels=l)
## Converting string to data (calendar format).
da$dat <- as.POSIXct(x=da$dat, format="%d/%m/%Y")
da$yea <- as.integer(format(da$dat, format="%Y"))
## Get the goals of each team in a match.
da$h <- as.integer(gsub("^(\\d+)x\\d+$", "\\1", tolower(da$pla)))
da$a <- as.integer(gsub("^\\d+x(\\d+)$", "\\1", tolower(da$pla)))
da <- transform(da, d=abs(h-a), s=h+a)
str(da)
'data.frame': 3294 obs. of 10 variables:
$ dat : POSIXct, format: "2004-04-21 00:00:00" ...
$ rod : int 1 1 1 1 1 1 1 1 1 1 ...
$ home: Factor w/ 40 levels "América/MG","América/RN",..: 9 16 36 15 28 21 31 29 13 39 ...
$ away: Factor w/ 40 levels "América/MG","América/RN",..: 20 24 40 26 4 17 12 35 23 14 ...
$ pla : chr "1X4" "1X0" "1X0" "2X1" ...
$ yea : int 2004 2004 2004 2004 2004 2004 2004 2004 2004 2004 ...
$ h : int 1 1 1 2 0 0 3 3 1 0 ...
$ a : int 4 0 0 1 0 0 2 2 0 1 ...
$ d : int 3 1 1 1 0 0 1 1 1 1 ...
$ s : int 5 1 1 3 0 0 5 5 1 1 ...
## Marginal statistics in goals scored for each season.
aggregate(cbind(home=h, away=a, absdiff=d, total=s)~yea,
data=da, FUN=mean, na.rm=TRUE)
yea home away absdiff total
1 2004 1.717391 1.0652174 1.362319 2.782609
2 2005 1.803030 1.3311688 1.307359 3.134199
3 2006 1.589474 1.1210526 1.289474 2.710526
4 2007 1.665789 1.0868421 1.384211 2.752632
5 2008 1.731579 0.9894737 1.389474 2.721053
6 2009 1.736842 1.1421053 1.257895 2.878947
7 2010 1.528947 1.0447368 1.147368 2.573684
8 2011 1.605263 1.0710526 1.239474 2.676316
##----------------------------------------------------------------------
## 2010 season only.
db <- droplevels(subset(da, yea==2010,
select=c("rod","home","away","h","a")))
str(db)
'data.frame': 380 obs. of 5 variables:
$ rod : int 1 1 1 1 1 1 1 1 1 1 ...
$ home: Factor w/ 20 levels "Atlético/GO",..: 5 1 16 9 2 15 7 6 14 4 ...
$ away: Factor w/ 20 levels "Atlético/GO",..: 17 12 20 18 19 8 3 10 11 13 ...
$ h : int 3 0 1 1 2 1 2 1 1 6 ...
$ a : int 3 0 0 1 1 2 1 0 0 1 ...
subset(db, home=="Corinthians")
rod home away h a
2541 1 Corinthians Atlético/PR 2 1
2558 3 Corinthians Fluminense 1 0
2578 5 Corinthians Santos 4 2
2594 6 Corinthians Internacional 2 0
2618 9 Corinthians Atlético/MG 1 0
2643 11 Corinthians Guarani 3 1
2658 13 Corinthians Flamengo 1 0
2682 15 Corinthians São Paulo 3 0
2698 17 Corinthians Vitória 2 1
2717 19 Corinthians Goiás 5 1
2735 21 Corinthians Grêmio 0 1
2755 23 Corinthians Grêmio Prudente 3 0
2792 26 Corinthians Botafogo 1 1
2796 27 Corinthians Ceará 2 2
2820 29 Corinthians Atlético/GO 3 4
2839 31 Corinthians Palmeiras 1 0
2861 33 Corinthians Avaí 4 0
2877 35 Corinthians Cruzeiro 1 0
2895 37 Corinthians Vasco 2 0
## Are the levels in the same order?
all(levels(db$home)==levels(db$away))
[1] TRUE
##----------------------------------------------------------------------
## Matrices to identify teams matches (design matrices).
Xh <- model.matrix(~-1+home, db) ## h: home.
Xa <- model.matrix(~-1+away, db) ## a: away.
## Check.
Xha <- Xh-Xa
db[1:5,c("home","away")]
home away
2535 Botafogo Santos
2536 Atlético/GO Grêmio
2537 Palmeiras Vitória
2538 Flamengo São Paulo
2539 Atlético/MG Vasco
print(as.table(t(Xha[1:5,])), zero.print=".")
2535 2536 2537 2538 2539
homeAtlético/GO . 1 . . .
homeAtlético/MG . . . . 1
homeAtlético/PR . . . . .
homeAvaí . . . . .
homeBotafogo 1 . . . .
homeCeará . . . . .
homeCorinthians . . . . .
homeCruzeiro . . . . .
homeFlamengo . . . 1 .
homeFluminense . . . . .
homeGoiás . . . . .
homeGrêmio . -1 . . .
homeGrêmio Prudente . . . . .
homeGuarani . . . . .
homeInternacional . . . . .
homePalmeiras . . 1 . .
homeSantos -1 . . . .
homeSão Paulo . . . -1 .
homeVasco . . . . -1
homeVitória . . -1 . .
##----------------------------------------------------------------------
## Place in 2010 season, from first to last.
cbind(scored=t(Xh)%*%db$h+t(Xa)%*%db$a, ## goals scored
conceded=t(Xa)%*%db$h+t(Xh)%*%db$a) ## goals conceded
[,1] [,2]
homeAtlético/GO 51 57
homeAtlético/MG 52 64
homeAtlético/PR 43 45
homeAvaí 49 58
homeBotafogo 54 42
homeCeará 35 44
homeCorinthians 65 41
homeCruzeiro 53 38
homeFlamengo 41 44
homeFluminense 62 36
homeGoiás 41 68
homeGrêmio 68 43
homeGrêmio Prudente 39 64
homeGuarani 33 53
homeInternacional 48 41
homePalmeiras 42 43
homeSantos 63 50
homeSão Paulo 54 54
homeVasco 43 45
homeVitória 42 48
## http://esportes.terra.com.br/futebol/brasileiro/2010/noticias/0,,OI4339585-EI15413,00-Classificacao+e+Jogos+Serie+A.html
aux <- with(db, {
d <- h-a
draw <- d==0
winH <- d>0
## winA <- !draw & !winH
winA <- !(draw | winH)
x <- cbind(h=winH*3+draw, a=winA*3+draw)
return(x)
})
## cbind(db, aux)[sample(1:nrow(db), size=10),]
pl10 <- t(Xh)%*%aux[,"h"]+t(Xa)%*%aux[,"a"]
pl10 <- cbind(team=levels(db$home), data.frame(pts=pl10))
## pl10$rk <- rank(pl10$pts, decreasing=TRUE)
pl10 <- arrange(pl10, -pts)
pl10$pos <- rank(-pl10$pts)
pl10
team pts pos
1 Fluminense 71 1.0
2 Cruzeiro 69 2.0
3 Corinthians 68 3.0
4 Grêmio 63 4.0
5 Atlético/PR 60 5.0
6 Botafogo 59 6.0
7 Internacional 58 7.0
8 Santos 56 8.0
9 São Paulo 55 9.0
10 Palmeiras 50 10.0
11 Vasco 49 11.0
12 Ceará 47 12.0
13 Atlético/MG 45 13.0
14 Flamengo 44 14.0
15 Avaí 43 15.0
16 Atlético/GO 42 16.5
17 Vitória 42 16.5
18 Guarani 37 18.0
19 Goiás 33 19.0
20 Grêmio Prudente 31 20.0
th16 <- pl10[16,]
rm(aux)
##----------------------------------------------------------------------
## Likelihood definition.
## log-likeligood funtion corresponding to a Poisson model for the goals
## scored as function of teams attacking streng (gamma), teams
## deffensive strength (delta), and home(away) field
## advantage(disadvatage) (omega,kappa), so theta=c(gamma, delta,
## omega, kappa).
## attack, defense and home field advantage.
ll1 <- function(theta, gh, ga, Xh, Xa){
## theta: parameter vector.
## gh, ga: goals for the home and away team.
## Xh, Xa: design matrices for home and away team.
gamma <- 1:20; delta <- 21:40; omega <- 41
eta.h <- (Xh%*%theta[gamma]-Xa%*%theta[delta])+theta[omega]
eta.a <- (Xa%*%theta[gamma]-Xh%*%theta[delta])
ll.h <- dpois(gh, lambda=exp(eta.h), log=TRUE)
ll.a <- dpois(ga, lambda=exp(eta.a), log=TRUE)
ll <- sum(ll.h+ll.a)
return(ll)
}
## attack, defense, home field advantage and away field disadvantage.
ll2 <- function(theta, gh, ga, Xh, Xa){
## theta: parameter vector.
## gh, ga: goals for the home and away team.
## Xh, Xa: design matrices for home and away team.
gamma <- 1:20; delta <- 21:40; omega <- 41; kappa <- 42
eta.h <- (Xh%*%theta[gamma]-Xa%*%theta[delta])+theta[omega]
eta.a <- (Xa%*%theta[gamma]-Xh%*%theta[delta])-theta[kappa]
ll.h <- dpois(gh, lambda=exp(eta.h), log=TRUE)
ll.a <- dpois(ga, lambda=exp(eta.a), log=TRUE)
ll <- sum(ll.h+ll.a)
return(ll)
}
##----------------------------------------------------------------------
## Optimization to get parameter estimates.
op1 <- optim(par=rep(1, 41),
fn=ll1,
gh=db$h, ga=db$a,
Xh=Xh, Xa=Xa,
control=list(fnscale=-1),
hessian=TRUE)
str(op1)
List of 6
$ par : num [1:41] 1.004 1.01 0.978 0.954 1.017 ...
$ value : num -1080
$ counts : Named int [1:2] 502 NA
..- attr(*, "names")= chr [1:2] "function" "gradient"
$ convergence: int 1
$ message : NULL
$ hessian : num [1:41, 1:41] -45.9 0 0 0 0 ...
op2 <- optim(par=rep(1, 42),
fn=ll2,
gh=db$h, ga=db$a,
Xh=Xh, Xa=Xa,
control=list(fnscale=-1),
hessian=TRUE)
str(op2)
List of 6
$ par : num [1:42] 1.014 1.112 0.963 1 1.04 ...
$ value : num -1192
$ counts : Named int [1:2] 501 NA
..- attr(*, "names")= chr [1:2] "function" "gradient"
$ convergence: int 1
$ message : NULL
$ hessian : num [1:42, 1:42] -36.4 0 0 0 0 ...
## Likelihoods.
l <- c(ll1=op1$value, ll2=op2$value)
l
ll1 ll2
-1080.332 -1191.508
pchisq(-2*diff(l), df=1, lower.tail=FALSE)
ll2
2.775114e-50
theta <- data.frame(th1=c(op1$par, NA), th2=op2$par)
theta
th1 th2
1 1.0043790 1.0138957
2 1.0096615 1.1117698
3 0.9779044 0.9633655
4 0.9544233 1.0002134
5 1.0170494 1.0400751
6 0.9278776 0.8330297
7 0.9979084 1.0503616
8 1.0137993 0.9966172
9 0.9484080 1.0328762
10 1.0273349 1.0418324
11 0.9517496 1.0327474
12 1.0151424 1.1340466
13 0.9393636 1.0324598
14 0.8571552 0.9896103
15 0.9910947 0.9905845
16 0.8105746 1.1835028
17 1.1063962 1.0430362
18 1.0145194 1.0437392
19 0.9851621 0.9784566
20 0.9690200 0.9447839
21 1.0514958 1.0337385
22 1.0143191 0.8626878
23 1.0655327 1.0153220
24 1.0491480 1.0346921
25 1.0891696 1.0420349
26 1.0725644 1.0255372
27 1.0938077 1.0348820
28 1.1031880 1.0371056
29 1.0764055 1.0273088
30 1.1043323 1.0388512
31 0.9994671 1.0039978
32 1.0804586 1.0308121
33 0.9359466 1.0338432
34 1.0565694 1.0191678
35 1.0094950 1.0455539
36 1.0847163 1.0132376
37 1.0593575 1.0387299
38 1.0539577 1.0322186
39 1.0689397 1.0194866
40 1.0623410 1.0163889
41 0.4359021 0.3909383
42 NA 0.8104569
## plot(th2~th1, data=theta[1:40,])
## a <- with(theta, identify(th1, th2))
## dput(a)
a <- c(2L, 6L, 12L, 14L, 16L, 22L)
theta <- cbind(theta[1:40,],
expand.grid(team=levels(db$home),
param=c("gamma","delta")))
theta[a,]
th1 th2 team param
2 1.0096615 1.1117698 Atlético/MG gamma
6 0.9278776 0.8330297 Ceará gamma
12 1.0151424 1.1340466 Grêmio gamma
14 0.8571552 0.9896103 Guarani gamma
16 0.8105746 1.1835028 Palmeiras gamma
22 1.0143191 0.8626878 Atlético/MG delta
## Legend for gamma and delta.
key.gd <- list(columns=2,
title="Parameter", cex.title=1.1,
points=list(
col=ps$superpose.line$col[1:2],
pch=ps$superpose.symbol$pch[1:2]),
text=list(c(expression(gamma), expression(delta))))
xyplot(th2~th1, data=theta, groups=param,
key=key.gd, aspect="iso",
xlab="Parameter estimates in the 1 field parameter model",
ylab="Parameter estimates in the 2 field parameter model")+
layer(panel.abline(a=0, b=1))+
layer(panel.text(x=theta$th1[a], y=theta$th2[a], pos=4,
labels=as.character(theta$team[a])))
##----------------------------------------------------------------------
## Precision matrix.
## levelplot(-op$hessian[-41,-41])
## Variace-covariance matrix.
## levelplot(-solve(op$hessian)[-41,-41])
## Standard error of parameter estimates (so big).
## sqrt(diag(-solve(op$hessian)))
priors1 <- expand.grid(teams=levels(db$home),
par=c("gamma", "delta"),
stringsAsFactors=FALSE)
priors1 <- rbind(priors1, c("homefield", "omega"))
priors1$est <- sqrt(op1$par) ## Estimate.
priors1$info <- diag(-op1$hessian) ## Precision.
priors1$var <- 1/priors1$info ## Inverse precision.
priors1$sd <- sqrt(priors1$var)
## Spliting the table of priors1.
L <- split(subset(priors1, select=c("teams","est","var")),
f=priors1$par)
## Using better names.
names(L$gamma)[-1] <- c("mu","sigma2")
names(L$delta)[-1] <- c("theta","psi2")
names(L$omega)[-1] <- c("mu0","sigma20")
priors1 <- merge(L$gamma, L$delta)
priors1$teams <- factor(priors1$teams, levels=levels(db$home))
attr(priors1, "hf") <- list(omega=L$omega)
arrange(priors1, -mu)
teams mu sigma2 theta psi2
1 Santos 1.0518537 0.01964525 1.0292509 0.02258880
2 Fluminense 1.0135753 0.02121257 1.0508722 0.02352013
3 Botafogo 1.0084887 0.02144826 1.0436329 0.02315306
4 Grêmio 1.0075428 0.02149876 1.0394511 0.02294985
5 São Paulo 1.0072335 0.02154183 1.0266244 0.02234888
6 Cruzeiro 1.0068760 0.02150287 1.0503276 0.02347573
7 Atlético/MG 1.0048192 0.02169298 1.0071341 0.02147464
8 Atlético/GO 1.0021871 0.02176421 1.0254247 0.02228163
9 Corinthians 0.9989536 0.02185762 1.0458526 0.02323653
10 Internacional 0.9955374 0.02210539 1.0047363 0.02134991
11 Vasco 0.9925533 0.02216623 1.0338954 0.02265038
12 Atlético/PR 0.9888905 0.02233165 1.0322464 0.02256469
13 Vitória 0.9843881 0.02253469 1.0306993 0.02248232
14 Avaí 0.9769459 0.02288191 1.0242793 0.02217090
15 Goiás 0.9755765 0.02300528 0.9997335 0.02109345
16 Flamengo 0.9738624 0.02298721 1.0374997 0.02277652
17 Grêmio Prudente 0.9692077 0.02337762 0.9674433 0.01978278
18 Ceará 0.9632640 0.02346868 1.0356469 0.02266570
19 Guarani 0.9258268 0.02520956 1.0278956 0.02223024
20 Palmeiras 0.9003192 0.02637312 1.0414972 0.02281669
attr(priors1, "hf")
$omega
teams mu0 sigma20
41 homefield 0.6602288 0.001839751
##----------------------------------------------------------------------
priors2 <- expand.grid(teams=levels(db$home),
par=c("gamma", "delta"),
stringsAsFactors=FALSE)
priors2 <- rbind(priors2,
c("homefield", "omega"),
c("awayfield", "kappa"))
priors2$est <- sqrt(op2$par) ## Estimate.
priors2$info <- diag(-op2$hessian) ## Precision.
priors2$var <- 1/priors2$info ## Inverse precision.
priors2$sd <- sqrt(priors2$var)
## Spliting the table of priors2.
M <- split(subset(priors2, select=c("teams","est","var")),
f=priors2$par)
## Using better names.
names(M$gamma)[-1] <- c("mu","sigma2")
names(M$delta)[-1] <- c("theta","psi2")
names(M$omega)[-1] <- c("mu0","sigma20")
names(M$kappa)[-1] <- c("theta0","psi20")
priors2 <- merge(M$gamma, M$delta)
priors2$teams <- factor(priors2$teams, levels=levels(db$home))
attr(priors2, "hf") <- list(omega=M$omega, kappa=M$kappa)
arrange(priors2, -mu)
teams mu sigma2 theta psi2
1 Palmeiras 1.0878891 0.02323790 1.0065971 0.02728627
2 Grêmio 1.0649162 0.02439354 1.0152892 0.02768694
3 Atlético/MG 1.0544050 0.02518285 0.9288099 0.02337196
4 Corinthians 1.0248715 0.02651716 1.0172915 0.02766873
5 São Paulo 1.0216355 0.02669704 1.0159816 0.02758529
6 Santos 1.0212914 0.02670681 1.0191810 0.02776445
7 Fluminense 1.0207019 0.02673881 1.0192405 0.02776603
8 Botafogo 1.0198407 0.02678145 1.0208011 0.02785195
9 Flamengo 1.0163052 0.02699553 1.0135624 0.02743428
10 Goiás 1.0162418 0.02703229 1.0019969 0.02680198
11 Grêmio Prudente 1.0161003 0.02699759 1.0167808 0.02761353
12 Atlético/GO 1.0069239 0.02750361 1.0167293 0.02758373
13 Avaí 1.0001067 0.02788113 1.0171982 0.02759055
14 Cruzeiro 0.9983071 0.02797809 1.0183838 0.02765213
15 Internacional 0.9952811 0.02813515 1.0225233 0.02787817
16 Guarani 0.9947916 0.02820119 1.0095384 0.02715085
17 Vasco 0.9891697 0.02851701 1.0096963 0.02714425
18 Atlético/PR 0.9815119 0.02895699 1.0076319 0.02701117
19 Vitória 0.9720000 0.02949843 1.0081611 0.02701548
20 Ceará 0.9127046 0.03297040 1.0126881 0.02712505
attr(priors2, "hf")
$omega
teams mu0 sigma20
41 homefield 0.6252506 0.001769845
$kappa
teams theta0 psi20
42 awayfield 0.9002538 0.005884297
##----------------------------------------------------------------------
## Just the 2011 season.
dc <- droplevels(subset(da, yea==2011))
rownames(dc) <- NULL
str(dc)
'data.frame': 380 obs. of 10 variables:
$ dat : POSIXct, format: "2011-05-21 00:00:00" ...
$ rod : int 1 1 1 1 1 1 1 1 1 1 ...
$ home: Factor w/ 20 levels "América/MG","Atlético/GO",..: 3 8 13 18 12 15 10 17 14 1 ...
$ away: Factor w/ 20 levels "América/MG","Atlético/GO",..: 4 20 5 16 11 9 2 7 19 6 ...
$ pla : chr "3x0" "1x3" "4x0" "1x1" ...
$ yea : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
$ h : int 3 1 4 1 1 1 0 1 0 2 ...
$ a : int 0 3 0 1 0 2 1 0 2 1 ...
$ d : int 3 2 4 0 1 1 1 1 2 1 ...
$ s : int 3 4 4 2 1 3 1 1 2 3 ...
##----------------------------------------------------------------------
## http://esportes.terra.com.br/futebol/brasileiro/2011/seriea/classificacao_jogos/
## Rating and position in 2011 season.
Xh <- model.matrix(~-1+home, dc) ## h: home.
Xa <- model.matrix(~-1+away, dc) ## a: away.
cbind(scored=t(Xh)%*%dc$h+t(Xa)%*%dc$a, ## goals scored
conceded=t(Xa)%*%dc$h+t(Xh)%*%dc$a) ## goals conceded
[,1] [,2]
homeAmérica/MG 51 69
homeAtlético/GO 50 45
homeAtlético/MG 50 60
homeAtlético/PR 38 55
homeAvaí 45 75
homeBahia 43 49
homeBotafogo 52 49
homeCeará 47 64
homeCorinthians 53 36
homeCoritiba 57 41
homeCruzeiro 48 51
homeFigueirense 46 45
homeFlamengo 59 47
homeFluminense 60 51
homeGrêmio 49 57
homeInternacional 57 43
homePalmeiras 43 39
homeSantos 55 55
homeSão Paulo 57 46
homeVasco 57 40
aux <- with(dc, {
d <- h-a
draw <- d==0
winH <- d>0
## winA <- !draw & !winH
winA <- !(draw | winH)
x <- cbind(h=winH*3+draw, a=winA*3+draw)
return(x)
})
pl11 <- t(Xh)%*%aux[,"h"]+t(Xa)%*%aux[,"a"]
pl11 <- cbind(team=levels(dc$home), data.frame(pts=pl11))
pl11 <- arrange(pl11, -pts)
pl11$pos <- rank(-pl11$pts)
## pl11
arrange(merge(pl10, pl11, by="team", all=TRUE), -pts.y)
team pts.x pos.x pts.y pos.y
1 Corinthians 68 3.0 71 1.0
2 Vasco 49 11.0 69 2.0
3 Fluminense 71 1.0 63 3.0
4 Flamengo 44 14.0 61 4.0
5 Internacional 58 7.0 60 5.0
6 São Paulo 55 9.0 59 6.0
7 Figueirense NA NA 58 7.0
8 Coritiba NA NA 57 8.0
9 Botafogo 59 6.0 56 9.0
10 Santos 56 8.0 53 10.0
11 Palmeiras 50 10.0 50 11.0
12 Atlético/GO 42 16.5 48 12.5
13 Grêmio 63 4.0 48 12.5
14 Bahia NA NA 46 14.0
15 Atlético/MG 45 13.0 45 15.0
16 Cruzeiro 69 2.0 43 16.0
17 Atlético/PR 60 5.0 41 17.0
18 Ceará 47 12.0 39 18.0
19 América/MG NA NA 37 19.0
20 Avaí 43 15.0 31 20.0
21 Goiás 33 19.0 NA NA
22 Grêmio Prudente 31 20.0 NA NA
23 Guarani 37 18.0 NA NA
24 Vitória 42 16.5 NA NA
##----------------------------------------------------------------------
## Distribution of goals.
histogram(~h+a+d+s, data=dc,
breaks=with(dc, seq(min(d), max(s)+1, by=1)-0.5),
xlab="Goals",
ylab="Relative frequency (%)",
strip=strip.custom(
factor.levels=c("Goals of Home","Goals of Away",
"Absolute difference","Total of goals")))
## Classes of goals: 0, 1, ..., 3, >3.
cutin <- function(z){
cut(z, breaks=c(0:4, Inf),
right=FALSE, include.lowest=TRUE)
}
## Crossed frequency.
xt <- xtabs(~cutin(h)+cutin(a), data=dc)
addmargins(xt)
cutin(a)
cutin(h) [0,1) [1,2) [2,3) [3,4) [4,Inf] Sum
[0,1) 26 26 13 6 2 73
[1,2) 38 52 25 8 1 124
[2,3) 32 40 24 6 2 104
[3,4) 16 15 13 3 1 48
[4,Inf] 13 11 4 1 2 31
Sum 125 144 79 24 8 380
## Remove objects.
rm(xt, cutin)
##----------------------------------------------------------------------
## Set operation.
## Teams in 2010 that aren't in 2011 (the worst in seria A).
wA <- setdiff(levels(db$home), levels(dc$home))
wA
[1] "Goiás" "Grêmio Prudente" "Guarani"
[4] "Vitória"
## The bests of serie B, came to 2011 season.
bB <- setdiff(levels(dc$home), levels(db$home))
bB
[1] "América/MG" "Bahia" "Coritiba" "Figueirense"
##----------------------------------------------------------------------
## Prior for the new teams, those in serie B that went to seria A.
priors1 <- droplevels(subset(priors1, !teams%in%wA))
## The team at 16th position.
t16 <- subset(priors1, teams==as.character(th16$team))
t16 <- t16[rep(1, 4),]
t16$teams <- bB
## Priors1 for all teams in the 2011 season.
priors1 <- rbind(priors1, t16)
rownames(priors1) <- NULL
priors1
teams mu sigma2 theta psi2
1 Atlético/GO 1.0021871 0.02176421 1.025425 0.02228163
2 Atlético/MG 1.0048192 0.02169298 1.007134 0.02147464
3 Atlético/PR 0.9888905 0.02233165 1.032246 0.02256469
4 Avaí 0.9769459 0.02288191 1.024279 0.02217090
5 Botafogo 1.0084887 0.02144826 1.043633 0.02315306
6 Ceará 0.9632640 0.02346868 1.035647 0.02266570
7 Corinthians 0.9989536 0.02185762 1.045853 0.02323653
8 Cruzeiro 1.0068760 0.02150287 1.050328 0.02347573
9 Flamengo 0.9738624 0.02298721 1.037500 0.02277652
10 Fluminense 1.0135753 0.02121257 1.050872 0.02352013
11 Grêmio 1.0075428 0.02149876 1.039451 0.02294985
12 Internacional 0.9955374 0.02210539 1.004736 0.02134991
13 Palmeiras 0.9003192 0.02637312 1.041497 0.02281669
14 Santos 1.0518537 0.01964525 1.029251 0.02258880
15 São Paulo 1.0072335 0.02154183 1.026624 0.02234888
16 Vasco 0.9925533 0.02216623 1.033895 0.02265038
17 América/MG 1.0021871 0.02176421 1.025425 0.02228163
18 Bahia 1.0021871 0.02176421 1.025425 0.02228163
19 Coritiba 1.0021871 0.02176421 1.025425 0.02228163
20 Figueirense 1.0021871 0.02176421 1.025425 0.02228163
## Prior mean and variance for the home-field advantage parameter.
attr(priors1, "hf") <- list(omega=L$omega)
attr(priors1, "hf")
$omega
teams mu0 sigma20
41 homefield 0.6602288 0.001839751
## Same order in the factor levels.
priors1$teams <- factor(as.character(priors1$teams),
levels=levels(dc$home))
##----------------------------------------------------------------------
priors2 <- droplevels(subset(priors2, !teams%in%wA))
## Priors2 for all teams in the 2011 season.
priors2 <- rbind(priors2, t16)
rownames(priors2) <- NULL
priors2
teams mu sigma2 theta psi2
1 Atlético/GO 1.0069239 0.02750361 1.0167293 0.02758373
2 Atlético/MG 1.0544050 0.02518285 0.9288099 0.02337196
3 Atlético/PR 0.9815119 0.02895699 1.0076319 0.02701117
4 Avaí 1.0001067 0.02788113 1.0171982 0.02759055
5 Botafogo 1.0198407 0.02678145 1.0208011 0.02785195
6 Ceará 0.9127046 0.03297040 1.0126881 0.02712505
7 Corinthians 1.0248715 0.02651716 1.0172915 0.02766873
8 Cruzeiro 0.9983071 0.02797809 1.0183838 0.02765213
9 Flamengo 1.0163052 0.02699553 1.0135624 0.02743428
10 Fluminense 1.0207019 0.02673881 1.0192405 0.02776603
11 Grêmio 1.0649162 0.02439354 1.0152892 0.02768694
12 Internacional 0.9952811 0.02813515 1.0225233 0.02787817
13 Palmeiras 1.0878891 0.02323790 1.0065971 0.02728627
14 Santos 1.0212914 0.02670681 1.0191810 0.02776445
15 São Paulo 1.0216355 0.02669704 1.0159816 0.02758529
16 Vasco 0.9891697 0.02851701 1.0096963 0.02714425
17 América/MG 1.0021871 0.02176421 1.0254247 0.02228163
18 Bahia 1.0021871 0.02176421 1.0254247 0.02228163
19 Coritiba 1.0021871 0.02176421 1.0254247 0.02228163
20 Figueirense 1.0021871 0.02176421 1.0254247 0.02228163
## Prior mean and variance for the home-field advantage parameter.
attr(priors2, "hf") <- list(omega=M$omega, kappa=M$kappa)
attr(priors2, "hf")
$omega
teams mu0 sigma20
41 homefield 0.6252506 0.001769845
$kappa
teams theta0 psi20
42 awayfield 0.9002538 0.005884297
## Same order in the factor levels.
priors2$teams <- factor(as.character(priors2$teams),
levels=levels(dc$home))
## Remove objects.
rm(wA, bB, t16, L, M)
## Check.
## all.equal(levels(db$home), levels(db$away))
## cbind(levels(dc$home), levels(dc$away), levels(priors1$teams))
##----------------------------------------------------------------------
priors12 <- rbind(cbind(priors1, pr=1),
cbind(priors2, pr=2))
key.gd <- list(columns=1,
title="Model", cex.title=1.1,
points=list(
col=ps$superpose.line$col[1:2],
pch=c(1,5)),
text=list(c("1 field parameter",
"2 field parameter")))
xyplot(mu+sigma2+theta+psi2~teams, outer=TRUE,
data=priors12, groups=pr, pch=c(1,5), key=key.gd,
xlab="Teams", ylab="Prior", as.table=TRUE,
scales=list(y="free", x=list(rot=90)),
strip=strip.custom(
factor.levels=expression(
mu*": prior mean for"~gamma,
sigma^2*": prior variance for"~gamma,
theta*": prior mean for"~delta,
psi^2*": prior variance for"~delta)))
##----------------------------------------------------------------------
## Prior densities.
## str(priors12)
## xtabs(~teams+pr, priors12)
L <- apply(priors12[,-1], MARGIN=1,
FUN=function(pars){
x <- seq(0.2, 1.8, length.out=50)
ygamma <- dnorm(x, mean=pars[1], sd=sqrt(pars[2]))
ydelta <- dnorm(x, mean=pars[3], sd=sqrt(pars[4]))
data.frame(x=x, ygamma=ygamma, ydelta=ydelta, pr=pars[5])
})
names(L) <- rep(levels(priors12$teams), times=2)
L <- ldply(L, .id="teams")
str(L)
'data.frame': 2000 obs. of 5 variables:
$ teams : Factor w/ 20 levels "América/MG","Atlético/GO",..: 1 1 1 1 1 1 1 1 1 1 ...
$ x : num 0.2 0.233 0.265 0.298 0.331 ...
$ ygamma: num 1.03e-06 3.34e-06 1.03e-05 3.05e-05 8.55e-05 ...
$ ydelta: num 6.12e-07 2.00e-06 6.25e-06 1.86e-05 5.27e-05 ...
$ pr : num 1 1 1 1 1 1 1 1 1 1 ...
## Legend for gamma and delta.
key.gd <- list(columns=1,
title="Parameter", cex.title=1.1,
lines=list(
col=ps$superpose.line$col[1:2]),
text=list(c(expression(gamma), expression(delta))))
## Legend for 1 and 2 field parameter models.
key.12 <- list(columns=1,
title="Model", cex.title=1.1,
lines=list(
lty=1:2),
text=list(c("1 field parameter",
"2 field parameter")))
## Create a grob by merging the two keys (latticeExtra).
keys <- mergedTrellisLegendGrob(
a=list(fun=draw.key, args=list(key=key.gd)),
b=list(fun=draw.key, args=list(key=key.12)),
vertical=FALSE)
xyplot(ygamma+ydelta~x|teams, data=subset(L, pr==1),
type="l", as.table=TRUE,
xlab="Parameter space",
ylab="Prior density",
legend=list(top=list(fun=keys))
)+
as.layer(xyplot(ygamma+ydelta~x|teams, data=subset(L, pr==2),
type="l", lty=2))
## About multiple legends:
## https://procomun.wordpress.com/2012/02/18/maps_with_r_1/
##----------------------------------------------------------------------
## Number of teams and number of rounds.
nteams <- nlevels(db$home)
nrounds <- length(unique(db$rod))
c(nteams, nrounds)
[1] 20 38
## gamma: teams attacking strength.
## delta: teams deffensive strength.
## omega: home-field advantage parameter.
gamma <- matrix(numeric(), nrow=nteams, ncol=nrounds+1)
delta <- gamma
omega <- numeric(nrounds)
## To store values in each match inside a round.
lambdaH <- matrix(numeric(), nrow=nteams/2, ncol=nrounds)
lambdaA <- lambdaH
i <- 1
gamma[,i] <- priors1$mu
delta[,i] <- priors1$theta
omega[i] <- attr(priors1, "hf")$omega$mu0
## Fitting for each round.
for(i in 1:nrounds){
##-------------------------------------------
## Subset each round (10 matches).
##
dbi <- subset(dc, rod==i)
##
##-------------------------------------------
## Model matrices, which teams confronted each other.
##
Xh <- model.matrix(~-1+home, data=dbi)
Xa <- model.matrix(~-1+away, data=dbi)
##
##-------------------------------------------
## Columns of Home and Away teams.
##
posh <- apply(apply(Xh, MARGIN=1, FUN="==", 1), MARGIN=2, FUN=which)
posa <- apply(apply(Xa, MARGIN=1, FUN="==", 1), MARGIN=2, FUN=which)
##
##-------------------------------------------
## Lambda values.
##
lambdaH[,i] <- as.vector(
exp(Xh%*%gamma[,i]-Xa%*%delta[,i]+omega[i])
)
lambdaA[,i] <- as.vector(
exp(Xa%*%gamma[,i]-Xh%*%delta[,i])
)
##
##-------------------------------------------
## Goals expected and goals scored.
##
## cbind(goalRateHome=lambdaA[,i], goalRateAway=lambdaB[,i],
## goalScorHome=dbi$x, goalScorAway=dbi$y)
##
##-------------------------------------------
## Updating priors for the next round.
##
gamma[posh,i+1] <-
Xh%*%gamma[,i]+Xh%*%priors1$sigma2*(dbi$h-lambdaH[,i])
gamma[posa,i+1] <-
Xa%*%gamma[,i]+Xa%*%priors1$sigma2*(dbi$a-lambdaA[,i])
delta[posh,i+1] <-
Xh%*%delta[,i]-Xh%*%priors1$psi2*(dbi$h-lambdaA[,i])
delta[posa,i+1] <-
Xa%*%delta[,i]-Xa%*%priors1$psi2*(dbi$h-lambdaH[,i])
omega[i+1] <-
omega[i]+attr(priors1, "hf")$omega$sigma20*
sum(dbi$h-lambdaH[,i])
}
gamma <- gamma[,-1]
delta <- delta[,-1]
omega <- omega[-1]
##----------------------------------------------------------------------
## Series for all teams parameters along the rounds.
dd <- melt(data=as.data.frame(delta), value.name="delta")
dd <- cbind(
melt(data=cbind(team=levels(dc$home), as.data.frame(gamma)),
id.vars="team", variable.name="rod", value.name ="gamma"),
delta=dd$delta)
dd$rod <- as.integer(dd$rod)
str(dd)
'data.frame': 760 obs. of 4 variables:
$ team : Factor w/ 20 levels "América/MG","Atlético/GO",..: 1 2 3 4 5 6 7 8 9 10 ...
$ rod : int 1 1 1 1 1 1 1 1 1 1 ...
$ gamma: num 1.005 1.006 1.014 0.955 0.988 ...
$ delta: num 1.002 1.049 0.986 0.999 0.99 ...
sc <- list(x=list(at=c(1,10,20,30,38)))
## Legend for gamma and delta.
key.gd <- list(columns=2,
title="Parameter", cex.title=1.1,
type="o", divide=1,
lines=list(
col=ps$superpose.line$col[1:2],
pch=ps$superpose.symbol$pch[1:2]),
text=list(c(expression(gamma), expression(delta))))
## All teams.
xyplot(gamma+delta~rod|team, data=dd, type="l",
as.table=TRUE,
scales=sc,
key=key.gd[c(-4,-5)],
xlab="Championship round",
ylab="Parameter value")
## Home field parameter.
xyplot(omega~seq_along(omega), type="b",
scales=sc,
xlab="Championship round",
ylab="Home field advantage parameter")
##----------------------------------------------------------------------
## Select som teams to a depper view, 2 bests and 2 worsts.
sel <- levels(dd$team)[c(1,5,9,20)]
de <- droplevels(subset(dd, team%in%sel))
## Selected teams.
xyplot(gamma+delta~rod|team,
data=de, type="o",
as.table=TRUE,
scales=sc,
key=key.gd,
xlab="Championship round",
ylab="Parameter value")
key.tm <- list(columns=2,
title="Teams", cex.title=1.1,
type="o", divide=1,
lines=list(
col=ps$superpose.line$col[1:4],
pch=ps$superpose.symbol$pch[1:4]),
text=list(levels(de$team)))
xyplot(gamma+delta~rod, outer=TRUE, groups=team,
data=de, type="o",
as.table=TRUE,
scales=sc,
key=key.tm,
strip=strip.custom(factor.levels=key.gd$text[[1]]),
xlab="Championship round",
ylab="Parameter value")
##----------------------------------------------------------------------
## Rating series.
## Rating along the matches.
dd$rating <- with(dd, gamma+delta)
de$rating <- with(de, gamma+delta)
xyplot(rating~rod|team,
data=dd, type="o", cex=0.8,
as.table=TRUE,
scales=sc,
xlab="Championship round",
ylab=expression("Rating"~(gamma+delta)))
xyplot(rating~rod, groups=team,
data=de, type="o", cex=0.8,
as.table=TRUE,
scales=sc,
key=key.tm,
strip=strip.custom(factor.levels=key.gd$text[[1]]),
xlab="Championship round",
ylab=expression("Rating"~(gamma+delta)))
##----------------------------------------------------------------------
## Average rating.
rtm <- arrange(aggregate(rating~team, data=dd, FUN=mean), -rating)
names(rtm)[2] <- "averating"
## Final rating.
frat <- subset(dd, rod==max(dd$rod))
names(frat)[5] <- "finrating"
## Final, average rating and rank.
fr <- arrange(merge(merge(pl11, frat, by="team"),
rtm, by="team"), -finrating)
fr
team pts pos rod gamma delta finrating averating
1 Corinthians 71 1.0 38 0.9126470 1.0562906 1.968938 2.033645
2 Figueirense 58 7.0 38 0.8638266 1.0982144 1.962041 1.910578
3 Fluminense 63 3.0 38 1.1214535 0.8325354 1.953989 1.936301
4 Internacional 60 5.0 38 0.9632204 0.9715182 1.934739 2.038818
5 Vasco 69 2.0 38 1.0051561 0.8989380 1.904094 1.969309
6 Palmeiras 50 11.0 38 0.7946274 1.1034982 1.898126 1.929364
7 Flamengo 61 4.0 38 0.9685208 0.9193115 1.887832 1.971458
8 Atlético/GO 48 12.5 38 0.9454246 0.8925307 1.837955 1.953604
9 São Paulo 59 6.0 38 0.9929939 0.8430417 1.836036 2.026626
10 Bahia 46 14.0 38 0.8069065 1.0079671 1.814874 1.897562
11 Coritiba 57 8.0 38 0.9673316 0.8385369 1.805868 1.906328
12 Santos 53 10.0 38 1.0149936 0.7887464 1.803740 1.890617
13 Cruzeiro 43 16.0 38 0.9073711 0.8478226 1.755194 1.948746
14 Grêmio 48 12.5 38 0.9542689 0.7577612 1.712030 1.842851
15 Botafogo 56 9.0 38 0.9142141 0.7732264 1.687441 1.928350
16 Ceará 39 18.0 38 0.8872581 0.7888507 1.676109 1.798350
17 Atlético/MG 45 15.0 38 0.9412810 0.7102926 1.651574 1.847031
18 Avaí 31 20.0 38 0.8626881 0.7620425 1.624731 1.798487
19 Atlético/PR 41 17.0 38 0.7590013 0.8656251 1.624626 1.801613
20 América/MG 37 19.0 38 0.9733991 0.6462534 1.619653 1.799950
xyplot(finrating+averating~pts, data=fr,
auto.key=TRUE, type=c("p","smooth"))
splom(fr[,c("pts","finrating","averating")],
type=c("p","smooth"))
cor(fr[,c("pts","finrating","averating")],
method="spearman")
pts finrating averating
pts 1.0000000 0.8559609 0.8078225
finrating 0.8559609 1.0000000 0.7879699
averating 0.8078225 0.7879699 1.0000000
## Como calcular as probabilidades de ganhar, empatar e perder?
##-----------------------------------------------------------------------------
## Who have more chance to gain, Palmeiras or Corinthians?
prob <- function(gh, ga, theta, hf=FALSE){
## 1: attack home
## 2: defense home
## 3: attack away
## 4: defense away
## 5: home field advantage
if(hf){
pa <- dpois(gh, lambda=exp(theta[5]+theta[1]-theta[4]))
} else {
pa <- dpois(gh, lambda=exp(theta[1]-theta[4]))
}
pb <- dpois(ga, lambda=exp(theta[3]-theta[2]))
pa*pb
}
Prob <- Vectorize(prob, c("gh", "ga"))
rm("prob")
i <- c("Avaí", "Corinthians")
i <- rev(i)
sel <- subset(fr, team%in%i,
select=c("team", "gamma","delta"))
sel <- split(sel, f=as.character(sel$team))
sel <- sel[i]
theta <- unlist(lapply(sel, "[", -1))
theta <- c(theta, omega=tail(omega, 1))
theta
Corinthians.gamma Corinthians.delta Avaí.gamma
0.9126470 1.0562906 0.8626881
Avaí.delta omega
0.7620425 0.5051326
g <- 0:10
## pla <- outer(g, g, function(x, y) Prob(gh=x, ga=y, theta))
pla <- outer(g, g, function(x, y) Prob(gh=x, ga=y, theta, hf=TRUE))
sum(pla) ## must be equal to 1.
[1] 0.9999941
res <- c(homeWin=sum(pla[lower.tri(pla)]),
draw=sum(diag(pla)),
awayWin=sum(pla[upper.tri(pla)]))
barchart(res~names(res), main=paste(i, collapse=" vs "),
horizontal=FALSE, stack=FALSE, origin=0,
xlab="Outcome of the match",
ylab="Predicted probability")
## The outcome with highest probability.
m <- which(pla==max(pla), arr.ind=TRUE); m
row col
[1,] 2 1
##----------------------------------------------------------------------
## Combines team and round in a new variable.
dg <- dc
dg <- transform(dg,
hr=paste(home, rod, sep="."),
ar=paste(away, rod, sep="."))
dd$tr <- with(dd, paste(team, rod, sep="."))
dg <- merge(dg,
subset(dd, select=c("gamma","delta","tr")),
by.x="hr", by.y="tr")
names(dg)[ncol(dg)-1:0] <- c("hgamma", "hdelta")
dg <- merge(dg,
subset(dd, select=c("gamma","delta","tr")),
by.x="ar", by.y="tr")
names(dg)[ncol(dg)-1:0] <- c("agamma", "adelta")
omg <- data.frame(omega=omega, rod=seq_along(omega))
dg <- merge(dg, omg, by="rod")
dg <- arrange(dg, dat)
str(dg)
'data.frame': 380 obs. of 17 variables:
$ rod : int 1 1 1 1 1 1 1 1 1 1 ...
$ ar : Factor w/ 380 levels "América/MG.10",..: 77 58 286 362 343 20 96 191 115 153 ...
$ hr : Factor w/ 380 levels "América/MG.1",..: 229 39 324 134 248 172 1 210 305 267 ...
$ dat : POSIXct, format: "2011-05-21 00:00:00" ...
$ home : Factor w/ 20 levels "América/MG","Atlético/GO",..: 13 3 18 8 14 10 1 12 17 15 ...
$ away : Factor w/ 20 levels "América/MG","Atlético/GO",..: 5 4 16 20 19 2 6 11 7 9 ...
$ pla : chr "4x0" "3x0" "1x1" "1x3" ...
$ yea : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
$ h : int 4 3 1 1 0 0 2 1 1 1 ...
$ a : int 0 0 1 3 2 1 1 0 0 2 ...
$ d : int 4 3 0 2 2 1 1 1 1 1 ...
$ s : int 4 3 2 4 2 1 3 1 1 3 ...
$ hgamma: num 0.962 1.014 0.983 0.988 1.013 ...
$ hdelta: num 0.972 0.986 1.025 1.049 1.051 ...
$ agamma: num 0.988 0.955 0.993 1.047 1.025 ...
$ adelta: num 0.99 0.999 1.054 1.045 1.07 ...
$ omega : num 0.652 0.652 0.652 0.652 0.652 ...
##----------------------------------------------------------------------
## HomeWin Draw AwayWin matrix.
hda <- with(dg, {
d <- h-a
draw <- d==0
winH <- d>0
## winA <- !draw & !winH
winA <- !(draw | winH)
x <- cbind(winH, draw, winA)
return(1*x)
})
## Estimated probability of HomeWin Draw AwayWin matrix.
g <- 0:10
ephda <- t(apply(dg[,13:17], MARGIN=1,
FUN=function(i){
pla <- outer(g, g,
function(x, y){
Prob(gh=x, ga=y, i, hf=TRUE)
})
res <- c(homeWin=sum(pla[lower.tri(pla)]),
draw=sum(diag(pla)),
awayWin=sum(pla[upper.tri(pla)]))
return(res)
}))
## head(ephda)
## Per match DeFinetti Index.
dfi <- rowSums((ephda-hda)^2)
histogram(~dfi|factor(dg$rod),
as.table=TRUE,
xlab="DeFinetti Index")
xyplot(dfi~rod, data=dg,
xlab="Round", ylab="DeFinetti Index")
## Average DeFinetti Index.
mean(dfi)
[1] 0.6162193
##----------------------------------------------------------------------
## Saving the results of MAP-1 approach.
map1 <- list(dd=dd, fr=fr, omega=omega, dg=dg, dfi=dfi)
##----------------------------------------------------------------------
## Number of teams and number of rounds.
nteams <- nlevels(db$home)
nrounds <- length(unique(db$rod))
c(nteams, nrounds)
[1] 20 38
## gamma: teams attacking strength.
## delta: teams deffensive strength.
## omega: home-field advantage parameter.
gamma <- matrix(numeric(), nrow=nteams, ncol=nrounds+1)
delta <- gamma
omega <- numeric(nrounds)
kappa <- omega
## To store values in each match inside a round.
lambdaH <- matrix(numeric(), nrow=nteams/2, ncol=nrounds)
lambdaA <- lambdaH
i <- 1
gamma[,i] <- priors2$mu
delta[,i] <- priors2$theta
omega[i] <- attr(priors2, "hf")$omega$mu0
kappa[i] <- attr(priors2, "hf")$kappa$theta0
## Fitting for each round.
for(i in 1:nrounds){
##-------------------------------------------
## Subset each round (10 matches).
##
dbi <- subset(dc, rod==i)
##
##-------------------------------------------
## Model matrices, which teams confronted each other.
##
Xh <- model.matrix(~-1+home, data=dbi)
Xa <- model.matrix(~-1+away, data=dbi)
##
##-------------------------------------------
## Columns of Home and Away teams.
##
posh <- apply(apply(Xh, MARGIN=1, FUN="==", 1), MARGIN=2, FUN=which)
posa <- apply(apply(Xa, MARGIN=1, FUN="==", 1), MARGIN=2, FUN=which)
##
##-------------------------------------------
## Lambda values.
##
lambdaH[,i] <- as.vector(
exp(Xh%*%gamma[,i]-Xa%*%delta[,i]+omega[i])
)
lambdaA[,i] <- as.vector(
exp(Xa%*%gamma[,i]-Xh%*%delta[,i])
)
##
##-------------------------------------------
## Goals expected and goals scored.
##
## cbind(goalRateHome=lambdaA[,i], goalRateAway=lambdaB[,i],
## goalScorHome=dbi$x, goalScorAway=dbi$y)
##
##-------------------------------------------
## Updating priors for the next round.
##
gamma[posh,i+1] <-
Xh%*%gamma[,i]+Xh%*%priors2$sigma2*(dbi$h-lambdaH[,i])
gamma[posa,i+1] <-
Xa%*%gamma[,i]+Xa%*%priors2$sigma2*(dbi$a-lambdaA[,i])
delta[posh,i+1] <-
Xh%*%delta[,i]-Xh%*%priors2$psi2*(dbi$h-lambdaA[,i])
delta[posa,i+1] <-
Xa%*%delta[,i]-Xa%*%priors2$psi2*(dbi$h-lambdaH[,i])
omega[i+1] <-
omega[i]+attr(priors2, "hf")$omega$sigma20*
sum(dbi$h-lambdaH[,i])
kappa[i+1] <-
kappa[i]-attr(priors2, "hf")$kappa$psi20*
sum(dbi$a-lambdaA[,i])
}
gamma <- gamma[,-1]
delta <- delta[,-1]
omega <- omega[-1]
kappa <- kappa[-1]
##----------------------------------------------------------------------
## Series for all teams parameters along the rounds.
dd <- melt(data=as.data.frame(delta), value.name="delta")
dd <- cbind(
melt(data=cbind(team=levels(dc$home), as.data.frame(gamma)),
id.vars="team", variable.name="rod", value.name ="gamma"),
delta=dd$delta)
dd$rod <- as.integer(dd$rod)
str(dd)
'data.frame': 760 obs. of 4 variables:
$ team : Factor w/ 20 levels "América/MG","Atlético/GO",..: 1 2 3 4 5 6 7 8 9 10 ...
$ rod : int 1 1 1 1 1 1 1 1 1 1 ...
$ gamma: num 1.011 1.054 1.016 0.972 0.993 ...
$ delta: num 0.986 0.977 0.953 0.984 0.965 ...
sc <- list(x=list(at=c(1,10,20,30,38)))
## Legend for gamma and delta.
key.gd <- list(columns=2,
title="Parameter", cex.title=1.1,
type="o", divide=1,
lines=list(
col=ps$superpose.line$col[1:2],
pch=ps$superpose.symbol$pch[1:2]),
text=list(c(expression(gamma), expression(delta))))
## All teams.
xyplot(gamma+delta~rod|team, data=dd, type="l",
as.table=TRUE,
scales=sc,
key=key.gd[c(-4,-5)],
xlab="Championship round",
ylab="Parameter value")
## Legend for omega and kappa.
key.ok <- list(columns=2,
title="Parameter", cex.title=1.1,
type="o", divide=1,
lines=list(
col=ps$superpose.line$col[1:2],
pch=ps$superpose.symbol$pch[1:2]),
text=list(c(expression(omega), expression(kappa))))
## Field parameters.
xyplot(omega+kappa~seq_along(omega), type="b",
scales=sc, key=key.ok,
xlab="Championship round",
ylab="Field parameters")
##----------------------------------------------------------------------
## Select som teams to a depper view, 2 bests and 2 worsts.
sel <- levels(dd$team)[c(1,5,9,20)]
de <- droplevels(subset(dd, team%in%sel))
## Selected teams.
xyplot(gamma+delta~rod|team,
data=de, type="o",
as.table=TRUE,
scales=sc,
key=key.gd,
xlab="Championship round",
ylab="Parameter value")
key.tm <- list(columns=2,
title="Teams", cex.title=1.1,
type="o", divide=1,
lines=list(
col=ps$superpose.line$col[1:4],
pch=ps$superpose.symbol$pch[1:4]),
text=list(levels(de$team)))
xyplot(gamma+delta~rod, outer=TRUE, groups=team,
data=de, type="o",
as.table=TRUE,
scales=sc,
key=key.tm,
strip=strip.custom(factor.levels=key.gd$text[[1]]),
xlab="Championship round",
ylab="Parameter value")
##----------------------------------------------------------------------
## Rating series.
## Rating along the matches.
dd$rating <- with(dd, gamma+delta)
de$rating <- with(de, gamma+delta)
xyplot(rating~rod|team,
data=dd, type="o", cex=0.8,
as.table=TRUE,
scales=sc,
xlab="Championship round",
ylab=expression("Rating"~(gamma+delta)))
xyplot(rating~rod, groups=team,
data=de, type="o", cex=0.8,
as.table=TRUE,
scales=sc,
key=key.tm,
strip=strip.custom(factor.levels=key.gd$text[[1]]),
xlab="Championship round",
ylab=expression("Rating"~(gamma+delta)))
##----------------------------------------------------------------------
## Average rating.
rtm <- arrange(aggregate(rating~team, data=dd, FUN=mean), -rating)
names(rtm)[2] <- "averating"
## Final rating.
frat <- subset(dd, rod==max(dd$rod))
names(frat)[5] <- "finrating"
## Final, average rating and rank.
fr <- arrange(merge(merge(pl11, frat, by="team"),
rtm, by="team"), -finrating)
fr
team pts pos rod gamma delta finrating averating
1 Figueirense 58 7.0 38 0.8331871 1.1189569 1.952144 1.897312
2 Corinthians 71 1.0 38 0.8998578 1.0412194 1.941077 2.034678
3 Fluminense 63 3.0 38 1.1398870 0.7955653 1.935452 1.885016
4 Flamengo 61 4.0 38 1.0109584 0.8846871 1.895646 2.020284
5 Vasco 69 2.0 38 0.9993262 0.8873846 1.886711 1.961439
6 Internacional 60 5.0 38 0.9331674 0.9481169 1.881284 2.013915
7 Palmeiras 50 11.0 38 0.7885027 1.0905794 1.879082 1.920158
8 São Paulo 59 6.0 38 0.9846371 0.8316986 1.816336 2.019588
9 Atlético/GO 48 12.5 38 0.9490610 0.8464453 1.795506 1.920805
10 Santos 53 10.0 38 1.0097980 0.7725788 1.782377 1.881937
11 Coritiba 57 8.0 38 0.9390714 0.8111666 1.750238 1.871157
12 Bahia 46 14.0 38 0.7610293 0.9880016 1.749031 1.828892
13 Cruzeiro 43 16.0 38 0.9100550 0.7926209 1.702676 1.945772
14 Grêmio 48 12.5 38 0.9556129 0.7005182 1.656131 1.807381
15 Botafogo 56 9.0 38 0.8913755 0.7290012 1.620377 1.902378
16 Ceará 39 18.0 38 0.8608879 0.7560238 1.616912 1.737877
17 Atlético/MG 45 15.0 38 0.9363727 0.6484578 1.584830 1.798111
18 Atlético/PR 41 17.0 38 0.7363706 0.8243455 1.560716 1.771267
19 Avaí 31 20.0 38 0.8382000 0.7189453 1.557145 1.753607
20 América/MG 37 19.0 38 0.9738472 0.5753004 1.549148 1.755183
xyplot(finrating+averating~pts, data=fr,
auto.key=TRUE, type=c("p","smooth"))
splom(fr[,c("pts","finrating","averating")],
type=c("p","smooth"))
cor(fr[,c("pts","finrating","averating")],
method="spearman")
pts finrating averating
pts 1.0000000 0.8867996 0.7912750
finrating 0.8867996 1.0000000 0.7714286
averating 0.7912750 0.7714286 1.0000000
## Como calcular as probabilidades de ganhar, empatar e perder?
##-----------------------------------------------------------------------------
## Who have more chance to gain, Palmeiras or Corinthians?
prob <- function(gh, ga, theta, hf=FALSE){
## 1: attack home
## 2: defense home
## 3: attack away
## 4: defense away
## 5: home field advantage
## 6: away field disadvantage
if(hf){
pa <- dpois(gh, lambda=exp(theta[1]-theta[4]+theta[5]))
pb <- dpois(ga, lambda=exp(theta[3]-theta[2]-theta[6]))
} else {
pa <- dpois(gh, lambda=exp(theta[1]-theta[4]))
pb <- dpois(ga, lambda=exp(theta[3]-theta[2]))
}
pa*pb
}
Prob <- Vectorize(prob, c("gh", "ga"))
rm("prob")
i <- c("Avaí", "Corinthians")
i <- rev(i)
sel <- subset(fr, team%in%i,
select=c("team", "gamma","delta"))
sel <- split(sel, f=as.character(sel$team))
sel <- sel[i]
theta <- unlist(lapply(sel, "[", -1))
theta <- c(theta, omega=tail(omega, 1), kappa=tail(kappa, 1))
theta
Corinthians.gamma Corinthians.delta Avaí.gamma
0.8998578 1.0412194 0.8382000
Avaí.delta omega kappa
0.7189453 0.4777623 0.8272170
g <- 0:10
## pla <- outer(g, g, function(x, y) Prob(gh=x, ga=y, theta))
pla <- outer(g, g, function(x, y) Prob(gh=x, ga=y, theta, hf=TRUE))
sum(pla) ## must be equal to 1.
[1] 0.999994
res <- c(homeWin=sum(pla[lower.tri(pla)]),
draw=sum(diag(pla)),
awayWin=sum(pla[upper.tri(pla)]))
barchart(res~names(res), main=paste(i, collapse=" vs "),
horizontal=FALSE, stack=FALSE, origin=0,
xlab="Outcome of the match",
ylab="Predicted probability")
## The outcome with highest probability.
m <- which(pla==max(pla), arr.ind=TRUE); m
row col
[1,] 2 1
##----------------------------------------------------------------------
## Combines team and round in a new variable.
dg <- dc
dg <- transform(dg,
hr=paste(home, rod, sep="."),
ar=paste(away, rod, sep="."))
dd$tr <- with(dd, paste(team, rod, sep="."))
dg <- merge(dg,
subset(dd, select=c("gamma","delta","tr")),
by.x="hr", by.y="tr")
names(dg)[ncol(dg)-1:0] <- c("hgamma", "hdelta")
dg <- merge(dg,
subset(dd, select=c("gamma","delta","tr")),
by.x="ar", by.y="tr")
names(dg)[ncol(dg)-1:0] <- c("agamma", "adelta")
omg <- data.frame(omega=omega, kappa=kappa, rod=seq_along(omega))
dg <- merge(dg, omg, by="rod")
dg <- arrange(dg, dat)
str(dg)
'data.frame': 380 obs. of 18 variables:
$ rod : int 1 1 1 1 1 1 1 1 1 1 ...
$ ar : Factor w/ 380 levels "América/MG.10",..: 77 58 286 362 343 20 96 191 115 153 ...
$ hr : Factor w/ 380 levels "América/MG.1",..: 229 39 324 134 248 172 1 210 305 267 ...
$ dat : POSIXct, format: "2011-05-21 00:00:00" ...
$ home : Factor w/ 20 levels "América/MG","Atlético/GO",..: 13 3 18 8 14 10 1 12 17 15 ...
$ away : Factor w/ 20 levels "América/MG","Atlético/GO",..: 5 4 16 20 19 2 6 11 7 9 ...
$ pla : chr "4x0" "3x0" "1x1" "1x3" ...
$ yea : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
$ h : int 4 3 1 1 0 0 2 1 1 1 ...
$ a : int 0 0 1 3 2 1 1 0 0 2 ...
$ d : int 4 3 0 2 2 1 1 1 1 1 ...
$ s : int 4 3 2 4 2 1 3 1 1 3 ...
$ hgamma: num 1.134 1.016 0.984 0.975 0.972 ...
$ hdelta: num 0.925 0.953 1.025 1.018 1.046 ...
$ agamma: num 0.993 0.972 0.99 1.046 1.024 ...
$ adelta: num 0.965 0.984 1.033 1.044 1.067 ...
$ omega : num 0.617 0.617 0.617 0.617 0.617 ...
$ kappa : num 0.9 0.9 0.9 0.9 0.9 ...
##----------------------------------------------------------------------
## HomeWin Draw AwayWin matrix.
hda <- with(dg, {
d <- h-a
draw <- d==0
winH <- d>0
## winA <- !draw & !winH
winA <- !(draw | winH)
x <- cbind(winH, draw, winA)
return(1*x)
})
## Estimated probability of HomeWin Draw AwayWin matrix.
g <- 0:10
ephda <- t(apply(dg[,13:18], MARGIN=1,
FUN=function(i){
pla <- outer(g, g,
function(x, y){
Prob(gh=x, ga=y, i, hf=TRUE)
})
res <- c(homeWin=sum(pla[lower.tri(pla)]),
draw=sum(diag(pla)),
awayWin=sum(pla[upper.tri(pla)]))
return(res)
}))
## head(ephda)
## Per match DeFinetti Index.
dfi <- rowSums((ephda-hda)^2)
histogram(~dfi|factor(dg$rod),
as.table=TRUE,
xlab="DeFinetti Index")
xyplot(dfi~rod, data=dg,
xlab="Round", ylab="DeFinetti Index")
## Average DeFinetti Index.
mean(dfi)
[1] 0.682659
##----------------------------------------------------------------------
## Saving the results of MAP-2 approach.
map2 <- list(dd=dd, fr=fr, omega=omega, kappa=kappa, dg=dg, dfi=dfi)
##----------------------------------------------------------------------
## fp: number of field parameters.
map12 <- rbind(cbind(map1$dd, fp=1),
cbind(map2$dd, fp=2))
## Select som teams to a depper view, 2 bests and 2 worsts.
sel <- levels(map12$team)[c(1,5,9,20)]
de <- droplevels(subset(map12, team%in%sel))
key.map <- list(columns=2,
title="Model", cex.title=1.1,
type="o", divide=1,
lines=list(
col=ps$superpose.line$col[1:2],
pch=ps$superpose.symbol$pch[1:2]),
text=list(c("MAP-1","MAP-2")))
sc <- list(x=list(at=c(1,10,20,30,38)),
y=list(relation="free"))
useOuterStrips(combineLimits(
xyplot(gamma+delta~rod|team, outer=TRUE, groups=fp,
data=de, type="o",
as.table=TRUE,
scales=sc,
key=key.map,
xlab="Championship round",
ylab="Parameter value")),
strip.left=strip.custom(factor.levels=key.gd$text[[1]])
)