# DirichletReg – Article Sources

#### 2021-05-18

Full R-Code for:

Maier, M. J. (2014). DirichletReg: Dirichlet Regression for Compositional Data in R. Research Report Series/Department of Statistics and Mathematics, 125. WU Vienna University of Economics and Business, Vienna. https://epub.wu.ac.at/4077/

## 4. Application examples

### 4.1. The Arctic lake (common parametrization)

library("DirichletReg")
##    sand  silt  clay depth
## 1 0.775 0.195 0.030  10.4
## 2 0.719 0.249 0.032  11.7
## 3 0.507 0.361 0.132  12.8
## 4 0.522 0.409 0.066  13.0
## 5 0.700 0.265 0.035  15.7
## 6 0.665 0.322 0.013  16.3

AL <- DR_data(ArcticLake[, 1:3])
## Warning in DR_data(ArcticLake[, 1:3]): not all rows sum up to 1 => normalization
## forced

AL[1:6, ]
##        sand      silt      clay
## 1 0.7750000 0.1950000 0.0300000
## 2 0.7190000 0.2490000 0.0320000
## 3 0.5070000 0.3610000 0.1320000
## 4 0.5235707 0.4102307 0.0661986
## 5 0.7000000 0.2650000 0.0350000
## 6 0.6650000 0.3220000 0.0130000

Code for Fig. 1 (left):

plot(AL, cex = 0.5, a2d = list(colored = FALSE, c.grid = FALSE))

Code for Fig. 1 (right):

plot(rep(ArcticLake$depth, 3), as.numeric(AL), pch = 21, bg = rep(c("#E495A5", "#86B875", "#7DB0DD"), each = 39), xlab = "Depth (m)", ylab = "Proportion", ylim = 0:1) lake1 <- DirichReg(AL ~ depth, ArcticLake) lake1 ## Call: ## DirichReg(formula = AL ~ depth, data = ArcticLake) ## using the common parametrization ## ## Log-likelihood: 101.4 on 6 df (104 BFGS + 1 NR Iterations) ## ## ----------------------------------------- ## Coefficients for variable no. 1: sand ## (Intercept) depth ## 0.11662 0.02335 ## ----------------------------------------- ## Coefficients for variable no. 2: silt ## (Intercept) depth ## -0.31060 0.05557 ## ----------------------------------------- ## Coefficients for variable no. 3: clay ## (Intercept) depth ## -1.1520 0.0643 ## ----------------------------------------- coef(lake1) ##$sand
## (Intercept)       depth
##  0.11662480  0.02335114
##
## $silt ## (Intercept) depth ## -0.31059591 0.05556745 ## ##$clay
## (Intercept)       depth
## -1.15195642  0.06430175

lake2 <- update(lake1, . ~ . + I(depth^2) | . + I(depth^2) | . + I(depth^2))
anova(lake1, lake2)
## Analysis of Deviance Table
##
## Model 1: DirichReg(formula = AL ~ depth, data = ArcticLake)
## Model 2: DirichReg(formula = AL ~ depth + I(depth^2) | depth + I(depth^2) |
##   depth + I(depth^2), data = ArcticLake)
##
##         Deviance N. par Difference df Pr(>Chi)
## Model 1  -202.74      6
## Model 2  -217.99      9     15.254  3 0.001612 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(lake2)
## Call:
## DirichReg(formula = AL ~ depth + I(depth^2) | depth + I(depth^2) | depth +
## I(depth^2), data = ArcticLake)
##
## Standardized Residuals:
##           Min       1Q   Median      3Q     Max
## sand  -1.7647  -0.7080  -0.1786  0.9598  3.0460
## silt  -1.1379  -0.5330  -0.1546  0.2788  1.5604
## clay  -1.7661  -0.6583  -0.0454  0.6584  2.0152
##
## ------------------------------------------------------------------
## Beta-Coefficients for variable no. 1: sand
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept)  1.4361967  0.8026814   1.789   0.0736 .
## depth       -0.0072382  0.0329433  -0.220   0.8261
## I(depth^2)   0.0001324  0.0002761   0.480   0.6315
## ------------------------------------------------------------------
## Beta-Coefficients for variable no. 2: silt
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.0259705  0.7598827  -0.034   0.9727
## depth        0.0717450  0.0343089   2.091   0.0365 *
## I(depth^2)  -0.0002679  0.0003088  -0.867   0.3857
## ------------------------------------------------------------------
## Beta-Coefficients for variable no. 3: clay
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.7931487  0.7362293  -2.436  0.01487 *
## depth        0.1107906  0.0357705   3.097  0.00195 **
## I(depth^2)  -0.0004872  0.0003308  -1.473  0.14079
## ------------------------------------------------------------------
## Significance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-likelihood: 109 on 9 df (162 BFGS + 2 NR Iterations)
## AIC: -200, BIC: -185
## Number of Observations: 39
## Parametrization: common

Code for Fig. 2:

par(mar = c(4, 4, 4, 4) + 0.1)
plot(rep(ArcticLake$depth, 3), as.numeric(AL), pch = 21, bg = rep(c("#E495A5", "#86B875", "#7DB0DD"), each = 39), xlab = "Depth (m)", ylab = "Proportion", ylim = 0:1, main = "Sediment Composition in an Arctic Lake") Xnew <- data.frame(depth = seq(min(ArcticLake$depth), max(ArcticLake$depth), length.out = 100)) for (i in 1:3) lines(cbind(Xnew, predict(lake2, Xnew)[, i]), col = c("#E495A5", "#86B875", "#7DB0DD")[i], lwd = 2) legend("topleft", legend = c("Sand", "Silt", "Clay"), lwd = 2, col = c("#E495A5", "#86B875", "#7DB0DD"), pt.bg = c("#E495A5", "#86B875", "#7DB0DD"), pch = 21, bty = "n") par(new = TRUE) plot(cbind(Xnew, predict(lake2, Xnew, F, F, T)), lty = "24", type = "l", ylim = c(0, max(predict(lake2, Xnew, F, F, T))), axes = F, ann = F, lwd = 2) axis(4) mtext(expression(paste("Precision (", phi, ")", sep = "")), 4, line = 3) legend("top", legend = c(expression(hat(mu[c] == hat(alpha)[c]/hat(alpha)[0])), expression(hat(phi) == hat(alpha)[0])), lty = c(1, 2), lwd = c(3, 2), bty = "n") AL <- ArcticLake AL$AL <- DR_data(ArcticLake[, 1:3])
## Warning in DR_data(ArcticLake[, 1:3]): not all rows sum up to 1 => normalization
## forced

dd <- range(ArcticLake$depth) X <- data.frame(depth = seq(dd[1], dd[2], length.out = 200)) pp <- predict(DirichReg(AL ~ depth + I(depth^2), AL), X) Code for Fig. 3: plot(AL$AL, cex = 0.1, reset_par = FALSE)
points(toSimplex(AL$AL), pch = 16, cex = 0.5, col = gray(0.5)) lines(toSimplex(pp), lwd = 3, col = c("#6E1D34", "#004E42")[2]) Dols <- log(cbind(ArcticLake[, 2]/ArcticLake[, 1], ArcticLake[, 3]/ArcticLake[, 1])) ols <- lm(Dols ~ depth + I(depth^2), ArcticLake) p2 <- predict(ols, X) p2m <- exp(cbind(0, p2[, 1], p2[, 2]))/rowSums(exp(cbind(0, p2[, 1], p2[, 2]))) lines(toSimplex(p2m), lwd = 3, col = c("#6E1D34", "#004E42")[1], lty = "21") ### 4.2. Blood samples (alternative parametrization) Bld <- BloodSamples Bld$Smp <- DR_data(Bld[, 1:4])
## Warning in DR_data(Bld[, 1:4]): not all rows sum up to 1 => normalization forced

blood1 <- DirichReg(Smp ~ Disease | 1, Bld, model = "alternative", base = 3)
blood2 <- DirichReg(Smp ~ Disease | Disease, Bld, model = "alternative", base = 3)
anova(blood1, blood2)
## Analysis of Deviance Table
##
## Model 1: DirichReg(formula = Smp ~ Disease | 1, data = Bld, model =
##   "alternative", base = 3)
## Model 2: DirichReg(formula = Smp ~ Disease | Disease, data = Bld, model =
##   "alternative", base = 3)
##
##         Deviance N. par Difference df Pr(>Chi)
## Model 1  -303.86      7
## Model 2  -304.61      8     0.7587  1   0.3837

summary(blood1)
## Call:
## DirichReg(formula = Smp ~ Disease | 1, data = Bld, model = "alternative", base
## = 3)
##
## Standardized Residuals:
##                  Min       1Q   Median      3Q     Max
## Albumin      -2.1310  -0.9307  -0.1234  0.8149  2.8429
## Pre.Albumin  -1.0687  -0.4054  -0.0789  0.1947  1.5691
## Globulin.A   -2.0503  -1.0392   0.1938  0.7927  2.2393
## Globulin.B   -1.8176  -0.5347   0.1488  0.5115  1.3284
##
## MEAN MODELS:
## ------------------------------------------------------------------
## Coefficients for variable no. 1: Albumin
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  1.11639    0.09935  11.237   <2e-16 ***
## DiseaseB    -0.07002    0.13604  -0.515    0.607
## ------------------------------------------------------------------
## Coefficients for variable no. 2: Pre.Albumin
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   0.5490     0.1082   5.076 3.86e-07 ***
## DiseaseB     -0.1276     0.1493  -0.855    0.393
## ------------------------------------------------------------------
## Coefficients for variable no. 3: Globulin.A
## - variable omitted (reference category) -
## ------------------------------------------------------------------
## Coefficients for variable no. 4: Globulin.B
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   0.4863     0.1094   4.445  8.8e-06 ***
## DiseaseB      0.1819     0.1472   1.236    0.216
## ------------------------------------------------------------------
##
## PRECISION MODEL:
## ------------------------------------------------------------------
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   4.2227     0.1475   28.64   <2e-16 ***
## ------------------------------------------------------------------
## Significance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-likelihood: 151.9 on 7 df (44 BFGS + 1 NR Iterations)
## AIC: -289.9, BIC: -280
## Number of Observations: 30
## Links: Logit (Means) and Log (Precision)
## Parametrization: alternative

Code for Fig.~:

par(mfrow = c(1, 4), mar = c(4, 4, 4, 2) + 0.25)
for (i in 1:4) {
boxplot(Bld$Smp[, i] ~ Bld$Disease, ylim = range(Bld$Smp[, 1:4]), main = paste(names(Bld)[i]), xlab = "Disease Type", ylab = "Proportion") segments(c(-5, 1.5), unique(fitted(blood2)[, i]), c(1.5, 5), unique(fitted(blood2)[, i]), lwd = 2, lty = 2) } alpha <- predict(blood2, data.frame(Disease = factor(c("A", "B"))), F, T, F) L <- sapply(1:2, function(i) ddirichlet(DR_data(Bld[31:36, 1:4]), unlist(alpha[i, ]))) ## Warning in DR_data(Bld[31:36, 1:4]): not all rows sum up to 1 => normalization ## forced ## Warning in DR_data(Bld[31:36, 1:4]): not all rows sum up to 1 => normalization ## forced LP <- L/rowSums(L) dimnames(LP) <- list(paste("C", 1:6), c("A", "B")) print(data.frame(round(LP * 100, 1), pred. = as.factor(ifelse(LP[, 1] > LP[, 2], "==> A", "==> B"))), print.gap = 2) ## A B pred. ## C 1 59.4 40.6 ==> A ## C 2 43.2 56.8 ==> B ## C 3 38.4 61.6 ==> B ## C 4 43.8 56.2 ==> B ## C 5 36.6 63.4 ==> B ## C 6 70.2 29.8 ==> A Code for Fig.~: B2 <- DR_data(BloodSamples[, c(1, 2, 4)]) ## Warning in DR_data(BloodSamples[, c(1, 2, 4)]): not all rows sum up to 1 => ## normalization forced plot(B2, cex = 0.001, reset_par = FALSE) div.col <- colorRampPalette(c("#023FA5", "#c0c0c0", "#8E063B"))(100) # expected values temp <- (alpha/rowSums(alpha))[, c(1, 2, 4)] points(toSimplex(temp/rowSums(temp)), pch = 22, bg = div.col[c(1, 100)], cex = 2, lwd = 0.25) # known values temp <- B2[1:30, ] points(toSimplex(temp/rowSums(temp)), pch = 21, bg = (div.col[c(1, 100)])[BloodSamples$Disease[1:30]],
cex = 0.5, lwd = 0.25)

# unclassified
temp <- B2[31:36, ]
points(toSimplex(temp/rowSums(temp)), pch = 21, bg = div.col[round(100 * LP[, 2],
0)], cex = 1, lwd = 0.5)

legend("topright", bty = "n", legend = c("Disease A", "Disease B", NA, "Expected Values"),
pch = c(21, 21, NA, 22), pt.bg = c(div.col[c(1, 100)], NA, "white"))

### 4.3. Reading skills data (alternative parametrization)

RS <- ReadingSkills
RS$acc <- DR_data(RS$accuracy)
## only one variable in [0, 1] supplied - beta-distribution assumed.
## check this assumption.

RS$dyslexia <- C(RS$dyslexia, treatment)
rs1 <- DirichReg(acc ~ dyslexia * iq | dyslexia * iq, RS, model = "alternative")
rs2 <- DirichReg(acc ~ dyslexia * iq | dyslexia + iq, RS, model = "alternative")
anova(rs1, rs2)
## Analysis of Deviance Table
##
## Model 1: DirichReg(formula = acc ~ dyslexia * iq | dyslexia * iq, data = RS,
##   model = "alternative")
## Model 2: DirichReg(formula = acc ~ dyslexia * iq | dyslexia + iq, data = RS,
##   model = "alternative")
##
##         Deviance N. par Difference df Pr(>Chi)
## Model 1  -133.47      8
## Model 2  -131.80      7     1.6645  1    0.197

Code for Fig.~:

g.ind <- as.numeric(RS$dyslexia) g1 <- g.ind == 1 # normal g2 <- g.ind != 1 # dyslexia par(mar = c(4, 4, 4, 4) + 0.25) plot(accuracy ~ iq, RS, pch = 21, bg = c("#E495A5", "#39BEB1")[3 - g.ind], cex = 1.5, main = "Dyslexic (Red) vs. Control (Green) Group", xlab = "IQ Score", ylab = "Reading Accuracy", xlim = range(ReadingSkills$iq))

x1 <- seq(min(RS$iq[g1]), max(RS$iq[g1]), length.out = 200)
x2 <- seq(min(RS$iq[g2]), max(RS$iq[g2]), length.out = 200)
n <- length(x1)
X <- data.frame(dyslexia = factor(rep(0:1, each = n), levels = 0:1, labels = c("no",
"yes")), iq = c(x1, x2))
pv <- predict(rs2, X, TRUE, TRUE, TRUE)

lines(x1, pv$mu[1:n, 2], col = c("#E495A5", "#39BEB1")[2], lwd = 3) lines(x2, pv$mu[(n + 1):(2 * n), 2], col = c("#E495A5", "#39BEB1")[1], lwd = 3)

a <- RS$accuracy logRa_a <- log(a/(1 - a)) rlr <- lm(logRa_a ~ dyslexia * iq, RS) ols <- 1/(1 + exp(-predict(rlr, X))) lines(x1, ols[1:n], col = c("#AD6071", "#00897D")[2], lwd = 3, lty = 2) lines(x2, ols[(n + 1):(2 * n)], col = c("#AD6071", "#00897D")[1], lwd = 3, lty = 2) ### precision plot par(new = TRUE) plot(x1, pv$phi[1:n], col = c("#6E1D34", "#004E42")[2], lty = "11", type = "l", ylim = c(0,
max(pv$phi)), axes = F, ann = F, lwd = 2, xlim = range(RS$iq))
lines(x2, pv$phi[(n + 1):(2 * n)], col = c("#6E1D34", "#004E42")[1], lty = "11", type = "l", lwd = 2) axis(4) mtext(expression(paste("Precision (", phi, ")", sep = "")), 4, line = 3) legend("topleft", legend = c(expression(hat(mu)), expression(hat(phi)), "OLS"), lty = c(1, 3, 2), lwd = c(3, 2, 3), bty = "n") a <- RS$accuracy
logRa_a <- log(a/(1 - a))
rlr <- lm(logRa_a ~ dyslexia * iq, RS)
summary(rlr)
##
## Call:
## lm(formula = logRa_a ~ dyslexia * iq, data = RS)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -2.66405 -0.37966  0.03687  0.40887  2.50345
##
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)
## (Intercept)      2.8067     0.2822   9.944 2.27e-12 ***
## dyslexiayes     -2.4113     0.4517  -5.338 4.01e-06 ***
## iq               0.7823     0.2992   2.615   0.0125 *
## dyslexiayes:iq  -0.8457     0.4510  -1.875   0.0681 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.2 on 40 degrees of freedom
## Multiple R-squared:  0.6151, Adjusted R-squared:  0.5862
## F-statistic: 21.31 on 3 and 40 DF,  p-value: 2.083e-08

summary(rs2)
## Call:
## DirichReg(formula = acc ~ dyslexia * iq | dyslexia + iq, data = RS, model =
## "alternative")
##
## Standardized Residuals:
##                   Min       1Q   Median      3Q     Max
## 1 - accuracy  -1.5661  -0.8204  -0.5112  0.5211  3.4334
## accuracy      -3.4334  -0.5211   0.5112  0.8204  1.5661
##
## MEAN MODELS:
## ------------------------------------------------------------------
## Coefficients for variable no. 1: 1 - accuracy
## - variable omitted (reference category) -
## ------------------------------------------------------------------
## Coefficients for variable no. 2: accuracy
##                Estimate Std. Error z value Pr(>|z|)
## (Intercept)      1.8649     0.2991   6.235 4.52e-10 ***
## dyslexiayes     -1.4833     0.3029  -4.897 9.74e-07 ***
## iq               1.0676     0.3359   3.178 0.001482 **
## dyslexiayes:iq  -1.1625     0.3452  -3.368 0.000757 ***
## ------------------------------------------------------------------
##
## PRECISION MODEL:
## ------------------------------------------------------------------
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   1.5579     0.3336   4.670 3.01e-06 ***
## dyslexiayes   3.4931     0.5880   5.941 2.83e-09 ***
## iq            1.2291     0.4596   2.674  0.00749 **
## ------------------------------------------------------------------
## Significance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-likelihood: 65.9 on 7 df (56 BFGS + 2 NR Iterations)
## AIC: -117.8, BIC: -105.3
## Number of Observations: 44
## Links: Logit (Means) and Log (Precision)
## Parametrization: alternative

confint(rs2)
##
## 95% Confidence Intervals (original form)
##
## - Beta-Parameters:
## Variable: 1 - accuracy
##   variable omitted
##
## Variable: accuracy
##                   2.5%   Est.   97.5%
## (Intercept)      1.279   1.86   2.451
## dyslexiayes     -2.077  -1.48  -0.890
## iq               0.409   1.07   1.726
## dyslexiayes:iq  -1.839  -1.16  -0.486
##
## - Gamma-Parameters
##               2.5%  Est.  97.5%
## (Intercept)  0.904  1.56   2.21
## dyslexiayes  2.341  3.49   4.65
## iq           0.328  1.23   2.13

confint(rs2, exp = TRUE)
##
## 95% Confidence Intervals (exponentiated)
##
## - Beta-Parameters:
## Variable: 1 - accuracy
##   variable omitted
##
## Variable: accuracy
##                  2.5%  exp(Est.)   97.5%
## (Intercept)     3.592      6.455  11.601
## dyslexiayes     0.125      0.227   0.411
## iq              1.506      2.908   5.618
## dyslexiayes:iq  0.159      0.313   0.615
##
## - Gamma-Parameters
##               2.5%  exp(Est.)   97.5%
## (Intercept)   2.47       4.75    9.13
## dyslexiayes  10.39      32.89  104.12
## iq            1.39       3.42    8.41

Code for Fig.~:

gcol <- c("#E495A5", "#39BEB1")[3 - as.numeric(RS$dyslexia)] tmt <- c(-3, 3) par(mfrow = c(3, 2), cex = 0.8) qqnorm(residuals(rlr, "pearson"), ylim = tmt, xlim = tmt, pch = 21, bg = gcol, main = "Normal Q-Q-Plot: OLS Residuals", cex = 0.75, lwd = 0.5) abline(0, 1, lwd = 2) qqline(residuals(rlr, "pearson"), lty = 2) qqnorm(residuals(rs2, "standardized")[, 2], ylim = tmt, xlim = tmt, pch = 21, bg = gcol, main = "Normal Q-Q-Plot: DirichReg Residuals", cex = 0.75, lwd = 0.5) abline(0, 1, lwd = 2) qqline(residuals(rs2, "standardized")[, 2], lty = 2) plot(ReadingSkills$iq, residuals(rlr, "pearson"), pch = 21, bg = gcol, ylim = c(-3,
3), main = "OLS Residuals", xlab = "IQ", ylab = "Pearson Residuals", cex = 0.75,
lwd = 0.5)
abline(h = 0, lty = 2)
plot(ReadingSkills\$iq, residuals(rs2, "standardized")[, 2], pch = 21, bg = gcol,
ylim = c(-3, 3), main = "DirichReg Residuals", xlab = "IQ", ylab = "Standardized Residuals",
cex = 0.75, lwd = 0.5)
abline(h = 0, lty = 2)

plot(fitted(rlr), residuals(rlr, "pearson"), pch = 21, bg = gcol, ylim = c(-3, 3),
main = "OLS Residuals", xlab = "Fitted", ylab = "Pearson Residuals", cex = 0.75,
lwd = 0.5)
abline(h = 0, lty = 2)
plot(fitted(rs2)[, 2], residuals(rs2, "standardized")[, 2], pch = 21, bg = gcol,
ylim = c(-3, 3), main = "DirichReg Residuals", xlab = "Fitted", ylab = "Standardized Residuals",
cex = 0.75, lwd = 0.5)
abline(h = 0, lty = 2)