devtools::load_all() # if using the rproject dowloaded from the slides
# source("utils-glm.R") # if using a standard setup
library(here)
library(tidyr) # for data manipulation
library(dplyr) # for data manipulation
library(ggplot2) # plotting
library(car) # general utilities
library(effects) # for extracting and plotting effects
library(emmeans) # for marginal effects
dat <- read.csv(here("data", "nwords.csv"))
This dataset nwords.csv
represent a developmental
psychology study investigating the factors that influence children
language development. The dataset includes information about the number
of words in a a language task, and some potential predictors: caregiver
behavior, socioeconomic status, amount of time spent with the child
during a week and the presence of a baby-sitter or not.
child
: numeric variable representing the child ID
number.timebonding
: numeric variable representing the average
hours per week of child-parent activitiesnwords
: numeric variable representing the number of
words produced by the child in a language task.caregiving
: numeric variable representing the
caregiver’s behavior in a parent-child interaction task, measured on a
scale from 0 to 10ses
: categorical variable representing the family
socioeconomic status, with three levels: “Low”, “Middle”, and
“High”glm()
and poisson
family starting from additive models and then adding the
interactions.dat <- read.csv("data/nwords.csv")
str(dat)
## 'data.frame': 150 obs. of 6 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ timebonding: int 28 20 26 23 6 31 23 32 20 13 ...
## $ caregiving : int 9 8 10 10 6 13 11 10 8 5 ...
## $ ses : chr "low" "low" "low" "low" ...
## $ babysitter : chr "no" "yes" "yes" "yes" ...
## $ nwords : int 18 9 31 17 14 47 39 31 24 5 ...
Check for NA
values:
count_na(dat) # from utils-glm.R
## id timebonding caregiving ses babysitter nwords
## 0 0 0 0 0 0
# equivalent to sapply(dat, function(x) sum(is.na(x)))
Everything seems good, we do not have NA
values.
Let’s convert categorical variables into factor setting the appropriate order:
ses
: low, middle, highbabysitter
: no, yesdat$ses <- factor(dat$ses, levels = c("low", "middle", "high"))
dat$babysitter <- factor(dat$babysitter, levels = c("no", "yes"))
levels(dat$ses)
## [1] "low" "middle" "high"
levels(dat$babysitter)
## [1] "no" "yes"
timebonding
and caregiving
are the two
numerical predictors. Given that we are going to fit main effects and
interactions and we want to interpret the intercept and test the
interaction on a meaningful point, we create two centered versions of
these variables:
dat$timebonding0 <- dat$timebonding - mean(dat$timebonding)
dat$caregiving0 <- dat$caregiving - mean(dat$caregiving)
Then we can see what happen to the variables:
cols <- grepl("timebonding|caregiving", names(dat))
lapply(dat[, cols], function(x) round(c(mean = mean(x), sd = sd(x)), 3))
## $timebonding
## mean sd
## 15.953 8.345
##
## $caregiving
## mean sd
## 5.867 2.532
##
## $timebonding0
## mean sd
## 0.000 8.345
##
## $caregiving0
## mean sd
## 0.000 2.532
summary(dat)
## id timebonding caregiving ses babysitter
## Min. : 1.00 Min. : 0.00 Min. : 0.000 low :45 no :83
## 1st Qu.: 38.25 1st Qu.:10.00 1st Qu.: 4.000 middle:75 yes:67
## Median : 75.50 Median :16.00 Median : 6.000 high :30
## Mean : 75.50 Mean :15.95 Mean : 5.867
## 3rd Qu.:112.75 3rd Qu.:22.00 3rd Qu.: 8.000
## Max. :150.00 Max. :36.00 Max. :13.000
## nwords timebonding0 caregiving0
## Min. : 0.00 Min. :-15.95333 Min. :-5.8667
## 1st Qu.:10.00 1st Qu.: -5.95333 1st Qu.:-1.8667
## Median :15.50 Median : 0.04667 Median : 0.1333
## Mean :18.35 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.:24.00 3rd Qu.: 6.04667 3rd Qu.: 2.1333
## Max. :52.00 Max. : 20.04667 Max. : 7.1333
Let’s do some plotting of predictors:
par(mfrow = c(2,2))
hist(dat$timebonding)
hist(dat$caregiving)
barplot(table(dat$babysitter))
barplot(table(dat$ses))
Comments?
Also the response variable:
hist(dat$nwords, probability = TRUE)
lines(density(dat$nwords), col = "salmon", lwd = 2)
Let’s plot the theoretical distributions, Poisson and Gaussian:
m <- mean(dat$nwords)
s <- sd(dat$nwords)
xs <- seq(0, 50, 1)
hist(dat$nwords, probability = TRUE, xlim = c(-10, 50), ylim = c(0, 0.1))
curve(dnorm(x, m, s), add = TRUE, col = "green", lwd = 2)
lines(xs, dpois(xs, m), col = "red", lwd = 2)
lines(xs, dpois(xs, m), col = "red", lwd = 2)
Comments?
Let’s plot some bivariate distributions:
# caregiving ~ timebonding
r <- cor(dat$timebonding, dat$caregiving)
plot(dat$timebonding, dat$caregiving, pch = 19)
abline(lm(dat$caregiving ~ dat$timebonding), col = "red", lwd = 2)
text(30, 2, label = paste("r =", round(r, 2)))
par(mfrow = c(1,2))
boxplot(caregiving ~ ses, data = dat)
boxplot(timebonding ~ ses, data = dat)
mosaicplot(table(dat$ses, dat$babysitter))
# or
barplot(table(dat$babysitter, dat$ses), col = c("red", "green"), beside = TRUE)
legend(7,45, legend = c("no", "yes"), fill = c("red", "green"))
par(mfrow = c(1,2))
boxplot(caregiving ~ babysitter, data = dat)
boxplot(timebonding ~ babysitter, data = dat)
Then the relationships with the nwords
variable:
par(mfrow = c(2,2))
plot(dat$timebonding, dat$nwords, pch = 19)
plot(dat$caregiving, dat$nwords, pch = 19)
boxplot(dat$nwords ~ dat$ses)
boxplot(dat$nwords ~ dat$babysitter)
Comments?
Finally some interactions plot (with lm
):
par(mfrow = c(2,1))
colors <- c(low = "red", middle = "blue", high = "green")
plot(dat$timebonding, dat$nwords, col = colors[dat$ses], pch = 19)
lms <- lapply(split(dat, dat$ses), function(x) lm(nwords ~ timebonding, data = x))
lapply(1:length(lms), function(i) abline(lms[[i]], col = colors[i], lwd = 2))
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
plot(dat$caregiving, dat$nwords, col = colors[dat$ses], pch = 19)
lms <- lapply(split(dat, dat$ses), function(x) lm(nwords ~ caregiving, data = x))
lapply(1:length(lms), function(i) abline(lms[[i]], col = colors[i], lwd = 2))
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
par(mfrow = c(2,1))
colors <- c(no = "orange", yes = "purple")
plot(dat$timebonding, dat$nwords, col = colors[dat$babysitter], pch = 19)
lms <- lapply(split(dat, dat$babysitter), function(x) lm(nwords ~ timebonding, data = x))
lapply(1:length(lms), function(i) abline(lms[[i]], col = colors[i], lwd = 2))
## [[1]]
## NULL
##
## [[2]]
## NULL
plot(dat$caregiving, dat$nwords, col = colors[dat$babysitter], pch = 19)
lms <- lapply(split(dat, dat$babysitter), function(x) lm(nwords ~ caregiving, data = x))
lapply(1:length(lms), function(i) abline(lms[[i]], col = colors[i], lwd = 2))
## [[1]]
## NULL
##
## [[2]]
## NULL
glm()
and
poisson
Let’s start by using an additive model:
fit <- glm(nwords ~ timebonding + caregiving + babysitter + ses, family = poisson(link = "log"), data = dat)
summary(fit)
##
## Call:
## glm(formula = nwords ~ timebonding + caregiving + babysitter +
## ses, family = poisson(link = "log"), data = dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.0343 -1.4787 -0.2251 1.0048 5.2483
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.872174 0.098615 18.985 < 2e-16 ***
## timebonding 0.032716 0.003387 9.660 < 2e-16 ***
## caregiving 0.071359 0.013189 5.410 6.29e-08 ***
## babysitteryes -0.054370 0.042762 -1.271 0.204
## sesmiddle 0.039277 0.050956 0.771 0.441
## seshigh 0.081874 0.071100 1.152 0.250
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 1030.28 on 149 degrees of freedom
## Residual deviance: 573.72 on 144 degrees of freedom
## AIC: 1264.9
##
## Number of Fisher Scoring iterations: 5
And always plotting before anything else:
plot(allEffects(fit))
Comments? How could you describe the results? Something different from the descriptive statistics?
car::residualPlots(fit)
## Test stat Pr(>|Test stat|)
## timebonding 0.0352 0.8512
## caregiving 1.6690 0.1964
## babysitter
## ses
Comments? Are we missing something?
dat |>
ggplot(aes(x = timebonding, y = nwords, color = ses)) +
geom_point() +
stat_smooth(method = "glm", method.args = list(family = poisson()), se = FALSE)
dat |>
ggplot(aes(x = caregiving, y = nwords, color = ses)) +
geom_point() +
stat_smooth(method = "glm", method.args = list(family = poisson()), se = FALSE)
dat |>
ggplot(aes(x = timebonding, y = nwords, color = babysitter)) +
geom_point() +
stat_smooth(method = "glm", method.args = list(family = poisson()), se = FALSE)
dat |>
ggplot(aes(x = caregiving, y = nwords, color = babysitter)) +
geom_point() +
stat_smooth(method = "glm", method.args = list(family = poisson()), se = FALSE)
Let’s add some interactions:
fit2 <- glm(nwords ~ timebonding*ses + caregiving*ses + timebonding*babysitter + caregiving*babysitter, family = poisson(link = "log"), data = dat)
summary(fit2)
##
## Call:
## glm(formula = nwords ~ timebonding * ses + caregiving * ses +
## timebonding * babysitter + caregiving * babysitter, family = poisson(link = "log"),
## data = dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.1745 -1.3330 -0.1485 1.0867 5.2217
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.247867 0.155725 14.435 < 2e-16 ***
## timebonding 0.016213 0.005788 2.801 0.005091 **
## sesmiddle -0.350012 0.178986 -1.956 0.050520 .
## seshigh -0.434708 0.215428 -2.018 0.043604 *
## caregiving 0.070537 0.020455 3.448 0.000564 ***
## babysitteryes -0.147848 0.129801 -1.139 0.254687
## timebonding:sesmiddle 0.010292 0.007222 1.425 0.154123
## timebonding:seshigh 0.036204 0.014669 2.468 0.013583 *
## sesmiddle:caregiving 0.020211 0.027938 0.723 0.469420
## seshigh:caregiving -0.018795 0.052789 -0.356 0.721808
## timebonding:babysitteryes 0.021889 0.007805 2.805 0.005038 **
## caregiving:babysitteryes -0.048280 0.027164 -1.777 0.075507 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 1030.28 on 149 degrees of freedom
## Residual deviance: 545.27 on 138 degrees of freedom
## AIC: 1248.5
##
## Number of Fisher Scoring iterations: 5
plot(allEffects(fit2))
MASS::glm.nb()
There is still evidence for overdispersion, even after including all predictors and a series of interactions. Let’s assume that this is our most complex model, we need to take into account the overdispersion:
performance::check_overdispersion(fit2)
## # Overdispersion test
##
## dispersion ratio = 3.888
## Pearson's Chi-Squared = 536.486
## p-value = < 0.001
fit3 <- MASS::glm.nb(nwords ~ timebonding*ses + caregiving*ses + timebonding*babysitter + caregiving*babysitter, data = dat)
summary(fit3)
##
## Call:
## MASS::glm.nb(formula = nwords ~ timebonding * ses + caregiving *
## ses + timebonding * babysitter + caregiving * babysitter,
## data = dat, init.theta = 6.373674959, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1915 -0.7045 -0.0837 0.5252 2.3301
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.338972 0.327310 7.146 8.93e-13 ***
## timebonding 0.015621 0.012609 1.239 0.215
## sesmiddle -0.387638 0.363066 -1.068 0.286
## seshigh -0.449211 0.407454 -1.102 0.270
## caregiving 0.060868 0.046107 1.320 0.187
## babysitteryes -0.214001 0.230980 -0.926 0.354
## timebonding:sesmiddle 0.007400 0.015144 0.489 0.625
## timebonding:seshigh 0.032108 0.026562 1.209 0.227
## sesmiddle:caregiving 0.031846 0.058583 0.544 0.587
## seshigh:caregiving -0.008505 0.094220 -0.090 0.928
## timebonding:babysitteryes 0.021072 0.015598 1.351 0.177
## caregiving:babysitteryes -0.036492 0.053785 -0.678 0.497
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(6.3737) family taken to be 1)
##
## Null deviance: 289.68 on 149 degrees of freedom
## Residual deviance: 161.78 on 138 degrees of freedom
## AIC: 1057.4
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 6.37
## Std. Err.: 1.06
##
## 2 x log-likelihood: -1031.391
Now overdispersion is taken into account and standard errors are larger:
car::compareCoefs(fit2, fit3)
## Calls:
## 1: glm(formula = nwords ~ timebonding * ses + caregiving * ses +
## timebonding * babysitter + caregiving * babysitter, family = poisson(link =
## "log"), data = dat)
## 2: MASS::glm.nb(formula = nwords ~ timebonding * ses + caregiving * ses +
## timebonding * babysitter + caregiving * babysitter, data = dat, init.theta =
## 6.373674959, link = log)
##
## Model 1 Model 2
## (Intercept) 2.248 2.339
## SE 0.156 0.327
##
## timebonding 0.01621 0.01562
## SE 0.00579 0.01261
##
## sesmiddle -0.350 -0.388
## SE 0.179 0.363
##
## seshigh -0.435 -0.449
## SE 0.215 0.407
##
## caregiving 0.0705 0.0609
## SE 0.0205 0.0461
##
## babysitteryes -0.148 -0.214
## SE 0.130 0.231
##
## timebonding:sesmiddle 0.01029 0.00740
## SE 0.00722 0.01514
##
## timebonding:seshigh 0.0362 0.0321
## SE 0.0147 0.0266
##
## sesmiddle:caregiving 0.0202 0.0318
## SE 0.0279 0.0586
##
## seshigh:caregiving -0.0188 -0.0085
## SE 0.0528 0.0942
##
## timebonding:babysitteryes 0.0219 0.0211
## SE 0.0078 0.0156
##
## caregiving:babysitteryes -0.0483 -0.0365
## SE 0.0272 0.0538
##
# test statistics for the poisson model
data.frame(
poisson = fit2$coefficients/sqrt(diag(vcov(fit2))),
negative_binomial = fit3$coefficients/sqrt(diag(vcov(fit3)))
)
## poisson negative_binomial
## (Intercept) 14.4348724 7.14604731
## timebonding 2.8011983 1.23888446
## sesmiddle -1.9555334 -1.06767938
## seshigh -2.0178825 -1.10248163
## caregiving 3.4484763 1.32016562
## babysitteryes -1.1390397 -0.92649153
## timebonding:sesmiddle 1.4251192 0.48865248
## timebonding:seshigh 2.4681042 1.20876478
## sesmiddle:caregiving 0.7234224 0.54360423
## seshigh:caregiving -0.3560440 -0.09026367
## timebonding:babysitteryes 2.8045832 1.35099433
## caregiving:babysitteryes -1.7773722 -0.67847958