Load the Galton height data from the HistData
library.
data(GaltonFamilies)
str(GaltonFamilies)
## 'data.frame': 934 obs. of 8 variables:
## $ family : Factor w/ 205 levels "001","002","003",..: 1 1 1 1 2 2 2 2 3 3 ...
## $ father : num 78.5 78.5 78.5 78.5 75.5 75.5 75.5 75.5 75 75 ...
## $ mother : num 67 67 67 67 66.5 66.5 66.5 66.5 64 64 ...
## $ midparentHeight: num 75.4 75.4 75.4 75.4 73.7 ...
## $ children : int 4 4 4 4 4 4 4 4 2 2 ...
## $ childNum : int 1 2 3 4 1 2 3 4 1 2 ...
## $ gender : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 1 1 2 1 ...
## $ childHeight : num 73.2 69.2 69 69 73.5 72.5 65.5 65.5 71 68 ...
Reproduce Fig 2 in Hanley (2004).
scatterplot(childHeight ~ midparentHeight | gender, data=GaltonFamilies,
ellipse=TRUE, levels=0.68, legend.coords=list(x=64, y=78))
Data exploration
nrow(GaltonFamilies)
## [1] 934
length(unique(GaltonFamilies$family))
## [1] 205
summary(GaltonFamilies$children)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.000 6.000 6.171 8.000 15.000
table(GaltonFamilies$gender)
##
## female male
## 453 481
summary(GaltonFamilies$childHeight)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 56.00 64.00 66.50 66.75 69.70 79.00
sd(GaltonFamilies$father)
## [1] 2.476479
sd(GaltonFamilies$mother)
## [1] 2.290886
ggplot(GaltonFamilies, aes(x=gender, y=childHeight, fill=gender)) + geom_violin() + theme_bw()
What’s the simplest possible model of this data you could imagine? Probably one that just computes the mean height.
summary(lm(childHeight ~ 1, data=GaltonFamilies))
##
## Call:
## lm(formula = childHeight ~ 1, data = GaltonFamilies)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.7459 -2.7459 -0.2459 2.9541 12.2541
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 66.7459 0.1171 569.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.579 on 933 degrees of freedom
Let’s make it more realistic. Fit a simple regression model predicting children’s height from gender.
m0 = lm(childHeight ~ gender, data=GaltonFamilies)
summary(m0)
##
## Call:
## lm(formula = childHeight ~ gender, data = GaltonFamilies)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.234 -1.604 -0.104 1.766 9.766
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 64.1040 0.1173 546.32 <2e-16 ***
## gendermale 5.1301 0.1635 31.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.497 on 932 degrees of freedom
## Multiple R-squared: 0.5137, Adjusted R-squared: 0.5132
## F-statistic: 984.4 on 1 and 932 DF, p-value: < 2.2e-16
Look at some model diagnostics to confirm that this is an appropriate model. Since this model only produces two different predictions (one for males and one for females), that isn’t very helpful here.
plot(m0)
A boxplot or violin plot can help to summarise the distribution of residuals by group. Since the model simply estimates the mean heights of males and females a violin plot of the residuals should look very similar to the violin plot of heights above, but with the means of both groups aligned at 0.
ggplot(GaltonFamilies, aes(x=gender, y=resid(m0), fill=gender)) +
geom_violin() +
theme_bw()
Fit a multiple regression model predicting children’s height from father’s height, mother’s height, and gender
m1 = lm(childHeight ~ gender + father + mother, data=GaltonFamilies)
summary(m1)
##
## Call:
## lm(formula = childHeight ~ gender + father + mother, data = GaltonFamilies)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5247 -1.4653 0.0943 1.4860 9.1201
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.52124 2.72720 6.058 2e-09 ***
## gendermale 5.21499 0.14181 36.775 <2e-16 ***
## father 0.39284 0.02868 13.699 <2e-16 ***
## mother 0.31761 0.03100 10.245 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.165 on 930 degrees of freedom
## Multiple R-squared: 0.6354, Adjusted R-squared: 0.6342
## F-statistic: 540.3 on 3 and 930 DF, p-value: < 2.2e-16
Change reference level for gender variable.
levels(GaltonFamilies$gender)
## [1] "female" "male"
GaltonFamilies$gender = factor(GaltonFamilies$gender, levels = c("male", "female"))
# The regression
m2 = lm(childHeight ~ gender + father + mother, data=GaltonFamilies)
summary(m2)
##
## Call:
## lm(formula = childHeight ~ gender + father + mother, data = GaltonFamilies)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5247 -1.4653 0.0943 1.4860 9.1201
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21.73623 2.72223 7.985 4.14e-15 ***
## genderfemale -5.21499 0.14181 -36.775 < 2e-16 ***
## father 0.39284 0.02868 13.699 < 2e-16 ***
## mother 0.31761 0.03100 10.245 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.165 on 930 degrees of freedom
## Multiple R-squared: 0.6354, Adjusted R-squared: 0.6342
## F-statistic: 540.3 on 3 and 930 DF, p-value: < 2.2e-16
Inspect residuals. We can clearly see two clusters of points here. The cluster of points with smaller predicted heights belong to the female children and the other cluster of points belong to the male children.
plot(m1$residuals ~ m1$fitted.values,
xlab="Predicted", ylab="Residuals",
pch=16, las=1)
abline(h=0)
Color-code the residuals by gender
plot(m1$residuals ~ m1$fitted.values,
col=GaltonFamilies$gender,
xlab="Predicted", ylab="Residuals",
pch=16, las=1)
abline(h=0)
legend("topleft",c("Male","Female"), pch=16, col=1:2)
Diagnostics
plot(m1)
Scale parents. See also https://stats.stackexchange.com/questions/254934/what-is-the-interpretation-of-scaled-regression-coefficients-when-only-the-predi/254982
m3 = lm(scale(childHeight) ~ gender + scale(father) + scale(mother), data=GaltonFamilies)
summary(m3)
##
## Call:
## lm(formula = scale(childHeight) ~ gender + scale(father) + scale(mother),
## data = GaltonFamilies)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.66108 -0.40938 0.02635 0.41517 2.54804
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.70666 0.02758 25.62 <2e-16 ***
## genderfemale -1.45701 0.03962 -36.77 <2e-16 ***
## scale(father) 0.27181 0.01984 13.70 <2e-16 ***
## scale(mother) 0.20329 0.01984 10.24 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6048 on 930 degrees of freedom
## Multiple R-squared: 0.6354, Adjusted R-squared: 0.6342
## F-statistic: 540.3 on 3 and 930 DF, p-value: < 2.2e-16
We would expect siblings to be somewhat similar in height as they share genetic factors through their parents and environmental factors through their shared upbringing.
We can model this structure of the data, children clustering in families, using linear mixed effects models. In addition to estimating population means (fixed effects) these models will also allow us to estimate how average family heights vary around these population means (random effects).
library(lme4)
library(lmerTest)
# The random effect for family indicates that the mean height of each family may differ from the population mean.
fit_me = lmer(childHeight ~ gender + father + mother + (1|family), data=GaltonFamilies)
# In addition to the gender fixed effect that we have already seen in the simple linear regression model, this model also provides us with an estimate of the variance in average height between families (2.43) as well as the remaining (residual) variance within families (3.81).
summary(fit_me)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: childHeight ~ gender + father + mother + (1 | family)
## Data: GaltonFamilies
##
## REML criterion at convergence: 4053.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.2081 -0.5887 0.0073 0.6202 3.7316
##
## Random effects:
## Groups Name Variance Std.Dev.
## family (Intercept) 0.9073 0.9525
## Residual 3.8197 1.9544
## Number of obs: 934, groups: family, 205
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 23.56134 3.65758 193.25285 6.442 9.16e-10 ***
## genderfemale -5.22364 0.13522 877.55415 -38.631 < 2e-16 ***
## father 0.38161 0.03798 193.65643 10.049 < 2e-16 ***
## mother 0.30180 0.04245 178.66568 7.109 2.70e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) gndrfm father
## genderfemal 0.021
## father -0.670 -0.031
## mother -0.696 -0.022 -0.065
A dot plot, also known as a caterpillar plot, can help to visualise random effects. This plot shows the deviation from the mean population height for each family, together with standard errors. Note how some families fall clearly below or above the population mean.
library(lattice)
randoms = ranef(fit_me)
dotplot(randoms)
## $family
Model comparison with anova() and ranova(). In this case, the inclusion of the family random effect clearly improves model fit.
fit_lm = lm(childHeight ~ gender, data=GaltonFamilies)
## Re-fit model using ML, rather than REML
fit_me = lmer(childHeight ~ gender + (1|family), data=GaltonFamilies, REML=FALSE)
anova(fit_me, fit_lm)
ranova(fit_me)