Beyond Linear Regression

Load the Galton height data from the HistData library.

data(GaltonFamilies)
str(GaltonFamilies)

## 'data.frame':    934 obs. of  8 variables:
##  $ family         : Factor w/ 205 levels "001","002","003",..: 1 1 1 1 2 2 2 2 3 3 ...
##  $ father         : num  78.5 78.5 78.5 78.5 75.5 75.5 75.5 75.5 75 75 ...
##  $ mother         : num  67 67 67 67 66.5 66.5 66.5 66.5 64 64 ...
##  $ midparentHeight: num  75.4 75.4 75.4 75.4 73.7 ...
##  $ children       : int  4 4 4 4 4 4 4 4 2 2 ...
##  $ childNum       : int  1 2 3 4 1 2 3 4 1 2 ...
##  $ gender         : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 1 1 2 1 ...
##  $ childHeight    : num  73.2 69.2 69 69 73.5 72.5 65.5 65.5 71 68 ...

Reproduce Fig 2 in Hanley (2004).

scatterplot(childHeight ~ midparentHeight | gender, data=GaltonFamilies, 
    ellipse=TRUE, levels=0.68, legend.coords=list(x=64, y=78))

Data exploration

nrow(GaltonFamilies)

## [1] 934

length(unique(GaltonFamilies$family))

## [1] 205

summary(GaltonFamilies$children)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   4.000   6.000   6.171   8.000  15.000

table(GaltonFamilies$gender)

## 
## female   male 
##    453    481

summary(GaltonFamilies$childHeight)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   56.00   64.00   66.50   66.75   69.70   79.00

sd(GaltonFamilies$father)

## [1] 2.476479

sd(GaltonFamilies$mother)

## [1] 2.290886

ggplot(GaltonFamilies, aes(x=gender, y=childHeight, fill=gender)) + geom_violin() + theme_bw()

What’s the simplest possible model of this data you could imagine? Probably one that just computes the mean height.

summary(lm(childHeight ~ 1, data=GaltonFamilies))

## 
## Call:
## lm(formula = childHeight ~ 1, data = GaltonFamilies)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.7459  -2.7459  -0.2459   2.9541  12.2541 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  66.7459     0.1171   569.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.579 on 933 degrees of freedom

Let’s make it more realistic. Fit a simple regression model predicting children’s height from gender.

m0 = lm(childHeight ~ gender, data=GaltonFamilies)
summary(m0)

## 
## Call:
## lm(formula = childHeight ~ gender, data = GaltonFamilies)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.234 -1.604 -0.104  1.766  9.766 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  64.1040     0.1173  546.32   <2e-16 ***
## gendermale    5.1301     0.1635   31.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.497 on 932 degrees of freedom
## Multiple R-squared:  0.5137, Adjusted R-squared:  0.5132 
## F-statistic: 984.4 on 1 and 932 DF,  p-value: < 2.2e-16

Look at some model diagnostics to confirm that this is an appropriate model. Since this model only produces two different predictions (one for males and one for females), that isn’t very helpful here.

plot(m0)

A boxplot or violin plot can help to summarise the distribution of residuals by group. Since the model simply estimates the mean heights of males and females a violin plot of the residuals should look very similar to the violin plot of heights above, but with the means of both groups aligned at 0.

ggplot(GaltonFamilies, aes(x=gender, y=resid(m0), fill=gender)) +
   geom_violin() +
   theme_bw()

Fit a multiple regression model predicting children’s height from father’s height, mother’s height, and gender

m1 = lm(childHeight ~ gender + father + mother, data=GaltonFamilies)
summary(m1)

## 
## Call:
## lm(formula = childHeight ~ gender + father + mother, data = GaltonFamilies)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5247 -1.4653  0.0943  1.4860  9.1201 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 16.52124    2.72720   6.058    2e-09 ***
## gendermale   5.21499    0.14181  36.775   <2e-16 ***
## father       0.39284    0.02868  13.699   <2e-16 ***
## mother       0.31761    0.03100  10.245   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.165 on 930 degrees of freedom
## Multiple R-squared:  0.6354, Adjusted R-squared:  0.6342 
## F-statistic: 540.3 on 3 and 930 DF,  p-value: < 2.2e-16

Change reference level for gender variable.

levels(GaltonFamilies$gender)

## [1] "female" "male"

GaltonFamilies$gender = factor(GaltonFamilies$gender, levels = c("male", "female"))

# The regression
m2 = lm(childHeight ~ gender + father + mother, data=GaltonFamilies)
summary(m2)

## 
## Call:
## lm(formula = childHeight ~ gender + father + mother, data = GaltonFamilies)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5247 -1.4653  0.0943  1.4860  9.1201 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  21.73623    2.72223   7.985 4.14e-15 ***
## genderfemale -5.21499    0.14181 -36.775  < 2e-16 ***
## father        0.39284    0.02868  13.699  < 2e-16 ***
## mother        0.31761    0.03100  10.245  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.165 on 930 degrees of freedom
## Multiple R-squared:  0.6354, Adjusted R-squared:  0.6342 
## F-statistic: 540.3 on 3 and 930 DF,  p-value: < 2.2e-16

Inspect residuals. We can clearly see two clusters of points here. The cluster of points with smaller predicted heights belong to the female children and the other cluster of points belong to the male children.

plot(m1$residuals ~ m1$fitted.values, 
     xlab="Predicted", ylab="Residuals", 
     pch=16, las=1)
abline(h=0)

Color-code the residuals by gender

plot(m1$residuals ~ m1$fitted.values, 
     col=GaltonFamilies$gender, 
     xlab="Predicted", ylab="Residuals", 
     pch=16, las=1)
abline(h=0)
legend("topleft",c("Male","Female"), pch=16, col=1:2)

Diagnostics

plot(m1)

Scale parents. See also https://stats.stackexchange.com/questions/254934/what-is-the-interpretation-of-scaled-regression-coefficients-when-only-the-predi/254982

m3 = lm(scale(childHeight) ~ gender + scale(father) + scale(mother), data=GaltonFamilies)
summary(m3)

## 
## Call:
## lm(formula = scale(childHeight) ~ gender + scale(father) + scale(mother), 
##     data = GaltonFamilies)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.66108 -0.40938  0.02635  0.41517  2.54804 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.70666    0.02758   25.62   <2e-16 ***
## genderfemale  -1.45701    0.03962  -36.77   <2e-16 ***
## scale(father)  0.27181    0.01984   13.70   <2e-16 ***
## scale(mother)  0.20329    0.01984   10.24   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6048 on 930 degrees of freedom
## Multiple R-squared:  0.6354, Adjusted R-squared:  0.6342 
## F-statistic: 540.3 on 3 and 930 DF,  p-value: < 2.2e-16

We would expect siblings to be somewhat similar in height as they share genetic factors through their parents and environmental factors through their shared upbringing.

We can model this structure of the data, children clustering in families, using linear mixed effects models. In addition to estimating population means (fixed effects) these models will also allow us to estimate how average family heights vary around these population means (random effects).

library(lme4)
library(lmerTest)

# The random effect for family indicates that the mean height of each family may differ from the population mean.
fit_me = lmer(childHeight ~ gender + father + mother + (1|family), data=GaltonFamilies)

# In addition to the gender fixed effect that we have already seen in the simple linear regression model, this model also provides us with an estimate of the variance in average height between families (2.43) as well as the remaining (residual) variance within families (3.81).
summary(fit_me)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: childHeight ~ gender + father + mother + (1 | family)
##    Data: GaltonFamilies
## 
## REML criterion at convergence: 4053.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.2081 -0.5887  0.0073  0.6202  3.7316 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  family   (Intercept) 0.9073   0.9525  
##  Residual             3.8197   1.9544  
## Number of obs: 934, groups:  family, 205
## 
## Fixed effects:
##               Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)   23.56134    3.65758 193.25285   6.442 9.16e-10 ***
## genderfemale  -5.22364    0.13522 877.55415 -38.631  < 2e-16 ***
## father         0.38161    0.03798 193.65643  10.049  < 2e-16 ***
## mother         0.30180    0.04245 178.66568   7.109 2.70e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) gndrfm father
## genderfemal  0.021              
## father      -0.670 -0.031       
## mother      -0.696 -0.022 -0.065

A dot plot, also known as a caterpillar plot, can help to visualise random effects. This plot shows the deviation from the mean population height for each family, together with standard errors. Note how some families fall clearly below or above the population mean.

library(lattice)

randoms = ranef(fit_me)
dotplot(randoms)

## $family

Model comparison with anova() and ranova(). In this case, the inclusion of the family random effect clearly improves model fit.

fit_lm = lm(childHeight ~ gender, data=GaltonFamilies)
## Re-fit model using ML, rather than REML
fit_me = lmer(childHeight ~ gender + (1|family), data=GaltonFamilies, REML=FALSE)
anova(fit_me, fit_lm)

ranova(fit_me)