Let’s create some data.
Here’s a positive relationship.
j = 50
a = data.frame(x=1:100, y=jitter(1:100, j))
plot(a)
Here’s a negative relationship.
b = data.frame(x=101:200, y=jitter(100:1, j))
bb = data.frame(x=101:200, y=jitter(seq(50,0.5,-0.5), 100))
plot(bb)
Are these any different?
boxplot(list(before=a$y,after=bb$y))
t.test(a$y,b$y)
##
## Welch Two Sample t-test
##
## data: a$y and b$y
## t = -0.0049783, df = 197.72, p-value = 0.996
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -8.343723 8.301703
## sample estimates:
## mean of x mean of y
## 51.07787 51.09888
Let’s display them side by side.
plot(x=1:200, y=rep(1,200), type="n", ylim=c(0,100),
xlab="time", ylab="y")
abline(v=100)
points(a$x, a$y, pch=2, col=2)
points(bb$x, bb$y, pch=3, col=4)
abline(lm(y~x, data=a), col=2)
lines(x=1:100, y=lm(y~x, data=a)$fit, col=2)
# abline(lm(y~x, data=bb), col=4)
lines(x=101:200, y=lm(y~x, data=bb)$fit, col=4)
Let’s simulate a change in level.
a2 = data.frame(x=101:200, y=jitter(1:100, j))
plot(x=1:200, y=rep(1,200), type="n", ylim=c(-50,150),
xlab="time", ylab="y")
abline(v=100)
points(a$x, a$y, pch=2, col=2)
points(a2$x, a2$y, pch=3, col=4)
abline(lm(y~x, data=a), col=2)
abline(lm(y~x, data=a2), col=4)
We can’t capture that with a simple test.
boxplot(list(before=a$y,after=a2$y))
t.test(a$y,a2$y)
##
## Welch Two Sample t-test
##
## data: a$y and a2$y
## t = 0.24522, df = 197.98, p-value = 0.8065
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -7.115953 9.137014
## sample estimates:
## mean of x mean of y
## 51.07787 50.06734
Now let’s go back to the previous example:
m = rbind(a, data.frame(x=101:200, y=jitter(seq(50,0.5,-0.5), j)))
plot(m$x, m$y, xlab="time", ylab="y")
Here’s what a simple model might look like:
summary(lm(y~x, data=m))
##
## Call:
## lm(formula = y ~ x, data = m)
##
## Residuals:
## Min 1Q Median 3Q Max
## -53.354 -16.932 -3.208 10.347 68.753
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51.74621 3.62894 14.259 < 2e-16 ***
## x -0.13607 0.03131 -4.346 2.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 25.56 on 198 degrees of freedom
## Multiple R-squared: 0.08708, Adjusted R-squared: 0.08247
## F-statistic: 18.89 on 1 and 198 DF, p-value: 2.217e-05
Let’s see if we can model those trends and change in level explicitly.
m$time = m$x
m$intervention = m$time > 100
m$time_after_intervention = ifelse(m$time > 100, m$time - 100, 0)
m$time
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## [19] 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## [37] 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## [55] 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## [73] 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## [91] 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
## [109] 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
## [127] 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
## [145] 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
## [163] 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## [181] 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
## [199] 199 200
m$intervention
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [109] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [133] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [145] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [157] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [169] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [181] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [193] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
m$time_after_intervention
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [19] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [37] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [55] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [73] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [91] 0 0 0 0 0 0 0 0 0 0 1 2 3 4 5 6 7 8
## [109] 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## [127] 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
## [145] 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
## [163] 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## [181] 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98
## [199] 99 100
rdd = lm(y ~ time + intervention + time_after_intervention, data=m)
summary(rdd)
##
## Call:
## lm(formula = y ~ time + intervention + time_after_intervention,
## data = m)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.0891 -3.2357 0.1754 3.3189 9.2600
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.14244 0.93352 1.224 0.222
## time 0.98882 0.01605 61.614 <2e-16 ***
## interventionTRUE -48.87539 1.31041 -37.298 <2e-16 ***
## time_after_intervention -1.50534 0.02270 -66.325 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.633 on 196 degrees of freedom
## Multiple R-squared: 0.9703, Adjusted R-squared: 0.9699
## F-statistic: 2136 on 3 and 196 DF, p-value: < 2.2e-16
Q: Can you achieve the same result (i.e., capture both trends and the change in level) with only two variables? A: Yes, with an interaction term!
rdd2 = lm(y ~ time * intervention, data=m)
summary(rdd2)
##
## Call:
## lm(formula = y ~ time * intervention, data = m)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.0891 -3.2357 0.1754 3.3189 9.2600
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.14244 0.93352 1.224 0.222
## time 0.98882 0.01605 61.614 <2e-16 ***
## interventionTRUE 101.65813 2.63057 38.645 <2e-16 ***
## time:interventionTRUE -1.50534 0.02270 -66.325 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.633 on 196 degrees of freedom
## Multiple R-squared: 0.9703, Adjusted R-squared: 0.9699
## F-statistic: 2136 on 3 and 196 DF, p-value: < 2.2e-16