Chapter 12 introduces the idea of modeling nonlinear relationships using linear regression. This is one of the many ways linear regression is flexible enough to handle important situations. A nice, but incomplete, list of useful functions can be found here.
For this chapter we will use the tidyverse package, the furniture package, and the educ7610 package.

library(tidyverse)
library(furniture)
library(educ7610)
First, load the poli data set into R.

data(poli)
Let's get some descriptive statistics for political knowledge (pknow), age (age), news use (npnews), and SES (ses).

poli %>%
table1(pknow, age, npnews, ses,
type = "condense")
##
## ──────────────────────────
## Mean/Count (SD/%)
## n = 340
## pknow 11.3 (4.4)
## age 44.9 (14.6)
## npnews 3.6 (2.8)
## ses 0.0 (0.8)
## ──────────────────────────
Let's visualize the relationship between political knowledge (pknow) and age (age).

poli %>%
ggplot(aes(age, pknow)) +
geom_point() +
geom_smooth()
Next, we fit a model with pknow regressed on age, where we also include the quadratic effect of age. To add the quadratic effect directly in the model we can use I(age^2).

poli %>%
lm(pknow ~ age + I(age^2),
data = .) %>%
summary()
##
## Call:
## lm(formula = pknow ~ age + I(age^2), data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.4090 -3.2661 0.1098 3.1515 9.5183
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.0258076 2.0174926 2.987 0.00303 **
## age 0.2128720 0.0868390 2.451 0.01474 *
## I(age^2) -0.0019190 0.0008812 -2.178 0.03011 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.338 on 337 degrees of freedom
## Multiple R-squared: 0.02191, Adjusted R-squared: 0.01611
## F-statistic: 3.775 on 2 and 337 DF, p-value: 0.02392
# Same quadratic model as above, but the squared term is created as an
# explicit column with mutate() instead of I() inside the formula.
# The estimates below match the I(age^2) version exactly.
poli %>%
mutate(age2 = age^2) %>%
lm(pknow ~ age + age2,
data = .) %>%
summary()
##
## Call:
## lm(formula = pknow ~ age + age2, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.4090 -3.2661 0.1098 3.1515 9.5183
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.0258076 2.0174926 2.987 0.00303 **
## age 0.2128720 0.0868390 2.451 0.01474 *
## age2 -0.0019190 0.0008812 -2.178 0.03011 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.338 on 337 degrees of freedom
## Multiple R-squared: 0.02191, Adjusted R-squared: 0.01611
## F-statistic: 3.775 on 2 and 337 DF, p-value: 0.02392
# Center age at its mean before forming the quadratic term. Centering
# changes the intercept and linear coefficient (they now describe a
# respondent of average age) but, as the output shows, leaves the
# quadratic coefficient and model fit unchanged.
poli %>%
mutate(age_centered = age - mean(age, na.rm = TRUE)) %>%
lm(pknow ~ age_centered + I(age_centered^2),
data = .) %>%
summary()
##
## Call:
## lm(formula = pknow ~ age_centered + I(age_centered^2), data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.4090 -3.2661 0.1098 3.1515 9.5183
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.7160277 0.3004971 38.989 <2e-16 ***
## age_centered 0.0404406 0.0172795 2.340 0.0198 *
## I(age_centered^2) -0.0019190 0.0008812 -2.178 0.0301 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.338 on 337 degrees of freedom
## Multiple R-squared: 0.02191, Adjusted R-squared: 0.01611
## F-statistic: 3.775 on 2 and 337 DF, p-value: 0.02392
# Add SES and news use as covariates to the centered quadratic model.
# Once these are controlled for, neither the linear nor the quadratic
# age term remains statistically significant (see output below).
poli %>%
mutate(age_centered = age - mean(age, na.rm = TRUE)) %>%
lm(pknow ~ age_centered + I(age_centered^2) + ses + npnews,
data = .) %>%
summary()
##
## Call:
## lm(formula = pknow ~ age_centered + I(age_centered^2) + ses +
## npnews, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.7681 -2.3222 0.1151 2.4382 10.3364
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.3023034 0.3729136 27.627 < 2e-16 ***
## age_centered 0.0231441 0.0155226 1.491 0.136904
## I(age_centered^2) -0.0000341 0.0007796 -0.044 0.965139
## ses 2.4877946 0.2648857 9.392 < 2e-16 ***
## npnews 0.2834595 0.0778497 3.641 0.000314 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.707 on 335 degrees of freedom
## Multiple R-squared: 0.2899, Adjusted R-squared: 0.2814
## F-statistic: 34.19 on 4 and 335 DF, p-value: < 2.2e-16
# Plot age against political knowledge after partialling out SES and
# news use: the residuals from lm(pknow ~ ses + npnews) carry the part
# of pknow those covariates do not explain.
poli %>%
  mutate(pknow_partial = resid(lm(pknow ~ ses + npnews))) %>%
  ggplot(aes(x = age, y = pknow_partial)) +
    geom_point() +
    geom_smooth()
# Drop the quadratic term (non-significant once covariates are in) and
# keep centered age, SES, and news use as predictors.
poli %>%
mutate(age_centered = age - mean(age, na.rm = TRUE)) %>%
lm(pknow ~ age_centered + ses + npnews,
data = .) %>%
summary()
##
## Call:
## lm(formula = pknow ~ age_centered + ses + npnews, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.7828 -2.3256 0.1127 2.4347 10.3429
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.29587 0.34221 30.086 < 2e-16 ***
## age_centered 0.02294 0.01475 1.555 0.121001
## ses 2.49079 0.25549 9.749 < 2e-16 ***
## npnews 0.28323 0.07756 3.652 0.000302 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.701 on 336 degrees of freedom
## Multiple R-squared: 0.2899, Adjusted R-squared: 0.2835
## F-statistic: 45.72 on 3 and 336 DF, p-value: < 2.2e-16
(The relationship between pknow and npnews is likely non-linear as well, but we won't look into that here; see spline regression below.) Plausible knot locations appear to be npnews == 1.5 and npnews == 6. Let's try these two and see what we find.

poli %>%
ggplot(aes(npnews, pknow)) +
geom_point() +
geom_smooth()
# Piecewise terms at knots 1.5 and 6: j1 and j2 equal npnews at or past
# each knot and 0 before it.
# NOTE(review): a conventional linear-spline hinge would use
# ifelse(npnews >= knot, npnews - knot, 0); coding the term as npnews
# itself permits a jump (discontinuity) at each knot rather than only a
# slope change — confirm this is intended.
poli %>%
mutate(j1 = ifelse(npnews >= 1.5, npnews, 0),
j2 = ifelse(npnews >= 6.0, npnews, 0)) %>%
lm(pknow ~ npnews + j1 + j2,
data = .) %>%
summary()
##
## Call:
## lm(formula = pknow ~ npnews + j1 + j2, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.1401 -2.8996 0.0992 3.0352 10.0968
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.90320 0.46076 21.493 <2e-16 ***
## npnews -0.46417 0.79952 -0.581 0.562
## j1 0.87649 0.74202 1.181 0.238
## j2 0.01557 0.14626 0.106 0.915
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.184 on 336 degrees of freedom
## Multiple R-squared: 0.09273, Adjusted R-squared: 0.08463
## F-statistic: 11.45 on 3 and 336 DF, p-value: 3.643e-07
What does the plot of the predicted values suggest about the relationship between npnews and pknow? Is spline regression necessary here?

poli %>%
mutate(j1 = ifelse(npnews >= 1.5, npnews, 0),
j2 = ifelse(npnews >= 6.0, npnews, 0)) %>%
mutate(preds = predict(lm(pknow ~ npnews + j1 + j2))) %>%
ggplot(aes(npnews, preds)) +
geom_line() +
geom_point(data = poli,
aes(npnews, pknow))
Another approach is to transform the outcome with log(). Since there are some zeros, the natural log isn't defined. To fix this, we add 1 to pknow before taking the log. This model would be considered a "log-lin" model.

poli %>%
lm(log(pknow+1) ~ npnews,
data = .) %>%
summary()
##
## Call:
## lm(formula = log(pknow + 1) ~ npnews, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.40289 -0.18703 0.05428 0.29848 0.77806
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.26647 0.03734 60.700 < 2e-16 ***
## npnews 0.04547 0.00821 5.539 6.13e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4259 on 338 degrees of freedom
## Multiple R-squared: 0.08321, Adjusted R-squared: 0.0805
## F-statistic: 30.68 on 1 and 338 DF, p-value: 6.127e-08
What if we also want to log-transform npnews? We can log npnews in the previous model. It has the same issue as pknow, so we can just add 1 to the values. This model would be considered a "log-log" model.

poli %>%
lm(log(pknow+1) ~ log(npnews+1),
data = .) %>%
summary()
##
## Call:
## lm(formula = log(pknow + 1) ~ log(npnews + 1), data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.44833 -0.17942 0.07177 0.28283 0.80547
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.23906 0.04365 51.290 < 2e-16 ***
## log(npnews + 1) 0.15096 0.02938 5.138 4.69e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4284 on 338 degrees of freedom
## Multiple R-squared: 0.07246, Adjusted R-squared: 0.06971
## F-statistic: 26.4 on 1 and 338 DF, p-value: 4.695e-07
# Overlay the log-lin model's predicted values (line) on the observed
# log-transformed outcome (points).
poli %>%
  mutate(fit = predict(lm(log(pknow + 1) ~ npnews))) %>%
  ggplot(aes(x = npnews, y = fit)) +
    geom_line() +
    geom_point(data = poli,
               aes(x = npnews, y = log(pknow + 1)))
How do we know which constant to add before log() will work? Let's try +2 as well.

poli %>%
lm(log(pknow+2) ~ npnews,
data = .) %>%
summary()
##
## Call:
## lm(formula = log(pknow + 2) ~ npnews, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.80610 -0.18049 0.04676 0.26135 0.71333
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.377712 0.032783 72.53 < 2e-16 ***
## npnews 0.040512 0.007208 5.62 3.99e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3739 on 338 degrees of freedom
## Multiple R-squared: 0.08547, Adjusted R-squared: 0.08276
## F-statistic: 31.59 on 1 and 338 DF, p-value: 3.992e-08
To help choose a transformation, we can use the boxcox() function from the MASS package. For this function to work, the outcome must be \(>0\).

lm(pknow+1 ~ npnews, data = poli) %>%
MASS::boxcox()
The plot suggests the best power is around 1 (i.e., pknow^1, no transformation needed). If the line was at .5, then that would be pknow^.5 or sqrt(pknow) (same thing). The function tests powers between -2 and 2 for us, which is usually sufficient.

Ironically, linear regression can model non-linear relationships very naturally. Each approach covered here is useful for specific situations and each has a decent interpretation.