5 GLM - Logistic:

library(tidyverse)
library(haven)        # read in SPSS dataset
library(furniture)    # nice table1() descriptives
library(stargazer)    # display nice tables: summary & regression
library(texreg)       # Convert Regression Output to LaTeX or HTML Tables
library(psych)        # contains some useful functions, like headTail
library(car)          # Companion to Applied Regression
library(pscl)         # psudo R-squared function

5.1 Background

More complex example demonstrating modeling decisions

Another set of data from a study investigating predictors of low birth weight

  • id infant id
  • low Low birth weight (outcome) (0 = birth weight >2500 g (normal), 1 = birth weight < 2500 g (low))
  • age Age of mother (yrs)
  • lwt Mother’s weight at last menstrual period (pounds)
  • race Race (1 = White, 2 = Black, 3 = Other)
  • smoke Smoking status during pregnancy (1 = Yes, 0 = No)
  • ptl History of premature labor (0 = None, 1 = One, etc)
  • ht History of hypertension (1 = Yes, 0 = No)
  • ui Uterine irritability (1 = Yes, 0 = No)
  • ftv Number of physician visits in 1st trimester (0=None, 1=One, etc)
  • bwt actual infant birth weight in g
lowbwt_raw <- read.table("https://raw.githubusercontent.com/CEHS-research/eBook_regression/master/data/lowbwt.txt?token=AScXBTYDbui3sy0ah-7-yknbzAyAwsUoks5bz51LwA%3D%3D", 
                         header = TRUE, 
                         sep = "", 
                         na.strings = "NA", 
                         dec = ".", 
                         strip.white = TRUE)

tibble::glimpse(lowbwt_raw)
Observations: 189
Variables: 11
$ id    <int> 85, 86, 87, 88, 89, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...
$ low   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
$ age   <int> 19, 33, 20, 21, 18, 21, 22, 17, 29, 26, 19, 19, 22, 30, ...
$ lwt   <int> 182, 155, 105, 108, 107, 124, 118, 103, 123, 113, 95, 15...
$ race  <int> 2, 3, 1, 1, 1, 3, 1, 3, 1, 1, 3, 3, 3, 3, 1, 1, 2, 1, 3,...
$ smoke <int> 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,...
$ ptl   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,...
$ ht    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
$ ui    <int> 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,...
$ ftv   <int> 0, 3, 1, 2, 0, 0, 1, 1, 1, 0, 0, 1, 0, 2, 0, 0, 0, 3, 0,...
$ bwt   <int> 2523, 2551, 2557, 2594, 2600, 2622, 2637, 2637, 2663, 26...
lowbwt_clean <- lowbwt_raw %>% 
  dplyr::mutate(id = factor(id)) %>% 
  dplyr::mutate(low = factor(low,
                             levels = c(0, 1),
                             labels = c("birth weight >2500 g (normal)",
                                        "birth weight < 2500 g (low)"))) %>% 
  dplyr::mutate(race = factor(race,
                              levels = 1:3,
                              labels = c("White", "Black", "Other"))) %>% 
  dplyr::mutate_at(vars(smoke, ht, ui),
                   factor,
                   levels = 0:1,
                   labels = c("No", "Yes")) 
str(lowbwt_clean)
'data.frame':   189 obs. of  11 variables:
 $ id   : Factor w/ 189 levels "4","10","11",..: 60 61 62 63 64 65 66 67 68 69 ...
 $ low  : Factor w/ 2 levels "birth weight >2500 g (normal)",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ age  : int  19 33 20 21 18 21 22 17 29 26 ...
 $ lwt  : int  182 155 105 108 107 124 118 103 123 113 ...
 $ race : Factor w/ 3 levels "White","Black",..: 2 3 1 1 1 3 1 3 1 1 ...
 $ smoke: Factor w/ 2 levels "No","Yes": 1 1 2 2 2 1 1 1 2 2 ...
 $ ptl  : int  0 0 0 0 0 0 0 0 0 0 ...
 $ ht   : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ ui   : Factor w/ 2 levels "No","Yes": 2 1 1 2 2 1 1 1 1 1 ...
 $ ftv  : int  0 3 1 2 0 0 1 1 1 0 ...
 $ bwt  : int  2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
lowbwt_clean %>% 
  furniture::table1(age, lwt, race, smoke, ptl, ht, ui, ftv, bwt,
                    splitby = ~ low,
                    test = TRUE,
                    output = "html")
birth weight >2500 g (normal) birth weight < 2500 g (low) P-Value
n = 130 n = 59
age 0.078
23.7 (5.6) 22.3 (4.5)
lwt 0.013
133.3 (31.7) 122.1 (26.6)
race 0.082
White 73 (56.2%) 23 (39%)
Black 15 (11.5%) 11 (18.6%)
Other 42 (32.3%) 25 (42.4%)
smoke 0.04
No 86 (66.2%) 29 (49.2%)
Yes 44 (33.8%) 30 (50.8%)
ptl 0.012
0.1 (0.5) 0.3 (0.5)
ht 0.076
No 125 (96.2%) 52 (88.1%)
Yes 5 (3.8%) 7 (11.9%)
ui 0.035
No 116 (89.2%) 45 (76.3%)
Yes 14 (10.8%) 14 (23.7%)
ftv 0.385
0.8 (1.1) 0.7 (1.0)
bwt <.001
3330.9 (476.2) 2097.1 (390.9)

5.2 Logistic Regression - Simple, un-adjusted

low1.age   <- glm(low ~ age,   family=binomial(logit), data=lowbwt_clean)
low1.lwt   <- glm(low ~ lwt,   family=binomial(logit), data=lowbwt_clean)
low1.race  <- glm(low ~ race,  family=binomial(logit), data=lowbwt_clean)
low1.smoke <- glm(low ~ smoke, family=binomial(logit), data=lowbwt_clean)
low1.ptl   <- glm(low ~ ptl,   family=binomial(logit), data=lowbwt_clean)
low1.ht    <- glm(low ~ ht,    family=binomial(logit), data=lowbwt_clean)
low1.ui    <- glm(low ~ ui,    family=binomial(logit), data=lowbwt_clean)
low1.ftv   <- glm(low ~ ftv,   family=binomial(logit), data=lowbwt_clean)
texreg::screenreg(list(low1.age, low1.lwt, low1.race, low1.smoke),
                  custom.model.names = c("Age", "Weight", "Race", "Smoker"),
                  caption = "Mother Factors")

=============================================================
                Age       Weight     Race         Smoker     
-------------------------------------------------------------
(Intercept)        0.38      1.00      -1.15 ***    -1.09 ***
                  (0.73)    (0.79)     (0.24)       (0.21)   
age               -0.05                                      
                  (0.03)                                     
lwt                         -0.01 *                          
                            (0.01)                           
raceBlack                               0.84                 
                                       (0.46)                
raceOther                               0.64                 
                                       (0.35)                
smokeYes                                             0.70 *  
                                                    (0.32)   
-------------------------------------------------------------
AIC              235.91    232.69     235.66       233.80    
BIC              242.40    239.17     245.39       240.29    
Log Likelihood  -115.96   -114.35    -114.83      -114.90    
Deviance         231.91    228.69     229.66       229.80    
Num. obs.        189       189        189          189       
=============================================================
*** p < 0.001, ** p < 0.01, * p < 0.05
texreg::screenreg(list(low1.ptl, low1.ht, low1.ui, low1.ftv),
                  custom.model.names = c("Pre-Labor", "Hypertension", "Uterine", "Visits"))

===================================================================
                Pre-Labor    Hypertension  Uterine      Visits     
-------------------------------------------------------------------
(Intercept)       -0.96 ***    -0.88 ***     -0.95 ***    -0.69 ***
                  (0.17)       (0.17)        (0.18)       (0.19)   
ptl                0.80 *                                          
                  (0.32)                                           
htYes                           1.21 *                             
                               (0.61)                              
uiYes                                         0.95 *               
                                             (0.42)                
ftv                                                       -0.14    
                                                          (0.16)   
-------------------------------------------------------------------
AIC              231.89       234.65        233.60       237.90    
BIC              238.38       241.13        240.08       244.38    
Log Likelihood  -113.95      -115.32       -114.80      -116.95    
Deviance         227.89       230.65        229.60       233.90    
Num. obs.        189          189           189          189       
===================================================================
*** p < 0.001, ** p < 0.01, * p < 0.05

5.3 Logistic Regression - Multivariate with Main Effects Only

Main-effects multiple logistic regression model

low1_1 <- glm(low ~ age + lwt + race + smoke + ptl + ht + ui,
              family = binomial(logit), 
              data = lowbwt_clean)

summary(low1_1)

Call:
glm(formula = low ~ age + lwt + race + smoke + ptl + ht + ui, 
    family = binomial(logit), data = lowbwt_clean)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9120  -0.8174  -0.5224   0.9714   2.1807  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)   
(Intercept)  0.464403   1.204687   0.385  0.69987   
age         -0.027070   0.036452  -0.743  0.45772   
lwt         -0.015183   0.006928  -2.192  0.02841 * 
raceBlack    1.263219   0.526463   2.399  0.01642 * 
raceOther    0.861635   0.439191   1.962  0.04978 * 
smokeYes     0.923349   0.400853   2.303  0.02125 * 
ptl          0.541755   0.346264   1.565  0.11768   
htYes        1.833696   0.691765   2.651  0.00803 **
uiYes        0.758597   0.459389   1.651  0.09867 . 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 234.67  on 188  degrees of freedom
Residual deviance: 201.43  on 180  degrees of freedom
AIC: 219.43

Number of Fisher Scoring iterations: 4

5.4 Logistic Regression - Multivariate with Interactions

Before removing non-significant main effects, test plausible interactions

Try interactions between age and lwt, age and smoke, lwt and smoke, 1 at a time

5.4.1 Age and Weight

low1_2 <- glm(low ~ age + lwt + race + smoke + ptl + ht + ui + age:lwt,
                 family = binomial(logit), 
                 data = lowbwt_clean)

summary(low1_2)

Call:
glm(formula = low ~ age + lwt + race + smoke + ptl + ht + ui + 
    age:lwt, family = binomial(logit), data = lowbwt_clean)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9117  -0.8177  -0.5226   0.9719   2.1803  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)   
(Intercept)  6.375e-01  3.998e+00   0.159  0.87330   
age         -3.461e-02  1.699e-01  -0.204  0.83860   
lwt         -1.654e-02  3.074e-02  -0.538  0.59048   
raceBlack    1.261e+00  5.280e-01   2.389  0.01689 * 
raceOther    8.592e-01  4.421e-01   1.943  0.05197 . 
smokeYes     9.222e-01  4.015e-01   2.297  0.02163 * 
ptl          5.427e-01  3.470e-01   1.564  0.11779   
htYes        1.834e+00  6.919e-01   2.651  0.00803 **
uiYes        7.598e-01  4.603e-01   1.651  0.09882 . 
age:lwt      5.935e-05  1.306e-03   0.045  0.96375   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 234.67  on 188  degrees of freedom
Residual deviance: 201.42  on 179  degrees of freedom
AIC: 221.42

Number of Fisher Scoring iterations: 4
anova(low1_1, low1_2, test = 'Chi')
# A tibble: 2 x 5
  `Resid. Df` `Resid. Dev`    Df Deviance `Pr(>Chi)`
*       <dbl>        <dbl> <dbl>    <dbl>      <dbl>
1         180         201.    NA NA           NA    
2         179         201.     1  0.00206      0.964
Anova(low1_2, test = 'LR') #Type II Analysis of Deviance table
# A tibble: 8 x 3
  `LR Chisq`    Df `Pr(>Chisq)`
*      <dbl> <dbl>        <dbl>
1    0.559       1      0.455  
2    5.38        1      0.0204 
3    7.23        2      0.0270 
4    5.46        1      0.0194 
5    2.52        1      0.112  
6    7.39        1      0.00658
7    2.68        1      0.102  
8    0.00206     1      0.964  
Anova(low1_2, test = 'LR', type = 'III') #Type III Analysis of Deviance table
# A tibble: 8 x 3
  `LR Chisq`    Df `Pr(>Chisq)`
*      <dbl> <dbl>        <dbl>
1    0.0413      1      0.839  
2    0.290       1      0.590  
3    7.23        2      0.0270 
4    5.46        1      0.0194 
5    2.52        1      0.112  
6    7.39        1      0.00658
7    2.68        1      0.102  
8    0.00206     1      0.964  

5.4.2 Age and Smoking

low1_3 <- glm(low ~ age + lwt + race + smoke + ptl + ht + ui + age:smoke,
                 family = binomial(logit), 
                 data = lowbwt_clean)

summary(low1_3)

Call:
glm(formula = low ~ age + lwt + race + smoke + ptl + ht + ui + 
    age:smoke, family = binomial(logit), data = lowbwt_clean)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9578  -0.8387  -0.5121   0.9862   2.1932  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)   
(Intercept)   1.162890   1.455841   0.799  0.42442   
age          -0.058516   0.052407  -1.117  0.26418   
lwt          -0.014992   0.006878  -2.180  0.02929 * 
raceBlack     1.174317   0.537051   2.187  0.02877 * 
raceOther     0.825523   0.442058   1.867  0.06184 . 
smokeYes     -0.562054   1.742706  -0.323  0.74706   
ptl           0.525377   0.346143   1.518  0.12906   
htYes         1.862868   0.695622   2.678  0.00741 **
uiYes         0.834722   0.468785   1.781  0.07498 . 
age:smokeYes  0.065307   0.074818   0.873  0.38272   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 234.67  on 188  degrees of freedom
Residual deviance: 200.66  on 179  degrees of freedom
AIC: 220.66

Number of Fisher Scoring iterations: 4
anova(low1_1, low1_3, test = 'Chi')
# A tibble: 2 x 5
  `Resid. Df` `Resid. Dev`    Df Deviance `Pr(>Chi)`
*       <dbl>        <dbl> <dbl>    <dbl>      <dbl>
1         180         201.    NA   NA         NA    
2         179         201.     1    0.770      0.380

5.4.3 Weight and Smoking

low1_4 <- glm(low ~ age + lwt + race + smoke + ptl + ht + ui + lwt:smoke,
                 family = binomial(logit), 
                 data = lowbwt_clean)

summary(low1_4)

Call:
glm(formula = low ~ age + lwt + race + smoke + ptl + ht + ui + 
    lwt:smoke, family = binomial(logit), data = lowbwt_clean)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8924  -0.8089  -0.5399   0.9928   2.2129  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)  
(Intercept)   1.45738    1.55248   0.939   0.3479  
age          -0.02676    0.03670  -0.729   0.4659  
lwt          -0.02299    0.01051  -2.187   0.0287 *
raceBlack     1.28222    0.52798   2.429   0.0152 *
raceOther     0.78463    0.44213   1.775   0.0760 .
smokeYes     -0.85555    1.71464  -0.499   0.6178  
ptl           0.56700    0.34704   1.634   0.1023  
htYes         1.75771    0.69767   2.519   0.0118 *
uiYes         0.83264    0.46555   1.789   0.0737 .
lwt:smokeYes  0.01396    0.01313   1.063   0.2878  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 234.67  on 188  degrees of freedom
Residual deviance: 200.26  on 179  degrees of freedom
AIC: 220.26

Number of Fisher Scoring iterations: 5
anova(low1_1, low1_4, test = 'Chi')
# A tibble: 2 x 5
  `Resid. Df` `Resid. Dev`    Df Deviance `Pr(>Chi)`
*       <dbl>        <dbl> <dbl>    <dbl>      <dbl>
1         180         201.    NA    NA        NA    
2         179         200.     1     1.17      0.280

5.5 Logistic Regression - Multivariate, Simplify

No interactions are significant Remove non-significant main effects

5.5.1 Removing ui

low1_5 <- glm(low ~ age + lwt + race + smoke + ptl + ht,
                 family = binomial(logit), 
                 data = lowbwt_clean)

summary(low1_5)

Call:
glm(formula = low ~ age + lwt + race + smoke + ptl + ht, family = binomial(logit), 
    data = lowbwt_clean)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7699  -0.8502  -0.5482   0.9691   2.1526  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)  
(Intercept)  0.74110    1.17816   0.629   0.5293  
age         -0.03194    0.03620  -0.882   0.3777  
lwt         -0.01560    0.00694  -2.248   0.0246 *
raceBlack    1.21986    0.52575   2.320   0.0203 *
raceOther    0.87396    0.43368   2.015   0.0439 *
smokeYes     0.93253    0.39737   2.347   0.0189 *
ptl          0.64623    0.33954   1.903   0.0570 .
htYes        1.72251    0.69012   2.496   0.0126 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 234.67  on 188  degrees of freedom
Residual deviance: 204.11  on 181  degrees of freedom
AIC: 220.11

Number of Fisher Scoring iterations: 4

age and ptl are theoretically meaningful variables, should probably retain them

Revise so that age is interpreted in 5-year and lwt in 20 lb increments

low1_6 <- glm(low ~ I(age*(1/5)) + I(lwt*(1/20)) + race + smoke + ptl + ht,
              family = binomial(logit), 
              data = lowbwt_clean)

summary(low1_6)

Call:
glm(formula = low ~ I(age * (1/5)) + I(lwt * (1/20)) + race + 
    smoke + ptl + ht, family = binomial(logit), data = lowbwt_clean)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7699  -0.8502  -0.5482   0.9691   2.1526  

Coefficients:
                Estimate Std. Error z value Pr(>|z|)  
(Intercept)       0.7411     1.1782   0.629   0.5293  
I(age * (1/5))   -0.1597     0.1810  -0.882   0.3777  
I(lwt * (1/20))  -0.3120     0.1388  -2.248   0.0246 *
raceBlack         1.2199     0.5258   2.320   0.0203 *
raceOther         0.8740     0.4337   2.015   0.0439 *
smokeYes          0.9325     0.3974   2.347   0.0189 *
ptl               0.6462     0.3395   1.903   0.0570 .
htYes             1.7225     0.6901   2.496   0.0126 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 234.67  on 188  degrees of freedom
Residual deviance: 204.11  on 181  degrees of freedom
AIC: 220.11

Number of Fisher Scoring iterations: 4

5.5.2 Several \(R^2\) measures with the pscl::pR2() function

pscl::pR2(low1_6)
         llh      llhNull           G2     McFadden         r2ML 
-102.0532300 -117.3359981   30.5655362    0.1302479    0.1493227 
        r2CU 
   0.2099904 

5.5.3 Parameter Estiamtes

Parameters Exponentiated:

sjPlot::tab_model(low1_6,
                  emph.p = TRUE,
                  pred.labels = c("(Intercept)",
                                  "Age: 5 years",
                                  "Weight: 20 lbs",
                                  "Race: Black vs. White",
                                  "Race: Other vs. White",
                                  "Smoker During pregnancy",
                                  "History of Premature Labor",
                                  "History of Hypertension")) 
  low
Predictors Odds Ratios CI p
(Intercept) 2.10 0.22 – 22.38 0.529
Age: 5 years 0.85 0.59 – 1.21 0.378
Weight: 20 lbs 0.73 0.55 – 0.95 0.025
Race: Black vs. White 3.39 1.21 – 9.68 0.020
Race: Other vs. White 2.40 1.04 – 5.73 0.044
Smoker During pregnancy 2.54 1.18 – 5.66 0.019
History of Premature Labor 1.91 0.99 – 3.82 0.057
History of Hypertension 5.60 1.50 – 23.71 0.013
Observations 189
Cox & Snell’s R2 / Nagelkerke’s R2 0.149 / 0.210