## Settings for RMarkdown http://yihui.name/knitr/options#chunk_options
opts_chunk$set(comment = "", warning = FALSE, message = FALSE, tidy = FALSE,
echo = T, fig.width = 5, fig.height = 5)
options(width = 100, scipen = 5, digits = 5)
setwd("~/statistics/Rmedstats/")
Automated model selection is a controvertial method. Use with care if you do.
step() function in R is based on AIC, but F-test-based method is more common in other statistical environments.
http://www.umass.edu/statdata/statdata/data/lowbwt.txt
SOURCE: Hosmer and Lemeshow (2000) Applied Logistic Regression: Second Edition. These data are copyrighted by John Wiley & Sons Inc. and must be acknowledged and used accordingly. Data were collected at Baystate Medical Center, Springfield, Massachusetts during 1986.
library(gdata)
lbw <- read.xls("http://www.umass.edu/statdata/statdata/data/lowbwt.xls")
names(lbw) <- tolower(names(lbw))
## Recoding
lbw <- within(lbw, {
## race relabeling
race.cat <- factor(race, levels = 1:3, labels = c("White","Black","Other"))
## ftv (frequency of visit) relabeling
ftv.cat <- cut(ftv, breaks = c(-Inf, 0, 2, Inf), labels = c("None","Normal","Many"))
ftv.cat <- relevel(ftv.cat, ref = "Normal")
## ptl
preterm <- factor(ptl >= 1, levels = c(F,T), labels = c("0","1+"))
})
lm.full <- lm(bwt ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat, data = lbw)
lm.null <- lm(bwt ~ 1, data = lbw)
drop1() shows the partial F-test p-value for each variable. The same p-value can be obtained by using anova(model.full, model.one.fewer.var) to compare two models. Remove the least significant variable using update() and do drop1() again until all remaining variables is significant at a pre-specified cutoff (0.1 here).
## age is the least significant by partial F test
drop1(lm.full, test = "F")
Single term deletions
Model:
bwt ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 74494960 2457
age 1 38343 74533303 2455 0.09 0.76248
lwt 1 2508944 77003904 2461 5.99 0.01532 *
race.cat 2 5560980 80055939 2467 6.64 0.00165 **
smoke 1 3329939 77824899 2463 7.96 0.00534 **
preterm 1 971457 75466416 2458 2.32 0.12939
ht 1 3346518 77841478 2463 8.00 0.00522 **
ui 1 5425727 79920686 2468 12.96 0.00041 ***
ftv.cat 2 380072 74875032 2454 0.45 0.63577
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Now ftv.cat is the least significant
drop1(update(lm.full, ~ . -age), test = "F")
Single term deletions
Model:
bwt ~ lwt + race.cat + smoke + preterm + ht + ui + ftv.cat
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 74533303 2455
lwt 1 2483344 77016647 2459 5.96 0.01557 *
race.cat 2 5607620 80140923 2465 6.73 0.00151 **
smoke 1 3295772 77829075 2461 7.92 0.00545 **
preterm 1 1052971 75586274 2456 2.53 0.11355
ht 1 3323302 77856605 2462 7.98 0.00526 **
ui 1 5390566 79923869 2466 12.95 0.00041 ***
ftv.cat 2 369667 74902970 2452 0.44 0.64224
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Now preterm is least significat at p = 0.12.
drop1(update(lm.full, ~ . -age -ftv.cat), test = "F")
Single term deletions
Model:
bwt ~ lwt + race.cat + smoke + preterm + ht + ui
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 74902970 2452
lwt 1 2413556 77316526 2456 5.83 0.01673 *
race.cat 2 6248590 81151560 2463 7.55 0.00071 ***
smoke 1 3933172 78836142 2460 9.50 0.00237 **
preterm 1 1008759 75911729 2453 2.44 0.12020
ht 1 3440574 78343544 2459 8.31 0.00441 **
ui 1 5376658 80279628 2463 12.99 0.00040 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Now all variables are significant at p < 0.1
drop1(update(lm.full, ~ . -age -ftv.cat -preterm), test = "F")
Single term deletions
Model:
bwt ~ lwt + race.cat + smoke + ht + ui
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 75911729 2453
lwt 1 2671613 78583342 2457 6.41 0.01223 *
race.cat 2 6674129 82585858 2465 8.00 0.00047 ***
smoke 1 4911219 80822948 2463 11.77 0.00074 ***
ht 1 3583850 79495579 2459 8.59 0.00381 **
ui 1 6327025 82238754 2466 15.17 0.00014 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Show summary for final model
summary(update(lm.full, ~ . -age -ftv.cat -preterm))
Call:
lm(formula = bwt ~ lwt + race.cat + smoke + ht + ui, data = lbw)
Residuals:
Min 1Q Median 3Q Max
-1843 -433 67 461 1631
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2837.64 243.63 11.65 < 2e-16 ***
lwt 4.24 1.68 2.53 0.01223 *
race.catBlack -475.81 145.58 -3.27 0.00129 **
race.catOther -350.00 112.34 -3.12 0.00213 **
smoke -354.90 103.43 -3.43 0.00074 ***
ht -585.11 199.61 -2.93 0.00381 **
ui -524.44 134.65 -3.89 0.00014 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 646 on 182 degrees of freedom
Multiple R-squared: 0.24, Adjusted R-squared: 0.215
F-statistic: 9.59 on 6 and 182 DF, p-value: 0.00000000366
add1() shows the partial F-test p-value for each variable. The same p-value can be obtained by using anova(model.null, model.one.more.var) to compare two models. Add the most significant variable using update() and do add1() again until all remaining variables is non-significant at a pre-specified cutoff (0.1 here).
In the end, it resulted in the same model.
## ui is the most significant variable
add1(lm.null, scope = ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat, test = "F")
Single term additions
Model:
bwt ~ 1
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 99917053 2493
age 1 806927 99110126 2493 1.52 0.2188
lwt 1 3448881 96468171 2488 6.69 0.0105 *
race.cat 2 5070608 94846445 2487 4.97 0.0079 **
smoke 1 3573406 96343646 2488 6.94 0.0092 **
preterm 1 4757523 95159530 2485 9.35 0.0026 **
ht 1 2132014 97785038 2491 4.08 0.0449 *
ui 1 8028747 91888305 2479 16.34 0.000077 ***
ftv.cat 2 2082321 97834732 2493 1.98 0.1410
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Now race.cat is the most significant
add1(update(lm.null, ~ . +ui), scope = ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat, test = "F")
Single term additions
Model:
bwt ~ ui
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 91888305 2479
age 1 472355 91415950 2480 0.96 0.3282
lwt 1 2076990 89811315 2477 4.30 0.0395 *
race.cat 2 4767394 87120911 2473 5.06 0.0072 **
smoke 1 2949940 88938365 2475 6.17 0.0139 *
preterm 1 2837049 89051257 2475 5.93 0.0159 *
ht 1 3162469 88725836 2474 6.63 0.0108 *
ftv.cat 2 1847816 90040489 2479 1.90 0.1527
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Now smoke is the most significant
add1(update(lm.null, ~ . +ui +race.cat), scope = ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat, test = "F")
Single term additions
Model:
bwt ~ ui + race.cat
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 87120911 2473
age 1 57041 87063871 2475 0.12 0.72884
lwt 1 2234424 84886488 2470 4.84 0.02900 *
smoke 1 6079888 81041024 2461 13.80 0.00027 ***
preterm 1 2651610 84469302 2469 5.78 0.01724 *
ht 1 2688781 84432130 2469 5.86 0.01646 *
ftv.cat 2 1158673 85962238 2474 1.23 0.29373
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Now ht is the most significant
add1(update(lm.null, ~ . +ui +race.cat +smoke), scope = ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat, test = "F")
Single term additions
Model:
bwt ~ ui + race.cat + smoke
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 81041024 2461
age 1 326 81040698 2463 0.00 0.978
lwt 1 1545445 79495579 2459 3.56 0.061 .
preterm 1 1338799 79702225 2460 3.07 0.081 .
ht 1 2457682 78583342 2457 5.72 0.018 *
ftv.cat 2 331205 80709819 2464 0.37 0.689
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Now lwt is the most significant
add1(update(lm.null, ~ . +ui +race.cat +smoke +ht), scope = ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat, test = "F")
Single term additions
Model:
bwt ~ ui + race.cat + smoke + ht
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 78583342 2457
age 1 882 78582460 2459 0.00 0.964
lwt 1 2671613 75911729 2453 6.41 0.012 *
preterm 1 1266816 77316526 2456 2.98 0.086 .
ftv.cat 2 244671 78338671 2461 0.28 0.754
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Now no variable is significant at p < 0.1
add1(update(lm.null, ~ . +ui +race.cat +smoke +ht +lwt), scope = ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat, test = "F")
Single term additions
Model:
bwt ~ ui + race.cat + smoke + ht + lwt
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 75911729 2453
age 1 108807 75802922 2454 0.26 0.61
preterm 1 1008759 74902970 2452 2.44 0.12
ftv.cat 2 325455 75586274 2456 0.39 0.68
## Show summary for final model
summary(update(lm.null, ~ . +ui +race.cat +smoke +ht +lwt))
Call:
lm(formula = bwt ~ ui + race.cat + smoke + ht + lwt, data = lbw)
Residuals:
Min 1Q Median 3Q Max
-1843 -433 67 461 1631
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2837.64 243.63 11.65 < 2e-16 ***
ui -524.44 134.65 -3.89 0.00014 ***
race.catBlack -475.81 145.58 -3.27 0.00129 **
race.catOther -350.00 112.34 -3.12 0.00213 **
smoke -354.90 103.43 -3.43 0.00074 ***
ht -585.11 199.61 -2.93 0.00381 **
lwt 4.24 1.68 2.53 0.01223 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 646 on 182 degrees of freedom
Multiple R-squared: 0.24, Adjusted R-squared: 0.215
F-statistic: 9.59 on 6 and 182 DF, p-value: 0.00000000366
library(rms)
ols.full <- ols(bwt ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat, data = lbw)
fastbw(ols.full, rule = "p", sls = 0.1)
Deleted Chi-Sq d.f. P Residual d.f. P AIC R2
age 0.09 1 0.7621 0.09 1 0.7621 -1.91 0.254
ftv.cat 0.88 2 0.6430 0.97 3 0.8073 -5.03 0.250
preterm 2.41 1 0.1205 3.39 4 0.4955 -4.61 0.240
Approximate Estimates after Deleting Factors
Coef S.E. Wald Z P
Intercept 2837.638 244.047 11.627 0.0000000
lwt 4.239 1.678 2.527 0.0115179
race.cat=Black -475.808 145.824 -3.263 0.0011029
race.cat=Other -349.998 112.532 -3.110 0.0018696
smoke -354.900 103.601 -3.426 0.0006134
ht -585.112 199.948 -2.926 0.0034300
ui -524.439 134.880 -3.888 0.0001010
Factors in Final Model
[1] lwt race.cat smoke ht ui
model.aic.backward <- step(lm.full, direction = "backward", trace = 1)
Start: AIC=2457.2
bwt ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat
Df Sum of Sq RSS AIC
- ftv.cat 2 380072 74875032 2454
- age 1 38343 74533303 2455
<none> 74494960 2457
- preterm 1 971457 75466416 2458
- lwt 1 2508944 77003904 2461
- smoke 1 3329939 77824899 2463
- ht 1 3346518 77841478 2463
- race.cat 2 5560980 80055939 2467
- ui 1 5425727 79920686 2468
Step: AIC=2454.1
bwt ~ age + lwt + race.cat + smoke + preterm + ht + ui
Df Sum of Sq RSS AIC
- age 1 27938 74902970 2452
<none> 74875032 2454
- preterm 1 927891 75802922 2454
- lwt 1 2422527 77297558 2458
- ht 1 3464300 78339331 2461
- smoke 1 3957207 78832239 2462
- race.cat 2 6089519 80964551 2465
- ui 1 5404596 80279627 2465
Step: AIC=2452.2
bwt ~ lwt + race.cat + smoke + preterm + ht + ui
Df Sum of Sq RSS AIC
<none> 74902970 2452
- preterm 1 1008759 75911729 2453
- lwt 1 2413556 77316526 2456
- ht 1 3440574 78343544 2459
- smoke 1 3933172 78836142 2460
- ui 1 5376658 80279628 2463
- race.cat 2 6248590 81151560 2463
summary(model.aic.backward)
Call:
lm(formula = bwt ~ lwt + race.cat + smoke + preterm + ht + ui,
data = lbw)
Residuals:
Min 1Q Median 3Q Max
-1886.4 -441.0 53.5 494.2 1620.9
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2871.99 243.67 11.79 <2e-16 ***
lwt 4.04 1.67 2.42 0.0167 *
race.catBlack -466.32 145.13 -3.21 0.0016 **
race.catOther -335.68 112.28 -2.99 0.0032 **
smoke -323.57 104.96 -3.08 0.0024 **
preterm1+ -208.44 133.50 -1.56 0.1202
ht -573.69 198.96 -2.88 0.0044 **
ui -489.96 135.93 -3.60 0.0004 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 643 on 181 degrees of freedom
Multiple R-squared: 0.25, Adjusted R-squared: 0.221
F-statistic: 8.64 on 7 and 181 DF, p-value: 0.00000000396
model.aic.forward <- step(lm.null, direction = "forward", trace = 1, scope = ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat)
Start: AIC=2492.7
bwt ~ 1
Df Sum of Sq RSS AIC
+ ui 1 8028747 91888305 2479
+ preterm 1 4757523 95159530 2485
+ race.cat 2 5070608 94846445 2487
+ smoke 1 3573406 96343646 2488
+ lwt 1 3448881 96468171 2488
+ ht 1 2132014 97785038 2491
<none> 99917053 2493
+ ftv.cat 2 2082321 97834732 2493
+ age 1 806927 99110126 2493
Step: AIC=2478.8
bwt ~ ui
Df Sum of Sq RSS AIC
+ race.cat 2 4767394 87120911 2473
+ ht 1 3162469 88725836 2474
+ smoke 1 2949940 88938365 2475
+ preterm 1 2837049 89051257 2475
+ lwt 1 2076990 89811315 2477
<none> 91888305 2479
+ ftv.cat 2 1847816 90040489 2479
+ age 1 472355 91415950 2480
Step: AIC=2472.8
bwt ~ ui + race.cat
Df Sum of Sq RSS AIC
+ smoke 1 6079888 81041024 2461
+ ht 1 2688781 84432130 2469
+ preterm 1 2651610 84469302 2469
+ lwt 1 2234424 84886488 2470
<none> 87120911 2473
+ ftv.cat 2 1158673 85962238 2474
+ age 1 57041 87063871 2475
Step: AIC=2461.1
bwt ~ ui + race.cat + smoke
Df Sum of Sq RSS AIC
+ ht 1 2457682 78583342 2457
+ lwt 1 1545445 79495579 2459
+ preterm 1 1338799 79702225 2460
<none> 81041024 2461
+ age 1 326 81040698 2463
+ ftv.cat 2 331205 80709819 2464
Step: AIC=2457.3
bwt ~ ui + race.cat + smoke + ht
Df Sum of Sq RSS AIC
+ lwt 1 2671613 75911729 2453
+ preterm 1 1266816 77316526 2456
<none> 78583342 2457
+ age 1 882 78582460 2459
+ ftv.cat 2 244671 78338671 2461
Step: AIC=2452.7
bwt ~ ui + race.cat + smoke + ht + lwt
Df Sum of Sq RSS AIC
+ preterm 1 1008759 74902970 2452
<none> 75911729 2453
+ age 1 108807 75802922 2454
+ ftv.cat 2 325455 75586274 2456
Step: AIC=2452.2
bwt ~ ui + race.cat + smoke + ht + lwt + preterm
Df Sum of Sq RSS AIC
<none> 74902970 2452
+ age 1 27938 74875032 2454
+ ftv.cat 2 369667 74533303 2455
summary(model.aic.forward)
Call:
lm(formula = bwt ~ ui + race.cat + smoke + ht + lwt + preterm,
data = lbw)
Residuals:
Min 1Q Median 3Q Max
-1886.4 -441.0 53.5 494.2 1620.9
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2871.99 243.67 11.79 <2e-16 ***
ui -489.96 135.93 -3.60 0.0004 ***
race.catBlack -466.32 145.13 -3.21 0.0016 **
race.catOther -335.68 112.28 -2.99 0.0032 **
smoke -323.57 104.96 -3.08 0.0024 **
ht -573.69 198.96 -2.88 0.0044 **
lwt 4.04 1.67 2.42 0.0167 *
preterm1+ -208.44 133.50 -1.56 0.1202
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 643 on 181 degrees of freedom
Multiple R-squared: 0.25, Adjusted R-squared: 0.221
F-statistic: 8.64 on 7 and 181 DF, p-value: 0.00000000396
model.aic.both <- step(lm.null, direction = "both", trace = 1, scope = ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat)
Start: AIC=2492.7
bwt ~ 1
Df Sum of Sq RSS AIC
+ ui 1 8028747 91888305 2479
+ preterm 1 4757523 95159530 2485
+ race.cat 2 5070608 94846445 2487
+ smoke 1 3573406 96343646 2488
+ lwt 1 3448881 96468171 2488
+ ht 1 2132014 97785038 2491
<none> 99917053 2493
+ ftv.cat 2 2082321 97834732 2493
+ age 1 806927 99110126 2493
Step: AIC=2478.8
bwt ~ ui
Df Sum of Sq RSS AIC
+ race.cat 2 4767394 87120911 2473
+ ht 1 3162469 88725836 2474
+ smoke 1 2949940 88938365 2475
+ preterm 1 2837049 89051257 2475
+ lwt 1 2076990 89811315 2477
<none> 91888305 2479
+ ftv.cat 2 1847816 90040489 2479
+ age 1 472355 91415950 2480
- ui 1 8028747 99917053 2493
Step: AIC=2472.8
bwt ~ ui + race.cat
Df Sum of Sq RSS AIC
+ smoke 1 6079888 81041024 2461
+ ht 1 2688781 84432130 2469
+ preterm 1 2651610 84469302 2469
+ lwt 1 2234424 84886488 2470
<none> 87120911 2473
+ ftv.cat 2 1158673 85962238 2474
+ age 1 57041 87063871 2475
- race.cat 2 4767394 91888305 2479
- ui 1 7725534 94846445 2487
Step: AIC=2461.1
bwt ~ ui + race.cat + smoke
Df Sum of Sq RSS AIC
+ ht 1 2457682 78583342 2457
+ lwt 1 1545445 79495579 2459
+ preterm 1 1338799 79702225 2460
<none> 81041024 2461
+ age 1 326 81040698 2463
+ ftv.cat 2 331205 80709819 2464
- smoke 1 6079888 87120911 2473
- ui 1 6534324 87575348 2474
- race.cat 2 7897341 88938365 2475
Step: AIC=2457.3
bwt ~ ui + race.cat + smoke + ht
Df Sum of Sq RSS AIC
+ lwt 1 2671613 75911729 2453
+ preterm 1 1266816 77316526 2456
<none> 78583342 2457
+ age 1 882 78582460 2459
+ ftv.cat 2 244671 78338671 2461
- ht 1 2457682 81041024 2461
- smoke 1 5848788 84432130 2469
- race.cat 2 7314184 85897527 2470
- ui 1 7354645 85937987 2472
Step: AIC=2452.7
bwt ~ ui + race.cat + smoke + ht + lwt
Df Sum of Sq RSS AIC
+ preterm 1 1008759 74902970 2452
<none> 75911729 2453
+ age 1 108807 75802922 2454
+ ftv.cat 2 325455 75586274 2456
- lwt 1 2671613 78583342 2457
- ht 1 3583850 79495579 2459
- smoke 1 4911219 80822948 2463
- race.cat 2 6674129 82585858 2465
- ui 1 6327025 82238754 2466
Step: AIC=2452.2
bwt ~ ui + race.cat + smoke + ht + lwt + preterm
Df Sum of Sq RSS AIC
<none> 74902970 2452
- preterm 1 1008759 75911729 2453
+ age 1 27938 74875032 2454
+ ftv.cat 2 369667 74533303 2455
- lwt 1 2413556 77316526 2456
- ht 1 3440574 78343544 2459
- smoke 1 3933172 78836142 2460
- ui 1 5376658 80279628 2463
- race.cat 2 6248590 81151560 2463
summary(model.aic.both)
Call:
lm(formula = bwt ~ ui + race.cat + smoke + ht + lwt + preterm,
data = lbw)
Residuals:
Min 1Q Median 3Q Max
-1886.4 -441.0 53.5 494.2 1620.9
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2871.99 243.67 11.79 <2e-16 ***
ui -489.96 135.93 -3.60 0.0004 ***
race.catBlack -466.32 145.13 -3.21 0.0016 **
race.catOther -335.68 112.28 -2.99 0.0032 **
smoke -323.57 104.96 -3.08 0.0024 **
ht -573.69 198.96 -2.88 0.0044 **
lwt 4.04 1.67 2.42 0.0167 *
preterm1+ -208.44 133.50 -1.56 0.1202
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 643 on 181 degrees of freedom
Multiple R-squared: 0.25, Adjusted R-squared: 0.221
F-statistic: 8.64 on 7 and 181 DF, p-value: 0.00000000396
glm.full <- glm(low ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat, data = lbw, family = binomial)
glm.null <- glm(low ~ 1, data = lbw, family = binomial)
## ftv.cat least significant
drop1(glm.full, test = "LRT")
Single term deletions
Model:
low ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat
Df Deviance AIC LRT Pr(>Chi)
<none> 196 218
age 1 196 216 0.87 0.3513
lwt 1 201 221 5.15 0.0233 *
race.cat 2 201 219 5.79 0.0554 .
smoke 1 199 219 3.32 0.0683 .
preterm 1 203 223 7.36 0.0067 **
ht 1 202 222 6.80 0.0091 **
ui 1 198 218 2.40 0.1215
ftv.cat 2 197 215 1.33 0.5141
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## age least significant
drop1(update(glm.full, ~ . -ftv.cat), test = "LRT")
Single term deletions
Model:
low ~ age + lwt + race.cat + smoke + preterm + ht + ui
Df Deviance AIC LRT Pr(>Chi)
<none> 197 215
age 1 198 214 1.02 0.3130
lwt 1 202 218 5.00 0.0254 *
race.cat 2 203 217 6.41 0.0406 *
smoke 1 201 217 4.41 0.0357 *
preterm 1 204 220 7.11 0.0076 **
ht 1 204 220 7.18 0.0074 **
ui 1 199 215 2.32 0.1279
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## ui least significant
drop1(update(glm.full, ~ . -ftv.cat -age), test = "LRT")
Single term deletions
Model:
low ~ lwt + race.cat + smoke + preterm + ht + ui
Df Deviance AIC LRT Pr(>Chi)
<none> 198 214
lwt 1 204 218 5.96 0.0146 *
race.cat 2 206 218 7.61 0.0222 *
smoke 1 203 217 4.72 0.0299 *
preterm 1 204 218 6.37 0.0116 *
ht 1 205 219 7.31 0.0069 **
ui 1 200 214 2.63 0.1048
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## All significant at 0.1 level
drop1(update(glm.full, ~ . -ftv.cat -age -ui), test = "LRT")
Single term deletions
Model:
low ~ lwt + race.cat + smoke + preterm + ht
Df Deviance AIC LRT Pr(>Chi)
<none> 200 214
lwt 1 207 219 6.68 0.0097 **
race.cat 2 208 218 7.47 0.0238 *
smoke 1 205 217 4.91 0.0266 *
preterm 1 208 220 7.77 0.0053 **
ht 1 207 219 6.56 0.0104 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Result
summary(update(glm.full, ~ . -ftv.cat -age -ui))
Call:
glm(formula = low ~ lwt + race.cat + smoke + preterm + ht, family = binomial,
data = lbw)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.819 -0.803 -0.546 0.967 2.153
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.09462 0.95704 0.10 0.9212
lwt -0.01673 0.00695 -2.41 0.0161 *
race.catBlack 1.26372 0.52933 2.39 0.0170 *
race.catOther 0.86418 0.43509 1.99 0.0470 *
smoke 0.87611 0.40071 2.19 0.0288 *
preterm1+ 1.23144 0.44625 2.76 0.0058 **
ht 1.76744 0.70841 2.49 0.0126 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 234.67 on 188 degrees of freedom
Residual deviance: 200.48 on 182 degrees of freedom
AIC: 214.5
Number of Fisher Scoring iterations: 4
## preterm is the most significant variable
add1(glm.null, scope = ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat, test = "LRT")
Single term additions
Model:
low ~ 1
Df Deviance AIC LRT Pr(>Chi)
<none> 235 237
age 1 232 236 2.76 0.09665 .
lwt 1 229 233 5.98 0.01446 *
race.cat 2 230 236 5.01 0.08166 .
smoke 1 230 234 4.87 0.02737 *
preterm 1 222 226 12.77 0.00035 ***
ht 1 231 235 4.02 0.04491 *
ui 1 230 234 5.08 0.02426 *
ftv.cat 2 231 237 3.94 0.13914
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Now age is the most significant
add1(update(glm.null, ~ . +preterm), scope = ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat, test = "LRT")
Single term additions
Model:
low ~ preterm
Df Deviance AIC LRT Pr(>Chi)
<none> 222 226
age 1 217 223 4.60 0.032 *
lwt 1 218 224 4.40 0.036 *
race.cat 2 217 225 4.88 0.087 .
smoke 1 219 225 2.57 0.109
ht 1 218 224 4.24 0.040 *
ui 1 219 225 2.78 0.096 .
ftv.cat 2 217 225 4.61 0.100 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Now ht is the most significant
add1(update(glm.null, ~ . +preterm +age), scope = ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat, test = "LRT")
Single term additions
Model:
low ~ preterm + age
Df Deviance AIC LRT Pr(>Chi)
<none> 217 223
lwt 1 214 222 2.98 0.084 .
race.cat 2 214 224 3.32 0.190
smoke 1 215 223 2.26 0.133
ht 1 213 221 4.18 0.041 *
ui 1 215 223 2.17 0.141
ftv.cat 2 214 224 3.22 0.200
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Now lwt is the most significant
add1(update(glm.null, ~ . +preterm +age +ht), scope = ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat, test = "LRT")
Single term additions
Model:
low ~ preterm + age + ht
Df Deviance AIC LRT Pr(>Chi)
<none> 213 221
lwt 1 207 217 5.69 0.017 *
race.cat 2 210 222 3.06 0.216
smoke 1 211 221 2.23 0.135
ui 1 210 220 3.00 0.084 .
ftv.cat 2 210 222 2.73 0.255
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Now no variable is significant at p < 0.1
add1(update(glm.null, ~ . +preterm +age +ht +lwt), scope = ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat, test = "LRT")
Single term additions
Model:
low ~ preterm + age + ht + lwt
Df Deviance AIC LRT Pr(>Chi)
<none> 207 217
race.cat 2 204 218 3.66 0.16
smoke 1 205 217 2.04 0.15
ui 1 205 217 2.28 0.13
ftv.cat 2 205 219 2.69 0.26
## Show summary for final model
summary(update(glm.null, ~ . +preterm +age +ht +lwt))
Call:
glm(formula = low ~ preterm + age + ht + lwt, family = binomial,
data = lbw)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.697 -0.815 -0.630 0.941 2.206
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.04619 1.07364 1.91 0.05667 .
preterm1+ 1.52899 0.44128 3.46 0.00053 ***
age -0.05648 0.03536 -1.60 0.11017
ht 1.82544 0.72031 2.53 0.01127 *
lwt -0.01536 0.00685 -2.24 0.02497 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 234.67 on 188 degrees of freedom
Residual deviance: 207.43 on 184 degrees of freedom
AIC: 217.4
Number of Fisher Scoring iterations: 4
lrm.full <- rms::lrm(low ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat, data = lbw)
fastbw(lrm.full, rule = "p", sls = 0.1)
Deleted Chi-Sq d.f. P Residual d.f. P AIC
ftv.cat 1.35 2 0.5094 1.35 2 0.5094 -2.65
age 0.96 1 0.3279 2.31 3 0.5114 -3.69
ui 2.59 1 0.1076 4.90 4 0.2982 -3.10
Approximate Estimates after Deleting Factors
Coef S.E. Wald Z P
Intercept 0.09885 0.954307 0.1036 0.917500
lwt -0.01632 0.006891 -2.3681 0.017881
race.cat=Black 1.23475 0.530166 2.3290 0.019860
race.cat=Other 0.83587 0.447618 1.8674 0.061851
smoke 0.84895 0.410321 2.0690 0.038547
preterm=1+ 1.19061 0.449355 2.6496 0.008059
ht 1.72064 0.705279 2.4397 0.014701
Factors in Final Model
[1] lwt race.cat smoke preterm ht
model.aic.backward <- step(glm.full, direction = "backward", trace = 1)
Start: AIC=217.5
low ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat
Df Deviance AIC
- ftv.cat 2 197 215
- age 1 196 216
<none> 196 218
- ui 1 198 218
- smoke 1 199 219
- race.cat 2 201 219
- lwt 1 201 221
- ht 1 202 222
- preterm 1 203 223
Step: AIC=214.83
low ~ age + lwt + race.cat + smoke + preterm + ht + ui
Df Deviance AIC
- age 1 198 214
<none> 197 215
- ui 1 199 215
- race.cat 2 203 217
- smoke 1 201 217
- lwt 1 202 218
- preterm 1 204 220
- ht 1 204 220
Step: AIC=213.85
low ~ lwt + race.cat + smoke + preterm + ht + ui
Df Deviance AIC
<none> 198 214
- ui 1 200 214
- smoke 1 203 217
- race.cat 2 206 218
- lwt 1 204 218
- preterm 1 204 218
- ht 1 205 219
summary(model.aic.backward)
Call:
glm(formula = low ~ lwt + race.cat + smoke + preterm + ht + ui,
family = binomial, data = lbw)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.731 -0.784 -0.514 0.954 2.198
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.12533 0.96756 -0.13 0.8969
lwt -0.01592 0.00695 -2.29 0.0221 *
race.catBlack 1.30086 0.52848 2.46 0.0138 *
race.catOther 0.85441 0.44091 1.94 0.0526 .
smoke 0.86658 0.40447 2.14 0.0322 *
preterm1+ 1.12886 0.45039 2.51 0.0122 *
ht 1.86690 0.70737 2.64 0.0083 **
ui 0.75065 0.45882 1.64 0.1018
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 234.67 on 188 degrees of freedom
Residual deviance: 197.85 on 181 degrees of freedom
AIC: 213.9
Number of Fisher Scoring iterations: 4
model.aic.forward <- step(glm.null, direction = "forward", trace = 1, scope = ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat)
Start: AIC=236.67
low ~ 1
Df Deviance AIC
+ preterm 1 222 226
+ lwt 1 229 233
+ ui 1 230 234
+ smoke 1 230 234
+ ht 1 231 235
+ race.cat 2 230 236
+ age 1 232 236
<none> 235 237
+ ftv.cat 2 231 237
Step: AIC=225.9
low ~ preterm
Df Deviance AIC
+ age 1 217 223
+ lwt 1 218 224
+ ht 1 218 224
+ race.cat 2 217 225
+ ui 1 219 225
+ ftv.cat 2 217 225
+ smoke 1 219 225
<none> 222 226
Step: AIC=223.3
low ~ preterm + age
Df Deviance AIC
+ ht 1 213 221
+ lwt 1 214 222
+ smoke 1 215 223
+ ui 1 215 223
<none> 217 223
+ race.cat 2 214 224
+ ftv.cat 2 214 224
Step: AIC=221.12
low ~ preterm + age + ht
Df Deviance AIC
+ lwt 1 207 217
+ ui 1 210 220
+ smoke 1 211 221
<none> 213 221
+ race.cat 2 210 222
+ ftv.cat 2 210 222
Step: AIC=217.43
low ~ preterm + age + ht + lwt
Df Deviance AIC
+ ui 1 205 217
+ smoke 1 205 217
<none> 207 217
+ race.cat 2 204 218
+ ftv.cat 2 205 219
Step: AIC=217.15
low ~ preterm + age + ht + lwt + ui
Df Deviance AIC
<none> 205 217
+ smoke 1 203 217
+ race.cat 2 201 217
+ ftv.cat 2 202 218
summary(model.aic.forward)
Call:
glm(formula = low ~ preterm + age + ht + lwt + ui, family = binomial,
data = lbw)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.693 -0.783 -0.612 0.931 2.218
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.74560 1.09167 1.60 0.1098
preterm1+ 1.42123 0.44873 3.17 0.0015 **
age -0.05331 0.03555 -1.50 0.1337
ht 1.90580 0.71601 2.66 0.0078 **
lwt -0.01437 0.00682 -2.11 0.0351 *
ui 0.69193 0.45428 1.52 0.1277
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 234.67 on 188 degrees of freedom
Residual deviance: 205.15 on 183 degrees of freedom
AIC: 217.2
Number of Fisher Scoring iterations: 4
model.aic.both <- step(lm.null, direction = "both", trace = 1, scope = ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat)
Start: AIC=2492.7
bwt ~ 1
Df Sum of Sq RSS AIC
+ ui 1 8028747 91888305 2479
+ preterm 1 4757523 95159530 2485
+ race.cat 2 5070608 94846445 2487
+ smoke 1 3573406 96343646 2488
+ lwt 1 3448881 96468171 2488
+ ht 1 2132014 97785038 2491
<none> 99917053 2493
+ ftv.cat 2 2082321 97834732 2493
+ age 1 806927 99110126 2493
Step: AIC=2478.8
bwt ~ ui
Df Sum of Sq RSS AIC
+ race.cat 2 4767394 87120911 2473
+ ht 1 3162469 88725836 2474
+ smoke 1 2949940 88938365 2475
+ preterm 1 2837049 89051257 2475
+ lwt 1 2076990 89811315 2477
<none> 91888305 2479
+ ftv.cat 2 1847816 90040489 2479
+ age 1 472355 91415950 2480
- ui 1 8028747 99917053 2493
Step: AIC=2472.8
bwt ~ ui + race.cat
Df Sum of Sq RSS AIC
+ smoke 1 6079888 81041024 2461
+ ht 1 2688781 84432130 2469
+ preterm 1 2651610 84469302 2469
+ lwt 1 2234424 84886488 2470
<none> 87120911 2473
+ ftv.cat 2 1158673 85962238 2474
+ age 1 57041 87063871 2475
- race.cat 2 4767394 91888305 2479
- ui 1 7725534 94846445 2487
Step: AIC=2461.1
bwt ~ ui + race.cat + smoke
Df Sum of Sq RSS AIC
+ ht 1 2457682 78583342 2457
+ lwt 1 1545445 79495579 2459
+ preterm 1 1338799 79702225 2460
<none> 81041024 2461
+ age 1 326 81040698 2463
+ ftv.cat 2 331205 80709819 2464
- smoke 1 6079888 87120911 2473
- ui 1 6534324 87575348 2474
- race.cat 2 7897341 88938365 2475
Step: AIC=2457.3
bwt ~ ui + race.cat + smoke + ht
Df Sum of Sq RSS AIC
+ lwt 1 2671613 75911729 2453
+ preterm 1 1266816 77316526 2456
<none> 78583342 2457
+ age 1 882 78582460 2459
+ ftv.cat 2 244671 78338671 2461
- ht 1 2457682 81041024 2461
- smoke 1 5848788 84432130 2469
- race.cat 2 7314184 85897527 2470
- ui 1 7354645 85937987 2472
Step: AIC=2452.7
bwt ~ ui + race.cat + smoke + ht + lwt
Df Sum of Sq RSS AIC
+ preterm 1 1008759 74902970 2452
<none> 75911729 2453
+ age 1 108807 75802922 2454
+ ftv.cat 2 325455 75586274 2456
- lwt 1 2671613 78583342 2457
- ht 1 3583850 79495579 2459
- smoke 1 4911219 80822948 2463
- race.cat 2 6674129 82585858 2465
- ui 1 6327025 82238754 2466
Step: AIC=2452.2
bwt ~ ui + race.cat + smoke + ht + lwt + preterm
Df Sum of Sq RSS AIC
<none> 74902970 2452
- preterm 1 1008759 75911729 2453
+ age 1 27938 74875032 2454
+ ftv.cat 2 369667 74533303 2455
- lwt 1 2413556 77316526 2456
- ht 1 3440574 78343544 2459
- smoke 1 3933172 78836142 2460
- ui 1 5376658 80279628 2463
- race.cat 2 6248590 81151560 2463
summary(model.aic.both)
Call:
lm(formula = bwt ~ ui + race.cat + smoke + ht + lwt + preterm,
data = lbw)
Residuals:
Min 1Q Median 3Q Max
-1886.4 -441.0 53.5 494.2 1620.9
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2871.99 243.67 11.79 <2e-16 ***
ui -489.96 135.93 -3.60 0.0004 ***
race.catBlack -466.32 145.13 -3.21 0.0016 **
race.catOther -335.68 112.28 -2.99 0.0032 **
smoke -323.57 104.96 -3.08 0.0024 **
ht -573.69 198.96 -2.88 0.0044 **
lwt 4.04 1.67 2.42 0.0167 *
preterm1+ -208.44 133.50 -1.56 0.1202
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 643 on 181 degrees of freedom
Multiple R-squared: 0.25, Adjusted R-squared: 0.221
F-statistic: 8.64 on 7 and 181 DF, p-value: 0.00000000396