---
title: "A Logistic Regression Equation to Predict the Onset of Diabetes in Pima Indian Women"
subtitle: "Development, Validation, and Comparison"
author: "Girish Palya (425998) and Karina Norvoish (409636)"
date: "`r Sys.Date()`"
output:
  tufte::tufte_html: default
  tufte::tufte_handout:
    citation_package: natbib
    latex_engine: xelatex
  tufte::tufte_book:
    citation_package: natbib
    latex_engine: xelatex
bibliography: skeleton.bib
link-citations: yes
---
```{r setup, include=FALSE, warning=F, message=F}
library(tufte)
library(tidyverse)
library(knitr)
#library(kableExtra)

# invalidate cache when the tufte version changes
knitr::opts_chunk$set(tidy = FALSE, cache.extra = packageVersion('tufte'))
options(htmltools.dir.version = FALSE)

# Modification of arm::display() that prints coefficient estimates, standard
# errors, and deviance without echoing the model formula.
display <- function(object, digits = 4, detail = FALSE) {
  out <- NULL
  summ <- summary(object, dispersion = object$dispersion)
  if (detail) {
    coef <- summ$coef[, , drop = FALSE]
    out$z.value <- coef[, 3]
    out$p.value <- coef[, 4]
  } else {
    coef <- summ$coef[, 1:2, drop = FALSE]
  }
  dimnames(coef)[[2]][1:2] <- c("coef.est", "coef.se")
  out$n <- summ$df[1] + summ$df[2]
  out$k <- summ$df[1]
  out$coef <- coef[, "coef.est"]
  out$se <- coef[, "coef.se"]
  arm::pfround(coef, digits)
  out$deviance <- summ$deviance
  out$null.deviance <- summ$null.deviance
  cat("---\n")
  cat(paste(" n = ", out$n, ", k = ", out$k,
            "\n residual deviance = ", arm::fround(out$deviance, 1),
            ", null deviance = ", arm::fround(out$null.deviance, 1),
            "\n (difference = ", arm::fround(summ$null.deviance - summ$deviance, 1),
            ")", "\n", sep = ""))
  out$dispersion <- if (is.null(object$dispersion)) {
    summ$dispersion
  } else {
    object$dispersion
  }
  if (out$dispersion != 1) {
    cat(paste(" overdispersion parameter = ",
              arm::fround(out$dispersion, 1), "\n", sep = ""))
    if (family(object)$family == "gaussian") {
      out$sigma.hat <- sqrt(out$dispersion)
      cat(paste(" residual sd is sqrt(overdispersion) = ",
                arm::fround(out$sigma.hat, digits), "\n", sep = ""))
    }
  }
  return(invisible(out))
}
```
# Objective
To develop and validate an empirical equation to predict the onset of diabetes
in Pima Indian women, and to compare the results to a prior attempt
by [Smith et al.](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2245318/pdf/procascamc00018-0276.pdf)^[JW Smith,
JE Everhart, WC Dickson,
WC Knowler, RS Johannes. 1988. *Using the ADAP Learning Algorithm to Forecast
the Onset of Diabetes Mellitus*]
# Research Design and Methods
A predictive equation was developed using logistic regression analysis and
data collected from 768 Pima Indian women. The equation incorporated age,
BMI, capillary plasma glucose, a hereditary function, and
pregnancy count as independent covariates for diagnosing onset of
diabetes within 5 years. The
equation was evaluated using binned residual plots^[A. Gelman and J. Hill,
_Data Analysis Using Regression and Multilevel/Hierarchical Models_,
Cambridge University Press, 2007.]. Its predictive performance
was compared against the results obtained by Smith et al.
# Results
The predictive equation was calculated using logistic regression
parameters as follows: $P(diagnosis = 1) = 1/(1 + e^{-x})$, where
_x_ = -19.457 + 0.046(*blood glucose in mg/dl*) + 3.352(log(*BMI in kg/m²*))
+ 0.118(*pregnancy count*) + 4.175(*hereditary factor*) +
0.012(*age in years*) -
0.025(*blood glucose : hereditary factor*).
At a threshold of 0.5 for positive diagnosis, the equation's sensitivity was 88%,
specificity was 59%, and error rate was 22%.
At a cut-off point of 0.35 for positive diagnosis, the equation's sensitivity was 76% and
specificity was 74%, while the error rate remained the same.
When the model was
trained using 576 randomly selected cases (with the logistic regression
coefficients recalculated) and prediction was performed on the remaining
192 cases, the sensitivity remained the same while the
specificity varied in the range of 72-74% (over 100 repetitions).
The area under the
ROC curve was 84%. In contrast, Smith et al. report a sensitivity and
specificity
of 76% when the ADAP algorithm was trained on 576 cases.
They did not report the error rate or the area under the ROC curve; only
the shape of the ROC curve was reported.
# Conclusion
The performance of the logistic regression model compares favorably with
neural network models such as the ADAP learning algorithm. Careful selection
of input variables, transformations, and interactions can result
in a logistic regression model whose performance is
on par with more sophisticated techniques.
---
<br>
`r newthought('Diabetes mellitus')` afflicts
nearly 9% of the world population (463 million) and causes 4 million
deaths every year. The ability to forecast its onset is central to many
medical situations involving care and management. Although
many sophisticated models have been developed for discriminant analysis,
recent empirical comparisons indicate that standard methods such as
logistic regression work very well^[C B Begg. *Statistical Methods in Medical Diagnosis*].
# Data and Related Work
`r newthought('Smith et al')`.^[JW Smith, JE Everhart, WC Dickson,
WC Knowler, RS Johannes. 1988. *Using the ADAP Learning Algorithm to Forecast
the Onset of Diabetes Mellitus*] have used the ADAP Learning Algorithm to forecast
the onset of diabetes mellitus.
They describe ADAP as "an adaptive learning routine that generates and executes digital analogs of perceptron-like devices".
Data used by Smith et al. is available from [Kaggle](https://www.kaggle.com/uciml/pima-indians-diabetes-database) via UCI Machine Learning Repository.
`r margin_note('A subset of data.')`
```{r echo=T}
diabetes <- read.csv(
  str_c("https://raw.githubusercontent.com/",
        "girishji/Pima/master/data/diabetes.csv"))
head(diabetes) %>% kable()
```
## Study Population
`r newthought('The population')` for this study was the Pima Indian population near
Phoenix, Arizona. That population has been under continuous
study since 1965 by the
[National Institute of Diabetes and Digestive and Kidney Diseases](https://www.niddk.nih.gov/)
because of its high incidence rate
of diabetes. Each community resident over 5 years of age
was asked to undergo a standardized examination every two years,
which included an oral glucose tolerance test. Diabetes was
diagnosed according to
World Health Organization Criteria^[World Health Organization, *Report of a Study Group: Diabetes Mellitus*. World Health Organization Technical
Report Series. Geneva, 727, 1985.];
that is, if the 2 hour post-load plasma glucose was at least 200
mg/dl (11.1 mmol/l) at any survey examination or if the
[Indian Health Service Hospital](https://www.ihs.gov/) serving the
community found a glucose
concentration of at least 200 mg/dl during the course of routine
medical care^[Knowler, W.C., P.H. Bennett, R.F. Hamman, and M.
Miller. 1978. *Diabetes incidence and prevalence in Pima
Indians: a 19-fold greater incidence than in Rochester, Minnesota*.
Am J Epidemiol 108:497-505.].
## Variables
`r newthought('The subjects')` of the study are Pima Indian women over 21 years of age.
The following explanatory variables are found to be risk factors
for diabetes.
1. `Pregnancies`: Number of times pregnant
2. `Glucose`: Plasma Glucose Concentration at 2 Hours in an Oral
Glucose Tolerance Test (GTIT) (in mg/dl)
3. `BloodPressure`: Diastolic Blood Pressure (mm Hg)
4. `SkinThickness`: Triceps Skin Fold Thickness (mm)
5. `Insulin`: 2-Hour Serum Insulin (μU/ml)
6. `BMI`: Body Mass Index (weight in kg / (height in m)²)
7. `DPF`^[Diabetes Pedigree Function was developed
by Smith et al.* to provide a measure of the expected
genetic influence of affected and unaffected relatives on the
subject's eventual diabetes
risk. It uses information from parents, grandparents,
full and half siblings, full and half aunts and uncles, and first
cousins.]: Diabetes Pedigree Function in the range of (0, 1)
`r margin_note("* JW Smith, JE Everhart, WC Dickson, WC Knowler, RS Johannes. 1988. *Using the ADAP Learning Algorithm to Forecast the Onset of Diabetes Mellitus*")`
8. `Age`: Age (years)
The binary dependent variable (`Diagnosis`) describes
the onset of non-insulin-dependent diabetes
mellitus (NIDDM) within a five-year period. This variable is 1 if the diagnosis
is positive within five years, and 0 otherwise.
## Observations from Data
`r newthought('Preliminary examination')` of data reveals the following:
- There are many spurious 0 values in the data, especially in `Insulin` and
`SkinThickness`. These values skew the mean and cause excessive outliers.
Zero values are presumed to be errors in the data, and suitable remedies
will be employed to address the skew.
- There are 500 negative instances (65.1%)
of the regressand (`Diagnosis`), compared to 268 positive instances (34.9%);
see the check after this list.
Even though only the logit model is considered in this study, a probit
model would likely produce a similar result, since the CDFs
underlying the binomial logit and probit
models differ most in the tails.
- Density plots indicate that not all distributions are normal. The data are skewed
towards younger subjects. `Pregnancies` is also skewed towards
subjects with fewer pregnancies. `Insulin` and `SkinThickness` are
distorted by spurious zero values.
`BMI`, `Insulin`, and `SkinThickness` have long tails on the right side.
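The class balance quoted above can be checked directly. The following is a minimal sketch; it assumes only the `diabetes` data frame loaded earlier.

```{r}
# Counts and proportions of the binary outcome: 0 = no diabetes, 1 = diabetes.
table(diabetes$Diagnosis)
round(prop.table(table(diabetes$Diagnosis)), 3)
```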
`r margin_note('Zero values in some variables indicate errors in data.')`
```{r echo=T}
diabetes %>%
  summarise(across(everything(), ~ sum(.x == 0))) %>%
  pivot_longer(everything(), names_to = 'Variable',
               values_to = 'Count of Zero Values in Column') %>%
  knitr::kable()
```
<br>
```{r fig.cap = "Box plot showing spurious outliers in `Insulin` and `SkinThickness`, and skewed mean (red dot).", cache=TRUE, echo = T}
diabetes %>%
  select_at(vars(!Diagnosis)) %>%
  pivot_longer(everything(), names_to = 'Variable',
               values_to = 'Value') %>%
  ggplot(aes(x = Variable, y = Value, fill = Variable)) +
  geom_jitter(size = 0.1, alpha = 0.2) +
  stat_summary(fun = mean, geom = "point",
               shape = 20, size = 4,
               color = "red", fill = "red") +
  geom_boxplot(alpha = 0.3) +
  theme_light() +
  theme(legend.position = "none") +
  xlab('') +
  ylab('')
```
```{r fig.cap = "Density plot of regressors (including spurious zero values).", cache=TRUE, echo=T, fig.height=3}
diabetes %>%
  select_at(vars(!Diagnosis)) %>%
  pivot_longer(everything(), names_to = 'Regressor',
               values_to = 'Value') %>%
  ggplot(aes(x = Value, group = Regressor, fill = Regressor)) +
  geom_density(alpha = 0.3) +
  facet_wrap(~ Regressor, scales = "free", nrow = 2) +
  theme_light() +
  theme(legend.position = "none") +
  xlab('') +
  ylab('')
```
```{r warning=F, message=F, fig.cap = "Correlation among regressors.", cache=TRUE, fig.margin = TRUE, echo = T}
# Correlation matrix displayed on the right side.
diabetes %>%
rename(Preg = 1, BloodPr = 3, SkinTh = 4) %>%
GGally::ggcorr(method = c("everything", "pearson"))
```
`r newthought('Scatter plots and correlation matrix')` reveal the following:
- Among all the regressors, `Glucose` concentration has the highest correlation (0.467) with the onset of diabetes.
- `Age` is not strongly correlated (0.238) with the onset of diabetes. This seems to suggest either that the accumulation of unhealthy lifestyle habits is not a factor, or that such factors have already expressed themselves by the age of 21.
- Diabetes Pedigree Function (`DPF`) is not highly correlated with onset of diabetes, which suggests that hereditary factors may be less important.
- `Insulin` and `SkinThickness` show significant correlation (.437), but this may be a result of both variables having too many spurious zero values.
- Presence of spurious zero values in `Insulin` could be masking its correlation with `Glucose`.
- None of the explanatory variables are correlated with
the response variable in a dominating way.
```{r warning=F, message=F, fig.cap = "Scatter plots and correlation matrix of regressors.", cache=TRUE, echo=T}
diabetes %>%
  GGally::ggpairs(lower = list(continuous =
    GGally::wrap("points", alpha = 0.3, size = 0.1)))
```
# Model
`r newthought('Logistic regression')` is the standard way to model binary outcomes. The probability that $Diagnosis = 1$ is modeled as
$$P(Diagnosis_i = 1) = logit^{−1}(X_iβ),$$
under the assumption that the outcomes $Diagnosis_i$ are independent given these probabilities.
$Xβ$ is the linear predictor. The function $logit^{-1}(x) = \frac{e^x}{1+e^x}$ transforms
the continuous values to the range (0, 1), which is necessary since probabilities must be between 0 and 1.
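As a quick numerical illustration (a minimal sketch; the input values are arbitrary), the inverse logit maps any real-valued linear predictor onto (0, 1):

```{r}
# arm::invlogit implements e^x / (1 + e^x), mapping the real line onto (0, 1).
arm::invlogit(c(-5, -1, 0, 1, 5))
```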
In building the regression model, all input variables that are expected to play a
significant role in predicting the outcome are included; their interactions are
also included if the variables have
large effects. Statistically non-significant variables are retained as long as their
coefficients have the expected sign, and excluded otherwise.
The strategy outlined by
Gelman and Hill^[page 69, Andrew Gelman and
Jennifer Hill, _Data Analysis Using Regression and Multilevel/Hierarchical Models, 2006._]
is followed since it is intuitive. Essentially, a simple model is considered
first and additional complexity is progressively included, taking care to check
for problems along the way.
## Logistic regression with just one predictor
`r newthought('Glucose level in blood ')` is known to be a strong predictor of onset of diabetes, and the correlation
matrix reflects this relationship.
Fitting the model using just `Glucose`:^[display() is a [modification](https://github.com/girishji/Pima/blob/master/Report.Rmd) of arm::display() to remove the echo of formula.]
```{r}
fit.1 <- glm(Diagnosis ~ Glucose, diabetes,
family = binomial(link = "logit"))
display(fit.1)
```
The coefficient for `Glucose` is 0.0379, which seems small, but `Glucose` is measured
in mg/dl and has a mean value of about 120 mg/dl, so the cumulative effect on the
linear predictor is substantial (see the sketch below).
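As a quick sanity check, the fitted single-predictor model can be used to compare predicted probabilities at two plausible glucose levels. This is a minimal sketch; it assumes `fit.1` from the chunk above, and the glucose values 100 and 150 mg/dl are chosen purely for illustration.

```{r}
# Predicted probability of diabetes at glucose = 100 vs 150 mg/dl,
# using the single-predictor fit from above.
arm::invlogit(coef(fit.1)[1] + coef(fit.1)[2] * c(100, 150))
```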
## Graphing the fitted model
```{r warning=F, message=F,fig.width = 5, fig.height = 3}
diabetes %>% mutate(Predicted = fitted(fit.1)) %>%
filter(Glucose != 0) %>%
ggplot(aes(Glucose, Diagnosis)) + geom_point() +
stat_smooth(aes(Glucose, Predicted), se = F) +
ylab('Probability of diabetes') + theme_light()
```
```{r echo=T, message=F, fig.margin=T, fig.cap="In the range of data, the solid line shows the best-fit logistic regression, and the light lines show uncertainty in the fit."}
# Best fit and uncertainty of the logistic regression,
# shown on the right side.
sim.1 <- arm::sim(fit.1)
plot_uncertainty <- function(.data) {
  plt <- .data %>%
    mutate(Predicted = fitted(fit.1)) %>%
    filter(Glucose != 0)
  # Add 10 simulated curves drawn from the coefficient distribution.
  for (j in 1:10) {
    plt <- plt %>%
      mutate(!!sym(str_c('s_', j)) :=
               arm::invlogit(sim.1@coef[j, 1] +
                               sim.1@coef[j, 2] * Glucose))
  }
  plt <- plt %>%
    ggplot(aes(Glucose, Diagnosis)) + geom_point()
  for (j in 1:10) {
    plt <- plt + stat_smooth(aes(Glucose, !!sym(str_c('s_', j))),
                             se = F, color = "gray")
  }
  plt <- plt +
    stat_smooth(aes(Glucose, Predicted), se = F) +
    labs(y = "Probability (diabetes)", title = "") +
    theme_light()
  return(plt)
}
plot_uncertainty(diabetes)
```
## Interpreting the logistic regression coefficients
Our model is $$P(diabetes) = logit^{−1}(-5.3501 + 0.0379 · Glucose)$$
- The constant term can be interpreted when `Glucose` = 0, in which case the probability of
diagnosing diabetes is $logit^{−1}(-5.3501) = `r round(arm::invlogit(-5.3501), digits=4)`.$
However, a `Glucose` level of 0 does not make sense, so the constant term is not interpreted on its own.
- The predictive difference with respect to `Glucose` is evaluated by computing the
derivative at the average value of `Glucose` in the data set, which is 120.9 mg/dl. The
value of the linear predictor here is -5.3501 + 0.0379 · 120.9 = -0.76799, and so
the slope of the curve at this point is $0.0379e^{-0.76799}/(1+e^{-0.76799})^2$ = 0.0082.
Thus, adding 10 mg/dl to `Glucose` corresponds to a positive difference in the probability of
diagnosing diabetes of about 8.2% (see the sketch after this list).
- Considering the statistical significance of the coefficient for `Glucose`, the
slope is estimated well; the standard error is only 0.0033, which is tiny compared
to the coefficient estimate of 0.0379. The approximate 95% (2 standard errors) interval
is [0.0313, 0.0445]. This is statistically significant.
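The slope computation in the second bullet can be verified numerically. This is a minimal sketch; it assumes `fit.1` and the `diabetes` data frame from the earlier chunks.

```{r}
# Derivative of invlogit(a + b*x) with respect to x is b * e^(a+bx) / (1 + e^(a+bx))^2.
b <- coef(fit.1)
x_bar <- mean(diabetes$Glucose)    # about 120.9 mg/dl
eta <- b[1] + b[2] * x_bar         # linear predictor at the mean glucose level
slope <- b[2] * exp(eta) / (1 + exp(eta))^2
slope        # about 0.0082 per mg/dl
10 * slope   # about 0.082, i.e. roughly 8.2% per 10 mg/dl
```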
## Adding a second input variable
`r newthought('We extend the model by adding BMI')`. The coefficient is expected
to be positive.
::: {.fullwidth}
```{r}
fit.2 <- glm(Diagnosis ~ Glucose + BMI, diabetes,
family = binomial(link = "logit"))
display(fit.2)
```
:::
Comparing two individuals, the one with 1 mg/dl higher blood glucose has a
0.0352 higher log-odds (logit) of a positive diabetes diagnosis. Similarly, an increase of 1 kg/m² in
BMI corresponds to an increase of 0.0763 in the log-odds of diagnosis. Both
coefficients are statistically significant, each being more than 2 standard errors
away from zero. And both their signs make sense: glucose level and BMI are known
risk factors for diabetes.
For a quick interpretation, the “divide by 4 rule”^[page 82, Andrew Gelman and
Jennifer Hill, _Data Analysis Using Regression and Multilevel/Hierarchical Models, 2006._]
comes in handy. Applying the rule to the coefficients, a 1 mg/dl increase in glucose
leads to an increase in the probability
of diabetes diagnosis of at most 0.88%. Similarly, a 1 kg/m² increase in BMI increases the probability
of diabetes diagnosis by at most 1.9%.
Comparing these two coefficients, it may appear that `BMI` is the more important factor.
But this is **incorrect**. The standard deviation of `BMI` is 7.9, while
for `Glucose` it is 31.9.
The logistic regression coefficients corresponding to 1-standard-deviation differences
are 0.0352 x 31.9 for `Glucose` and 0.0763 x 7.9 for `BMI`, respectively. Again applying
the "divide by 4 rule", a 1-standard-deviation difference in `Glucose` yields a 28%
increase in probability, while for `BMI` it is only 15%.
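These standardized comparisons can be reproduced directly from the fitted object. The following is a minimal sketch; it assumes `fit.2` and the `diabetes` data frame from the chunks above.

```{r}
# "Divide by 4" upper bound on the change in probability for a
# 1-standard-deviation change in each predictor.
sds <- c(Glucose = sd(diabetes$Glucose), BMI = sd(diabetes$BMI))
coef(fit.2)[c("Glucose", "BMI")] * sds / 4
```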
## Comparing the coefficient estimates when adding a predictor
The coefficient for `Glucose` changes from 0.0379 in the single-predictor model to 0.0352.
This is because people who have high `Glucose` are likely to have higher `BMI`.
The two factors have a small positive correlation (0.221) as indicated
in the correlation matrix.
## Adding additional input variables
`r newthought('Pregnancies')` is added to the model and checked for
significance.
::: {.fullwidth}
```{r}
fit.3 <- glm(Diagnosis ~ Glucose + BMI + Pregnancies,
diabetes, family = binomial(link = "logit"))
display(fit.3)
```
:::
The number of pregnancies has a small standard error and is statistically significantly
different from 0; its 95% interval is (0.0835, 0.1907). It is not obvious why a high number of
pregnancies, in and of itself, is a risk factor.
`r newthought('DPF')`, the hereditary factor, is added to the model.
Although diabetes mellitus is not
genetic per se, DNA may influence the risk of developing it. This type of
diabetes tends to run in families.
::: {.fullwidth}
```{r}
fit.4 <- glm(Diagnosis ~ Glucose + BMI + Pregnancies + DPF,
diabetes, family = binomial(link = "logit"))
display(fit.4)
```
:::
DPF has a positive coefficient as expected, albeit with a relatively large standard error.
With a 95% interval of (0.3179, 1.4847), it is still statistically
significantly different from 0.
`r newthought('Age')` is generally not known to be a risk factor for
this type of diabetes.
::: {.fullwidth}
```{r}
fit.5 <- glm(Diagnosis ~ Glucose + BMI + Pregnancies +
DPF + Age,
diabetes, family = binomial(link = "logit"))
display(fit.5)
```
:::
`Age` is not statistically significant, since it has a large
standard error. However, its coefficient has the expected sign.
This input may not influence the predictive power of the model
very much, but it will not hurt either, so it is retained.
`r newthought('Blood Pressure')` is added to the model
and checked for significance.
::: {.fullwidth}
```{r}
fit.5 <- glm(Diagnosis ~ Glucose + BMI + Pregnancies +
DPF + Age + BloodPressure,
diabetes, family = binomial(link = "logit"))
display(fit.5)
```
:::
`BloodPressure` is only marginally statistically significant.
The standard error is large, and its 95% interval is
[-0.0237, -0.0033]. Moreover, it has a negative coefficient. It is
known that diabetes damages arteries
and makes them targets for hardening (atherosclerosis). This
can cause high blood pressure, which would suggest a positive sign
for the coefficient. This is a dubious input, and it will be
removed from the model.
`r newthought('SkinThickness and Insulin')` are added to the model.
It is worth noting beforehand that
both of these inputs have a proportionately large number of zeros
in their columns.
::: {.fullwidth}
```{r}
fit.6 <- glm(Diagnosis ~ Glucose + BMI + Pregnancies +
DPF + Age + SkinThickness + Insulin,
diabetes, family = binomial(link = "logit"))
display(fit.6)
```
:::
Both `SkinThickness` and `Insulin` suffer from large standard errors
and are not statistically significantly different from 0.
`SkinThickness` measures subcutaneous fat, so the sign of its coefficient
is expected to be positive. Given the unexpected negative
sign of the coefficient and the large standard error, in addition to
the occurrence
of spurious zeros in the data, this input variable is discarded
from the model.
`Insulin` is also excluded, since it has a large standard error
and a large number of observation errors.
Our model so far is as follows:
::: {.fullwidth}
```{r warning=F}
fit.7 <- glm(Diagnosis ~ Glucose + BMI + Pregnancies +
DPF + Age,
diabetes, family = binomial(link = "logit"))
stargazer::stargazer(fit.7, type = 'text', single.row = TRUE,
dep.var.caption = "")
```
:::
In order to understand the contribution of each term
to the linear predictor, each coefficient is multiplied by
the mean value of the corresponding input variable.
Mean values:
```{r}
mean_v <- diabetes %>%
select(!one_of('Diagnosis', 'SkinThickness',
'Insulin', 'BloodPressure')) %>%
summarise_all(mean)
mean_v %>% kable()
```
Multiplying the coefficients by the respective mean values of the input variables:
```{r}
mean_v %>% map2_df(names(.), ~ mean_v[[`.y`]] *
fit.7$coefficients[[`.y`]]) %>%
kable()
```
This shows that `Glucose` has the highest effect on the linear predictor,
followed by `BMI`. The other variables make only a small contribution.
## Check for interactions
`r newthought('A quick search of all possible two-factor interactions')` based
on reducing AIC
values reveals the following candidates
for inclusion.
It is not clear whether any of these interactions will improve
prediction, or whether they even make sense in combination.
More insight is needed about their behavior. This
topic will be revisited later.
```{r results=F}
search = step(fit.7, ~.^2)
```
::: {.fullwidth}
```{r}
search$anova
```
:::
## Evaluating, checking, and comparing fitted logistic regressions
`r newthought('Residuals')` for logistic regression are defined as observed minus expected values:
$$residual_i = y_i − E(y_i|X_i) = y_i − logit^{−1}(X_iβ).$$
Data values $y_i$ are discrete, while the $logit^{−1}(X_iβ)$ values are continuous.
For a given fitted value, the residual can therefore take only one of two values,
depending on whether $y_i$ is 0 or 1, so plots of raw residuals from logistic
regressions are not useful.
```{r}
plot(fit.7, which = 1)
```
A binned residual plot^[page 97, Andrew Gelman and Jennifer Hill, _Data Analysis Using Regression and Multilevel/Hierarchical Models, 2006._] is better.
From the authors:
> We plot binned residuals by dividing the data into categories (bins) based
on their fitted values, and then plotting the average residual versus the average fitted
value for each bin. ... here we divided the data into 40 bins of equal size.
The dotted lines (computed as $2\sqrt{p(1 − p)/n}$, where n is the number of points
per bin) indicate ±2 standard-error bounds, within which one would expect
about 95% of the binned residuals to fall, if the model were actually true.
>The bins are not equally spaced; rather, each bin has an equal number of
data points. The light lines in the binned residual plot indicate theoretical 95% error bounds.
In our model, 3 out of 27 bins fall outside the 95% error bounds, slightly more
than would be expected by chance if the model were true. All the outliers
are in the lower left quadrant.
A residual is the difference between the actual
and the predicted value.
Negative average residuals at fitted values below 0.2 indicate that, at low expected
probabilities, our model overpredicts diabetes (more false positives). This is also
observed later in the ROC curve.
```{marginfigure}
The dotted lines in the binned residual plot indicate theoretical 95% error bounds
that would be appropriate if the model were true.
```
```{r fig.width=6, fig.height=4}
arm::binnedplot(fitted(fit.7),
residuals(fit.7, type = "response"))
```
## Plotting binned residuals versus inputs of interest
`r newthought('To understand the deviation better')`, residuals are binned
with respect to
individual input variables, and plotted.
::: {.fullwidth}
```{r fig.fullwidth = TRUE, fig.width=7, fig.height=4}
par(mfrow = c(2, 3))
for (inp in c('Glucose', 'BMI', 'Pregnancies', 'DPF', 'Age')) {
  arm::binnedplot(diabetes[[inp]],
                  residuals(fit.7, type = "response"),
                  main = inp)
}
```
:::
Only `BMI` and `Age` exhibit outliers. `BMI`
has 6 outliers, and 4 of them are in the lower left quadrant (similar
to the fitted model).
The rising and falling pattern of the `BMI` residuals suggests that
a log transformation of `BMI` may help. Applying the log
transformation (after temporarily removing the 11 spurious 0 values
in the data) reduces the outlier count from 6 to 4.
::: {.fullwidth}
```{r fig.width=5, fig.height=4}
fit.8 <- glm(Diagnosis ~ Glucose + log(BMI) + Pregnancies +
               DPF + Age,
             diabetes %>% filter(BMI != 0),
             family = binomial(link = "logit"))
arm::binnedplot(fitted(fit.8),
                residuals(fit.8, type = "response"),
                main = 'Binned residuals with log(BMI)')
```
:::
To evaluate whether the log transformation improves predictive
power, confusion matrices are compared.
A confusion matrix, so named because it summarizes how the
model is confused, tabulates the different types of model errors,
such as false positives (Type 1 errors) and false
negatives (Type 2 errors).
Applying the log transformation increases false positives
from 119 to 120 and
reduces false negatives from 70 to 68.
This is a marginal improvement at best.
However, it does not hurt to keep the log
transformation on `BMI`.
```{r}
# With log(BMI)
caret::confusionMatrix(
factor(if_else(fitted(fit.8) > 0.34, 1, 0)),
factor(diabetes %>% filter(BMI != 0) %>%
pull(Diagnosis)))$table
```
```{r}
# Without any transformation on BMI
caret::confusionMatrix(
factor(if_else(fitted(fit.7) > 0.34, 1, 0)),
factor(diabetes %>% pull(Diagnosis)))$table
```
## Check for interactions (again)
An interaction increases or decreases the influence of an
individual input, depending on the sign of the interaction
coefficient relative to the sign of the input's own coefficient.
From the AIC analysis we have 4 candidates for inclusion, namely `Pregnancies:Age`,
`Glucose:DPF`, `Glucose:Age`, and `BMI:Pregnancies`.
Plotting the binned residuals after including
each of these interactions one at a time:
::: {.fullwidth}
```{r}
base <- "Diagnosis ~ Glucose + log(BMI) + Pregnancies + DPF + Age + "
par(mfrow = c(2, 2))
for (inter in c('Pregnancies:Age', 'Glucose:DPF',
                'Glucose:Age', 'BMI:Pregnancies')) {
  fit.temp <- glm(str_c(base, inter),
                  diabetes %>% filter(BMI != 0),
                  family = binomial(link = "logit"))
  arm::binnedplot(fitted(fit.temp),
                  residuals(fit.temp, type = "response"),
                  main = str_c('With ', inter))
}
```
:::
`Glucose:DPF` is the most promising, with only 2 outliers; the others
have at least 3. `Glucose:DPF` also has a tighter
arrangement of residuals around 0 (the horizontal axis).
`DPF` modifies the effect of `Glucose`, a highly significant input, in
a meaningful way (discussed later).
Incorporating `Glucose:DPF` into the model, we observe
only 2 outliers, both close to the 95% error boundary.
This is a reasonably
good model.
::: {.fullwidth}
```{r fig.width=6, fig.height=4}
fit.9 <- glm(Diagnosis ~ Glucose + log(BMI) + Pregnancies +
DPF + Age + Glucose:DPF,
diabetes %>% filter(BMI != 0),
family = binomial(link = "logit"))
arm::binnedplot(fitted(fit.9),
residuals(fit.9, type = "response"),
main = 'Binned residuals of fitted values')
```
:::
# Results
## Description of the model
In summary, our model is as follows:
$$\small P(Diagnosis_i = 1) = logit^{−1}(X_iβ)$$
where, $$\small Xβ = -19.457 + 0.046(Glucose) + 3.352(\log(BMI)) + 0.118(Pregnancies) +$$
$$\small 4.175(DPF) + 0.012(Age) - 0.025(Glucose:DPF)$$
::: {.fullwidth}
```{r warning=F}
formula <- Diagnosis ~ Glucose + log(BMI) + Pregnancies +
DPF + Age + DPF:Glucose
fit.f <- glm(formula, diabetes %>% filter(BMI != 0),
family = binomial(link = "logit"))
stargazer::stargazer(fit.f, type = 'text', single.row = TRUE,
dep.var.caption = "")
```
:::
The sign of the coefficient of `Glucose:DPF` is negative. This
interaction can be
interpreted in two ways:
- For every additional 0.5 increase in hereditary proclivity to
diabetes (`DPF`),
a value of `r -0.5*round(fit.f$coefficients[[7]], digits=3)` is subtracted
from the coefficient
of `Glucose`. The mean value of `Glucose` is `r round(mean(diabetes$Glucose), 2)`;
multiplying it by the coefficient of `Glucose` gives a contribution of
`r round(mean(diabetes$Glucose)*fit.f$coefficients[[2]],3)` to the linear predictor,
so the shift caused by an additional 0.5 increase in `DPF` is modest by comparison,
and the inverse logit function will flip only a small share of predicted values as a
result of this interaction (see the sketch after this list). In short, the
effect of blood glucose wanes as hereditary effects become
more prominent.
- For every additional 10 mg/dl increase in blood glucose, a value
of `r -10*round(fit.f$coefficients[[7]], digits=3)` is subtracted from
the coefficient of `DPF`; at the mean `DPF`, the contribution of `DPF` to the
linear predictor falls from
`r round(mean(diabetes$DPF)*4.175, 3)` to `r round(mean(diabetes$DPF)*3.925, 3)`. This
is consistent with the observation that as the blood glucose
level increases (due to poor eating habits, for instance),
heredity plays less of a role in determining
the onset of diabetes.
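The interaction can also be inspected numerically. The following is a minimal sketch; it assumes `fit.f` and the `diabetes` data frame from the chunk above, and the grid of `Glucose` and `DPF` values (and the use of medians for the remaining inputs) is chosen purely for illustration.

```{r}
# Hypothetical illustration: predicted probability over a range of glucose
# values, at DPF = 0.25 versus DPF = 1.0, with BMI, Pregnancies, and Age
# held at their medians.
pred_grid <- expand.grid(Glucose = c(100, 125, 150, 175),
                         DPF = c(0.25, 1.0)) %>%
  mutate(BMI = median(diabetes$BMI),
         Pregnancies = median(diabetes$Pregnancies),
         Age = median(diabetes$Age))
pred_grid$p <- predict(fit.f, newdata = pred_grid, type = "response")
pred_grid %>% select(Glucose, DPF, p) %>% kable()
```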
Owing to the log transformation, `BMI` enters the model multiplicatively:
every 10% increase in
`BMI` adds `r round(fit.f$coefficients[[3]] * log(1.1), 3)`
(`r round(fit.f$coefficients[[3]],3)` x $\log(1.1)$) to the linear predictor.
The inverse logit function then has to be applied to the linear predictor to
obtain the corresponding change in the predicted probability.
The interpretation of the other coefficients was covered in previous
sections.
## Error rate and comparison to the null model
The error rate is defined as the proportion of cases for
which the deterministic prediction (guess $y_i = 1$ if
$logit^{−1}(X_iβ) \geq 0.5$, and $y_i = 0$ otherwise) is wrong.
Our model has an error rate of 22%, compared to 35%
for the null model.
::: {.fullwidth}
```{r}
# Our model
error_rate <- function(predicted, reference, threshold = 0.5) {
  round(sum((predicted >= threshold & reference == 0) |
              (predicted < threshold & reference == 1)) /
          length(reference) * 100, digits = 2)
}
error_rate(fitted(fit.f), diabetes %>% filter(BMI != 0) %>%
             pull(Diagnosis))
```
```{r}
# Null model
round(min(sum(diabetes$Diagnosis == 1),
sum(diabetes$Diagnosis == 0)) /
length(diabetes$Diagnosis) * 100, digits = 2)
```
:::
## ROC curve
A receiver operating characteristic (ROC) curve is a graphical representation
of the trade-offs between type-1 (false positive) and type-2 (false
negative) errors. AUC (area under the curve) summarizes how
the model performs across decision thresholds.
In diagnostic medicine, testing the hypothesis that the ROC Curve area or partial
area has a specific value is a common practice^[*Statistical Methods in
Diagnostic Medicine*, Second Edition, Ch.4,
Author(s): Xiao‐Hua Zhou Nancy A. Obuchowski Donna K. McClish].
Our model has an AUC of 84%, which is reasonably good.
::: {.fullwidth}
```{r message=F, fig.width=6, fig.height=4}
plot(pROC::roc(diabetes %>% filter(BMI != 0) %>%
pull(Diagnosis), fitted(fit.f)),
col = "#0042A0",
print.thres = c(0.5, 0.345), print.auc = T)
```
:::
## Sensitivity and Specificity
Sensitivity measures the proportion of correct forecasts
of the presence of diabetes (true positive rate) among all positive
diagnoses, while specificity measures
the proportion of correct forecasts of the absence of diabetes (true negative rate)
among all negative diagnoses.
At a cut-off point of 0.5 for fitted values (refer to the ROC curve above),
the sensitivity is 88% and the specificity is 59%. By adjusting
the cut-off point for a positive diagnosis to 0.345, a sensitivity
of 76% and a specificity of 74% can be achieved.
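The underlying counts at the two cut-off points can be tabulated directly. The following is a minimal sketch; it assumes `fit.f` and the `diabetes` data frame from the earlier chunks.

```{r}
# Cross-tabulation of predicted class (rows) against observed Diagnosis
# (columns) at the two cut-off points discussed above.
truth <- diabetes %>% filter(BMI != 0) %>% pull(Diagnosis)
table(Predicted = as.integer(fitted(fit.f) >= 0.5), Observed = truth)
table(Predicted = as.integer(fitted(fit.f) >= 0.345), Observed = truth)
```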
## Comparison with Smith et al.
Smith et al. describe their methodology as follows:
> "Once the algorithm had been trained using 576 cases, ADAP was used to forecast whether another 192 test cases would develop diabetes within five years. Forcing ADAP to conclude on all test cases produced a sensitivity and specificity of 76 percent. A receiver operating characteristic (ROC) curve was determined."
In order to make a direct comparison with Smith et al., the data are
split into two groups: a
training set of 576 observations and a test set of the remaining
192 observations. The model is trained on the training set (new
logistic regression coefficients are calculated), and the forecasting performance
of the trained model on the test set is assessed in terms of error rate,
sensitivity, and specificity. This procedure is repeated 100 times,
and summary statistics are presented for the error rate, sensitivity,
and specificity.
Smith et al. report the sensitivity, specificity, and ROC curve obtained when ADAP
was applied to all test cases. The logistic model's sensitivity, specificity,
and ROC curve, when fitted to the full
data set, have already been presented in the earlier sections. Smith et al.
did not report the error rate of their learning algorithm.
The forecasting evaluation procedure is presented below:
_Define helper functions_
Define a function to split the data randomly into a training set
of 576 observations and
a test set of the remaining 192 observations.
```{r}
# Split data (randomly) into training and
# test data sets.
split_data <- function(sample_size = 576) {
  out <- list()
  out$training <- diabetes %>%
    mutate(id = row_number()) %>%
    sample_n(sample_size) %>%
    filter(BMI != 0)
  out$test <- diabetes %>%
    mutate(id = row_number()) %>%
    anti_join(out$training, by = 'id') %>%
    filter(BMI != 0)
  return(out)
}
```
Define a prediction function: using the training set, estimate the
regression coefficients, then use
the trained model on the test data to predict the outcome of
the diabetes diagnosis.
```{r}
# Fit the model on the training data, validate on
# test data, and return predicted results.
predict_diabetes <- function(training_data, test_data) {
  formula <- Diagnosis ~ Glucose + log(BMI) +
    Pregnancies + DPF + Age + DPF:Glucose
  fit.model <- glm(formula, training_data,
                   family = binomial(link = "logit"))
  shadow_val <- predict(fit.model, newdata = test_data)
  return(arm::invlogit(shadow_val))
}
}
```
_Error rate_
Compare the
predicted outcome to the observed data (truth)
and report the error rate. The procedure is repeated 100 times to reveal
the distribution characteristics.
::: {.fullwidth}
```{r}
validate_error_rate <- function(threshold) {
  return(c(1:100) %>%
           map_dbl(~ {
             splitted <- split_data()
             predicted <- predict_diabetes(splitted$training, splitted$test)
             error_rate(predicted, splitted$test$Diagnosis,
                        threshold = threshold)
           }))
}
result <- validate_error_rate(threshold = 0.5)
summary(result)
```
:::
At a prediction threshold of 0.5 for a positive diagnosis, the average
error rate is `r str_c(round(summary(result)[[4]], digits = 1), '%')`.
Predictive performance depends on the threshold used to decide a positive
diagnosis. A threshold of 0.35 (instead of 0.5) improves specificity at
the expense of sensitivity; the average error rate is also slightly higher.
```{r}
summary(validate_error_rate(threshold = 0.35))
```
_Sensitivity_
Sensitivity is the ability of the model to correctly