Display system information and load tidyverse and faraway packages

sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.4
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] compiler_3.6.2  magrittr_1.5    tools_3.6.2     htmltools_0.4.0
##  [5] yaml_2.2.1      Rcpp_1.0.4.6    stringi_1.4.6   rmarkdown_2.1  
##  [9] knitr_1.28      stringr_1.4.0   xfun_0.13       digest_0.6.25  
## [13] rlang_0.4.5     evaluate_0.14
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.4
## ✓ tibble  3.0.0     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(faraway)

faraway package contains the datasets in the ELMR book.

Latent variables

tibble(t = seq(-6, 6, 0.1), pdf = dlogis(t, location = 0, scale = 1)) %>%
  ggplot() + 
  geom_line(mapping = aes(x = t, y = pdf)) + 
  labs(x = "t", y = "Density", title = "Logistic(0, 1) distribution")

Bliss data

bliss data records the number of insects dying at different levels of insecticide concentration.

bliss
##   dead alive conc
## 1    2    28    0
## 2    8    22    1
## 3   15    15    2
## 4   23     7    3
## 5   27     3    4

We fit bliss data using different link functions

mlogit   <- glm(cbind(dead, alive) ~ conc, family = binomial,                 data = bliss)
mprobit  <- glm(cbind(dead, alive) ~ conc, family = binomial(link = probit),  data = bliss)
mcloglog <- glm(cbind(dead, alive) ~ conc, family = binomial(link = cloglog), data = bliss)
mcauchit <- glm(cbind(dead, alive) ~ conc, family = binomial(link = cauchit), data = bliss)

and compare their deviances (equivalent to comparing their log-likelihoods)

# logLik(mlogit)
# logLik(mprobit)
# logLik(mcloglog)
# logLik(mcauchit)
mlogit$deviance
## [1] 0.3787483
mprobit$deviance
## [1] 0.3136684
mcloglog$deviance
## [1] 2.230479
mcauchit$deviance
## [1] 1.609425

Probit seems to give a better fit.

tibble(conc    = bliss$conc,
       logit   = predict(mlogit  , type = "response"),
       probit  = predict(mprobit , type = "response"),
       cloglog = predict(mcloglog, type = "response"),
       cauchit = predict(mcauchit, type = "response"))
## # A tibble: 5 x 5
##    conc  logit probit cloglog cauchit
##   <int>  <dbl>  <dbl>   <dbl>   <dbl>
## 1     0 0.0892 0.0842   0.127   0.119
## 2     1 0.238  0.245    0.250   0.213
## 3     2 0.5    0.498    0.455   0.506
## 4     3 0.762  0.752    0.722   0.791
## 5     4 0.911  0.914    0.933   0.882

we don’t see vast difference in the predictions from 4 link functions.

df <- tibble(conc = seq(-4, 8, 0.2))
tibble(dose    = df$conc,
       logit   = predict(mlogit  , type = "response", newdata = df),
       probit  = predict(mprobit , type = "response", newdata = df),
       cloglog = predict(mcloglog, type = "response", newdata = df),
       cauchit = predict(mcauchit, type = "response", newdata = df)) %>%
  pivot_longer(logit:cauchit, names_to = "link", values_to = "pred") %>%
  ggplot() + 
  geom_line(mapping = aes(x = dose, y = pred, color = link)) + 
  labs(x = "Dose", y = "Predicted probability", 
       title = "Predicions from different link functions")

Prospective and retrospective sampling

babyfood
##   disease nondisease  sex   food
## 1      77        381  Boy Bottle
## 2      19        128  Boy  Suppl
## 3      47        447  Boy Breast
## 4      48        336 Girl Bottle
## 5      16        111 Girl  Suppl
## 6      31        433 Girl Breast
xtabs(disease / (disease + nondisease) ~ sex + food, babyfood)
##       food
## sex        Bottle     Breast      Suppl
##   Boy  0.16812227 0.09514170 0.12925170
##   Girl 0.12500000 0.06681034 0.12598425
library(gtsummary)

babyfood %>%
  # change reference level to Breast and Girl
  mutate(food = relevel(food, ref = "Breast"), sex = relevel(sex, ref = "Girl")) %>% 
  glm(cbind(disease, nondisease) ~ sex + food, family = binomial, data = .) %>%
  tbl_regression(intercept = TRUE, exponentiate = TRUE)
Characteristic OR1 95% CI1 p-value
(Intercept) 0.07 0.06, 0.10 <0.001
sex
Girl
Boy 1.37 1.04, 1.81 0.027
food
Breast
Bottle 1.95 1.45, 2.64 <0.001
Suppl 1.64 1.06, 2.49 0.022

1 OR = Odds Ratio, CI = Confidence Interval

The regression coefficient 0.67 for Bottle respresents the increased risk of developing respiratory disease incurred by bottle feeding relative to breast feeding. Similarly the regression coefficient 0.31 represents the increased risk of boys relative to girls.

babyfood %>%
  # change reference level to Breast and Girl
  mutate(food = relevel(food, ref = "Breast"), sex = relevel(sex, ref = "Girl")) %>%
  pivot_longer(disease:nondisease, names_to = "dis", values_to = "count") %>%
  mutate(dis = relevel(as.factor(dis), ref = "nondisease")) %>%
  glm(sex ~ dis + food, family = binomial, weights = count, data = .) %>%
  tbl_regression(intercept = TRUE, exponentiate = TRUE)
Characteristic OR1 95% CI1 p-value
(Intercept) 1.04 0.91, 1.18 0.6
dis
nondisease
disease 1.37 1.04, 1.81 0.027
food
Breast
Bottle 1.10 0.91, 1.32 0.3
Suppl 1.07 0.82, 1.40 0.6

1 OR = Odds Ratio, CI = Confidence Interval

Prediction

# prediction at linear predictor scale
predict(mlogit, newdata = tibble(conc = 2.5), type = "link", se.fit = TRUE)
## $fit
##         1 
## 0.5809475 
## 
## $se.fit
## [1] 0.2262995
## 
## $residual.scale
## [1] 1
# prediction at probability scale
predict(mlogit, newdata = tibble(conc = 2.5), type = "response", se.fit = TRUE)
## $fit
##         1 
## 0.6412854 
## 
## $se.fit
##          1 
## 0.05205758 
## 
## $residual.scale
## [1] 1

To manually check:

lmodsum <- summary(mlogit)
x0 <- c(1, 2.5)
eta0 <- sum(x0 * coef(mlogit))
ilogit(eta0)
## [1] 0.6412854

To quantify the uncertainty in this prediction, we need the matrix \(\left( \mathbf{X}^T \widehat{\mathbf{W}} \mathbf{X} \right)^{-1}\):

(cm <-lmodsum$cov.unscaled)
##             (Intercept)        conc
## (Intercept)  0.17463024 -0.06582336
## conc        -0.06582336  0.03291168

So the standard error of \(\widehat{\eta}\) is

(se <- sqrt(t(x0) %*% cm %*% x0))
##           [,1]
## [1,] 0.2262995

and a 95% confidence interval on the probability scale is

ilogit(c(eta0 - 1.96 * se, eta0 + 1.96 * se))
## [1] 0.5342962 0.7358471

Effective doses, ED50 and LD50

(ld50 <- - mlogit$coef[1] / mlogit$coef[2])
## (Intercept) 
##           2
dr <- c(- 1 / mlogit$coef[2], mlogit$coef[1] / mlogit$coef[2]^2)
(se <- sqrt(dr %*% lmodsum$cov.unscaled %*% dr)[1, 1])
## [1] 0.1784367

leading to a 95% confidence interval

c(ld50 - 1.96 * se, ld50 + 1.96 * se)
## (Intercept) (Intercept) 
##    1.650264    2.349736
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
dose.p(mlogit, p = c(0.25, 0.5, 0.75))
##               Dose        SE
## p = 0.25: 1.054465 0.2315932
## p = 0.50: 2.000000 0.1784367
## p = 0.75: 2.945535 0.2315932

Conditional logistic regression for matched case-control studies

(amlxray <- as_tibble(amlxray))
## # A tibble: 238 x 11
##    ID    disease Sex   downs   age Mray  MupRay MlowRay Fray  Cray  CnRay
##    <fct>   <dbl> <fct> <fct> <int> <fct> <fct>  <fct>   <fct> <fct> <ord>
##  1 7004        1 F     no        0 no    no     no      no    no    1    
##  2 7004        0 F     no        0 no    no     no      no    no    1    
##  3 7006        1 M     no        6 no    no     no      no    yes   3    
##  4 7006        0 M     no        6 no    no     no      no    yes   2    
##  5 7009        1 F     no        8 no    no     no      no    no    1    
##  6 7009        0 F     no        8 no    no     no      no    no    1    
##  7 7010        1 M     yes       1 no    no     no      no    no    1    
##  8 7010        0 M     no        1 no    no     no      no    no    1    
##  9 7013        1 M     no        4 yes   no     yes     no    no    1    
## 10 7013        0 M     no        4 no    no     no      no    yes   2    
## # … with 228 more rows

Downs syndrome is a known risk factor. In this data set, all Downs syndrome children are cases. The coefficient for downs will be infinity.

amlxray %>%
  filter(downs == "yes") %>%
  print(width = Inf)
## # A tibble: 7 x 11
##   ID    disease Sex   downs   age Mray  MupRay MlowRay Fray  Cray  CnRay
##   <fct>   <dbl> <fct> <fct> <int> <fct> <fct>  <fct>   <fct> <fct> <ord>
## 1 7010        1 M     yes       1 no    no     no      no    no    1    
## 2 7018        1 F     yes       0 no    no     no      no    no    1    
## 3 7066        1 F     yes       1 no    no     no      no    no    1    
## 4 7077        1 M     yes       1 yes   yes    no      no    no    1    
## 5 7146        1 F     yes       1 yes   no     yes     yes   no    1    
## 6 7176        1 F     yes       1 no    no     no      no    no    1    
## 7 7189        1 F     yes       1 no    no     no      no    no    1

So we exclude Donws syndrome cases and their matched subjects.

(downs_ids <- amlxray %>% 
   filter(downs == "yes") %>% 
   .$ID)
## [1] 7010 7018 7066 7077 7146 7176 7189
## 112 Levels: 7004 7006 7009 7010 7013 7014 7015 7017 7018 7019 7021 7023 ... 7211
ramlxray <- amlxray %>%
  filter(!(ID %in% downs_ids))

Now we can fit a conditional logit model using predictors Sex, Mray (mother exposure to x-ray), Fray (father exposure to x-ray), and CnRay (child exposure to x-ray). Since CnRray is an ordered factor, linear, quadratic, and cubic contrasts (from orthogonal polynomial coding) are estimated. Only the linear effect is significant.

library(survival)
## 
## Attaching package: 'survival'
## The following objects are masked from 'package:faraway':
## 
##     rats, solder
cmod <- clogit(disease ~ Sex + Mray + Fray + CnRay + strata(ID), data = ramlxray)
summary(cmod)
## Call:
## coxph(formula = Surv(rep(1, 223L), disease) ~ Sex + Mray + Fray + 
##     CnRay + strata(ID), data = ramlxray, method = "exact")
## 
##   n= 223, number of events= 104 
## 
##            coef exp(coef) se(coef)      z Pr(>|z|)   
## SexM     0.1563    1.1691   0.3861  0.405  0.68566   
## Mrayyes  0.2276    1.2556   0.5821  0.391  0.69573   
## Frayyes  0.6933    2.0003   0.3512  1.974  0.04839 * 
## CnRay.L  1.9408    6.9641   0.6207  3.127  0.00177 **
## CnRay.Q -0.2480    0.7803   0.5819 -0.426  0.66993   
## CnRay.C -0.5801    0.5599   0.5906 -0.982  0.32598   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##         exp(coef) exp(-coef) lower .95 upper .95
## SexM       1.1691     0.8553    0.5486     2.492
## Mrayyes    1.2556     0.7964    0.4013     3.929
## Frayyes    2.0003     0.4999    1.0049     3.982
## CnRay.L    6.9641     0.1436    2.0631    23.507
## CnRay.Q    0.7803     1.2815    0.2495     2.441
## CnRay.C    0.5599     1.7862    0.1759     1.781
## 
## Concordance= 0.662  (se = 0.056 )
## Likelihood ratio test= 20.89  on 6 df,   p=0.002
## Wald test            = 14.49  on 6 df,   p=0.02
## Score (logrank) test = 18.6  on 6 df,   p=0.005

We drop the qudratic and cubic effects of CnRray and the insignificant predictors Mray and Sex.

clogit(disease ~ Fray + unclass(CnRay) + strata(ID), data = ramlxray) %>%
  tbl_regression(intercept = TRUE, exponentiate = TRUE)
Characteristic OR1 95% CI1 p-value
Fray
no
yes 1.96 1.00, 3.84 0.051
unclass(CnRay) 2.26 1.42, 3.59 <0.001

1 OR = Odds Ratio, CI = Confidence Interval

Separation (ELMR 2.7)

(irisr <- iris %>%
  as_tibble() %>%
  filter(Species == "setosa" | Species == "versicolor") %>%
  dplyr::select(Sepal.Width, Sepal.Length, Species))
## # A tibble: 100 x 3
##    Sepal.Width Sepal.Length Species
##          <dbl>        <dbl> <fct>  
##  1         3.5          5.1 setosa 
##  2         3            4.9 setosa 
##  3         3.2          4.7 setosa 
##  4         3.1          4.6 setosa 
##  5         3.6          5   setosa 
##  6         3.9          5.4 setosa 
##  7         3.4          4.6 setosa 
##  8         3.4          5   setosa 
##  9         2.9          4.4 setosa 
## 10         3.1          4.9 setosa 
## # … with 90 more rows
lmod <- glm(Species ~ Sepal.Width + Sepal.Length, family = binomial, data = irisr)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(lmod)
## 
## Call:
## glm(formula = Species ~ Sepal.Width + Sepal.Length, family = binomial, 
##     data = irisr)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -4.249e-05  -2.100e-08   0.000e+00   2.100e-08   4.522e-05  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)    -360.6   195972.9  -0.002    0.999
## Sepal.Width    -110.1    55361.5  -0.002    0.998
## Sepal.Length    131.8    64577.0   0.002    0.998
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1.3863e+02  on 99  degrees of freedom
## Residual deviance: 7.1185e-09  on 97  degrees of freedom
## AIC: 6
## 
## Number of Fisher Scoring iterations: 25

glm function is complaining there is some issue in algorithmic convergence. The residual deviance is essential 0, indicating a perfect fit, but none of the predictors are significant.

irisr %>%
  ggplot() + 
  geom_point(mapping = aes(x = Sepal.Width, y = Sepal.Length, color = Species)) + 
  geom_abline(intercept = - lmod$coef[1] / lmod$coef[3], 
              slope     = - lmod$coef[2] / lmod$coef[3]) + 
  labs(title = "Perfect separation in Iris data")

library(brglm)
## Loading required package: profileModel
## 'brglm' will gradually be superseded by 'brglm2' (https://cran.r-project.org/package=brglm2), which provides utilities for mean and median bias reduction for all GLMs and methods for the detection of infinite estimates in binomial-response models.
bmod <- brglm(Species ~ Sepal.Width + Sepal.Length, family = binomial, data = irisr)
summary(bmod)
## 
## Call:
## brglm(formula = Species ~ Sepal.Width + Sepal.Length, family = binomial, 
##     data = irisr)
## 
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   -24.508     12.493  -1.962  0.04979 * 
## Sepal.Width    -8.897      2.748  -3.237  0.00121 **
## Sepal.Length    9.732      3.334   2.919  0.00351 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 130.638  on 99  degrees of freedom
## Residual deviance:   3.323  on 97  degrees of freedom
## Penalized deviance: 6.60971 
## AIC:  9.323