Display system information and load tidyverse and faraway packages

sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.4
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] compiler_3.6.2  magrittr_1.5    tools_3.6.2     htmltools_0.4.0
##  [5] yaml_2.2.1      Rcpp_1.0.3      stringi_1.4.6   rmarkdown_2.1  
##  [9] knitr_1.28      stringr_1.4.0   xfun_0.12       digest_0.6.24  
## [13] rlang_0.4.4     evaluate_0.14
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.4
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(faraway)

faraway package contains the datasets in the ELMR book.

GA 2000 US Presidential Election Data

The gavote data contains the voting data of Georgia (GA) in the 2000 presidential election. It is available as a dataframe.

# equivalent to head(gavote, 10)
gavote %>% head(10)
##          equip   econ perAA rural    atlanta gore  bush other votes ballots
## APPLING  LEVER   poor 0.182 rural notAtlanta 2093  3940    66  6099    6617
## ATKINSON LEVER   poor 0.230 rural notAtlanta  821  1228    22  2071    2149
## BACON    LEVER   poor 0.131 rural notAtlanta  956  2010    29  2995    3347
## BAKER    OS-CC   poor 0.476 rural notAtlanta  893   615    11  1519    1607
## BALDWIN  LEVER middle 0.359 rural notAtlanta 5893  6041   192 12126   12785
## BANKS    LEVER middle 0.024 rural notAtlanta 1220  3202   111  4533    4773
## BARROW   OS-CC middle 0.079 urban notAtlanta 3657  7925   520 12102   12522
## BARTOW   OS-PC middle 0.079 urban    Atlanta 7508 14720   552 22780   23735
## BEN.HILL OS-PC   poor 0.282 rural notAtlanta 2234  2381    46  4661    5741
## BERRIEN  OS-CC   poor 0.107 rural notAtlanta 1640  2718    52  4410    4475

We convert it into a tibble for easy handling by tidyverse.

gavote <- gavote %>% 
  as_tibble(rownames = "county") %>%
  print(width = Inf)
## # A tibble: 159 x 11
##    county   equip econ   perAA rural atlanta     gore  bush other votes ballots
##    <chr>    <fct> <fct>  <dbl> <fct> <fct>      <int> <int> <int> <int>   <int>
##  1 APPLING  LEVER poor   0.182 rural notAtlanta  2093  3940    66  6099    6617
##  2 ATKINSON LEVER poor   0.23  rural notAtlanta   821  1228    22  2071    2149
##  3 BACON    LEVER poor   0.131 rural notAtlanta   956  2010    29  2995    3347
##  4 BAKER    OS-CC poor   0.476 rural notAtlanta   893   615    11  1519    1607
##  5 BALDWIN  LEVER middle 0.359 rural notAtlanta  5893  6041   192 12126   12785
##  6 BANKS    LEVER middle 0.024 rural notAtlanta  1220  3202   111  4533    4773
##  7 BARROW   OS-CC middle 0.079 urban notAtlanta  3657  7925   520 12102   12522
##  8 BARTOW   OS-PC middle 0.079 urban Atlanta     7508 14720   552 22780   23735
##  9 BEN.HILL OS-PC poor   0.282 rural notAtlanta  2234  2381    46  4661    5741
## 10 BERRIEN  OS-CC poor   0.107 rural notAtlanta  1640  2718    52  4410    4475
## # … with 149 more rows

str function is useful for inspecting contents of any R object.

str(gavote)
## Classes 'tbl_df', 'tbl' and 'data.frame':    159 obs. of  11 variables:
##  $ county : chr  "APPLING" "ATKINSON" "BACON" "BAKER" ...
##  $ equip  : Factor w/ 5 levels "LEVER","OS-CC",..: 1 1 1 2 1 1 2 3 3 2 ...
##  $ econ   : Factor w/ 3 levels "middle","poor",..: 2 2 2 2 1 1 1 1 2 2 ...
##  $ perAA  : num  0.182 0.23 0.131 0.476 0.359 0.024 0.079 0.079 0.282 0.107 ...
##  $ rural  : Factor w/ 2 levels "rural","urban": 1 1 1 1 1 1 2 2 1 1 ...
##  $ atlanta: Factor w/ 2 levels "Atlanta","notAtlanta": 2 2 2 2 2 2 2 1 2 2 ...
##  $ gore   : int  2093 821 956 893 5893 1220 3657 7508 2234 1640 ...
##  $ bush   : int  3940 1228 2010 615 6041 3202 7925 14720 2381 2718 ...
##  $ other  : int  66 22 29 11 192 111 520 552 46 52 ...
##  $ votes  : int  6099 2071 2995 1519 12126 4533 12102 22780 4661 4410 ...
##  $ ballots: int  6617 2149 3347 1607 12785 4773 12522 23735 5741 4475 ...
gavote <- gavote %>%
  mutate(undercount = (ballots - votes) / ballots) %>%
  print(width = Inf)
## # A tibble: 159 x 12
##    county   equip econ   perAA rural atlanta     gore  bush other votes ballots
##    <chr>    <fct> <fct>  <dbl> <fct> <fct>      <int> <int> <int> <int>   <int>
##  1 APPLING  LEVER poor   0.182 rural notAtlanta  2093  3940    66  6099    6617
##  2 ATKINSON LEVER poor   0.23  rural notAtlanta   821  1228    22  2071    2149
##  3 BACON    LEVER poor   0.131 rural notAtlanta   956  2010    29  2995    3347
##  4 BAKER    OS-CC poor   0.476 rural notAtlanta   893   615    11  1519    1607
##  5 BALDWIN  LEVER middle 0.359 rural notAtlanta  5893  6041   192 12126   12785
##  6 BANKS    LEVER middle 0.024 rural notAtlanta  1220  3202   111  4533    4773
##  7 BARROW   OS-CC middle 0.079 urban notAtlanta  3657  7925   520 12102   12522
##  8 BARTOW   OS-PC middle 0.079 urban Atlanta     7508 14720   552 22780   23735
##  9 BEN.HILL OS-PC poor   0.282 rural notAtlanta  2234  2381    46  4661    5741
## 10 BERRIEN  OS-CC poor   0.107 rural notAtlanta  1640  2718    52  4410    4475
##    undercount
##         <dbl>
##  1     0.0783
##  2     0.0363
##  3     0.105 
##  4     0.0548
##  5     0.0515
##  6     0.0503
##  7     0.0335
##  8     0.0402
##  9     0.188 
## 10     0.0145
## # … with 149 more rows

Descriptive statistics

Numerical summaries:

summary(gavote)
##     county            equip        econ        perAA          rural    
##  Length:159         LEVER:74   middle:69   Min.   :0.0000   rural:117  
##  Class :character   OS-CC:44   poor  :72   1st Qu.:0.1115   urban: 42  
##  Mode  :character   OS-PC:22   rich  :18   Median :0.2330              
##                     PAPER: 2               Mean   :0.2430              
##                     PUNCH:17               3rd Qu.:0.3480              
##                                            Max.   :0.7650              
##        atlanta         gore             bush            other       
##  Atlanta   : 15   Min.   :   249   Min.   :   271   Min.   :   5.0  
##  notAtlanta:144   1st Qu.:  1386   1st Qu.:  1804   1st Qu.:  30.0  
##                   Median :  2326   Median :  3597   Median :  86.0  
##                   Mean   :  7020   Mean   :  8929   Mean   : 381.7  
##                   3rd Qu.:  4430   3rd Qu.:  7468   3rd Qu.: 210.0  
##                   Max.   :154509   Max.   :140494   Max.   :7920.0  
##      votes           ballots         undercount     
##  Min.   :   832   Min.   :   881   Min.   :0.00000  
##  1st Qu.:  3506   1st Qu.:  3694   1st Qu.:0.02779  
##  Median :  6299   Median :  6712   Median :0.03983  
##  Mean   : 16331   Mean   : 16926   Mean   :0.04379  
##  3rd Qu.: 11846   3rd Qu.: 12251   3rd Qu.:0.05647  
##  Max.   :263211   Max.   :280975   Max.   :0.18812
(gavote <- gavote %>%
  rename(usage = rural) %>%
  mutate(pergore = gore / votes, perbush = bush / votes)) %>%
  print(width = Inf)
## # A tibble: 159 x 14
##    county   equip econ   perAA usage atlanta     gore  bush other votes ballots
##    <chr>    <fct> <fct>  <dbl> <fct> <fct>      <int> <int> <int> <int>   <int>
##  1 APPLING  LEVER poor   0.182 rural notAtlanta  2093  3940    66  6099    6617
##  2 ATKINSON LEVER poor   0.23  rural notAtlanta   821  1228    22  2071    2149
##  3 BACON    LEVER poor   0.131 rural notAtlanta   956  2010    29  2995    3347
##  4 BAKER    OS-CC poor   0.476 rural notAtlanta   893   615    11  1519    1607
##  5 BALDWIN  LEVER middle 0.359 rural notAtlanta  5893  6041   192 12126   12785
##  6 BANKS    LEVER middle 0.024 rural notAtlanta  1220  3202   111  4533    4773
##  7 BARROW   OS-CC middle 0.079 urban notAtlanta  3657  7925   520 12102   12522
##  8 BARTOW   OS-PC middle 0.079 urban Atlanta     7508 14720   552 22780   23735
##  9 BEN.HILL OS-PC poor   0.282 rural notAtlanta  2234  2381    46  4661    5741
## 10 BERRIEN  OS-CC poor   0.107 rural notAtlanta  1640  2718    52  4410    4475
##    undercount pergore perbush
##         <dbl>   <dbl>   <dbl>
##  1     0.0783   0.343   0.646
##  2     0.0363   0.396   0.593
##  3     0.105    0.319   0.671
##  4     0.0548   0.588   0.405
##  5     0.0515   0.486   0.498
##  6     0.0503   0.269   0.706
##  7     0.0335   0.302   0.655
##  8     0.0402   0.330   0.646
##  9     0.188    0.479   0.511
## 10     0.0145   0.372   0.616
## # … with 149 more rows
ggplot(data = gavote) +
  geom_histogram(mapping = aes(x = undercount)) +
  labs(x = "Percent Undercount", y = "Number of Counties")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = gavote, mapping = aes(x = perAA, y = undercount)) + 
  geom_point() + 
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

for (var in c("equip", "econ", "usage", "atlanta")) {
  plot <- ggplot(data = gavote) + 
    geom_point(mapping = aes(x = perAA, y = undercount, color = get(var)))
  print(plot)
}

for (var in c("equip", "econ", "usage", "atlanta")) {
  plot <- ggplot(data = gavote) + 
    geom_boxplot(mapping = aes(x = get(var), y = undercount))
  print(plot)
}

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:faraway':
## 
##     happy
## The following object is masked from 'package:dplyr':
## 
##     nasa
ggpairs(data = gavote, columns = c(4, 13, 14, 12)) + 
  labs(title = "GA 2000 Presidential Vote Data")

Linear model

A model with two quantitative predictors

lm

(lmod <- lm(undercount ~ pergore + perAA, gavote))
## 
## Call:
## lm(formula = undercount ~ pergore + perAA, data = gavote)
## 
## Coefficients:
## (Intercept)      pergore        perAA  
##     0.03238      0.01098      0.02853

Inspection of the result, stored in lmod, shows that it contains rich regression information.

str(lmod)
## List of 12
##  $ coefficients : Named num [1:3] 0.0324 0.011 0.0285
##   ..- attr(*, "names")= chr [1:3] "(Intercept)" "pergore" "perAA"
##  $ residuals    : Named num [1:159] 0.03695 -0.00699 0.06555 0.00235 0.00359 ...
##   ..- attr(*, "names")= chr [1:159] "1" "2" "3" "4" ...
##  $ effects      : Named num [1:159] -5.52e-01 6.87e-02 -2.27e-02 1.20e-05 -7.94e-05 ...
##   ..- attr(*, "names")= chr [1:159] "(Intercept)" "pergore" "perAA" "" ...
##  $ rank         : int 3
##  $ fitted.values: Named num [1:159] 0.0413 0.0433 0.0396 0.0524 0.048 ...
##   ..- attr(*, "names")= chr [1:159] "1" "2" "3" "4" ...
##  $ assign       : int [1:3] 0 1 2
##  $ qr           :List of 5
##   ..$ qr   : num [1:159, 1:3] -12.6095 0.0793 0.0793 0.0793 0.0793 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:159] "1" "2" "3" "4" ...
##   .. .. ..$ : chr [1:3] "(Intercept)" "pergore" "perAA"
##   .. ..- attr(*, "assign")= int [1:3] 0 1 2
##   ..$ qraux: num [1:3] 1.08 1.01 1.01
##   ..$ pivot: int [1:3] 1 2 3
##   ..$ tol  : num 1e-07
##   ..$ rank : int 3
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 156
##  $ xlevels      : Named list()
##  $ call         : language lm(formula = undercount ~ pergore + perAA, data = gavote)
##  $ terms        :Classes 'terms', 'formula'  language undercount ~ pergore + perAA
##   .. ..- attr(*, "variables")= language list(undercount, pergore, perAA)
##   .. ..- attr(*, "factors")= int [1:3, 1:2] 0 1 0 0 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:3] "undercount" "pergore" "perAA"
##   .. .. .. ..$ : chr [1:2] "pergore" "perAA"
##   .. ..- attr(*, "term.labels")= chr [1:2] "pergore" "perAA"
##   .. ..- attr(*, "order")= int [1:2] 1 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(undercount, pergore, perAA)
##   .. ..- attr(*, "dataClasses")= Named chr [1:3] "numeric" "numeric" "numeric"
##   .. .. ..- attr(*, "names")= chr [1:3] "undercount" "pergore" "perAA"
##  $ model        :'data.frame':   159 obs. of  3 variables:
##   ..$ undercount: num [1:159] 0.0783 0.0363 0.1052 0.0548 0.0515 ...
##   ..$ pergore   : num [1:159] 0.343 0.396 0.319 0.588 0.486 ...
##   ..$ perAA     : num [1:159] 0.182 0.23 0.131 0.476 0.359 0.024 0.079 0.079 0.282 0.107 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language undercount ~ pergore + perAA
##   .. .. ..- attr(*, "variables")= language list(undercount, pergore, perAA)
##   .. .. ..- attr(*, "factors")= int [1:3, 1:2] 0 1 0 0 0 1
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:3] "undercount" "pergore" "perAA"
##   .. .. .. .. ..$ : chr [1:2] "pergore" "perAA"
##   .. .. ..- attr(*, "term.labels")= chr [1:2] "pergore" "perAA"
##   .. .. ..- attr(*, "order")= int [1:2] 1 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(undercount, pergore, perAA)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:3] "numeric" "numeric" "numeric"
##   .. .. .. ..- attr(*, "names")= chr [1:3] "undercount" "pergore" "perAA"
##  - attr(*, "class")= chr "lm"
  • The regression coefficient \(\widehat{\boldsymbol{\beta}}\) can be retrieved by
# same lmod$coefficients
coef(lmod)
## (Intercept)     pergore       perAA 
##  0.03237600  0.01097872  0.02853314
  • The fitted values or predicted values are \[ \widehat{\mathbf{y}} = \mathbf{X} \widehat{\boldsymbol{\beta}} \]
# same as lmod$fitted.values
predict(lmod) %>% head()
##          1          2          3          4          5          6 
## 0.04133661 0.04329088 0.03961823 0.05241202 0.04795484 0.03601558

and the residuals are \[ \widehat{\boldsymbol{\epsilon}} = \mathbf{y} - \widehat{\mathbf{y}} = \mathbf{y} - \mathbf{X} \widehat{\boldsymbol{\beta}}. \]

# same as lmod$residuals
residuals(lmod) %>% head()
##            1            2            3            4            5            6 
##  0.036946603 -0.006994927  0.065550577  0.002348407  0.003589940  0.014267264
  • The residual sum of squares (RSS), also called deviance, is \(\|\widehat{\boldsymbol{\epsilon}}\|^2\).
deviance(lmod)
## [1] 0.09324918
  • The degree of freedom of a linear model is \(n-p\).
nrow(gavote) - length(coef(lmod))
## [1] 156
df.residual(lmod)
## [1] 156

summary

  • The summary command computes some more regression quantities.
(lmodsum <- summary(lmod))
## 
## Call:
## lm(formula = undercount ~ pergore + perAA, data = gavote)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.046013 -0.014995 -0.003539  0.011784  0.142436 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  0.03238    0.01276   2.537   0.0122 *
## pergore      0.01098    0.04692   0.234   0.8153  
## perAA        0.02853    0.03074   0.928   0.3547  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02445 on 156 degrees of freedom
## Multiple R-squared:  0.05309,    Adjusted R-squared:  0.04095 
## F-statistic: 4.373 on 2 and 156 DF,  p-value: 0.01419
str(lmodsum)
## List of 11
##  $ call         : language lm(formula = undercount ~ pergore + perAA, data = gavote)
##  $ terms        :Classes 'terms', 'formula'  language undercount ~ pergore + perAA
##   .. ..- attr(*, "variables")= language list(undercount, pergore, perAA)
##   .. ..- attr(*, "factors")= int [1:3, 1:2] 0 1 0 0 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:3] "undercount" "pergore" "perAA"
##   .. .. .. ..$ : chr [1:2] "pergore" "perAA"
##   .. ..- attr(*, "term.labels")= chr [1:2] "pergore" "perAA"
##   .. ..- attr(*, "order")= int [1:2] 1 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(undercount, pergore, perAA)
##   .. ..- attr(*, "dataClasses")= Named chr [1:3] "numeric" "numeric" "numeric"
##   .. .. ..- attr(*, "names")= chr [1:3] "undercount" "pergore" "perAA"
##  $ residuals    : Named num [1:159] 0.03695 -0.00699 0.06555 0.00235 0.00359 ...
##   ..- attr(*, "names")= chr [1:159] "1" "2" "3" "4" ...
##  $ coefficients : num [1:3, 1:4] 0.0324 0.011 0.0285 0.0128 0.0469 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "(Intercept)" "pergore" "perAA"
##   .. ..$ : chr [1:4] "Estimate" "Std. Error" "t value" "Pr(>|t|)"
##  $ aliased      : Named logi [1:3] FALSE FALSE FALSE
##   ..- attr(*, "names")= chr [1:3] "(Intercept)" "pergore" "perAA"
##  $ sigma        : num 0.0244
##  $ df           : int [1:3] 3 156 3
##  $ r.squared    : num 0.0531
##  $ adj.r.squared: num 0.0409
##  $ fstatistic   : Named num [1:3] 4.37 2 156
##   ..- attr(*, "names")= chr [1:3] "value" "numdf" "dendf"
##  $ cov.unscaled : num [1:3, 1:3] 0.272 -0.964 0.524 -0.964 3.683 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "(Intercept)" "pergore" "perAA"
##   .. ..$ : chr [1:3] "(Intercept)" "pergore" "perAA"
##  - attr(*, "class")= chr "summary.lm"
  • An unbiased estimate of the error variance \(\sigma^2\) is \[ \widehat{\sigma} = \sqrt{\frac{\text{RSS}}{\text{df}}} \]
sqrt(deviance(lmod) / df.residual(lmod))
## [1] 0.02444895
lmodsum$sigma
## [1] 0.02444895
  • A commonly used goodness of fit mesure is \(R^2\), or coefficient of determination or percentage of variance explained \[ R^2 = 1 - \frac{\sum_i (y_i - \widehat{y}_i)^2}{\sum_i (y_i - \bar{y})^2} = 1 - \frac{\text{RSS}}{\text{TSS}}, \] where \(\text{TSS} = \sum_i (y_i - \bar{y})^2\) is the total sum of squares.
lmodsum$r.squared
## [1] 0.05308861

An \(R^2\) of about 5% indicates the model has a poor fit. \(R^2\) can also be interpreted as the (squared) correlation between the predicted values and the response

cor(predict(lmod), gavote$undercount)^2
## [1] 0.05308861
  • Add more predictors into a model always increase \(R^2\). The adjusted \(R^2\) adjusts to the fact that a larger model also uses more parameters. Adding a predictor will only increase \(R_a^2\) if it has some predictive value. \[ R_a^2 = 1 - \frac{\text{RSS} / (n - p)}{\text{TSS} / (n - 1)}. \]
lmodsum$adj.r.squared
## [1] 0.04094872

A model with both quantitative and qualitative predictors

gavote <- gavote %>%
   mutate(cpergore = pergore - mean(pergore), cperAA = perAA - mean(perAA)) %>%
   print(width = Inf)
## # A tibble: 159 x 16
##    county   equip econ   perAA usage atlanta     gore  bush other votes ballots
##    <chr>    <fct> <fct>  <dbl> <fct> <fct>      <int> <int> <int> <int>   <int>
##  1 APPLING  LEVER poor   0.182 rural notAtlanta  2093  3940    66  6099    6617
##  2 ATKINSON LEVER poor   0.23  rural notAtlanta   821  1228    22  2071    2149
##  3 BACON    LEVER poor   0.131 rural notAtlanta   956  2010    29  2995    3347
##  4 BAKER    OS-CC poor   0.476 rural notAtlanta   893   615    11  1519    1607
##  5 BALDWIN  LEVER middle 0.359 rural notAtlanta  5893  6041   192 12126   12785
##  6 BANKS    LEVER middle 0.024 rural notAtlanta  1220  3202   111  4533    4773
##  7 BARROW   OS-CC middle 0.079 urban notAtlanta  3657  7925   520 12102   12522
##  8 BARTOW   OS-PC middle 0.079 urban Atlanta     7508 14720   552 22780   23735
##  9 BEN.HILL OS-PC poor   0.282 rural notAtlanta  2234  2381    46  4661    5741
## 10 BERRIEN  OS-CC poor   0.107 rural notAtlanta  1640  2718    52  4410    4475
##    undercount pergore perbush cpergore  cperAA
##         <dbl>   <dbl>   <dbl>    <dbl>   <dbl>
##  1     0.0783   0.343   0.646  -0.0652 -0.0610
##  2     0.0363   0.396   0.593  -0.0119 -0.0130
##  3     0.105    0.319   0.671  -0.0891 -0.112 
##  4     0.0548   0.588   0.405   0.180   0.233 
##  5     0.0515   0.486   0.498   0.0777  0.116 
##  6     0.0503   0.269   0.706  -0.139  -0.219 
##  7     0.0335   0.302   0.655  -0.106  -0.164 
##  8     0.0402   0.330   0.646  -0.0787 -0.164 
##  9     0.188    0.479   0.511   0.0710  0.0390
## 10     0.0145   0.372   0.616  -0.0364 -0.136 
## # … with 149 more rows
lmodi <- lm(undercount ~ cperAA + cpergore * usage + equip, gavote)
summary(lmodi)
## 
## Call:
## lm(formula = undercount ~ cperAA + cpergore * usage + equip, 
##     data = gavote)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.059530 -0.012904 -0.002180  0.009013  0.127496 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.043297   0.002839  15.253  < 2e-16 ***
## cperAA               0.028264   0.031092   0.909   0.3648    
## cpergore             0.008237   0.051156   0.161   0.8723    
## usageurban          -0.018637   0.004648  -4.009 9.56e-05 ***
## equipOS-CC           0.006482   0.004680   1.385   0.1681    
## equipOS-PC           0.015640   0.005827   2.684   0.0081 ** 
## equipPAPER          -0.009092   0.016926  -0.537   0.5920    
## equipPUNCH           0.014150   0.006783   2.086   0.0387 *  
## cpergore:usageurban -0.008799   0.038716  -0.227   0.8205    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02335 on 150 degrees of freedom
## Multiple R-squared:  0.1696, Adjusted R-squared:  0.1253 
## F-statistic: 3.829 on 8 and 150 DF,  p-value: 0.0004001

The gtsummary package offers a more sensible diplay of regression results.

library(gtsummary)
lmodi %>%
  tbl_regression() %>%
  bold_labels() %>%
  bold_p(t = 0.05)
## Results will be printed using `knitr::kable()` and do not 
## support footnotes, spanning headers, or indentation. 
## 
## For tables styled by the gt package, use the installation code below.
## `remotes::install_github("rstudio/gt", ref = gtsummary::gt_sha)`
## 
## If you prefer to always use `knitr::kable()`, add the option
## `options(gtsummary.print_engine = "kable")` to your script
## or in a user- or project-level startup file, '.Rprofile'.
Characteristic Beta 95% CI p-value
cperAA 0.03 -0.03, 0.09 0.4
cpergore 0.01 -0.09, 0.11 0.9
usage
rural
urban -0.02 -0.03, -0.01 <0.001
equip
LEVER
OS-CC 0.01 0.00, 0.02 0.2
OS-PC 0.02 0.00, 0.03 0.008
PAPER -0.01 -0.04, 0.02 0.6
PUNCH 0.01 0.00, 0.03 0.039
cpergore * usage
cpergore * urban -0.01 -0.09, 0.07 0.8
gavote %>%
  select(cperAA, cpergore, equip, usage) %>%
  head(10)
## # A tibble: 10 x 4
##     cperAA cpergore equip usage
##      <dbl>    <dbl> <fct> <fct>
##  1 -0.0610  -0.0652 LEVER rural
##  2 -0.0130  -0.0119 LEVER rural
##  3 -0.112   -0.0891 LEVER rural
##  4  0.233    0.180  OS-CC rural
##  5  0.116    0.0777 LEVER rural
##  6 -0.219   -0.139  LEVER rural
##  7 -0.164   -0.106  OS-CC urban
##  8 -0.164   -0.0787 OS-PC urban
##  9  0.0390   0.0710 OS-PC rural
## 10 -0.136   -0.0364 OS-CC rural
model.matrix(lmodi) %>% head(10)
##    (Intercept)      cperAA    cpergore usageurban equipOS-CC equipOS-PC
## 1            1 -0.06098113 -0.06515076          0          0          0
## 2            1 -0.01298113 -0.01189493          0          0          0
## 3            1 -0.11198113 -0.08912311          0          0          0
## 4            1  0.23301887  0.17956499          0          1          0
## 5            1  0.11601887  0.07765876          0          0          0
## 6            1 -0.21898113 -0.13918434          0          0          0
## 7            1 -0.16398113 -0.10614032          1          1          0
## 8            1 -0.16398113 -0.07873442          1          0          1
## 9            1  0.03901887  0.07097452          0          0          1
## 10           1 -0.13598113 -0.03643969          0          1          0
##    equipPAPER equipPUNCH cpergore:usageurban
## 1           0          0          0.00000000
## 2           0          0          0.00000000
## 3           0          0          0.00000000
## 4           0          0          0.00000000
## 5           0          0          0.00000000
## 6           0          0          0.00000000
## 7           0          0         -0.10614032
## 8           0          0         -0.07873442
## 9           0          0          0.00000000
## 10          0          0          0.00000000

Hypothesis testing

anova(lmod, lmodi)
## Analysis of Variance Table
## 
## Model 1: undercount ~ pergore + perAA
## Model 2: undercount ~ cperAA + cpergore * usage + equip
##   Res.Df      RSS Df Sum of Sq      F   Pr(>F)   
## 1    156 0.093249                                
## 2    150 0.081775  6  0.011474 3.5077 0.002823 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
drop1(lmodi, test = "F")
## Single term deletions
## 
## Model:
## undercount ~ cperAA + cpergore * usage + equip
##                Df Sum of Sq      RSS     AIC F value  Pr(>F)  
## <none>                      0.081775 -1186.1                  
## cperAA          1 0.0004505 0.082226 -1187.2  0.8264 0.36479  
## equip           4 0.0054438 0.087219 -1183.8  2.4964 0.04521 *
## cpergore:usage  1 0.0000282 0.081804 -1188.0  0.0517 0.82051  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We also see \(F\)-test for quantitative variables, e.g., cperAA, conincides with the \(t\)-test reported by the lm function. Question: why drop1 function does not drop predictors cpergore and usage?

Confidence intervals

confint(lmodi)
##                             2.5 %       97.5 %
## (Intercept)          0.0376884415  0.048906189
## cperAA              -0.0331710614  0.089699222
## cpergore            -0.0928429315  0.109316616
## usageurban          -0.0278208965 -0.009452268
## equipOS-CC          -0.0027646444  0.015729555
## equipOS-PC           0.0041252334  0.027153973
## equipPAPER          -0.0425368415  0.024352767
## equipPUNCH           0.0007477196  0.027551488
## cpergore:usageurban -0.0852990903  0.067700182

Diagnostics

plot(lmodi)

gavote %>% 
  mutate(cook = cooks.distance(lmodi)) %>%
  filter(cook >= 0.1) %>%
  print(width = Inf)
## # A tibble: 2 x 17
##   county   equip econ  perAA usage atlanta     gore  bush other votes ballots
##   <chr>    <fct> <fct> <dbl> <fct> <fct>      <int> <int> <int> <int>   <int>
## 1 BEN.HILL OS-PC poor  0.282 rural notAtlanta  2234  2381    46  4661    5741
## 2 RANDOLPH OS-PC poor  0.527 rural notAtlanta  1381  1174    14  2569    3021
##   undercount pergore perbush cpergore cperAA  cook
##        <dbl>   <dbl>   <dbl>    <dbl>  <dbl> <dbl>
## 1      0.188   0.479   0.511   0.0710 0.0390 0.234
## 2      0.150   0.538   0.457   0.129  0.284  0.138
# this function is available from faraway package
halfnorm(hatvalues(lmodi), ylab = "Sorted leverages")

These two counties have unusually large leverages. They are actually the only counties that use paper ballot.

gavote %>%
  # mutate(hi = hatvalues(lmodi)) %>%
  # filter(hi > 0.4) %>%
  slice(c(103, 131)) %>%
  print(width = Inf)
## # A tibble: 2 x 16
##   county     equip econ  perAA usage atlanta     gore  bush other votes ballots
##   <chr>      <fct> <fct> <dbl> <fct> <fct>      <int> <int> <int> <int>   <int>
## 1 MONTGOMERY PAPER poor  0.243 rural notAtlanta  1013  1465    31  2509    2573
## 2 TALIAFERRO PAPER poor  0.596 rural notAtlanta   556   271     5   832     881
##   undercount pergore perbush cpergore    cperAA
##        <dbl>   <dbl>   <dbl>    <dbl>     <dbl>
## 1     0.0249   0.404   0.584 -0.00458 0.0000189
## 2     0.0556   0.668   0.326  0.260   0.353

Robust regression

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
rlmodi <- rlm(undercount ~ cperAA + cpergore * usage + equip, gavote)
summary(rlmodi)
## 
## Call: rlm(formula = undercount ~ cperAA + cpergore * usage + equip, 
##     data = gavote)
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -6.026e-02 -1.165e-02 -6.587e-06  1.100e-02  1.379e-01 
## 
## Coefficients:
##                     Value   Std. Error t value
## (Intercept)          0.0414  0.0023    17.8662
## cperAA               0.0327  0.0254     1.2897
## cpergore            -0.0082  0.0418    -0.1972
## usageurban          -0.0167  0.0038    -4.4063
## equipOS-CC           0.0069  0.0038     1.8019
## equipOS-PC           0.0081  0.0048     1.6949
## equipPAPER          -0.0059  0.0138    -0.4269
## equipPUNCH           0.0170  0.0055     3.0720
## cpergore:usageurban  0.0073  0.0316     0.2298
## 
## Residual standard error: 0.01722 on 150 degrees of freedom

Notably the regression coefficient for equipOS-PC changed from 0.0156 to 0.0081. It downweights the influence of two observations with equi being OS-PC.

p-values are not reported because inference for robust model is much harder than regular linear regression.

Transformation

plmodi <- lm(undercount ~ cperAA + poly(cpergore, 4) + usage + equip, gavote)
# summary(plmodi)
plmodi %>%
  tbl_regression() %>%
  bold_labels() %>%
  bold_p(t = 0.05)
## Results will be printed using `knitr::kable()` and do not 
## support footnotes, spanning headers, or indentation. 
## 
## For tables styled by the gt package, use the installation code below.
## `remotes::install_github("rstudio/gt", ref = gtsummary::gt_sha)`
## 
## If you prefer to always use `knitr::kable()`, add the option
## `options(gtsummary.print_engine = "kable")` to your script
## or in a user- or project-level startup file, '.Rprofile'.
Characteristic Beta 95% CI p-value
cperAA 0.02 -0.04, 0.08 0.5
poly(cpergore, 4)1 0.02 -0.10, 0.14 0.8
poly(cpergore, 4)2 0.00 -0.05, 0.05 >0.9
poly(cpergore, 4)3 -0.03 -0.08, 0.01 0.2
poly(cpergore, 4)4 0.01 -0.04, 0.05 0.8
usage
rural
urban -0.02 -0.03, -0.01 <0.001
equip
LEVER
OS-CC 0.01 0.00, 0.02 0.15
OS-PC 0.02 0.00, 0.03 0.010
PAPER -0.01 -0.04, 0.03 0.7
PUNCH 0.01 0.00, 0.03 0.074
`termplot` graphically summarizes the effect of each predictor. `terms` argument of `termplot` function specifies which term to plot. Pratial residuals are predictor values plus residual values.
termplot(plmodi, partial.resid = TRUE, terms = 2)

- _B-slines_:
library(splines)
blmodi <- lm(undercount ~ cperAA + bs(cpergore, 4) + usage + equip, gavote)
termplot(blmodi, partial.resid = TRUE, terms = 2)

Variable selection

Variable selection concerns the problem of selecting the best set of variables to put in a model. There are several approaches and it is still an active research area.

biglm <- lm(undercount ~ (equip + econ + usage + atlanta)^2 + (equip + econ + 
  usage + atlanta) * (perAA + pergore), gavote)

The step command sequentially eliminates terms to minimize the AIC:

(smallm <- step(biglm, trace = TRUE))
## Start:  AIC=-1203.9
## undercount ~ (equip + econ + usage + atlanta)^2 + (equip + econ + 
##     usage + atlanta) * (perAA + pergore)
## 
##                   Df Sum of Sq      RSS     AIC
## - econ:pergore     2 0.0000231 0.048896 -1207.8
## - equip:atlanta    2 0.0000367 0.048909 -1207.8
## - econ:perAA       2 0.0000993 0.048972 -1207.6
## - econ:usage       2 0.0003513 0.049224 -1206.8
## - equip:usage      3 0.0011993 0.050072 -1206.0
## - equip:econ       6 0.0031630 0.052035 -1205.9
## - usage:perAA      1 0.0000015 0.048874 -1205.9
## - atlanta:pergore  1 0.0000055 0.048878 -1205.9
## - atlanta:perAA    1 0.0000195 0.048892 -1205.8
## - usage:pergore    1 0.0001094 0.048982 -1205.5
## - usage:atlanta    1 0.0001774 0.049050 -1205.3
## - econ:atlanta     1 0.0004666 0.049339 -1204.4
## <none>                         0.048872 -1203.9
## - equip:pergore    3 0.0020637 0.050936 -1203.3
## - equip:perAA      3 0.0022945 0.051167 -1202.6
## 
## Step:  AIC=-1207.83
## undercount ~ equip + econ + usage + atlanta + perAA + pergore + 
##     equip:econ + equip:usage + equip:atlanta + econ:usage + econ:atlanta + 
##     usage:atlanta + equip:perAA + equip:pergore + econ:perAA + 
##     usage:perAA + usage:pergore + atlanta:perAA + atlanta:pergore
## 
##                   Df Sum of Sq      RSS     AIC
## - equip:atlanta    2 0.0000473 0.048943 -1211.7
## - econ:perAA       2 0.0002786 0.049174 -1210.9
## - econ:usage       2 0.0003498 0.049245 -1210.7
## - equip:usage      3 0.0011820 0.050078 -1210.0
## - atlanta:pergore  1 0.0000016 0.048897 -1209.8
## - usage:perAA      1 0.0000108 0.048906 -1209.8
## - atlanta:perAA    1 0.0000342 0.048930 -1209.7
## - equip:econ       6 0.0032132 0.052109 -1209.7
## - usage:pergore    1 0.0000891 0.048985 -1209.5
## - usage:atlanta    1 0.0001901 0.049086 -1209.2
## - econ:atlanta     1 0.0005762 0.049472 -1208.0
## <none>                         0.048896 -1207.8
## - equip:pergore    3 0.0020798 0.050975 -1207.2
## - equip:perAA      3 0.0023430 0.051239 -1206.4
## 
## Step:  AIC=-1211.68
## undercount ~ equip + econ + usage + atlanta + perAA + pergore + 
##     equip:econ + equip:usage + econ:usage + econ:atlanta + usage:atlanta + 
##     equip:perAA + equip:pergore + econ:perAA + usage:perAA + 
##     usage:pergore + atlanta:perAA + atlanta:pergore
## 
##                   Df Sum of Sq      RSS     AIC
## - econ:perAA       2 0.0002668 0.049210 -1214.8
## - econ:usage       2 0.0003377 0.049281 -1214.6
## - equip:usage      3 0.0012371 0.050180 -1213.7
## - equip:econ       6 0.0031734 0.052116 -1213.7
## - atlanta:perAA    1 0.0000090 0.048952 -1213.7
## - usage:perAA      1 0.0000102 0.048953 -1213.6
## - atlanta:pergore  1 0.0000205 0.048963 -1213.6
## - usage:pergore    1 0.0000858 0.049029 -1213.4
## - usage:atlanta    1 0.0001657 0.049109 -1213.1
## - econ:atlanta     1 0.0005519 0.049495 -1211.9
## <none>                         0.048943 -1211.7
## - equip:pergore    3 0.0020975 0.051040 -1211.0
## - equip:perAA      3 0.0023204 0.051263 -1210.3
## 
## Step:  AIC=-1214.81
## undercount ~ equip + econ + usage + atlanta + perAA + pergore + 
##     equip:econ + equip:usage + econ:usage + econ:atlanta + usage:atlanta + 
##     equip:perAA + equip:pergore + usage:perAA + usage:pergore + 
##     atlanta:perAA + atlanta:pergore
## 
##                   Df Sum of Sq      RSS     AIC
## - econ:usage       2 0.0003518 0.049561 -1217.7
## - usage:perAA      1 0.0000020 0.049212 -1216.8
## - atlanta:pergore  1 0.0000033 0.049213 -1216.8
## - atlanta:perAA    1 0.0000133 0.049223 -1216.8
## - equip:usage      3 0.0012737 0.050483 -1216.8
## - usage:pergore    1 0.0001640 0.049374 -1216.3
## - usage:atlanta    1 0.0002014 0.049411 -1216.2
## - equip:econ       6 0.0035240 0.052734 -1215.8
## - econ:atlanta     1 0.0003777 0.049587 -1215.6
## <none>                         0.049210 -1214.8
## - equip:pergore    3 0.0020308 0.051240 -1214.4
## - equip:perAA      3 0.0022547 0.051464 -1213.7
## 
## Step:  AIC=-1217.68
## undercount ~ equip + econ + usage + atlanta + perAA + pergore + 
##     equip:econ + equip:usage + econ:atlanta + usage:atlanta + 
##     equip:perAA + equip:pergore + usage:perAA + usage:pergore + 
##     atlanta:perAA + atlanta:pergore
## 
##                   Df Sum of Sq      RSS     AIC
## - equip:usage      3 0.0009571 0.050519 -1220.6
## - atlanta:perAA    1 0.0000023 0.049564 -1219.7
## - atlanta:pergore  1 0.0000110 0.049572 -1219.6
## - usage:pergore    1 0.0000774 0.049639 -1219.4
## - equip:econ       6 0.0033163 0.052878 -1219.4
## - usage:perAA      1 0.0001487 0.049710 -1219.2
## - usage:atlanta    1 0.0002271 0.049788 -1219.0
## - econ:atlanta     1 0.0003666 0.049928 -1218.5
## <none>                         0.049561 -1217.7
## - equip:pergore    3 0.0021013 0.051663 -1217.1
## - equip:perAA      3 0.0022522 0.051814 -1216.6
## 
## Step:  AIC=-1220.64
## undercount ~ equip + econ + usage + atlanta + perAA + pergore + 
##     equip:econ + econ:atlanta + usage:atlanta + equip:perAA + 
##     equip:pergore + usage:perAA + usage:pergore + atlanta:perAA + 
##     atlanta:pergore
## 
##                   Df Sum of Sq      RSS     AIC
## - atlanta:perAA    1 0.0000013 0.050520 -1222.6
## - atlanta:pergore  1 0.0000365 0.050555 -1222.5
## - usage:pergore    1 0.0000781 0.050597 -1222.4
## - usage:perAA      1 0.0001133 0.050632 -1222.3
## - econ:atlanta     1 0.0002236 0.050742 -1221.9
## - equip:pergore    3 0.0017063 0.052225 -1221.4
## - usage:atlanta    1 0.0004128 0.050931 -1221.3
## <none>                         0.050519 -1220.6
## - equip:perAA      3 0.0023712 0.052890 -1219.3
## - equip:econ       6 0.0060440 0.056563 -1214.7
## 
## Step:  AIC=-1222.63
## undercount ~ equip + econ + usage + atlanta + perAA + pergore + 
##     equip:econ + econ:atlanta + usage:atlanta + equip:perAA + 
##     equip:pergore + usage:perAA + usage:pergore + atlanta:pergore
## 
##                   Df Sum of Sq      RSS     AIC
## - usage:pergore    1 0.0000828 0.050603 -1224.4
## - usage:perAA      1 0.0001124 0.050632 -1224.3
## - econ:atlanta     1 0.0002392 0.050759 -1223.9
## - atlanta:pergore  1 0.0002764 0.050796 -1223.8
## - equip:pergore    3 0.0017064 0.052226 -1223.3
## - usage:atlanta    1 0.0004166 0.050936 -1223.3
## <none>                         0.050520 -1222.6
## - equip:perAA      3 0.0023889 0.052909 -1221.3
## - equip:econ       6 0.0061358 0.056656 -1216.4
## 
## Step:  AIC=-1224.37
## undercount ~ equip + econ + usage + atlanta + perAA + pergore + 
##     equip:econ + econ:atlanta + usage:atlanta + equip:perAA + 
##     equip:pergore + usage:perAA + atlanta:pergore
## 
##                   Df Sum of Sq      RSS     AIC
## - econ:atlanta     1 0.0002142 0.050817 -1225.7
## - atlanta:pergore  1 0.0003310 0.050934 -1225.3
## - usage:atlanta    1 0.0003813 0.050984 -1225.2
## - equip:pergore    3 0.0017434 0.052346 -1225.0
## <none>                         0.050603 -1224.4
## - equip:perAA      3 0.0025505 0.053153 -1222.5
## - usage:perAA      1 0.0017947 0.052397 -1220.8
## - equip:econ       6 0.0064632 0.057066 -1217.3
## 
## Step:  AIC=-1225.7
## undercount ~ equip + econ + usage + atlanta + perAA + pergore + 
##     equip:econ + usage:atlanta + equip:perAA + equip:pergore + 
##     usage:perAA + atlanta:pergore
## 
##                   Df Sum of Sq      RSS     AIC
## - atlanta:pergore  1 0.0001492 0.050966 -1227.2
## - equip:pergore    3 0.0016209 0.052438 -1226.7
## - usage:atlanta    1 0.0003540 0.051171 -1226.6
## <none>                         0.050817 -1225.7
## - equip:perAA      3 0.0026213 0.053438 -1223.7
## - usage:perAA      1 0.0018250 0.052642 -1222.1
## - equip:econ       6 0.0065521 0.057369 -1218.4
## 
## Step:  AIC=-1227.23
## undercount ~ equip + econ + usage + atlanta + perAA + pergore + 
##     equip:econ + usage:atlanta + equip:perAA + equip:pergore + 
##     usage:perAA
## 
##                 Df Sum of Sq      RSS     AIC
## - equip:pergore  3 0.0017302 0.052696 -1227.9
## - usage:atlanta  1 0.0005194 0.051485 -1227.6
## <none>                       0.050966 -1227.2
## - equip:perAA    3 0.0028555 0.053821 -1224.6
## - usage:perAA    1 0.0019326 0.052899 -1223.3
## - equip:econ     6 0.0064078 0.057374 -1220.4
## 
## Step:  AIC=-1227.93
## undercount ~ equip + econ + usage + atlanta + perAA + pergore + 
##     equip:econ + usage:atlanta + equip:perAA + usage:perAA
## 
##                 Df Sum of Sq      RSS     AIC
## - usage:atlanta  1 0.0002442 0.052940 -1229.2
## <none>                       0.052696 -1227.9
## - pergore        1 0.0006680 0.053364 -1227.9
## - usage:perAA    1 0.0014526 0.054149 -1225.6
## - equip:econ     6 0.0076395 0.060336 -1218.4
## - equip:perAA    4 0.0075787 0.060275 -1214.6
## 
## Step:  AIC=-1229.19
## undercount ~ equip + econ + usage + atlanta + perAA + pergore + 
##     equip:econ + equip:perAA + usage:perAA
## 
##               Df Sum of Sq      RSS     AIC
## - atlanta      1 0.0001706 0.053111 -1230.7
## - pergore      1 0.0005771 0.053518 -1229.5
## <none>                     0.052940 -1229.2
## - usage:perAA  1 0.0013140 0.054254 -1227.3
## - equip:econ   6 0.0074104 0.060351 -1220.4
## - equip:perAA  4 0.0074178 0.060358 -1216.3
## 
## Step:  AIC=-1230.68
## undercount ~ equip + econ + usage + perAA + pergore + equip:econ + 
##     equip:perAA + usage:perAA
## 
##               Df Sum of Sq      RSS     AIC
## - pergore      1 0.0005162 0.053627 -1231.1
## <none>                     0.053111 -1230.7
## - usage:perAA  1 0.0012458 0.054357 -1229.0
## - equip:econ   6 0.0072977 0.060409 -1222.2
## - equip:perAA  4 0.0072498 0.060361 -1218.3
## 
## Step:  AIC=-1231.14
## undercount ~ equip + econ + usage + perAA + equip:econ + equip:perAA + 
##     usage:perAA
## 
##               Df Sum of Sq      RSS     AIC
## <none>                     0.053627 -1231.1
## - usage:perAA  1 0.0010214 0.054649 -1230.1
## - equip:econ   6 0.0075232 0.061150 -1222.3
## - equip:perAA  4 0.0068439 0.060471 -1220.0
## 
## Call:
## lm(formula = undercount ~ equip + econ + usage + perAA + equip:econ + 
##     equip:perAA + usage:perAA, data = gavote)
## 
## Coefficients:
##         (Intercept)           equipOS-CC           equipOS-PC  
##           0.0435310           -0.0128784            0.0034922  
##          equipPAPER           equipPUNCH             econpoor  
##          -0.0578329           -0.0142618            0.0180113  
##            econrich           usageurban                perAA  
##          -0.0157358           -0.0006736           -0.0389879  
## equipOS-CC:econpoor  equipOS-PC:econpoor  equipPAPER:econpoor  
##          -0.0114503            0.0424178                   NA  
## equipPUNCH:econpoor  equipOS-CC:econrich  equipOS-PC:econrich  
##          -0.0160832            0.0047127           -0.0111987  
## equipPAPER:econrich  equipPUNCH:econrich     equipOS-CC:perAA  
##                  NA            0.0168340            0.1181524  
##    equipOS-PC:perAA     equipPAPER:perAA     equipPUNCH:perAA  
##           0.0321434            0.1260840            0.1243346  
##    usageurban:perAA  
##          -0.0472147
drop1(smallm, test = "F")
## Single term deletions
## 
## Model:
## undercount ~ equip + econ + usage + perAA + equip:econ + equip:perAA + 
##     usage:perAA
##             Df Sum of Sq      RSS     AIC F value   Pr(>F)   
## <none>                   0.053627 -1231.1                    
## equip:econ   6 0.0075232 0.061150 -1222.3  3.2500 0.005084 **
## equip:perAA  4 0.0068439 0.060471 -1220.0  4.4348 0.002101 **
## usage:perAA  1 0.0010214 0.054649 -1230.1  2.6474 0.105984   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

It shows usage:perAA term can be dropped.

Final model

finalm <- lm(undercount ~ equip + econ + perAA + equip:econ + equip:perAA, gavote)
summary(finalm)
## 
## Call:
## lm(formula = undercount ~ equip + econ + perAA + equip:econ + 
##     equip:perAA, data = gavote)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.057011 -0.010632 -0.000069  0.009198  0.082545 
## 
## Coefficients: (2 not defined because of singularities)
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.041871   0.005028   8.328  6.5e-14 ***
## equipOS-CC          -0.011327   0.007373  -1.536 0.126700    
## equipOS-PC           0.008575   0.011178   0.767 0.444288    
## equipPAPER          -0.058427   0.037014  -1.579 0.116687    
## equipPUNCH          -0.015751   0.018745  -0.840 0.402175    
## econpoor             0.020266   0.005529   3.665 0.000349 ***
## econrich            -0.016966   0.012392  -1.369 0.173128    
## perAA               -0.042040   0.016594  -2.534 0.012385 *  
## equipOS-CC:econpoor -0.010964   0.009885  -1.109 0.269224    
## equipOS-PC:econpoor  0.048385   0.013795   3.507 0.000608 ***
## equipPAPER:econpoor        NA         NA      NA       NA    
## equipPUNCH:econpoor -0.003560   0.012427  -0.286 0.774921    
## equipOS-CC:econrich  0.002278   0.015378   0.148 0.882465    
## equipOS-PC:econrich -0.013318   0.017054  -0.781 0.436149    
## equipPAPER:econrich        NA         NA      NA       NA    
## equipPUNCH:econrich  0.020031   0.021997   0.911 0.364045    
## equipOS-CC:perAA     0.107249   0.032855   3.264 0.001377 ** 
## equipOS-PC:perAA    -0.005906   0.043414  -0.136 0.891981    
## equipPAPER:perAA     0.129136   0.081806   1.579 0.116676    
## equipPUNCH:perAA     0.086849   0.046500   1.868 0.063875 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02 on 141 degrees of freedom
## Multiple R-squared:  0.4276, Adjusted R-squared:  0.3585 
## F-statistic: 6.195 on 17 and 141 DF,  p-value: 1.318e-10

The coefficient for equipPAPER:econrich is not estimated because there are no counties that are rich and use paper ballot. The corresponding column in \(\mathbf{X}\) matrix is all zeros.

gavote %>%
  filter(equip == "PAPER" & econ == "rich") %>%
  count()
## # A tibble: 1 x 1
##       n
##   <int>
## 1     0

The coefficient for equipPAPER:econpoor is not estimated because there are only two counties that use paper ballot and both of them are poor. So the corresponding colmuns for equipPAPER and equipPAPER:econpoor are identical. In other words these two predictors are not identifiable. Therefore only equipPAPER is estimated but not equipPAPER:econpoor.

gavote %>%
  filter(equip == "PAPER")
## # A tibble: 2 x 16
##   county equip econ  perAA usage atlanta  gore  bush other votes ballots
##   <chr>  <fct> <fct> <dbl> <fct> <fct>   <int> <int> <int> <int>   <int>
## 1 MONTG… PAPER poor  0.243 rural notAtl…  1013  1465    31  2509    2573
## 2 TALIA… PAPER poor  0.596 rural notAtl…   556   271     5   832     881
## # … with 5 more variables: undercount <dbl>, pergore <dbl>, perbush <dbl>,
## #   cpergore <dbl>, cperAA <dbl>
(pdf <- tibble(econ  = rep(levels(gavote$econ), 5), 
               equip = rep(levels(gavote$equip), rep(3, 5)),
               perAA = 0.233))
## # A tibble: 15 x 3
##    econ   equip perAA
##    <chr>  <chr> <dbl>
##  1 middle LEVER 0.233
##  2 poor   LEVER 0.233
##  3 rich   LEVER 0.233
##  4 middle OS-CC 0.233
##  5 poor   OS-CC 0.233
##  6 rich   OS-CC 0.233
##  7 middle OS-PC 0.233
##  8 poor   OS-PC 0.233
##  9 rich   OS-PC 0.233
## 10 middle PAPER 0.233
## 11 poor   PAPER 0.233
## 12 rich   PAPER 0.233
## 13 middle PUNCH 0.233
## 14 poor   PUNCH 0.233
## 15 rich   PUNCH 0.233
pp <- predict(finalm, new = pdf)
## Warning in predict.lm(finalm, new = pdf): prediction from a rank-deficient fit
## may be misleading
xtabs(round(pp, 3) ~ econ + equip, pdf)
##         equip
## econ      LEVER  OS-CC  OS-PC  PAPER  PUNCH
##   middle  0.032  0.046  0.039  0.004  0.037
##   poor    0.052  0.055  0.108  0.024  0.053
##   rich    0.015  0.031  0.009 -0.013  0.040
pdf <- tibble(econ  = rep("middle", 15), 
              equip = rep(levels(gavote$equip), rep(3, 5)), 
              perAA = rep(c(.11, 0.23, 0.35), 5))
pp <- predict(finalm, new = pdf)
## Warning in predict.lm(finalm, new = pdf): prediction from a rank-deficient fit
## may be misleading
propAA <- gl(3, 1, 15, labels = c("low", "medium", "high"))
xtabs(round(pp, 3) ~ propAA + equip, pdf)
##         equip
## propAA    LEVER  OS-CC  OS-PC  PAPER  PUNCH
##   low     0.037  0.038  0.045 -0.007  0.031
##   medium  0.032  0.046  0.039  0.003  0.036
##   high    0.027  0.053  0.034  0.014  0.042