Display system information and load tidyverse and faraway packages

sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.5
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] compiler_3.6.2  magrittr_1.5    tools_3.6.2     htmltools_0.4.0
##  [5] yaml_2.2.1      Rcpp_1.0.4.6    stringi_1.4.6   rmarkdown_2.1  
##  [9] knitr_1.28      stringr_1.4.0   xfun_0.13       digest_0.6.25  
## [13] rlang_0.4.5     evaluate_0.14
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.4
## ✓ tibble  3.0.0     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(faraway)

Parametric vs nonparametric models

Kernel estimators

for (bw in c(0.1, 0.5, 2)) {
  with(faithful, {
    plot(waiting ~ eruptions, col = gray(0.75))
    # Nadaraya–Watson kernel estimate with normal kernel
    lines(ksmooth(eruptions, waiting, "normal", bw))
  })
}

library(sm)
## Package 'sm', version 2.2-5.6: type help(sm) for summary information
with(faithful,
     sm.regression(eruptions, waiting, h = h.select(eruptions, waiting)))

with(exa, sm.regression(x, y, h = h.select(x, y)))

with(exb, sm.regression(x, y, h = h.select(x, y)))

Splines

Smoothing splines

  • Smoothing spline approach chooses \(\hat f\) to minize the modified least squares criterion \[ \frac 1n \sum_i [y_i - f(x_i)]^2 + \lambda \int [f''(x)]^2 \, dx, \] where \(\lambda > 0\) is the smoothing paramter and \(\int [f''(x)]^2 \, dx\) is a roughness penalty. For large \(\lambda\), the minimizer \(\hat f\) is smoother; for smaller \(\lambda\), the minizer \(\hat f\) is rougher. This is the smoothing spline fit.

  • The minimizer takes a special form: \(\hat f\) is a cubic spline (piecewise cubic polynomial in each interval \((x_i, x_{i+1})\)).

  • The tuning parameter \(\lambda\) is chosen by cross-validation (either leave-one-out (LOO) or generalized (GCV)) in R.

with(faithful, {
  plot(waiting ~ eruptions, col = gray(0.75))
  lines(smooth.spline(eruptions, waiting), lty = 2)
})

with(exa, {
  plot(y ~ x, col = gray(0.75))
  lines(x, m) # true model
  lines(smooth.spline(x, y), lty = 2)
})

with(exb, {
  plot(y ~ x, col = gray(0.75))
  lines(x, m) # true model
  lines(smooth.spline(x, y), lty = 2)
})

The last example exb shows that automatic choice of tuning parameter is not foolproof.

Regression splines

  • The regresison spline fit differs from the smoothing splines in that the number of knots can be much smaller than the sample size.

  • Piecewise linear splines:

# right hockey stick function (RHS) with a knot at c
rhs <- function(x, c) ifelse(x > c, x - c, 0)
curve(rhs(x, 0.5), 0, 1)

  • Define some knots for Example A
(knots <- 0:9 / 10)
##  [1] 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9

and compute a design matrix of splines with knots at these points for each \(x\):

# each column is a RHS function with a specific knot 
dm <- outer(exa$x, knots, rhs)
dim(dm)
## [1] 256  10
matplot(exa$x, dm, type = "l", col = 1, xlab = "x", ylab="")

  • Compute and dipslay the regression spline fit.
lmod <- lm(exa$y ~ dm)
plot(y ~ x, exa, col = gray(0.75))
lines(exa$x, predict(lmod))

  • We can acheive better fit by using more knots in denser regions of greater curvature.
newknots <- c(0, 0.5, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95)
dmn <- outer(exa$x, newknots, rhs)
lmod <- lm(exa$y ~ dmn)
plot(y ~ x, data = exa, col = gray(0.75))
lines(exa$x, predict(lmod))

  • High-order splines can produce a smoother fit. The bs() function can be used to generate the appropriate spline basis. The default is cubic B-splines.
library(splines)

matplot(bs(seq(0, 1, length = 1000), df = 12), type = "l", ylab="", col = 1)

# generate design matrix using B-splines with df = 12
lmod <- lm(y ~ bs(x, 12), exa)
plot(y ~ x, exa, col = gray(0.75))
lines(m ~ x, exa) # true model
lines(predict(lmod) ~ x, exa, lty = 2) # Cubic B-spline fit

Local polynomials

with(faithful, {
  plot(waiting ~ eruptions, col = gray(0.75))
  f <- loess(waiting ~ eruptions)
  i <- order(eruptions)
  lines(f$x[i], f$fitted[i])
})

with(exa, {
  plot(y ~ x, col = gray(0.75))
  lines(m ~ x)
  f <- loess(y ~ x) # default span = 0.75
  lines(f$x, f$fitted, lty = 2)
  # try smaller span (proportion of the range)
  f <- loess(y ~ x, span = 0.22)
  lines(f$x, f$fitted, lty = 5)
})

with(exb, {
  plot(y ~ x, col = gray(0.75))
  lines(m ~ x) 
  f <- loess(y ~ x) # default span = 0.75
  lines(f$x, f$fitted, lty = 2)
  # span = 1 means whole span of data (smoothest)
  f <- loess(y ~ x, span = 1)
  lines(f$x, f$fitted, lty = 5)
})

ggplot(data = exa, mapping = aes(x = x, y = y)) +
  geom_point(alpha = 0.25) + 
  geom_smooth(method = "loess", span = 0.22) + 
  geom_line(mapping = aes(x = x, y = m), linetype = 2)
## `geom_smooth()` using formula 'y ~ x'

library(mgcv)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.8-31. For overview type 'help("mgcv-package")'.
ggplot(data = exa, mapping = aes(x = x, y = y)) +
  geom_point(alpha = 0.25) + 
  geom_smooth(method = "gam", span = 0.22, formula = y ~ s(x, k = 20)) + 
  geom_line(mapping = aes(x = x, y = m), linetype = 2)

Wavelets (not covered in this course)

library(wavethresh)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:sm':
## 
##     muscle
## The following object is masked from 'package:dplyr':
## 
##     select
## WaveThresh: R wavelet software, release 4.6.8, installed
## Copyright Guy Nason and others 1993-2016
## Note: nlevels has been renamed to nlevelsWT
wds <- wd(exa$y, filter.number = 1, family = "DaubExPhase")
draw(wds)

plot(wds)

## [1] 6.12868 6.12868 6.12868 6.12868 6.12868 6.12868 6.12868 6.12868

Let’s only retain 3 levels of coefficients.

wtd <- threshold(wds, policy = "manual", value = 9999)
fd <- wr(wtd)
plot(y ~ x, exa, col = gray(0.75))
lines(m ~ x, exa)
lines(fd ~ x, exa, lty = 5, lwd = 2)

Or we can zero out only the small coefficients.

wtd2 <- threshold(wds)
fd2 <- wr(wtd2)
plot(y ~ x, exa, col = gray(0.75))
lines(m ~ x, exa)
lines(fd2 ~ x, exa, lty = 5, lwd = 2)

wds <- wd(exa$y, filter.number = 2, bc = "interval")
draw(filter.number = 2, family = "DaubExPhase")

plot(wds)

## [1] 3.3061 3.3061 3.3061 3.3061 3.3061 3.3061 3.3061 3.3061

Now we zero out small coefficients.

wtd <- threshold(wds)
fd <- wr(wtd)
## Warning in wr.wd(wtd): All optional arguments ignored for "wavelets on the
## interval" transform
plot(y ~ x, exa, col = gray(0.75))
lines(m ~ x, exa)
lines(fd ~ x, exa, lty=2)

x <- with(faithful, (eruptions - min(eruptions)) / (max(eruptions) - min(eruptions)))
gridof <- makegrid(x, faithful$waiting)
wdof <- irregwd(gridof, bc="symmetric")
wtof <- threshold(wdof)
wrof <- wr(wtof)
plot(waiting ~ eruptions, faithful, col = grey(0.75))
with(faithful, lines(seq(min(eruptions), max(eruptions), len=512), wrof))