- recap of last time
- likelihood principle
- some notes on Bayes factor
- linear regression
- motivating examples
- least squares estimation
adv <- read.csv('data/Advertising.csv')[, -1]  # drop the row-index column
head(adv)
##      TV Radio Newspaper Sales
## 1 230.1  37.8      69.2  22.1
## 2  44.5  39.3      45.1  10.4
## 3  17.2  45.9      69.3   9.3
## 4 151.5  41.3      58.5  18.5
## 5 180.8  10.8      58.4  12.9
## 6   8.7  48.9      75.0   7.2
with(adv, plot(TV, Sales, col = 'red'))
abline(lm(Sales ~ TV, data = adv), col = 'blue', lwd = 2)
m <- lm(Sales ~ TV, data = adv)
summary(m)
## 
## Call:
## lm(formula = Sales ~ TV, data = adv)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.3860 -1.9545 -0.1913  2.0671  7.2124 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.032594   0.457843   15.36   <2e-16 ***
## TV          0.047537   0.002691   17.67   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.259 on 198 degrees of freedom
## Multiple R-squared:  0.6119, Adjusted R-squared:  0.6099 
## F-statistic: 312.1 on 1 and 198 DF,  p-value: < 2.2e-16
m <- lm(Sales ~ TV + Radio, data = adv)
summary(m)
## 
## Call:
## lm(formula = Sales ~ TV + Radio, data = adv)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.7977 -0.8752  0.2422  1.1708  2.8328 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.92110    0.29449   9.919   <2e-16 ***
## TV           0.04575    0.00139  32.909   <2e-16 ***
## Radio        0.18799    0.00804  23.382   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.681 on 197 degrees of freedom
## Multiple R-squared:  0.8972, Adjusted R-squared:  0.8962 
## F-statistic: 859.6 on 2 and 197 DF,  p-value: < 2.2e-16
set.seed(1774)
n <- 50
hat <- rnorm(n, mean = 50, sd = 5)    # creativity scores for the hat-wearing group
nohat <- rnorm(n, mean = 50, sd = 5)  # same distribution, so there is no true effect
dat <- data.frame(creativity = c(nohat, hat), hat = rep(0:1, each = n))
head(dat)
##   creativity hat
## 1   45.42617   0
## 2   56.49689   0
## 3   51.56313   0
## 4   48.18651   0
## 5   46.22818   0
## 6   49.85087   0
m <- lm(creativity ~ hat, data = dat)
summary(m)
## 
## Call:
## lm(formula = creativity ~ hat, data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.1598  -2.7061  -0.1059   2.7183   9.8746 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  49.2720     0.6617  74.467   <2e-16 ***
## hat           1.3873     0.9357   1.483    0.141    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.679 on 98 degrees of freedom
## Multiple R-squared:  0.02194, Adjusted R-squared:  0.01196 
## F-statistic: 2.198 on 1 and 98 DF,  p-value: 0.1414
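note that regressing creativity on the binary hat indicator is equivalent to the classical two-sample t-test with pooled variance, which we can confirm directly:

t.test(creativity ~ hat, data = dat, var.equal = TRUE)  # same t statistic (up to sign) and p-value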
\[ y = f(X) + \epsilon \]
set.seed(1774)  # make example reproducible
n <- 100
x <- rnorm(n, mean = 50, sd = 10)
error <- rnorm(n, mean = 0, sd = 20)
f <- function(x, b0 = 10, b1 = 3) b0 + b1*x
y <- f(x) + error
head(cbind(y, x), 10)
##               y        x
##  [1,] 165.36433 57.97829
##  [2,] 120.13588 47.69310
##  [3,] 188.86584 55.68723
##  [4,] 134.62419 41.57817
##  [5,] 185.08043 50.68255
##  [6,] 134.88544 49.36588
##  [7,] 231.22224 63.81083
##  [8,] 143.19750 54.03224
##  [9,] 121.70756 37.80916
## [10,]  93.51492 39.35512
\[ f(X) = X\beta \]
which yields \[ y = X\beta + \epsilon \]
we need to estimate the coefficient vector \(\hat \beta\) (and the error variance \(\hat \sigma^2\)); least squares chooses \(\hat \beta\) such that
\[ \hat \beta = \underset{\beta} {\text{argmin}} \, \sum_{i=1}^n (y_i - \hat y_i)^2 \]
\[ \begin{align*} 1b_1 + 1b_2 &= 1 \\ 1b_1 + 2b_2 &= 2 \\ 1b_1 + 3b_2 &= 2 \end{align*} \]
or more concisely \[ \begin{pmatrix} 1 & 1 \\ 1 & 2 \\ 1 & 3 \end{pmatrix} \begin{pmatrix} b_1 \\ b_2 \end{pmatrix} = \begin{pmatrix} 1 \\ 2 \\ 2 \end{pmatrix} \]
however, this system is overdetermined and has no exact solution, so we want to find the best approximate solution
\[ \begin{align*} RSS &= \sum_{i=1}^{n} (y_i - \hat y_i)^2 \\[2ex] &= (y - X\hat \beta)^T(y - X\hat \beta) \\[2ex] &= ||y - X\hat \beta||^2 \\[2ex] &= ||\mathbf{e}||^2 \end{align*} \]
\[ \begin{align*} x^Te &= 0 \\[1ex] x^T(y - x \hat b) &= 0 \\[1ex] x^Ty &= x^Tx \hat b \\[1ex] \hat b &= \frac{x^Ty}{x^Tx} \end{align*} \]
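as a quick numerical check of this single-predictor, no-intercept estimator (the vectors x and y here are purely illustrative):

x <- c(1, 2, 3)
y <- c(1, 2, 2)
sum(x * y) / sum(x * x)  # closed form: x'y / x'x
coef(lm(y ~ 0 + x))      # lm without an intercept agrees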
\[ \begin{align*} X^Te &= 0 \\[1ex] X^T(y - X \hat b) &= 0 \\[1ex] X^T X \hat b &= X^Ty \\[1ex] \hat b &= (X^T X)^{-1} X^Ty \end{align*} \]
\[ X\beta = y \]
\[ \begin{align*} X^T X \hat \beta &= X^T y \\[2ex] \hat \beta &= (X^TX)^{-1} X^T y \end{align*} \]
\[ \begin{align*} 1b_1 + 1b_2 &= 1 \\ 1b_1 + 2b_2 &= 2 \\ 1b_1 + 3b_2 &= 2 \end{align*} \]
xx <- c(1, 2, 3)  # predictor values from the system above
yy <- c(1, 2, 2)  # right-hand side
X <- unname(cbind(1, xx))

estimate <- function(X, y) {
  solve(t(X) %*% X) %*% (t(X) %*% y)  # (X'X)^{-1} X'y
}

estimate(X, yy)
## [,1] ## [1,] 0.6666667 ## [2,] 0.5000000
coef(lm(yy ~ xx))
## (Intercept) xx ## 0.6666667 0.5000000
\[ \begin{align*} 2/3 + 1/2 &\approx 1 \\ 2/3 + 2 \times 1/2 &\approx 2 \\ 2/3 + 3 \times 1/2 &\approx 2 \end{align*} \]
\[ \begin{align*} \text{-}1/6 &= \epsilon_1 \\ 1/3 &= \epsilon_2 \\ \text{-}1/6 & = \epsilon_3 \end{align*} \]
\[ \begin{align*} 1b_1 + 1b_2 + 0b_3 &= 1 \\ 1b_1 + 2b_2 + 2b_3 &= 2 \\ 1b_1 + 1b_2 + 0b_3 &= 2 \\ 1b_1 + 2b_2 - 2b_3 &= 3 \end{align*} \]
\[ \begin{pmatrix} 1 & 1 & 0 \\ 1 & 2 & 2 \\ 1 & 1 & 0 \\ 1 & 2 & \text{-} 2 \end{pmatrix} \begin{pmatrix} b_1 \\ b_2 \\ b_3 \end{pmatrix} = \begin{pmatrix} 1 \\ 2 \\ 2 \\ 3 \end{pmatrix} \]
\[ \begin{align*} X \hat \beta &= y \\[2ex] X^T X \hat \beta &= X^T y \\[2ex] \hat \beta &= (X^TX)^{-1} X^T y \end{align*} \]
x1 <- c(1, 2, 1, 2)   # second column of the system above
x2 <- c(0, 2, 0, -2)  # third column
yy2 <- c(1, 2, 2, 3)  # right-hand side
X <- unname(cbind(1, x1, x2))
estimate(X, yy2)
## [,1] ## [1,] 0.50 ## [2,] 1.00 ## [3,] -0.25
coef(lm(yy2 ~ x1 + x2))
## (Intercept) x1 x2 ## 0.50 1.00 -0.25
recall \[ y = X\beta + \epsilon \]
assuming independent, normally distributed errors, \(\epsilon \sim \mathcal{N}(0, \sigma^2 I)\), the least squares estimator corresponds to the maximum likelihood estimator
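a minimal sketch of that correspondence, reusing the simulation setup from above (negloglik is an ad hoc helper, not from any package): maximizing the normal log-likelihood numerically recovers the least squares coefficients.

set.seed(1774)
n <- 100
x <- rnorm(n, mean = 50, sd = 10)
y <- 10 + 3*x + rnorm(n, mean = 0, sd = 20)

# negative log-likelihood under y ~ N(b0 + b1*x, sd^2)
negloglik <- function(par) {
  mu <- par[1] + par[2] * x
  -sum(dnorm(y, mean = mu, sd = exp(par[3]), log = TRUE))  # exp() keeps the sd positive
}

opt <- optim(c(mean(y), 0, log(sd(y))), negloglik)
opt$par[1:2]     # ML estimates of intercept and slope
coef(lm(y ~ x))  # least squares agrees, up to optimizer tolerance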
\[
t_j = \frac{\hat \beta_j - 0}{\sqrt{\mathrm{Var}(\hat \beta)_{jj}}}
\]
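these t values can be computed directly from the estimated coefficients and their covariance matrix:

m <- lm(Sales ~ TV + Radio, data = adv)
coef(m) / sqrt(diag(vcov(m)))  # t statistics, cf. summary(m) above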
- use an F-test for the whole model, comparing a smaller model (residual sum of squares \(RSS_0\), \(p_0\) coefficients) against a larger one (\(RSS_1\), \(p_1\) coefficients); a by-hand check follows \[
F = \frac{(RSS_0 - RSS_1) / (p_1 - p_0)}{\hat \sigma_1^2}, \qquad \hat \sigma_1^2 = \frac{RSS_1}{n - p_1}
\]
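a by-hand check on the advertising data, testing the intercept-only model against the TV + Radio model (p counts estimated coefficients, including the intercept):

m0 <- lm(Sales ~ 1, data = adv)           # null model
m1 <- lm(Sales ~ TV + Radio, data = adv)  # full model
rss0 <- sum(resid(m0)^2)
rss1 <- sum(resid(m1)^2)
n <- nrow(adv); p0 <- 1; p1 <- 3
((rss0 - rss1) / (p1 - p0)) / (rss1 / (n - p1))  # ~859.6, cf. summary(m) above
anova(m0, m1)                                    # built-in nested-model comparison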
estimate <- function(y, X) {
  p <- ncol(X)  # number of columns, including the intercept
  n <- length(y)
  df <- n - p   # residual degrees of freedom (p already counts the intercept)
  inv <- solve(t(X) %*% X)
  beta <- inv %*% (t(X) %*% y)
  err <- y - X %*% beta
  sigma2 <- 1/df * sum(err^2)  # unbiased estimate of the error variance
  covmat <- inv * as.numeric(sigma2)
  varbeta <- diag(covmat)

  # run t-tests (two-sided, robust to the sign of the t statistic)
  tstats <- beta / sqrt(varbeta)
  pvals <- 2 * pt(-abs(tstats), df = df)

  # wrap it up
  res <- cbind(beta, tstats, pvals)
  colnames(res) <- c('beta', 'tstat', 'pval')
  res
}
glm.bayes <- function(y, X, n.iter = 10000) {
  ms <- '
  model {
    sigma ~ dunif(0, 100)
    tau <- pow(sigma, -2)

    for (i in 1:p) {
      b[i] ~ dnorm(0, .001)
    }

    mu <- X %*% b
    for (i in 1:n) {
      y[i] ~ dnorm(mu[i], tau)
    }
  }'

  params <- c('b', 'sigma')
  jags.dat <- list('n' = length(y), 'y' = y, 'X' = X, 'p' = ncol(X))
  model <- jags.model(textConnection(ms), data = jags.dat,
                      n.chains = 3, quiet = TRUE)
  samples <- coda.samples(model, variable.names = params,
                          n.iter = n.iter, progress.bar = 'none')
  samples
}
library('rjags')
X <- as.matrix(cbind(1, adv[, 1:2]))  # intercept, TV, Radio; pass JAGS a numeric matrix
samples <- glm.bayes(adv$Sales, X)
head(samples)[[1]]  # look at first chain
## Markov Chain Monte Carlo (MCMC) output:
## Start = 1001 
## End = 1007 
## Thinning interval = 1 
##          b[1]       b[2]      b[3]    sigma
## [1,] 3.245187 0.04469467 0.1798017 1.567023
## [2,] 3.084721 0.04468296 0.1833741 1.619081
## [3,] 2.762136 0.04673752 0.1818689 1.750262
## [4,] 2.974022 0.04465194 0.1907283 1.681974
## [5,] 2.788118 0.04436529 0.1966705 1.715625
## [6,] 2.698615 0.04629846 0.1952554 1.694359
## [7,] 2.713838 0.04561158 0.2020620 1.700925
summary(samples)
## 
## Iterations = 1001:11000
## Thinning interval = 1 
## Number of chains = 3 
## Sample size per chain = 10000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##          Mean       SD  Naive SE Time-series SE
## b[1]  2.91883 0.302523 1.747e-03      5.382e-03
## b[2]  0.04577 0.001427 8.236e-06      2.117e-05
## b[3]  0.18799 0.008219 4.745e-05      1.179e-04
## sigma 1.69201 0.085606 4.942e-04      6.426e-04
## 
## 2. Quantiles for each variable:
## 
##          2.5%     25%     50%     75%   97.5%
## b[1]  2.32780 2.71275 2.91880 3.12309 3.50848
## b[2]  0.04297 0.04482 0.04577 0.04673 0.04856
## b[3]  0.17171 0.18250 0.18798 0.19349 0.20417
## sigma 1.53555 1.63254 1.68827 1.74783 1.86926
library('coda')  # for convergence checks
gelman.diag(samples)
## Potential scale reduction factors:
## 
##       Point est. Upper C.I.
## b[1]           1          1
## b[2]           1          1
## b[3]           1          1
## sigma          1          1
## 
## Multivariate psrf
## 
## 1
library('ggmcmc')
ss <- ggs(samples)
ggs_autocorrelation(ss)
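a complementary diagnostic from coda is the effective sample size, which discounts the raw number of draws for autocorrelation:

effectiveSize(samples)  # effective number of independent draws per parameter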
instead of simply minimizing the residual sum of squares, we add a penalty term (\(q = 1\) gives the lasso, \(q = 2\) the ridge penalty): \[ \hat \beta = \underset{\beta} {\text{argmin}} \, \left( \sum_{i=1}^n (y_i - \hat y_i)^2 + \lambda \sum_{j=1}^p |\beta_j|^q \right) \]
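for \(q = 2\) the penalized estimator has a closed form, \(\hat \beta = (X^TX + \lambda I)^{-1} X^Ty\); a minimal sketch (estimate.ridge is a hypothetical helper, and for simplicity it penalizes the intercept too, which one would normally avoid):

estimate.ridge <- function(X, y, lambda) {
  solve(t(X) %*% X + lambda * diag(ncol(X))) %*% (t(X) %*% y)  # (X'X + lambda I)^{-1} X'y
}
X <- as.matrix(cbind(1, adv[, 1:2]))  # intercept, TV, Radio
estimate.ridge(X, adv$Sales, lambda = 10)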
lasso.bayes <- function(y, X, n.iter = 10000) {
  ms <- '
  model {
    sigma ~ dunif(0, 100)
    tau <- pow(sigma, -2)
    lambda ~ dunif(0, 10)

    b[1] ~ dnorm(0, .001)      # vague prior on the intercept
    for (i in 2:p) {
      b[i] ~ ddexp(0, lambda)  # double-exponential (Laplace) prior on the coefficients
    }

    mu <- X %*% b
    for (i in 1:n) {
      y[i] ~ dnorm(mu[i], tau)
    }
  }'

  # for the paper, see Park & Casella (2008), "The Bayesian Lasso"
  params <- c('b', 'sigma', 'lambda')
  jags.dat <- list('n' = length(y), 'y' = y, 'X' = X, 'p' = ncol(X))
  model <- jags.model(textConnection(ms), data = jags.dat, quiet = TRUE)
  samples <- coda.samples(model, variable.names = params,
                          n.iter = n.iter, progress.bar = 'none')
  samples
}
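lasso.bayes can be called just like glm.bayes above; a sketch on the advertising data (standardizing the predictors first, as is customary before applying a lasso-type penalty):

Xs <- cbind(1, scale(adv[, 1:3]))  # intercept plus standardized TV, Radio, Newspaper
samples.lasso <- lasso.bayes(adv$Sales, Xs)
summary(samples.lasso)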
library('BayesFactor')
bf <- regressionBF(Sales ~ TV + Radio + Newspaper, data = adv, progress = FALSE)
bf
## Bayes factor analysis
## --------------
## [1] TV                     : 9.455818e+38 ±0%
## [2] Radio                  : 9.019635e+15 ±0%
## [3] Newspaper              : 22.92162     ±0%
## [4] TV + Radio             : 1.391438e+94 ±0%
## [5] TV + Newspaper         : 4.806461e+41 ±0%
## [6] Radio + Newspaper      : 1.131977e+15 ±0.01%
## [7] TV + Radio + Newspaper : 5.437431e+92 ±0%
## 
## Against denominator:
##   Intercept only 
## ---
## Bayes factor type: BFlinearModel, JZS
post <- posterior(bf, index = 4, iterations = 10000, progress = FALSE)  # sample from model 4: TV + Radio
head(post)
## Markov Chain Monte Carlo (MCMC) output:
## Start = 1 
## End = 7 
## Thinning interval = 1 
##              TV     Radio     sig2         g
## [1,] 0.04885797 0.3061050 5.103436  1.298927
## [2,] 0.04563330 0.1842089 2.478365 10.798266
## [3,] 0.04478827 0.1928241 2.803676  6.213414
## [4,] 0.04558144 0.1813746 2.704055  5.332902
## [5,] 0.04567917 0.1926027 2.933725  3.606492
## [6,] 0.04645037 0.1959740 2.273238  1.335846
## [7,] 0.04468148 0.1888816 2.370580  3.017834
plot(post[, 1:2])
library('dplyr')
library('ggplot2')
library('magrittr')
library('babynames')
babynames
## Source: local data frame [1,792,091 x 5]
## 
##    year sex      name    n       prop
## 1  1880   F      Mary 7065 0.07238359
## 2  1880   F      Anna 2604 0.02667896
## 3  1880   F      Emma 2003 0.02052149
## 4  1880   F Elizabeth 1939 0.01986579
## 5  1880   F    Minnie 1746 0.01788843
## 6  1880   F  Margaret 1578 0.01616720
## 7  1880   F       Ida 1472 0.01508119
## 8  1880   F     Alice 1414 0.01448696
## 9  1880   F    Bertha 1320 0.01352390
## 10 1880   F     Sarah 1288 0.01319605
## ..  ...  ...       ...  ...        ...
babynames %>%
  filter(name %>% substr(1, 3) %>% equals("Mar")) %>%
  group_by(year, sex) %>%
  summarize(total = sum(n)) %>%
  qplot(year, total, color = sex, data = ., geom = "line") %>%
  add(ggtitle('Names starting with "Mar"'))