12 Regression R code reference
To model response variable \(Y\) by predictors \((X_1, X_2, \ldots, X_p)\), suppose we collect \(n\) data points. Let \((Y_i, X_{i1}, \ldots, X_{ip})\) denote the observed data on each case \(i \in \{1, 2, \ldots, n\}\). Then the Bayesian Normal regression model is as follows:
\[\begin{split} Y_i | \beta_0, \beta_1, \ldots, \beta_p, \sigma & \stackrel{ind}{\sim} N(\mu_i, \sigma^2) \;\; \text{ where } \mu_i = \beta_0 + \beta_1 X_i + \cdots + \beta_p X_p\\ \beta_{0c} & \sim N(m_0, s_0^2) \\ \beta_1 & \sim N(m_1, s_1^2) \\ \vdots & \\ \beta_p & \sim N(m_p, s_p^2) \\ \sigma & \sim \text{Exp}(l) \\ \end{split}\]
In the examples below, let:
dat
= name of your data sety
= variable withindat
representing your response variablex1
,x2
, etc = variables withindat
representing predictor variables
12.1 Simulating & checking the prior models
# Simulate the prior model
<- stan_glm(
prior_model ~ x1 + x2 + ... + xp, data = dat,
y family = gaussian,
prior_intercept = normal(m0, s0),
prior = normal([m1, m2, ..., mp], [s0, s1, ..., sp]),
prior_aux = exponential(l),
prior_PD = TRUE,
chains = ___, iter = ___, seed = ___)
Performing a prior check
# Summarize the prior models
prior_summary(prior_model)
# Prior predictive check: simulate 50 samples of y from the prior models
# We can do this no matter how many predictors we have
pp_check(prior_model) +
xlab("y")
# Plot 200 prior model lines
# Assuming 1 quantitative predictor x1
%>%
dat add_fitted_draws(prior_model, n = 200) %>%
ggplot(aes(x = x1, y = y)) +
geom_line(aes(y = .value, group = .draw), alpha = 0.15)
# Plot 200 prior model lines
# Assuming 1 quantitative predictor x1 and 1 categorical predictor x2
%>%
dat add_fitted_draws(prior_model, n = 200) %>%
ggplot(aes(x = x1, y = y, color = x2)) +
geom_line(aes(y = .value, group = paste(x2, .draw)), alpha = 0.15)
# Plot 4 prior simulated datasets
# Assuming 1 quantitative predictor x1
%>%
dat add_predicted_draws(prior_model, n = 4) %>%
ggplot(aes(x = x1, y = y)) +
geom_point(aes(y = .prediction, group = .draw), size = 0.1) +
facet_wrap(~ .draw)
# Plot 4 prior simulated datasets
# Assuming 1 quantitative predictor x1 and 1 categorical predictor x2
%>%
dat add_predicted_draws(prior_model, n = 4) %>%
ggplot(aes(x = x1, y = y, color = x2)) +
geom_point(aes(y = .prediction, group = paste(x2, .draw)), size = 0.1) +
facet_wrap(~ .draw)
12.2 Simulating the posterior
# Simulate the posterior model
# Assuming the prior_model was already simulated
<- update(prior_model, prior_PD = FALSE)
posterior_model
# Simulate the posterior model
# Assuming the prior_model was NOT already simulated
<- stan_glm(
posterior_model ~ x1 + x2 + ... + xp, data = dat,
y family = gaussian,
prior_intercept = normal(m0, s0),
prior = normal([m1, m2, ..., mp], [s0, s1, ..., sp]),
prior_aux = exponential(l),
prior_PD = FALSE,
chains = ___, iter = ___, seed = ___)
MCMC simulation diagnostics
# Trace plots
mcmc_trace(posterior_model)
# Overlaid density plots
mcmc_dens_overlay(posterior_model)
12.3 Posterior analysis & prediction
Posterior summaries
# Plot 200 posterior model lines
# Assuming 1 quantitative predictor x1
%>%
dat add_fitted_draws(posterior_model, n = 200) %>%
ggplot(aes(x = x1, y = y)) +
geom_line(aes(y = .value, group = .draw), alpha = 0.15) +
geom_point(data = dat)
# Plot 200 posterior model lines
# Assuming 1 quantitative predictor x1 and 1 categorical predictor x2
%>%
dat add_fitted_draws(posterior_model, n = 200) %>%
ggplot(aes(x = x1, y = y, color = x2)) +
geom_line(aes(y = .value, group = paste(x2, .draw)), alpha = 0.15) +
geom_point(data = dat)
# Get posterior summaries of the "fixed" parameters (betas)
# and "auxiliary" parameter (sigma)
tidy(temp_posterior, effects = c("fixed", "aux"),
conf.int = TRUE, conf.level = ___)
Posterior prediction for a SINGLE data point (using a shortcut)
# Simulate the posterior predictive model
set.seed(___)
<- posterior_predict(
shortcut_prediction newdata = data.frame(x1 = ___, x2 = ___, etc))
posterior_model,
# Construct a posterior credible interval
posterior_interval(shortcut_prediction, prob = ___)
# Plot the approximate predictive model
mcmc_areas(shortcut_prediction, prob = ___) +
xlab("y")
12.4 Posterior model evaluation
How wrong is the model?
# Construct a posterior predictive check
pp_check(posterior_model) +
xlab("y")
How accurate are the posterior predictions? (Visual summaries)
# Simulate the predictive models for all data points in dat
set.seed(___)
<- posterior_predict(posterior_model, newdata = dat)
predictions
# Plot posterior predictive models for all data points
# Assuming 1 quantitative predictor x1
ppc_intervals(dat$y, yrep = predictions, x = dat$x1,
prob = 0.5, prob_outer = 0.95) +
labs(x = "x1", y = "ys")
# Plot posterior predictive models for all data points
# Assuming 1 categorical predictor x2
ppc_violin_grouped(dat$y, yrep = predictions,
group = dat$x2, y_draw = "points") +
labs(y = "y")
# Plot posterior predictive models for all data points
# Assuming 1 quantitative predictor x1 and 1 categorical predictor x2
ppc_intervals_grouped(dat$y, yrep = predictions,
x = dat$x1, group = dat$x2,
prob = 0.5, prob_outer = 0.95,
facet_args = list(scales = "fixed")) +
labs(x = "x1", y = "y")
How accurate are the posterior predictions? (Numerical summaries)
# Calculate posterior prediction summaries (without cross-validation)
set.seed(___)
prediction_summary(model = posterior_model, data = dat)
# Calculate posterior prediction summaries (with cross-validation)
set.seed(___)
prediction_summary_cv(model = posterior_model, data = dat, k = 10)