12 Regression R code reference

To model response variable \(Y\) by predictors \((X_1, X_2, \ldots, X_p)\), suppose we collect \(n\) data points. Let \((Y_i, X_{i1}, \ldots, X_{ip})\) denote the observed data on each case \(i \in \{1, 2, \ldots, n\}\). Then the Bayesian Normal regression model is as follows:

\[\begin{split} Y_i | \beta_0, \beta_1, \ldots, \beta_p, \sigma & \stackrel{ind}{\sim} N(\mu_i, \sigma^2) \;\; \text{ where } \mu_i = \beta_0 + \beta_1 X_i + \cdots + \beta_p X_p\\ \beta_{0c} & \sim N(m_0, s_0^2) \\ \beta_1 & \sim N(m_1, s_1^2) \\ \vdots & \\ \beta_p & \sim N(m_p, s_p^2) \\ \sigma & \sim \text{Exp}(l) \\ \end{split}\]

In the examples below, let:

dat = name of your data set
y = variable within dat representing your response variable
x1, x2, etc = variables within dat representing predictor variables

12.1 Simulating & checking the prior models

# Simulate the prior model
prior_model <- stan_glm(
  y ~ x1 + x2 + ... + xp, data = dat,
  family = gaussian,
  prior_intercept = normal(m0, s0),
  prior = normal([m1, m2, ..., mp], [s0, s1, ..., sp]),
  prior_aux = exponential(l),
  prior_PD = TRUE,
  chains = ___, iter = ___, seed = ___)

Performing a prior check

# Summarize the prior models
prior_summary(prior_model)

# Prior predictive check: simulate 50 samples of y from the prior models
# We can do this no matter how many predictors we have
pp_check(prior_model) + 
    xlab("y")

# Plot 200 prior model lines
# Assuming 1 quantitative predictor x1
dat %>%
  add_fitted_draws(prior_model, n = 200) %>%
  ggplot(aes(x = x1, y = y)) +
    geom_line(aes(y = .value, group = .draw), alpha = 0.15)

# Plot 200 prior model lines
# Assuming 1 quantitative predictor x1 and 1 categorical predictor x2
dat %>%
  add_fitted_draws(prior_model, n = 200) %>%
  ggplot(aes(x = x1, y = y, color = x2)) +
    geom_line(aes(y = .value, group = paste(x2, .draw)), alpha = 0.15)

# Plot 4 prior simulated datasets
# Assuming 1 quantitative predictor x1
dat %>%
  add_predicted_draws(prior_model, n = 4) %>%
  ggplot(aes(x = x1, y = y)) +
    geom_point(aes(y = .prediction, group = .draw), size = 0.1) +
    facet_wrap(~ .draw)

# Plot 4 prior simulated datasets
# Assuming 1 quantitative predictor x1 and 1 categorical predictor x2
dat %>%
  add_predicted_draws(prior_model, n = 4) %>%
  ggplot(aes(x = x1, y = y, color = x2)) +
    geom_point(aes(y = .prediction, group = paste(x2, .draw)), size = 0.1) +
    facet_wrap(~ .draw)

12.2 Simulating the posterior

# Simulate the posterior model
# Assuming the prior_model was already simulated
posterior_model <- update(prior_model, prior_PD = FALSE)

# Simulate the posterior model
# Assuming the prior_model was NOT already simulated
posterior_model <- stan_glm(
  y ~ x1 + x2 + ... + xp, data = dat,
  family = gaussian,
  prior_intercept = normal(m0, s0),
  prior = normal([m1, m2, ..., mp], [s0, s1, ..., sp]),
  prior_aux = exponential(l),
  prior_PD = FALSE,
  chains = ___, iter = ___, seed = ___)

MCMC simulation diagnostics

# Trace plots
mcmc_trace(posterior_model)

# Overlaid density plots
mcmc_dens_overlay(posterior_model)

12.3 Posterior analysis & prediction

Posterior summaries

# Plot 200 posterior model lines
# Assuming 1 quantitative predictor x1
dat %>%
  add_fitted_draws(posterior_model, n = 200) %>%
  ggplot(aes(x = x1, y = y)) +
    geom_line(aes(y = .value, group = .draw), alpha = 0.15) + 
      geom_point(data = dat)

# Plot 200 posterior model lines
# Assuming 1 quantitative predictor x1 and 1 categorical predictor x2
dat %>%
  add_fitted_draws(posterior_model, n = 200) %>%
  ggplot(aes(x = x1, y = y, color = x2)) +
    geom_line(aes(y = .value, group = paste(x2, .draw)), alpha = 0.15) + 
      geom_point(data = dat)

# Get posterior summaries of the "fixed" parameters (betas)
# and "auxiliary" parameter (sigma)
tidy(temp_posterior, effects = c("fixed", "aux"),
     conf.int = TRUE, conf.level = ___)

Posterior prediction for a SINGLE data point (using a shortcut)

# Simulate the posterior predictive model
set.seed(___)
shortcut_prediction <- posterior_predict(
  posterior_model, newdata = data.frame(x1 = ___, x2 = ___, etc))

# Construct a posterior credible interval
posterior_interval(shortcut_prediction, prob = ___)

# Plot the approximate predictive model
mcmc_areas(shortcut_prediction, prob = ___) + 
    xlab("y")

12.4 Posterior model evaluation

How wrong is the model?

# Construct a posterior predictive check
pp_check(posterior_model) + 
    xlab("y")

How accurate are the posterior predictions? (Visual summaries)

# Simulate the predictive models for all data points in dat
set.seed(___)
predictions <- posterior_predict(posterior_model, newdata = dat)

# Plot posterior predictive models for all data points
# Assuming 1 quantitative predictor x1
ppc_intervals(dat$y, yrep = predictions, x = dat$x1, 
                            prob = 0.5, prob_outer = 0.95) +
    labs(x = "x1", y = "ys")

# Plot posterior predictive models for all data points
# Assuming 1 categorical predictor x2
ppc_violin_grouped(dat$y, yrep = predictions,
                                     group = dat$x2, y_draw = "points") +
    labs(y = "y")

# Plot posterior predictive models for all data points
# Assuming 1 quantitative predictor x1 and 1 categorical predictor x2
ppc_intervals_grouped(dat$y, yrep = predictions,
                                            x = dat$x1, group = dat$x2,
                                            prob = 0.5, prob_outer = 0.95,
                                            facet_args = list(scales = "fixed")) + 
    labs(x = "x1", y = "y")

How accurate are the posterior predictions? (Numerical summaries)

# Calculate posterior prediction summaries (without cross-validation)
set.seed(___)
prediction_summary(model = posterior_model, data = dat)

# Calculate posterior prediction summaries (with cross-validation)
set.seed(___)
prediction_summary_cv(model = posterior_model, data = dat, k = 10)