First, install the stat536 package from GitHub and load the other required packages.

library(devtools)
install_github('andyhoegh/stat536')
## Skipping install of 'stat536' from a github remote, the SHA1 (5d5717cc) has not changed since last install.
##   Use `force = TRUE` to force installation
library(stat536)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(dlm)
## 
## Attaching package: 'dlm'
## The following object is masked from 'package:ggplot2':
## 
##     %+%
library(datasets)

1. Random Walk plus Noise Model (Local Level Model)

The first model we will look at has a time-varying mean, with white noise. Specifically, this model can be written as \[\begin{eqnarray} y_t &=& \theta_t + v_t \\ \theta_t &=& \theta_{t-1} + w_t, \end{eqnarray}\]

where \(v_t \sim N(0, V)\) and \(w_t \sim N(0, W)\).

This model is appropriate when there is no clear trend or seasonality.
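Before simulating with the packaged function below, it may help to see the recursion written out directly. The following is a minimal hand-rolled sketch of the data-generating process (illustrative only, not the stat536 implementation):

# illustrative simulator for the local level model (not stat536::sim_locallevel)
sim_ll_sketch <- function(num.pts, V, W, mu.0){
  theta <- y <- rep(0, num.pts)
  theta.prev <- mu.0
  for (t in 1:num.pts){
    theta[t] <- theta.prev + rnorm(1, mean = 0, sd = sqrt(W))  # state evolution
    y[t] <- theta[t] + rnorm(1, mean = 0, sd = sqrt(V))        # observation equation
    theta.prev <- theta[t]
  }
  list(y = y, mu = theta)
}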

1.1 Simulate from Local Level

set.seed(11152018)
num.pts <- 100 
V <- 10
W <- 3
sim.ll <- sim_locallevel(num.pts = num.pts, V = V, W = W, mu.0 = 0)

ll <- data.frame(val = c(sim.ll$y, sim.ll$mu), t = rep(1:num.pts, 2), label = rep(c('y', 'mu'), each = num.pts))
ggplot(data = ll, aes(y = val, x = t)) + geom_line(aes(color = label))

  • Q: vary V and W. What do you observe? The ratio W/V is known as the signal-to-noise ratio (a quick comparison is sketched below).
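For instance, a quick comparison of a high and a low signal-to-noise ratio might look like this (the specific variance values are arbitrary choices):

# high signal-to-noise (W/V = 10) versus low (W/V = 0.1); values are illustrative
sim.high <- sim_locallevel(num.pts = num.pts, V = 1, W = 10, mu.0 = 0)
sim.low <- sim_locallevel(num.pts = num.pts, V = 10, W = 1, mu.0 = 0)
par(mfcol = c(1, 2))
plot(sim.high$y, type = 'l', ylab = 'y', xlab = 't', main = 'W/V = 10')
plot(sim.low$y, type = 'l', ylab = 'y', xlab = 't', main = 'W/V = 0.1')
par(mfcol = c(1, 1))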

1.2 Fit Local Level Model - Variance known

To fit this model, there are three iterative steps that are repeated for each time point:

  • Prior: \[\theta_{t-1}|y_{t-1}, \dots, y_{0} \sim N(\mu_{t-1}, C_{t-1})\]

  • Prediction: \[\theta_t|y_{t-1}, \dots, y_{0} \sim N(a_t, R_t),\] where in this case \(a_t = \mu_{t-1}\) and \(R_t = C_{t-1} + W\) \[y_t|y_{t-1} \sim N(f_t, Q_t)\] where in this case \(f_t = a_t\) and \(Q_t = R_t + V\).

  • Estimation (filtering): \[\theta_t|y_t, \dots, y_1 \sim N(\mu_t, C_t)\] where \(\mu_t = a_t + \frac{R_t}{R_t + V} (y_t - f_t)\) and \(C_t = R_t - \frac{R_t}{R_t + V} R_t\)

The code below works through these equations, assuming V and W are known, to obtain the predictive distribution.

## Code extracted from filter_ll()
y <- sim.ll$y
mu0 <- 0
C0 <- 100

a <- R <- f <- Q <- mu <- C <- rep(0, num.pts)

# evolution dist parameters (theta_t|y_{t-1})
a[1] <- mu0
R[1] <- C0 + W

# predictive dist parameters (y_t|y_{t-1})
f[1] <- a[1]
Q[1] <- R[1] + V

# filtering dist parameters (theta_{t}|y_t)
mu[1] <- a[1] + (R[1] / (R[1] + V)) * (y[1] - f[1])
C[1] <- R[1] - (R[1]/ (R[1] + V)) * R[1]

for (time.pts in 2:num.pts){
  # evolution dist parameters (theta_t|y_{t-1})
  a[time.pts] <- mu[time.pts - 1]
  R[time.pts] <- C[time.pts - 1] + W

  # predictive dist parameters (y_t|y_{t-1})
  f[time.pts] <- a[time.pts]
  Q[time.pts] <- R[time.pts] + V

  # filtering dist parameters (theta_{t}|y_t)
  mu[time.pts] <- a[time.pts] + (R[time.pts] / (R[time.pts] + V)) * (y[time.pts] - f[time.pts])
  C[time.pts] <- R[time.pts] - (R[time.pts]/ (R[time.pts] + V)) * R[time.pts]  
}

obs <- ll %>% filter(label == 'y') %>% ggplot(aes(y = val, x = t )) + geom_point() + geom_line(alpha=.2) + ggtitle('Observed Values') + ylab('')

preds <- data.frame(upper = f + 1.96 * sqrt(Q), lower =  f - 1.96 * sqrt(Q), mean = f, t = 1:num.pts)

obs + geom_line(data=preds, aes(y = upper, x=t), col='red', linetype=3 ) + geom_line(data=preds, aes(y = lower, x=t), col='red', linetype=3 ) + geom_line(data=preds, aes(y = mean, x=t), col='red', linetype=2 )
Observed points in black. Dotted red lines are 95% intervals for the predictive distribution. Dashed red line is the mean prediction.

  • Q: why is the initial variance so large?

  • Q: why is the predictive distribution plotted? What about the other two distributions; how might we visualize them? (One option is sketched below.)
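For the second question, one option is to overlay the filtering distribution using the mu and C vectors computed above:

# filtering distribution: mean and 95% intervals from mu and C computed above
filt <- data.frame(upper = mu + 1.96 * sqrt(C), lower = mu - 1.96 * sqrt(C), mean = mu, t = 1:num.pts)
obs + geom_line(data = filt, aes(y = upper, x = t), col = 'blue', linetype = 3) + geom_line(data = filt, aes(y = lower, x = t), col = 'blue', linetype = 3) + geom_line(data = filt, aes(y = mean, x = t), col = 'blue', linetype = 2)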

1.3 Fit Local Level Model - Variance Unknown

Now we also need to estimate the variance parameters (V and W). This is accomplished with a Gibbs sampler that uses the forward-filtering backward-sampling (FFBS) algorithm, implemented in the dlm package.

mcmc.output <- dlmGibbsDIG(y = y, mod = dlmModPoly(order = 1), a.y = 10, b.y = 10000, a.theta = 10, b.theta = 10000, n.sample = 5000)

We need to look at the posterior estimates for the parameters in the model, starting with V.

plot(mcmc.output$dV, type= 'l', main = 'Trace for V')
abline(h = V, col = 'red', lwd = 2)
Posterior estimates for V as a trace plot.

plot(mcmc.output$dW, type= 'l', main = 'Trace for W')
abline(h = W, col = 'red', lwd = 2)
Posterior estimates for W as a trace plot.
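Beyond trace plots, the draws can be summarized numerically. Here the first 1000 draws are discarded as burn-in, an arbitrary choice for illustration:

# posterior means and 95% credible intervals after discarding burn-in
burn <- 1:1000
c(mean = mean(mcmc.output$dV[-burn]), quantile(mcmc.output$dV[-burn], probs = c(.025, .975)))
c(mean = mean(mcmc.output$dW[-burn]), quantile(mcmc.output$dW[-burn], probs = c(.025, .975)))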

In this case we have samples of \(\theta_t\) given all of the observations, \(y_1, \dots, y_{100}\), since the Gibbs sampler conditions on the entire series.

dim(mcmc.output$theta)
## [1]  101    1 5000
sample.paths <- sample(5000,15)
plot(mcmc.output$theta[,1,sample.paths[1]], type='l', ylab =expression(theta), xlab = 'time', main = 'Sample paths of theta')
for (i in 2:15){
  lines(1:101, mcmc.output$theta[,1,sample.paths[i]])
}
lines(2:101, sim.ll$mu, col = 'red', lwd=2, lty = 2)
Sample paths for theta in black, true theta in red.

We can also get predictions from this model using the posterior parameters, but how??

The mean of the predictive distribution \(y_t|y_{t-1}, \dots, y_1\) is just \(\mu_{t-1}\) and the variance is \(C_{t-1} + W + V\).

Thus we can create samples from the predictive distribution by using the samples of \(\theta\) and propagating the error. However, note that we are using all of the data to estimate V and W.

# pair each column of noise with the corresponding posterior draw of V (or W);
# the default column-wise fill keeps the draws aligned, so byrow is not needed
V.matrix <- matrix(rnorm(5000 * 101, mean = 0, sd = sqrt(rep(mcmc.output$dV, each = 101))), nrow = 101, ncol = 5000)
W.matrix <- matrix(rnorm(5000 * 101, mean = 0, sd = sqrt(rep(mcmc.output$dW, each = 101))), nrow = 101, ncol = 5000)
y.pred <- mcmc.output$theta[,1,] + V.matrix + W.matrix

pred.mean <- rowMeans(y.pred)
pred.quant <- apply(y.pred, 1, quantile, probs = c(.025, .975))

plot(sim.ll$y, ylim = c(min(pred.quant), max(pred.quant)), pch =16, ylab = '', xlab = 'Time', main = 'Predictive Distribution')

lines(1:100, pred.quant[1,-1], col = 'red', lty = 2)
lines(1:100, pred.quant[2,-1], col = 'red', lty = 2)
lines(1:100, pred.mean[-1], col = 'red')

1.4 Fit Local Level Model - Variance Unknown to Nile data

data(Nile)
nile.output <- dlmGibbsDIG(y = Nile, mod = dlmModPoly(order = 1), a.y = 10, b.y = 10000, a.theta = 10, b.theta = 10000, n.sample = 5000)

As before, we can create samples from the predictive distribution by using the samples of \(\theta\) and propagating the error.

# pair each column of noise with the corresponding posterior draw of V (or W)
V.matrix <- matrix(rnorm(5000 * 101, mean = 0, sd = sqrt(rep(nile.output$dV, each = 101))), nrow = 101, ncol = 5000)
W.matrix <- matrix(rnorm(5000 * 101, mean = 0, sd = sqrt(rep(nile.output$dW, each = 101))), nrow = 101, ncol = 5000)
nile.pred <- nile.output$theta[,1,] + V.matrix + W.matrix

pred.mean <- rowMeans(nile.pred)
pred.quant <- apply(nile.pred, 1, quantile, probs = c(.025, .975))

plot(Nile, ylim = c(min(pred.quant), max(pred.quant)), pch =16, ylab = '', xlab = 'Time', main = 'Predictive Distribution')

lines(1871:1970, pred.quant[1,-1], col = 'red', lty = 2)
lines(1871:1970, pred.quant[2,-1], col = 'red', lty = 2)
lines(1871:1970, pred.mean[-1], col = 'red')

1.5 Other models for Nile Data

The dlmModPoly has the capability to fit more complicated models too. Consider the linear growth model: \[\begin{eqnarray*} y_t &=& \mu_t + v_t\\ \mu_t &=& \mu_{t-1} + \beta_{t-1} + w_{t,1}\\ \beta_t &=& \beta_{t-1} + w_{t,2}. \end{eqnarray*}\]

This can be fit using dlmModPoly(order = 2).

nile.output2 <- dlmGibbsDIG(y = Nile, mod = dlmModPoly(order = 2), a.y = 10, b.y = 10000, a.theta = 10, b.theta = 10000, n.sample = 5000)

mean.nile <- rowMeans(nile.output2$theta[,1,])
nile.quant <- apply(nile.output2$theta[,1,],1, quantile, probs=c(.025,.975))
plot(Nile,ylim = c(min(c(nile.quant,Nile)), max(c(nile.quant,Nile))), pch =16, ylab = '', xlab = 'Time', main = 'Filtering Distribution')

lines(1871:1970, mean.nile[-1], col = 'red')
lines(1871:1970, nile.quant[1,-1], col = 'red', lty = 2)
lines(1871:1970, nile.quant[2,-1], col = 'red', lty = 2)
This figure shows the filtering distribution for \(\mu_t\), which is different from the predictive distribution.

1.6 More about dlm

With the dlm package, the model type needs to be specified. We have seen dlmModPoly(), but dlm also contains:

  • dlmModSeas()
  • dlmModReg()
  • dlmModARMA()

Furthermore, these models can be combined to form a model with trends, regression components, ARMA elements, etc…
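For example, component models are combined with the + operator; the orders and frequency below are arbitrary choices for illustration:

# a local linear trend plus a quarterly seasonal component (illustrative)
combined.mod <- dlmModPoly(order = 2) + dlmModSeas(frequency = 4)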

2. Dynamic Regression

Another nice feature of state-space models is the ability to have time-varying regression coefficients. Furthermore, there is a straightforward way to model non-normal time series data.

Consider the following model: describe the parameters and explain what the model does.

\[\begin{eqnarray*} y_t &\sim& Bernoulli(\pi_t)\\ logit(\pi_t) &=& x_{t} \beta_{t}\\ \beta_{t} &=& \beta_{t-1} + w_{t} \\ \end{eqnarray*}\]

where \(w_{t} \sim N(0, W)\).

2.1 Simulate Dynamic Generalized Regression Model

set.seed(11192018)
num.pts <- 500
beta <- rep(3,num.pts)
x <- runif(num.pts)
W <- .05

for (time.pts in 2:num.pts){
  beta[time.pts] <- beta[time.pts - 1] + rnorm(1, mean = 0, sd = sqrt(W))
}

pi.t <- exp( x * beta) / (1 + exp(x * beta) )
y <- rbinom(num.pts, 1 ,pi.t)
plot(pi.t, type = 'l', lty = 2, ylim=c(0,1), xlab='time')
points(1:num.pts,y, pch = 16, cex = .8)
Time-varying probability shown as a dashed line; observed binary points as black dots.

2.2 Fit Dynamic Generalized Regression Model

2.3 Example for one time point

In this situation, the prediction and filtering equations do not have closed-form solutions; these distributions are defined through integrals that cannot be evaluated analytically.
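For instance, the prediction and filtering densities for this model take the form

\[\begin{eqnarray*} p(\beta_t | y_{t-1}, \dots, y_1) &=& \int p(\beta_t | \beta_{t-1}) \, p(\beta_{t-1} | y_{t-1}, \dots, y_1) \, d\beta_{t-1}\\ p(\beta_t | y_t, \dots, y_1) &\propto& p(y_t | \beta_t) \, p(\beta_t | y_{t-1}, \dots, y_1), \end{eqnarray*}\]

and with a Bernoulli likelihood and Gaussian evolution neither the integral nor the normalizing constant of the filtering density is available in closed form.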

An alternative is to use a class of algorithms known as sequential Monte Carlo, in particular an algorithm known as a particle filter.

The algorithm is initialized with a distribution for the parameters at time 0. (For simplicity we will assume that \(W\) is known, but this is not necessary.)

prior distributions
# Storage
num.particles <- 10000
beta.samples <- matrix(0,nrow=num.particles, ncol = num.pts + 1)

# initialize particles at time 0
beta.samples[,1] <- rnorm(num.particles, mean = 0, sd=4)

hist(beta.samples[,1], main = 'Prior at time 0', xlab = expression(beta), breaks = 'FD')

evolution distributions

Next, we evolve the particles to obtain the state distribution at the next time point.

# Storage
beta.evolution <- matrix(0, num.particles, num.pts)
  
# evolve particles from time 0 to time 1
beta.evolution[,1] <- beta.samples[,1] + rnorm(num.particles, mean = 0, sd = sqrt(W))

hist(beta.evolution[,1], main = 'Evolution Dist at time 1', xlab = expression(beta), breaks = 'FD')

predictive distributions
xb <-  beta.evolution[,1] * x[1]
probs <- exp(xb) / (1 + exp(xb))
y1 <- rbinom(num.particles, 1, probs)

par(mfcol=c(1,2))
hist(probs, main = 'distribution for pi', xlab = expression(pi[1]), breaks = 'FD')
hist(y1, breaks = c(-.5,.5,1.5),main = 'distribution for y', xlab = expression(y[1]), probability = TRUE)

compute importance weights and resample
# particles are proposed from the transition density, so the importance
# weight reduces to the likelihood p(y_1 | beta)
w <- dbinom(y[1], 1, prob = probs)

w.norm <- w / sum(w)

sample.pos <- base::sample(num.particles, prob = w.norm, replace = T)

beta.samples[,2] <- beta.evolution[sample.pos,1]


hist(beta.samples[,2], main = 'Posterior after time 1', xlab = expression(beta), breaks = 'FD')
abline(v=beta[1], col = 'red')

2.4 Example for all time points
for (time.pts in 2:num.pts){
  # evolution
  beta.evolution[,time.pts] <- beta.samples[,time.pts] + rnorm(num.particles, mean = 0, sd=sqrt(W))

  # weights and resample
  xb <- beta.evolution[,time.pts] * x[time.pts]
  probs <- exp(xb) / (1 + exp(xb))
  # proposal is the transition density, so the weight is the likelihood
  w <- dbinom(y[time.pts], 1, prob = probs)
  w.norm <- w / sum(w)

  sample.pos <- base::sample(num.particles, prob = w.norm, replace = T)

  beta.samples[,time.pts + 1] <- beta.evolution[sample.pos,time.pts]
}  

beta.q <- apply(beta.samples, 2, quantile, prob=c(.025,.975))
par(mfcol = c(1,1))
plot(1:num.pts, beta, ylim=c(min(beta.samples), max(beta.samples)), pch = 16, xlab = 'time')
lines(1:num.pts, beta.q[1, -1], lty = 2)
lines(1:num.pts, beta.q[2, -1], lty = 2)

x.matrix <- matrix(x, nrow = num.particles, ncol = num.pts, byrow = T)
prob.samples <- exp(beta.samples[,-1] * x.matrix) / (1 + exp(beta.samples[,-1] * x.matrix))
prob.q <- apply(prob.samples, 2, quantile, prob = c(.025,.975))

plot(1:num.pts, pi.t, ylim = c(0, 1), pch = 16, xlab = 'time', type = 'b')
lines(1:num.pts, colMeans(prob.samples), lty = 2, col = 'red', lwd = 2)
lines(1:num.pts, prob.q[1,], lty = 3, col = 'red')
lines(1:num.pts, prob.q[2,], lty = 3, col = 'red')