--- title: "DLM Demo" author: "Andy Hoegh" date: "11/19/2018" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` First install the stat536 library and other packages. ```{r } library(devtools) install_github('andyhoegh/stat536') library(stat536) library(ggplot2) library(dplyr) library(dlm) library(datasets) ``` ### 1. Random Walk plus Noise Model (Local Level Model) The first model we will look at has a time-varying mean, with white noise. Specifically, this model can be written as \begin{eqnarray} y_t &=& \theta_t + v_t \\ \theta_t &=& \theta_{t-1} + w_t, \end{eqnarray} where $v_t \sim N(0, V)$ and $w_t \sim N(0, W)$, This model is appropriate when there is no clear trend or seasonality. #### 1.1 Simulate from Local Level ```{r} set.seed(11152018) num.pts <- 100 V <- 10 W <- 3 sim.ll <- sim_locallevel(num.pts = num.pts, V = 10, W = 3, mu.0 = 0) ll <- data.frame( val = c(sim.ll$y, sim.ll$mu), t = rep(1:num.pts, 2), label = rep(c('y','mu'), each = num.pts)) ggplot(data = ll, aes(y = val, x = t ) ) + geom_line(aes(color=(label))) ``` - __Q__: vary V and W. What do you observe? This ratio is known as the signal-to-noise ratio. #### 1.2 Fit Local Level Model - Variance known To fit this model there are there iterative steps that are repeated for each time point: - Prior: $$\theta_{t-1}|y_{t-1}, \dots, y_{0} \sim N(\mu_{t-1}, C_{t-1})$$ - Prediction: $$\theta_t|y_{t-1}, \dots, y_{0} \sim N(a_t, R_t),$$ where in this case $a_t = \mu_{t-1}$ and $R_t = C_{t-1} + W$ $$y_t|y_{t-1} \sim N(f_t, Q_t)$$ where in this case $f_t = a_t$ and $Q_t = R_t + V$. - Estimation (filtering): $$\theta_t|y_t, \dots, y_1 \sim N(\mu_t, C_t)$$ where $\mu_t = a_t + \frac{R_t}{R_t + V} (y_t - f_t)$ and $C_t = R_t - \frac{R_t}{R_t + V} R_t$ The code below works through these equations, assuming V and W are known, to obtain the predictive distribution. ```{r, fig.cap = 'Observed points in black. Dotted red lines are 95% intervals for the predictive distribution. Dashed red line is the mean prediction.'} ## Code extracted from filter_ll() y <- sim.ll$y mu0 <- 0 C0 <- 100 a <- R <- f <- Q <- mu <- C <- rep(0, num.pts) # evolution dist parameters (theta_t|y_{t-1}) a[1] <- mu0 R[1] <- C0 + W # predictive dist parameters (y_t|y_{t-1}) f[1] <- a[1] Q[1] <- R[1] + V # filtering dist parameters (theta_{t}|y_t) mu[1] <- a[1] + (R[1] / (R[1] + V)) * (y[1] - f[1]) C[1] <- R[1] - (R[1]/ (R[1] + V)) * R[1] for (time.pts in 2:num.pts){ # evolution dist parameters (theta_t|y_{t-1}) a[time.pts] <- mu[time.pts - 1] R[time.pts] <- C[time.pts - 1] + W # predictive dist parameters (y_t|y_{t-1}) f[time.pts] <- a[time.pts] Q[time.pts] <- R[time.pts] + V # filtering dist parameters (theta_{t}|y_t) mu[time.pts] <- a[time.pts] + (R[time.pts] / (R[time.pts] + V)) * (y[time.pts] - f[time.pts]) C[time.pts] <- R[time.pts] - (R[time.pts]/ (R[time.pts] + V)) * R[time.pts] } obs <- ll %>% filter(label == 'y') %>% ggplot(aes(y = val, x = t )) + geom_point() + geom_line(alpha=.2) + ggtitle('Observed Values') + ylab('') preds <- data.frame(upper = f + 1.96 * sqrt(Q), lower = f - 1.96 * sqrt(Q), mean = f, t = 1:num.pts) obs + geom_line(data=preds, aes(y = upper, x=t), col='red', linetype=3 ) + geom_line(data=preds, aes(y = lower, x=t), col='red', linetype=3 ) + geom_line(data=preds, aes(y = mean, x=t), col='red', linetype=2 ) ``` - __Q__: why is the initial variance so large? - __Q__: why is the predictive distribution plotted? What about the other two distributions, how might we visualize these? 
#### 1.3 Fit Local Level Model - Variance Unknown

Now we need to estimate the variance parameters (V and W) too. This is accomplished with a Gibbs sampler built on the forward-filtering backward-sampling (FFBS) algorithm, implemented in the `dlm` package.

```{r, cache = T}
mcmc.output <- dlmGibbsDIG(y = y, mod = dlmModPoly(order = 1),
                           a.y = 10, b.y = 10000,
                           a.theta = 10, b.theta = 10000,
                           n.sample = 5000)
```

We need to look at the posterior estimates for parameters in the model.

- First V.

```{r, fig.cap = 'Posterior estimates for V as a trace plot.'}
plot(mcmc.output$dV, type = 'l', main = 'Trace for V')
abline(h = V, col = 'red', lwd = 2)
```

- Next W.

```{r, fig.cap = 'Posterior estimates for W as a trace plot.'}
plot(mcmc.output$dW, type = 'l', main = 'Trace for W')
abline(h = W, col = 'red', lwd = 2)
```

In this case we also have samples of the states. Note that FFBS draws entire state trajectories, so these are samples of $\theta_t$ conditional on all of the data, $y_{1:T}$, not just $y_{1:t}$.

```{r, fig.cap = 'Sample paths for theta in black; true theta in red.'}
dim(mcmc.output$theta)
sample.paths <- sample(5000, 15)
plot(mcmc.output$theta[, 1, sample.paths[1]], type = 'l',
     ylab = expression(theta), xlab = 'time', main = 'Sample paths of theta')
for (i in 2:15){
  lines(1:101, mcmc.output$theta[, 1, sample.paths[i]])
}
lines(2:101, sim.ll$mu, col = 'red', lwd = 2, lty = 2)
```

We can also get predictions from this model using the posterior parameters, but how? Using the notation from Section 1.2, the predictive distribution $y_t|y_{t-1}$ has mean $f_t = \mu_{t-1}$ and variance $Q_t = C_{t-1} + W + V$. Thus we can create samples from the predictive distribution by taking the posterior samples of $\theta$ and propagating both the evolution and observation error. *However, note that we are using all of the data to estimate V, W, and the sampled $\theta$'s.*

```{r}
# column j of V.matrix / W.matrix uses the j-th posterior draw of V / W,
# matching the j-th sampled state trajectory in theta
V.matrix <- matrix(rnorm(101 * 5000, mean = 0,
                         sd = sqrt(rep(mcmc.output$dV, each = 101))),
                   nrow = 101, ncol = 5000)
W.matrix <- matrix(rnorm(101 * 5000, mean = 0,
                         sd = sqrt(rep(mcmc.output$dW, each = 101))),
                   nrow = 101, ncol = 5000)
# row 1 of theta is theta_0, so row t of y.pred is a draw of y_t
# built from theta_{t-1} + w_t + v_t
y.pred <- mcmc.output$theta[, 1, ] + V.matrix + W.matrix
pred.mean <- rowMeans(y.pred)
pred.quant <- apply(y.pred, 1, quantile, probs = c(.025, .975))
plot(sim.ll$y, ylim = c(min(pred.quant), max(pred.quant)), pch = 16,
     ylab = '', xlab = 'Time', main = 'Predictive Distribution')
lines(1:num.pts, pred.quant[1, 1:num.pts], col = 'red', lty = 2)
lines(1:num.pts, pred.quant[2, 1:num.pts], col = 'red', lty = 2)
lines(1:num.pts, pred.mean[1:num.pts], col = 'red')
```

#### 1.4 Fit Local Level Model - Variance Unknown (Nile Data)

```{r, cache = T}
data(Nile)
nile.output <- dlmGibbsDIG(y = Nile, mod = dlmModPoly(order = 1),
                           a.y = 10, b.y = 10000,
                           a.theta = 10, b.theta = 10000,
                           n.sample = 5000)
```

As before, the predictive distribution $y_t|y_{t-1}$ has mean $\mu_{t-1}$ and variance $C_{t-1} + W + V$, so we can create samples from it by taking the samples of $\theta$ and propagating the error. *However, note that we are using all of the data to estimate V and W.*
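Before returning to the sampling approach, here is a sketch of a genuine out-of-sample forecast beyond 1970 using `dlm::dlmForecast`. The names `V.hat`, `W.hat`, `nile.mod`, and `nile.fore` are ours, and plugging in posterior means ignores the posterior uncertainty in V and W, so treat this as illustrative only.

```{r}
# Plug-in forecast sketch: posterior means of V and W, then forecast
# 10 steps past the end of the Nile series.
V.hat <- mean(nile.output$dV)
W.hat <- mean(nile.output$dW)
nile.mod <- dlmModPoly(order = 1, dV = V.hat, dW = W.hat)
nile.fore <- dlmForecast(dlmFilter(Nile, nile.mod), nAhead = 10)
f.mean <- as.vector(nile.fore$f)   # forecast means
f.sd <- sqrt(unlist(nile.fore$Q))  # forecast standard deviations
cbind(mean = f.mean,
      lower = f.mean - 1.96 * f.sd,
      upper = f.mean + 1.96 * f.sd)
```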
```{r}
# column j uses the j-th posterior draws of V and W, matching theta's draws
V.matrix <- matrix(rnorm(101 * 5000, mean = 0,
                         sd = sqrt(rep(nile.output$dV, each = 101))),
                   nrow = 101, ncol = 5000)
W.matrix <- matrix(rnorm(101 * 5000, mean = 0,
                         sd = sqrt(rep(nile.output$dW, each = 101))),
                   nrow = 101, ncol = 5000)
# row t is a draw of y_t built from theta_{t-1} + w_t + v_t (row 1 is theta_0)
nile.pred <- nile.output$theta[, 1, ] + V.matrix + W.matrix
pred.mean <- rowMeans(nile.pred)
pred.quant <- apply(nile.pred, 1, quantile, probs = c(.025, .975))
plot(Nile, ylim = c(min(pred.quant), max(pred.quant)), pch = 16,
     ylab = '', xlab = 'Time', main = 'Predictive Distribution')
lines(1871:1970, pred.quant[1, 1:100], col = 'red', lty = 2)
lines(1871:1970, pred.quant[2, 1:100], col = 'red', lty = 2)
lines(1871:1970, pred.mean[1:100], col = 'red')
```

#### 1.5 Other Models for Nile Data

The `dlmModPoly` function has the capability to fit more complicated models too. Consider the linear growth model:
\begin{eqnarray*}
y_t &=& \mu_t + v_t\\
\mu_t &=& \mu_{t-1} + \beta_{t-1} + w_{t,1}\\
\beta_t &=& \beta_{t-1} + w_{t,2}.
\end{eqnarray*}
This can be fit using `dlmModPoly(order = 2)`.

```{r, cache = T, fig.cap = 'This figure shows the filtering distribution for mu, which is different than the predictive distribution.'}
nile.output2 <- dlmGibbsDIG(y = Nile, mod = dlmModPoly(order = 2),
                            a.y = 10, b.y = 10000,
                            a.theta = 10, b.theta = 10000,
                            n.sample = 5000)
mean.nile <- rowMeans(nile.output2$theta[, 1, ])
nile.quant <- apply(nile.output2$theta[, 1, ], 1, quantile, probs = c(.025, .975))
plot(Nile, ylim = c(min(c(nile.quant, Nile)), max(c(nile.quant, Nile))), pch = 16,
     ylab = '', xlab = 'Time', main = 'Filtering Distribution')
lines(1871:1970, mean.nile[-1], col = 'red')
lines(1871:1970, nile.quant[1, -1], col = 'red', lty = 2)
lines(1871:1970, nile.quant[2, -1], col = 'red', lty = 2)
```

#### 1.6 More about dlm

With the `dlm` package, the model type needs to be specified. We have seen `dlmModPoly()`, but `dlm` also contains:

- `dlmModSeas()`
- `dlmModReg()`
- `dlmModARMA()`

Furthermore, these models can be combined (via `+`) to form a model with trends, regression components, ARMA elements, etc.

### 2. Dynamic Regression

Another nice feature of state-space models is the ability to have time-varying regression coefficients. Furthermore, there is a straightforward way to model time series data that are non-normal. Consider the following model. Describe the parameters in this model and explain what the model does:
\begin{eqnarray*}
y_t &\sim& Bernoulli(\pi_t)\\
logit(\pi_t) &=& x_{t} \beta_{t}\\
\beta_{t} &=& \beta_{t-1} + w_{t},
\end{eqnarray*}
where $w_{t} \sim N(0, W)$.

#### 2.1 Simulate Dynamic Generalized Regression Model

```{r, fig.cap = 'Time-varying probability as a line and observed binary points as black dots.'}
set.seed(11192018)
num.pts <- 500
beta <- rep(3, num.pts)
x <- runif(num.pts)
W <- .05
for (time.pts in 2:num.pts){
  beta[time.pts] <- beta[time.pts - 1] + rnorm(1, mean = 0, sd = sqrt(W))
}
pi.t <- exp(x * beta) / (1 + exp(x * beta))
y <- rbinom(num.pts, 1, pi.t)
plot(pi.t, type = 'l', lty = 2, ylim = c(0, 1), xlab = 'time')
points(1:num.pts, y, pch = 16, cex = .8)
```

#### 2.2 Fit Dynamic Generalized Regression Model

##### 2.2.1 Example for one time point

In this situation, the prediction and filtering equations do not have closed-form solutions. Technically, these distributions are computed with a series of integrals. Another solution is to use a class of algorithms known as sequential Monte Carlo, and in particular an algorithm known as a particle filter.
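Before stepping through the details, the sketch below summarizes one step of a bootstrap particle filter for this model: propagate each particle through the state equation, weight by the likelihood of the new observation, and resample. The function `pf_step` and its arguments are our own illustrative names; the worked example that follows performs exactly these operations in place.

```{r}
# One bootstrap particle filter step (illustrative sketch).
# particles: draws approximating beta_{t-1} | y_{1:t-1}
# x.t, y.t:  covariate and binary response at time t
# W:         evolution variance (assumed known)
pf_step <- function(particles, x.t, y.t, W){
  prop <- particles + rnorm(length(particles), mean = 0, sd = sqrt(W)) # evolve
  p <- plogis(prop * x.t)                                              # P(y.t = 1)
  w <- dbinom(y.t, size = 1, prob = p)                                 # likelihood weights
  prop[sample(length(prop), replace = TRUE, prob = w / sum(w))]        # resample
}
```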
The algorithm is initialized with a distribution for the parameters at time 0. (For simplicity we will assume that $W$ is known, but this is not necessary.)

##### Prior distribution

```{r}
# Storage
num.particles <- 10000
beta.samples <- matrix(0, nrow = num.particles, ncol = num.pts + 1)

# initialize particles at time 0
beta.samples[, 1] <- rnorm(num.particles, mean = 0, sd = 4)
hist(beta.samples[, 1], main = 'Prior at time 0', xlab = expression(beta), breaks = 'FD')
```

##### Evolution distribution

Next we push the particles through the state equation to approximate the state distribution at the next time point.

```{r}
# Storage
beta.evolution <- matrix(0, num.particles, num.pts)

# propagate particles from time 0 to time 1
beta.evolution[, 1] <- beta.samples[, 1] + rnorm(num.particles, mean = 0, sd = sqrt(W))
hist(beta.evolution[, 1], main = 'Evolution Dist at time 1', xlab = expression(beta), breaks = 'FD')
```

##### Predictive distribution

```{r}
xb <- beta.evolution[, 1] * x[1]
probs <- exp(xb) / (1 + exp(xb))
y1 <- rbinom(num.particles, 1, probs)
par(mfcol = c(1, 2))
hist(probs, main = 'distribution for pi', xlab = expression(pi[1]), breaks = 'FD')
hist(y1, breaks = c(-.5, .5, 1.5), main = 'distribution for y',
     xlab = expression(y[1]), probability = TRUE)
```

##### Compute importance weights and resample

```{r}
# With a bootstrap proposal (particles drawn from the evolution distribution),
# the transition densities cancel, so the importance weight is the likelihood.
w <- dbinom(y[1], 1, prob = probs)
w.norm <- w / sum(w)
sample.pos <- base::sample(num.particles, prob = w.norm, replace = T)
beta.samples[, 2] <- beta.evolution[sample.pos, 1]
hist(beta.samples[, 2], main = 'Posterior after time 1', xlab = expression(beta), breaks = 'FD')
abline(v = beta[1], col = 'red')
```

##### 2.2.2 Example for all time points

```{r}
for (time.pts in 2:num.pts){
  # evolution: propagate the filtered particles through the state equation
  beta.evolution[, time.pts] <- beta.samples[, time.pts] +
    rnorm(num.particles, mean = 0, sd = sqrt(W))

  # weights (likelihood, since the proposal is the evolution dist) and resample
  xb <- beta.evolution[, time.pts] * x[time.pts]
  probs <- exp(xb) / (1 + exp(xb))
  w <- dbinom(y[time.pts], 1, prob = probs)
  w.norm <- w / sum(w)
  sample.pos <- base::sample(num.particles, prob = w.norm, replace = T)
  beta.samples[, time.pts + 1] <- beta.evolution[sample.pos, time.pts]
}

beta.q <- apply(beta.samples, 2, quantile, prob = c(.025, .975))
par(mfcol = c(1, 1))
plot(1:num.pts, beta, ylim = c(min(beta.samples), max(beta.samples)), pch = 16, xlab = 'time')
lines(1:num.pts, beta.q[1, -1], lty = 2)
lines(1:num.pts, beta.q[2, -1], lty = 2)

x.matrix <- matrix(x, nrow = num.particles, ncol = num.pts, byrow = T)
prob.samples <- exp(beta.samples[, -1] * x.matrix) / (1 + exp(beta.samples[, -1] * x.matrix))
prob.q <- apply(prob.samples, 2, quantile, prob = c(.025, .975))
plot(1:num.pts, pi.t, ylim = c(0, 1), pch = 16, xlab = 'time', type = 'b')
lines(1:num.pts, colMeans(prob.samples), lty = 2, col = 'red', lwd = 2)
lines(1:num.pts, prob.q[1, ], lty = 3, col = 'red')
lines(1:num.pts, prob.q[2, ], lty = 3, col = 'red')
```
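One standard diagnostic for a particle filter is the effective sample size (ESS) of the normalized weights, $1/\sum_i \tilde{w}_i^2$: it equals the number of particles when the weights are uniform and collapses toward 1 when a few particles dominate. A minimal sketch, using `w.norm` from the final iteration of the loop above:

```{r}
# ESS of the most recent normalized weights; values far below
# num.particles indicate the resampling step relies on few particles.
1 / sum(w.norm^2)
```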