--- title: "DLM Demo" author: "Andy Hoegh" date: "11/19/2018" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` First install the stat536 library and other packages. ```{r } library(devtools) install_github('andyhoegh/stat536') library(stat536) library(ggplot2) library(dplyr) library(dlm) library(datasets) ``` ### 1. Random Walk plus Noise Model (Local Level Model) The first model we will look at has a time-varying mean, with white noise. Specifically, this model can be written as \begin{eqnarray} y_t &=& \theta_t + v_t \\ \theta_t &=& \theta_{t-1} + w_t, \end{eqnarray} where $v_t \sim N(0, V)$ and $w_t \sim N(0, W)$, This model is appropriate when there is no clear trend or seasonality. #### 1.1 Simulate from Local Level ```{r} set.seed(11152018) num.pts <- 100 V <- 10 W <- 3 sim.ll <- sim_locallevel(num.pts = num.pts, V = 10, W = 3, mu.0 = 0) ll <- data.frame( val = c(sim.ll$y, sim.ll$mu), t = rep(1:num.pts, 2), label = rep(c('y','mu'), each = num.pts)) ggplot(data = ll, aes(y = val, x = t ) ) + geom_line(aes(color=(label))) ``` - __Q__: vary V and W. What do you observe? This ratio is known as the signal-to-noise ratio. #### 1.2 Fit Local Level Model - Variance known To fit this model there are there iterative steps that are repeated for each time point: - Prior: $$\theta_{t-1}|y_{t-1}, \dots, y_{0} \sim N(\mu_{t-1}, C_{t-1})$$ - Prediction: $$\theta_t|y_{t-1}, \dots, y_{0} \sim N(a_t, R_t),$$ where in this case $a_t = \mu_{t-1}$ and $R_t = C_{t-1} + W$ $$y_t|y_{t-1} \sim N(f_t, Q_t)$$ where in this case $f_t = a_t$ and $Q_t = R_t + V$. - Estimation (filtering): $$\theta_t|y_t, \dots, y_1 \sim N(\mu_t, C_t)$$ where $\mu_t = a_t + \frac{R_t}{R_t + V} (y_t - f_t)$ and $C_t = R_t - \frac{R_t}{R_t + V} R_t$ The code below works through these equations, assuming V and W are known, to obtain the predictive distribution. ```{r, fig.cap = 'Observed points in black. Dotted red lines are 95% intervals for the predictive distribution. Dashed red line is the mean prediction.'} ## Code extracted from filter_ll() y <- sim.ll$y mu0 <- 0 C0 <- 100 a <- R <- f <- Q <- mu <- C <- rep(0, num.pts) # evolution dist parameters (theta_t|y_{t-1}) a[1] <- mu0 R[1] <- C0 + W # predictive dist parameters (y_t|y_{t-1}) f[1] <- a[1] Q[1] <- R[1] + V # filtering dist parameters (theta_{t}|y_t) mu[1] <- a[1] + (R[1] / (R[1] + V)) * (y[1] - f[1]) C[1] <- R[1] - (R[1]/ (R[1] + V)) * R[1] for (time.pts in 2:num.pts){ # evolution dist parameters (theta_t|y_{t-1}) a[time.pts] <- mu[time.pts - 1] R[time.pts] <- C[time.pts - 1] + W # predictive dist parameters (y_t|y_{t-1}) f[time.pts] <- a[time.pts] Q[time.pts] <- R[time.pts] + V # filtering dist parameters (theta_{t}|y_t) mu[time.pts] <- a[time.pts] + (R[time.pts] / (R[time.pts] + V)) * (y[time.pts] - f[time.pts]) C[time.pts] <- R[time.pts] - (R[time.pts]/ (R[time.pts] + V)) * R[time.pts] } obs <- ll %>% filter(label == 'y') %>% ggplot(aes(y = val, x = t )) + geom_point() + geom_line(alpha=.2) + ggtitle('Observed Values') + ylab('') preds <- data.frame(upper = f + 1.96 * sqrt(Q), lower = f - 1.96 * sqrt(Q), mean = f, t = 1:num.pts) obs + geom_line(data=preds, aes(y = upper, x=t), col='red', linetype=3 ) + geom_line(data=preds, aes(y = lower, x=t), col='red', linetype=3 ) + geom_line(data=preds, aes(y = mean, x=t), col='red', linetype=2 ) ``` - __Q__: why is the initial variance so large? - __Q__: why is the predictive distribution plotted? What about the other two distributions, how might we visualize these? 
#### 1.3 Fit Local Level Model - Variance Unknown

Now we need to estimate the variance parameters (V and W) too. This is accomplished with a Gibbs sampler built on the forward-filtering backward-sampling (FFBS) algorithm, implemented in the `dlm` package.

```{r, cache = T}
mcmc.output <- dlmGibbsDIG(y = y, mod = dlmModPoly(order = 1),
                           a.y = 10, b.y = 10000,
                           a.theta = 10, b.theta = 10000,
                           n.sample = 5000)
```

We need to look at the posterior estimates for parameters in the model.

- First V.

```{r, fig.cap = 'Posterior estimates for V as a trace plot.'}
plot(mcmc.output$dV, type = 'l', main = 'Trace for V')
abline(h = V, col = 'red', lwd = 2)
```

- Next W.

```{r, fig.cap = 'Posterior estimates for W as a trace plot.'}
plot(mcmc.output$dW, type = 'l', main = 'Trace for W')
abline(h = W, col = 'red', lwd = 2)
```

In this case we also have samples of the states. Note that FFBS draws entire state trajectories, so these are samples of $\theta_t$ conditional on all of the data, $y_{1:T}$, not just $y_{1:t}$.

```{r, fig.cap = 'Sample paths for theta in black; true theta in red.'}
dim(mcmc.output$theta)
sample.paths <- sample(5000, 15)
plot(mcmc.output$theta[, 1, sample.paths[1]], type = 'l',
     ylab = expression(theta), xlab = 'time', main = 'Sample paths of theta')
for (i in 2:15){
  lines(1:101, mcmc.output$theta[, 1, sample.paths[i]])
}
lines(2:101, sim.ll$mu, col = 'red', lwd = 2, lty = 2)
```

We can also get predictions from this model using the posterior parameters, but how? Using the notation from Section 1.2, the predictive distribution $y_t|y_{t-1}$ has mean $f_t = \mu_{t-1}$ and variance $Q_t = C_{t-1} + W + V$. Thus we can create samples from the predictive distribution by taking the posterior samples of $\theta$ and propagating both the evolution and observation error. *However, note that we are using all of the data to estimate V, W, and the sampled $\theta$'s.*

```{r}
# column j of V.matrix / W.matrix uses the j-th posterior draw of V / W,
# matching the j-th sampled state trajectory in theta
V.matrix <- matrix(rnorm(101 * 5000, mean = 0,
                         sd = sqrt(rep(mcmc.output$dV, each = 101))),
                   nrow = 101, ncol = 5000)
W.matrix <- matrix(rnorm(101 * 5000, mean = 0,
                         sd = sqrt(rep(mcmc.output$dW, each = 101))),
                   nrow = 101, ncol = 5000)
# row 1 of theta is theta_0, so row t of y.pred is a draw of y_t
# built from theta_{t-1} + w_t + v_t
y.pred <- mcmc.output$theta[, 1, ] + V.matrix + W.matrix
pred.mean <- rowMeans(y.pred)
pred.quant <- apply(y.pred, 1, quantile, probs = c(.025, .975))
plot(sim.ll$y, ylim = c(min(pred.quant), max(pred.quant)), pch = 16,
     ylab = '', xlab = 'Time', main = 'Predictive Distribution')
lines(1:num.pts, pred.quant[1, 1:num.pts], col = 'red', lty = 2)
lines(1:num.pts, pred.quant[2, 1:num.pts], col = 'red', lty = 2)
lines(1:num.pts, pred.mean[1:num.pts], col = 'red')
```

#### 1.4 Fit Local Level Model - Variance Unknown (Nile Data)

```{r, cache = T}
data(Nile)
nile.output <- dlmGibbsDIG(y = Nile, mod = dlmModPoly(order = 1),
                           a.y = 10, b.y = 10000,
                           a.theta = 10, b.theta = 10000,
                           n.sample = 5000)
```

As before, the predictive distribution $y_t|y_{t-1}$ has mean $\mu_{t-1}$ and variance $C_{t-1} + W + V$, so we can create samples from it by taking the samples of $\theta$ and propagating the error. *However, note that we are using all of the data to estimate V and W.*
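Before returning to the sampling approach, here is a sketch of a genuine out-of-sample forecast beyond 1970 using `dlm::dlmForecast`. The names `V.hat`, `W.hat`, `nile.mod`, and `nile.fore` are ours, and plugging in posterior means ignores the posterior uncertainty in V and W, so treat this as illustrative only.

```{r}
# Plug-in forecast sketch: posterior means of V and W, then forecast
# 10 steps past the end of the Nile series.
V.hat <- mean(nile.output$dV)
W.hat <- mean(nile.output$dW)
nile.mod <- dlmModPoly(order = 1, dV = V.hat, dW = W.hat)
nile.fore <- dlmForecast(dlmFilter(Nile, nile.mod), nAhead = 10)
f.mean <- as.vector(nile.fore$f)   # forecast means
f.sd <- sqrt(unlist(nile.fore$Q))  # forecast standard deviations
cbind(mean = f.mean,
      lower = f.mean - 1.96 * f.sd,
      upper = f.mean + 1.96 * f.sd)
```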
```{r}
# column j uses the j-th posterior draws of V and W, matching theta's draws
V.matrix <- matrix(rnorm(101 * 5000, mean = 0,
                         sd = sqrt(rep(nile.output$dV, each = 101))),
                   nrow = 101, ncol = 5000)
W.matrix <- matrix(rnorm(101 * 5000, mean = 0,
                         sd = sqrt(rep(nile.output$dW, each = 101))),
                   nrow = 101, ncol = 5000)
# row t is a draw of y_t built from theta_{t-1} + w_t + v_t (row 1 is theta_0)
nile.pred <- nile.output$theta[, 1, ] + V.matrix + W.matrix
pred.mean <- rowMeans(nile.pred)
pred.quant <- apply(nile.pred, 1, quantile, probs = c(.025, .975))
plot(Nile, ylim = c(min(pred.quant), max(pred.quant)), pch = 16,
     ylab = '', xlab = 'Time', main = 'Predictive Distribution')
lines(1871:1970, pred.quant[1, 1:100], col = 'red', lty = 2)
lines(1871:1970, pred.quant[2, 1:100], col = 'red', lty = 2)
lines(1871:1970, pred.mean[1:100], col = 'red')
```

#### 1.5 Other Models for Nile Data

The `dlmModPoly` function has the capability to fit more complicated models too. Consider the linear growth model:
\begin{eqnarray*}
y_t &=& \mu_t + v_t\\
\mu_t &=& \mu_{t-1} + \beta_{t-1} + w_{t,1}\\
\beta_t &=& \beta_{t-1} + w_{t,2}.
\end{eqnarray*}
This can be fit using `dlmModPoly(order = 2)`.

```{r, cache = T, fig.cap = 'This figure shows the filtering distribution for mu, which is different than the predictive distribution.'}
nile.output2 <- dlmGibbsDIG(y = Nile, mod = dlmModPoly(order = 2),
                            a.y = 10, b.y = 10000,
                            a.theta = 10, b.theta = 10000,
                            n.sample = 5000)
mean.nile <- rowMeans(nile.output2$theta[, 1, ])
nile.quant <- apply(nile.output2$theta[, 1, ], 1, quantile, probs = c(.025, .975))
plot(Nile, ylim = c(min(c(nile.quant, Nile)), max(c(nile.quant, Nile))), pch = 16,
     ylab = '', xlab = 'Time', main = 'Filtering Distribution')
lines(1871:1970, mean.nile[-1], col = 'red')
lines(1871:1970, nile.quant[1, -1], col = 'red', lty = 2)
lines(1871:1970, nile.quant[2, -1], col = 'red', lty = 2)
```

#### 1.6 More about dlm

With the `dlm` package, the model type needs to be specified. We have seen `dlmModPoly()`, but `dlm` also contains:

- `dlmModSeas()`
- `dlmModReg()`
- `dlmModARMA()`

Furthermore, these models can be combined (via `+`) to form a model with trends, regression components, ARMA elements, etc.

### 2. Dynamic Regression

Another nice feature of state-space models is the ability to have time-varying regression coefficients. Furthermore, there is a straightforward way to model time series data that are non-normal. Consider the following model. Describe the parameters in this model and explain what the model does:
\begin{eqnarray*}
y_t &\sim& Bernoulli(\pi_t)\\
logit(\pi_t) &=& x_{t} \beta_{t}\\
\beta_{t} &=& \beta_{t-1} + w_{t},
\end{eqnarray*}
where $w_{t} \sim N(0, W)$.

#### 2.1 Simulate Dynamic Generalized Regression Model

```{r, fig.cap = 'Time-varying probability as a line and observed binary points as black dots.'}
set.seed(11192018)
num.pts <- 500
beta <- rep(3, num.pts)
x <- runif(num.pts)
W <- .05
for (time.pts in 2:num.pts){
  beta[time.pts] <- beta[time.pts - 1] + rnorm(1, mean = 0, sd = sqrt(W))
}
pi.t <- exp(x * beta) / (1 + exp(x * beta))
y <- rbinom(num.pts, 1, pi.t)
plot(pi.t, type = 'l', lty = 2, ylim = c(0, 1), xlab = 'time')
points(1:num.pts, y, pch = 16, cex = .8)
```

#### 2.2 Fit Dynamic Generalized Regression Model

##### 2.2.1 Example for one time point

In this situation, the prediction and filtering equations do not have closed-form solutions. Technically, these distributions are computed with a series of integrals. Another solution is to use a class of algorithms known as sequential Monte Carlo, and in particular an algorithm known as a particle filter.
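Before stepping through the details, the sketch below summarizes one step of a bootstrap particle filter for this model: propagate each particle through the state equation, weight by the likelihood of the new observation, and resample. The function `pf_step` and its arguments are our own illustrative names; the worked example that follows performs exactly these operations in place.

```{r}
# One bootstrap particle filter step (illustrative sketch).
# particles: draws approximating beta_{t-1} | y_{1:t-1}
# x.t, y.t:  covariate and binary response at time t
# W:         evolution variance (assumed known)
pf_step <- function(particles, x.t, y.t, W){
  prop <- particles + rnorm(length(particles), mean = 0, sd = sqrt(W)) # evolve
  p <- plogis(prop * x.t)                                              # P(y.t = 1)
  w <- dbinom(y.t, size = 1, prob = p)                                 # likelihood weights
  prop[sample(length(prop), replace = TRUE, prob = w / sum(w))]        # resample
}
```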
The algorithm is initialized with a distribution for the parameters at time 0. (For simplicity we will assume that $W$ is known, but this is not necessary.)

##### Prior distribution

```{r}
# Storage
num.particles <- 10000
beta.samples <- matrix(0, nrow = num.particles, ncol = num.pts + 1)

# initialize particles at time 0
beta.samples[, 1] <- rnorm(num.particles, mean = 0, sd = 4)
hist(beta.samples[, 1], main = 'Prior at time 0', xlab = expression(beta), breaks = 'FD')
```

##### Evolution distribution

Next we push the particles through the state equation to approximate the state distribution at the next time point.

```{r}
# Storage
beta.evolution <- matrix(0, num.particles, num.pts)

# propagate particles from time 0 to time 1
beta.evolution[, 1] <- beta.samples[, 1] + rnorm(num.particles, mean = 0, sd = sqrt(W))
hist(beta.evolution[, 1], main = 'Evolution Dist at time 1', xlab = expression(beta), breaks = 'FD')
```

##### Predictive distribution

```{r}
xb <- beta.evolution[, 1] * x[1]
probs <- exp(xb) / (1 + exp(xb))
y1 <- rbinom(num.particles, 1, probs)
par(mfcol = c(1, 2))
hist(probs, main = 'distribution for pi', xlab = expression(pi[1]), breaks = 'FD')
hist(y1, breaks = c(-.5, .5, 1.5), main = 'distribution for y',
     xlab = expression(y[1]), probability = TRUE)
```

##### Compute importance weights and resample

```{r}
# With a bootstrap proposal (particles drawn from the evolution distribution),
# the transition densities cancel, so the importance weight is the likelihood.
w <- dbinom(y[1], 1, prob = probs)
w.norm <- w / sum(w)
sample.pos <- base::sample(num.particles, prob = w.norm, replace = T)
beta.samples[, 2] <- beta.evolution[sample.pos, 1]
hist(beta.samples[, 2], main = 'Posterior after time 1', xlab = expression(beta), breaks = 'FD')
abline(v = beta[1], col = 'red')
```

##### 2.2.2 Example for all time points

```{r}
for (time.pts in 2:num.pts){
  # evolution: propagate the filtered particles through the state equation
  beta.evolution[, time.pts] <- beta.samples[, time.pts] +
    rnorm(num.particles, mean = 0, sd = sqrt(W))

  # weights (likelihood, since the proposal is the evolution dist) and resample
  xb <- beta.evolution[, time.pts] * x[time.pts]
  probs <- exp(xb) / (1 + exp(xb))
  w <- dbinom(y[time.pts], 1, prob = probs)
  w.norm <- w / sum(w)
  sample.pos <- base::sample(num.particles, prob = w.norm, replace = T)
  beta.samples[, time.pts + 1] <- beta.evolution[sample.pos, time.pts]
}

beta.q <- apply(beta.samples, 2, quantile, prob = c(.025, .975))
par(mfcol = c(1, 1))
plot(1:num.pts, beta, ylim = c(min(beta.samples), max(beta.samples)), pch = 16, xlab = 'time')
lines(1:num.pts, beta.q[1, -1], lty = 2)
lines(1:num.pts, beta.q[2, -1], lty = 2)

x.matrix <- matrix(x, nrow = num.particles, ncol = num.pts, byrow = T)
prob.samples <- exp(beta.samples[, -1] * x.matrix) / (1 + exp(beta.samples[, -1] * x.matrix))
prob.q <- apply(prob.samples, 2, quantile, prob = c(.025, .975))
plot(1:num.pts, pi.t, ylim = c(0, 1), pch = 16, xlab = 'time', type = 'b')
lines(1:num.pts, colMeans(prob.samples), lty = 2, col = 'red', lwd = 2)
lines(1:num.pts, prob.q[1, ], lty = 3, col = 'red')
lines(1:num.pts, prob.q[2, ], lty = 3, col = 'red')
```
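One standard diagnostic for a particle filter is the effective sample size (ESS) of the normalized weights, $1/\sum_i \tilde{w}_i^2$: it equals the number of particles when the weights are uniform and collapses toward 1 when a few particles dominate. A minimal sketch, using `w.norm` from the final iteration of the loop above:

```{r}
# ESS of the most recent normalized weights; values far below
# num.particles indicate the resampling step relies on few particles.
1 / sum(w.norm^2)
```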