## Lecture 14: Poisson regression

## Read in Gun Registration data by copying and pasting the following into R.
## Source: Albyn Jones
## http://people.reed.edu/~jones/141/Guns.html

## Variables: Socioeconomic data from 1990/1991 -
# pop = population of state (in 1000s of people)
# area = area of state (in 1000 square miles)
# urban = percent urban population
# poverty = percent below poverty line
# gunreg = whether there are gun registration laws or not
# homicides = number of homicides in the past year
"GunReg" <- 
structure(.Data = list(
"pop" = c(4089, 2372, 30380, 3291, 598, 13277, 1135, 2795, 11543, 5996, 
   4860, 9368, 4432, 5158, 6737, 635, 7760, 18058, 10939, 11961, 1004,
   3560, 4953, 17349, 1770, 5018, 570, 3750, 3377, 680, 6623,
   1039, 5610, 2495, 3713, 4252, 1235, 2592, 808, 1593, 1105,
   1548, 1284, 3175, 2922, 703, 6286, 567, 4955, 1801, 460.), 
"area" = c(52.4, 53.2, 163.7, 5.5, 0.1, 65.8, 10.9, 56.3, 57.9, 10.6, 
   12.4, 96.8, 86.9, 69.7, 53.8, 70.7, 8.7, 54.5, 44.8, 46.1, 1.5, 32, 
   42.1, 268.6, 84.9, 71.3, 656.4, 114, 104.1, 2.5, 59.4, 83.6, 36.4, 
   82.3, 40.4, 51.8, 35.4, 48.4, 147, 77.4, 9.4, 121.6, 110.6, 69.9, 
   98.4, 77.1, 42.8, 9.6, 65.5, 24.2, 97.8), 
"urban" = c(60, 54, 93, 79, 100, 85, 89, 61, 85, 84, 81, 70, 71, 53, 
   50, 53, 89, 84, 74, 69, 86, 55, 61, 80, 87, 76, 68, 88, 82,
   73, 63, 57, 65, 69, 52, 68, 45, 47, 53, 66, 51, 73, 88,
   68, 71, 50, 69, 32, 66, 36, 65.), 
"poverty" = c(19, 18.4, 14.2, 5.8, 19.2, 14.1, 10, 10.1, 13.3, 10.2, 
   9.3, 13.9, 12, 13.6, 13.2, 13.5, 9, 14.1, 11.8, 10.8, 8.2, 16.5, 
   16.9, 16.8, 9.8, 26.2, 11.2, 14.2, 12.1, 8.1, 16, 13.7, 14.1, 11.1, 
   17.4, 22, 12.5, 23.8, 15.8, 10.9, 7.1, 20.9, 10.7, 15.8, 11.3, 13.5, 
   10.6, 7.1, 9.2, 17.2, 10.6), 
"gunreg" = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.), 
"homicides" = c(410, 240, 3710, 170, 489, 1300, 44, 62, 1270, 200, 540, 
   1020, 100, 550, 730, 11, 350, 2550, 760, 740, 38, 350, 470, 2660,
   43, 220, 56, 290, 155, 32, 720, 21, 380, 150, 260, 760,
   23, 370, 29, 43, 32, 160, 135, 220, 120, 9, 550, 24, 240,
   135, 20.)), 
names = c("pop", "area", "urban", "poverty", "gunreg", "homicides"), 
row.names = c("AL", "AR", "CA", "CT", "DC", "FL", "HI", "IA", "IL", "MA", 
   "MD", "MI", "MN", "MO", "NC", "ND", "NJ", "NY", "OH", "PA", "RI", "SC", 
   "TN", "TX", "UT", "WA", "AK", "AZ", "CO", "DE", "GA", "ID", "IN", "KS", 
   "KY", "LA", "ME", "MS", "MT", "NE", "NH", "NM", "NV", "OK", "OR", "SD", 
   "VA", "VT", "WI", "WV", "WY"), class = "data.frame")

##
### Research question: Do gun registration laws affect the rate of homicides in a state?
##

##
### Exploratory data analysis
##
summary(GunReg)

# Ordered by homicides (smallest to largest)
GunReg[order(GunReg$homicides),]

# Is this an appropriate model? Why or why not?
glm(homicides ~ gunreg, family="poisson", data=GunReg)

# Ordered by population (smallest to largest)
GunReg[order(GunReg$pop),]

# Homicide rates (per 1,000 person-years):
GunReg$rate <- GunReg$homicides/GunReg$pop
summary(GunReg$rate)
GunReg[order(GunReg$rate),]
hist(GunReg$rate, breaks=50, col="blue", xlab="Homicide Rate per 1000 person-years")
# Large outlier - which state?

# Homicide rates stratifying by gun registration laws:
Y <- aggregate(homicides ~ gunreg, data = GunReg, sum)
T <- aggregate(pop ~ gunreg, data = GunReg, sum)
by.strata <- cbind(Y,T)[,-3]
by.strata$homrate <- by.strata$homicides/by.strata$pop
by.strata

## Aside -
# What about using this?
lm(rate ~ gunreg, data=GunReg)
# Not terrible idea - approx normal; approx constant var
# Doesn't give any negative fitted values (but could!)


boxplot(GunReg$rate ~ GunReg$gunreg, xlab="Gun registration laws", ylab="Homicide Rate per 1000 person-years")
identify(GunReg$gunreg+1, GunReg$rate, labels=row.names(GunReg))  # x-axis points = 1, 2
# Click on outliers on plot to identify; then type Esc

##
### Modeling
##

# Poisson model addressing research question (note offset - why needed?):
mod1 <- glm(homicides ~ gunreg, family=poisson, offset=log(pop), data=GunReg)
summary(mod1)
# Interpret gunreg coef:

# Extremely large deviance for df
# GoF p-value =
pchisq(summary(mod1)$deviance, df=49, lower.tail=FALSE)

# Where is the lack of fit?
sort(residuals(mod1, type="deviance"))   

# Something weirder
GunReg$x <- rnorm(51, 40, 5)
summary(glm(homicides ~ gunreg+x, family=poisson, offset=log(pop), data=GunReg))
# What's happening here?

# Compare to:
GunReg$lrate <- log(GunReg$homicides/GunReg$pop)
mod.lm = lm(lrate ~ gunreg, data=GunReg) 
summary(mod.lm)  ## Note low r-squared --> gun registration law indicator doesn't explain much of the variability among homicide rates!
# Make sure you understand the inherant difference between
# this model and the Poisson regression model!
# Residual diagnostics for linear model:
par(mfrow=c(2,2))
plot(mod.lm)
# Possible problem with constant variance.
# Normality assumption looks ok.
# DC an outlier.

# Control for other confounders:
mod2 <- glm(homicides ~ gunreg+poverty+urban, family=poisson, offset=log(pop), data=GunReg)
summary(mod2)
# Deviance still extremely large
sort(residuals(mod2, type="deviance"))
# Deviance GoF:
pchisq(3919.4, 47, lower.tail=FALSE)


### How well does the model really fit?
### Plot observed data and fitted model: Tricky since we have three predictors.
plot(rate ~ poverty, data=GunReg)
points(predict(mod2, type="response")/GunReg$pop ~ GunReg$poverty, pch=3, col="red")
legend("topleft", c("Observed Rate", "Fitted Rate"), pch=c(1,3), col=c(1,"red"))


## Overdispersion?
plot(residuals(mod2, type="pearson")^2 ~ fitted(mod2), xlab="Fitted Homicide Counts", ylab="Squared Pearson Residuals")
sfit <- supsmu(fitted(mod2), residuals(mod2, type="pearson")^2)
lines(sfit$x[ order(sfit$x) ], sfit$y[ order(sfit$x) ], col="red", lwd=2)
abline(h=1, lty=2, col="blue", lwd=2)
# Identify outliers
identify(fitted(mod2), residuals(mod2, type="pearson")^2, label=row.names(GunReg))

## Identify influential points
plot(cooks.distance(mod2) ~ hatvalues(mod2), ylab="Cook's Distance", xlab="Leverage")
text(hatvalues(mod2)-.015, cooks.distance(mod2), row.names(GunReg), cex=.7)
abline(h=qf(.5, length(mod2$coef), mod2$df.residual), col="blue")
abline(v=3*mean(hatvalues(mod2)), col="red")  
# Effect of WA on fit?
summary(mod2)$coef
summary(update(mod2, subset=-26))$coef

## Use squared Pearson residuals to estimate dispersion parameter phi:
presid2 <- residuals(mod2,type="pearson")^2
phihat <- sum(presid2)/mod2$df.residual  
phihat # 85.981
# We estimate that the variance is about 86 times larger
# than what we would expect under the Poisson distn.


### Quasi-Poisson Model to fit overdispersion
mod2.quasi <- glm(homicides ~ gunreg+poverty+urban, family=quasipoisson, offset=log(pop), data=GunReg)
summary(mod2.quasi)
# Notes: 
#  - estimated dispersion parameter matches estimate above
#  - std. errors = std. errors of mod2 multiplied by sqrt(85.981) - now standard errors are valid!
#  - Residual deviance is the same as mod2 --> need to divide by phihat for LRT

# Deviance GoF - need to divide by phihat
LRT.GoF <- mod2.quasi$deviance/phihat
pchisq(LRT.GoF, 47, lower.tail=FALSE)


# LRT test statistic needs to be adjusted by dividing by dispersion parameter:
anova(update(mod2.quasi, .~.-gunreg), mod2.quasi)
LRT.incorrect <- summary(update(mod2.quasi, .~.-gunreg))$deviance - summary(mod2.quasi)$deviance
## Diff in deviance = 67.048 --> differences in binomial likelihood
# Correct test statistic when allowing for overdispersion:
LRT.correct <- LRT.incorrect/phihat
# Use chi-squared with 1 df for p-value:
pchisq(LRT.correct, df=1, lower.tail=FALSE)

## Conclusion?
# No evidence that, when controlling for poverty and percent urban,
# gun registration laws have an effect on the homicide rate.

# But wait.... what about the outlier WA?
mod2.quasi.out <- update(mod2.quasi, subset=-26)
summary(mod2.quasi.out)
# Observation 26 (WA) still has a large influence on the results!
# Much lower estimate of the dispersion parameter if WA removed.
# gunreg coef now marginally significant.

#
## Alternative model: Fit negative binomial regression model:
#
library(MASS)
?glm.nb  # Uses log link by default
mod2.nb <- glm.nb(homicides ~ gunreg+poverty+urban+offset(log(pop)), data=GunReg)
summary(mod2.nb)