library(VGAM)  # vglm
library(MASS)  # polr, housing

######## PO Model with quantitative predictor
## Mental Impairment Data:
# Study of mental health for a random sample of adult residents
# of Alachua County, Florida.
# Y = mental impairment level (1 = well, 2 = mild symptom formation, 3 = moderate symptom formation, 4 = impaired)
# Explanatory variables:
#  SES = socio-economic status (1 = high, 0 = low)
#  Life = life events index = quantitative composite score of the number
#         and severity of important life events such as birth of child,
#         new job, divorce, or deach in the family that occurred
#         to the subject within the past three years.

dat = read.table("http://www.math.montana.edu/shancock/courses/stat539/data/mental-impairment.txt",header=T)
head(dat$Y)  # R treats as quantitative

## Descriptive stats
library(ggplot2)
ggplot(dat, aes(x = Y, group = SES, fill= SES)) + geom_bar()

attach(dat)
prop.table(table(SES, Y))
props <- table(SES,Y)/rep(colSums(table(SES,Y)), each=2)
barplot(props, xlab="Mental Impairment", ylab="Proportion in SES")

mosaicplot(SES ~ Y)

## Life index vs Y? Life index and SES vs Y?
table(Life)

plot(Y ~ jitter(Life, factor=.5), xlab="Life Index", ylab="Mental Impairment Score", pch=SES+1, col=c("tomato","purple")[SES+1], cex=1.3)
legend("topleft", c("High","Low"), col=c("purple","tomato"), pch=c(2,1))
# Add smoother

table(Life, Y)/rep(rowSums(table(Life,Y)), each=4) # Not helpful

tapply(Y, Life, median)

dim(dat)

# Re-code variables:
dat$Y = as.ordered(dat$Y)  # Change class to ordered factor
levels(dat$Y) = c("well", "mild", "mod", "imp")
dat$SES = factor(dat$SES, levels=c(0,1), labels=c("low", "high"))
head(dat)

# Use polr function in MASS library: Response must be ordered factor.
?polr
fit1 <- polr(Y ~ Life, data = dat)
summary(fit1)

# Beta estimate:
b=-fit1$coef
# Alpha estimates:
a=fit1$zeta

# Plot cumulative probabilities:
cols = c("darkgreen","darkblue","purple","red")
curve(1/(1+exp(-(a[1]+b*x))),0,10,ylim=c(0,1),col=cols[1],lty=2,
main = "Cumulative Probabilities",xlab="Life Events",ylab="P(Y<=j)")
curve(1/(1+exp(-(a[2]+b*x))),add=T,col=cols[2],lty=3)
curve(1/(1+exp(-(a[3]+b*x))),add=T,col=cols[3],lty=4)
legend(7,.9,c("Well","Mild","Mod"),col=cols[1:3],lty=c(2:4))

# Plot individual probabilities:
curve(1/(1+exp(-(a[1]+b*x))),0,10,ylim=c(0,1),col=cols[1],lty=2,
main="Category Probabilities",xlab="Life Events",ylab="P(Y=j)")
curve(1/(1+exp(-(a[2]+b*x)))-1/(1+exp(-(a[1]+b*x))),add=T,col=cols[2],lty=3)
curve(1/(1+exp(-(a[3]+b*x)))-1/(1+exp(-(a[2]+b*x))),add=T,col=cols[3],lty=4)
curve(1-1/(1+exp(-(a[3]+b*x))),add=T,col=cols[4],lty=5)
legend(7,.9,c("Well","Mild","Mod","Impaired"),col=cols,lty=c(2:5))


fit2 = polr(Y ~ SES + Life, data = dat)
summary(fit2)
# Beta estimates:
b2=-fit2$coef
# Alpha estimates:
a2=fit2$zeta

# Plot cumulative probabilities:
cols = c("darkgreen","darkblue","purple","red")
# SES = low
curve(1/(1+exp(-(a2[1]+b2[2]*x))),0,10,ylim=c(0,1),col=cols[1],lty=1,
main = "Cumulative Probabilities",xlab="Life Events",ylab="P(Y<=j)")
curve(1/(1+exp(-(a2[2]+b2[2]*x))),add=T,col=cols[2],lty=1)
curve(1/(1+exp(-(a2[3]+b2[2]*x))),add=T,col=cols[3],lty=1)
# SES = high
curve(1/(1+exp(-(a2[1]+b2[1]+b2[2]*x))),add=T,col=cols[1],lty=2)
curve(1/(1+exp(-(a2[2]+b2[1]+b2[2]*x))),add=T,col=cols[2],lty=2)
curve(1/(1+exp(-(a2[3]+b2[1]+b2[2]*x))),add=T,col=cols[3],lty=2)

legend(8,.95,c("Well","Mild","Mod"),fill=cols[1:3])
legend(0,.1,c("High SES","Low SES"),lty=c(2,1))

# Model comparison LRT of H0: fit1 vs Ha: fit2
anova(fit1, fit2)

# Interaction between SES and Life?
anova(fit2, update(fit2, .~.+SES:Life))

## Interpret coefs in fit2: Change in the estimated odds of mental 
## impairment below any fixed level.
# --> Signs of beta estimates suggest cumulative probability
# starting at the "well" end of the scale decreases as life
# events increases (adjusting for SES), and is higher for the
# high level of SES (adjusting for life events).

# Can we use deviance to check goodness of fit?
# --> No. Contingency table is too sparse.
# But can check model fit by comparing to more complex models:
fit3 <- update(fit2, .~.+SES:Life)
anova(fit2, fit3)

## Interpret SEShigh:Life interaction coef?
fit3
# --> Impact of life events on mental impairment seems more
# severe for the low SES group, but the difference in effects
# is not statistically significant.

## Alternative interpretation: compare cumulative probabilities directly.
# Quantitative predictor - compare cumulative probabilities at quartiles
# Categorical predictor - compare cumulative probabilities for different category
# Control for quantitative variables by setting equal to mean.
# Control for categorical variables by fixing the category.
## Try for this model.

######## PO Model with categorical predictor
# Built-in data set of 1,681 renters from a housing satisfaction survey
# in Copenhagen
# Variables:
# Sat = subject's level of satisfaction with housing conditions
# Infl = subject's feeling of influence on apartment management
# Type = type of rental accommodation occupied
# Cont = degree of contact subject had with other residents
# Freq = cell count
data(housing)  # Loads data set
attach(housing)

# Note that Sat is an ordered factor:
Sat

# Marginal tables of each explanatory variable vs. Sat:
Infl.tab = xtabs(Freq ~ Infl+Sat)
Type.tab = xtabs(Freq ~ Type+Sat)
Cont.tab = xtabs(Freq ~ Cont+Sat)
# All marginal associations seem strong:
chisq.test(Infl.tab)
chisq.test(Type.tab)
chisq.test(Cont.tab)

# Re-arrange data:
Sat.Low = housing[Sat=="Low",c(2:5)]
Sat.Med = housing[Sat=="Medium",5]
Sat.High = housing[Sat=="High",5]

housing.new = data.frame(Sat.Low,Sat.Med,Sat.High)
names(housing.new) = c("Infl","Type","Cont","Low","Med","High")

# Treat Sat as response -
# Fit proportional odds model using Infl as explanatory variable.
# Use vglm function in VGAM library:
mod1 = vglm(cbind(Low,Med,High) ~ Infl, family = cumulative(parallel=TRUE), data = housing.new)
summary(mod1)

detach(housing)