### Stat 539: Lecture 13
### Loglinear models for contingency tables with housing data

library(MASS)

# Built-in data set of 1,681 renters from a housing satisfaction survey
# in Copenhagen

# Variables:
# Sat = subject's level of satisfaction with housing conditions (response)
# Infl = subject's feeling of influence on apartment management
# Type = type of rental accommodation occupied
# Cont = degree of contact subject had with other residents
# Freq = cell count
data(housing)  # Loads data set
head(housing)
class(housing$Sat)  # Treating as ordered factor
housing$Sat <- factor(housing$Sat,ordered=FALSE)

attach(housing)

sum(Freq)  # Sample size

# Visualize the data: mosaic plot
# Convert into 4-way contingency table
House <- xtabs(Freq ~ Type+Infl+Cont+Sat, housing)
House
# Type = x-axis, Infl = y-axis, 
# Cont = sub-group on x-axis, Sat = sub-group on y=axis
mosaicplot(House, color=TRUE)
?mosaicplot
# See: http://www.datavis.ca/papers/asa92.html

# Mosaic plot showing departure from the independence model:
mosaicplot(House, type="deviance", shade=T)
# Hard to read - maybe try different ordering:
House <- xtabs(Freq ~ Sat+Type+Infl+Cont, housing)
mosaicplot(House, type="deviance", shade=T)

### 
## Consider the 2-way marginal table of 
# Cont = row variable; Sat = col variable
###
tab1 <- tapply(Freq,list(Cont,Sat),sum)
tab1

# Independence model:
mod1 <- glm(Freq ~ Cont+Sat, family=poisson, data=housing)
## Check odds ratios = 1 with fitted model:
mod1.fit <- tapply(mod1$fitted, list(Cont,Sat), sum)
mod1.fit[1,1]*mod1.fit[2,2]/(mod1.fit[1,2]*mod1.fit[2,1])
mosaicplot(mod1.fits, type="deviance", shade=T)

# Saturated model:
mod2 <- glm(Freq ~ Cont*Sat, family=poisson, data=housing)
# Likelihood ratio test of independence:
anova(mod1,mod2,test="LRT")

# Chi-squared test of independence gives same conclusion:
chisq.test(tab1)

# Independence model using loglm function (in MASS library):
mod1.ll <- loglm(~1+2, data=tab1, param=TRUE, fit=TRUE)
# Gives both likelihood ratio test (G^2) and chi-squared test (X^2):
mod1.ll
# Saturated model using loglm function:
mod2.ll <- update(mod1.ll, .~.^2)
anova(mod1.ll,mod2.ll)

# Compare parameter estimates of glm vs. loglm:
mod1$coef
mod1.ll$param
# Compare fitted values:
tapply(mod1$fit,list(Cont,Sat),sum) # Same as 12*(linear predictor)
mod1.ll$fit

# Can change glm parameter constraints using options
options(contrasts=c("contr.sum","contr.poly"))   # Sum to zero constraint
mod1b <- glm(Freq ~ Cont+Sat, family=poisson, data=housing)

options(contrasts=c("contr.treatment","contr.poly"))  # Reference category equal to zero constraint


### Start here Tue Mar 28 ###

####
#### Model selection using all variables
####

# Saturated model:
mod.sat <- glm(Freq~(Sat+Infl+Type+Cont)^4, family=poisson, data=housing)
summary(mod.sat) # All residuals = 0; deviance = 0

# Think of Sat as the response variable - look at initial model
# where conditional probs for each of the three Sat groups
# are the same for all type x contact x influence groups
mod0 <- glm(Freq ~ Infl*Type*Cont + Sat, family=poisson, data=housing)
summary(mod0) 
# High deviance implies lack of fit, so probs do appear to vary
# with explanatory variables.

# Try adding simplest terms that allow Sat probs to vary:
addterm(mod0, ~. + Sat:(Infl+Type+Cont), test="Chisq")
# Infl achieves the largest single term reduction in AIC.

# Add Sat:Infl and reassess:
mod1 <- update(mod0, . ~ . + Sat:Infl, family=poisson, data=housing)
summary(mod1)

addterm(mod1, ~. + Sat:Type + Sat:Cont, test="Chisq")

# Add Sat:Type
mod2 <- update(mod1, . ~ . + Sat:Type, family=poisson, data=housing)
summary(mod2)
# Still some lack of fit indicated by deviance:
pchisq(54.72,36,lower.tail=FALSE)

# Add Sat:Cont
mod3 <- update(mod2, .~. + Sat:Cont, family=poisson, data=housing)
summary(mod3)
# Deviance looks good:
pchisq(38.662,34,lower.tail=FALSE)

# See if we can drop any terms?
dropterm(mod3, test="Chisq")
# Infl:Type:Cont is part of minimum model and hence may not be removed.
# Only interested in terms that include response Sat.

# Consider adding possible interaction terms:
addterm(mod3, ~. + Sat:(Infl+Type+Cont)^2, test="Chisq")
# Looks like Type and Infl may interact on their effect on Sat,
# but increases AIC -> Don't include.

# Look at effects of Type, Infl, and Cont on Sat by
# normalizing the expected values to cell probabilities.
hnames <- lapply(housing[,-5], levels) # Omit Freq
house.pm <- predict(mod3, expand.grid(hnames), type="response") # Poisson Means
house.pm <- matrix(house.pm, ncol=3, byrow=T, dimnames=list(NULL,hnames[[1]]))
house.pr <- house.pm/rowSums(house.pm)
cbind(expand.grid(hnames[-1]), prob=round(house.pr,2))


detach(housing)