### Stat 539: Lecture 13 ### Loglinear models for contingency tables with housing data library(MASS) # Built-in data set of 1,681 renters from a housing satisfaction survey # in Copenhagen # Variables: # Sat = subject's level of satisfaction with housing conditions (response) # Infl = subject's feeling of influence on apartment management # Type = type of rental accommodation occupied # Cont = degree of contact subject had with other residents # Freq = cell count data(housing) # Loads data set head(housing) class(housing$Sat) # Treating as ordered factor housing$Sat <- factor(housing$Sat,ordered=FALSE) attach(housing) sum(Freq) # Sample size # Visualize the data: mosaic plot # Convert into 4-way contingency table House <- xtabs(Freq ~ Type+Infl+Cont+Sat, housing) House # Type = x-axis, Infl = y-axis, # Cont = sub-group on x-axis, Sat = sub-group on y=axis mosaicplot(House, color=TRUE) ?mosaicplot # See: http://www.datavis.ca/papers/asa92.html # Mosaic plot showing departure from the independence model: mosaicplot(House, type="deviance", shade=T) # Hard to read - maybe try different ordering: House <- xtabs(Freq ~ Sat+Type+Infl+Cont, housing) mosaicplot(House, type="deviance", shade=T) ### ## Consider the 2-way marginal table of # Cont = row variable; Sat = col variable ### tab1 <- tapply(Freq,list(Cont,Sat),sum) tab1 # Independence model: mod1 <- glm(Freq ~ Cont+Sat, family=poisson, data=housing) ## Check odds ratios = 1 with fitted model: mod1.fit <- tapply(mod1$fitted, list(Cont,Sat), sum) mod1.fit[1,1]*mod1.fit[2,2]/(mod1.fit[1,2]*mod1.fit[2,1]) mosaicplot(mod1.fits, type="deviance", shade=T) # Saturated model: mod2 <- glm(Freq ~ Cont*Sat, family=poisson, data=housing) # Likelihood ratio test of independence: anova(mod1,mod2,test="LRT") # Chi-squared test of independence gives same conclusion: chisq.test(tab1) # Independence model using loglm function (in MASS library): mod1.ll <- loglm(~1+2, data=tab1, param=TRUE, fit=TRUE) # Gives both likelihood ratio test (G^2) and chi-squared test (X^2): mod1.ll # Saturated model using loglm function: mod2.ll <- update(mod1.ll, .~.^2) anova(mod1.ll,mod2.ll) # Compare parameter estimates of glm vs. loglm: mod1$coef mod1.ll$param # Compare fitted values: tapply(mod1$fit,list(Cont,Sat),sum) # Same as 12*(linear predictor) mod1.ll$fit # Can change glm parameter constraints using options options(contrasts=c("contr.sum","contr.poly")) # Sum to zero constraint mod1b <- glm(Freq ~ Cont+Sat, family=poisson, data=housing) options(contrasts=c("contr.treatment","contr.poly")) # Reference category equal to zero constraint ### Start here Tue Mar 28 ### #### #### Model selection using all variables #### # Saturated model: mod.sat <- glm(Freq~(Sat+Infl+Type+Cont)^4, family=poisson, data=housing) summary(mod.sat) # All residuals = 0; deviance = 0 # Think of Sat as the response variable - look at initial model # where conditional probs for each of the three Sat groups # are the same for all type x contact x influence groups mod0 <- glm(Freq ~ Infl*Type*Cont + Sat, family=poisson, data=housing) summary(mod0) # High deviance implies lack of fit, so probs do appear to vary # with explanatory variables. # Try adding simplest terms that allow Sat probs to vary: addterm(mod0, ~. + Sat:(Infl+Type+Cont), test="Chisq") # Infl achieves the largest single term reduction in AIC. # Add Sat:Infl and reassess: mod1 <- update(mod0, . ~ . + Sat:Infl, family=poisson, data=housing) summary(mod1) addterm(mod1, ~. + Sat:Type + Sat:Cont, test="Chisq") # Add Sat:Type mod2 <- update(mod1, . ~ . + Sat:Type, family=poisson, data=housing) summary(mod2) # Still some lack of fit indicated by deviance: pchisq(54.72,36,lower.tail=FALSE) # Add Sat:Cont mod3 <- update(mod2, .~. + Sat:Cont, family=poisson, data=housing) summary(mod3) # Deviance looks good: pchisq(38.662,34,lower.tail=FALSE) # See if we can drop any terms? dropterm(mod3, test="Chisq") # Infl:Type:Cont is part of minimum model and hence may not be removed. # Only interested in terms that include response Sat. # Consider adding possible interaction terms: addterm(mod3, ~. + Sat:(Infl+Type+Cont)^2, test="Chisq") # Looks like Type and Infl may interact on their effect on Sat, # but increases AIC -> Don't include. # Look at effects of Type, Infl, and Cont on Sat by # normalizing the expected values to cell probabilities. hnames <- lapply(housing[,-5], levels) # Omit Freq house.pm <- predict(mod3, expand.grid(hnames), type="response") # Poisson Means house.pm <- matrix(house.pm, ncol=3, byrow=T, dimnames=list(NULL,hnames[[1]])) house.pr <- house.pm/rowSums(house.pm) cbind(expand.grid(hnames[-1]), prob=round(house.pr,2)) detach(housing)