library(VGAM) # vglm library(MASS) # polr, housing ######## PO Model with quantitative predictor ## Mental Impairment Data: # Study of mental health for a random sample of adult residents # of Alachua County, Florida. # Y = mental impairment level (1 = well, 2 = mild symptom formation, 3 = moderate symptom formation, 4 = impaired) # Explanatory variables: # SES = socio-economic status (1 = high, 0 = low) # Life = life events index = quantitative composite score of the number # and severity of important life events such as birth of child, # new job, divorce, or deach in the family that occurred # to the subject within the past three years. dat = read.table("http://www.math.montana.edu/shancock/courses/stat539/data/mental-impairment.txt",header=T) head(dat$Y) # R treats as quantitative ## Descriptive stats library(ggplot2) ggplot(dat, aes(x = Y, group = SES, fill= SES)) + geom_bar() attach(dat) prop.table(table(SES, Y)) props <- table(SES,Y)/rep(colSums(table(SES,Y)), each=2) barplot(props, xlab="Mental Impairment", ylab="Proportion in SES") mosaicplot(SES ~ Y) ## Life index vs Y? Life index and SES vs Y? table(Life) plot(Y ~ jitter(Life, factor=.5), xlab="Life Index", ylab="Mental Impairment Score", pch=SES+1, col=c("tomato","purple")[SES+1], cex=1.3) legend("topleft", c("High","Low"), col=c("purple","tomato"), pch=c(2,1)) # Add smoother table(Life, Y)/rep(rowSums(table(Life,Y)), each=4) # Not helpful tapply(Y, Life, median) dim(dat) # Re-code variables: dat$Y = as.ordered(dat$Y) # Change class to ordered factor levels(dat$Y) = c("well", "mild", "mod", "imp") dat$SES = factor(dat$SES, levels=c(0,1), labels=c("low", "high")) head(dat) # Use polr function in MASS library: Response must be ordered factor. ?polr fit1 <- polr(Y ~ Life, data = dat) summary(fit1) # Beta estimate: b=-fit1$coef # Alpha estimates: a=fit1$zeta # Plot cumulative probabilities: cols = c("darkgreen","darkblue","purple","red") curve(1/(1+exp(-(a[1]+b*x))),0,10,ylim=c(0,1),col=cols[1],lty=2, main = "Cumulative Probabilities",xlab="Life Events",ylab="P(Y<=j)") curve(1/(1+exp(-(a[2]+b*x))),add=T,col=cols[2],lty=3) curve(1/(1+exp(-(a[3]+b*x))),add=T,col=cols[3],lty=4) legend(7,.9,c("Well","Mild","Mod"),col=cols[1:3],lty=c(2:4)) # Plot individual probabilities: curve(1/(1+exp(-(a[1]+b*x))),0,10,ylim=c(0,1),col=cols[1],lty=2, main="Category Probabilities",xlab="Life Events",ylab="P(Y=j)") curve(1/(1+exp(-(a[2]+b*x)))-1/(1+exp(-(a[1]+b*x))),add=T,col=cols[2],lty=3) curve(1/(1+exp(-(a[3]+b*x)))-1/(1+exp(-(a[2]+b*x))),add=T,col=cols[3],lty=4) curve(1-1/(1+exp(-(a[3]+b*x))),add=T,col=cols[4],lty=5) legend(7,.9,c("Well","Mild","Mod","Impaired"),col=cols,lty=c(2:5)) fit2 = polr(Y ~ SES + Life, data = dat) summary(fit2) # Beta estimates: b2=-fit2$coef # Alpha estimates: a2=fit2$zeta # Plot cumulative probabilities: cols = c("darkgreen","darkblue","purple","red") # SES = low curve(1/(1+exp(-(a2[1]+b2[2]*x))),0,10,ylim=c(0,1),col=cols[1],lty=1, main = "Cumulative Probabilities",xlab="Life Events",ylab="P(Y<=j)") curve(1/(1+exp(-(a2[2]+b2[2]*x))),add=T,col=cols[2],lty=1) curve(1/(1+exp(-(a2[3]+b2[2]*x))),add=T,col=cols[3],lty=1) # SES = high curve(1/(1+exp(-(a2[1]+b2[1]+b2[2]*x))),add=T,col=cols[1],lty=2) curve(1/(1+exp(-(a2[2]+b2[1]+b2[2]*x))),add=T,col=cols[2],lty=2) curve(1/(1+exp(-(a2[3]+b2[1]+b2[2]*x))),add=T,col=cols[3],lty=2) legend(8,.95,c("Well","Mild","Mod"),fill=cols[1:3]) legend(0,.1,c("High SES","Low SES"),lty=c(2,1)) # Model comparison LRT of H0: fit1 vs Ha: fit2 anova(fit1, fit2) # Interaction between SES and Life? anova(fit2, update(fit2, .~.+SES:Life)) ## Interpret coefs in fit2: Change in the estimated odds of mental ## impairment below any fixed level. # --> Signs of beta estimates suggest cumulative probability # starting at the "well" end of the scale decreases as life # events increases (adjusting for SES), and is higher for the # high level of SES (adjusting for life events). # Can we use deviance to check goodness of fit? # --> No. Contingency table is too sparse. # But can check model fit by comparing to more complex models: fit3 <- update(fit2, .~.+SES:Life) anova(fit2, fit3) ## Interpret SEShigh:Life interaction coef? fit3 # --> Impact of life events on mental impairment seems more # severe for the low SES group, but the difference in effects # is not statistically significant. ## Alternative interpretation: compare cumulative probabilities directly. # Quantitative predictor - compare cumulative probabilities at quartiles # Categorical predictor - compare cumulative probabilities for different category # Control for quantitative variables by setting equal to mean. # Control for categorical variables by fixing the category. ## Try for this model. ######## PO Model with categorical predictor # Built-in data set of 1,681 renters from a housing satisfaction survey # in Copenhagen # Variables: # Sat = subject's level of satisfaction with housing conditions # Infl = subject's feeling of influence on apartment management # Type = type of rental accommodation occupied # Cont = degree of contact subject had with other residents # Freq = cell count data(housing) # Loads data set attach(housing) # Note that Sat is an ordered factor: Sat # Marginal tables of each explanatory variable vs. Sat: Infl.tab = xtabs(Freq ~ Infl+Sat) Type.tab = xtabs(Freq ~ Type+Sat) Cont.tab = xtabs(Freq ~ Cont+Sat) # All marginal associations seem strong: chisq.test(Infl.tab) chisq.test(Type.tab) chisq.test(Cont.tab) # Re-arrange data: Sat.Low = housing[Sat=="Low",c(2:5)] Sat.Med = housing[Sat=="Medium",5] Sat.High = housing[Sat=="High",5] housing.new = data.frame(Sat.Low,Sat.Med,Sat.High) names(housing.new) = c("Infl","Type","Cont","Low","Med","High") # Treat Sat as response - # Fit proportional odds model using Infl as explanatory variable. # Use vglm function in VGAM library: mod1 = vglm(cbind(Low,Med,High) ~ Infl, family = cumulative(parallel=TRUE), data = housing.new) summary(mod1) detach(housing)