library(pscl) library(MASS) ## Data set from Agresti, 2007. # Data: Study of nesting horseshoe crabs (J. Brockman, Ethology, 1996). # Each female crab in the study had a male crab attached to her in her nest. # The study investigated factors that affect whether the female crab had # any other males, called satellites, residing nearby her. # Explanatory variables thought possibly to affect this included the # female crab's color, spine condition, weight, and carapace width. # The response outcome for each female crab is her number of satellites. # Variable descriptions: # color: 1 - light medium, 2 - medium, 3 - dark medium, 4 - dark # spine: 1 - both good, 2 - one worn or broken, 3 - both worn or broken # width: carapace width in cm # satell: number of satellites # weight: weight in kg crab <- read.table("http://www.math.montana.edu/shancock/courses/stat539/data/horseshoe.txt",header=T) crab$color <- factor(crab$color) crab$spine <- factor(crab$spine) head(crab) dim(crab) ## EDA summary(crab) # Note at least 25% of data set are zeroes # Observed distribution of No. of Satellites vs. expected from Poisson hist(crab$satell, xlab="Number of Satellites", col="lightgreen", breaks=20, main="Histogram of No. of Satellites with Poisson Expected Counts") 173*dpois(0:15,mean(crab$satell)) points(0:15+0.5, 173*dpois(0:15,mean(crab$satell)), type="h") points(0:15+0.5, 173*dpois(0:15,mean(crab$satell)), pch=16, type="p") par(mfrow=c(2,2)) plot(crab$satell ~ crab$color) plot(crab$satell ~ crab$spine) plot(crab$satell ~ crab$width) plot(crab$satell ~ crab$weight) cor(crab[,3:5]) # Width and weight highly correlated - maybe only use weight in model ## Overdispersion? # Create weight groups crab$weight.grp <- cut(crab$weight, breaks=c(min(crab$weight)-1, quantile(crab$weight, c(.2,.4,.6,.8)), max(crab$weight)+1)) # Calculate sample means and sample variances within each weight group means <- aggregate(crab$satell ~ crab$weight.grp, FUN=mean) vars <- aggregate(crab$satell ~ crab$weight.grp, FUN=var) # Plot sample variances vs sample means - should follow y=x if no over-dispersion plot(vars[,2] ~ means[,2], xlim=c(0,15), ylim=c(0,15), pch=15, xlab="Sample Means per Weight Group", ylab="Sample Variances per Weight Group", col="blue") abline(a = 0, b = 1) ## Book analysis - compare models with no predictors # Poisson GLM mod0.pois <- glm(satell ~ 1, family=poisson, data=crab) # Negative Binomial GLM mod0.nb <- glm.nb(satell ~ 1, data=crab) # Zero-inflated Poisson mod0.zip <- zeroinfl(satell ~ 1, data=crab) # Zero-inflated Negative Binomial mod0.zinb <- zeroinfl(satell ~ 1, dist="negbin", data=crab) summary(mod0.pois) logLik(mod0.pois) summary(mod0.nb) # phi-hat = 1/theta = 1/0.758 logLik(mod0.nb) summary(mod0.zip) # Mixture with prob exp(-0.6139)/(1+exp(-0.6139)) = 0.351 for # degenerate distn at zero; prob 1-0.351 for Pois distn with mean # exp(1.50385) = 4.499 summary(mod0.zinb) # Prob of degenerate distn at zero = exp(-0.7279)/(1+exp(-0.7279)) = 0.326 # Neg bin with mean exp(1.465) = 4.33 and disp parameter 1/4.4605 = 0.22 ## Compare fitted values to observed (re-create table on p. 257): count <- c(0:9) # 0, 1,..., 8, 9 or more table(crab$satell) observed <- c(as.vector(table(crab$satell))[1:9], 10) ## Fitted frequencies using fitted probabilities of each count: lam.pois <- exp(coef(mod0.pois)) fit.pois <- 173*c(dpois(0:8, lambda = lam.pois), 1-ppois(8, lambda = lam.pois)) mu.nb <- exp(coef(mod0.nb)) k.nb <- mod0.nb$theta fit.nb <- 173*c(dnbinom(0:8, mu = mu.nb, size = k.nb), 1-pnbinom(8, mu = mu.nb, size = k.nb)) lam.zip <- exp(coef(mod0.zip)[1]) phi.zip <- 1-exp(coef(mod0.zip)[2])/(1+exp(coef(mod0.zip)[2])) fit.zip <- 173*c(1-phi.zip+phi.zip*exp(-lam.zip), phi.zip*exp(-lam.zip)*lam.zip^(1:8)/factorial(1:8), 1-(1-phi.zip+phi.zip*exp(-lam.zip))-sum(phi.zip*exp(-lam.zip)*lam.zip^(1:8)/factorial(1:8))) mu.zinb <- exp(coef(mod0.zinb)[1]) k.zinb <- mod0.zinb$theta phi.zinb <- 1-exp(coef(mod0.zinb)[2])/(1+exp(coef(mod0.zinb)[2])) fit.zinb <- 173*c(1-phi.zinb+phi.zinb*dnbinom(0, mu=mu.zinb, size=k.zinb), phi.zinb*dnbinom(1:8, mu=mu.zinb, size=k.zinb), 1-(1-phi.zinb+phi.zinb*dnbinom(0, mu=mu.zinb, size=k.zinb))-sum(phi.zinb*dnbinom(1:8, mu=mu.zinb, size=k.zinb))) ## Table on p. 257 data.frame(count, observed, fit.pois, fit.nb, fit.zip, fit.zinb) ## ZINB model adding predictors: mod.zinb <- zeroinfl(satell ~ weight | weight + color, dist="negbin", data=crab) summary(mod.zinb) ## Mean response = phi*(E(Y | Z=1)) my.mean <- function(wgt, col){ co <- coef(mod.zinb) if(col %in% c(2,3,4)){ return((1/(1+exp(co[3] + co[4]*wgt + co[3+col]))) * exp(co[1] + co[2]*wgt)) } if(col == 1){ return((1/(1+exp(co[3] + co[4]*wgt))) * exp(co[1] + co[2]*wgt)) } } curve(my.mean(x,1), from=0, to=5, xlab="Weight", ylab="Fitted Mean No. of Satellites", col="blue", lty=1, lwd=2, main="Fitted Mean No. of Satellites by Weight and Color") curve(my.mean(x,2), add=TRUE, col="red", lty=2, lwd=2) curve(my.mean(x,3), add=TRUE, col="purple", lty=3, lwd=2) curve(my.mean(x,4), add=TRUE, col="darkgreen", lty=4, lwd=2) legend("topleft", c("Lgt Med","Med","Dark Med","Dark"), lty=c(1:4), lwd=2, col=c("blue","red","purple","darkgreen"))