# R code for Chapter 5 # Barcharts for Poisson distribution: p. 83 barplot(dpois(0:5,0.5),xlab='y',ylab='Probability',names=0:5,main='mean = 0.5') barplot(dpois(0:10,2),xlab='y',ylab='Probability',names=0:10,main='mean = 2') barplot(dpois(0:15,5),xlab='y',ylab='Probability',names=0:15,main='mean = 5') # load and initial analysis of Galapagos data library(faraway) data(gala) gala=gala[,-2] mod1=lm(Species~.,gala) plot(mod1,1) # observe non-constant variance # try to improve using Box-Cox transformation library(MASS) bc=boxcox(mod1) bc$x[which.max(bc$y)] # use a square root transformation modt=lm(sqrt(Species)~.,gala) plot(modt,1) sumary(modt) # Alternate method: glm with Poisson family modp=glm(Species~.,family=poisson,gala) sumary(modp) plot(modp) # calculate deviance and Pearson residuals residuals(modp,type='deviance') residuals(modp,type='pearson') df.residual(modp) deviance(modp) sum(residuals(modp,type='deviance')^2) sum(residuals(modp,type='pearson')^2) # testing fit via chi-square distribution pchisq(716.8458,24,lower=F) pchisq(761.9792,24,lower=F) # note both forms of goodness of fit statistic are much larger than the df.residual # - indicates lack of fi of the model # plots to illustrate fit halfnorm(residuals(modp)) # shows no outlier # plot estimated variance against the mean plot((fitted(modp)),((gala$Species-fitted(modp))^2), xlab=expression(hat(mu)),ylab=expression((y-hat(mu))^2)) abline(0,1) plot(log(fitted(modp)),log((gala$Species-fitted(modp))^2), xlab=expression(hat(mu)),ylab=expression((y-hat(mu))^2)) abline(0,1) # variance is proportional to but larger than the mean # by-hand estimator of dispersion parameter (dp=sum(residuals(modp,type='pearson')^2)/df.residual(modp)) sumary(modp,dispersion=dp) # Alternately, use quasipoisson model modd=glm(Species~.,family=quasipoisson,gala) sumary(modd) plot(modd) # see effect of dropping terms drop1(modd,test='F') # Section 5.3 library(faraway) data(dicentric) round(xtabs(ca/cells~doseamt+doserate,dicentric),2) with(dicentric,interaction.plot(doseamt,doserate,ca/cells,legend=F)) with(dicentric,interaction.plot(doseamt,doserate,ca/cells,legend=T)) lmod=lm(ca/cells~log(doserate)*factor(doseamt),dicentric) summary(lmod)$adj plot(residuals(lmod)~fitted(lmod),xlab='Fitted',ylab='Residuals') abline(h=0) # model dose amount as a factor and refit as Poisson regression dosef=factor(dicentric$doseamt) pmod=glm(ca~log(cells)+log(doserate)*dosef,family=poisson,dicentric) sumary(pmod) # same model with log(cells) coefficient fixed at 1 rmod=glm(ca~offset(log(cells))+log(doserate)*dosef,family=poisson,dicentric) sumary(rmod) # solder data section 5.4 library(faraway) data(solder) hist(solder$skip,breaks=0:50) mean(solder$skips) # plot theoretical istogram for Poisson distribution with mean 5.53 barplot(dpois(0:15,5.53),xlab='y',ylab='Probability',names=0:15,main='mean = 5.53') modp=glm(skips~.,family=poisson,solder) c(deviance(modp),df.residual(modp)) modqp=glm(skips~.,family=quasipoisson,solder) c(deviance(modqp),df.residual(modqp)) modp2=glm(skips~(Opening+Solder+Mask+PadType+Panel)^2,family=poisson,solder) deviance(modp2) range(modp2$fitted) hist(modp2$fitted,breaks=0:100,xlim=c(0,15),col='blue') pchisq(deviance(modp2),df.residual(modp2),lower=F) # Venables-Ripley method - assumes k=1 library(MASS) modn=glm(skips~.,negative.binomial(1),solder) summary(modn) # two plots to assess visual fit of modn: # first, plot fitted values against y plot(modn$fitted,modn$y) abline(0,1) # many values for which fitted value is much greater than observed # second, side-by-side histograms of fitted and observed values # (note use of xlim to limite range of plot) par(mfrow=c(1,2)) hist(modn$y,breaks=0:130,xlim=c(0,20)) hist(modn$fitted,breaks=0:130,xlim=c(0,20)) # second, side-by-side histograms of fitted and observed values # (note use of xlim to limite range of plot) par(mfrow=c(1,2)) hist(modn$y,breaks=0:130,xlim=c(0,20)) hist(modp2$fitted,breaks=0:130,xlim=c(0,20)) # second Venables-Ripley method includes estimator of k modn1=glm.nb(skips~.,solder) summary(modn1) # same two plots to assess fit par(mfrow=c(1,1)) plot(modn1$fitted,modn$y) abline(0,1) # this is better than the previous plot but still not too good # second, side-by-side histograms of fitted and observed values # (note use of xlim to limite range of plot) par(mfrow=c(1,2)) hist(modn1$y,breaks=0:130,xlim=c(0,20)) hist(modn1$fitted,breaks=0:130,xlim=c(0,20)) # also plot the two fitted histograms against each other par(mfrow=c(2,2)) hist(modn1$y,breaks=0:130,xlim=c(0,20),main='Observations') hist(modn1$fitted,breaks=0:130,xlim=c(0,20),main='Fitted with k=1') hist(modn$fitted,breaks=0:130,xlim=c(0,20),main='Fitted with k Estimated') hist(modp2$fitted,breaks=0:130,xlim=c(0,20),main='Second Poisson Fit') # quite similar to the previous histogram # my conclusion: it's not obvious that negative binomial is the right distribution # zero-inflated count models section 5.5 library(faraway) library(pscl) data(bioChemists) par(mfrow=c(1,2)) hist(bioChemists$art,breaks=0:20) mean(bioChemists$art) barplot(dpois(0:10,1.6929),xlab='y',ylab='Probability',names=0:10,main='Poisson Mean 1.6929') modp=glm(art~.,data=bioChemists,family=poisson) sumary(modp) par(mfrow=c(1,1)) ocount=table(bioChemists$art)[1:8] pcount=colSums(predprob(modp)[,1:8]) plot(pcount,ocount,type='n',xlab='Predicted',ylab='Observed') text(pcount,ocount,0:7) modh=hurdle(art~.,data=bioChemists) summary(modh) ocount=table(bioChemists$art)[1:8] pcount=colSums(predprob(modh)[,1:8]) plot(pcount,ocount,type='n',xlab='Predicted',ylab='Observed') text(pcount,ocount,0:7) modz=zeroinfl(art~.,data=bioChemists) summary(modz) par(mfrow=c(1,1)) ocount=table(bioChemists$art)[1:8] pcount=colSums(predprob(modz)[,1:8]) plot(pcount,ocount,type='n',xlab='Predicted',ylab='Observed', main='Zero-inflated Poisson Model for Biochem Data') text(pcount,ocount,0:7) plot(fitted(modh),fitted(modz),xlab='Hurdle Predictions',ylab='ZIF Predictions') abline(0,1) modz2=zeroinfl(art~fem+kid5+ment|ment, data=bioChemists) summary(modz2) lrt=2*(modz$loglik-modz2$loglik) 1-pchisq(lrt,6) # unfortunately this doesn't work anova(modz2,modz) exp(coef(modz2)) newman=data.frame(fem='Men',mar='Single',kid5=0,ment=6) predict(modz2,newdata=newman,type='prob') predict(modz2,newdata=newman,type='zero')