# R code for Chapter 5

# Barcharts for Poisson distribution: p. 83

barplot(dpois(0:5,0.5),xlab='y',ylab='Probability',names=0:5,main='mean = 0.5')

barplot(dpois(0:10,2),xlab='y',ylab='Probability',names=0:10,main='mean = 2')

barplot(dpois(0:15,5),xlab='y',ylab='Probability',names=0:15,main='mean = 5')

# load and initial analysis of Galapagos data
library(faraway)
data(gala)
gala=gala[,-2]

mod1=lm(Species~.,gala)
plot(mod1,1)
# observe non-constant variance

# try to improve using Box-Cox transformation
library(MASS)
bc=boxcox(mod1)
bc$x[which.max(bc$y)]

# use a square root transformation
modt=lm(sqrt(Species)~.,gala)
plot(modt,1)
sumary(modt)

# Alternate method: glm with Poisson family
modp=glm(Species~.,family=poisson,gala)
sumary(modp)
plot(modp)

# calculate deviance and Pearson residuals
residuals(modp,type='deviance')

residuals(modp,type='pearson')

df.residual(modp)
deviance(modp)
sum(residuals(modp,type='deviance')^2)
sum(residuals(modp,type='pearson')^2)

# testing fit via chi-square distribution
pchisq(716.8458,24,lower=F)
pchisq(761.9792,24,lower=F)



# note both forms of goodness of fit statistic are much larger than the df.residual
# - indicates lack of fi of the model

# plots to illustrate fit

halfnorm(residuals(modp))
# shows no outlier

# plot estimated variance against the mean
plot((fitted(modp)),((gala$Species-fitted(modp))^2),
xlab=expression(hat(mu)),ylab=expression((y-hat(mu))^2))
abline(0,1)

plot(log(fitted(modp)),log((gala$Species-fitted(modp))^2),
xlab=expression(hat(mu)),ylab=expression((y-hat(mu))^2))
abline(0,1)

# variance is proportional to but larger than the mean

# by-hand estimator of dispersion parameter
(dp=sum(residuals(modp,type='pearson')^2)/df.residual(modp))
sumary(modp,dispersion=dp)


# Alternately, use quasipoisson model

modd=glm(Species~.,family=quasipoisson,gala)
sumary(modd)
plot(modd)

# see effect of dropping terms
drop1(modd,test='F')

# Section 5.3

library(faraway)
data(dicentric)
round(xtabs(ca/cells~doseamt+doserate,dicentric),2)

with(dicentric,interaction.plot(doseamt,doserate,ca/cells,legend=F))


with(dicentric,interaction.plot(doseamt,doserate,ca/cells,legend=T))

lmod=lm(ca/cells~log(doserate)*factor(doseamt),dicentric)
summary(lmod)$adj


plot(residuals(lmod)~fitted(lmod),xlab='Fitted',ylab='Residuals')
abline(h=0)

# model dose amount as a factor and refit as Poisson regression

dosef=factor(dicentric$doseamt)
pmod=glm(ca~log(cells)+log(doserate)*dosef,family=poisson,dicentric)
sumary(pmod)

# same model with log(cells) coefficient fixed at 1
rmod=glm(ca~offset(log(cells))+log(doserate)*dosef,family=poisson,dicentric)
sumary(rmod)

# solder data section 5.4
library(faraway)
data(solder)

hist(solder$skip,breaks=0:50)
mean(solder$skips)

# plot theoretical istogram for Poisson distribution with mean 5.53

barplot(dpois(0:15,5.53),xlab='y',ylab='Probability',names=0:15,main='mean = 5.53')


modp=glm(skips~.,family=poisson,solder)
c(deviance(modp),df.residual(modp))

modqp=glm(skips~.,family=quasipoisson,solder)
c(deviance(modqp),df.residual(modqp))

modp2=glm(skips~(Opening+Solder+Mask+PadType+Panel)^2,family=poisson,solder)
deviance(modp2)

range(modp2$fitted)


hist(modp2$fitted,breaks=0:100,xlim=c(0,15),col='blue')




pchisq(deviance(modp2),df.residual(modp2),lower=F)

# Venables-Ripley method - assumes k=1
library(MASS)
modn=glm(skips~.,negative.binomial(1),solder)
summary(modn)

# two plots to assess visual fit of modn:

# first, plot fitted values against y
plot(modn$fitted,modn$y)
abline(0,1)
# many values for which fitted value is much greater than observed

# second, side-by-side histograms of fitted and observed values
# (note use of xlim to limite range of plot)
par(mfrow=c(1,2))
hist(modn$y,breaks=0:130,xlim=c(0,20))
hist(modn$fitted,breaks=0:130,xlim=c(0,20))

# second, side-by-side histograms of fitted and observed values
# (note use of xlim to limite range of plot)
par(mfrow=c(1,2))
hist(modn$y,breaks=0:130,xlim=c(0,20))
hist(modp2$fitted,breaks=0:130,xlim=c(0,20))


# second Venables-Ripley method includes estimator of k

modn1=glm.nb(skips~.,solder)
summary(modn1)

# same two plots to assess fit
par(mfrow=c(1,1))
plot(modn1$fitted,modn$y)
abline(0,1)
# this is better than the previous plot but still not too good

# second, side-by-side histograms of fitted and observed values
# (note use of xlim to limite range of plot)
par(mfrow=c(1,2))
hist(modn1$y,breaks=0:130,xlim=c(0,20))
hist(modn1$fitted,breaks=0:130,xlim=c(0,20))

# also plot the two fitted histograms against each other

par(mfrow=c(2,2))
hist(modn1$y,breaks=0:130,xlim=c(0,20),main='Observations')
hist(modn1$fitted,breaks=0:130,xlim=c(0,20),main='Fitted with k=1')
hist(modn$fitted,breaks=0:130,xlim=c(0,20),main='Fitted with k Estimated')
hist(modp2$fitted,breaks=0:130,xlim=c(0,20),main='Second Poisson Fit')


# quite similar to the previous histogram

# my conclusion: it's not obvious that negative binomial is the right distribution



# zero-inflated count models section 5.5

library(faraway)
library(pscl)


data(bioChemists)
par(mfrow=c(1,2))
hist(bioChemists$art,breaks=0:20)
mean(bioChemists$art)

barplot(dpois(0:10,1.6929),xlab='y',ylab='Probability',names=0:10,main='Poisson Mean 1.6929')

modp=glm(art~.,data=bioChemists,family=poisson)
sumary(modp)

par(mfrow=c(1,1))
ocount=table(bioChemists$art)[1:8]
pcount=colSums(predprob(modp)[,1:8])
plot(pcount,ocount,type='n',xlab='Predicted',ylab='Observed')
text(pcount,ocount,0:7)

modh=hurdle(art~.,data=bioChemists)
summary(modh)



ocount=table(bioChemists$art)[1:8]
pcount=colSums(predprob(modh)[,1:8])
plot(pcount,ocount,type='n',xlab='Predicted',ylab='Observed')
text(pcount,ocount,0:7)

modz=zeroinfl(art~.,data=bioChemists)
summary(modz)


par(mfrow=c(1,1))
ocount=table(bioChemists$art)[1:8]
pcount=colSums(predprob(modz)[,1:8])
plot(pcount,ocount,type='n',xlab='Predicted',ylab='Observed',
main='Zero-inflated Poisson Model for Biochem Data')
text(pcount,ocount,0:7)


plot(fitted(modh),fitted(modz),xlab='Hurdle Predictions',ylab='ZIF Predictions')
abline(0,1)


modz2=zeroinfl(art~fem+kid5+ment|ment, data=bioChemists)
summary(modz2)

lrt=2*(modz$loglik-modz2$loglik)
1-pchisq(lrt,6)

# unfortunately this doesn't work
anova(modz2,modz)

exp(coef(modz2))


newman=data.frame(fem='Men',mar='Single',kid5=0,ment=6)
predict(modz2,newdata=newman,type='prob')


predict(modz2,newdata=newman,type='zero')