library('faraway')

data(wcgs)

summary(wcgs[c("chd","height","cigs")])

plot(height~chd,wcgs)

wcgs$y=ifelse(wcgs$chd=="no",0,1)

plot(jitter(y,0.1)~jitter(height),wcgs,xlab='Height',ylab='Heart Disease',pch='.')

library(ggplot2)

# the next two commands are a lkttle different from the text
ggplot(wcgs,aes(x=height,color=chd,fill=chd))+geom_histogram(position='dodge',binwidth=1)

ggplot(wcgs,aes(x=cigs,color=chd,fill=chd))+geom_histogram(position='dodge',binwidth=5,aes(y=..density..))

ggplot(wcgs,aes(x=height,y=cigs))+geom_point(alpha=0.2,position=position_jitter())+facet_grid(~chd)

lmod=glm(chd~height+cigs,family=binomial,wcgs)

summary(lmod)
# or Faraway's own command: sumary(lmod)

beta=coef(lmod)
plot(jitter(y,0.1)~jitter(height),wcgs,xlab='Height',ylab='Heart Disease',pch='.')
curve(ilogit(beta[1]+beta[2]*x+beta[3]*0),add=T)
curve(ilogit(beta[1]+beta[2]*x+beta[3]*20),add=T,lty=2)

plot(jitter(y,0.1)~jitter(cigs),wcgs,xlab='Cigarettes',ylab='Heart Disease',pch='.')
curve(ilogit(beta[1]+beta[2]*60+beta[3]*x),add=T)
curve(ilogit(beta[1]+beta[2]*78+beta[3]*x),add=T,lty=2)

# relative risk calculation for a 68-inch man who smokes 20 a day against 
# a 68-inch man who smokes none
print(c(ilogit(sum(beta*c(1,68,20))),ilogit(sum(beta*c(1,68,0)))))

# relative risk
ilogit(sum(beta*c(1,68,20)))/ilogit(sum(beta*c(1,68,0)))


# Inference

# testing null deviance v. residual deviance in lmod: deviance changes by
# 1781.2-1749.0=32.2 with 2 degrees of freedom
pchisq(32.2,2,lower.tail=F)

# various related tests

lmodc=glm(chd~cigs,family=binomial,wcgs)

anova(lmodc,lmod,test='Chi')

drop1(lmod,test='Chi')

confint(lmod)

# diagnostics

linpred=predict(lmod)
predprob=predict(lmod,type='response')

head(linpred)
head(predprob)
head(ilogit(linpred))


rawres=wcgs$y-predprob

plot(rawres~linpred,xlab='Linear Predictor',ylab='Residuals')

plot(residuals(lmod)~linpred,xlab='Linear Predictor',ylab='Deviance Residuals')

library(dplyr)
wcgs=mutate(wcgs,residuals=residuals(lmod),linpred=predict(lmod))
gdf=group_by(wcgs,cut(linpred,breaks=unique(quantile(linpred,(1:100)/101))))
diagdf=summarise(gdf,residuals=mean(residuals),linpred=mean(linpred))
plot(residuals~linpred,diagdf,xlab='Linear Predictor',pch=20)


gdf=group_by(wcgs,height)
diagdf=summarise(gdf,residuals=mean(residuals))
ggplot(diagdf,aes(x=height,y=residuals))+geom_point()

filter(wcgs,height==77) %>% select(height,cigs,chd,residuals)

group_by(wcgs,cigs) %>% summarise(residuals=mean(residuals),count=n()) %>% ggplot(aes(x=cigs,y=residuals,size=sqrt(count)))+geom_point()

qqnorm(residuals(lmod))

halfnorm(hatvalues(lmod))

filter(wcgs,hatvalues(lmod)>0.015) %>% select(height,cigs,chd)

# sequential search for best model using AIC
wcgs$bmi=with(wcgs,703*weight/(wcgs$height^2))
lmod=glm(chd~age+height+weight+bmi+sdp+dbp+chol+dibep+cigs+arcus,family=binomial,wcgs)
lmodr=step(lmod,trace=0)
sumary(lmod)
sumary(lmodr)

drop1(glm(chd~dbp,family=binomial,wcgs),test='Chi')