library('faraway') data(wcgs) summary(wcgs[c("chd","height","cigs")]) plot(height~chd,wcgs) wcgs$y=ifelse(wcgs$chd=="no",0,1) plot(jitter(y,0.1)~jitter(height),wcgs,xlab='Height',ylab='Heart Disease',pch='.') library(ggplot2) # the next two commands are a lkttle different from the text ggplot(wcgs,aes(x=height,color=chd,fill=chd))+geom_histogram(position='dodge',binwidth=1) ggplot(wcgs,aes(x=cigs,color=chd,fill=chd))+geom_histogram(position='dodge',binwidth=5,aes(y=..density..)) ggplot(wcgs,aes(x=height,y=cigs))+geom_point(alpha=0.2,position=position_jitter())+facet_grid(~chd) lmod=glm(chd~height+cigs,family=binomial,wcgs) summary(lmod) # or Faraway's own command: sumary(lmod) beta=coef(lmod) plot(jitter(y,0.1)~jitter(height),wcgs,xlab='Height',ylab='Heart Disease',pch='.') curve(ilogit(beta[1]+beta[2]*x+beta[3]*0),add=T) curve(ilogit(beta[1]+beta[2]*x+beta[3]*20),add=T,lty=2) plot(jitter(y,0.1)~jitter(cigs),wcgs,xlab='Cigarettes',ylab='Heart Disease',pch='.') curve(ilogit(beta[1]+beta[2]*60+beta[3]*x),add=T) curve(ilogit(beta[1]+beta[2]*78+beta[3]*x),add=T,lty=2) # relative risk calculation for a 68-inch man who smokes 20 a day against # a 68-inch man who smokes none print(c(ilogit(sum(beta*c(1,68,20))),ilogit(sum(beta*c(1,68,0))))) # relative risk ilogit(sum(beta*c(1,68,20)))/ilogit(sum(beta*c(1,68,0))) # Inference # testing null deviance v. residual deviance in lmod: deviance changes by # 1781.2-1749.0=32.2 with 2 degrees of freedom pchisq(32.2,2,lower.tail=F) # various related tests lmodc=glm(chd~cigs,family=binomial,wcgs) anova(lmodc,lmod,test='Chi') drop1(lmod,test='Chi') confint(lmod) # diagnostics linpred=predict(lmod) predprob=predict(lmod,type='response') head(linpred) head(predprob) head(ilogit(linpred)) rawres=wcgs$y-predprob plot(rawres~linpred,xlab='Linear Predictor',ylab='Residuals') plot(residuals(lmod)~linpred,xlab='Linear Predictor',ylab='Deviance Residuals') library(dplyr) wcgs=mutate(wcgs,residuals=residuals(lmod),linpred=predict(lmod)) gdf=group_by(wcgs,cut(linpred,breaks=unique(quantile(linpred,(1:100)/101)))) diagdf=summarise(gdf,residuals=mean(residuals),linpred=mean(linpred)) plot(residuals~linpred,diagdf,xlab='Linear Predictor',pch=20) gdf=group_by(wcgs,height) diagdf=summarise(gdf,residuals=mean(residuals)) ggplot(diagdf,aes(x=height,y=residuals))+geom_point() filter(wcgs,height==77) %>% select(height,cigs,chd,residuals) group_by(wcgs,cigs) %>% summarise(residuals=mean(residuals),count=n()) %>% ggplot(aes(x=cigs,y=residuals,size=sqrt(count)))+geom_point() qqnorm(residuals(lmod)) halfnorm(hatvalues(lmod)) filter(wcgs,hatvalues(lmod)>0.015) %>% select(height,cigs,chd) # sequential search for best model using AIC wcgs$bmi=with(wcgs,703*weight/(wcgs$height^2)) lmod=glm(chd~age+height+weight+bmi+sdp+dbp+chol+dibep+cigs+arcus,family=binomial,wcgs) lmodr=step(lmod,trace=0) sumary(lmod) sumary(lmodr) drop1(glm(chd~dbp,family=binomial,wcgs),test='Chi')