# this file has been reordered for the class of 01/31/2019

############ the following commands were run before the class

library('faraway')
data(wcgs)
summary(wcgs[c("chd","height","cigs")])
plot(height~chd,wcgs)
wcgs$y=ifelse(wcgs$chd=="no",0,1)
plot(jitter(y,0.1)~jitter(height),wcgs,xlab='Height',ylab='Heart Disease',pch='.')
library(ggplot2)
# the next two commands are a little different from the text
ggplot(wcgs,aes(x=height,color=chd,fill=chd))+geom_histogram(position='dodge',binwidth=1)
ggplot(wcgs,aes(x=cigs,color=chd,fill=chd))+geom_histogram(position='dodge',binwidth=5,aes(y=..density..))
ggplot(wcgs,aes(x=height,y=cigs))+geom_point(alpha=0.2,position=position_jitter())+facet_grid(~chd)

lmod=glm(chd~height+cigs,family=binomial,wcgs)
summary(lmod)
# or Faraway's own command:
sumary(lmod)

beta=coef(lmod)
plot(jitter(y,0.1)~jitter(height),wcgs,xlab='Height',ylab='Heart Disease',pch='.')
curve(ilogit(beta[1]+beta[2]*x+beta[3]*0),add=T)
curve(ilogit(beta[1]+beta[2]*x+beta[3]*20),add=T,lty=2)
plot(jitter(y,0.1)~jitter(cigs),wcgs,xlab='Cigarettes',ylab='Heart Disease',pch='.')
curve(ilogit(beta[1]+beta[2]*60+beta[3]*x),add=T)
curve(ilogit(beta[1]+beta[2]*78+beta[3]*x),add=T,lty=2)

# relative risk calculation for a 68-inch man who smokes 20 a day against
# a 68-inch man who smokes none
print(c(ilogit(sum(beta*c(1,68,20))),ilogit(sum(beta*c(1,68,0)))))
# relative risk
ilogit(sum(beta*c(1,68,20)))/ilogit(sum(beta*c(1,68,0)))

# Inference
# testing null deviance v. residual deviance in lmod: deviance changes by
# 1781.2-1749.0=32.2 with 2 degrees of freedom
pchisq(32.2,2,lower.tail=F)
# various related tests
lmodc=glm(chd~cigs,family=binomial,wcgs)
anova(lmodc,lmod,test='Chi')
drop1(lmod,test='Chi')
confint(lmod)

################ new material starts here

# diagnostics
linpred=predict(lmod)
predprob=predict(lmod,type='response')
head(linpred)
head(predprob)
head(ilogit(linpred))
rawres=wcgs$y-predprob
plot(rawres~linpred,xlab='Linear Predictor',ylab='Residuals')
plot(residuals(lmod)~linpred,xlab='Linear Predictor',ylab='Deviance Residuals')

library(dplyr)
wcgs=mutate(wcgs,residuals=residuals(lmod),linpred=predict(lmod))
gdf=group_by(wcgs,cut(linpred,breaks=unique(quantile(linpred,(1:100)/101))))
diagdf=summarise(gdf,residuals=mean(residuals),linpred=mean(linpred))
plot(residuals~linpred,diagdf,xlab='Linear Predictor',pch=20)

gdf=group_by(wcgs,height)
diagdf=summarise(gdf,residuals=mean(residuals))
ggplot(diagdf,aes(x=height,y=residuals))+geom_point()
filter(wcgs,height==77) %>% select(height,cigs,chd,residuals)

############ End of material covered in class Jan 31 ############
# The remaining material is for future classes - check back for updates!
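# An aside on the deviance test in the Inference section above: rather than typing in
# the rounded deviances (1781.2 and 1749.0) by hand, the same chi-squared test can be
# computed from the components stored in the fitted glm object. This is only a sketch;
# lmod here is still the two-predictor model chd~height+cigs fit above.
with(lmod,pchisq(null.deviance-deviance,df.null-df.residual,lower.tail=F))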
group_by(wcgs,cigs) %>% summarise(residuals=mean(residuals),count=n()) %>%
  ggplot(aes(x=cigs,y=residuals,size=sqrt(count)))+geom_point()
qqnorm(residuals(lmod))
halfnorm(hatvalues(lmod))
filter(wcgs,hatvalues(lmod)>0.015) %>% select(height,cigs,chd)

# sequential search for best model using AIC
wcgs$bmi=with(wcgs,703*weight/(height^2))
lmod=glm(chd~age+height+weight+bmi+sdp+dbp+chol+dibep+cigs+arcus,family=binomial,wcgs)
lmodr=step(lmod,trace=0)
sumary(lmod)
sumary(lmodr)

## compare previous "best" model with one that includes only dbp
drop1(glm(chd~dbp,family=binomial,wcgs),test='Chi')
# the conclusion is that dbp on its own is significant even though it was not
# included in the best model by AIC

# goodness of fit - section 2.6
wcgsm=na.omit(wcgs)
wcgsm=mutate(wcgsm,predprob=predict(lmod,type='response'))
gdf=group_by(wcgsm,cut(linpred,breaks=unique(quantile(linpred,(1:100)/101))))
hldf=summarise(gdf,y=sum(y),ppred=mean(predprob),count=n())
hldf=mutate(hldf,se.fit=sqrt(ppred*(1-ppred)/count))
ggplot(hldf,aes(x=ppred,y=y/count,ymin=y/count-2*se.fit,ymax=y/count+2*se.fit))+
  geom_point()+geom_linerange(color=grey(0.75))+geom_abline(intercept=0,slope=1)+
  xlab('Predicted Probability')+ylab('Observed Proportion')
hlstat=with(hldf,sum((y-count*ppred)^2/(count*ppred*(1-ppred))))
c(hlstat,nrow(hldf))
# the book gets a chi-square statistic of 63.212 with 56 groups but I get 62.832 with 54
pchisq(63.212,56-1,lower.tail=F)
pchisq(62.832,54-1,lower.tail=F)
# p-values 0.209 the first way and 0.167 the second way - not statistically significant

# Here is an alternative way to do the Hosmer-Lemeshow test.
# Note three different choices of the number of groups g,
# but none of them gives a statistically significant result.
# Install package 'ResourceSelection' first.
library(ResourceSelection)
hoslem.test(lmodr$y,lmodr$fitted, g = 10)
hoslem.test(lmodr$y,lmodr$fitted, g = 50)
hoslem.test(lmodr$y,lmodr$fitted, g = 100)

# evaluate the logistic regression as a predictor of heart disease:
# compare different thresholds for predprob and compute the Specificity, Sensitivity
# and total Misclassification rate
wcgsm=mutate(wcgsm,predout=ifelse(predprob<0.5,"no","yes"))
xtabs(~chd+predout,wcgsm)
# proportion of correct classifications at the 0.5 threshold
(2882+2)/(2882+3+253+2)
thresh=seq(0.01,0.5,0.01)
Sensitivity=numeric(length(thresh))
Specificity=numeric(length(thresh))
Misclassify=numeric(length(thresh))
for(j in seq(along=thresh)){
  pp=ifelse(wcgsm$predprob<thresh[j],"no","yes")
  xx=xtabs(~chd+pp,wcgsm)
  Specificity[j]=xx[1,1]/(xx[1,1]+xx[1,2])
  Sensitivity[j]=xx[2,2]/(xx[2,1]+xx[2,2])
  Misclassify[j]=(xx[1,2]+xx[2,1])/sum(xx)
}
matplot(thresh,cbind(Sensitivity,Specificity,Misclassify),type='l',
        xlab='Threshold',ylab='Proportion')

# estimation problems - perfect separation in a two-species subset of the iris data
irisr=filter(iris,Species!="virginica") %>% select(Sepal.Width,Sepal.Length,Species)
(p=ggplot(irisr,aes(x=Sepal.Width,y=Sepal.Length,shape=Species))+geom_point())
lmod=glm(Species~Sepal.Width+Sepal.Length,family=binomial,irisr)
sumary(lmod)
library(brglm)
bmod=brglm(Species~Sepal.Width+Sepal.Length,family=binomial,irisr)
summary(bmod)
p+geom_abline(intercept=(0.5+24.51)/9.73,slope=8.9/9.73)
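# Another way to draw the separating line: instead of the hand-entered numbers above,
# the p=0.5 decision boundary implied by the bias-reduced fit can be taken from
# coef(bmod) directly. With coefficients (b0, b1, b2) for the intercept, Sepal.Width
# and Sepal.Length, the boundary is where the linear predictor is zero, i.e.
# Sepal.Length = -(b0+b1*Sepal.Width)/b2. This is only a sketch and may differ
# slightly from the hand-entered line.
bb=coef(bmod)
p+geom_abline(intercept=-bb[1]/bb[3],slope=-bb[2]/bb[3],linetype=2)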