# this file has been reordered for the class of 01/31/2019

############ the following commands were run before the class

library('faraway')
data(wcgs)
summary(wcgs[c("chd","height","cigs")])
plot(height~chd,wcgs)
wcgs$y=ifelse(wcgs$chd=="no",0,1)
plot(jitter(y,0.1)~jitter(height),wcgs,xlab='Height',ylab='Heart Disease',pch='.')
library(ggplot2)
# the next two commands are a little different from the text
ggplot(wcgs,aes(x=height,color=chd,fill=chd))+geom_histogram(position='dodge',binwidth=1)
ggplot(wcgs,aes(x=cigs,color=chd,fill=chd))+geom_histogram(position='dodge',binwidth=5,aes(y=..density..))
ggplot(wcgs,aes(x=height,y=cigs))+geom_point(alpha=0.2,position=position_jitter())+facet_grid(~chd)

lmod=glm(chd~height+cigs,family=binomial,wcgs)
summary(lmod)
# or Faraway's own command:
sumary(lmod)

beta=coef(lmod)
plot(jitter(y,0.1)~jitter(height),wcgs,xlab='Height',ylab='Heart Disease',pch='.')
curve(ilogit(beta[1]+beta[2]*x+beta[3]*0),add=T)
curve(ilogit(beta[1]+beta[2]*x+beta[3]*20),add=T,lty=2)
plot(jitter(y,0.1)~jitter(cigs),wcgs,xlab='Cigarettes',ylab='Heart Disease',pch='.')
curve(ilogit(beta[1]+beta[2]*60+beta[3]*x),add=T)
curve(ilogit(beta[1]+beta[2]*78+beta[3]*x),add=T,lty=2)

# relative risk calculation for a 68-inch man who smokes 20 a day against
# a 68-inch man who smokes none
print(c(ilogit(sum(beta*c(1,68,20))),ilogit(sum(beta*c(1,68,0)))))
# relative risk
ilogit(sum(beta*c(1,68,20)))/ilogit(sum(beta*c(1,68,0)))

# Inference
# testing null deviance v. residual deviance in lmod: deviance changes by
# 1781.2-1749.0=32.2 with 2 degrees of freedom
pchisq(32.2,2,lower.tail=F)
# various related tests
lmodc=glm(chd~cigs,family=binomial,wcgs)
anova(lmodc,lmod,test='Chi')
drop1(lmod,test='Chi')
confint(lmod)

################ new material starts here

# diagnostics
linpred=predict(lmod)
predprob=predict(lmod,type='response')
head(linpred)
head(predprob)
head(ilogit(linpred))
rawres=wcgs$y-predprob
plot(rawres~linpred,xlab='Linear Predictor',ylab='Residuals')
plot(residuals(lmod)~linpred,xlab='Linear Predictor',ylab='Deviance Residuals')

library(dplyr)
wcgs=mutate(wcgs,residuals=residuals(lmod),linpred=predict(lmod))
gdf=group_by(wcgs,cut(linpred,breaks=unique(quantile(linpred,(1:100)/101))))
diagdf=summarise(gdf,residuals=mean(residuals),linpred=mean(linpred))
plot(residuals~linpred,diagdf,xlab='Linear Predictor',pch=20)

gdf=group_by(wcgs,height)
diagdf=summarise(gdf,residuals=mean(residuals))
ggplot(diagdf,aes(x=height,y=residuals))+geom_point()
filter(wcgs,height==77) %>% select(height,cigs,chd,residuals)

############ End of material covered in class Jan 31 ############
# The remaining material is for future classes - check back for updates!
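# An aside on the deviance test in the Inference section above: rather than typing in
# the rounded deviances (1781.2 and 1749.0) by hand, the same chi-squared test can be
# computed from the components stored in the fitted glm object. This is only a sketch;
# lmod here is still the two-predictor model chd~height+cigs fit above.
with(lmod,pchisq(null.deviance-deviance,df.null-df.residual,lower.tail=F))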
group_by(wcgs,cigs) %>% summarise(residuals=mean(residuals),count=n()) %>%
  ggplot(aes(x=cigs,y=residuals,size=sqrt(count)))+geom_point()
qqnorm(residuals(lmod))
halfnorm(hatvalues(lmod))
filter(wcgs,hatvalues(lmod)>0.015) %>% select(height,cigs,chd)

# sequential search for best model using AIC
wcgs$bmi=with(wcgs,703*weight/(height^2))
lmod=glm(chd~age+height+weight+bmi+sdp+dbp+chol+dibep+cigs+arcus,family=binomial,wcgs)
lmodr=step(lmod,trace=0)
sumary(lmod)
sumary(lmodr)

## compare previous "best" model with one that includes only dbp
drop1(glm(chd~dbp,family=binomial,wcgs),test='Chi')
# the conclusion is that dbp on its own is significant even though it was not
# included in the best model by AIC

# goodness of fit - section 2.6
wcgsm=na.omit(wcgs)
wcgsm=mutate(wcgsm,predprob=predict(lmod,type='response'))
gdf=group_by(wcgsm,cut(linpred,breaks=unique(quantile(linpred,(1:100)/101))))
hldf=summarise(gdf,y=sum(y),ppred=mean(predprob),count=n())
hldf=mutate(hldf,se.fit=sqrt(ppred*(1-ppred)/count))
ggplot(hldf,aes(x=ppred,y=y/count,ymin=y/count-2*se.fit,ymax=y/count+2*se.fit))+
  geom_point()+geom_linerange(color=grey(0.75))+geom_abline(intercept=0,slope=1)+
  xlab('Predicted Probability')+ylab('Observed Proportion')
hlstat=with(hldf,sum((y-count*ppred)^2/(count*ppred*(1-ppred))))
c(hlstat,nrow(hldf))
# the book gets a chi-square statistic of 63.212 with 56 groups but I get 62.832 with 54
pchisq(63.212,56-1,lower.tail=F)
pchisq(62.832,54-1,lower.tail=F)
# p-values 0.209 the first way and 0.167 the second way - not statistically significant

# Here is an alternative way to do the Hosmer-Lemeshow test.
# Note three different choices of the number of groups g,
# but none of them gives a statistically significant result.
# Install package 'ResourceSelection' first.
library(ResourceSelection)
hoslem.test(lmodr$y,lmodr$fitted, g = 10)
hoslem.test(lmodr$y,lmodr$fitted, g = 50)
hoslem.test(lmodr$y,lmodr$fitted, g = 100)

# evaluate the logistic regression as a predictor of heart disease:
# compare different thresholds for predprob and compute the Specificity, Sensitivity
# and total Misclassification rate
wcgsm=mutate(wcgsm,predout=ifelse(predprob<0.5,"no","yes"))
xtabs(~chd+predout,wcgsm)
# proportion of correct classifications at the 0.5 threshold
(2882+2)/(2882+3+253+2)
thresh=seq(0.01,0.5,0.01)
Sensitivity=numeric(length(thresh))
Specificity=numeric(length(thresh))
Misclassify=numeric(length(thresh))
for(j in seq(along=thresh)){
  pp=ifelse(wcgsm$predprob<thresh[j],"no","yes")
  xx=xtabs(~chd+pp,wcgsm)
  Specificity[j]=xx[1,1]/(xx[1,1]+xx[1,2])
  Sensitivity[j]=xx[2,2]/(xx[2,1]+xx[2,2])
  Misclassify[j]=(xx[1,2]+xx[2,1])/sum(xx)
}
matplot(thresh,cbind(Sensitivity,Specificity,Misclassify),type='l',
        xlab='Threshold',ylab='Proportion')

# estimation problems - perfect separation in a two-species subset of the iris data
irisr=filter(iris,Species!="virginica") %>% select(Sepal.Width,Sepal.Length,Species)
(p=ggplot(irisr,aes(x=Sepal.Width,y=Sepal.Length,shape=Species))+geom_point())
lmod=glm(Species~Sepal.Width+Sepal.Length,family=binomial,irisr)
sumary(lmod)
library(brglm)
bmod=brglm(Species~Sepal.Width+Sepal.Length,family=binomial,irisr)
summary(bmod)
p+geom_abline(intercept=(0.5+24.51)/9.73,slope=8.9/9.73)
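# Another way to draw the separating line: instead of the hand-entered numbers above,
# the p=0.5 decision boundary implied by the bias-reduced fit can be taken from
# coef(bmod) directly. With coefficients (b0, b1, b2) for the intercept, Sepal.Width
# and Sepal.Length, the boundary is where the linear predictor is zero, i.e.
# Sepal.Length = -(b0+b1*Sepal.Width)/b2. This is only a sketch and may differ
# slightly from the hand-entered line.
bb=coef(bmod)
p+geom_abline(intercept=-bb[1]/bb[3],slope=-bb[2]/bb[3],linetype=2)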