# Chapter 11: Faraway

# 11.1 - PCA method

# fat data - plots to illustrate thar body dimensions variables are highly correlated

library(faraway)
data(fat)
par(mfrow=c(1,3))
plot(neck~knee,fat)
plot(chest~thigh,fat)
plot(hip~wrist,fat)

par(mfrow=c(1,1))

# compute a PCA of these variables

cfat=fat[,9:18]
prfat=prcomp(cfat)
dim(prfat$rot)
dim(prfat$x)

############ additional code added to check assumptions
# verify matrix assertions on p. 162
X=as.matrix(cfat)
# center X about its column means
for(i in 1:ncol(X)){X[,i]=X[,i]-mean(X[,i])}
U=as.matrix(prfat$rot)
Z=as.matrix(prfat$x)
range(Z-X%*%U)
# conclusion: in matrix notation, Z = X U

# U is orthonormal
W=t(U) %*% U
diag(W)=0
range(W)

# Z is orthonormal but diag entries are not 1
W=t(Z) %*% Z
diag(W)=0
range(W)
############ end of additional code

# check variance proportions and show weights of first PC
summary(prfat)
round(prfat$rot[,1],2)

# alternative formulation: scaled PCs
prfatc=prcomp(cfat,scale=T)
summary(prfatc)
round(prfatc$rot[,1],2)
# first PC weights columns almost equally - "overall size" interpretation


round(prfatc$rot[,2],2)
# second PC is contrast between "body center" and "body extremities" measures

# Mahalanobis distance to assess outliers
library(MASS)
robfat=cov.rob(cfat) # use robust measures of center and covariance
md=mahalanobis(cfat,center=robfat$center,cov=robfat$cov)
n=nrow(cfat)
p=ncol(cfat)
plot(qchisq(1:n/(n+1),p),sort(md),xlab=expression(paste(chi^2," quantiles")),
ylab='Sorted Mahalanobis Distances',pch=20)
abline(0,1)

# now turn to PCR regression

# fat data: first fit all body-measurement variables

lmoda=lm(fat$brozek~.,data=cfat)
summary(lmoda)

# now do PCR regression: first two PCs

lmodpcr=lm(fat$brozek~prfatc$x[,1:2])
summary(lmodpcr)

# increase the number of PCs

lmodpcr3=lm(fat$brozek~prfatc$x[,1:3])
summary(lmodpcr3)

lmodpcr4=lm(fat$brozek~prfatc$x[,1:4])
summary(lmodpcr4)

lmodpcr10=lm(fat$brozek~prfatc$x[,1:10])
summary(lmodpcr10)

# note caution: the higher-order PCs may contain more explanatory power

# contrast with PLS regression - later

# try to interpret the PCs in terms of specific variables

lmodr=lm(fat$brozek~scale(abdom)+I(scale(ankle)-scale(abdom)),data=cfat)
summary(lmodr)

# actually, this does BETTER than the 2PC model

# new dataset - Faraway page 167

library(faraway)
data(meatspec)
trainmeat=meatspec[1:172,]
testmeat=meatspec[173:215,]
modlm=lm(fat~.,trainmeat)
summary(modlm)

# look at predictive power in test sample
rmse=function(x,y){sqrt(mean((x-y)^2))}
rmse(fitted(modlm),trainmeat$fat)
rmse(predict(modlm,testmeat),testmeat$fat)

# interpretation: using all 100 covariates makes the out of sample predictions much worse

# use stepwise regression to select variables

modsteplm=step(modlm)
rmse(fitted(modsteplm),trainmeat$fat)
rmse(predict(modsteplm,testmeat),testmeat$fat)

# helps a little bit - out of sample RMSE from 3.81 to 3.59

meatpca=prcomp(trainmeat[,-101])

round(meatpca$sdev,3)
# interpretation: almost all of the variability is in first 4 PCs

# plot graphs of largest 4 PCs
matplot(1:100,meatpca$rot[,1:4],type='l',xlab='Frequency',ylab='',col=1)

modlmpc=lm(fat~meatpca$x[,1:4],trainmeat)
summary(modlmpc)
#parameters much more stable but no immediate way to us "predict" command because we changed the x variables
# ocmpared with the original dataset


# run on largest 4 PCs using pls package
library(pls)
pcrmod=pcr(fat~.,data=trainmeat,ncomp=50)

rmse(predict(pcrmod,ncomp=4),trainmeat$fat)
# not as good as using all 100 variables (in fact seems quite a lot worse)

# plot original and shrinkage coefficients
par(mfrow=c(1,2))
plot(modlm$coef[-1],xlab='Frequency',ylab='Coefficient',type='l')
coefplot(pcrmod,ncomp=4,xlab='Frequency',main='')

# screeplot
par(mfrow=c(1,1))
plot(meatpca$sdev[1:10],type='l',ylab='SD of PC',xlab='PC number')
# elbow at 5?

rmse(predict(pcrmod,testmeat,ncomp=4),testmeat$fat)
# not too impressive

# however we can also do --
par(mfrow=c(1,2))
pcrmse=RMSEP(pcrmod,newdata=testmeat)
plot(pcrmse,main='')
which.min(pcrmse$val)
min(pcrmse$val)

# plots to illustrate cross-validation

# this is optional
set.seed(123)

pcrmod=pcr(fat~.,data=trainmeat,validation='CV',ncomp=50)
pcrCV=RMSEP(pcrmod,estimate='CV')

plot(pcrCV,main='')
ypred=predict(pcrmod,testmeat,ncomp=18)
rmse(ypred,testmeat$fat)

which.min(pcrCV$val[1,1,])
ypred=predict(pcrmod,testmeat,ncomp=which.min(pcrCV$val[1,1,])-1)
rmse(ypred,testmeat$fat)

ypred=predict(pcrmod,testmeat,ncomp=32)
rmse(ypred,testmeat$fat)

# Faraway section 11.2 - PLS


# this is optional
# set.seed(123)
plsmod=plsr(fat~.,data=meatspec[1:172,],ncomp=50,validation='CV')
coefplot(plsmod,ncomp=4,xlab='Frequency')
plsCV=RMSEP(plsmod,estimate='CV')
plot(plsCV,main='')

# actually, plsCV is smallest at ncomp=14 (print it out)

ypred=predict(plsmod,ncomp=14)
rmse(ypred,trainmeat$fat)
# similar to PCR

ytpred=predict(plsmod,testmeat,ncomp=14)
rmse(ytpred,testmeat$fat)
# compare 2.13 by PCR

### Faraway section 11.3 - ridge regression

require(MASS)
par(mfrow=c(1,1))
rgmod=lm.ridge(fat~.,trainmeat,lambda=seq(0,5e-8,len=21))
matplot(rgmod$lambda,coef(rgmod),type='l',xlab=expression(lambda),ylab=expression(hat(beta)),col=1)

# called the ridge trace plot

# select lambda via GCV
which.min(rgmod$GCV)
abline(v=1.75e-08)


ypred=cbind(1,as.matrix(trainmeat[,-101])) %*% coef(rgmod)[12,]
rmse(ypred,trainmeat$fat)

ypred=cbind(1,as.matrix(testmeat[,-101])) %*% coef(rgmod)[12,]
rmse(ypred,testmeat$fat)

# observation 13 is bad:
plot(ypred,testmeat$fat)

points(ypred[13],testmeat$fat[13],col='red',pch=20)



c(ytpred[13],ypred[13],testmeat$fat[13])
rmse(ypred[-13],testmeat$fat[-13])


# section 11.4 - lasso
# least absolute shrinkage and selection operator

library(lars)
data(state)
statedata=data.frame(state.x77,row.names=state.abb)
lmod=lars(as.matrix(statedata[,-4]),statedata$Life)

par(mfrow=c(1,2))
plot(lmod)
cvlmod=cv.lars(as.matrix(statedata[,-4]),statedata$Life)

set.seed(123)
cvlmod$index[which.min(cvlmod$cv)]
predict(lmod,s=0.65657,type='coef',mode='fraction')$coef


coef(lm(Life.Exp~Population+Murder+HS.Grad+Frost,statedata))

# fat data
par(mfrow=c(1,1))
trainy=trainmeat$fat
trainx=as.matrix(trainmeat[,-101])
lassomod=lars(trainx,trainy)
cvout=cv.lars(trainx,trainy)

cvout$index[which.min(cvout$cv)]

testx=as.matrix(testmeat[,-101])
predlars=predict(lassomod,testx,s=0.0101,mode='fraction')
rmse(testmeat$fat,predlars$fit)

predlars=predict(lassomod,s=0.0101,type='coef',mode='fraction')
plot(predlars$coef,type='h',ylab='Coefficient')
sum(predlars$coef!=0)