# Chapter 11: Faraway # 11.1 - PCA method # fat data - plots to illustrate thar body dimensions variables are highly correlated library(faraway) data(fat) par(mfrow=c(1,3)) plot(neck~knee,fat) plot(chest~thigh,fat) plot(hip~wrist,fat) par(mfrow=c(1,1)) # compute a PCA of these variables cfat=fat[,9:18] prfat=prcomp(cfat) dim(prfat$rot) dim(prfat$x) ############ additional code added to check assumptions # verify matrix assertions on p. 162 X=as.matrix(cfat) # center X about its column means for(i in 1:ncol(X)){X[,i]=X[,i]-mean(X[,i])} U=as.matrix(prfat$rot) Z=as.matrix(prfat$x) range(Z-X%*%U) # conclusion: in matrix notation, Z = X U # U is orthonormal W=t(U) %*% U diag(W)=0 range(W) # Z is orthonormal but diag entries are not 1 W=t(Z) %*% Z diag(W)=0 range(W) ############ end of additional code # check variance proportions and show weights of first PC summary(prfat) round(prfat$rot[,1],2) # alternative formulation: scaled PCs prfatc=prcomp(cfat,scale=T) summary(prfatc) round(prfatc$rot[,1],2) # first PC weights columns almost equally - "overall size" interpretation round(prfatc$rot[,2],2) # second PC is contrast between "body center" and "body extremities" measures # Mahalanobis distance to assess outliers library(MASS) robfat=cov.rob(cfat) # use robust measures of center and covariance md=mahalanobis(cfat,center=robfat$center,cov=robfat$cov) n=nrow(cfat) p=ncol(cfat) plot(qchisq(1:n/(n+1),p),sort(md),xlab=expression(paste(chi^2," quantiles")), ylab='Sorted Mahalanobis Distances',pch=20) abline(0,1) # now turn to PCR regression # fat data: first fit all body-measurement variables lmoda=lm(fat$brozek~.,data=cfat) summary(lmoda) # now do PCR regression: first two PCs lmodpcr=lm(fat$brozek~prfatc$x[,1:2]) summary(lmodpcr) # increase the number of PCs lmodpcr3=lm(fat$brozek~prfatc$x[,1:3]) summary(lmodpcr3) lmodpcr4=lm(fat$brozek~prfatc$x[,1:4]) summary(lmodpcr4) lmodpcr10=lm(fat$brozek~prfatc$x[,1:10]) summary(lmodpcr10) # note caution: the higher-order PCs may contain more explanatory power # contrast with PLS regression - later # try to interpret the PCs in terms of specific variables lmodr=lm(fat$brozek~scale(abdom)+I(scale(ankle)-scale(abdom)),data=cfat) summary(lmodr) # actually, this does BETTER than the 2PC model # new dataset - Faraway page 167 library(faraway) data(meatspec) trainmeat=meatspec[1:172,] testmeat=meatspec[173:215,] modlm=lm(fat~.,trainmeat) summary(modlm) # look at predictive power in test sample rmse=function(x,y){sqrt(mean((x-y)^2))} rmse(fitted(modlm),trainmeat$fat) rmse(predict(modlm,testmeat),testmeat$fat) # interpretation: using all 100 covariates makes the out of sample predictions much worse # use stepwise regression to select variables modsteplm=step(modlm) rmse(fitted(modsteplm),trainmeat$fat) rmse(predict(modsteplm,testmeat),testmeat$fat) # helps a little bit - out of sample RMSE from 3.81 to 3.59 meatpca=prcomp(trainmeat[,-101]) round(meatpca$sdev,3) # interpretation: almost all of the variability is in first 4 PCs # plot graphs of largest 4 PCs matplot(1:100,meatpca$rot[,1:4],type='l',xlab='Frequency',ylab='',col=1) modlmpc=lm(fat~meatpca$x[,1:4],trainmeat) summary(modlmpc) #parameters much more stable but no immediate way to us "predict" command because we changed the x variables # ocmpared with the original dataset # run on largest 4 PCs using pls package library(pls) pcrmod=pcr(fat~.,data=trainmeat,ncomp=50) rmse(predict(pcrmod,ncomp=4),trainmeat$fat) # not as good as using all 100 variables (in fact seems quite a lot worse) # plot original and shrinkage coefficients par(mfrow=c(1,2)) plot(modlm$coef[-1],xlab='Frequency',ylab='Coefficient',type='l') coefplot(pcrmod,ncomp=4,xlab='Frequency',main='') # screeplot par(mfrow=c(1,1)) plot(meatpca$sdev[1:10],type='l',ylab='SD of PC',xlab='PC number') # elbow at 5? rmse(predict(pcrmod,testmeat,ncomp=4),testmeat$fat) # not too impressive # however we can also do -- par(mfrow=c(1,2)) pcrmse=RMSEP(pcrmod,newdata=testmeat) plot(pcrmse,main='') which.min(pcrmse$val) min(pcrmse$val) # plots to illustrate cross-validation # this is optional set.seed(123) pcrmod=pcr(fat~.,data=trainmeat,validation='CV',ncomp=50) pcrCV=RMSEP(pcrmod,estimate='CV') plot(pcrCV,main='') ypred=predict(pcrmod,testmeat,ncomp=18) rmse(ypred,testmeat$fat) which.min(pcrCV$val[1,1,]) ypred=predict(pcrmod,testmeat,ncomp=which.min(pcrCV$val[1,1,])-1) rmse(ypred,testmeat$fat) ypred=predict(pcrmod,testmeat,ncomp=32) rmse(ypred,testmeat$fat) # Faraway section 11.2 - PLS # this is optional # set.seed(123) plsmod=plsr(fat~.,data=meatspec[1:172,],ncomp=50,validation='CV') coefplot(plsmod,ncomp=4,xlab='Frequency') plsCV=RMSEP(plsmod,estimate='CV') plot(plsCV,main='') # actually, plsCV is smallest at ncomp=14 (print it out) ypred=predict(plsmod,ncomp=14) rmse(ypred,trainmeat$fat) # similar to PCR ytpred=predict(plsmod,testmeat,ncomp=14) rmse(ytpred,testmeat$fat) # compare 2.13 by PCR ### Faraway section 11.3 - ridge regression require(MASS) par(mfrow=c(1,1)) rgmod=lm.ridge(fat~.,trainmeat,lambda=seq(0,5e-8,len=21)) matplot(rgmod$lambda,coef(rgmod),type='l',xlab=expression(lambda),ylab=expression(hat(beta)),col=1) # called the ridge trace plot # select lambda via GCV which.min(rgmod$GCV) abline(v=1.75e-08) ypred=cbind(1,as.matrix(trainmeat[,-101])) %*% coef(rgmod)[12,] rmse(ypred,trainmeat$fat) ypred=cbind(1,as.matrix(testmeat[,-101])) %*% coef(rgmod)[12,] rmse(ypred,testmeat$fat) # observation 13 is bad: plot(ypred,testmeat$fat) points(ypred[13],testmeat$fat[13],col='red',pch=20) c(ytpred[13],ypred[13],testmeat$fat[13]) rmse(ypred[-13],testmeat$fat[-13]) # section 11.4 - lasso # least absolute shrinkage and selection operator library(lars) data(state) statedata=data.frame(state.x77,row.names=state.abb) lmod=lars(as.matrix(statedata[,-4]),statedata$Life) par(mfrow=c(1,2)) plot(lmod) cvlmod=cv.lars(as.matrix(statedata[,-4]),statedata$Life) set.seed(123) cvlmod$index[which.min(cvlmod$cv)] predict(lmod,s=0.65657,type='coef',mode='fraction')$coef coef(lm(Life.Exp~Population+Murder+HS.Grad+Frost,statedata)) # fat data par(mfrow=c(1,1)) trainy=trainmeat$fat trainx=as.matrix(trainmeat[,-101]) lassomod=lars(trainx,trainy) cvout=cv.lars(trainx,trainy) cvout$index[which.min(cvout$cv)] testx=as.matrix(testmeat[,-101]) predlars=predict(lassomod,testx,s=0.0101,mode='fraction') rmse(testmeat$fat,predlars$fit) predlars=predict(lassomod,s=0.0101,type='coef',mode='fraction') plot(predlars$coef,type='h',ylab='Coefficient') sum(predlars$coef!=0)