library(glmnet) data(QuickStartExample) names(QuickStartExample) x=QuickStartExample$x y=QuickStartExample$y dim(x) dim(y) # some initial examination of data titl=c('x[,1]','x[,2]','x[,3]','x[,4]','x[,5]','x[,6]','x[,7]','x[,8]','x[,9]','x[,10]', 'x[,11]','x[,12]','x[,13]','x[,14]','x[,15]','x[,16]','x[,17]','x[,18]','x[,19]','x[,20]') par(mfrow=c(4,5)) for(i in 1:20){hist(x[,i],breaks=10,xlab='',main=titl[i])} for(i in 1:20){plot(x[,i],y,pch=20,main=titl[i])} par(mfrow=c(1,1)) hist(y,breaks=10) par(mfrow=c(1,2)) boxplot(data.frame(x[,1:20])) stripchart(data.frame(x),method='jitter',las=2,vertical=TRUE) # looking for scaling and multicollinearity M=rep(20) S=rep(20) for(i in 1:20){M[i]=mean(x[,i]) S[i]=sqrt(var(x[,i]))} evals=eigen(t(x)%*% x)$val sqrt(evals[1]/evals) # largest condition number is 2.26 #variance inflation factors computed in section 5.2.2 of Smith and Young: x1=scale(x)/sqrt(99) # scaling makes diagonal entries of X^TX equal to 1 diag(t(x1)%*%x1) library(MASS) diag(ginv(t(x1)%*%x1)) # alternatively: vif(x1) # now do some fitting with glmnet fit=glmnet(x,y) par(mfrow=c(1,1)) plot(fit) cvfit=cv.glmnet(x,y,nfolds=100) # note use of nfolds here plot(cvfit) cvfit$lambda.min coef(cvfit, s = "lambda.min") sum(abs(coef(cvfit, s = "lambda.min")[2:21])) par(mfrow=c(1,1),cex=1.3) plot(fit) mtext(side=3,line=2,'glmnet test dataset',cex=2) lines(c(6.075728,6.075728),c(-10,10)) text(6.45,1.1,expression(lambda==0.0757)) text(6.48,0.95,expression(sum(abs(beta[j]))==6.076)) # see what the minimum CV score actually is which(cvfit$lambda==cvfit$lambda.min) cvfit$cvm cvfit$cvsd cvfit$cvm[34] cvfit$cvsd[34] # min CV score is 1.02 with a SD of 0.13 # alternative value of alpha fit=glmnet(x,y,alpha=0.2) plot(fit) cvfit=cv.glmnet(x,y,alpha=0.2,nfolds=100) plot(cvfit) cvfit$lambda.min coef(cvfit, s = "lambda.min") which(fit$lambda==cvfit$lambda.min) cvfit$cvm[43] # min CV score here is 1.097... # recall nuclear power data nukes=read.table('d:/stor664-f21/nukes.txt',header=T) nukes$LC=log(nukes$C) nukes$LT1=log(nukes$T1) nukes$LT2=log(nukes$T2) nukes$LS=log(nukes$S) nukes$LN=log(nukes$N) lm4=lm(LC~D+LT1+LT2+LS+PR+NE+CT+BW+LN+PT,nukes) lm5=lm(LC~D+LS+NE+CT+LN+PT,nukes) anova(lm4,lm5) summary(lm5) # do similar scaling things here x=cbind(nukes$D,nukes$LT1,nukes$LT2,nukes$LS,nukes$PR,nukes$NE,nukes$CT,nukes$BW,nukes$LN,nukes$PT) y=nukes$LC M=rep(10) S=rep(10) for(i in 1:10){M[i]=mean(x[,i]) S[i]=sqrt(var(x[,i]))} eigen(t(x) %*% x)$val # condition indices here sqrt(eigen(t(x) %*% x)$val[1]/eigen(t(x) %*% x)$val) x=scale(x) eigen(t(x) %*% x)$val sqrt(eigen(t(x) %*% x)$val[1]/eigen(t(x) %*% x)$val) # repeat earlier model fit lm5a=lm(y~x[,c(1,4,6,7,9,10)]) summary(lm5a) # essentially the same as model m5 fit=glmnet(x,y) cvfit=cv.glmnet(x,y,nfolds=32) plot(cvfit) coef(cvfit, s = cvfit$lambda.min) sum(abs(coef(cvfit, s = cvfit$lambda.min)[2:11])) # value is 0.5027698 - this explain the next piece of code plot(fit) lines(c(0.503,0.503),c(-10,10)) cvfit$cvm[which(fit$lambda==cvfit$lambda.min)] # minimum CV score is 0.03827464 # compare LS fit in this case - not parameters in above model are V1, V3, V4, V6, V7, V10 lm6=lm(y~x[,c(1,3,4,6,7,10)]) summary(lm6) # redo CV score with this fit - use smallest lambda in this case fit1=glmnet(x[,c(1,4,6,7,9,10)],y) plot(fit1) cvfit1=cv.glmnet(x[,c(1,4,6,7,9,10)],y,nfolds=32) plot(cvfit1) fit1$beta[,60] cvfit1$cvm[60] # this is 0.03783014 - slightly smaller than lasso best fit coef(cvfit1, s = 0.001036283) # alternative approach for CV score nukes=read.table('C:/Users/rls/aug20/UNC/STOR664/R-Code/nukes.txt',header=T) nukes$LC=log(nukes$C) nukes$LT1=log(nukes$T1) nukes$LT2=log(nukes$T2) nukes$LS=log(nukes$S) nukes$LN=log(nukes$N) lm4=lm(LC~D+LT1+LT2+LS+PR+NE+CT+BW+LN+PT,nukes) lm5=lm(LC~D+LS+NE+CT+LN+PT,nukes) MSPE=0 for(i in 1:32){ lm5a=lm(LC~D+LS+NE+CT+LN+PT,nukes,subset=-i) MSPE=MSPE+(predict(lm5a,newdata=nukes)[i]-nukes$LC[i])^2 } MSPE/32 # result is 0.03579819 ################################ new dataset - ozone data library(faraway) data(ozone) par(mfrow=c(2,4)) hist(ozone$vh) hist(ozone$wind) hist(ozone$humidity) hist(ozone$temp) hist(ozone$ibh) hist(ozone$dpg) hist(ozone$ibt) hist(ozone$vis) par(mfrow=c(1,3)) hist(ozone$O3) hist(log(ozone$O3)) hist(sqrt(ozone$O3)) x=as.matrix(ozone[,2:9]) y=as.vector(ozone[,1]) lm1=lm(y~x) summary(lm1) par(mfrow=c(1,1)) library(MASS) boxcox(lm1) boxcox(lm1,lambda=0.01*(0:50)) y=y^0.25 hist(y,breaks=12) # in this case use a holdout set - every tenth observation # rescale x matrix first x=scale(x) u=rep(F,330) u[10*rep(1:33)]=T ytest=y[u] xtest=x[u,] yfit=y[!u] xfit=x[!u,] ozonefit=data.frame(yfit,xfit) ozonetest=data.frame(ytest,xtest) ## write two datasets to separate files, then reload ## ## (I don't think this step is actually necessary, but I was having a ## problem at one point) write.csv(ozonefit,'C:/Users/rls/aug20/UNC/STOR664/R-Code/ozonefit.csv') write.csv(ozonetest,'C:/Users/rls/aug20/UNC/STOR664/R-Code/ozonetest.csv') ######## reload ozonefit=read.csv('C:/Users/rls/aug20/UNC/STOR664/R-Code/ozonefit.csv') ozonetest=read.csv('C:/Users/rls/aug20/UNC/STOR664/R-Code/ozonetest.csv') # initial fits - best subsets regression require(leaps) lm1=regsubsets(yfit~vh+wind+humidity+temp+ibh+dpg+ibt+vis,data=ozonefit) lm1s=summary(lm1) lm1=regsubsets(yfit~vh+wind+humidity+temp+ibh+dpg+ibt+vis,data=ozonefit) lm1s=summary(lm1) lm1s$which AIC=297*log(lm1s$rss/297)+(1:8)*2 plot(AIC~I(1:8),ylab='AIC',xlab='Number of Predictors',pch=20) plot(AIC[3:8]~I(3:8),ylab='AIC',xlab='Number of Predictors',pch=20) # include x columns 3,4,5 and maybe 8 # do a cross-validation three ways MSPE=0 for(i in 1:297){ lm1a=lm(yfit~humidity+temp+ibh,data=ozonefit,subset=-i) MSPE=MSPE+(predict(lm1a,newdata=ozonefit)[i]-ozonefit$yfit[i])^2 } MSPE/297 MSPE=0 for(i in 1:297){ lm1a=lm(yfit~vh+wind+humidity+temp+ibh+dpg+ibt+vis,data=ozonefit,subset=-i) MSPE=MSPE+(predict(lm1a,newdata=ozonefit)[i]-ozonefit$yfit[i])^2 } MSPE/297 MSPE=0 for(i in 1:297){ lm1a=lm(yfit~humidity+temp+ibh+vis,data=ozonefit,subset=-i) MSPE=MSPE+(predict(lm1a,newdata=ozonefit)[i]-ozonefit$yfit[i])^2 } MSPE/297 # now try on test data lm1=lm(yfit~humidity+temp+ibh,data=ozonefit) pr1=predict(lm1,newdata=ozonetest) print(sum((pr1-ozonetest$ytest)^2/33)) lm1=lm(yfit~vh+wind+humidity+temp+ibh+dpg+ibt+vis,data=ozonefit) pr1=predict(lm1,newdata=ozonetest) print(sum((pr1-ozonetest$ytest)^2/33)) lm1=lm(yfit~humidity+temp+ibh+vis,data=ozonefit) pr1=predict(lm1,newdata=ozonetest) print(sum((pr1-ozonetest$ytest)^2/33)) # lasso fits library(glmnet) xfit=as.matrix(ozonefit[,2:9]) yfit=as.vector(ozonefit[,1]) fit=glmnet(xfit,yfit) plot(fit) cvfit=cv.glmnet(xfit,yfit) which.min(cvfit$cvm) cvfit$cvm[38:42] coef(fit)[,41] lm1=lm(yfit~humidity+temp+ibh+vis,data=ozonefit) lm1$coef xtest=as.matrix(ozonetest[,2:9]) ytest=ozonetest[,1] pr1=predict(fit,xtest) print(sum((pr1[,41]-ytest)^2)/33) pr2=predict(lm1,newdata=ozonetest) print(sum((pr2-ytest)^2)/33)