library(glmnet)
data(QuickStartExample)

names(QuickStartExample)
x=QuickStartExample$x
y=QuickStartExample$y
dim(x)
dim(y)




# some initial examination of data


titl=c('x[,1]','x[,2]','x[,3]','x[,4]','x[,5]','x[,6]','x[,7]','x[,8]','x[,9]','x[,10]',
'x[,11]','x[,12]','x[,13]','x[,14]','x[,15]','x[,16]','x[,17]','x[,18]','x[,19]','x[,20]')
par(mfrow=c(4,5))
for(i in 1:20){hist(x[,i],breaks=10,xlab='',main=titl[i])}

for(i in 1:20){plot(x[,i],y,pch=20,main=titl[i])}

par(mfrow=c(1,1))
hist(y,breaks=10)

par(mfrow=c(1,2))
boxplot(data.frame(x[,1:20]))
stripchart(data.frame(x),method='jitter',las=2,vertical=TRUE)

# looking for scaling and multicollinearity

M=rep(20)
S=rep(20)

for(i in 1:20){M[i]=mean(x[,i])
S[i]=sqrt(var(x[,i]))}

evals=eigen(t(x)%*% x)$val

sqrt(evals[1]/evals)
# largest condition number is 2.26

#variance inflation factors computed in section 5.2.2 of Smith and Young:
x1=scale(x)/sqrt(99) # scaling makes diagonal entries of X^TX equal to 1
diag(t(x1)%*%x1)
library(MASS)
diag(ginv(t(x1)%*%x1))

# alternatively:
vif(x1)



# now do some fitting with glmnet

fit=glmnet(x,y)

par(mfrow=c(1,1))
plot(fit)

cvfit=cv.glmnet(x,y,nfolds=100)
# note use of nfolds here
plot(cvfit)
cvfit$lambda.min
coef(cvfit, s = "lambda.min")

sum(abs(coef(cvfit, s = "lambda.min")[2:21]))

par(mfrow=c(1,1),cex=1.3)
plot(fit)
mtext(side=3,line=2,'glmnet test dataset',cex=2)
lines(c(6.075728,6.075728),c(-10,10))
text(6.45,1.1,expression(lambda==0.0757))
text(6.48,0.95,expression(sum(abs(beta[j]))==6.076))

# see what the minimum CV score actually is

which(cvfit$lambda==cvfit$lambda.min)

cvfit$cvm
cvfit$cvsd
cvfit$cvm[34]
cvfit$cvsd[34]

# min CV score is 1.02 with a SD of 0.13


# alternative value of alpha
fit=glmnet(x,y,alpha=0.2)
plot(fit)
cvfit=cv.glmnet(x,y,alpha=0.2,nfolds=100)
plot(cvfit)
cvfit$lambda.min
coef(cvfit, s = "lambda.min")
which(fit$lambda==cvfit$lambda.min)
cvfit$cvm[43]
# min CV score here is 1.097...

# recall nuclear power data


nukes=read.table('d:/stor664-f21/nukes.txt',header=T)
nukes$LC=log(nukes$C)
nukes$LT1=log(nukes$T1)
nukes$LT2=log(nukes$T2)
nukes$LS=log(nukes$S)
nukes$LN=log(nukes$N)
lm4=lm(LC~D+LT1+LT2+LS+PR+NE+CT+BW+LN+PT,nukes)


lm5=lm(LC~D+LS+NE+CT+LN+PT,nukes)
anova(lm4,lm5)
summary(lm5)

# do similar scaling things here
x=cbind(nukes$D,nukes$LT1,nukes$LT2,nukes$LS,nukes$PR,nukes$NE,nukes$CT,nukes$BW,nukes$LN,nukes$PT)
y=nukes$LC
M=rep(10)
S=rep(10)
for(i in 1:10){M[i]=mean(x[,i])
S[i]=sqrt(var(x[,i]))}
eigen(t(x) %*% x)$val

# condition indices here

sqrt(eigen(t(x) %*% x)$val[1]/eigen(t(x) %*% x)$val)


x=scale(x)
eigen(t(x) %*% x)$val
sqrt(eigen(t(x) %*% x)$val[1]/eigen(t(x) %*% x)$val)

# repeat earlier model fit 
lm5a=lm(y~x[,c(1,4,6,7,9,10)])
summary(lm5a)
# essentially the same as model m5



fit=glmnet(x,y)
cvfit=cv.glmnet(x,y,nfolds=32)
plot(cvfit)
coef(cvfit, s = cvfit$lambda.min)
sum(abs(coef(cvfit, s = cvfit$lambda.min)[2:11]))
# value is 0.5027698 - this explain the next piece of code
plot(fit)
lines(c(0.503,0.503),c(-10,10))

cvfit$cvm[which(fit$lambda==cvfit$lambda.min)]
# minimum CV score is 0.03827464

# compare LS fit in this case - not parameters in above model are V1, V3, V4, V6, V7, V10
lm6=lm(y~x[,c(1,3,4,6,7,10)])
summary(lm6)

# redo CV score with this fit - use smallest lambda in this case

fit1=glmnet(x[,c(1,4,6,7,9,10)],y)
plot(fit1)
cvfit1=cv.glmnet(x[,c(1,4,6,7,9,10)],y,nfolds=32)
plot(cvfit1)
fit1$beta[,60]
cvfit1$cvm[60]
# this is 0.03783014 - slightly smaller than lasso best fit
coef(cvfit1, s = 0.001036283)

# alternative approach for CV score

nukes=read.table('C:/Users/rls/aug20/UNC/STOR664/R-Code/nukes.txt',header=T)
nukes$LC=log(nukes$C)
nukes$LT1=log(nukes$T1)
nukes$LT2=log(nukes$T2)
nukes$LS=log(nukes$S)
nukes$LN=log(nukes$N)
lm4=lm(LC~D+LT1+LT2+LS+PR+NE+CT+BW+LN+PT,nukes)
lm5=lm(LC~D+LS+NE+CT+LN+PT,nukes)

MSPE=0
for(i in 1:32){
lm5a=lm(LC~D+LS+NE+CT+LN+PT,nukes,subset=-i)
MSPE=MSPE+(predict(lm5a,newdata=nukes)[i]-nukes$LC[i])^2
}
MSPE/32

# result is 0.03579819 









################################ new dataset - ozone data


library(faraway)
data(ozone)
par(mfrow=c(2,4))
hist(ozone$vh)
hist(ozone$wind)
hist(ozone$humidity)
hist(ozone$temp)
hist(ozone$ibh)
hist(ozone$dpg)
hist(ozone$ibt)
hist(ozone$vis)

par(mfrow=c(1,3))
hist(ozone$O3)
hist(log(ozone$O3))
hist(sqrt(ozone$O3))

x=as.matrix(ozone[,2:9])
y=as.vector(ozone[,1])
lm1=lm(y~x)
summary(lm1)

par(mfrow=c(1,1))
library(MASS)
boxcox(lm1)

boxcox(lm1,lambda=0.01*(0:50))

y=y^0.25
hist(y,breaks=12)

# in this case use a holdout set - every tenth observation

# rescale x matrix first

x=scale(x)

u=rep(F,330)
u[10*rep(1:33)]=T
ytest=y[u]
xtest=x[u,]
yfit=y[!u]
xfit=x[!u,]

ozonefit=data.frame(yfit,xfit)
ozonetest=data.frame(ytest,xtest)

## write two datasets to separate files, then reload
##
## (I don't think this step is actually necessary, but I was having a
## problem at one point)

write.csv(ozonefit,'C:/Users/rls/aug20/UNC/STOR664/R-Code/ozonefit.csv')
write.csv(ozonetest,'C:/Users/rls/aug20/UNC/STOR664/R-Code/ozonetest.csv')

######## reload


ozonefit=read.csv('C:/Users/rls/aug20/UNC/STOR664/R-Code/ozonefit.csv')
ozonetest=read.csv('C:/Users/rls/aug20/UNC/STOR664/R-Code/ozonetest.csv')

# initial fits - best subsets regression

require(leaps)
lm1=regsubsets(yfit~vh+wind+humidity+temp+ibh+dpg+ibt+vis,data=ozonefit)
lm1s=summary(lm1)

lm1=regsubsets(yfit~vh+wind+humidity+temp+ibh+dpg+ibt+vis,data=ozonefit)
lm1s=summary(lm1)

lm1s$which

AIC=297*log(lm1s$rss/297)+(1:8)*2
plot(AIC~I(1:8),ylab='AIC',xlab='Number of Predictors',pch=20)
plot(AIC[3:8]~I(3:8),ylab='AIC',xlab='Number of Predictors',pch=20)

# include x columns 3,4,5 and maybe 8

# do a cross-validation three ways

MSPE=0
for(i in 1:297){
lm1a=lm(yfit~humidity+temp+ibh,data=ozonefit,subset=-i)
MSPE=MSPE+(predict(lm1a,newdata=ozonefit)[i]-ozonefit$yfit[i])^2
}
MSPE/297


MSPE=0
for(i in 1:297){
lm1a=lm(yfit~vh+wind+humidity+temp+ibh+dpg+ibt+vis,data=ozonefit,subset=-i)
MSPE=MSPE+(predict(lm1a,newdata=ozonefit)[i]-ozonefit$yfit[i])^2
}
MSPE/297

MSPE=0
for(i in 1:297){
lm1a=lm(yfit~humidity+temp+ibh+vis,data=ozonefit,subset=-i)
MSPE=MSPE+(predict(lm1a,newdata=ozonefit)[i]-ozonefit$yfit[i])^2
}
MSPE/297


# now try on test data
lm1=lm(yfit~humidity+temp+ibh,data=ozonefit)
pr1=predict(lm1,newdata=ozonetest)
print(sum((pr1-ozonetest$ytest)^2/33))

lm1=lm(yfit~vh+wind+humidity+temp+ibh+dpg+ibt+vis,data=ozonefit)
pr1=predict(lm1,newdata=ozonetest)
print(sum((pr1-ozonetest$ytest)^2/33))

lm1=lm(yfit~humidity+temp+ibh+vis,data=ozonefit)
pr1=predict(lm1,newdata=ozonetest)
print(sum((pr1-ozonetest$ytest)^2/33))

# lasso fits

library(glmnet)
xfit=as.matrix(ozonefit[,2:9])
yfit=as.vector(ozonefit[,1])
fit=glmnet(xfit,yfit)
plot(fit)


cvfit=cv.glmnet(xfit,yfit)
which.min(cvfit$cvm)
cvfit$cvm[38:42]

coef(fit)[,41]
lm1=lm(yfit~humidity+temp+ibh+vis,data=ozonefit)

lm1$coef

xtest=as.matrix(ozonetest[,2:9])
ytest=ozonetest[,1]

pr1=predict(fit,xtest)
print(sum((pr1[,41]-ytest)^2)/33)

pr2=predict(lm1,newdata=ozonetest)
print(sum((pr2-ytest)^2)/33)