# cross-ref old and new Amherst dataset
A=read.table('http://rls.sites.oasis.unc.edu/faculty/rs/source/Data/amherst.s',header=T)
B=read.csv('http://rls.sites.oasis.unc.edu/faculty/rs/source/Data/amh1.csv',header=T)
u1=c(58:60,62:65,67:72,74:162)
cbind(B[1:102,2],A[c(58:60,62:65,67:72,74:162),2])
plot(B[1:102,2],A[c(58:60,62:65,67:72,74:162),2])
# doesn't seem a good match


# some examples of gofsim

# new material added Fall 2023

# first load gofsim
source('http://rls.sites.oasis.unc.edu/faculty/rs/source/Data/Rcode-gof.txt')

# original Amherst dataset
A=read.table('http://rls.sites.oasis.unc.edu/faculty/rs/source/Data/amherst.s',header=T)
par(mfrow=c(1,1))
plot(A$year,A$temp)

lm1=lm(temp~year,A)
lm2=lm(temp~year+I(year^2),A)

summary(lm1)
summary(lm2)
anova(lm1,lm2)
plot(lm1)
plot(lm2)

gofsim(A$temp,A$year,1,10000)

# [1,] ""        "LG"      "KS"      "CVM"     "AD"     
# [2,] "Test"    "0.98924" "0.07601" "0.18092" "1.09833"
# [3,] "p-value" "0.0131"  "0.0218"  "0.0104"  "0.0081" 

gofsim(A$temp,model.matrix(lm2),0,10000)

# [1,] ""        "LG"      "KS"      "CVM"     "AD"     
# [2,] "Test"    "0.99186" "0.06908" "0.14732" "0.91804"
# [3,] "p-value" "0.0528"  "0.0582"  "0.0283"  "0.02"   

library(EnvStats)
gofTest(lm1$resid,test='ppcc')$statistic
gofTest(lm1$resid,test='ppcc')$p.value
gofTest(lm1$resid,test='ks')$statistic
gofTest(lm1$resid,test='ks')$p.value
gofTest(lm1$resid,test='cvm')$statistic
gofTest(lm1$resid,test='cvm')$p.value
gofTest(lm1$resid,test='ad')$statistic
gofTest(lm1$resid,test='ad')$p.value

gofTest(lm2$resid,test='ppcc')$statistic
gofTest(lm2$resid,test='ppcc')$p.value
gofTest(lm2$resid,test='ks')$statistic
gofTest(lm2$resid,test='ks')$p.value
gofTest(lm2$resid,test='cvm')$statistic
gofTest(lm2$resid,test='cvm')$p.value
gofTest(lm2$resid,test='ad')$statistic
gofTest(lm2$resid,test='ad')$p.value

# it looks like the "gofTest" p-values are good except for "ks"



# minitab tree data

trees=read.table('http://rls.sites.oasis.unc.edu/faculty/rs/source/Data/tree.dat',header=T)
y=log(trees$vol)
x1=log(trees$diam)
x2=log(trees$ht)
par(mfrow=c(1,2))
plot(x1,y,xlab='Log Diameter',ylab='Log Volume',pch=20)
plot(x2,y,xlab='Log Height',ylab='Log Volume',pch=20)
lm1=lm(y~x1+x2)
summary(lm1)

# test whether beta_1=2, beta_2=1
y1=y-2*x1-x2
lm2=lm(y1~x1+x2)
lm3=lm(y1~1)
anova(lm3,lm2)

par(mfrow=c(2,2))
x1c=x1-mean(x1)
x2c=x2-mean(x2)
plot(x1c,lm1$resid,ylab='Residual',xlab='x1c',pch=20)
plot(x2c,lm1$resid,ylab='Residual',xlab='x2c',pch=20)
plot(x1c*x2c,lm1$resid,ylab='Residual',xlab='x1c*x2c',pch=20)
plot(lm1$fitted,lm1$resid,ylab='Residual',xlab='Fitted Value',pch=20)

gofsim(y,cbind(x1,x2),1,10000)

# tests of outliers in tree data

sort(rstandard(lm1))
sort(rstudent(lm1))

2*pt(-2.32572,27)

# calculations on Smith & Young p. 179
# try deletion of observations 15, 16, 18
lm1a=lm(y~x1+x2,subset=-c(15,16,18))
summary(lm1a)
summary(lm1a)$sigma

pr1=predict(lm1a,data.frame(cbind(x1,x2)))
sub1=c(15,16,18)
(y[sub1]-pr1[sub1])/summary(lm1a)$sigma



pr2=predict(lm1a,data.frame(cbind(x1,x2)),interval='prediction')

prse=(pr2[,3]-pr2[,2])/(2*qt(0.975,25))

(y[sub1]-pr1[sub1])/prse[sub1]

2*pt((y[sub1]-pr1[sub1])/prse[sub1],25)

# DFFITS etc.

hatvalues(lm1)

dffits(lm1)

dfbetas(lm1)

cooks.distance(lm1)

covratio(lm1)

# nuclear power plant data

nukes=read.table('http://rls.sites.oasis.unc.edu/faculty/rs/source/Data/nukes.dat',header=T)
nukes$LC=log(nukes$C)
nukes$LT1=log(nukes$T1)
nukes$LT2=log(nukes$T2)
nukes$LS=log(nukes$S)
nukes$LN=log(nukes$N)
lm4=lm(LC~D+LT1+LT2+LS+PR+NE+CT+BW+LN+PT,nukes)


lm5=lm(LC~D+LS+NE+CT+LN+PT,nukes)
anova(lm4,lm5)

par(mfrow=c(1,2))
plot(lm5$fitted,lm5$residuals,xlab='Fitted Values',ylab='Residuals',pch=20)
plot(nukes$PT,lm5$residuals,xlab='PT',ylab='Residuals',pch=20)

# could also do it like this:

plot(lm5$fitted,lm5$residuals,xlab='Fitted Values',ylab='Residuals',pch=20)
plot(factor(nukes$PT),lm5$residuals,xlab='PT',ylab='Residuals',pch=20)


gofsim(nukes$LC,model.matrix(lm5),0,10000)


############ illustrate backward selection

nukes=read.table('http://rls.sites.oasis.unc.edu/faculty/rs/source/Data/nukes.dat',header=T)
nukes$LC=log(nukes$C)
nukes$LT1=log(nukes$T1)
nukes$LT2=log(nukes$T2)
nukes$LS=log(nukes$S)
nukes$LN=log(nukes$N)
lm4=lm(LC~D+LT1+LT2+LS+PR+NE+CT+BW+LN+PT,nukes)
summary(lm4)

# drop BW
lm5=lm(LC~D+LT1+LT2+LS+PR+NE+CT+LN+PT,nukes)
summary(lm5)

# drop LT1
lm6=lm(LC~D+LT2+LS+PR+NE+CT+LN+PT,nukes)
summary(lm6)

# drop LT2
lm7=lm(LC~D+LS+PR+NE+CT+LN+PT,nukes)
summary(lm7)

# drop PR
lm8=lm(LC~D+LS+NE+CT+LN+PT,nukes)
summary(lm8)

# drop PT
lm9=lm(LC~D+LS+NE+CT+LN,nukes)
summary(lm9)


require(leaps)
lm10=regsubsets(LC~D+LT1+LT2+LS+PR+NE+CT+BW+LN+PT,nukes)
lm10s=summary(lm10)

lm10s$which

AIC=32*log(lm10s$rss/32)+(1:8)*2
plot(AIC~I(1:8),ylab='AIC',xlab='Number of Predictors',pch=20)

plot(1:8,lm10s$adjr2,xlab='Number of Parameters',ylab='Adjusted R-Square',pch=20)


plot(1:8,lm10s$cp,xlab='Number of Parameters',ylab='Cp',pch=20)
abline(0,1)

lm11=step(lm4)


# look for influential observations

sort(influence(lm8)$hat)

sort(cooks.distance(lm8))

# try omitting two most influential

lm13=regsubsets(LC~D+LT1+LT2+LS+PR+NE+CT+BW+LN+PT,nukes,subset=c(-19,-26))
lm13s=summary(lm13)
lm13s$which
AIC=30*log(lm10s$rss/30)+(1:8)*2
plot(AIC~I(1:8),ylab='AIC',xlab='Number of Predictors',pch=20)


# best model now contains
# D LT2 LS NE CT PT
# compared with
# D LS NE CT LN PT
# for original data

# stripchart plot (Faraway p. 158) - do on ORIGINAL scale of data to assess transformations
nukesorig=nukes[,2:12]
stripchart(data.frame(scale(nukesorig)),method='jitter',las=2,vertical=TRUE)

# could also try
boxplot(data.frame(scale(nukes[,2:12])))

# not obvious why any of the variables need to be transformed, though some skewness in C, D and N

# Examples of multicollinearity

library(faraway)

# 1: artificial extension of trees data on p. 218
trees=read.table('C:/Users/rls/aug20/UNC/STOR664/R-Code/trees.txt',header=T)
trees$y=log(trees$vol)
trees$x1=log(trees$diam)
trees$x2=log(trees$ht)
trees$x3=0.35*trees$x1-trees$x2-0.15*trees$x1*trees$x2

lm1=lm(y~x1+x2,trees)
lm2=lm(y~x1+x2+x3,trees)
vif(lm1)
vif(lm2)

# VIFs for nuclear data
nukes=read.table('C:/Users/rls/aug20/UNC/STOR664/R-Code/nukes.txt',header=T)
nukes$LC=log(nukes$C)
nukes$LT1=log(nukes$T1)
nukes$LT2=log(nukes$T2)
nukes$LS=log(nukes$S)
nukes$LN=log(nukes$N)
lm4=lm(LC~D+LT1+LT2+LS+PR+NE+CT+BW+LN+PT,nukes)
lm5=lm(LC~D+LS+NE+CT+LN+PT,nukes)
anova(lm4,lm5)

vif(lm4)
vif(lm5)



# illustrating "boxcox" function

nukes=read.table('C:/Users/rls/aug20/UNC/STOR664/R-Code/nukes.txt',header=T)


nukes=read.table('D:/STOR664-F21/nukes.txt',header=T)
nukes$LC=log(nukes$C)
nukes$LT1=log(nukes$T1)
nukes$LT2=log(nukes$T2)
nukes$LS=log(nukes$S)
nukes$LN=log(nukes$N)
lm1=lm(C~D+LT1+LT2+LS+PR+NE+CT+BW+LN+PT,nukes)
library(MASS)
boxcox(lm1)