# cross-ref old and new Amherst dataset A=read.table('http://rls.sites.oasis.unc.edu/faculty/rs/source/Data/amherst.s',header=T) B=read.csv('http://rls.sites.oasis.unc.edu/faculty/rs/source/Data/amh1.csv',header=T) u1=c(58:60,62:65,67:72,74:162) cbind(B[1:102,2],A[c(58:60,62:65,67:72,74:162),2]) plot(B[1:102,2],A[c(58:60,62:65,67:72,74:162),2]) # doesn't seem a good match # some examples of gofsim # new material added Fall 2023 # first load gofsim source('http://rls.sites.oasis.unc.edu/faculty/rs/source/Data/Rcode-gof.txt') # original Amherst dataset A=read.table('http://rls.sites.oasis.unc.edu/faculty/rs/source/Data/amherst.s',header=T) par(mfrow=c(1,1)) plot(A$year,A$temp) lm1=lm(temp~year,A) lm2=lm(temp~year+I(year^2),A) summary(lm1) summary(lm2) anova(lm1,lm2) plot(lm1) plot(lm2) gofsim(A$temp,A$year,1,10000) # [1,] "" "LG" "KS" "CVM" "AD" # [2,] "Test" "0.98924" "0.07601" "0.18092" "1.09833" # [3,] "p-value" "0.0131" "0.0218" "0.0104" "0.0081" gofsim(A$temp,model.matrix(lm2),0,10000) # [1,] "" "LG" "KS" "CVM" "AD" # [2,] "Test" "0.99186" "0.06908" "0.14732" "0.91804" # [3,] "p-value" "0.0528" "0.0582" "0.0283" "0.02" library(EnvStats) gofTest(lm1$resid,test='ppcc')$statistic gofTest(lm1$resid,test='ppcc')$p.value gofTest(lm1$resid,test='ks')$statistic gofTest(lm1$resid,test='ks')$p.value gofTest(lm1$resid,test='cvm')$statistic gofTest(lm1$resid,test='cvm')$p.value gofTest(lm1$resid,test='ad')$statistic gofTest(lm1$resid,test='ad')$p.value gofTest(lm2$resid,test='ppcc')$statistic gofTest(lm2$resid,test='ppcc')$p.value gofTest(lm2$resid,test='ks')$statistic gofTest(lm2$resid,test='ks')$p.value gofTest(lm2$resid,test='cvm')$statistic gofTest(lm2$resid,test='cvm')$p.value gofTest(lm2$resid,test='ad')$statistic gofTest(lm2$resid,test='ad')$p.value # it looks like the "gofTest" p-values are good except for "ks" # minitab tree data trees=read.table('http://rls.sites.oasis.unc.edu/faculty/rs/source/Data/tree.dat',header=T) y=log(trees$vol) x1=log(trees$diam) x2=log(trees$ht) par(mfrow=c(1,2)) plot(x1,y,xlab='Log Diameter',ylab='Log Volume',pch=20) plot(x2,y,xlab='Log Height',ylab='Log Volume',pch=20) lm1=lm(y~x1+x2) summary(lm1) # test whether beta_1=2, beta_2=1 y1=y-2*x1-x2 lm2=lm(y1~x1+x2) lm3=lm(y1~1) anova(lm3,lm2) par(mfrow=c(2,2)) x1c=x1-mean(x1) x2c=x2-mean(x2) plot(x1c,lm1$resid,ylab='Residual',xlab='x1c',pch=20) plot(x2c,lm1$resid,ylab='Residual',xlab='x2c',pch=20) plot(x1c*x2c,lm1$resid,ylab='Residual',xlab='x1c*x2c',pch=20) plot(lm1$fitted,lm1$resid,ylab='Residual',xlab='Fitted Value',pch=20) gofsim(y,cbind(x1,x2),1,10000) # tests of outliers in tree data sort(rstandard(lm1)) sort(rstudent(lm1)) 2*pt(-2.32572,27) # calculations on Smith & Young p. 179 # try deletion of observations 15, 16, 18 lm1a=lm(y~x1+x2,subset=-c(15,16,18)) summary(lm1a) summary(lm1a)$sigma pr1=predict(lm1a,data.frame(cbind(x1,x2))) sub1=c(15,16,18) (y[sub1]-pr1[sub1])/summary(lm1a)$sigma pr2=predict(lm1a,data.frame(cbind(x1,x2)),interval='prediction') prse=(pr2[,3]-pr2[,2])/(2*qt(0.975,25)) (y[sub1]-pr1[sub1])/prse[sub1] 2*pt((y[sub1]-pr1[sub1])/prse[sub1],25) # DFFITS etc. hatvalues(lm1) dffits(lm1) dfbetas(lm1) cooks.distance(lm1) covratio(lm1) # nuclear power plant data nukes=read.table('http://rls.sites.oasis.unc.edu/faculty/rs/source/Data/nukes.dat',header=T) nukes$LC=log(nukes$C) nukes$LT1=log(nukes$T1) nukes$LT2=log(nukes$T2) nukes$LS=log(nukes$S) nukes$LN=log(nukes$N) lm4=lm(LC~D+LT1+LT2+LS+PR+NE+CT+BW+LN+PT,nukes) lm5=lm(LC~D+LS+NE+CT+LN+PT,nukes) anova(lm4,lm5) par(mfrow=c(1,2)) plot(lm5$fitted,lm5$residuals,xlab='Fitted Values',ylab='Residuals',pch=20) plot(nukes$PT,lm5$residuals,xlab='PT',ylab='Residuals',pch=20) # could also do it like this: plot(lm5$fitted,lm5$residuals,xlab='Fitted Values',ylab='Residuals',pch=20) plot(factor(nukes$PT),lm5$residuals,xlab='PT',ylab='Residuals',pch=20) gofsim(nukes$LC,model.matrix(lm5),0,10000) ############ illustrate backward selection nukes=read.table('http://rls.sites.oasis.unc.edu/faculty/rs/source/Data/nukes.dat',header=T) nukes$LC=log(nukes$C) nukes$LT1=log(nukes$T1) nukes$LT2=log(nukes$T2) nukes$LS=log(nukes$S) nukes$LN=log(nukes$N) lm4=lm(LC~D+LT1+LT2+LS+PR+NE+CT+BW+LN+PT,nukes) summary(lm4) # drop BW lm5=lm(LC~D+LT1+LT2+LS+PR+NE+CT+LN+PT,nukes) summary(lm5) # drop LT1 lm6=lm(LC~D+LT2+LS+PR+NE+CT+LN+PT,nukes) summary(lm6) # drop LT2 lm7=lm(LC~D+LS+PR+NE+CT+LN+PT,nukes) summary(lm7) # drop PR lm8=lm(LC~D+LS+NE+CT+LN+PT,nukes) summary(lm8) # drop PT lm9=lm(LC~D+LS+NE+CT+LN,nukes) summary(lm9) require(leaps) lm10=regsubsets(LC~D+LT1+LT2+LS+PR+NE+CT+BW+LN+PT,nukes) lm10s=summary(lm10) lm10s$which AIC=32*log(lm10s$rss/32)+(1:8)*2 plot(AIC~I(1:8),ylab='AIC',xlab='Number of Predictors',pch=20) plot(1:8,lm10s$adjr2,xlab='Number of Parameters',ylab='Adjusted R-Square',pch=20) plot(1:8,lm10s$cp,xlab='Number of Parameters',ylab='Cp',pch=20) abline(0,1) lm11=step(lm4) # look for influential observations sort(influence(lm8)$hat) sort(cooks.distance(lm8)) # try omitting two most influential lm13=regsubsets(LC~D+LT1+LT2+LS+PR+NE+CT+BW+LN+PT,nukes,subset=c(-19,-26)) lm13s=summary(lm13) lm13s$which AIC=30*log(lm10s$rss/30)+(1:8)*2 plot(AIC~I(1:8),ylab='AIC',xlab='Number of Predictors',pch=20) # best model now contains # D LT2 LS NE CT PT # compared with # D LS NE CT LN PT # for original data # stripchart plot (Faraway p. 158) - do on ORIGINAL scale of data to assess transformations nukesorig=nukes[,2:12] stripchart(data.frame(scale(nukesorig)),method='jitter',las=2,vertical=TRUE) # could also try boxplot(data.frame(scale(nukes[,2:12]))) # not obvious why any of the variables need to be transformed, though some skewness in C, D and N # Examples of multicollinearity library(faraway) # 1: artificial extension of trees data on p. 218 trees=read.table('C:/Users/rls/aug20/UNC/STOR664/R-Code/trees.txt',header=T) trees$y=log(trees$vol) trees$x1=log(trees$diam) trees$x2=log(trees$ht) trees$x3=0.35*trees$x1-trees$x2-0.15*trees$x1*trees$x2 lm1=lm(y~x1+x2,trees) lm2=lm(y~x1+x2+x3,trees) vif(lm1) vif(lm2) # VIFs for nuclear data nukes=read.table('C:/Users/rls/aug20/UNC/STOR664/R-Code/nukes.txt',header=T) nukes$LC=log(nukes$C) nukes$LT1=log(nukes$T1) nukes$LT2=log(nukes$T2) nukes$LS=log(nukes$S) nukes$LN=log(nukes$N) lm4=lm(LC~D+LT1+LT2+LS+PR+NE+CT+BW+LN+PT,nukes) lm5=lm(LC~D+LS+NE+CT+LN+PT,nukes) anova(lm4,lm5) vif(lm4) vif(lm5) # illustrating "boxcox" function nukes=read.table('C:/Users/rls/aug20/UNC/STOR664/R-Code/nukes.txt',header=T) nukes=read.table('D:/STOR664-F21/nukes.txt',header=T) nukes$LC=log(nukes$C) nukes$LT1=log(nukes$T1) nukes$LT2=log(nukes$T2) nukes$LS=log(nukes$S) nukes$LN=log(nukes$N) lm1=lm(C~D+LT1+LT2+LS+PR+NE+CT+BW+LN+PT,nukes) library(MASS) boxcox(lm1)