# prepare anova method for webpage # This file was prepared in windows - replace path C:/Users/rls/mar11/research/boston2013/ by whatever's # appropriate for your local machine # full dataset TIM1=read.table('C:/Users/rls/mar11/research/boston2013/TIM1.txt',header=T) # crossvsalidation dataset TIM2=read.table('C:/Users/rls/mar11/research/boston2013/TIM2.txt',header=T) # statistics of DNF dropout points in full and validation datasets DNF1=c(sum(is.na(TIM1[15])),sum(is.na(TIM1[14])),sum(is.na(TIM1[13])),sum(is.na(TIM1[12])),sum(is.na(TIM1[11]))) DNF2=c(sum(is.na(TIM2[15])),sum(is.na(TIM2[14])),sum(is.na(TIM2[13])),sum(is.na(TIM2[12])),sum(is.na(TIM2[11]))) DNF1/21930 DNF2/16302 # classify by HM time and 20-40 split # # HM is half-marathon time, HMq1 etc. are quantiles # # y2 represent standardized split times from 20K to 40K # y1=TIM1[,c(11,12,13,14,16)] HM=y1[,5] y1[is.na(y1[,1]),1]=10*y1[is.na(y1[,1]),5]/42.195 y1[is.na(y1[,1]),2]=10*y1[is.na(y1[,1]),5]/42.195 y1[is.na(y1[,1]),3]=10*y1[is.na(y1[,1]),5]/42.195 y1[is.na(y1[,1]),4]=10*y1[is.na(y1[,1]),5]/42.195 y1[is.na(y1[,2]),2]=y1[is.na(y1[,2]),1] y1[is.na(y1[,3]),3]=y1[is.na(y1[,3]),2] y1[is.na(y1[,4]),4]=y1[is.na(y1[,4]),3] y2=(y1[,1]+y1[,2]+y1[,3]+y1[,4])*21.0975/(20*HM) range(y2) HMq1=quantile(HM,0.125) HMq2=quantile(HM,0.25) HMq3=quantile(HM,0.375) HMq4=quantile(HM,0.5) HMq5=quantile(HM,0.625) HMq6=quantile(HM,0.75) HMq7=quantile(HM,0.875) print(c(min(HM),HMq1,HMq2,HMq3,HMq4,HMq5,HMq6,HMq7,max(HM))) RHM=rep(NA,length(HM)) Ry2=rep(NA,length(HM)) RHM[HM=HMq1&HM=HMq2&HM=HMq3&HM=HMq4&HM=HMq5&HM=HMq6&HM=HMq7]=8 # # within each eighth of the sorted HM data, perform a secondary sort by y2 for(ii in 1:8){ u2=(RHM==ii) y2a=y2[RHM==ii] y2q1=quantile(y2a,0.25) y2q2=quantile(y2a,0.5) y2q3=quantile(y2a,0.75) Ry2[RHM==ii&y2=y2q1&y2=y2q2&y2=y2q3]=4 } print(c(min(y2),y2q1,y2q2,y2q3,max(y2))) # compute finish times - ind1 are total in each (HM,y2) category FT=TIM1[,7]+TIM1[,8]+TIM1[,9]+TIM1[,10]+TIM1[,11]+TIM1[,12]+TIM1[,13]+TIM1[,14]+TIM1[,15] ind1=matrix(NA,nrow=8,ncol=4) for(i in 1:8){for(j in 1:4){ #ind1[i,j]=sum(RHM==i&Ry2==j)}} ind1[i,j]=sum(RHM==i&Ry2==j&!is.na(FT))}} # compute predicted times by anova method in each (HM,y2) category # # DIF computes a mean absolute difference in each category TIM1P=TIM1 DIF=matrix(nrow=8,ncol=4) for(ii in 1:8){for(jj in 1:4){ u1=(RHM==ii&Ry2==jj) TIM1a=TIM1[u1,] n1a=length(TIM1a[,1]) # apply ANOVA to all observations in TIM1a y3=matrix(NA,nrow=n1a,ncol=9) X1=y3 X2=y3 for(j in 1:9){y3[,j]=log(TIM1a[,6+j])} for(j in 1:9){X1[,j]=j} for(j in 1:9){X2[,j]=1:n1a} y4=as.vector(y3) X1=factor(as.vector(X1)) X2=factor(as.vector(X2)) lm1=lm(y4~X1+X2-1) y3P=outer(c(0,lm1$coef[10:(n1a+8)]),lm1$coef[1:9],"+") FT=exp(y3) %*% rep(1,9) FTP=exp(y3P) %*% rep(1,9) for(j in 1:9){ y5=TIM1P[u1,6+j] y5[is.na(y5)]=exp(y3P[is.na(y5),j]) TIM1P[u1,6+j]=y5 } DIF[ii,jj]=mean(abs(FT-FTP),na.rm=T) }} # output of the above # > DIF # [,1] [,2] [,3] [,4] # [1,] 1.0583178 0.8928235 1.1845015 3.831078 # [2,] 0.6821657 0.7598598 0.9029886 1.874636 # [3,] 0.6435357 0.6542438 0.8378227 1.614446 # [4,] 0.7059666 0.6868412 0.7869991 1.520252 # [5,] 0.7784509 0.7920119 0.8958751 1.775263 # [6,] 0.8932688 0.7915885 0.8636407 1.674258 # [7,] 1.0256868 0.9009678 0.9974807 1.763001 # [8,] 1.4476367 1.2401811 1.3443487 1.926764 # > FT1P=TIM1P[,7]+TIM1P[,8]+TIM1P[,9]+TIM1P[,10]+TIM1P[,11]+TIM1P[,12]+TIM1P[,13]+TIM1P[,14]+TIM1P[,15] finishtime=TIM1P[1:5628,5]*60+TIM1P[1:5628,6]+FT1P[1:5628] sum(floor(finishtime)==885) sum(floor(finishtime)==886) sum(floor(finishtime)==887) sum(floor(finishtime)==888) sum(floor(finishtime)==889) sum(floor(finishtime)==890) sum(floor(finishtime)==891) sum(floor(finishtime)==892) sum(floor(finishtime)==893) sum(floor(finishtime)==894) sum(floor(finishtime)==895) sum(finishtime<889) # # eliminate all those projected to finish before 2:49pm (i.e. 889 minutes after midnight) and then repeat the entire operation # # first expand "finishtime" to the entire dataset finishtime=c(finishtime,rep(10000,16302)) TIM1=TIM1[finishtime>889,] # length of series has been reduced from 21930 to 21793 # re[eat ANOVa calculation # classify by HM time and 20-40 split y1=TIM1[,c(11,12,13,14,16)] HM=y1[,5] y1[is.na(y1[,1]),1]=10*y1[is.na(y1[,1]),5]/42.195 y1[is.na(y1[,1]),2]=10*y1[is.na(y1[,1]),5]/42.195 y1[is.na(y1[,1]),3]=10*y1[is.na(y1[,1]),5]/42.195 y1[is.na(y1[,1]),4]=10*y1[is.na(y1[,1]),5]/42.195 y1[is.na(y1[,2]),2]=y1[is.na(y1[,2]),1] y1[is.na(y1[,3]),3]=y1[is.na(y1[,3]),2] y1[is.na(y1[,4]),4]=y1[is.na(y1[,4]),3] y2=(y1[,1]+y1[,2]+y1[,3]+y1[,4])*21.0975/(20*HM) range(y2) HMq1=quantile(HM,0.125) HMq2=quantile(HM,0.25) HMq3=quantile(HM,0.375) HMq4=quantile(HM,0.5) HMq5=quantile(HM,0.625) HMq6=quantile(HM,0.75) HMq7=quantile(HM,0.875) print(c(min(HM),HMq1,HMq2,HMq3,HMq4,HMq5,HMq6,HMq7,max(HM))) RHM=rep(NA,length(HM)) Ry2=rep(NA,length(HM)) RHM[HM=HMq1&HM=HMq2&HM=HMq3&HM=HMq4&HM=HMq5&HM=HMq6&HM=HMq7]=8 for(ii in 1:8){ u2=(RHM==ii) y2a=y2[RHM==ii] y2q1=quantile(y2a,0.25) y2q2=quantile(y2a,0.5) y2q3=quantile(y2a,0.75) Ry2[RHM==ii&y2=y2q1&y2=y2q2&y2=y2q3]=4 } FT=TIM1[,7]+TIM1[,8]+TIM1[,9]+TIM1[,10]+TIM1[,11]+TIM1[,12]+TIM1[,13]+TIM1[,14]+TIM1[,15] ind1=matrix(NA,nrow=8,ncol=4) for(i in 1:8){for(j in 1:4){ #ind1[i,j]=sum(RHM==i&Ry2==j)}} ind1[i,j]=sum(RHM==i&Ry2==j&!is.na(FT))}} TIM1P=TIM1 DIF=matrix(nrow=8,ncol=4) for(ii in 1:8){for(jj in 1:4){ u1=(RHM==ii&Ry2==jj) TIM1a=TIM1[u1,] n1a=length(TIM1a[,1]) # apply ANOVA to all observations in TIM1a y3=matrix(NA,nrow=n1a,ncol=9) X1=y3 X2=y3 for(j in 1:9){y3[,j]=log(TIM1a[,6+j])} for(j in 1:9){X1[,j]=j} for(j in 1:9){X2[,j]=1:n1a} y4=as.vector(y3) X1=factor(as.vector(X1)) X2=factor(as.vector(X2)) lm1=lm(y4~X1+X2-1) y3P=outer(c(0,lm1$coef[10:(n1a+8)]),lm1$coef[1:9],"+") FT=exp(y3) %*% rep(1,9) FTP=exp(y3P) %*% rep(1,9) for(j in 1:9){ y5=TIM1P[u1,6+j] y5[is.na(y5)]=exp(y3P[is.na(y5),j]) TIM1P[u1,6+j]=y5 } DIF[ii,jj]=mean(abs(FT-FTP),na.rm=T) }} FT1P=TIM1P[,7]+TIM1P[,8]+TIM1P[,9]+TIM1P[,10]+TIM1P[,11]+TIM1P[,12]+TIM1P[,13]+TIM1P[,14]+TIM1P[,15] finishtime=TIM1P[1:5491,5]*60+TIM1P[1:5491,6]+FT1P[1:5491] # new DIF matrix # > DIF # [,1] [,2] [,3] [,4] # [1,] 1.0285592 0.9063071 1.1825426 3.829527 # [2,] 0.6711744 0.7526427 0.9076499 1.858349 # [3,] 0.6449548 0.6600607 0.8147289 1.609537 # [4,] 0.7011490 0.6867358 0.7909995 1.535157 # [5,] 0.7877394 0.8015749 0.8840656 1.783381 # [6,] 0.8918281 0.7886733 0.8730989 1.668652 # [7,] 1.0326554 0.8884029 1.0013911 1.775790 # [8,] 1.4496602 1.2439272 1.3547845 1.906898 sum(floor(finishtime)==885) sum(floor(finishtime)==886) sum(floor(finishtime)==887) sum(floor(finishtime)==888) sum(floor(finishtime)==889) sum(floor(finishtime)==890) sum(floor(finishtime)==891) sum(floor(finishtime)==892) sum(floor(finishtime)==893) sum(floor(finishtime)==894) sum(floor(finishtime)==895) # responses are 0,0,0,0,28,32,85,101,96,89,98 sum(finishtime<889) # response is 0 # write output table write.table(TIM1P,'C:/Users/rls/mar11/research/boston2013/TIM3_RS_ANOVA.txt') # redo everything for TIM2 # classify by HM time and 20-40 split y1=TIM2[,c(11,12,13,14,16)] HM=y1[,5] y1[is.na(y1[,1]),1]=10*y1[is.na(y1[,1]),5]/42.195 y1[is.na(y1[,1]),2]=10*y1[is.na(y1[,1]),5]/42.195 y1[is.na(y1[,1]),3]=10*y1[is.na(y1[,1]),5]/42.195 y1[is.na(y1[,1]),4]=10*y1[is.na(y1[,1]),5]/42.195 y1[is.na(y1[,2]),2]=y1[is.na(y1[,2]),1] y1[is.na(y1[,3]),3]=y1[is.na(y1[,3]),2] y1[is.na(y1[,4]),4]=y1[is.na(y1[,4]),3] y2=(y1[,3]+y1[,4])*21.0975/(40*HM) y2=y1[,3]+y1[,4]-y1[,1]-y1[,2] y2=(y1[,1]+y1[,2]+y1[,3]+y1[,4])*21.0975/(40*HM) range(y2) HMq1=quantile(HM,0.125) HMq2=quantile(HM,0.25) HMq3=quantile(HM,0.375) HMq4=quantile(HM,0.5) HMq5=quantile(HM,0.625) HMq6=quantile(HM,0.75) HMq7=quantile(HM,0.875) print(c(min(HM),HMq1,HMq2,HMq3,HMq4,HMq5,HMq6,HMq7,max(HM))) RHM=rep(NA,length(HM)) Ry2=rep(NA,length(HM)) RHM[HM=HMq1&HM=HMq2&HM=HMq3&HM=HMq4&HM=HMq5&HM=HMq6&HM=HMq7]=8 for(ii in 1:8){ u2=(RHM==ii) y2a=y2[RHM==ii] y2q1=quantile(y2a,0.25) y2q2=quantile(y2a,0.5) y2q3=quantile(y2a,0.75) Ry2[RHM==ii&y2=y2q1&y2=y2q2&y2=y2q3]=4 } FT=TIM2[,7]+TIM2[,8]+TIM2[,9]+TIM2[,10]+TIM2[,11]+TIM2[,12]+TIM2[,13]+TIM2[,14]+TIM2[,15] ind1=matrix(NA,nrow=8,ncol=7) for(i in 1:8){for(j in 1:7){ #ind1[i,j]=sum(RHM==i&Ry2==j)}} ind1[i,j]=sum(RHM==i&Ry2==j&!is.na(FT))}} TIM2P=TIM2 DIF=matrix(nrow=8,ncol=4) for(ii in 1:8){for(jj in 1:4){ u1=(RHM==ii&Ry2==jj) TIM3=TIM2[u1,] n3=length(TIM3[,1]) # apply ANOVA to all observations in TIM3 y3=matrix(NA,nrow=n3,ncol=9) X1=y3 X2=y3 for(j in 1:9){y3[,j]=log(TIM3[,6+j])} for(j in 1:9){X1[,j]=j} for(j in 1:9){X2[,j]=1:n3} y4=as.vector(y3) X1=factor(as.vector(X1)) X2=factor(as.vector(X2)) lm1=lm(y4~X1+X2-1) y3P=outer(c(0,lm1$coef[10:(n3+8)]),lm1$coef[1:9],"+") FT=exp(y3) %*% rep(1,9) FTP=exp(y3P) %*% rep(1,9) for(j in 1:9){ y5=TIM2P[u1,6+j] y5[is.na(y5)]=exp(y3P[is.na(y5),j]) TIM2P[u1,6+j]=y5 } #print(mean(abs(FT-FTP),na.rm=T)) DIF[ii,jj]=mean(abs(FT-FTP),na.rm=T) }} # > DIF # [,1] [,2] [,3] [,4] # [1,] 1.1929717 0.8741298 1.2987122 3.726810 # [2,] 0.7330829 0.7302734 0.8619710 1.863231 # [3,] 0.6210816 0.6602148 0.8561416 1.729001 # [4,] 0.6737937 0.6638562 0.7441652 1.298914 # [5,] 0.7172731 0.7172514 0.8112101 1.744679 # [6,] 0.8325073 0.7877020 0.8887089 1.725032 # [7] 0.9532716 0.8795423 0.9503164 1.691412 # [8,] 1.3672442 1.1488583 1.2549168 1.948686 # compare TIM2P with relevant parts of TIM1 # # first need to reload TIM1 TIM1=read.table('C:/Users/rls/mar11/research/boston2013/TIM1.txt',header=T) TIM1Q=TIM1[5629:21930,] FTQ=TIM1Q[,7]+TIM1Q[,8]+TIM1Q[,9]+TIM1Q[,10]+TIM1Q[,11]+TIM1Q[,12]+TIM1Q[,13]+TIM1Q[,14]+TIM1Q[,15] FTP=TIM2P[,7]+TIM2P[,8]+TIM2P[,9]+TIM2P[,10]+TIM2P[,11]+TIM2P[,12]+TIM2P[,13]+TIM2P[,14]+TIM2P[,15] FT=TIM2[,7]+TIM2[,8]+TIM2[,9]+TIM2[,10]+TIM2[,11]+TIM2[,12]+TIM2[,13]+TIM2[,14]+TIM2[,15] y7=FTQ[is.na(FT)] y8=FTP[is.na(FT)] # y7 and y8 are the true and predicted finish times for all those in the crossvalidation sample mean(abs(y7-y8)) mean((y7-y8)^2) mean(abs(y7-y8)<=1) mean(abs(y7-y8)<=2) mean(abs(y7-y8)<=3) mean(abs(y7-y8)<=4) mean(abs(y7-y8)<=5) mean(abs(y7-y8)<=10) TIM1P[,7:15]=round(TIM1P[,7:15],2) TIM2P[,7:15]=round(TIM2P[,7:15],2) write.table(TIM1P,'C:/Users/rls/mar11/research/boston2013/TIM1_RS_ANOVA.txt') write.table(TIM2P,'C:/Users/rls/mar11/research/boston2013/TIM2_RS_ANOVA.txt') # comparison tables TIM1=read.table('C:/Users/rls/mar11/research/boston2013/TIM1.txt',header=T) TIM2=read.table('C:/Users/rls/mar11/research/boston2013/TIM2.txt',header=T) TIM2P=read.table('C:/Users/rls/mar11/research/boston2013/TIM2_RS_SPL.txt',header=T) TIM2P=read.table('C:/Users/rls/mar11/research/boston2013/TIM2_RS_ANOVA.txt',header=T) TIM1P=read.table('C:/Users/rls/mar11/research/boston2013/TIM1_RS_ANOVA.txt',header=T) TIM1Q=TIM1[5629:21930,] FT1P=TIM1P[,7]+TIM1P[,8]+TIM1P[,9]+TIM1P[,10]+TIM1P[,11]+TIM1P[,12]+TIM1P[,13]+TIM1P[,14]+TIM1P[,15] finishtime=TIM1P[1:5628,5]*60+TIM1P[1:5628,6]+FT1P[1:5628] sum(floor(finishtime)==885) sum(floor(finishtime)==886) sum(floor(finishtime)==887) sum(floor(finishtime)==888) sum(floor(finishtime)==889) sum(floor(finishtime)==890) sum(floor(finishtime)==891) sum(floor(finishtime)==892) sum(floor(finishtime)==893) sum(floor(finishtime)==894) sum(floor(finishtime)==895) FT1Q=TIM1Q[,7]+TIM1Q[,8]+TIM1Q[,9]+TIM1Q[,10]+TIM1Q[,11]+TIM1Q[,12]+TIM1Q[,13]+TIM1Q[,14]+TIM1Q[,15] FT2P=TIM2P[,7]+TIM2P[,8]+TIM2P[,9]+TIM2P[,10]+TIM2P[,11]+TIM2P[,12]+TIM2P[,13]+TIM2P[,14]+TIM2P[,15] # results by dropout point ABSERR=rep(NA,6) RMSE=rep(NA,6) NUMERR=rep(NA,6) PE1=rep(NA,6) PE2=rep(NA,6) PE3=rep(NA,6) PE4=rep(NA,6) PE5=rep(NA,6) PE10=rep(NA,6) for(i in 1:6){ if(i==1)u1=is.na(TIM2[,11]) if(i==2)u1=!is.na(TIM2[,11])&is.na(TIM2[,12]) if(i==3)u1=!is.na(TIM2[,12])&is.na(TIM2[,13]) if(i==4)u1=!is.na(TIM2[,13])&is.na(TIM2[,14]) if(i==5)u1=!is.na(TIM2[,14])&is.na(TIM2[,15]) if(i==6)u1=is.na(TIM2[,15]) NUMERR[i]=sum(u1) ABSERR[i]=mean(abs(FT1Q[u1]-FT2P[u1])) RMSE[i]=sqrt(mean((FT1Q[u1]-FT2P[u1])^2)) PE1[i]=mean(abs(FT1Q[u1]-FT2P[u1])>1) PE2[i]=mean(abs(FT1Q[u1]-FT2P[u1])>2) PE3[i]=mean(abs(FT1Q[u1]-FT2P[u1])>3) PE4[i]=mean(abs(FT1Q[u1]-FT2P[u1])>4) PE5[i]=mean(abs(FT1Q[u1]-FT2P[u1])>5) PE10[i]=mean(abs(FT1Q[u1]-FT2P[u1])>10) } allmeas=round(rbind(ABSERR,RMSE^2,1-PE1,1-PE2,1-PE3,1-PE4,1-PE5,1-PE10),3) t(allmeas) # results by gender, age and FT ABSERR=rep(NA,6) RMSE=rep(NA,6) NUMERR=rep(NA,6) PE1=rep(NA,6) PE2=rep(NA,6) PE3=rep(NA,6) PE4=rep(NA,6) PE5=rep(NA,6) PE10=rep(NA,6) for(i in 1:6){ if(i==1)u1=is.na(TIM2[,15])&TIM2[,4]==1 if(i==2)u1=is.na(TIM2[,15])&TIM2[,4]==2 if(i==3)u1=is.na(TIM2[,15])&TIM2[,3]<=45 if(i==4)u1=is.na(TIM2[,15])&TIM2[,3]>45 if(i==5)u1=is.na(TIM2[,15])&FT1Q<=265 if(i==6)u1=is.na(TIM2[,15])&FT1Q>265 NUMERR[i]=sum(u1) ABSERR[i]=mean(abs(FT1Q[u1]-FT2P[u1])) RMSE[i]=sqrt(mean((FT1Q[u1]-FT2P[u1])^2)) PE1[i]=mean(abs(FT1Q[u1]-FT2P[u1])>1) PE2[i]=mean(abs(FT1Q[u1]-FT2P[u1])>2) PE3[i]=mean(abs(FT1Q[u1]-FT2P[u1])>3) PE4[i]=mean(abs(FT1Q[u1]-FT2P[u1])>4) PE5[i]=mean(abs(FT1Q[u1]-FT2P[u1])>5) PE10[i]=mean(abs(FT1Q[u1]-FT2P[u1])>10) } allmeas=round(rbind(ABSERR,RMSE^2,1-PE1,1-PE2,1-PE3,1-PE4,1-PE5,1-PE10),3) t(allmeas)