# generic code for fitting of time series models in epidemiology # # This version posted by Richard L. Smith: March 16, 2015 # # Some of this is adapted from previously published code at # http://www.ihapss.jhsph.edu/ # # first run this function numdf <- function(vec,df=5) # Return a discounted df (based on 12 consecutive days of missings) { n<-length(vec) ll <- round(length(vec)/12) vecnew <- vec[1:(ll*12)] # eliminate 'not a multiple' warning message mat <- matrix(vecnew, ncol=12,byrow=T) m <- sum(ceiling(apply(mat,1,sum,na.rm=T)/12)) df <- round(12*m/365.25*df ) return(max(1,df)) } # variant on standard natural splines function: eliminate duplicate values before calculating knots ns1 = function(x,df){ knots=quantile(unique(x),(1:(df-1))/df,na.rm=T) return(ns(x,knots=knots)) } # now main function fitTS=function(y,met,AP,APmin,APmax,APint,APref,lags,ndf0,ndf1,ndf2,ndf3){ # # fit time series model to a series of daily death counts y # # y is a time series of length ny # # met is a ny x nmet matrix of met observations # # AP is a univeriate time series of air pollution observations # # Next four parameters are for plot: estimate response for AP levels from APmin to APmax at intervals APint, # using APref as the reference level. # # ndf0 is DF per year for long-term trend # # ndf1 and ndf2 are DF for current day and for mean of 3 previous days on the met variables # # ndf3 is the number of DF for the concentration-response function: # # If ndf3=1, fit a linear C-R curve # If ndf3>1, fit splines with ndf3 knots # If ndf3>1 and length(lags)>1, model fit is of distributed lag structure but nonlinear in leading term # # structure of model is motivated by NMMAPS papers such as Bell et al (JAMA, 2004), # Bell, Peng and Dominici (Env Health Persp 2006), Smith et al (Inhalation Toxicology, 2009). # ny=length(y) nmet=length(met[1,]) nlags=length(lags) # define 3-day-average-lags for met data met1=met for(i in 1:3){met1[i,]=rep(NA,nmet)} for(i in 4:ny){for(j in 1:nmet){met1[i,j]=(met[i-1,j]+met[i-2,j]+met[i-3,j])/3}} # calculate lagged values of air pollution variable as a matrix AP0 AP0=matrix(nrow=ny,ncol=nlags) for(i in 1:ny){for(j in 1:nlags){if(i>lags[j]){AP0[i,j]=AP[i-lags[j]]}}} # search for NA values # # initial version based only on met variables subset=!is.na(y+met%*%rep(1,nmet)+met1%*%rep(1,nmet)) nsub1=sum(subset) # number of non-missing values for "Met" analysis subset1=subset # # Define "Time" variable - spline representation of long-term trend # This function is based on ihapss code # Time<-(1:ny)[subset] df.Time = numdf(subset,ndf0) library(splines) if ( round(df.Time) == 1 | round(df.Time) == 0) { TIME = Time Time = matrix(NA,ncol=1,nrow=ny) Time[subset,] = TIME dimnames(Time) = list(NULL,paste("Time.",1,sep="")) } else { knots = quantile(Time, prob= (1:(round(df.Time)-1))/round(df.Time) ) TIME = ns(Time,knots=knots) Time = matrix(NA,ncol=ncol(TIME),nrow=ny) Time[subset,] <- TIME dimnames(Time) <- list(NULL,paste("Time.",1:ncol(TIME),sep="")) } # simpler model (ignores breaks in sequence) # Time=ns(1:ny,ndf0*round(ny/365.25)) # # compose spline function for met metsp=cbind(ns1(met[,1],ndf1),ns1(met1[,1],ndf2)) if(nmet>1){for(j in 2:nmet){metsp=cbind(metsp,ns1(met[,j],ndf1),ns1(met1[,j],ndf2))}} # add day of week dow=factor((1:ny)-7*floor((1:ny)/7)) # # fit model with all met terms # glm1=glm(y~metsp+Time+dow,family='quasipoisson',subset=subset) dev1=glm1$deviance # # Now drop one met term at a time # # lrt is the resulting set of p-values based on likelihood ratio statistic indcol=1:length(metsp[1,]) lrt=rep(NA,2*nmet) for(j in 1:nmet){ indcol[(ndf1+ndf2)*(j-1)+(1:ndf1)]=NA metsp2=metsp[,!is.na(indcol)] glm2=glm(y~metsp2+Time+dow,family='quasipoisson',subset=subset) lrt[2*j-1]=1-pchisq(glm2$deviance-glm1$deviance,ndf1) indcol=1:length(metsp[1,]) indcol[(ndf1+ndf2)*(j-1)+ndf1+(1:ndf2)]=NA metsp2=metsp[,!is.na(indcol)] glm2=glm(y~metsp2+Time+dow,family='quasipoisson',subset=subset) lrt[2*j]=1-pchisq(glm2$deviance-glm1$deviance,ndf2) # lrt are the p-values based on a likelihood ratio test for dropping # one met variable at a time indcol=1:length(metsp[1,]) } # # Next: add air pollution term, recompute missing values and Time # subset=!is.na(y+met%*%rep(1,nmet)+met1%*%rep(1,nmet)+AP0%*%rep(1,nlags)) subset2=subset nsub2=sum(subset) # number of non-missing values for "AirPollution + Met" analysis Time<-(1:ny)[subset] df.Time = numdf(subset,ndf0) library(splines) if ( round(df.Time) == 1 | round(df.Time) == 0) { TIME = Time Time = matrix(NA,ncol=1,nrow=ny) Time[subset,] = TIME dimnames(Time) = list(NULL,paste("Time.",1,sep="")) } else { knots = quantile(Time, prob= (1:(round(df.Time)-1))/round(df.Time) ) TIME = ns(Time,knots=knots) Time = matrix(NA,ncol=ncol(TIME),nrow=ny) Time[subset,] <- TIME dimnames(Time) <- list(NULL,paste("Time.",1:ncol(TIME),sep="")) } # # recode AP0 matrix: main variable of interest in row 1 # if(nlags>1){ AP0[,1]=AP0%*%rep(1/nlags,nlags) for(j in 2:nlags){AP0[,j]=AP0[,j]-AP0[,1]} } # # first case: linear distributed lag model # if(ndf3==1){ # final model include air pollution, met, day of week and long-term trend glm3=glm(y~AP0+metsp+Time+dow,family='quasipoisson',subset=subset) # extract coefficients and calculate relative risks coefest=summary(glm3)$coef[2,1] coefSD=summary(glm3)$coef[2,2] numpred=1+floor((APmax-APmin)/APint) APpred=matrix(nrow=numpred,ncol=3) for(i in 1:numpred){ APpred[i,1]=APmin+(i-1)*APint APpred[i,2]=(APpred[i,1]-APref)*coefest APpred[i,3]=(APpred[i,1]-APref)*coefSD } return(list(nsub1=nsub1,nsub2=nsub2,lrt=lrt,glmfit=glm3,glm1=glm1,subset1=subset1,subset2=subset2,coefest=coefest,coefSD=coefSD,AP=AP,APref=APref,APpred=APpred)) } # # second case: nonlinear model over average of lags # if(ndf3>1){ AP1=c(AP0[,1],APref) APsp=ns1(AP1,ndf3) APknots=quantile(unique(AP1),(1:(ndf3-1))/ndf3,na.rm=T) APboundaryknots=range(AP1,na.rm=T) # center to level of APref for(i in 1:ny){APsp[i,]=APsp[i,]-APsp[ny+1,]} if(nlags==1){glm3=glm(y~APsp[1:ny,]+metsp+Time+dow,family='quasipoisson',subset=subset)} if(nlags>1){glm3=glm(y~APsp[1:ny,]+AP0[,2:nlags]+metsp+Time+dow,family='quasipoisson',subset=subset)} # # reconstructed C-R curve and confidence bands # numpred=1+floor((APmax-APmin)/APint) APpred=matrix(nrow=numpred,ncol=3) APpred[,1]=APmin+(0:(numpred-1))*APint APsp1=ns(APpred[,1],knots=APknots,Boundary.knots=APboundaryknots) for(i in 1:numpred){APsp1[i,]=APsp1[i,]-APsp[ny+1,]} APpred[,2]=APsp1%*%glm3$coef[1+1:ndf3] APpred[,3]=sqrt(diag(APsp1 %*% summary(glm3)$cov.scaled[1+1:ndf3,1+1:ndf3] %*% t(APsp1))) return(list(nsub1=nsub1,nsub2=nsub2,lrt=lrt,glmfit=glm3,glm1=glm1,subset1=subset1,subset2=subset2,AP=AP,APref=APref,APpred=APpred)) } } # # Optional plotting function: f1 is returned list from fitTS # plotTS=function(f1){ par(mfrow=c(1,1),cex=1.5) plot(f1$AP[f1$subset1],f1$glm1$resid,xlab='Air Pollution Variable',ylab=list('Residual from Met Model',col='blue'),type='n') mtext('Change in Risk Compared With Reference Level',side=4,col='red',cex=1.5,line=0.5) points(f1$AP[f1$subset1],f1$glm1$resid,pch=20,col='blue',cex=0.6) lines(f1$APpred[,1],f1$APpred[,2],lw=5,col='red') lines(f1$APpred[,1],f1$APpred[,2]+1.96*f1$APpred[,3],lty=2,lw=3,col='red') lines(f1$APpred[,1],f1$APpred[,2]-1.96*f1$APpred[,3],lty=2,lw=3,col='red') lines(f1$AP,rep(0,length(f1$AP)),lw=3) lines(c(f1$APref,f1$APref),c(-1000,1000),lw=3) }