# generic code for fitting of time series models in epidemiology
#
# This version posted by Richard L. Smith: March 16, 2015
#
# Some of this is adapted from previously published code at 
# http://www.ihapss.jhsph.edu/
#
# first run this function
numdf <- function(vec,df=5)
# Return a discounted df (based on 12 consecutive days of missings)
{
   n<-length(vec)
   ll  <- round(length(vec)/12)
   vecnew <- vec[1:(ll*12)]     # eliminate 'not a multiple' warning message
   mat <- matrix(vecnew, ncol=12,byrow=T)
   m   <- sum(ceiling(apply(mat,1,sum,na.rm=T)/12)) 
   df  <- round(12*m/365.25*df   )   
   return(max(1,df))
}
# variant on standard natural splines function: eliminate duplicate values before calculating knots
ns1 = function(x,df){
knots=quantile(unique(x),(1:(df-1))/df,na.rm=T)
return(ns(x,knots=knots))
}
# now main function
fitTS=function(y,met,AP,APmin,APmax,APint,APref,lags,ndf0,ndf1,ndf2,ndf3){
#
# fit time series model to a series of daily death counts y
#
# y is a time series of length ny
#
# met is a ny x nmet matrix of met observations
#
# AP is a univeriate time series of air pollution observations
# 
# Next four parameters are for plot: estimate response for AP levels from APmin to APmax at intervals APint,
#   using APref as the reference level.
#
# ndf0 is DF per year for long-term trend
#
# ndf1 and ndf2 are DF for current day and for mean of 3 previous days on the met variables
# 
# ndf3 is the number of DF for the concentration-response function:
#
#   If ndf3=1, fit a linear C-R curve
#   If ndf3>1, fit splines with ndf3 knots
#   If ndf3>1 and length(lags)>1, model fit is of distributed lag structure but nonlinear in leading term
#
# structure of model is motivated by NMMAPS papers such as Bell et al (JAMA, 2004), 
# Bell, Peng and Dominici (Env Health Persp 2006), Smith et al (Inhalation Toxicology, 2009).
#
ny=length(y)
nmet=length(met[1,])
nlags=length(lags)
# define 3-day-average-lags for met data
met1=met
for(i in 1:3){met1[i,]=rep(NA,nmet)}
for(i in 4:ny){for(j in 1:nmet){met1[i,j]=(met[i-1,j]+met[i-2,j]+met[i-3,j])/3}}
# calculate lagged values of air pollution variable as a matrix AP0
AP0=matrix(nrow=ny,ncol=nlags)
for(i in 1:ny){for(j in 1:nlags){if(i>lags[j]){AP0[i,j]=AP[i-lags[j]]}}}
# search for NA values
#
# initial version based only on met variables
subset=!is.na(y+met%*%rep(1,nmet)+met1%*%rep(1,nmet))
nsub1=sum(subset) # number of non-missing values for "Met" analysis
subset1=subset
#
# Define "Time" variable - spline representation of long-term trend
# This function is based on ihapss code
#
Time<-(1:ny)[subset]
df.Time = numdf(subset,ndf0)
library(splines)
   if ( round(df.Time) == 1 | round(df.Time) == 0) {
     TIME = Time
     Time = matrix(NA,ncol=1,nrow=ny)
     Time[subset,]  = TIME
     dimnames(Time) = list(NULL,paste("Time.",1,sep=""))
   }    else {
     knots  = quantile(Time, prob= (1:(round(df.Time)-1))/round(df.Time) )
     TIME   = ns(Time,knots=knots)
     Time   = matrix(NA,ncol=ncol(TIME),nrow=ny)
     Time[subset,]  <- TIME
     dimnames(Time) <- list(NULL,paste("Time.",1:ncol(TIME),sep=""))
   }
# simpler model (ignores breaks in sequence)
# Time=ns(1:ny,ndf0*round(ny/365.25))
#
# compose spline function for met
metsp=cbind(ns1(met[,1],ndf1),ns1(met1[,1],ndf2))
if(nmet>1){for(j in 2:nmet){metsp=cbind(metsp,ns1(met[,j],ndf1),ns1(met1[,j],ndf2))}}
# add day of week
dow=factor((1:ny)-7*floor((1:ny)/7))
#
# fit model with all met terms
#
glm1=glm(y~metsp+Time+dow,family='quasipoisson',subset=subset)
dev1=glm1$deviance
#
# Now drop one met term at a time
#
# lrt is the resulting set of p-values based on likelihood ratio statistic
indcol=1:length(metsp[1,])
lrt=rep(NA,2*nmet)
for(j in 1:nmet){
indcol[(ndf1+ndf2)*(j-1)+(1:ndf1)]=NA
metsp2=metsp[,!is.na(indcol)]
glm2=glm(y~metsp2+Time+dow,family='quasipoisson',subset=subset)
lrt[2*j-1]=1-pchisq(glm2$deviance-glm1$deviance,ndf1)
indcol=1:length(metsp[1,])
indcol[(ndf1+ndf2)*(j-1)+ndf1+(1:ndf2)]=NA
metsp2=metsp[,!is.na(indcol)]
glm2=glm(y~metsp2+Time+dow,family='quasipoisson',subset=subset)
lrt[2*j]=1-pchisq(glm2$deviance-glm1$deviance,ndf2)
# lrt are the p-values based on a likelihood ratio test for dropping
# one met variable at a time
indcol=1:length(metsp[1,])
}
#
# Next: add air pollution term, recompute missing values and Time 
#
subset=!is.na(y+met%*%rep(1,nmet)+met1%*%rep(1,nmet)+AP0%*%rep(1,nlags))
subset2=subset
nsub2=sum(subset)  # number of non-missing values for "AirPollution + Met" analysis
Time<-(1:ny)[subset]
df.Time = numdf(subset,ndf0)
library(splines)
   if ( round(df.Time) == 1 | round(df.Time) == 0) {
     TIME = Time
     Time = matrix(NA,ncol=1,nrow=ny)
     Time[subset,]  = TIME
     dimnames(Time) = list(NULL,paste("Time.",1,sep=""))
   }    else {
     knots  = quantile(Time, prob= (1:(round(df.Time)-1))/round(df.Time) )
     TIME   = ns(Time,knots=knots)
     Time   = matrix(NA,ncol=ncol(TIME),nrow=ny)
     Time[subset,]  <- TIME
     dimnames(Time) <- list(NULL,paste("Time.",1:ncol(TIME),sep=""))
   }
#
# recode AP0 matrix: main variable of interest in row 1
#
if(nlags>1){
AP0[,1]=AP0%*%rep(1/nlags,nlags)
for(j in 2:nlags){AP0[,j]=AP0[,j]-AP0[,1]}
}
#
# first case: linear distributed lag model
#
if(ndf3==1){
# final model include air pollution, met, day of week and long-term trend
glm3=glm(y~AP0+metsp+Time+dow,family='quasipoisson',subset=subset)
# extract coefficients and calculate relative risks
coefest=summary(glm3)$coef[2,1]
coefSD=summary(glm3)$coef[2,2]
numpred=1+floor((APmax-APmin)/APint)
APpred=matrix(nrow=numpred,ncol=3)
for(i in 1:numpred){
APpred[i,1]=APmin+(i-1)*APint
APpred[i,2]=(APpred[i,1]-APref)*coefest
APpred[i,3]=(APpred[i,1]-APref)*coefSD
}
return(list(nsub1=nsub1,nsub2=nsub2,lrt=lrt,glmfit=glm3,glm1=glm1,subset1=subset1,subset2=subset2,coefest=coefest,coefSD=coefSD,AP=AP,APref=APref,APpred=APpred))
}
#
# second case: nonlinear model over average of lags
#
if(ndf3>1){
AP1=c(AP0[,1],APref)
APsp=ns1(AP1,ndf3)
APknots=quantile(unique(AP1),(1:(ndf3-1))/ndf3,na.rm=T)
APboundaryknots=range(AP1,na.rm=T)
# center to level of APref
for(i in 1:ny){APsp[i,]=APsp[i,]-APsp[ny+1,]}
if(nlags==1){glm3=glm(y~APsp[1:ny,]+metsp+Time+dow,family='quasipoisson',subset=subset)}
if(nlags>1){glm3=glm(y~APsp[1:ny,]+AP0[,2:nlags]+metsp+Time+dow,family='quasipoisson',subset=subset)}
# 
# reconstructed C-R curve and confidence bands
#
numpred=1+floor((APmax-APmin)/APint)
APpred=matrix(nrow=numpred,ncol=3)
APpred[,1]=APmin+(0:(numpred-1))*APint
APsp1=ns(APpred[,1],knots=APknots,Boundary.knots=APboundaryknots)
for(i in 1:numpred){APsp1[i,]=APsp1[i,]-APsp[ny+1,]}
APpred[,2]=APsp1%*%glm3$coef[1+1:ndf3]
APpred[,3]=sqrt(diag(APsp1 %*% summary(glm3)$cov.scaled[1+1:ndf3,1+1:ndf3] %*% t(APsp1)))
return(list(nsub1=nsub1,nsub2=nsub2,lrt=lrt,glmfit=glm3,glm1=glm1,subset1=subset1,subset2=subset2,AP=AP,APref=APref,APpred=APpred))
}
}

#
# Optional plotting function: f1 is returned list from fitTS
#
plotTS=function(f1){
par(mfrow=c(1,1),cex=1.5)
plot(f1$AP[f1$subset1],f1$glm1$resid,xlab='Air Pollution Variable',ylab=list('Residual from Met Model',col='blue'),type='n')
mtext('Change in Risk Compared With Reference Level',side=4,col='red',cex=1.5,line=0.5)
points(f1$AP[f1$subset1],f1$glm1$resid,pch=20,col='blue',cex=0.6)
lines(f1$APpred[,1],f1$APpred[,2],lw=5,col='red')
lines(f1$APpred[,1],f1$APpred[,2]+1.96*f1$APpred[,3],lty=2,lw=3,col='red')
lines(f1$APpred[,1],f1$APpred[,2]-1.96*f1$APpred[,3],lty=2,lw=3,col='red')
lines(f1$AP,rep(0,length(f1$AP)),lw=3)
lines(c(f1$APref,f1$APref),c(-1000,1000),lw=3)
}