### On the estimation of the dispersion parameter for grouped data (Accompanying code for Talk on 23/08/18) ### Slides on request from jochen.einbeck@durham.ac.uk ### Generate some toy data x <-rbinom(100,2,0.5) intercepts<- c(0,1,2,10) p<- c(0.3,0.4, 0.2,0.1) r.intercepts<-sample(intercepts, 100, replace=TRUE, prob=p) y<- rpois(100, r.intercepts+5*x) raw.data<-data.frame(x,y) raw.data dim(raw.data) ### Fit raw data model fit<- glm(y~x, family=poisson(link="identity"), data=raw.data) summary(fit) ### Create grouped data from raw data groupmeans<- aggregate(y, by=list(x), mean) grouped.data <- cbind(groupmeans, as.numeric(table(x))) names(grouped.data)<-c("X", "Y", "W") grouped.data dim(grouped.data) ### Fit grouped data model Gfit<- glm( Y~X, weights= W, family=poisson(link="identity"), data=grouped.data) summary(Gfit) ### Fit Quasi-Poisson models qfit <- glm(y~x, family=quasipoisson(link="identity"), data=raw.data) summary(qfit) qGfit <- glm(Y~X, family=quasipoisson(link="identity"), weights=W, data=grouped.data) summary(qGfit) ### How is dispersion actually estimated?? # Implement formula in Fahrmeir & Tutz (2001): dispersion<- function(y, mhat, p, ni=rep(1, length(y))){ n<-length(y) phi<- 1/(n-p)*sum(ni*(y-mhat)^2/mhat) return(phi) } # Apply on grouped data dispersion(grouped.data$Y, Gfit$fitted, 2, grouped.data$W) # Apply on raw data dispersion(raw.data$y, fit$fitted, 2) ### Simulation study... disp<-matrix(0,100000,2) intercepts<- c(0,1,2,10) pi<- c(0.3,0.4, 0.2,0.1) jmax=100000 j<-1 while (j <=jmax){ xM <-rbinom(100,2,0.5) r.intercepts<-sample(intercepts, 100, replace=TRUE, prob=p) yM<- rpois(100, r.intercepts+5*xM) #plot(jitter(xM), jitter(yM), col=as.factor(r.intercepts)) qfitM <- try(glm(yM~xM, family=quasipoisson(link="identity")) ) grouped.dataM<- aggregate(yM, by=list(xM), mean) xnewM<-grouped.dataM[,1] ynewM<-grouped.dataM[,2] wM<-table(xM) wfitM<- try(glm( ynewM~xnewM, weights= wM, family=quasipoisson(link="identity")) ) if (class(qfitM)[[1]]!="try-error" & class(wfitM)[[1]]!="try-error"){ disp[j,1]<- dispersion(yM, qfitM$fitted, 2) disp[j,2]<- dispersion(ynewM, wfitM$fitted, 2, wM) j<-j+1 } } #load("poisson.RData") #pdf("boxplot-pois.pdf") boxplot(disp) #dev.off() # colMeans(disp) #[1] 2.751761 2.161372