### Part I ### Read in data data(galaxies, package="MASS") data(rock) ### Examine data set galaxies ?galaxies mode(galaxies) length(galaxies) rock ?rock mode(rock) dim(rock) names(rock) is.data.frame(galaxies) is.data.frame(rock) ### Extract, say, # 5th ovservation from a vector galaxies[5] # 2nd row from a data frame (or matrix) rock[2,] # 1st column from a data frame rock[,1] rock$area # the element in the 2nd row and first colum rock[2,1] # subsets of interest, such as all cases for which perimeter > 200 rock[which(rock$peri>2000),] # the data frame orderd wrt to a particular variable, say perimeter rock[order(rock$peri),] ### Know your working directory getwd() ls() # Save stuff in working directory save.image("mixture.RData") # close R, and open up again load("mixture.RData") ### Basic programming # if-then, for, while as usual --- see hadndout # Important peculiarity of R: apply apply(rock,2,max) # gives the maximum over each colum of rock ### Data display hist(galaxies, breaks=20) plot(rock$peri, rock$shape) # or plot(rock[,2:3]) par(mfrow=c(2,1)) hist(rock$peri) hist(rock$shape) ### Part II estep<- function(dat, p, mu, sigma){ n <- length(dat) K <- length(mu) W <- matrix(0, n,K) for (i in 1:n){ W[i,]<- p*exp(-1/(2*sigma^2)*(dat[i]-mu)^2)/ sum( p*exp(-1/(2*sigma^2)*(dat[i]-mu)^2)) } return(W) } #or, alternatively: estep<- function(dat, p, mu, sigma){ n <- length(dat) K <- length(mu) W <- matrix(0, n,K) for (i in 1:n){ for (k in 1:K){ W[i,k]<- p[k]*exp(-1/(2*sigma^2)*(dat[i]-mu[k])^2)/ sum( p*exp(-1/(2*sigma^2)*(dat[i]-mu)^2)) } } return(W) } mstep<- function(dat, W){ n <- dim(W)[1] K <- dim(W)[2] p <- apply(W,2,sum)/n mu<- apply(W*dat,2,sum)/apply(W,2,sum) diff<-matrix(0,n, K) for (k in 1:K){ diff[,k]<- (dat -mu[k])^2 } var <- 1/n* sum(W*diff) sigma <- sqrt(var) return(list("p"=p, "mu"=mu, "sigma"=sigma)) } # or, alternatively mstep<- function(dat, W){ n <- dim(W)[1] K <- dim(W)[2] p <- apply(W,2,sum)/n mu <- rep(0,K) for (k in 1:K){ mu[k] <- sum(W[,k]*dat)/sum(W[,k]) } var <- 0 for (i in 1:n){ for (k in 1:K){ var<- var+ W[i,k]*(dat[i]-mu[k])^2 } } sigma <- sqrt(var/n) return(list("p"=p, "mu"=mu, "sigma"=sigma)) } em <- function(dat,K){ steps <- 400 # arbitrary 'sufficiently large' number p <- rep(1/K,K) mu <- quantile(dat, probs= (1:K)/K-1/(2*K)) sigma <- sd(dat) s<-1 while (s <=steps){ W <- estep(dat, p, mu, sigma) fit <- mstep(dat, W) p <- fit$p mu <- fit$mu sigma <-fit$sigma s<-s+1 } fit<- list("p"=p, "mu"=mu, "sigma"=sigma, "W" =W) return(fit) } test1 <- em(galaxies, K=4) # Visualization: par(mfrow=c(1,1)) hist(galaxies, breaks=20, freq=FALSE) x<-seq(-12000,36000, length=2001) for (j in 1:4){ lines(x, test1$p[j]*dnorm(x, test1$mu[j], test1$sigma), col=j) } test2<- em(rock$peri, K=2) round(test2$W, digits=4) ### Part III # 5a) gauss.mix.sim1<-function(K, p, mu, sigma){ x <-runif(1) sim<-0 cp<-cumsum(p) k <-1 while (x>cp[k]){ k<-k+1 } sim <- rnorm(1,mu[k],sigma) return(sim) } gauss.mix.sim1(2, c(0.5,0.5), c(1/4,3/4),1/2) # n observations gauss.mix.sim<-function(n, K, p, mu, sigma){ x<-runif(n) sim<-rep(0,n) cp<-cumsum(p) for (i in 1:n){ k <-1 while (x[i]>cp[k]){ k<-k+1 } #k<- which((x[i]-pi)[x[i]-pi>0]== min((x[i]-pi)[x[i]-pi>0])) sim[i] <- rnorm(1,mu[k],sigma) } return(sim) } save<-gauss.mix.sim(100,2, c(0.5,0.5), c(-1,1),1/2) hist(save) # improved EM: em <- function(dat,K,tol=1, steps=400){ p <- rep(1/K,K) mu <- mean(dat)+tol*quantile(dat-mean(dat), probs= (1:K)/K-1/(2*K)) sigma <- sd(dat) s<-1 while (s <=steps){ w <- estep(dat, p, mu, sigma) fit <- mstep(dat, w) mu <- fit$mu p <- fit$p sigma <-fit$sigma s<-s+1 } dens.all<-matrix(0,length(dat), K) for (k in 1:K){ dens.all[,k]<- p[k]*dnorm(dat, mu[k],sigma) } loglik<- sum(log(apply(dens.all,1,sum))) fit<- list("pi"=pi, "mu"=mu, "sigma"=sigma, "wik" =w,"data"=dat, "disp"=-2*loglik ) class(fit)<-"em" return(fit) } # Bootstrap: # Aitkin, Francis, Hinde, and Darnell, page 442. # H_0: K components # H_1: K+1 components n <- length(galaxies) fit4 <- em(galaxies, K=4,tol=1) fit5 <- em(galaxies, K=5,tol=1) LRTS <- fit4$disp-fit5$disp M <- 99 sim.LRTS <-rep(0,M) for (m in 1:M){ sim4 <- gauss.mix.sim(n, 4, p=fit4$p, mu=fit4$mu, sigma=fit4$sigma) fit.sim4<- em(sim4,4) fit.sim5<- em(sim4,5) sim.LRTS[m]<- fit.sim4$disp-fit.sim5$disp } p <- 1-rank(c(LRTS, sim.LRTS))[1]/100 p