### Part I: PRELIMINARY MATERIAL
### Please click through this BEFORE the practical, using RStudio


### Read in data

 data(galaxies, package="MASS")
 data(rock)

### Examine data sets
 
 galaxies
 ??galaxies  # looks for help file of an imported data set   
 mode(galaxies)
 length(galaxies)
 
 rock
 ?rock
 mode(rock)
 dim(rock)
 names(rock)

 is.data.frame(galaxies)
 is.data.frame(rock)


### Extract, say,
  # 5th ovservation from a vector
  galaxies[5]
  # 2nd row from a data frame (or matrix)
  rock[2,]
  # 1st column from a data frame
  rock[,1] 
  rock$area
  # the element in the 2nd row and first colum 
  rock[2,1]
  # subsets of interest, such as all cases for which perimeter > 200
  rock[which(rock$peri>2000),]
  # the data frame orderd wrt to a particular variable, say perimeter
  rock[order(rock$peri),]


### Know your working directory 
  getwd()
  ls()
  # Save stuff in working directory
  save.image("mixture.RData")
  # close R, and open up again
  load("mixture.RData")


### Basic programming
  # if-then, for, while as usual --- see handout (PDF)
  # Important functionality of R: apply  
  apply(rock,2,max)
  # gives the maximum over each colum of rock

### Data display 

  hist(galaxies, breaks=20)

  plot(rock$peri, rock$shape)
  # or
  plot(rock[,2:3])

  par(mfrow=c(2,1))
  hist(rock$peri)
  hist(rock$shape)

 
### Part II:  ACTUAL PRACTICAL
### Implementation of univariate Gaussian mixture

# Complete the E-step, M-step and EM-algorithm below.

# For the purpose of this exercise, please work with a single, constant, sigma, which is the same over all components.

# The given pieces of code are only suggestions; feel free to come up with different (or more efficient) solutions! 

estep<- function(dat, p, mu, sigma){
    # note: dat, p and mu are vectors but sigma is a scalar.
    # we use notation "p" rather than "pi" for the mixture probabilities as they can conflict with the built-in object pi=3.141593
    
   n <- length(dat)
   K <- ...
   W <- ...        #  build `empty' matrix. Check ?matrix for help.
   for (i in 1:n){
        W[i,]<- p*exp(...)/ sum(...)
      }
  return(W)
 }


mstep<- function(dat, W){
   n <- dim(W)[1]
   K <- ...
   p <- apply(W, 2, sum)/n
   mu<- ...
   
   ...

  #  There are several ways to implement sigma.  For instance, one can construct a n x K matrix, say diff, of squared differences (y_i-\mu_k)^2, and then use sum(W*diff).  Note that * is element-wise multiplication, and %*% is actual matrix multiplication    
   
   sigma <- ...
      
   return(list("p"=p, "mu"=mu, "sigma"=sigma))
}


em <- function(dat,K){
  
   # EM Starting values:
   # (Be creative: what could be sensible generic starting values,
   # in the absence of the knowledge of the true mixture parameters?) 
    
 p    <- ...
 mu    <- ...
 sigma <- ...

 s<-1
 while (...){    # define a simple stopping or convergence criterion
   W   <- estep(dat, p, mu, sigma)
   fit <- mstep(dat, W)
   p  <- fit$p
   ...
 }
 
 fit<- list("p"=p, "mu"=mu, "sigma"=sigma,  "W" =W)
 return(fit)  
}

# After completion,  apply your function on the galaxy data, and on the individual variables of the rock data. 


### PART III (SUPPLEMENTARY TOPICS -- do this if you have time)

### (A)  Read the section on Simulation given in Part III of the Handout, then complete the mssing code below to implement the simulator: 

gauss.mix.sim1<-function(K, p, mu, sigma){
   x <-runif(1)
   sim<-0
   cp<-cumsum(p) 
       k <-1
       while (x>cp[k]){
           k<-k+1
       }    
       sim <- ....
   return(sim)
 }

                                        
# gauss.mix.sim1  is a function which simulates  *one* observation.

# Extend the code and write a function gauss.mix.sim which simulates an arbitrary number, say  n, of replicates. Try your function using a value of  theta of your choice, plot the histogram,  and observe whether the result looks right.


### (B) Improve your function em so that it returns the likelihood and the disparity as additional outputs. Read the instructions in the handout before your attempt this. Then,  implement the likelihood ratio test for  testing for the number of mixture components K, following the instructions provided on the handout.