########################################################################
### File misdat.r                                                    ###
### Tom A.B. Snijders                                                ###
### Statistical Methods: Principles: Robustness                      ###
### Version November 25, 2011                                        ###
########################################################################

# This script contains a number of simulation experiments
# to illustrate concepts and procedures for missing data.

# If there is any R function used here that you do not understand,
# then look it up in the help function!!!

########################################################################
### A. The start                                                     ###
########################################################################

# First we generate a sample from a bivariate normal distribution.
# We set the same random seed so that everybody gets the same results.
# When you redo this exercise, you might use a different random seed.
set.seed(1234)
# Generate a random sample x0
x0 <- rnorm(100,125,25)
# Generate y0 with a linear regression on x0
y0 <- 0.6*(x0-125) + rnorm(100, 125, 20)
# Now (x0,y0) has a bivariate normal distribution.
plot(x0,y0)
# From the commands above, compute the expected value
# and covariance matrix of (x0, y0), and their correlation,
# in the normal population from which they were drawn.
# Now the sample values:
var(y0)
var(x0)
mean(y0)
mean(x0)
cor(x0,y0)
# Pretty close.
# Linear models both ways:
summary(lm(y0 ~ x0))
summary(lm(x0 ~ y0))
# Had you expected the outcomes of these two linear models
# to be so close to each other?

########################################################################
### B. MAR data                                                      ###
########################################################################

# Now construct a reduced data set
x <- x0
y <- y0
# No observations for second time point if first time point was less than 130:
y[x < 130] <- NA
plot(x,y)
# This is MAR! (Why???)
cor(x,y)
# This does not work.
?cor
# Here we can read that we may try
cor(x,y, use = "complete.obs")
# How many complete observations are there?
sum(!is.na(y))
summary(lm1 <- lm(y~x))
summary(lm2 <- lm(x~y))
# You must be able to explain,
# why the linear model y~x still gives an answer in line with
# the bivariate normal distribution for (x0, y0),
# but the linear model x~y does not.

# What are the observed means and covariance matrix?
mean(x)
mean(y, na.rm=TRUE)
# For the covariance matrix, two different options:
cov(cbind(x,y), use = "complete.obs")
cov(cbind(x,y), use = "pairwise.complete.obs")
# oops.

# Since the missing data mechanism is MAR, we can do
# full information maximum likelihood and obtain a good result.

library(mvnmle)
# What does function mlest do?
?mlest
mlest(cbind(x,y))
# The estimated variance
mlest(cbind(x,y))$sigmahat[2,2]
# of y is quite different from the observed value
var(y, na.rm=TRUE)
# and is a good estimate, unlike the observed variance.
# For x, which has no missings, the observed variance
var(x)
# and the mle
mlest(cbind(x,y))$sigmahat[1,1]
# are near each other, but still different.
# Explain why they are different!
# The estimated correlation is
sighat <- mlest(cbind(x,y))$sigmahat
sighat[1,2]/sqrt(sighat[1,1]*sighat[2,2])
# quite a bit closer to the true value (what is the true value here?)
# than the correlation between the observed values
cor(x,y, use = "complete.obs")

########################################################################
### C. MNAR data                                                     ###
########################################################################

# Now construct a different reduced data set
x1 <- x0
y1 <- y0
# Truncate observations for second time point:
y1[y1 < 130] <- NA
plot(x1,y1)
# This is MNAR! (Why???)
cor(x1,y1, use = "complete.obs")
# How many complete observations are there?
sum(!is.na(y1))
summary(lm3 <- lm(y1~x1))
summary(lm4 <- lm(x1~y1))
# You must be able to explain,
# why the linear model x1~y1 now gives an answer in line with
# the bivariate normal distribution for (x0, y0),
# but the linear model y1~x1 does not.

# What are the observed means and covariance matrix?
mean(x1)
mean(y1, na.rm=TRUE)
# For the covariance matrix, two different options:
cov(cbind(x1,y1), use = "complete.obs")
cov(cbind(x1,y1), use = "pairwise.complete.obs")

mlest(cbind(x1,y1))
# The estimated correlation is
sighat1 <- mlest(cbind(x1,y1))$sigmahat
sighat1[1,2]/sqrt(sighat1[1,1]*sighat1[2,2])
# hardly closer to the true value (what is the true value here?)
# than the correlation between the observed values
cor(x1,y1, use = "complete.obs")

# Thus we have seen an illustration
# that "fiml" gives good results for MAR data but not for MNAR data,
# even though superficially (if you know nothing about the data)
# the missingness pattern does not seem worse for the MNAR data set.

########################################################################
### D. Visualization of missing data patterns                        ###
########################################################################

# For some visualization we use package
library(VIM)
# 
?VIM
d <- data.frame(x,y)
marginplot(d)

d1 <- data.frame(x1,y1)
marginplot(d1)

########################################################################
### E. Multiple imputation : MAR                                     ###
########################################################################

# Here we use package
library(norm)

# This package requires some preliminary calculations
# (see the help page for em.norm)
pre <- prelim.norm(as.matrix(d))
# The MLE (fiml again) now is calculated by
xhat <- em.norm(pre)
# This has not directly the form of a parameter vector;
# the parameters are returned by requesting
getparam.norm(pre,xhat)
# A next step is the initialisation of the random number generator,
# see the help page for imp.norm where this is documented.
ru      <- trunc(1000000*runif(1) + 10)
rngseed(ru)

# The main function we use computes imputation of the missing values,
# based on the model of a bivariate normal distribution,
# with parameters xhat.
# First get one imputation to illustrate the operation of the function.
imp <- imp.norm(pre,xhat,d)
# For plotting, define the colors and plot
color <- as.numeric(is.na(d[2]))+1
plot(imp, col=color)
# Note that the shape of this cloud of points is determined
# by the black points and the X-values of the red points!
# The shape is quite reasonable indeed,
# given our original bivariate normal distribution.

# However, we need more imputations.
# Therefore we define a function that makes an arbitrary number
# of imputed data sets, arranged in a list.
multi.impute <- function(pre, est, dat, nimp){
     implist <- list()
     for (i in 1:nimp){implist[[i]] <- data.frame(imp.norm(pre,est,dat))}
     implist
}

# We now execute this function
mimp <- multi.impute(pre,xhat,d,20)
# This gives us 20 imputed data sets.
# We first define another function
# to get the estimates and standard errors from the data sets.
lm.results <- function(data.set, formula){
      lm0 <- lm(formula, data=data.set)
      c(coefficients(lm0), sqrt(diag(vcov(lm0))))
      } 
# The use of "formula" implies that we can use this
# for any formula that uses the variables in the data set.
# To see what this gives, we try it out for one data set:
lm.results(mimp[[1]],x~y)
# The first two numbers are the estimates, 
# the last two the standard errors.
# Now we apply this function to the 20 imputed data sets.
all.results <- sapply(mimp, function(d){lm.results(d,x~y)})
# What have we got?
dim(all.results)
# Take out of this the coefficients and standard errors:
coefs <- t(all.results[1:2,])
stdes <- t(all.results[3:4,])
plot(coefs)
# This shows a very strong correlation between the two estimates!
# You can understand this by making a plot of a single data set,
# and considering the intercept and the slope 
# of the least squares line.
# Thus, the strong correlation is not alarming;
# it is related to the large standard error of the intercept.
# Now we apply Rubin's formulae for combining the imputations.
# estimates
(estms  <- colMeans(coefs))
# between-imputation variances
(B      <- diag(cov(coefs)))              
# average within-imputation variances
(W      <- colMeans(stdes^2))             
# number of replications
(m      <- dim(coefs)[1])
# Rubin's formula for the overall standard error
(tot_se <- sqrt(W+(1+1/m)*B))
# fraction missing information
(fr_mis <- (1+1/m)*B/(W+(1+1/m)*B))

# In the first place, we conclude that the estimated coefficients
# are in line with what we know about the population parameters.
# This differs from the results for the complete cases.
# Since the missingness mechanism is MAR,
# we indeed expect that the parameters can be estimated well,
# provide we use a suitable method, such as multiple imputation.

# Second, the fraction of missing information is between 0.26 and 0.28,
# while in X we have no missings, and in Y we have a proportion of
sum(is.na(y))/length(y)
# missing values.
# The data set contains as much information as a data set without missings,
# of a sample size that is (1 - fr_mis) times as large
# as the current data set; in other words, a sample size of about 73.

########################################################################
### F. Multiple imputation : MNAR                                    ###
########################################################################

# Now we apply everything to the data set
# that was constructed by a MNAR mechanism.
pre1 <- prelim.norm(as.matrix(d1))
xhat1 <- em.norm(pre1)
mimp1 <- multi.impute(pre1,xhat1,d1,20)
# Note that now the variables are called x1 and y1
all.results1 <- sapply(mimp1, function(d){lm.results(d,y1~x1)})
coefs1 <- t(all.results1[1:2,])
stdes1 <- t(all.results1[3:4,])
plot(coefs1)
# This plot already shows that ALL imputed data sets
# have estimates that are far off.
# Let us look at one imputed data set:
color1 <- as.numeric(is.na(y1))+1
plot(mimp1[[1]], col=color1)
# it looks quite different from the original data set.

# Anyway, here we go with Rubins' formulae:
# Estimates
(estms1  <- colMeans(coefs1))
# between-imputation variances
(B1      <- diag(cov(coefs1)))              
# average within-imputation variances
(W1      <- colMeans(stdes1^2))             
# number of replications
(m      <- dim(coefs1)[1])
# Rubin's formula for the overall standard error
(tot_se1 <- sqrt(W1+(1+1/m)*B1))
# fraction missing information
(fr_mis1 <- (1+1/m)*B1/(W1+(1+1/m)*B1))
# For the MNAR data set, multiple imputation
# does not yield results close to the population.