########################################################################
### File missingdat.r                                                ###
### Tom A.B. Snijders                                                ###
### Statistical Methods: Principles: Robustness                      ###
### Version November 25, 2011                                        ###
########################################################################

# This script contains an analysis of the file cancer.dat
# to illustrate concepts and procedures for missing data.

# If there is any R function used here that you do not understand,
# then look it up in the help function!!!

########################################################################
### A. The data                                                      ###
########################################################################

# Read the data and give them names
childbeh <- data.frame(read.table("child_beh.txt"))
names(childbeh) <- c("SexP", "DeptP", "AnxtP", "DeptS",
                  "AnxtS", "SexChild", "Totbpt")

# These are data collected in a study by Compas
# the effect of parental cancer on behavior problems in children,
# distributed in a paper by D. Howell.
# Howell, D.C. (2008) The analysis of missing data. 
# In Outhwaite, W. & Turner, S.
# Handbook of Social Science Methodology. London: Sage.

# Totbpt:   Total Behavior Problem T score of the child,
#           from the Achenbach Child Behavior Checklist
# SexP:     gender parent with cancer
# SexChild: gender child
# AnxtP:    anxiety score of the cancer patient
# DeptP:    depression score of the cancer patient
# AnxtS:    anxiety score of the spouse
# DeptS:    depression score of the spouse

########################################################################
### B. Inspection of the missings                                    ###
########################################################################

# How many missings are there in the variables?
colSums(is.na(childbeh))

# For visualizing the missing data pattern, we use
library(mi)
# The following plot shows missings in red, observeds in blue,
# cases ordered as in the data set
mp.plot(childbeh,clustered=FALSE)
# It is illuminating to order cases in a more meaningful way
mp.plot(childbeh,clustered=TRUE)
# This shows that for some cases, all variables are observed;
# the main patterns of missingness are
# missingness of the two child variables,
# missingness of the two spouse variables, are both.
# For a few cases, some of the patient variables are missing,
# sometimes in combination with other missings.
# In a paper or report, the number of cases with these
# missingness patterns should be reported.

# This can be obtained, for example, as follows.
# First look at the function
?countpatterns
# Now we construct the input for countpatterns
mis.ind <- matrix(as.numeric(is.na(childbeh)),dim(childbeh))
# And we count the patterns
patterns <- countpattern(mis.ind)
# and drop the patterns that are not found
(patterns <- patterns[patterns > 0])
# This gives the full information about the existing
# patterns of missingness in the data, and their counts.
# For example, 26 complete cases,
# 11 cases with the two spouse variables missing, etc.
# Now we also wish to find the matches 
# between cases and patterns
patmats <- countpattern(mis.ind, matching=TRUE)
# How is the pattern coded?
patmats[[2]]
# The number is 1 + the integer number of which the pattern
# is the binary representation (starting from the left).
# The important thing is that this gives a
# unique representation of all patterns.
# We add the pattern code to the data matrix
childbeh <- cbind(childbeh[1:7] , patterns = patmats[[2]])
# and make the following plot.
matrixplot(childbeh,sortby=8)
# Look at the help page:
# red = missing, greyscale represents the variable.
# The matrixplot does not seem to reveal systematic low/high values
# for the observed variables, depending on missingness of other variables.

# For the missingness mechanism, Howell writes:
# "Unfortunately, due to the timing of the first round of 
# data collection, many of the observations were missing. 
# Out of 89 cases, only 26 had complete data. 
# The good thing is that it is reasonable to assume that missingness 
# was due almost entirely to the timing of data collection 
# (different families receive a diagnosis of cancer at different times) 
# and not to the potential value of the missing values. 
# So we can assume that the data are at least MAR 
# without too much concern."

# MAR is an untestable assumption within the confines of the data set.
# Therefore always, for deciding between MAR and MNAR,
# it is necessary to use information outside of the data set,
# about the way the data were collected 
# and what caused some values to be missing.
# We proceed further under the assumption that MAR is plausible here.

########################################################################
### C. Multiple imputation and a regression model                    ###
########################################################################

# We wish to find a model predicting the Problem Behavior Score,
# depending on the other variables. 
# For this purpose, we use the method used in script misdat.r.
# For explanation, we refer to that script.

# The package used is
library(norm)

# We drop the pattern identifiers from the data
d <- childbeh[1:7]

# Now something strange is going to happen.
# We assume multivariate normality but two variables are dichotomous.
# This is done to keep things relatively simple,
# but there is evidence that this works quite well; 
# see Graham (2009, p. 562-563).

# This package requires some preliminary calculations
# (see the help page for em.norm)
pre <- prelim.norm(as.matrix(d))
# The MLE (fiml again) now is calculated by
xhat <- em.norm(pre)
# This has not directly the form of a parameter vector;
# the parameters are returned by requesting
getparam.norm(pre,xhat)
# but this is not directly important for us now.
# A next step is the initialisation of the random number generator,
# see the help page for imp.norm where this is documented.
ru      <- trunc(1000000*runif(1) + 10)
rngseed(ru)

# The main function we use computes imputation of the missing values,
# based on the model of a multiivariate normal distribution,
# with parameters xhat.
# (See the remarks above about assuming normality,
# although some variables are binary.)
# First get one imputation to illustrate the operation of the function.
imp <- imp.norm(pre,xhat,d)
# To get some further impression, we plot 
# the imputed and observed values of variables DeptS and AnxtS,
# which have a lot of missings.
# For plotting, define the colors
# and plot
color <- as.numeric(is.na(d[4]+d[5]))+1
# and plot
plot(imp[,4],imp[,5], col=color)
# It seems that some of the imputed values are rather low.
# This would merit further investigation
# (but we do not pursue this here).

# However, we need more imputations.
# Therefore we define a function that makes an arbitrary number
# of imputed data sets, arranged in a list.
multi.impute <- function(pre, est, dat, nimp){
     implist <- list()
     for (i in 1:nimp){implist[[i]] <- data.frame(imp.norm(pre,est,dat))}
     implist
}

# We now execute this function
mimp <- multi.impute(pre,xhat,d,20)
# This gives us 20 imputed data sets.
# We first define another function
# to get the estimates and standard errors from the data sets.
lm.results <- function(data.set, formula){
      lm0 <- lm(formula, data=data.set)
      c(coefficients(lm0), sqrt(diag(vcov(lm0))))
      } 
# The use of "formula" implies that we can use this
# for any formula that uses the variables in the data set.
# For example, we could go on and explore interactions.
# To see what this function gives, we try it out for one data set:
lm.results(mimp[[1]],
     Totbpt ~ SexP + DeptP + AnxtP + DeptS + AnxtS + SexChild)
# The first seven numbers are the estimates, 
# the last seven the standard errors.
# Now we apply this function to the 20 imputed data sets
all.results <- sapply(mimp, function(d){lm.results(d,
     Totbpt ~ SexP + DeptP + AnxtP + DeptS + AnxtS + SexChild)})
# What have we got?
dim(all.results)
# Recall that the first seven numbers are estimates,
# the last seven standard errors.
# Let us look at one imputed data set
all.results[,1]

# Take out of this the coefficients and standard errors:
coefs <- t(all.results[1:7,])
stdes <- t(all.results[8:14,])
# Is the variation between the estimates from the imputed data sets
# reasonable? This can be gauged by listing the 20 estimates,
# with the number of missings per variable,
# and the average standard deviations.
coefs
colSums(is.na(d))
colMeans(stdes)
# The variation in the estimates seems in line
# with the average standard errors.

# Now we apply Rubin's formulae for combining the imputations.
# estimates
(estms  <- colMeans(coefs))
# between-imputation variances
(B      <- diag(cov(coefs)))              
# average within-imputation variances
(W      <- colMeans(stdes^2))             
# number of replications
(m      <- dim(coefs)[1])
# Rubin's formula for the overall standard error
(tot_se <- sqrt(W+(1+1/m)*B))
# fraction missing information
(fr_mis <- (1+1/m)*B/(W+(1+1/m)*B))

# The t-values for testing the seven parameters
estms/tot_se

# We can compare these results with the naive solution
# obtained by analysing only complete cases:
lm.d <- lm(  Totbpt ~ SexP + DeptP + AnxtP + DeptS + AnxtS + SexChild, data=d)
# Compare the estimates:
estms
coefficients(lm.d)
# the standard errors:
tot_se
sqrt(diag(vcov(lm.d)))
# the t-statistics:
estms/tot_se
coefficients(lm.d)/sqrt(diag(vcov(lm.d)))
# The results are different but not totally contradictory, 
# and the main difference # seems to be 
# that the standard errors are considerably smaller
# for the multiple imputation solution.
# Also the gender of the patient has a significant effect
# for the multiple imputation solution:
# an ill mother has worse consequences for the child than an ill father.