################################################################### ### ch10.r ### ### ### ### This is an R script for producing examples in ### ### Chapter 10 of ### ### Snijders, Tom A.B., and Bosker, Roel J. ### ### Multilevel Analysis: ### ### An Introduction to Basic and Advanced Multilevel Modeling, ### ### second edition. ### ### London etc.: Sage Publishers, 2012 ### ### ### ### version April 4, 2014 ### ### ### ################################################################### # Read data mlbook_red <- read.table("mlbook2_r.dat", header=TRUE) # what are the names names(mlbook_red) # Note that school means can be calculated by functions such as # schmeans <- aggregate(mlbook_red, by = list(mlbook_red$schoolnr), mean) # But this is not necessary here because the school means # are already in the data set. library(nlme) ################################################################### ### ### ### Within - group OLS residuals ### ### ### ################################################################### # First we estimate Model 1 of Table 8.1. summary(mod1 <- lme(langPOST ~ IQ_verb*ses + sex + sch_iqv*sch_ses, random = ~ IQ_verb|schoolnr, data = mlbook_red, method="ML")) VarCorr(mod1) # Next we calculate OLS regressions per school class. # To be able to use the compareFits function, # we do keep the school-level variables, although they have # no sense for the OLS regressions. olselist <- lmList(langPOST ~ IQ_verb*ses + sex + sch_iqv*sch_ses| schoolnr, data=mlbook_red) # The compareFits function compares the OLS estimates with the # posterior means of the multilevel model. # We do this only for the level-1 effects. comp.models <- compareFits(coef(olselist), coef(mod1), c(1,2,3,4,7)) plot(comp.models) # The plots shows the much greater variability of the # OLS estimates. # We see that there are a few OLS outliers. # For further use, we wish to have the within-group residual d.f. available # First see how olselist is structured: str(olselist, 1) # Apparently it is a list of results for each of the groups. str(olselist[[1]], 1) # This shows that we can get get the within-group residual d.f. as follows: df_res <- sapply(olselist, function(x){x$df.residual}) table(df_res) # To study the outliers for, e.g., the effect of sex, # the plot of comp.models shows that the value -10 may be used to # separate outliers from the rest. These schools are flagged by low.sex.effect <- which(comp.models[,1,"sex"] < -10) # Are these particularly small groups? df_res[low.sex.effect] # Yes indeed, they are. This may well account for # the deviating effects of sex. # The regression results for these groups are listed through comp.models[low.sex.effect,,] # The first group (labeled 10) has particularly deviant results, # but it has df_res = 0, # so the precision is indeed expected to be very poor. # The other schools also have so small df_res, # that these outliers seem not to be very bothering. # Further down we continue with the study of the OLS residuals. # Example 10.1 # Make a selection of the schools with d.f. at least 10 use <- df_res >= 10 # The number of such schools is sum(use) # Now we need to get the within-group residual standard deviations # To see how we get this, first study ?summary.lm # and search for the word "variance". # For the first group, residual s.d. is summary(olselist[[1]])$sigma # This shows that we can compute sigma2_res <- sapply(olselist, function(x){(summary(x)$sigma)^2}) # Make a plot of residual variance against residual d.f.: plot(df_res, sigma2_res) # There are some outliers for the very low residual d.f. # Formula (10.3) ls_tot <- sum((df_res*log(sigma2_res))[use])/sum(df_res[use]) d <- (sqrt(0.5*df_res))*(log(sigma2_res) - ls_tot) (H <- sum((d^2)[use])) # The associated p-value is 1-pchisq(H, sum(use)-1) qqnorm(d) # Find the outliers outliers <- which(abs(d) > 3) df_res[outliers] d[outliers] # Example 10.2 # The within-group OLS residuals are obtained by resi <- residuals(olselist) # We illustrate only the dependence of langPOST on IQ_verb. # First let us plot the raw variables with(mlbook_red,plot(IQ_verb,langPOST)) # This is not optimally informative, because there are many plot symbols # that represent many cases, and the number (multiplicity) of cases # is not visualized. # To visualize this multiplicity, we may use with(mlbook_red,plot(jitter(IQ_verb),jitter(langPOST))) with(mlbook_red,lines(lowess(IQ_verb,langPOST, f=.2), lwd=3)) # This figure suggests that for IQ_verb roughly between -1 and +1, # the slope is higher than outside of this interval. # However, these plots do not control for the other variables, # so this is not something that leads directly to # modification of the model. # Therefore we now make similar plots for the residuals. with(mlbook_red,plot(jitter(IQ_verb),jitter(resi))) with(mlbook_red,lines(lowess(IQ_verb,resi, f=.2), lwd=3)) # It's a bit black there in the middle! # As on pages 121-123 and 164, we include two quadratic spline functions: temp1 <- mlbook_red$IQ_verb^2 temp2 <- (abs(mlbook_red$IQ_verb))*mlbook_red$IQ_verb iq2.plus <- 0.5*(temp1+temp2) iq2.min <- 0.5*(temp1-temp2) # As a check: plot(mlbook_red$IQ_verb, iq2.plus) plot(mlbook_red$IQ_verb, iq2.min) # Now we estimate the model to which the two spline effects are added. # The number of iterations must be allowed to be larger than the default; # hence the "control" parameter. summary(mod5 <- lme(langPOST ~ IQ_verb*ses + sex + sch_iqv*sch_ses + iq2.plus + iq2.min, random = ~ IQ_verb|schoolnr, data = mlbook_red, method="ML", control=list(msMaxIter=100))) # Compute the within-group OLS residuals: mlbook_plus <- cbind(mlbook_red,iq2.min,iq2.plus) olselist5 <- lmList(langPOST ~ IQ_verb*ses + sex + iq2.plus + iq2.min | schoolnr, data=mlbook_plus) resi5 <- residuals(olselist5) with(mlbook_red,plot(jitter(IQ_verb),jitter(resi5))) with(mlbook_red,lines(lowess(IQ_verb,resi5, f=.2), lwd=3)) # Even though it still is black in the middle, # the lowess curve looks pretty flat indeed. # Now we wish to study standardized within-group OLS residuals. # We do this using "by", if necessary look up the help function # ?by # to understand the following construction. # First define a function that gives the studentized OLS residuals # for this model in a given data frame: res_st5 <- function(x){ rstudent(lm(langPOST ~ IQ_verb*ses + sex + iq2.plus + iq2.min, data=x)) } # Compute within-school studentized OLS residuals resi_st <- by(mlbook_plus, mlbook_red$schoolnr, res_st5) # Find out the structure of this object, and try to understand # why we use the following transformation rs <- unlist(resi_st) # Some of the residuals are NA or exactly 0. sum(is.na(rs)) sum(rs[!is.na(rs)]==0) # This is because of the presence of small groups. # However, it is unclear why unit 1250, with schoolnr=94, # has a NaN standardized residual. # Perfect collinearity of the other elements in this group??? # We leave this question aside. # To make a QQ plot, we use the vector defined above # and use the schools that are large enough: school.match <- match(1:max(mlbook_red$schoolnr),unique(mlbook_plus$schoolnr)) qqnorm(rs[use[school.match[mlbook_plus$schoolnr]]]) ################################################################### ### ### ### Residuals at level two ### ### ### ################################################################### # Example 10.3 # Here we are going to use package lme4. detach("package:nlme") library(lme4) (mod103 <- lmer(langPOST ~ IQ_verb*ses + sex + sch_iqv*sch_ses + iq2.plus + iq2.min + (IQ_verb|schoolnr), data = mlbook_red)) re.mod103 <- ranef(mod103, condVar=TRUE, standard=TRUE) # The postVar parameter will also give the posterior variances. # What is the structure of this object? str(re.mod103) # The posterior means are postmean <- re.mod103$schoolnr[,1] postslope <- re.mod103$schoolnr$IQ_verb # and the posterior variances are postmeanvar <- attr(re.mod103$schoolnr,'postVar')[1,1,] postslopevar <- attr(re.mod103$schoolnr,'postVar')[2,2,] # These are the comparative variances, cf. Section 4.8. # Note that the parameters of the random part are given by VarCorr(mod103) # The diagnostic variance is calculated using (4.18): diagmeanvar <- VarCorr(mod103)$schoolnr[1,1] - postmeanvar diagslopevar <- VarCorr(mod103)$schoolnr[2,2] - postslopevar # The school-level variables in the data frame can be # taken from the pupil level (3758 units) to the # school level (211 units) as follows: school.which <- match(unique(mlbook_plus$schoolnr),mlbook_plus$schoolnr) # Check what this is: school.which # the first pupil in each school; another check: mlbook_plus$schoolnr[school.which] # Using this, the vector of IQ averages per school is IQ.school <- mlbook_plus$sch_iqv[school.which] # Now we can make a plot like figure 10.4: plot(IQ.school, postmean) lines(lowess(IQ.school, postmean)) # and figure 10.5: plot(IQ.school, postslope) lines(lowess(IQ.school, postslope)) # Finally, figure 10.6: postmean.stand <- postmean/sqrt(diagmeanvar) qqnorm(postmean.stand) ################################################################### ### ### ### Influence of level-two units ### ### ### ################################################################### # We shall need to invert matrices, and therefore attach library(MASS) # Above we estimated model mod103. The fixed effects are in (fe <- fixef(mod103)) # with covariance matrix (vcov(mod103)) # Making the calculations for Section 10.7 requires a bit of patience, # but it is just a matter of going through the formulae # and finding what you need in the object returned by lmer. # In the following we do not exactly follow the technique in the book, # so that the numbers produced will be slightly different. # In the first place, we only consider the influence statistics # for the parameters in the fixed part. # Second, we use the REML estimates for the deletion statistics # rather than the one-step estimates. # Since the calculations take only a couple of minutes, # there is no need to use the less precise one-step estimates. # First we prepare the vectors to be filled # (as a check, we start filling them with negative numbers; # note that the formulae used must lead to positive numbers). cFj <- rep.int(-99, 211) # this will be the influence statistic nj <- rep.int(-99, 211) # this will be the number of cases st.mult.res <- rep.int(-99, 211) # this will be the standardized multivariate residual # Next we shall need the design matrix for the fixed effects. # See # ?getME design.matrix <- getME(mod103, "X") # Understand what this is: dim(design.matrix) design.matrix[1:20,] # We calculate the deletion statistics in a long loop. # Before executing it, first go through it to understand what it does. # Note that the closing brace is almost 30 lines down. for (i in 1:211){ thisgroup <- mlbook_plus$schoolnr == mlbook_plus$schoolnr[school.which][i] nj[i] <- sum(thisgroup) thisframe <- mlbook_plus thisframe$langPOST[thisgroup] <- NA thismod <- lmer(langPOST ~ IQ_verb*ses + sex + sch_iqv*sch_ses + iq2.plus + iq2.min + (IQ_verb|schoolnr), data = thisframe) # formula (10.6) cFj[i] <- (t(fixef(thismod)-fe) %*% ginv(as.matrix(vcov(thismod))) %*% (fixef(thismod)-fe))/length(fe) # formula above (10.9) fit.group.i <- design.matrix[thisgroup,] %*% fixef(thismod) # formula (10.9) dif.group.i <- mlbook_plus$langPOST[thisgroup] - fit.group.i # the level-1 variance var.lev1 <- attr(VarCorr(thismod),"sc")^2 # estimated covariance matrix of dependent variable for this group; # this is the matrix equation for formulae (5.5) and (5.6) estvar <- cbind(1,mlbook_plus$IQ_verb[thisgroup]) %*% VarCorr(thismod)$schoolnr %*% rbind(1,mlbook_plus$IQ_verb[thisgroup]) + diag(var.lev1, nrow=length(fit.group.i)) # formula (10.10) st.mult.res[i] <- t(dif.group.i) %*% ginv(estvar) %*% dif.group.i # Report to the screen as a confirmation that something still is going on; # you will have to wait for 211 times "." cat(".") flush.console() } # Now sort the influence statistics. # Request $sort.int to get the meaning of ...$ix cFj.sort <- sort.int(cFj, decreasing=TRUE, index.return=TRUE) cFj.sorted <- cbind(cFj.sort$x, cFj.sort$ix, mlbook_plus$schoolnr[school.which[cFj.sort$ix]]) # The following is the analogue of Table 10.1, # but now for the fixed parameters only. # Note the meaning of the 2nd and 3d columns. cFj.sorted[1:20,] # The correspondence with Table 10.1 is pretty good. # To this overview we may add the standardized multivariate residuals influence.sorted <- cbind(cFj.sort$x, st.mult.res[cFj.sort$ix], nj[cFj.sort$ix], 1-pchisq(st.mult.res[cFj.sort$ix],nj[cFj.sort$ix]), cFj.sort$ix, mlbook_plus$schoolnr[school.which[cFj.sort$ix]]) influence.sorted[1:20,]