########################################################################
### File LM_ResidualChecking.r                                       ###
### Tom A.B. Snijders                                                ###
### Statistical Methods: Principles: Model Checking                  ###
### Version October 31, 2011                                         ###
########################################################################

# Create independent variable
x <- 0.01*(1:100)
# Create dependent variable
y <-  5 + 2.0*x + rnorm(100,0,1)
# Analyze by a linear model
plot(x,y)
data1 <- data.frame(x,y)
(lm1 <- lm(y ~ x, data1))
summary(lm1)
plot(lm1)
# Later we shall explain the 'leverage' in the last plot.
# Cook's distance may be mentioned but not shown (see later).

# Now we are going to create a couple of aberrant data sets.
# Outlier in x-space.
xa <- x
xa[100] <- 5
ya <- y
ya[100] <-  5 + 2.0*xa[100] + rnorm(1,0,1)
plot(xa,ya)
data1a <- data.frame(x=xa,y=ya)
(lm1a <- lm(y ~ x, data1a))
summary(lm1a)

# Outlier in x&y-space.
xb <- x
xb[100] <- 5
yb <- y
yb[100] <-  5 + 0.0*xb[100] + rnorm(1,0,1)
plot(xb,yb)
data1b <- data.frame(x=xb,y=yb)
(lm1b <- lm(y ~ x, data1b))
summary(lm1b)

# Stronger outlier in x&y-space.
xbb <- x
xbb[100] <- 50
ybb <- y
ybb[100] <-  5 + 0.0*xbb[100] + rnorm(1,0,1)
plot(xbb,ybb)
data1bb <- data.frame(x=xbb,y=ybb)
(lm1bb <- lm(y ~ x, data1bb))
summary(lm1bb)

# Outlier in y-space, at the x-side.
xc <- x
yc <- y
yc[100] <-  5 - 2.0*xc[100] + rnorm(1,0,1)
plot(xc,yc)
data1c <- data.frame(x=xc,y=yc)
(lm1c <- lm(y ~ x, data1c))
summary(lm1c)

# Outlier in y-space, in the x-middle.
xd <- x
xd[100] <- 0.5
yd <- y
yd[100] <-  5 - 5.0*xd[100] + rnorm(1,0,1)
plot(xd,yd)
data1d <- data.frame(x=xd,y=yd)
(lm1d <- lm(y ~ x, data1d))
summary(lm1d)

# Now let us look at some of the diagnostics.
r1 <- residuals(lm1)
r1a <- residuals(lm1a)
r1b <- residuals(lm1b)
r1bb <- residuals(lm1bb)
r1c <- residuals(lm1c)
r1d <- residuals(lm1d)

# What are the residuals exactly?
fv1 <- fitted.values(lm1)
plot(y-fv1,r1)

# What do the residual plots look like for the six cases?
plot(x,r1)     # regular
plot(xa,r1a)   # residuals are OK, xa looks fishy
plot(xb,r1b)   # xb looks fishy, residuals not OK
plot(xbb,r1bb) # xbb looks very fishy, residuals seem OK
plot(xc,r1c)   # look at the point in the lower right corner
plot(xd,r1d)   # look at the point in the bottom middle

# Leverage values are the hatvalues here.
h1 <- hatvalues(lm1)
h1a <- hatvalues(lm1a)
h1b <- hatvalues(lm1b)
h1bb <- hatvalues(lm1bb)
h1c <- hatvalues(lm1c)
h1d <- hatvalues(lm1d)

# They average out to p/n.
mean(h1)
# hat values express outliers in the x-space
plot(x,h1)  # Is the nice shape expected or does it point to a worry?
plot(x,h1a)
plot(x,h1b)
plot(x,h1bb)
plot(x,h1c)
plot(x,h1d)

# Plotting residuals versus leverage
plot(h1,r1)
plot(h1a,r1a)
plot(h1b,r1b)
plot(h1bb,r1bb)
plot(h1c,r1c)
plot(h1d,r1d)

# All this is nice except for the data1bb case.
# There the residual is very small, so looking only at residuals
# does not give a warning.
# Studentized residuals have variance 1 under the model.
# Equivalently, they are standardized by a standard deviation estimate
# obtained by leaving out this case.
rs1 <- rstudent(lm1)
rs1a <- rstudent(lm1a)
rs1b <- rstudent(lm1b)
rs1bb <- rstudent(lm1bb)
rs1c <- rstudent(lm1c)
rs1d <- rstudent(lm1d)

# Now compare the following plots.
plot(xbb,r1bb)
plot(xbb,rs1bb)
# Also (but less extreme)
plot(xb,r1b)
plot(xb,rs1b)
# The studentized residual identifies the ill fitting point.

# But the default lm plots also give this kind of information, and more:
plot(lm1a)
plot(lm1bb)

# What if we have two mutually consistent outliers.
xe <- xbb
xe[99] <- 50
ye <- ybb
ye[99] <-  5 + 0.0*xe[99] + rnorm(1,0,1)
plot(xe,ye)
data1e <- data.frame(x=xe,y=ye)
(lm1e <- lm(y ~ x, data1e))
summary(lm1e)
r1e <- residuals(lm1e)
h1e <- hatvalues(lm1e)
rs1e <- rstudent(lm1e)
plot(xe,r1e)
plot(h1e,r1e)
plot(xe,rs1e)
# Alas, the two outliers mask each other with respect to
# even the studentized residuals.
# Of course the leverage of these two values still is extreme.
# Fortunately, on the series of plots
plot(lm1e)
# there is enough to arouse suspicion - 
# but not the sizes of the residuals.

# The following plots are obtained for a totally regular data set.
# They might still show some special features;
# it can be a good illustration to construct several independent
# data sets and calculate lm1 and associated diagnostics,
# to get a feeling for what kind of patterns you may see 
# even for regular data. 
cd1 <- cooks.distance(lm1)
plot(x,cd1)
plot(h1,cd1)
plot(cd1,rs1)

# Now for the irregular data sets.
cd1a <- cooks.distance(lm1a)
plot(xa,cd1a)
plot(h1a,cd1a)
plot(cd1a,rs1a)

cd1b <- cooks.distance(lm1b)
plot(xb,cd1b)
plot(h1b,cd1b)
plot(cd1b,rs1b)
# Here Cook's distance clearly betrays the special point.

cd1bb <- cooks.distance(lm1bb)
plot(xbb,cd1bb)
plot(h1bb,cd1bb)
plot(cd1bb,r1bb)

cd1c <- cooks.distance(lm1c)
plot(xc,cd1c)
plot(h1c,cd1c)
plot(cd1c,r1c)

cd1d <- cooks.distance(lm1d)
plot(xd,cd1d)
plot(h1d,cd1d)
plot(cd1d,r1d)

cd1e <- cooks.distance(lm1e)
plot(xe,cd1e)
plot(h1e,cd1e)
plot(cd1e,r1e)
# Cook's distance can be useful also for the case with
# two mutually consistent outliers - but it can be fooled.

dff1 <- dffits(lm1)
plot(dff1)
# In this case the index is meaningful; but it is not always!

dff1b <- dffits(lm1b)
plot(dff1b,h1b)
dff1bb <- dffits(lm1bb)
plot(dff1bb,h1bb)
dff1e <- dffits(lm1e)
plot(dff1e,h1e)

# To demonstrate partial residual plots, we use the car library,
# connected to the book by Fox (and Fox & Weisberg).
library(car)
# The data frame used is:
?Prestige
detach(Prestige)
hist(Prestige$prestige)

# An initial linear model:
prestige.lm1 <- lm(prestige ~ income + education + women, 
                   data = Prestige)
summary(prestige.lm1)
plot(prestige.lm1)
# The residuals are more nearly normal than the dependent variable.
# The point with the highest Cook's distance is
Prestige[2,]

# Bivariate relations:
scatterplot(prestige,income,lwd=2)
scatterplot(prestige,education,lwd=2)
# These do not say so much, because they do not control
# for the other variables.
# To get an insight into non-linearity, we display added-variable plots.
# In car, these are called component + residual plots.
?crPlots
crPlots(prestige.lm1)

# Especially the dependence on income seems nonlinear:
# try a logarithm.
logi <- log(Prestige$income)
prestige.lm2 <- lm(prestige ~ income + logi + education + women, 
                   data = Prestige)
summary(prestige.lm2)
plot(prestige.lm2)
crPlots(prestige.lm2)

# Now the dependence on proportion women seems still non-linear.
women2 <- Prestige$women*Prestige$women
prestige.lm3 <- lm(prestige ~ income + logi + education + women + women2,
                   data = Prestige)
summary(prestige.lm3)
crPlots(prestige.lm3)

# For parsimony, we drop income,
# but we retain proportion women to have a full quadratic dependence 
# on this variable
prestige.lm4 <- lm(prestige ~ logi + education + women + women2,
                   data = Prestige)
summary(prestige.lm4)
crPlots(prestige.lm4)
# Not perfect, but reasonable.
plot(prestige.lm4)
# Is a transformation of the dependent variable indicated?
boxcox(prestige.lm4)
# No, the value lambda = 1 is in the confidence interval.

# Which occupation has the highest influence?
prestige.dff4 <- dffits(prestige.lm4)
(sort(prestige.dff4))
# Ministers and babysitters.
# What is the standardized influence of the cases 
# on the individual parameter estimates?
(prestige.dfb4 <- dfbetas(prestige.lm4))


