# Load the data: one row per subject, one resistance column per electrode
# type (plus a subject ID in column 1).
r.aw <- read.table("http://www.stats.ox.ac.uk/~laws/LMs/data/resist.txt", header = TRUE)
# notice the balanced design
subjects <- 16
electrodes <- 5
# sophisticated
# r<-data.frame(Resistance=unlist(r.aw[,-1],use.names=F),
# Subject=gl(subjects,1,subjects)[row(r.aw[,-1])],
# Electrode=gl(electrodes,1,electrodes)[col(r.aw[,-1])])
# more obvious - but not generic.
# unlist() stacks the resistance columns electrode by electrode, so the
# subject index varies fastest and the electrode index changes once every
# `subjects` rows; the rep() calls below must match that ordering.
# Use FALSE (not the reassignable alias F).
r <- data.frame(Resistance = unlist(r.aw[, -1], use.names = FALSE),
                Subject = factor(rep(1:subjects, times = electrodes)),
                Electrode = factor(rep(1:electrodes, each = subjects)))
str(r)
n <- nrow(r)
plot(r) # not a very helpful display here with the categorical variables
# Resistance by subject.  Notice the resistances are a bit positively
# skewed; this will be relevant later on when we transform the response
# Y = Resistance to make it 'more normal'.  We note also that subject 15
# is a bit odd, with very variable readings.
boxplot(Resistance ~ Subject, xlab = "Subject ID", ylab = "Resistance", data = r)
# Resistance by electrode.  Nothing much here, maybe a bit of skew - but
# we can't normally trust that, as it may be an artifact of the other
# variable.
boxplot(Resistance ~ Electrode, xlab = "Electrode", ylab = "Resistance", data = r)
# Fit the additive two-way model.  Interpreting the summary is premature:
# we still have to do an outlier analysis.
r.lm <- lm(Resistance ~ Electrode + Subject, data = r)
summary(r.lm)
# misfit: studentised residuals against fitted values, coloured by electrode
fit.r <- fitted(r.lm)
stud.r <- rstudent(r.lm)
plot(fit.r, stud.r, xlab = "Fitted values", ylab = "Studentised residuals",
     pch = 16, col = as.numeric(r$Electrode))
# the following lets you work out which point is which:
# after running the identify(...) command below,
# left mouse click on a point, or on a few points, in the plot and then
# in RStudio: click finish at top right of plot window
# (or if you are in R: mouse right click to exit)
# and you get the labels of the points you clicked on
identify(fit.r, stud.r)
# or plot the labels onto the figure
text(fit.r, stud.r, r$Subject, adj = 1.5)
r[order(r$Subject), ]
# the outlier data are subject 15. See the remarks in
# http://www.statsci.org/data/general/resist.html
# leverages are all equal... why?
hatvalues(r.lm)
# influence - again we see that the subject 15 data have high influence
p <- r.lm$rank
cd <- cooks.distance(r.lm)
plot(cd)
# conventional threshold 8 / (n - 2p) for a worryingly large Cook's distance
abline(h = 8/(n - 2 * p))
text(seq_len(n), cd, r$Subject)
# Remove the data for subject 15.
# Use a logical subset rather than -which(): if nothing matched,
# -which() would give integer(0) and r[integer(0), ] would silently
# drop EVERY row.  (Comparing a factor against 15 compares against the
# level label "15", which is what we want here.)
rd <- r[r$Subject != 15, ]
# '15' is still a (now empty) level of the Subject factor; drop it so
# that lm() does not produce an NA (aliased) coefficient for it.
rd <- droplevels(rd)
# check this does what we hoped
rd
n <- nrow(rd)
rd.lm <- lm(Resistance ~ Electrode + Subject, data = rd)
summary(rd.lm)
# misfit: studentised residuals against fitted values, coloured by electrode
fit.rd <- fitted(rd.lm)
stud.rd <- rstudent(rd.lm)
plot(fit.rd, stud.rd, xlab = "Fitted values",
     ylab = "Studentised residuals", pch = 16, col = as.numeric(rd$Electrode))
# plot the labels onto the figure
text(fit.rd, stud.rd, rd$Subject, adj = 1.5)
# subject 2 now has a misfitting data point, and there are clear signs
# of heteroscedasticity
# influence - is this point influential?
p <- rd.lm$rank
cd.rd <- cooks.distance(rd.lm)
plot(cd.rd)
abline(h = 8/(n - 2 * p))
text(seq_len(n), cd.rd, rd$Subject)
# Yes. Let us leave it in for now - we could consider deleting
# subject 2, or just this one point ... why might we delete all the
# data for subject 2 rather than just this single point?
# Hint: think design.
# To deal with the varying variance, try transforming the response:
# look for a suitable power using the Box-Cox profile likelihood.
library(MASS) # for boxcox
par(mfrow = c(1, 1))
boxcox(Resistance ~ Electrode + Subject, data = rd)
# the MLE for lambda is roughly 0.25, so take the fourth root as the
# transformed response
rd$Y <- rd$Resistance^0.25
# refit the additive model on the transformed scale
rdt.lm <- lm(Y ~ Electrode + Subject, data = rd)
# goodness of fit - misfit
fit.t <- fitted(rdt.lm)
stud.t <- rstudent(rdt.lm)
plot(fit.t, stud.t, xlab = "Fitted values",
     ylab = "Studentised residuals", pch = 16, col = as.numeric(rd$Electrode))
# influence
p <- rdt.lm$rank
cd.t <- cooks.distance(rdt.lm)
plot(cd.t)
abline(h = 8/(n - 2 * p))
# qqplot of the studentised residuals
res <- rstudent(rdt.lm)
qqnorm(res)
qqline(res)
# all fine. In particular 8/(n - 2p) is much larger than the Cook's
# distance of any point
# model selection: sequential ANOVA table
anova(rdt.lm)
# no effect due to electrode at 5%.
# Notice it is an orthogonal design, so it doesn't matter that
# Electrode is at the top of the ANOVA table - check it doesn't make a
# difference by refitting with the covariates in the opposite order
rdtswap.lm <- update(rdt.lm, . ~ Subject + Electrode)
anova(rdtswap.lm)
# or look at the (unscaled) covariance matrix of the estimates
round(summary(rdt.lm)$cov.unscaled, 2)
# AIC keeps both covariates
step(rdt.lm)