# SB1.1 Applied Statistics Non-Assessed Practical Normal Linear Models
######################################################################
#sample answer to Frogs data example question
#See http://www.statsci.org/data/general/frogs.html for original data
## If at any point you would like to go back to the start and remove all
## the variables from the workspace (a "clean slate") use the following
## command
rm(list=ls())
##load data
#dir() lists the files in the current working directory - useful if you
#have downloaded the data file locally instead of reading it from the URL
dir()
#header=TRUE: the first row of the file gives the column names.
#(Use TRUE rather than T - T is an ordinary variable and can be reassigned.)
a <- read.table("http://www.stats.ox.ac.uk/~nicholls/sb1a/data/frogs.txt", header=TRUE)
a
str(a)
################################################################
#Step 1 - Prepare data for analysis.
#The Species variable is categorical. We must tell R this - otherwise
#lm() will treat this variable as integer/dimensionful
# - which is nonsense
a$Species <- as.factor(a$Species) #important
str(a)
#Each frog is measured twice - they have a Consumption value at Rest and Exercise.
#We need to reshape the data so each row has one response column and all the
#other columns give the corresponding covariate values for that response.
#Rebuild data into single observations - make sure you understand what each
#of these commands does.
#Use nrow(a) rather than hard-coding the number of frogs, so the code still
#works if the data set changes size.
n.frogs <- nrow(a)
Activity <- c(rep('Rest', n.frogs), rep('Exercise', n.frogs))
#Stack two copies of the data: the first copy pairs with the Rest responses,
#the second with the Exercise responses (matching the order of Activity above).
d <- rbind(a, a)
d$Activity <- as.factor(Activity)
d$Consumption <- c(a$Rest, a$Exercise)
#Drop the original wide-format response columns by name (safer than by
#position, which silently breaks if the column order ever changes).
d <- d[, !(names(d) %in% c("Rest", "Exercise"))]
d
#We might also remove outliers in the data at this point. See next lecture.
#We are going to fit a NLM in which we assume each response is independent.
#We note that each frog has been measured twice. The responses from different frogs
#are probably independent (given covariates) - at least that's a fair starting point.
#We may be concerned that the pair of responses from a single frog are correlated.
#This would be a model violation in our NLM. We will assume the responses are
#conditionally independent _given_covariates_ - it is enough that the errors are
#iid normal given the covariates - the conditioning helps and makes it at least
#possible that our assumption is reasonable.
###########################################################
#Step 2 - Exploratory data analysis
##EDA this one plot says everything for me - there is an offset in consumption due to
##Activity="Rest"/"Exercise" and also Temperature="High"/"Low". Possibility that
##the species represented by the symbol "+" is a bit different.
#One plot encoding four variables: x=Subject, y=Consumption, colour=Activity,
#plotting symbol=Species, symbol size=Temperature (factors are coerced to
#their integer level codes for col/pch/cex).
plot(d$Subject,d$Consumption,col=d$Activity,pch=as.numeric(d$Species),cex=as.numeric(d$Temperature),
main='Symbol=Species, Black/Red=Active/Rest, small/big=High/Low temp')
#showing the orthogonal design (balanced)
#Every Species x Temperature (x Activity) cell has the same count - a balanced
#(orthogonal) design, which simplifies the ANOVA interpretation below.
table(d$Species,d$Temperature)
table(d$Species,d$Temperature,d$Activity)
#########################################################################
#Step 3 - Model Selection
#The near-saturated model with 2 observations per parameter might be
#a starting point for backwards elimination search via ANOVA.
#This leads to the simple additive model we anticipated from our EDA
#Full three-way interaction model: all main effects, two-way and
#three-way interactions between Temperature, Species and Activity.
fg1.lm <- lm(Consumption~Temperature*Species*Activity,data=d)
summary(fg1.lm)
anova(fg1.lm)
#no strong evidence for different species to have a different response due to temp
#ie not temp*spec term - this is borderline - maybe further data needed
#with larger "n" value
#Above ANOVA table leads to simple additive model
fg0.lm <- lm(Consumption~Temperature+Species+Activity,data=d)
summary(fg0.lm)
#Direct F-test for null model 0 against alternative model 1
#(tests all the dropped interaction terms jointly)
anova(fg0.lm,fg1.lm)
#notice again the orthogonality - what are the consequences for ANOVA?
#vcov() gives the estimated covariance matrix of the coefficient estimates.
vcov(fg0.lm)
#We can pull out confidence intervals for parameters
confint(fg0.lm,level=0.95)
#########################################################################
#Step 4 - Goodness of fit analysis - if the model doesn't have
#normal iid errors then we can't trust the F-test.
#Residuals-vs-fitted: look for non-constant spread or curvature.
plot(fitted(fg0.lm), rstudent(fg0.lm))
#Normal QQ-plot of studentised residuals: points near the line support
#the normal-errors assumption.
qqnorm(rstudent(fg0.lm)); qqline(rstudent(fg0.lm))
#all good - no sign of problems with the model or data
#rstudent() computes "studentised" residuals. We will define these next lecture.
#For "ordinary" residuals e=y-yhat replace "rstudent(fg.lm)" with "residuals(fg.lm)"
#END
########################################################################
########################################################################
##Notes
#One or two commands for methods we haven't seen in lectures.
#(1) We will shortly look at the Akaike Information Criterion (AIC)
#as a basis for model selection. "stepAIC" takes the full model and
#looks at the support for deleting terms one at a time.
#This is a similar approach to our use of an ANOVA table,
#but is based on a different (not F-test) criterion for deciding
#whether or not to include terms. Actually here it decides all
#terms are important. This function is in the "MASS" library,
#which we must load.
library(MASS)
stepAIC(fg1.lm)
#(2) On Thursday we will look at some more advanced methods for
#checking goodness of fit, and identifying outlier data points.
#The "Cook's Distance" is particularly natural, using a cross-validation
#idea which essentially asks how well each data point is predicted
#by all the other data points. The following is a healthy plot.
#(Use <- for assignment, the conventional R assignment operator.)
p <- length(fg0.lm$coefficients)   #number of fitted parameters
n <- length(fg0.lm$fitted.values)  #number of observations
cdt <- 8/(n - 2*p)                 #rule-of-thumb threshold 8/(n-2p) for flagging points
plot(cooks.distance(fg0.lm), ylim=c(0, cdt*1.1)); abline(h=cdt)
#Model with temp*spec interaction is suggested by visual inspection of EDA plot
#Overfits a bit - the p-value for an interaction is borderline and we are doing
#too many tests (and data-dredging) to be able to trust all except extreme p-values
fg.lm <- lm(Consumption~Temperature*Species+Activity,data=d)
summary(fg.lm)
anova(fg.lm)
#Repeat the Cook's distance diagnostic for the interaction model.
#(Use <- for assignment, the conventional R assignment operator.)
p <- length(fg.lm$coefficients)   #number of fitted parameters
n <- length(fg.lm$fitted.values)  #number of observations
cdt <- 8/(n - 2*p)                #rule-of-thumb threshold 8/(n-2p)
plot(cooks.distance(fg.lm), ylim=c(0, cdt*1.1)); abline(h=cdt)
#Residuals-vs-fitted and QQ-plot diagnostics, as for the additive model.
plot(fitted(fg.lm),rstudent(fg.lm))
qqnorm(rstudent(fg.lm)); qqline(rstudent(fg.lm))
#the sample quantiles are slightly underdispersed
#hinting at overfitting