# # Modern Data Analysis with S-PLUS. Brian Ripley 23 October 1999 New Orleans # #============================================================================ library(MASS) # source of many of the datasets #============= Looking at Data ============== # Density estimation library(logsplin) # on Windows x <- seq(8, 35, 0.2) gal <- galaxies/1000 plot(range(x), c(0, 0.35), type="n", xlab="velocity of galaxy", ylab="density") rug(gal) lines(x, dlogspline(x, logspline.fit(gal)),col=8) lines(density(gal, n=200, window="gaussian", width=width.SJ(gal)), col=3) abline(h=0, col=2) library(ksmooth) # on Windows plot(range(x), c(0, 0.35), type="n", xlab="velocity of galaxy", ylab="density") lines(locpoly(gal, degree=1, bandwidth=0.625), col=8) lines(locpoly(gal, degree=2, bandwidth=0.625), col=3) rug(gal) abline(h=0, col=2) library(locfit, first=T) plot(range(x), c(0, 0.35), type="n", xlab="velocity of galaxy", ylab="density") rug(gal) abline(h=0, col=2) lines(locfit(~ gal, flim=c(8, 35)), m=500, col=8) fit <- locfit(~ gal, alpha=0.45, flim=c(8, 35), kern="gauss", ev="grid", mg=200) lines(fit, m=500, col=3) galaxies.bin <- data.frame(velocity=seq(8, 35, 0.5), count=hist(gal, breaks=seq(7750, 35250, 500)/1000, plot=F)$counts) fit2 <- locfit(count ~ velocity, data=galaxies.bin, weights=rep(82*0.5, nrow(galaxies.bin)), alpha=c(0, 0, 2), family="poisson") lines(fit2, m=500, col=4) #======= Visualization # crabs crabs.grp <- c("B", "b", "O", "o")[rep(1:4, rep(50,4))] trellis.device() def <- trellis.par.get("superpose.symbol") def$pch <- rep(16, 7) def$cex <- rep(0.5, 7) trellis.par.set("superpose.symbol", def) splom(~ crabs[ ,4:8], groups=crabs.grp, panel=panel.superpose, key = list(text = list(c("Blue male", "Blue female", "Orange Male", "Orange female")), points = Rows(trellis.par.get("superpose.symbol"), 1:4), columns = 4) ) lcrabs <- log(crabs[,4:8]) lcrabs.pc <- predict(princomp(lcrabs)) trellis.par.set("superpose.symbol", def) splom(~ lcrabs.pc[, 1:3], groups = crabs.grp, panel = panel.superpose, key = list(text = list(c("Blue male", "Blue female", "Orange Male", "Orange female")), points = Rows(trellis.par.get("superpose.symbol"), 1:4), columns = 4) ) xgobi(lcrabs, colors=c("SkyBlue", "SlateBlue", "Orange", "Red")[rep(1:4, rep(50,4))]) cr1.iso <- isoMDS(dist(as.matrix(lcrabs))) cr.scale <- 0.5 * log(crabs$CL * crabs$CW) slcrabs <- lcrabs - cr.scale cr2.iso <- isoMDS(dist(as.matrix(slcrabs))) tmp <- apply(cr1.iso$points, 2, function(x)diff(range(x))) xyplot(cr1.iso$points[,2] ~ cr1.iso$points[,1], groups = crabs.grp, panel = panel.superpose, aspect=tmp[2]/tmp[1], xlab="", ylab="", key = list(text = list(c("Blue male", "Blue female", "Orange Male", "Orange female")), points = Rows(trellis.par.get("superpose.symbol"), 1:4), columns = 4)) tmp <- apply(cr2.iso$points, 2, function(x)diff(range(x))) xyplot(cr2.iso$points[,2] ~ cr2.iso$points[,1], groups = crabs.grp, panel = panel.superpose, aspect=tmp[2]/tmp[1], xlab="", ylab="", key = list(text = list(c("Blue male", "Blue female", "Orange Male", "Orange female")), points = Rows(trellis.par.get("superpose.symbol"), 1:4), columns = 2)) dev.off() # Forensic Glass fgl0 <- fgl[ ,-10] # omit type. fgl.df <- data.frame(type = rep(fgl$type, 9), y = as.vector(as.matrix(fgl0)), meas = factor(rep(1:9, rep(214,9)), labels=names(fgl0))) trellis.device() stripplot(type ~ y | meas, data=fgl.df, scales=list(x="free"), strip=function(...) strip.default(style=1, ...), xlab="", cex=0.5, pch=18, col=6) dev.off() fgl.col <- c("SkyBlue", "SlateBlue", "Orange", "Orchid", "Green", "HotPink")[fgl$type] fgl1 <- fgl0[1:185,] xgobi(fgl0, colors=fgl.col) xgobi(fgl1, colors=fgl.col[1:185]) fgl.sam <- sammon(dist(as.matrix(fgl0[-40, ]))) fgl.iso <- isoMDS(dist(as.matrix(fgl0[-40, ]))) eqscplot(fgl.sam$points, type="n", xlab="", ylab="") for(i in seq(along=levels(fgl$type))) { set <- fgl$type[-40] == levels(fgl$type)[i] points(fgl.sam$points[set,], pch=18, cex=0.6, col=2+i) } key(text=list(levels(fgl$type), col=3:8)) eqscplot(fgl.iso$points, type="n", xlab="", ylab="") for(i in seq(along=levels(fgl$type))) { set <- fgl$type[-40] == levels(fgl$type)[i] points(fgl.iso$points[set,], pch=18, cex=0.6, col=2+i) } key(text=list(levels(fgl$type), col=3:8)) #===================== Regression and Beyond =================== # Scottish Hill Races hills.lm <- lm(time ~ dist + climb, hills) plot(fitted(hills.lm), studres(hills.lm)) identify(fitted(hills.lm), studres(hills.lm)) qqnorm(studres(hills.lm)); qqline(studres(hills.lm)) hills.hat <- lm.influence(hills.lm)$hat cbind(hills, lev=hills.hat)[hills.hat > 3/35, ] cbind(hills, pred=predict(hills.lm))["Knock Hill", ] hills1.lm <- lm(time ~ dist + climb, hills[-18, ]) hills1.lm lm(time ~ dist + climb, hills[-c(7,18), ]) plot(hills1.lm) # Cook-Weisberg test rr <- residuals(hills.lm)^2 rr <- rr/mean(rr) hills.lm1 <- lm(rr ~ fitted(hills.lm)) 0.5*anova(hills.lm1)$"Sum of Sq"[1] # value plot(fitted(hills.lm), rr) # added variable plot identify(fitted(hills.lm), rr, row.names(hills[-18,])) boxcox(hills1.lm) summary(hills1.lm, cor=F) summary(lm(time ~ dist + climb, hills[-18, ], weight=1/dist^2)) lm(time ~ -1 + dist + climb, hills[-18, ], weight=1/dist^2) hills <- hills # S-PLUS 5.x needs this hills$ispeed <- hills$time/hills$dist hills$grad <- hills$climb/hills$dist hills2.lm <- lm(ispeed ~ grad, hills[-18, ]) hills2.lm hills2.hat <- lm.influence(hills2.lm)$hat cbind(hills[-18,], lev=hills2.hat) hills.lm hills1.lm # omitting Knock Hill rlm(time ~ dist + climb, hills) summary(rlm(time ~ dist + climb, hills, weights=1/dist^2, method="MM"), cor=F) lqs(time ~ dist + climb, data=hills, nsamp="exact") summary(hills2.lm) # omitting Knock Hill summary(rlm(ispeed ~ grad, hills), cor=F) summary(rlm(ispeed ~ grad, hills, method="MM"), cor=F) lqs(ispeed ~ grad, data=hills) hills.lts <- lqs(ispeed ~ grad, data=hills, nsamp="exact") eqscplot(hills.lts$fitted, hills$ispeed) abline(0, 1, lty=2) identify(hills.lts$fitted, hills$ispeed, row.names(hills)) hills1 <- hills[-c(7,18),] hills1.lts <- lqs(ispeed ~ grad, data=hills1, nsamp="exact") eqscplot(hills1.lts$fitted, hills1$ispeed) abline(0, 1, lty=2) identify(hills1.lts$fitted, hills1$ispeed, row.names(hills1)) # Bootstrapping regressions library(boot) fit <- lm(time ~ dist + climb, hills, weights=1/dist^2) tmp <- data.frame(hills, res=resid(fit), fitted=fitted(fit)) tmp.fun <- function(data, i) { d <- data d$time <- d$fitted + d$res[i] coef(update(fit, data=d)) } lm.boot <- boot(tmp, tmp.fun, R=499) lm.boot boot.ci(lm.boot, index=2, type="norm") # index = 2 means 2nd coef boot.ci(lm.boot, index=2, type=c("perc", "bca")) fit <- rlm(time ~ dist + climb, hills, weights=1/dist^2, method="MM") tmp <- data.frame(hills, res=resid(fit), fitted=fitted(fit)) rlm.boot <- boot(tmp, tmp.fun, R=499) rlm.boot boot.ci(rlm.boot, index=2, type="norm") boot.ci(rlm.boot, index=2, type=c("perc", "bca")) # CPUs performance data boxcox(perf ~ syct+mmin+mmax+cach+chmin+chmax, data=cpus, lambda=seq(0, 1, 0.1)) cpus1 <- cpus attach(cpus) for(v in names(cpus)[2:6]) cpus1[[v]] <- cut(cpus[[v]], unique(quantile(cpus[[v]])), include.lowest = T) detach() boxcox(perf ~ syct+mmin+mmax+cach+chmin+chmax, data = cpus1, lambda = seq(-0.25, 1, 0.1)) set.seed(123) cpus2 <- cpus1[, 2:8] # excludes names, authors' predictions cpus.samp <- sample(1:209, 100) cpus.lm <- lm(log10(perf) ~ ., data=cpus2[cpus.samp,2:8]) test.cpus <- function(fit) sqrt(sum((log10(cpus2[-cpus.samp, "perf"]) - predict(fit, cpus2[-cpus.samp,]))^2)/109) test.cpus(cpus.lm) cpus.lm2 <- stepAIC(cpus.lm, trace=F) cpus.lm2$anova test.cpus(cpus.lm2) res1 <- log10(cpus1[-cpus.samp, "perf"]) - predict(cpus.lm, cpus0[-cpus.samp,]) res2 <- log10(cpus1[-cpus.samp, "perf"]) - predict(cpus.lm2, cpus2[-cpus.samp,]) wilcox.test(res1^2, res2^2, paired=T, alternative="greater") library(mda) test2 <- function(fit) { Xp <- as.matrix(cpus0[-cpus.samp,1:6]) sqrt(sum((log10(cpus0[-cpus.samp, "perf"]) - predict(fit, Xp))^2)/109) } cpus0 <- cpus[, 2:8] for(i in 1:3) cpus0[,i] <- log10(cpus0[,i]) cpus.bruto <- bruto(Xin, log10(cpus[cpus.samp,8])) test2(cpus.bruto) cpus.bruto$type cpus.bruto$df # examine the fitted functions par(mfrow=c(3,2)) Xp <- matrix(sapply(cpus0[cpus.samp, 1:6], mean), 100, 6, byrow=T) for(i in 1:6) { xr <- sapply(cpus0, range) Xp1 <- Xp; Xp1[,i] <- seq(xr[1,i], xr[2,i], len=100) Xf <- predict(cpus.bruto, Xp1) plot(Xp1[ ,i], Xf, xlab=names(cpus0)[i], ylab="", type="l") } cpus.mars <- mars(Xin, log10(cpus[cpus.samp,8])) showcuts <- function(obj) { tmp <- obj$cuts[obj$sel, ] dimnames(tmp) <- list(NULL, dimnames(Xin)[[2]]) tmp } showcuts(cpus.mars) test2(cpus.mars) # examine the fitted functions Xp <- matrix(sapply(cpus0[samp, 1:6], mean), 100, 6, byrow=T) for(i in 1:6) { xr <- sapply(cpus0, range) Xp1 <- Xp; Xp1[,i] <- seq(xr[1,i], xr[2,i], len=100) Xf <- predict(cpus.mars, Xp1) plot(Xp1[ ,i], Xf, xlab=names(cpus0)[i], ylab="", type="l") } cpus.mars2 <- mars(Xin, log10(cpus[cpus.samp,8]), degree=2) showcuts(cpus.mars6) test2(cpus.mars2) cpus.mars6 <- mars(Xin, log10(cpus[cpus.samp,8]), degree=6) showcuts(cpus.mars2) test2(cpus.mars6) # AVAS attach(cpus0) cpus.avas <- avas(cpus0[, 1:6], perf) plot(log10(perf), cpus.avas$ty) par(mfrow=c(2,3)) for(i in 1:6) { o <- order(cpus0[, i]) plot(cpus0[o, i], cpus.avas$tx[o, i], type="l", xlab=names(cpus0[i]), ylab="") } detach() #===================== Random Effects and Mixed Models =================== # POMS data is not freely available xyplot(Y ~ EP | No, data = petrol, xlab = "ASTM end point (deg. F)", ylab = "Yield as a percent of crude", panel = function(x, y) { m <- sort.list(x) panel.grid() panel.xyplot(x[m], y[m], type = "b", cex = 0.5) }) Petrol <- petrol names(Petrol) Petrol[, 2:5] <- scale(as.matrix(Petrol[, 2:5]), scale = F) pet1.lm <- lm(Y ~ No/EP - 1, Petrol) pet2.lm <- lm(Y ~ No - 1 + EP, Petrol) anova(pet2.lm, pet1.lm) pet3.lm <- lm(Y ~ SG + VP + V10 + EP, Petrol) anova(pet3.lm, pet2.lm) library(nlme3, first=T) # must have first=T: not needed on 2000 pet3.lme <- lme(Y ~ SG + VP + V10 + EP, random = ~ 1 | No, data = Petrol) summary(pet3.lme) pet3.lme <- update(pet3.lme, method="ML") summary(pet3.lme) pet4.lme <- update(pet3.lme, fixed = Y ~ V10 + EP) anova(pet4.lme, pet3.lme) pet5.lme <- update(pet4.lme, random = ~ 1 + EP | No) anova(pet4.lme, pet5.lme) ## Rabbits # Find starting values Fpl <- deriv(~ A + (B-A)/(1 + exp((log(d) - ld50)/th)), c("A","B","ld50","th"), function(d, A, B, ld50, th) {}) st <- nls(BPchange ~ Fpl(Dose, A, B, ld50, th), start = c(A=25, B=0, ld50=4, th=0.25), data = Rabbit)$par # Separate models for each treatment Rc.nlme <- nlme(BPchange ~ Fpl(Dose, A, B, ld50, th), fixed = list(A ~ 1, B ~ 1, ld50 ~ 1, th ~ 1), random = A + ld50 ~ 1 | Animal, data = Rabbit, subset = Treatment=="Control", start = list(fixed=st)) Rm.nlme <- update(Rc.nlme, subset = Treatment=="MDL") Rc.nlme Rm.nlme # combined model options(contrasts=c("contr.treatment", "contr.poly")) c1 <- c(28, 1.6, 4.1, 0.27, 0) R.nlme1 <- nlme(BPchange ~ Fpl(Dose, A, B, ld50, th), fixed = list(A ~ Treatment, B ~ Treatment, ld50 ~ Treatment, th ~ Treatment), random = A + ld50 ~ 1 | Animal/Run, data = Rabbit, start = list(fixed=c1[c(1,5,2,5,3,5,4,5)])) summary(R.nlme1) R.nlme2 <- update(R.nlme1, fixed = list(A ~ 1, B ~ 1, ld50 ~ Treatment, th ~ 1), start = list(fixed=c1[c(1:3,5,4)])) anova(R.nlme2, R.nlme1) summary(R.nlme2) xyplot(BPchange ~ log(Dose) | Animal * Treatment, Rabbit, xlab = "log(Dose) of Phenylbiguanide", ylab = "Change in blood pressure (mm Hg)", subscripts = T, aspect = "xy", panel = function(x, y, subscripts) { panel.grid() panel.xyplot(x, y) sp <- spline(x, fitted(R.nlme2)[subscripts]) panel.xyplot(sp$x, sp$y, type="l") }) #===================== Survival Analysis =================== # Kernel approaches attach(Aids2) plot(muhaz(death-diag+0.9, status=="D"), n.est.grid=250) library(heft) aids.heft <- heft.fit(death-diag+0.9, status=="D") heft.summary(aids.heft) par(mfrow=c(2,2)) heft.plot(aids.heft, what="s", ylim=c(0,1)); heft.plot(aids.heft) library(sm); attach(heart[heart$transplant==1,]) sm.survival(age+48, log10(stop - start), event, h=5, p=0.50) detach() library(locfit) attach(heart[heart$transplant==1,]) td <- stop - start; Age <- age+48 plot(locfit(~ td + Age, cens=1-event, scale=0, alpha=0.5, xlim=list(td=c(0,1e10)), flim=list(td=c(0,365))), type="persp") library(hazcov) heart.hc <- hazcov(Surv(td, event) ~ Age, span=0.5) plot(heart.hc) persp.hazcov(Hazard.Rate ~ Time*Age, heart.hc) detach() aids.wei <- survreg(Surv(survtime + 0.9, status) ~ state + T.categ + sex + age, data=Aidsp) summary(aids.wei, correlation=F) aids.ps <- survreg(Surv(survtime+0.9,status) ~ state + T.categ + pspline(age,df=6), data=Aidsp) aids.ps # Plot predictions of NSW hs patient against age. zz <- predict(aids.ps, data.frame( state=factor(rep("NSW",83), levels=levels(Aidsp$state)), T.categ=factor(rep("hs", 83), levels=levels(Aidsp$T.categ)), age=0:82), se=T, type="linear") plot(0:82, exp(zz$fit)/365.25, type="l", ylim=c(0,2), xlab="age", ylab = "expected lifetime (years)") lines(0:82, exp(zz$fit+1.96*zz$se.fit)/365.25, lty=3, col=2) lines(0:82, exp(zz$fit-1.96*zz$se.fit)/365.25, lty=3, col=2) rug(Aidsp$age+runif(length(Aidsp$age), -0.5, 0.5), 0.015) #===================== Multivariate Analysis =================== # Correspondence analysis corresp(caith) caith <- caith dimnames(caith)[[2]] <- c("F", "R", "M", "D", "B") par(mfcol=c(1,3)) plot(corresp(caith, nf=2)); title("symmetric") plot(corresp(caith, nf=2), type="rows"); title("rows") plot(corresp(caith, nf=2), type="col"); title("columns") farms.mca <- mca(farms, abbrev=T) # Use levels as names plot(farms.mca, cex=rep(0.7,2)) # Discriminant analysis ir <- rbind(iris[,,1], iris[,,2], iris[,,3]) ir.species <- factor(c(rep("s",50), rep("c",50), rep("v",50))) # 2 groups ir2 <- ir[-(51:100), ] ir2.species <- factor(c(rep("s",50), rep("v",50))) ir2.lda <- lda(ir2, ir2.species) plot(ir2.lda, type="density", width=1.5) # automatic choice fails cc <- predict(ir2.lda, ir[51:100, ])$x lines(density(cc, width=width.SJ(cc)), lty=3) lir <- log(ir) lir2 <- lir[-(51:100), ] lir2.lda <- lda(lir2, ir2.species) plot(ir2.lda, type="density", width=1.5) # automatic choice fails cc <- predict(lir2.lda, lir[51:100, ])$x lines(density(cc, width=1.5), lty=3) # 3 groups ir.lda <- lda(log(ir), ir.species) ir.lda ir.ld <- predict(ir.lda, dimen=2)$x eqscplot(ir.ld, type="n", xlab = "first linear discriminant", ylab = "second linear discriminant") text(ir.ld, labels = as.character(ir.species[-143]), col = 3+codes(ir.species), cex = 0.8) fgl.lda <- lda(type ~ ., fgl) fgl.ld <- predict(fgl.lda, dimen=2)$x eqscplot(fgl.ld, type="n", xlab="LD2", ylab="LD1") # either for(i in seq(along=levels(fgl$type))) { set <- fgl$type[-40] == levels(fgl$type)[i] points(fgl.ld[set,], pch=18, cex=0.6, col=2+i)} key(text=list(levels(fgl$type), col=3:8)) # or text(fgl.ld, labels = c("F", "N", "V", "C", "T", "H")[fgl$type[-40]], cex=0.6) fgl.rlda <- lda(type ~ ., fgl, method="t") fgl.rld <- predict(fgl.rlda, dimen=2)$x eqscplot(fgl.rld, type="n", xlab="LD2", ylab="LD1") # either for(i in seq(along=levels(fgl$type))) { set <- fgl$type[-40] == levels(fgl$type)[i] points(fgl.rld[set,], pch=18, cex=0.6, col=2+i)} key(text=list(levels(fgl$type), col=3:8)) # or text(fgl.rld, labels = c("F", "N", "V", "C", "T", "H")[fgl$type[-40]], cex=0.6) fgl.mlda <- lda(type ~ ., fgl, method="mve") fgl.mld <- predict(fgl.mlda, dimen=2)$x eqscplot(fgl.mld, type="n", xlab="LD2", ylab="LD1") # either for(i in seq(along=levels(fgl$type))) { set <- fgl$type[-40] == levels(fgl$type)[i] points(fgl.mld[set,], pch=18, cex=0.6, col=2+i)} key(text=list(levels(fgl$type), col=3:8)) # or text(fgl.mld, labels = c("F", "N", "V", "C", "T", "H")[fgl$type[-40]], cex=0.6) #increase len if you have enough memory. predplot <- function(object, main="", len=50, ...) { plot(Cushings[,1], Cushings[,2], log="xy", type="n", xlab="Tetrahydrocortisone", ylab = "Pregnanetriol", main=main) for(il in 1:4) { set <- Cushings$Type==levels(Cushings$Type)[il] text(Cushings[set, 1], Cushings[set, 2], labels=as.character(Cushings$Type[set]), col = 2 + il) } xp <- seq(0.6, 4.0, length=len) yp <- seq(-3.25, 2.45, length=len) cushT <- expand.grid(Tetrahydrocortisone=xp, Pregnanetriol=yp) Z <- predict(object, cushT, ...); zp <- as.numeric(Z$class) zp <- Z$post[,3] - pmax(Z$post[,2], Z$post[,1]) contour(xp/log(10), yp/log(10), matrix(zp, len), add=T, levels=0, labex=0) zp <- Z$post[,1] - pmax(Z$post[,2], Z$post[,3]) contour(xp/log(10), yp/log(10), matrix(zp, len), add=T, levels=0, labex=0) invisible() } cush <- log(as.matrix(Cushings[, -3])) tp <- factor(Cushings$Type[1:21]) cush.lda <- lda(cush[1:21,], tp); predplot(cush.lda, "LDA") cush.qda <- qda(cush[1:21,], tp); predplot(cush.qda, "QDA") predplot(cush.qda, "QDA (predictive)", method = "predictive") predplot(cush.qda, "QDA (debiased)", method = "debiased") # Logistic library(nnet) Cf <- data.frame(tp = tp, Tetrahydrocortisone = log(Cushings[1:21,1]), Pregnanetriol = log(Cushings[1:21,2]) ) cush.multinom <- multinom(tp ~ Tetrahydrocortisone + Pregnanetriol, Cf, maxit=250) xp <- seq(0.6, 4.0, length=100); np <- length(xp) yp <- seq(-3.25, 2.45, length=100) cushT <- expand.grid(Tetrahydrocortisone=xp, Pregnanetriol=yp) Z <- predict(cush.multinom, cushT, type="probs") plot(Cushings[,1], Cushings[,2], log="xy", type="n", xlab="Tetrahydrocortisone", ylab = "Pregnanetriol") for(il in 1:4) { set <- Cushings$Type==levels(Cushings$Type)[il] text(Cushings[set, 1], Cushings[set, 2], labels=as.character(Cushings$Type[set]), col = 2 + il) } zp <- Z[,3] - pmax(Z[,2], Z[,1]) contour(xp/log(10), yp/log(10), matrix(zp, np), add=T, levels=0, labex=0) zp <- Z[,1] - pmax(Z[,2], Z[,3]) contour(xp/log(10), yp/log(10), matrix(zp, np), add=T, levels=0, labex=0) # Classification tree cush.tr <- tree(tp ~ Tetrahydrocortisone + Pregnanetriol, Cf) plot(cush[,1], cush[,2], type="n", xlab="Tetrahydrocortisone", ylab = "Pregnanetriol") for(il in 1:4) { set <- Cushings$Type==levels(Cushings$Type)[il] text(cush[set, 1], cush[set, 2], labels= as.character(Cushings$Type[set]), col = 2 + il) } par(cex=1.5); partition.tree(cush.tr, add=T); par(cex=1)