# script for Statistical Data Mining Course. # Copyright (C) 1998 B. D. Ripley # # Partitioning cpus.tr <- tree(perf ~ syct+mmin+mmax+cach+chmin+chmax, cpus) summary(cpus.tr) print(cpus.tr) par(mfrow=c(1,2)) plot(cpus.tr, type="u"); text(cpus.tr, srt=90) cpus.ltr <- tree(log10(perf) ~ syct + mmin + mmax + cach + chmin + chmax, cpus) summary(cpus.ltr) plot(cpus.ltr, type="u"); text(cpus.ltr, srt=90) par(mfrow = c(1,1)) ir.species <- factor(c(rep("s",50), rep("c", 50), rep("v", 50))) ird <- data.frame(rbind(iris[,,1], iris[,,2], iris[,,3])) ir.tr <- tree(ir.species ~., ird) summary(ir.tr) ir.tr plot(ir.tr) text(ir.tr, all=T) ir.tr1 <- snip.tree(ir.tr, nodes = c(12, 7)) ir.tr1 summary(ir.tr1) par(pty="s") plot(ird[, 3],ird[, 4], type="n", xlab="petal length", ylab="petal width") text(ird[, 3], ird[, 4], as.character(ir.species)) par(cex=2) partition.tree(ir.tr1, add=T) par(cex=1) par(pty="m") fgl.tr <- tree(type ~ ., fgl) summary(fgl.tr) plot(fgl.tr); text(fgl.tr, all=T, cex=0.5) fgl.tr1 <- snip.tree(fgl.tr, node=c(11, 53, 105, 108, 31)) tree.screens() plot(fgl.tr1) tile.tree(fgl.tr1, fgl$type) close.screen(all = T) shuttle.tr <- tree(use ~ ., shuttle, subset=1:253, mindev=1e-6, minsize=2) shuttle.tr plot(shuttle.tr, type="u"); text(shuttle.tr, pretty=0) shuttle1 <- shuttle[254:256, ] # 3 missing cases predict(shuttle.tr, shuttle1) # Cutting trees down to size par(mfrow=c(1,2), pty="s") plot(prune.tree(cpus.ltr)) cpus.ltr1 <- prune.tree(cpus.ltr, best=8) plot(cpus.ltr1); text(cpus.ltr1) summary(prune.tree(fgl.tr, k=10)) set.seed(123) plot(cv.tree(cpus.ltr,, prune.tree)) set.seed(123) fgl.cv <- cv.tree(fgl.tr,, prune.tree) for(i in 2:5) fgl.cv$dev <- fgl.cv$dev + cv.tree(fgl.tr,, prune.tree)$dev fgl.cv$dev <- fgl.cv$dev/5 plot(fgl.cv) misclass.tree(fgl.tr) misclass.tree(prune.tree(fgl.tr, best=5)) set.seed(123) fgl.cv <- cv.tree(fgl.tr,, prune.misclass) for(i in 2:5) fgl.cv$dev <- fgl.cv$dev + cv.tree(fgl.tr,, prune.misclass)$dev fgl.cv$dev <- fgl.cv$dev/5 fgl.cv plot(fgl.cv) prune.misclass(fgl.tr) par(mfrow=c(1,1), pty="m") # Low birth weights # in case you did not do this in course A. if(!exists("bwt")) { attach(birthwt) race <- factor(race, labels=c("white", "black", "other")) ptd <- factor(ptl > 0) ftv <- factor(ftv); levels(ftv)[-(1:2)] <- "2+" bwt <- data.frame(low=factor(low), age, lwt, race, smoke=(smoke>0), ptd, ht=(ht>0), ui=(ui>0), ftv) detach(); rm(race, ptd, ftv) } bwt.tr <- tree(low ~ ., bwt) summary(bwt.tr) bwt.tr1 <- prune.tree(bwt.tr, k=2) summary(bwt.tr1) prune.misclass(bwt.tr1) bwt.tr3 <- prune.misclass(bwt.tr1, best=5) bwt.cv <- cv.tree(bwt.tr,, prune.misclass) for(i in 2:5) bwt.cv$dev <- bwt.cv$dev + cv.tree(bwt.tr,, prune.misclass)$dev bwt.cv$dev <- bwt.cv$dev/5 bwt.cv plot(bwt.tr3) text(bwt.tr3, label="1") # Library RPart library(rpart) set.seed(123) cpus.rp <- rpart(log10(perf) ~ ., cpus[ ,2:8], cp=1e-3) cpus.rp ird <- data.frame(rbind(iris[,,1], iris[,,2],iris[,,3]), Species=c(rep("s",50), rep("c",50), rep("v",50))) ir.rp <- rpart(Species ~ ., data=ird, method="class", cp=1e-3) ir.rp printcp(cpus.rp) plotcp(cpus.rp) print(cpus.rp, cp=0.006, digits=3) cpus.rp1 <- prune(cpus.rp, cp=0.006) plot(cpus.rp1, branch=0.4, uniform=T) text(cpus.rp1, digits=3) printcp(ir.rp) summary(ir.rp) ir.rp1 <- rpart(Species ~ ., ird, cp=0, minsplit=5, maxsurrogate=0) printcp(ir.rp1) print(ir.rp1, cp=0.015) set.seed(123) fgl.rp <- rpart(type ~ ., fgl, cp=0.001) plotcp(fgl.rp) printcp(fgl.rp) print(fgl.rp, cp=0.02) fgl.rp2 <- prune(fgl.rp, cp=0.02) plot(fgl.rp2); text(fgl.rp2) set.seed(123) bwt.rp <- rpart(low ~ ., bwt, cp=0.001, minsplit=10, minbucket=5) bwt.rp plotcp(bwt.rp) bwt.rp2 <- rpart(low ~ ., bwt, cp=0.0, xval=0, minsplit=10, minbucket=5, parms=list(split="information")) bwt.rp2