library(nnet)
library(glmnet)

# Load the digit data and rescale pixel values to [0,1].
trainx <- read.table("trainx.txt")
trainy <- read.table("trainy.txt")
testx <- read.table("testx.txt")
testy <- read.table("testy.txt")
trainx <- as.matrix(trainx/256)
testx <- as.matrix(testx/256)
trainy <- as.factor(trainy[,1])
testy <- as.factor(testy[,1])
n <- 2000

### 0a
# Unregularized (s=0) logistic regression for digit 9 vs the rest.
k <- 9
y <- trainy==k
ty <- testy==k
m <- glmnet(trainx,y,family="binomial")
trainerr <- sum(predict(m,newx=trainx,type="class",s=0)!=y)
testerr <- sum(predict(m,newx=testx,type="class",s=0)!=ty)
print(trainerr)
print(testerr)
# Overfitting: training error is 0.

### 0b
# Cross-validate lambda and compare training, validation and test error
# along the regularization path.
m <- cv.glmnet(trainx,y,family="binomial",type.measure="class")
lambda <- m$lambda
validerr <- m$cvm
g <- m$glmnet.fit
trainp <- predict(g,newx=trainx,type="class")
testp <- predict(g,newx=testx,type="class")
trainerr <- numeric(length(lambda))
testerr <- numeric(length(lambda))
for (i in 1:length(lambda)) {
  trainerr[i] <- sum(trainp[,i]!=y)/n
  testerr[i] <- sum(testp[,i]!=ty)/n
}
plot(log(lambda),trainerr,type="l")
lines(log(lambda),validerr,col=2)
lines(log(lambda),testerr,col=3)

### 0c,d
# Train a 10-hidden-unit network in 20 chunks of 10 iterations each,
# warm-starting from the previous weights, and record the error after
# every chunk.
numiter <- 20
trainitererr <- numeric(numiter)
testitererr <- numeric(numiter)
net <- nnet(trainx,trainy==9,size=10,entropy=TRUE,
            MaxNWts=100000,maxit=0,decay=0)
for (i in 1:numiter) {
  net <- nnet(trainx,trainy==9,size=10,entropy=TRUE,
              MaxNWts=100000,maxit=10,decay=0,Wts=net$wts)
  trainp <- predict(net,trainx,type="raw")>.5
  testp <- predict(net,testx,type="raw")>.5
  trainitererr[i] <- sum((trainy==9)!=trainp)/n
  testitererr[i] <- sum((testy==9)!=testp)/n
}
plot(seq(10,200,by=10),trainitererr,type="l")
lines(seq(10,200,by=10),testitererr,col=3)

### 1a [2 marks]
# One unregularized logistic regression per digit (one-vs-rest).
err <- matrix(1,2,10)
m <- vector('list',10)
for (k in 0:9) {
  y <- trainy==k
  ty <- testy==k
  m[[k+1]] <- glmnet(trainx,y,family="binomial")
  err[1,k+1] <- sum(predict(m[[k+1]],newx=trainx,type="class",s=0)!=y)
  err[2,k+1] <- sum(predict(m[[k+1]],newx=testx,type="class",s=0)!=ty)
}
print(err)
#      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
# [1,]    0    0    0    0    0    0    0    0    0     0
# [2,]   28   65   66   71   27   63   26   29  177    77
# Overfitting: training error is 0 for every digit.

### 1b [2 marks]
lambda <- vector('list',10)
trainerr <- vector('list',10)
validerr <- vector('list',10)
testerr <- vector('list',10)
for (k in 0:9) {
  y <- trainy==k
  ty <- testy==k
  m[[k+1]] <- cv.glmnet(trainx,y,family="binomial",type.measure="class")
  lambda[[k+1]] <- m[[k+1]]$lambda
  validerr[[k+1]] <- m[[k+1]]$cvm
  g <- m[[k+1]]$glmnet.fit
  trainp <- predict(g,newx=trainx,type="class")
  testp <- predict(g,newx=testx,type="class")
  trainerr[[k+1]] <- numeric(length(lambda[[k+1]]))
  testerr[[k+1]] <- numeric(length(lambda[[k+1]]))
  for (i in 1:length(lambda[[k+1]])) {
    trainerr[[k+1]][i] <- sum(trainp[,i]!=y)/n
    testerr[[k+1]][i] <- sum(testp[,i]!=ty)/n
  }
}
for (k in 0:9) {
  plot(m[[k+1]],ylim=c(0,.1))
  # trainerr and testerr already hold error rates, so they are not divided
  # by n again here.
  lines(log(lambda[[k+1]]),trainerr[[k+1]],type="l")
  lines(log(lambda[[k+1]]),testerr[[k+1]],col=3)
  readline()
}
# Cross-validation works; the effect is particularly pronounced for digits
# 9 and 0. The curves generally look as expected, though there wasn't as
# much overfitting observed for digits 1-8.

### 1c [2 marks]
# Combine the ten one-vs-rest classifiers: predict the digit whose model
# assigns the highest probability.
testprob <- matrix(1,n,10)
testerr <- matrix(1,1,10)
for (k in 0:9) {
  testprob[,k+1] <- predict(m[[k+1]],newx=testx,type="response")
  testerr[k+1] <- sum((testprob[,k+1]>.5)!=(testy==k))/n
}
testp <- apply(testprob,1,which.max)-1
alltesterr <- sum(testp!=testy)/n
# Does a lot worse than the individual one-vs-rest error rates would
# suggest. But that is to be expected: the ten-class task is harder, and
# the combination is not optimized jointly.
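# To see where the combined one-vs-rest classifier goes wrong, a confusion
# matrix is a quick check (a sketch, not part of the original answer):
# rows are true digits, columns predictions, so off-diagonal cells show
# which digits get confused with each other.
print(table(true=testy,predicted=testp))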
### 1d [3 marks]
# A single multinomial (softmax) logistic regression over all ten classes.
m10 <- cv.glmnet(trainx,trainy,family="multinomial",type.measure="class")
plot(m10)
testp <- predict(m10,newx=testx,type="class")
sum(testp!=testy)/n
# Produces a slightly improved result.

### 2a,b [2 marks, 1 mark]
# Ten-class softmax network, trained in warm-started chunks as in 0c,d.
numiter <- 20
trainitererr <- numeric(numiter)
testitererr <- numeric(numiter)
net <- nnet(trainx,class.ind(trainy),size=10,softmax=TRUE,
            MaxNWts=100000,maxit=0,decay=0)
for (i in 1:numiter) {
  net <- nnet(trainx,class.ind(trainy),size=10,softmax=TRUE,
              MaxNWts=100000,maxit=10,decay=0,Wts=net$wts)
  trainp <- predict(net,trainx,type="class")
  testp <- predict(net,testx,type="class")
  trainitererr[i] <- sum(trainy!=trainp)/n
  testitererr[i] <- sum(testy!=testp)/n
}
plot(1:numiter,trainitererr)
plot(1:numiter,testitererr)
# Training and test error both decrease. Sometimes the test error increases
# again: a form of overfitting, as the weights grow larger the longer we
# train. Each repeat gives somewhat different results because of the random
# weight initialization.

### 2c [4 marks]
# Hold out 30% of the training set for validation and tune the weight decay.
ii <- sample(1:n)
traini <- ii[1:(n*.7)]
validi <- ii[(n*.7+1):n]
decay <- exp(seq(-5,3,by=1))
traindecayerr <- numeric(length(decay))
validdecayerr <- numeric(length(decay))
testdecayerr <- numeric(length(decay))
for (i in 1:length(decay)) {
  net <- nnet(trainx[traini,],class.ind(trainy[traini]),size=10,softmax=TRUE,
              MaxNWts=100000,maxit=200,decay=decay[i])
  trainp <- predict(net,trainx[traini,],type="class")
  validp <- predict(net,trainx[validi,],type="class")
  testp <- predict(net,testx,type="class")
  # mean() gives the error rate whatever the split sizes are.
  traindecayerr[i] <- mean(trainy[traini]!=trainp)
  validdecayerr[i] <- mean(trainy[validi]!=validp)
  testdecayerr[i] <- sum(testy!=testp)/n
}
plot(log(decay),traindecayerr,type="l")
lines(log(decay),validdecayerr,col=2)
lines(log(decay),testdecayerr,col=3)
bestdecay <- decay[which.min(validdecayerr)]

### 2d [2 marks]
# Retrain on the full training set with the selected decay.
net <- nnet(trainx,class.ind(trainy),size=10,softmax=TRUE,
            MaxNWts=100000,maxit=200,decay=bestdecay)
testp <- predict(net,testx,type="class")
besttestdecayerr <- sum(testy!=testp)/n
points(log(bestdecay),besttestdecayerr,col=3)
# Achieved 7% error on my run; better than the linear model.

### 2e [4 marks]
# Tune the number of hidden units with the decay fixed at bestdecay.
size <- c(10,20,30,50,75,100)
trainsizeerr <- numeric(length(size))
validsizeerr <- numeric(length(size))
testsizeerr <- numeric(length(size))
for (i in 1:length(size)) {
  net <- nnet(trainx[traini,],class.ind(trainy[traini]),size=size[i],softmax=TRUE,
              MaxNWts=100000,maxit=200,decay=bestdecay)
  trainp <- predict(net,trainx[traini,],type="class")
  validp <- predict(net,trainx[validi,],type="class")
  testp <- predict(net,testx,type="class")
  trainsizeerr[i] <- mean(trainy[traini]!=trainp)
  validsizeerr[i] <- mean(trainy[validi]!=validp)
  testsizeerr[i] <- sum(testy!=testp)/n
}
plot(log(size),trainsizeerr,type="l")
lines(log(size),validsizeerr,col=2)
lines(log(size),testsizeerr,col=3)
bestsize <- size[which.min(validsizeerr)]
# The larger, the better.

### 2f [3 marks]
# We should optimize both parameters at once to achieve better results,
# but that is computationally expensive. Instead we can alternate between
# optimizing one while the other is held fixed, repeated a few times,
# which might be cheaper. A sketch of the joint search follows.
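# A minimal sketch of the joint search described in 2f, reusing the
# traini/validi split from 2c. The grids and maxit are illustrative
# choices, not values prescribed by the assignment.
grid <- expand.grid(decay=exp(seq(-5,3,by=2)),size=c(10,30,100))
validgriderr <- numeric(nrow(grid))
for (j in 1:nrow(grid)) {
  net <- nnet(trainx[traini,],class.ind(trainy[traini]),
              size=grid$size[j],softmax=TRUE,
              MaxNWts=100000,maxit=200,decay=grid$decay[j])
  validp <- predict(net,trainx[validi,],type="class")
  validgriderr[j] <- mean(trainy[validi]!=validp)
}
grid[which.min(validgriderr),]  # the jointly selected (decay,size) pair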