######################################################################### #Stats Taster day 30th March 2017 Dept of Statistics Oxford #Multidimensional Scaling (MDS) as a visualisation tool in Data Mining ######################################################################### #load some libraries of functions we will u library(MASS); library(cluster); require(rgdal); require(sp); require(plotGoogleMaps) ######################################################################### # # Do an MDS projection using distances between postcodes # Compare the MDS map with the real map ### #First the real map #Here is the postcode data - this is just the district code NW1 etc #I found these data here https://www.doogal.co.uk/PostcodeDownloads.php pc.g=read.csv("http://www.stats.ox.ac.uk/~nicholls/StatsTasterDay/Postcode_districts.csv") head(pc.g) #pull out the Oxford codes - Change this to eg NW for the NW codes codes.of.interest <- grep("OX",pc.g$Postcode) pc.l=pc.g[codes.of.interest,] #just the data for the area of interest pc.n=length(codes.of.interest) #the number of selected postcodes head(pc.l) #top rows of dataframe for selected poscodes #choose the labels we want to display on the map - the first listed is displayed datamap<-pc.l[,c("Postcode","Population","Region")] row.names(datamap) <- 1:pc.n #give the locations of the poscodes datamap_mat<- cbind(pc.l$Longitude,pc.l$Latitude) row.names(datamap_mat) <- 1:nrow(datamap_mat) #tell Google Maps we are using Lat & Long and prepar the map AACRS <- CRS("+proj=longlat +ellps=WGS84") UK_Map <- SpatialPointsDataFrame(datamap_mat, datamap, proj4string = AACRS, match.ID = TRUE) #display the map m <- plotGoogleMaps(UK_Map , filename='MAP_UK.html') ### #Second the MDS projection - we KNOW the data has a 2D projection - because the #data came off a map #compute the distances between postcodes pc.d=dist(datamap_mat) #for example compute the distance between OX1 and OX2 by hand datamap[c(1,2),] p12=datamap_mat[c(1,2),]; p12 #the next two numbers should be equal sqrt( (p12[1,1]-p12[2,1])^2+(p12[1,2]-p12[2,2])^2 ) pc.d[1] #calculate the MDS projection - points located to reproduce the distances in pc.d pc.sam=sammon(pc.d,y=matrix(runif(pc.n*2),pc.n,2)) #plot the points - the first line makes a blank plot of the right area, the second #writes the text labels for the postcodes in the right places plot(pc.sam$points,xaxt='n',yaxt='n',xlab='',ylab='',main="My Town",type='n') text(pc.sam$points,labels=pc.l$Postcode,pos=4,cex=0.7) #Compare this figure with google map - you will find the MDS points are in #the same relative locations as the real postcodes - probably rotated - #the distances by themselves dont determine the rotation. ################################################################################### # # apply this to show similarities between peoples' interests # #the peoples names are in coloumn 1 of the data ch<-read.csv("http://www.stats.ox.ac.uk/~nicholls/StatsTasterDay/choices.csv",row.names=1) ch #some basic operations we use to pull out different parts of the data frame names(ch) ch[,1:9] #I wont use age or gender info in MDS, just in final display ch$TV ch["Geoff",] #my preferences ch["Keith",] #my Dad's preferences ch["Geoff",1:9]!=ch["Keith",1:9] #we differ on 5 out of 9 #we will use this to define the distance between each pair of individuals mean(ch["Geoff",1:9]!=ch["Keith",1:9]) #calculate distances between all pairs np=dim(ch)[1]; d=matrix(0,np,np); for (i in 1:np) { for (j in 1:np) { d[i,j]=mean(ch[i,1:9]!=ch[j,1:9]) } } d=as.dist(d) #convert to standard format R uses for distance matrices d #show the distances as an image image(as.matrix(d),xaxt='n',yaxt='n') axis(1, at=seq(0,1,length.out=np),label=rownames(ch),cex.axis=0.7,las=2) axis(2, at=seq(0,1,length.out=np),label=rownames(ch),cex.axis=0.7,las=2) #calculate the MDS point pattern matching the distances in d ch.mds<-sammon(d,magic=0.50001); pts=ch.mds$points #plot the points in the same way as before, color by age, point char by gender plot(1.5*pts,xaxt='n',yaxt='n',xlab='',ylab='',main="Favorites",type='n') palette(rainbow(length(levels(ch$Age)))) #the number of colors points(pts,pch=as.numeric(ch$Gender)) text(pts,labels=row.names(ch),pos=4,cex=0.9,col=as.numeric(ch$Age)) #We get a simple visualisation of how different people have similar interests # # EXERCISE - OK now your turn. Analyse the data we gathered via the google form # ... open a new script, paste the text from this section into the new script # and then edit it to handle the new data. #Here is a loader for the data ch<-read.csv("http://www.stats.ox.ac.uk/~nicholls/StatsTasterDay/Interests.csv",row.names=2) # #You write this bit # ########################################################################### # # If you have time - an analysis of some political data - this is a big calculation # but it gives a nice visualisation of some very high dimensional data - real data mining. # # Here is some data on MP voting in parliament. It shows for each MP whether # they voted for or against each motion in parliament - or didnt vote at all - # we will make a display showing similarities and differences between MP votes. # # Data downloaded from http://www.publicwhip.org.uk/ vote.data<-function(year) { handle=paste("vote",as.character(year),sep='') web.handle=paste("http://www.stats.ox.ac.uk/~nicholls/StatsTasterDay/",handle,sep='') #how they voted X<-read.table(paste(web.handle,".csv",sep=''),header=T,sep=",",comment.char="",quote="") #who they are N<-read.table(paste(web.handle,".txt",sep=''),header=T,sep=",",comment.char="",quote="") X[X==-9]<-NA #DNA X[X==2]<-1 #voted yes (by asking someone else to record their vote) X[X==5]<-4 #voted no X[X==4]<-2 #make the yes's all 1 and no's all 2 X[X==3]<-NA #abstained X<-X[,-c(1:4)] #info re motion/law voted on dX=dim(X) X<-X[,-dX[2]] #kill unwanted empty col at end #replace the MP id's with names and match to info about MP in "M" M=N[order(as.character(N[,1])),] Y=X[,order(names(X))] names(Y)<-M[,3] return(list(M=M,Y=Y)) } #load the data and put it in a form suitable for analysis dat2010=vote.data(year=2010) M=dat2010$M; Y=dat2010$Y head(M) #MP names and parties Y[c(1:4),c(1:10)] #the votes data (top few lines) NA is "did not vote" dim(Y) #there are 664 distinct MP's voting on 1226 motions #make a distance matrix - use binary distance ignoring NAvalues d<-dist(t(Y==1),'binary') #this takes a few seconds d<-d+0.001 #MDS doesnt allow zero distance image(as.matrix(d)) #(*1) calculate the MDS point mattern n=dim(Y)[2]; y.init=matrix(runif(2*n),n,2); sam<-sammon(d,y=y.init,magic=0.50001) #(*2) plot the points showing the party palette(rainbow(length(levels(M$party)))) plot(1.25*sam$points,xaxt='n',yaxt='n',xlab='',ylab='',main=as.character(2010),type='n') text(sam$points,labels=M$party,pos=4,cex=0.7,col=as.numeric(M$party)) #replace labels=M$party with labels=names(Y) to get MP names #this may not be so clear - it is a hard optimisation problem to find the point pattern #best matching the distances d - try repeating the code from (*1, *2) to get a lower stress #Now do exactly the same for each of the parliaments #1997-2001, 2001-2005, 2005-2010, 2010-2015 vote.mds<-function(year='',M,Y,K=10,show=TRUE) { #a function that loads each data set and computes the MDS projection #name for title of graph handle=paste("vote",as.character(year),sep='') #distance matrix d<-dist(t(Y==1),'binary') d<-d+0.001 #MDS doesnt allow zero distance n=dim(Y)[2]; bs=1; bm=NA for (k in 1:K) { y.init=matrix(runif(2*n),n,2); mg=runif(1); di.sam<-sammon(d,y=y.init,magic=mg,niter=10000,tol=1e-8) if (di.sam$stress