##############################################
# R script for getting started with R and the
# packages 'network' and 'sna'
# v. January 19, 2015
# (c) Christian Steglich, Tom Snijders
##############################################
# The IMF data were collected by David de Jong
# at the Institute for Finance Management in
# Dar Es Salaam, Tanzania, as part of his
# sociology master thesis (2011, University
# of Groningen).
##############################################
#
# Note that what comes after the symbol #
# is interpreted by R as a comment, not a command.
#
#
# ===================
# Overview of topics:
# ===================
# a) Installation of R and R packages
# b) Getting help
# c) Basic operations
# d) Vectors
# e) Matrices
# f) Lists
# g) Data frames
# h) Data import
# i) Basic statistics
# j) Towards statistical inference
# k) Basic programming in R
# l) Regression in R (linear and logistic)
# m) Network data in R
# n) Plotting networks
# o) Basic network statistics
#
#
#
# ===================================
# a) Installation of R and R packages
# ===================================
#
# To install R, follow these steps:
# >> Go to http://cran.r-project.org/
# >> Select mirrors on the left-hand menu.
# >> Select the location nearest you.
# >> Select the link for your operating system in the
# >> "Download and Install R" section.
# >> Click on base.
# >> Follow the instructions for download.
#
# Start R after the download. To install R packages, you can use the RGui
# menu "Packages > Install package(s)..." or simply type:
install.packages("network")
install.packages("sna")
install.packages("igraph")
# Installation does not yet make the commands in a package available
# for use inside R. To make these commands available, add them to the
# current namespace by typing:
library(network)
library(sna)
# Now your "R namespace" contains the commands in these two packages,
# i.e., R will from now on understand what you mean when you want to
# use these commands.
# What if we add the igraph-package to the namespace?
library(igraph)
# Many commands have the same names as in the other two packages!
# Better is not to mix the two... (they can be used in parallel,
# but the syntax becomes more complicated - here, let us drop
# the igraph-package again:
detach(package:igraph)
# ===============
# b) Getting help
# ===============
# You can get information about functions (such as their inputs, outputs, and purpose)
# by using the "help()" function. Help files are important as they describe the
# arguments to the functions as well the values that result from running the function.
# For example, say we wanted to calculate the density of a network.
# If you knew the name of the appropriate sna-function is 'gden' you can type
help(gden)
# and learn how to do it. An alias for this command is "?gden".
# If you search for a function but are not sure of the name, type
help.search("density")
# which gives you function names that might match. An alias for this command is "??gden".
# So, what about reciprocity indices available in 'sna'?
help.search("reciprocity")
# If you search for object and function names which include an exact text quote,
# use the function "apropos()".
apropos("rec")
# This will give a list of available commands that contain the quote.
# It also is possible to look up all the functions within a package, by typing:
help(package=sna)
# This shows you the range of things you could do using that particular package.
# ===================
# c) Basic operations
# ===================
# Assigning values to variables:
a <- 20
# An alias for this would be "a=20"
# Looking at the assigned object:
a
# should give value "20".
# Basic calculations:
a+10
a/10
sqrt(a)
# Storing calculation results in new variables:
d <- sqrt(a)
d
# Alternatively, you could have written
d <- sqrt(a);d
# The semicolon separates command lines just like a carriage return.
# Another alternative for doing the calculation
# and also showing the result is putting the command between parentheses:
(d <- sqrt(a))
# Logicals:
d==a
# should give value "FALSE" (shorthand "F" or "0" when used in assignments).
# The exclamation mark ! is used for logical negation ("not..."):
d!=a
# should give value "TRUE" (shorthand "T" or "1" when used in assignments).
# Inequalities:
d=a
# To keep an overview of the available objects in your session, type
ls() # an alias is 'objects()'
# which give a list of all objects. To remove an object, type
rm(d) # an alias is 'remove()'
ls()
# Object d should be gone now.
# ==========
# d) Vectors
# ==========
# Creating a vector:
v=c(2,3,4)
v
# Looking up the second element in the vector:
v[2]
# You can create a longer vector from two smaller vectors:
v2=c(v,v)
v2
# The letter c can be read as "concatenation".
# Mixing text (character type) with numerics makes it all a character vector:
v3=c(2,3,"apple")
v3
# Check this:
is.character(v3)
is.character(v2)
# Creating vectors quickly using replication and sequence:
# To create a vector that goes from 1 to 10 by steps of 1, type
1:10
# or
seq(1,10)
# To create a vector that goes from from 2 to 10 by steps of 0.5, type
seq(2,10,by=.5)
# If you want help for the function seq(), ask for it:
?seq
# To create a vector of 100 3s, type
rep(3,100)
# You can also replicate vectors:
rep(c(3,2),100)
# There are some more fancy options of the replicate function:
# Doing each element 100 times:
rep(c(3,2),each=100)
# Vary the number of replications for each element:
rep(c(3,2),time=c(1,5))
# Now, consider operations on a vector. First define a nice one:
(a=rep(c(3,2),time=c(1,5)))
# Elementwise operations:
a*3
a+1
a+a
# Logicals for vectors:
a==3
# Subvectors:
# First create another example vector:
(b <- c(a, 1:4))
# Show a subvector:
b[1:4]
# Another example of "logicals for vectors":
(b <= 2)
# Useful function which()
which(b <= 2)
# Use this for a subvector
b[b <= 2]
# or alternatively
b[which(b <= 2)]
#
# How many elements are equal to 3?
sum(b==3)
length(b[b==3])
# Subvectors will help later to select subsets of actors from larger data sets!
# ===========
# e) Matrices
# ===========
# Creating a matrix:
mat <- matrix(1:36, nr=6,ncol=6)
mat
mat[1,] # getting the first row
mat[,1] # getting the first column
mat[2,3] # getting the element in the 2nd row and 3rd column
mat[1:3,] # getting the 1st through 3rd rows
mat[1:3,1:3] # getting the 1st through 3rd rows
# and the 1st through 3rd columns
t(mat) # transposing the matrix
# We can also make matrices using cbind or the rbind command.
# cbind=puts together as columns:
mat2=cbind(1:10,21:30)
mat2
# rbind puts together as rows:
mat3=rbind(1:10,21:30)
mat3
# Some more functions for matrices:
dim(mat2) # check the number of rows and columns using dim
nrow(mat2) # check the number of rows
ncol(mat2) # check the number of columns
# We can operate on the matrix 'cellwise':
mat4 <- mat2*2 # scalar times matrix
mat4
mat2+mat4 # sum per cell
mat2>4 # logicals
# ..or do more complex matrix multiplication:
mat2 %*% mat3
# ========
# f) Lists
# ========
# Using lists can be a good way to store data. Anything can be put into a
# list, next to one another: vectors, matrices, characters - without altering
# the type of the other elements in the list.
# The results of analytical procedures often come in this format.
alist=list(1:6,mat2,"some text")
alist
alist[[1]] # pulling the first part of the list
alist[[2]] # pulling the second part of the list
alist[[3]] # pulling the third part of the list
alist[[1]][5] # get the 5th element in the first part of the list
# It would be nicer to work with names instead of indices!
# Put names on the list:
alist=list(element1=1:6,element2=mat2)
alist
# Finding out the names a list type object has:
names(alist)
# Using the names to grab the first part of the list
alist$element1
# What if calling the name without the list?
element1
# Should result in an error. But after you type
attach(alist)
# the names of "alist" become general vocabulary, and typing
element1
# yields the desired result.
# To un-attach the names of "alist", type
detach(alist)
# When you now again type
element1
# you should again get the error message you got earlier.
# Attaching names can help typing effort!
# But it can also be confusing if the same names
# occur in different attached objects, so use it consciously.
# ==============
# g) Data frames
# ==============
# Data frames are special lists which contain variables with the same number
# of row entries. To learn more, type
?data.frame
# An example:
dat <- data.frame(id=1:6, gender=rep(c("M","F"),3))
dat
# Calculations 'as usual, e.g.:
dat[,1]*8
# Text variables are treated as categorical data:
class(dat[,2])
# The sex character has type "factor"; in many analytical methods this means
# the first category is used as reference category & the data are
# interpreted as nominal; actually "factor" is an ordinal type.
?factor
# To turn the variable type back into character, type
(dat[,2] <- as.character(dat[,2]))
# ==============
# h) Data import
# ==============
# We will often want to read in data from a saved location. See
?read.table
?read.csv
# For example, let's read in some data on the employees of the IFM
# which should be saved somewhere on your computer.
# First, use
getwd()
# which shows what R considers to be the current working directory.
# Then use
setwd("")
# to identify the folder where you placed the IFM data files.
# (When running with NppToR, this is automatically done.)
# You can use
list.files()
# to show the files in the working directory.
# If you wish to use the data set, download it from
# http://www.stats.ox.ac.uk/~snijders/siena/lab-intro.zip
# and unzip in your working directory.
# Read the attribute data:
IFMattrib <- read.csv("IFM-attributes.csv")
# Let's take a look:
IFMattrib[1:10,]
# ===================
# i) Basic statistics
# ===================
# Average, standard deviation, quartiles:
mean(IFMattrib$age) # calculating the mean gpa in our dataset
sd(IFMattrib$age) # calculating the standard deviation of age
quantile(IFMattrib$age) # looking at the quartiles(! default)
hist(IFMattrib$tenure,main="Years at the IFM") # a histogram
# As shown above, you can get rid of always writing "IFMattrib$"
# as follows:
attach(IFMattrib)
# Some more examples of what is possible is basic R:
table(jobtitle) # tabulation of the data
mean(tenure[sex=='F']) # mean tenure of female employees
mean(tenure[sex=='M']) # mean tenure of male employees
cor(age,tenure) # correlation
plot(jobtitle,tenure) # boxplots of tenure by job title
plot(density(MotSoc)) # a density plot of social job motivation
plot(MotSoc,MotEgo) # a scatterplot of social by ego motivation
# etc.
# You can make the attach command undone by this command:
detach(IFMattrib)
# which is useful to avoid future confusion (see above).
# ================================
# j) Towards statistical inference
# ================================
# We can take draws from an underlying model distribution as a way
# of generating a reference (null) distribution, then compare what
# this with some fixed (typically: observed) value.
# For example, let's takes some draws from a normal distribution,
# mean =0, sd=1, and ask how likely we would be to find a value
# of 1.96 or greater.
# Of course, this should be a familar question to anyone familar
# with traditional hypothesis testing...
# Take a random sample of size 50000 from the N(0,1) distribution:
rand.dist <- rnorm(50000,0,1)
# Take a look:
rand.dist
# Inspect distribution:
plot(density(rand.dist))
# Should look pretty normal...
# What is the relative frequency of a value of 1.96 or higher?
sum(rand.dist>1.96)/50000
# Should be about 2.5% of the time.
# Bootstrapping & permutations:
# =============================
# We can make perform statistical comparisons without assuming
# the data is normally distributed. we can take draws from the
# underlying data, with replacement, and use that as our
# random baseline comparison.
# Let's see how to do that using the IFM data.
# Set the sample size at the number of observations:
sample.size <- nrow(IFMattrib)
sample.size
# Is seventy-seven.
# Take 77 draws withOUT replacement from the age data:
sim.age <- sample(IFMattrib$age,replace=FALSE,size=sample.size)
sim.age
# This can be viewed as a permutation of the original data,
# because we sampled without replacement. Statistics should be
# identical to the original sample:
quantile(sim.age)
quantile(IFMattrib$age)
# indeed identical - etc.
# Now take 100 draws WITH replacement from the age data:
sim2.age <- sample(IFMattrib$age,replace=TRUE,size=100)
sim2.age
# Statistics now should be similar, but not identical to the
# original sample:
quantile(sim2.age)
quantile(IFMattrib$age)
# Let's ask a hypothetical question:
# Is an age of 44 large relative to the sample?
# Divide the number of draws greater than or equal to 44 by the
# total number of draws:
sum(sim2.age>=44)/length(sim.age)
# Apparently this is not significantly larger than the age
# average. As last step, let's visualise the bootstrapped age
# distribution:
plot(density(sim2.age))
# Looks like a mixture distribution of old and young staff.
# Of course, this could already be seen in the original data...
# =========================
# k) Basic programming in R
# =========================
# A very short introduction to loops, if-else statements and functions.
# A LOOP allows you to iteratively do the same process, for example
# let's loop through each column of our dataset, finding the maximum
# value for each column. There isn't a maximum value for our character
# variables (that doesn't really make sense), so let's also use an
# IF statement so that we only perform the "max()" function on the
# columns that are numeric.
# First set up an output vector to put values in it:
maxVector <- NA
maxVector
# Now do the calculations:
for (i in 1:ncol(IFMattrib)) {
if (class(IFMattrib[,i]) %in% c("integer", "numeric" )) {
maxVector[i] <- max(IFMattrib[,i])
} else {
maxVector[i] <- NA
}
}
# Let's take a look at our output:
maxVector
# For clarity, attach labels:
names(maxVector) <- colnames(IFMattrib)
maxVector
# Now, a simple example of a FUNCTION.
# It is often helpful to be able to write your own function in R.
# Functions allows us to specify some inputs, what to do with those inputs
# and what the output is, given our input and the operation on the inputs.
# Let's use a real simple example calculating the mean to get the idea:
mean.myfunc <- function(x){
return(sum(x)/length(x))
}
# The input is x, a vector, and the operation is sum(x)/length(x).
# Now let's see what it does:
mean.myfunc(x=IFMattrib$MotSoc)
# Note that the input into the function does need to be named "x";
# you just need to tell the function what x is equal to. If there
# is just one argument (here x), you can also omit this identification:
mean.myfunc(IFMattrib$MotSoc)
# As we know the same function already exists in R:
mean(IFMattrib$MotSoc)
# ========================================
# l) Regression in R (linear and logistic)
# ========================================
# Linear model
# ------------
# Still using the IFM data, a simple linear model predicting tenure
# as a function of job title and gender is this:
tenure.lm <- lm(tenure~factor(jobtitle)+sex,data=IFMattrib)
summary(tenure.lm)
# Logistic model
# --------------
# Now let's try to predict who has above average social motivation!
# Make it a dummy variable; first reserve a vector of good length:
motsoc.dummy <- rep(0,nrow(IFMattrib))
# Now replace the zero with one for above average MotSoc:
motsoc.dummy[IFMattrib$MotSoc>mean(IFMattrib$MotSoc)]=1
motsoc.dummy
# Put the created dummy onto the dataset:
IFMattrib$motsoc.dummy <- motsoc.dummy
# Estimate a logistic regression model predicting motsoc-dummy from job & gender:
motsoc.glm <- glm(motsoc.dummy~factor(jobtitle)+sex,data=IFMattrib,family="binomial")
summary(motsoc.glm)
# ====================
# m) Network data in R
# ====================
# To be sure that the commands are available:
library(network)
library(sna)
# Before we can describe, plot and ultimately run statistical models on our network
# we need to get the network into R and into a useable format
# There are lots of different formats, but two common are edgelists and matrices
# it is also possible to read in from the pajek format, see "?read.paj".
# We are going to work with the IFM advice network which should be saved
# as a matrix and edgelist somewhere on your computer.
# the network corresponds to the data we've already been using above
# so the ids on the data link to the ids on the network
# Let's first read in the IFM advice network in the MATRIX format and make it
# a network object. For reading in the network as a matrix of 1s and 0s, type...
# (Make sure that the working directory is set correctly before executing this)
IFM.mat <- read.csv("IFM-advice.csv",header=TRUE)
IFM.mat
# Grab the ids from the input matrix:
ids <- IFM.mat[,1]
ids
# Take off the first column (as it is the ids):
IFM.mat <- IFM.mat[,-1]
# What class does the read object have?
class(IFM.mat)
# Transform into network object:
IFM.mat <- as.matrix(IFM.mat)
class(IFM.mat)
IFM.net <- network(IFM.mat,directed=TRUE)
class(IFM.net)
IFM.net
# Now once more the same thing, but reading in the saved EDGELIST:
IFM.edges <- read.csv("IFM-advice-edges.csv",header=TRUE)
# Initializing the network object to be the right size and undirected:
IFM2.net <- network.initialize(n=77,directed=TRUE)
# Put edges on network object:
IFM2.net <- network.edgelist(IFM.edges,IFM2.net)
IFM2.net
# Check if the two are identical:
IFM2.net==IFM.net
# Oops! Not implemented... but as matrices:
table(as.sociomatrix(IFM2.net)==as.sociomatrix(IFM.net))
# however, not as objects:
?identical
identical(IFM2.net, IFM.net)
# Now that we have our network we want to put the characteristics
# of the people onto the network.
# We've already read in the IFMattrib data so we can use that.
# Put gender onto the network:
IFM.net %v% "sex" <- as.character(IFMattrib$sex)
# The 'as.character()' is necessary because variables of the class
# 'factor' cannot be attached to network objects!
# NOTE that you need to make sure that the characteristics (here IFMattrib$sex)
# are in the same order as the ids on the network, otherwise the data won't
# match up with people in the network.
# Check this by comparing the id vectors:
sum(ids==IFMattrib$ID)
# Should be 77, if the ids match up.
# Add some more attributes:
IFM.net %v% "religion" <- as.character(IFMattrib$religion)
IFM.net %v% "age" <- IFMattrib$age
IFM.net %v% "tenure" <- IFMattrib$tenure
IFM.net %v% "marital" <- as.character(IFMattrib$marital.status)
IFM.net %v% "jobtitle" <- as.character(IFMattrib$jobtitle)
IFM.net %v% "motsoc" <- IFMattrib$MotSoc
IFM.net %v% "motego" <- IFMattrib$MotEgo
# ====================
# n) Plotting networks
# ====================
# It is important to get impressions of a network before jumping into
# any statistical analysis.
# Are there indegree- or outdegree outliers?
sociomatrixplot(IFM.net,diaglab=FALSE,cex.lab=.5)
plot(IFM.net)
# Colour the nodes by gender:
plot(IFM.net,vertex.col="sex")
# Colour the nodes by jobtitle
plot(IFM.net,vertex.col="jobtitle")
# Scale the vertex by tenure:
plot(IFM.net,vertex.col="jobtitle",vertex.cex="tenure")
# Oops! scale size down a bit:
plot(IFM.net,vertex.col="jobtitle",vertex.cex=1+(IFMattrib$tenure/10))
# For more plotting options, try
?gplot
?gplot.layout
# Nicer plots are available in the igraph package, by the way:
igraph::plot.igraph(igraph::graph.adjacency(IFM.net),
vertex.size=7,vertex.color=IFMattrib$jobtitle,
edge.arrow.size=0.5)
# Here you can also see how a package not added to the namespace
# (but installed on the machine) can be used: you give the
# command, but add the package name and two colons before.
# ===========================
# o) Basic network statistics
# ===========================
# Some important graph level statistics:
gden(IFM.net) # density
grecip(IFM.net,measure="edgewise") # reciprocity
grecip(IFM.net,measure="dyadic")
# dyadic reciprocity index, considering N dyads reciprocal
?grecip
# Dyad and triad census:
dyad.census(IFM.net)
triad.census(IFM.net)
# For the fun of it, once more in igraph:
igraph::dyad.census(igraph::graph.adjacency(IFM.net))
igraph::triad.census(igraph::graph.adjacency(IFM.net))
# Slightly worse layout...
# # To arrive at a transitivity index, we (A) need to count how many
# transitive triplets each triad type contains. Then (B) we can
# calculate how many transitive triplets are in the whole network.
# For turning this count into an index, we (C) must calculate how
# many opportunities for transitive closure there are in the data,
# i.e., how many 2-paths (which may or may not be 'cut short' by a
# direct transitive link) Then (D) the ratio "number of transitive
# triples" divided by "number of two-paths" is the transitivity index.
# Let's calculate, then.
# (T.A) In the slides "Introduction.pdf" you see the triad configura-
# tions and a sum indicating how many transitive triplets they contain.
# For the sequence of types 003 012 102 021D 021U 021C 111D 111U 030T
# 030C 201 120D 120U 120C 210 300, we get these weights:
trans.weight <- c(0,0,0,0,0,0,0,0,1,0,0,2,2,1,3,6)
# (T.B) Now use these numbers to weight the triad census and count the
# total transitive triplet number:
tr.triplets <- sum(trans.weight*triad.census(IFM.net))
# (T.C) The same way we can calculate the number of two-pathes:
path2.weight <- c(0,0,0,0,0,1,1,1,1,3,2,2,2,3,4,6)
path2 <- sum(path2.weight*triad.census(IFM.net))
# (T.D) Now calculate it:
(transitivity <- tr.triplets/path2)
# This was one possible calculation... another would be the following:
# (T2.B) Calculate who is indirectly linked (i.e., in two steps) to whom:
IFM.mat.squared <- IFM.mat %*% IFM.mat
# (T2.C) Exclude loops back to self (i.e., reciprocal ties):
diag(IFM.mat.squared) <- 0
# (T2.D) Triplets are ties co-occurring with indirect links:
tr.triplets.matrix <- IFM.mat.squared * IFM.mat
# (T2.E) Transitivity index recalculated:
(transitivity2 <- sum(tr.triplets.matrix)/sum(IFM.mat.squared))
# Is it the same result?
transitivity2
# ... should be:
transitivity==transitivity2
# Note that there might be differences due to rounding errors...
#
# Ok, move on...
# Outdegree distribution:
outdegree <- degree(IFM.net,cmode="outdegree")
outdegree
hist(outdegree)
quantile(outdegree)
# Calculate the distance matrix:
IFMdist <- geodist(IFM.net)
IFMdist$gdist
table(IFMdist$gdist)
hist(IFMdist$gdist)
plot(table(IFMdist$gdist),type='b')
# Calculate the reachability matrix:
IFMreach <- reachability(IFM.net)
table(IFMreach)