################### sienaGOF_vdB.R ############################################# # # This is an R script for demonstrating sienaGOF # using the van de Bunt data (available from the Siena website). # R script written by Tom Snijders. # Version September 9, 2020. ################################################################################ # # Where are you? getwd() # If necessary, change your directory. # We use the van de Bunt data. # For meaning and codes, consult the Siena website or the Siena manual. # Look at # https://www.stats.ox.ac.uk/~snijders/siena/vdBunt_data.htm # # You may read the data outside of R, but from within R the following may work: download.file("https://www.stats.ox.ac.uk/~snijders/siena/vdBunt_data.zip", destfile='vdb.zip') unzip('vdb.zip') # Then continue with reading into this R session # the files that now were put in your working directory: vdb.w0 <- as.matrix(read.table("VRND32T0.DAT")) vdb.w1 <- as.matrix(read.table("VRND32T1.DAT")) vdb.w2 <- as.matrix(read.table("VRND32T2.DAT")) vdb.w3 <- as.matrix(read.table("VRND32T3.DAT")) vdb.w4 <- as.matrix(read.table("VRND32T4.DAT")) vdb.w5 <- as.matrix(read.table("VRND32T5.DAT")) vdb.w6 <- as.matrix(read.table("VRND32T6.DAT")) vdb.attr <- as.matrix(read.table("VARS.DAT")) # Take account of missing data codes 6 and 9: vdb.w0[vdb.w0 %in% c(6,9)] <- NA vdb.w1[vdb.w1 %in% c(6,9)] <- NA vdb.w2[vdb.w2 %in% c(6,9)] <- NA vdb.w3[vdb.w3 %in% c(6,9)] <- NA vdb.w4[vdb.w4 %in% c(6,9)] <- NA vdb.w5[vdb.w5 %in% c(6,9)] <- NA vdb.w6[vdb.w6 %in% c(6,9)] <- NA # Recode 4 (acquaintance) and 5 (difficult) to no tie vdb.w0[vdb.w0 %in% c(4,5)] <- 0 vdb.w1[vdb.w1 %in% c(4,5)] <- 0 vdb.w2[vdb.w2 %in% c(4,5)] <- 0 vdb.w3[vdb.w3 %in% c(4,5)] <- 0 vdb.w4[vdb.w4 %in% c(4,5)] <- 0 vdb.w5[vdb.w5 %in% c(4,5)] <- 0 vdb.w6[vdb.w6 %in% c(4,5)] <- 0 # Use the "friendly relation" relation by recoding: vdb.w0[vdb.w0 %in% c(1,2,3)] <- 1 vdb.w1[vdb.w1 %in% c(1,2,3)] <- 1 vdb.w2[vdb.w2 %in% c(1,2,3)] <- 1 vdb.w3[vdb.w3 %in% c(1,2,3)] <- 1 vdb.w4[vdb.w4 %in% c(1,2,3)] <- 1 vdb.w5[vdb.w5 %in% c(1,2,3)] <- 1 vdb.w6[vdb.w6 %in% c(1,2,3)] <- 1 # Attach RSiena library(RSiena) # Attributes: sex <- coCovar(vdb.attr[,1]) program <- coCovar(vdb.attr[,2]) smoke <- coCovar(vdb.attr[,3]) # waves 2-3-4 friends234 <- sienaDependent(array(c(vdb.w2,vdb.w3,vdb.w4), dim=c(32, 32, 3)), allowOnly=FALSE) vdb.data234 <- sienaDataCreate(friends234,sex,program,smoke) # Algorithm creation: vdb.algo <- sienaAlgorithmCreate(projname = 'vdb_gof234', seed=123456) # Define some functions from ?'sienaGOF-auxiliary' # (the quotes are necessary because this name contains a hyphen) # The geodesic distribution is not available from within RSiena, # and therefore is copied from the help page of sienaGOF-auxiliary: # GeodesicDistribution calculates the distribution of non-directed # geodesic distances; see ?sna::geodist # The default for \code{levls} reflects the usual phenomenon # that geodesic distances larger than 5 # do not differ appreciably with respect to interpretation. # Note that the levels of the result are named; # these names are used in the \code{plot} method. GeodesicDistribution <- function (i, data, sims, period, groupName, varName, levls=c(1:5,Inf), cumulative=TRUE, ...) { x <- networkExtraction(i, data, sims, period, groupName, varName) require(sna) a <- sna::geodist(symmetrize(x))$gdist if (cumulative) { gdi <- sapply(levls, function(i){ sum(a<=i) }) } else { gdi <- sapply(levls, function(i){ sum(a==i) }) } names(gdi) <- as.character(levls) gdi } # The following function is taken from the help page for sienaTest testall <- function(ans){ for (i in which(ans$test)){ sct <- score.Test(ans,i) cat(ans$requestedEffects$effectName[i], '\n') print(sct)} invisible(score.Test(ans)) } ################################################################################ # After having defined these functions, we can now go to applying sienaGOF # to the van de Bunt data for waves 2, 3, 4. # Define a model to decide between three specifications of transitivity. # This forward model selection is not recommended in general; # it is used here only to demonstrate the use of the sienaGOF function. vdb.eff234 <- getEffects(vdb.data234) vdb.eff234 <- includeEffects(vdb.eff234, transTrip, transTies, gwespFF, test=TRUE, fix=TRUE) vdb.eff234 # This model only estimates parameters for rates, outdegree, and reciprocity. # The three transitivity parameters are tested by a score-type test, # and not estimated. # Note that these models are very inadequate from many points of view! (ans0 <- siena07(vdb.algo, data=vdb.data234, effects=vdb.eff234, returnDeps=TRUE)) testall(ans0) (gofi0 <- sienaGOF(ans0, IndegreeDistribution, verbose=TRUE, join=TRUE, varName="friends234")) (gofo0 <- sienaGOF(ans0, OutdegreeDistribution, verbose=TRUE, join=TRUE, levls=c(0:10,15,20),varName="friends234")) (gof0.tc <- sienaGOF(ans0, TriadCensus, verbose=TRUE, join=TRUE, varName="friends234")) (gof0.gd <- sienaGOF(ans0, GeodesicDistribution, cumulative=FALSE, verbose=TRUE, join=TRUE, varName="friends234")) # Fit for indegree distribution is OK; # for the other three auxiliary statistics it is not. # For example: plot(gof0.gd) # The model simulates too many geodesic distances 2, and too few distances 5 descriptives.sienaGOF(gof0.gd) # What about distances 6 or 7? (gof0.gd7 <- sienaGOF(ans0, GeodesicDistribution, cumulative=FALSE, levls=c(1:7,Inf), verbose=TRUE, join=TRUE, varName="friends234")) descriptives.sienaGOF(gof0.gd7) # No distances 6 are observed, and some are simulated (depending on the random simulations) # No distances 7 are observed, and none are simulated: descriptives.sienaGOF(gof0.gd7, showAll=TRUE) # The summaries give the approximate value of the Mahalanobis distance, # if the tested & fixed effects would be added: summary(gof0.tc) summary(gof0.gd) # These values are given here as MHD. # See Lospinoso & Snijders 2019, at https://doi.org/10.1177/2059799119884282 # and the help page: ?sienaGOF # The summaries for gof0.tc and gof0.gd suggest that the model # should be extended by gwespFF or transTrip. We go for gwespFF. vdb.eff234.a <- includeEffects(vdb.eff234, gwespFF) (ansa <- siena07(vdb.algo, data=vdb.data234, effects=vdb.eff234.a, prevAns=ans0, returnDeps=TRUE)) testall(ansa) # The two other transitivity effects now have become non-significant. (gofia <- sienaGOF(ansa, IndegreeDistribution, verbose=TRUE, join=TRUE, varName="friends234")) (gofoa <- sienaGOF(ansa, OutdegreeDistribution, verbose=TRUE, join=TRUE, levls=c(0:10,15,20),varName="friends234")) (gofa.tc <- sienaGOF(ansa, TriadCensus, verbose=TRUE, join=TRUE, varName="friends234")) (gofa.gd <- sienaGOF(ansa, GeodesicDistribution, cumulative=FALSE, verbose=TRUE, join=TRUE, varName="friends234")) # Now the fit is OK for all four auxiliary statistics. plot(gofoa) # To get a plot for the triad census, the values should be centered and scaled, # because they differ wildly plot(gofa.tc) plot(gofa.tc, center=TRUE, scale=TRUE) # It is helpful to widen the plot. # The codes for the triads (003 etc.) are the usual MAN notation; # for example, see https://doi.org/10.1371/journal.pone.0197514.g001 # Is the model homogeneous for the two periods? sienaTimeTest(ansa) # Yes. ############################################################################### # To get an impression of the distribution of the number of components, # use the following. # The components are computed by the function geo.desc, see below. # This employs the package igraph; the following function is required. # The first function again is taken from the help page for sienaGOF-auxiliary. # igraphNetworkExtraction extracts simulated and observed networks # from the results of a siena07 run. # It returns the network as an edge list of class "graph" # according to the igraph package. # Ties for ordered pairs with a missing value for wave=period or period+1 # are zeroed; # note that this also is done in RSiena for calculation of target statistics. # However, changing structurally fixed values are not taken into account. igraphNetworkExtraction <- function(i, data, sims, period, groupName, varName) { require(igraph) dimsOfDepVar<- attr(data[[groupName]]$depvars[[varName]], "netdims") missings <- is.na(data[[groupName]]$depvars[[varName]][,,period]) | is.na(data[[groupName]]$depvars[[varName]][,,period+1]) if (is.null(i)) { # sienaGOF wants the observation: original <- data[[groupName]]$depvars[[varName]][,,period+1] original[missings] <- 0 returnValue <- graph.adjacency(original) } else { missings <- graph.adjacency(missings) #sienaGOF wants the i-th simulation: returnValue <- graph.difference( graph.empty(dimsOfDepVar) + edges(t(sims[[i]][[groupName]][[varName]][[period]][,1:2])), missings) } returnValue } # The following function is not documented in sienaGOF-auxiliary # but is meant just as an additional self-defined auxiliary function # calculating the size of the largest component, # the number of components, and the diameter. # It uses igraph to calculate the components. geo.desc <- function(i, data, sims, period, groupName, varName){ require(igraph) ai <- igraphNetworkExtraction(i, data, sims, period, groupName, varName) n <- vcount(ai) # number of nodes # size largest weak component componentList <- clusters(ai,mode="weak") comp1 <- max(componentList$csize) # number of components ncomp <- componentList$no # diameter diam <- diameter(as.undirected(ai), unconnected=TRUE) descr <- c(comp1, ncomp, diam) names(descr) <- c("comp1", "ncomp", "diam") descr } # To get an impression of the distribution of the number of components, # use the following. # Note that join=FALSE is used here, to get separate calculations # for the two periods; join=TRUE will calculate the sum over the periods. (gofa.geo <- sienaGOF(ansa, geo.desc, verbose=TRUE, join=FALSE, varName="friends234")) descriptives.sienaGOF(gofa.geo, period=1) descriptives.sienaGOF(gofa.geo, period=2)