################### test_gof_vdB.R #############################################
#
# This is an R script for demonstrating test_gof
# using the van de Bunt data (available from the Siena website).
# R script written by Tom Snijders.
# Version May 24, 2026.
################################################################################
#

# Where are you?
getwd()
# If necessary, change your directory.

# We use the van de Bunt data.
# For meaning and codes, consult the Siena website or the Siena manual.
# Look at
# https://www.stats.ox.ac.uk/~snijders/siena/vdBunt_data.htm
#
# You may read the data outside of R, but from within R the following may work:

download.file("https://www.stats.ox.ac.uk/~snijders/siena/vdBunt_data.zip",
                                                        destfile='vdb.zip')
unzip('vdb.zip')

# Then continue with reading into this R session
# the files that now were put in your working directory:

vdb.w0 <- as.matrix(read.table("VRND32T0.DAT"))
vdb.w1 <- as.matrix(read.table("VRND32T1.DAT"))
vdb.w2 <- as.matrix(read.table("VRND32T2.DAT"))
vdb.w3 <- as.matrix(read.table("VRND32T3.DAT"))
vdb.w4 <- as.matrix(read.table("VRND32T4.DAT"))
vdb.w5 <- as.matrix(read.table("VRND32T5.DAT"))
vdb.w6 <- as.matrix(read.table("VRND32T6.DAT"))
vdb.attr <- as.matrix(read.table("VARS.DAT"))

# Take account of missing data codes 6 and 9:

vdb.w0[vdb.w0 %in% c(6,9)] <- NA
vdb.w1[vdb.w1 %in% c(6,9)] <- NA
vdb.w2[vdb.w2 %in% c(6,9)] <- NA
vdb.w3[vdb.w3 %in% c(6,9)] <- NA
vdb.w4[vdb.w4 %in% c(6,9)] <- NA
vdb.w5[vdb.w5 %in% c(6,9)] <- NA
vdb.w6[vdb.w6 %in% c(6,9)] <- NA

# Recode 4 (acquaintance) and 5 (difficult) to no tie

vdb.w0[vdb.w0 %in% c(4,5)] <- 0
vdb.w1[vdb.w1 %in% c(4,5)] <- 0
vdb.w2[vdb.w2 %in% c(4,5)] <- 0
vdb.w3[vdb.w3 %in% c(4,5)] <- 0
vdb.w4[vdb.w4 %in% c(4,5)] <- 0
vdb.w5[vdb.w5 %in% c(4,5)] <- 0
vdb.w6[vdb.w6 %in% c(4,5)] <- 0

# Use the "friendly relation" relation by recoding:

vdb.w0[vdb.w0 %in% c(1,2,3)] <- 1
vdb.w1[vdb.w1 %in% c(1,2,3)] <- 1
vdb.w2[vdb.w2 %in% c(1,2,3)] <- 1
vdb.w3[vdb.w3 %in% c(1,2,3)] <- 1
vdb.w4[vdb.w4 %in% c(1,2,3)] <- 1
vdb.w5[vdb.w5 %in% c(1,2,3)] <- 1
vdb.w6[vdb.w6 %in% c(1,2,3)] <- 1

# Attach RSiena
library(RSiena)

# Attributes:
sex     <- as_covariate_rsiena(vdb.attr[,1])
program <- as_covariate_rsiena(vdb.attr[,2])
smoke   <- as_covariate_rsiena(vdb.attr[,3])

# waves 2-3-4
friends234 <- as_dependent_rsiena(array(c(vdb.w2,vdb.w3,vdb.w4),
                            dim=c(32, 32, 3)), allowOnly=FALSE)
vdb.data234 <- make_data_rsiena(friends234,sex,program,smoke)

# Algorithm creation:
alg_alg <- set_algorithm_saom(seed=123456)

# Define a function from
?test_gof_auxiliary

# The geodesic distribution is not available from within RSiena,
# and therefore is copied from the help page of test_gof_auxiliary:

# GeodesicDistribution calculates the distribution of non-directed
   # geodesic distances; see ?sna::geodist
   # The default for \code{levls} reflects the usual phenomenon
   # that geodesic distances larger than 5
   # do not differ appreciably with respect to interpretation.
   # But take care, perhaps you want to change it!
   # Note that the levels of the result are named;
   # these names are used in the \code{plot} method.
GeodesicDistribution <- function (i, data, sims, period, groupName,
   varName, levls=c(1:5,Inf), cumulative=TRUE, ...) {
     x <- networkExtraction(i, data, sims, period, groupName, varName)
     require(sna)
     a <- sna::geodist(symmetrize(x))$gdist
     if (cumulative)
     {
       gdi <- sapply(levls, function(i){ sum(a<=i) })
     }
     else
     {
       gdi <- sapply(levls, function(i){ sum(a==i) })
     }
     names(gdi) <- as.character(levls)
     gdi
}

# The following function is taken from the help page for sienaTest

testall <- function(ans){
    for (i in which(ans$test)){
    sct <- test_parameter(ans, method='score', tested=i)
    cat(ans$requestedEffects$effectName[i], '\n')
    print(sct)}
test_parameter(ans, method='score')
}

################################################################################
# After having defined these functions, we can now go to applying test_gof
# to the van de Bunt data for waves 2, 3, 4.
# First we specify the model and estimate it.

# Define a model to decide between three specifications of transitivity.
# This forward model selection is not recommended in general;
# it is used here only to demonstrate the use of the test_gof function.

vdb.eff234 <- make_specification(vdb.data234)
vdb.eff234 <- set_effect(vdb.eff234, list(transTrip, transTies, gwespFF),
         fix=TRUE, test=TRUE)
vdb.eff234

# This model only estimates parameters for rates, outdegree, and reciprocity.
# The three transitivity parameters are tested by a score-type test,
# and not estimated.
# Note that these models are very inadequate from many points of view!
# To enable a goodness of fit test, the simulated networks
# have to be included in the object produced by siena.
# This is specified by returnDeps=TRUE.

(ans0  <- siena(data=vdb.data234, effects=vdb.eff234, returnDeps=TRUE,
         control_algo=alg_alg))
testall(ans0)

# We see that all score-type tests are highly significant,
# highlighting the inadequacy of this model.
# But we look at the goodness of fit anyway.

###############################################################################

(gofi0 <- test_gof(ans0, IndegreeDistribution, verbose=TRUE, join=TRUE,
     varName="friends234"))
(gofo0 <- test_gof(ans0, OutdegreeDistribution, verbose=TRUE, join=TRUE,
     levls=c(0:10,15,20),varName="friends234"))
(gof0.gd <- test_gof(ans0, GeodesicDistribution, cumulative=FALSE,
     verbose=TRUE, join=TRUE, varName="friends234"))
# For the triad count, we demonstrate below how to use test_gof
# to give an impression of how much the fit would improve
# if a specific effect among the fixed effects would be set free.
# This will require some extra computation time.
# This is done by adding "tested=NULL";
# the default "tested=FALSE" calculates no information of this kind,
# see the help page "test_gof" to see how this can be focused
# on only a few of the fixed effects.
(gof0.tc <- test_gof(ans0, TriadCensus, verbose=TRUE, join=TRUE,
     varName="friends234", tested=NULL))

# The fit is given by the p-value;
# if it is high enough, the fit is good.
# For "high enough", we can have the threshold of "at least 0.05" in our mind,
# but this should not be employed as a sacrosanct criterion.
# Fit for indegree distribution is OK;
# for the other three auxiliary statistics it is not.
# For example:
plot(gof0.gd)
# The model simulates too many geodesic distances equal to 2,
# and too few distances 5.
# More information is in
descriptives(gof0.gd)
# where you see for each value of the auxiliary statistic
# (here: frequency count of the geodesic distribution)
# how the observed value ("obs") is positioned in the simulated distribution;
# the row "p>" gives the proportion of larger simulated values
# and the row "p>=" gives the proportion of simulated values
#     that are at least as large as the observed value.

# What about distances 6 or 7?
(gof0.gd7 <- test_gof(ans0, GeodesicDistribution, cumulative=FALSE,
            levls=c(1:7,Inf), verbose=TRUE, join=TRUE, varName="friends234"))
descriptives.test_gof(gof0.gd7)
# No distances 6 are observed, and some are simulated
# (but how many these are depends on the random simulations).
# No distances 7 are observed, and none are simulated:
descriptives(gof0.gd7, showAll=TRUE)
# If you would not have added "showAll=TRUE", you would not have seen
# the column for outcome 7.

# The goodness of fit is measured by the Mahalanobis distance
# from the oberved value of the auxiliary statistic,
# to their simulated distribution.
# The p-value is taken from its permutation distribution.
# Given that "tested=NULL" was used for the goodness of fit
# of the triad census,
# the summaries give the approximate value of the Mahalanobis distance,
# if the tested & fixed effects would be added,
# together with the one-step estimates.
# All this is very approximate
# (for the mathematical insight: based on a Taylor series)
# so it may be very imprecise.
# See Lospinoso & Snijders 2019, at https://doi.org/10.1177/2059799119884282
# and the help page:
?test_gof

# Inspect
summary(gof0.tc)
# MHD means Mahalanobis distance.
# We see that the current value of the MHD is 120.21
# (this depends on the random numbers used
#   - here specified by the random number seed)
# and that the prediction is that it will be diminished
# by the addition of any of the three effects, but most by the gwespFF effect
# and almost equally much by transitive triplets.

###############################################################################

# We go for adding the gwespFF effect.

vdb.eff234.a <- set_effect(vdb.eff234, gwespFF)
(ansa  <- siena(data=vdb.data234, effects=vdb.eff234.a, returnDeps=TRUE,
         prevAns=ans0, control_algo=alg_alg))
testall(ansa)
# The two other transitivity effects now have become non-significant.
(gofia <- test_gof(ansa, IndegreeDistribution, verbose=TRUE, join=TRUE,
     varName="friends234"))
(gofoa <- test_gof(ansa, OutdegreeDistribution, verbose=TRUE, join=TRUE,
     levls=c(0:10,15,20),varName="friends234"))
(gofa.tc <- test_gof(ansa, TriadCensus, verbose=TRUE, join=TRUE,
     varName="friends234"))
(gofa.gd <- test_gof(ansa, GeodesicDistribution, cumulative=FALSE,
     verbose=TRUE, join=TRUE, varName="friends234"))
# Now the fit is OK for all four auxiliary statistics.
plot(gofoa)
# To get a plot for the triad census, the values should be centered and scaled,
# because they differ wildly
plot(gofa.tc)
plot(gofa.tc, center=TRUE, scale=TRUE)
# It is helpful to widen the plot.
# The codes for the triads (003 etc.) are the usual MAN notation;
# see the figure in Section 16.5
# ( "Goodness of fit with auxiliary statistics: test_gof" )
# of the RSiena manual.

# Is the model homogeneous for the two periods?
test_time(ansa)
# Yes.

###############################################################################

# Let us illustrate how to use the plot
# to get an idea for how to improve the fit.
# The fit of the triad census is acceptable but not great.
# For these random numbers it is p=0.088.
# From the plot we see that all elements of the auxiliary statistic
# have an observed value that is within the 95% confidence bounds,
# but for triad 021C it seems to fall just on the boundary.
# From
descriptives(gofa.tc)
# we see that indeed it is just outside the boundary.
# Look in the manual (figure in Section 16.5) at the shape of this triad.
# It has one incoming and one outgoing tie at the same node.
# This can be represented by
# the outdegree popularity effect
#  (more outgoing ties for an actor lead to more incoming ties)
# ot the indegree activity effect
#  (more incoming ties for an actor lead to more outgoing ties).
vdb.eff234.b <- set_effect(vdb.eff234.a, outPop)
(ansb  <- siena(data=vdb.data234, effects=vdb.eff234.b, returnDeps=TRUE,
         prevAns=ansa, control_algo=alg_alg))
# Indeed the outdegree popularity effect is significant.
(gofb.tc <- test_gof(ansb, TriadCensus, verbose=TRUE, join=TRUE,
     varName="friends234"))
# Now the p-value is 0.683, way better than before.
plot(gofb.tc, center=TRUE, scale=TRUE)

###############################################################################
