Sunbelt2016: movingBeyondDescriptives.R

File movingBeyondDescriptives.R, 15.9 KB (added by morrism, 20 months ago)
Line 
1##1 Getting Started
2install.packages("statnet")
3library(statnet)
4library(network)  ##this is part of statnet, but might need to be loaded separately to attach data
5
6##these next two packages are only necessary for a few routines
7install.packages("numDeriv")
8library(numDeriv)
9install.packages("yacca")
10library(yacca)
11
12
13###########################################################
14###########################################################
15##2 Node Level Indices, Network Covariates, and Network Regression
16##2.1 Network visualization and basic descriptives
17
18#make sure you're in the correct file directory or change the following command
19load("movingBeyondDescriptives.Rdata")
20
21data(emon) # Load Drabek et al. data
22
23# Begin by examining the 'emon' dataset
24?emon
25class(emon)
26class(emon[[1]])
27emon[[1]]
28emon[[1]]%v%"vertex.names" # Display vertex names
29
30# Extract ties from the Cheyenne EMON communicating at least "every few hours"
31g<-as.sociomatrix(emon[[1]],"Frequency") # Need to get the frequency info
32g<-symmetrize((g>0)&(g<4)) # Note the reverse coding!
33
34#Get some potential covariates
35drs<-emon[[1]]%v%"Decision.Rank.Score" # Get decision rank (see man page)
36crs<-emon[[1]]%v%"Command.Rank.Score" # Get command rank
37
38# Plot Cheyenne EMON communications
39gplot(emon[[1]])
40gplot(emon[[1]], label.cex=0.5, label.col=4, label=network.vertex.names(emon[[1]])) # Basic display, with labels
41
42#Calculate some basic centrality measures
43deg<-degree(g,gmode="graph")
44bet<-betweenness(g,gmode="graph")
45clo<-closeness(g,gmode="graph")
46
47#Raw correlations
48cor(cbind(deg,bet,clo),cbind(drs,crs))
49
50#Classical tests (using asymptotic t distribution)
51cor.test(deg,drs)
52cor.test(bet,drs)
53cor.test(clo,drs)
54
55###########################################################
56#2.2 Testing correlations
57#Lets build a permutation test
58deg
59drs
60c.obs<-cor(deg,drs)
61c.obs
62
63#Permute one of the data sets
64sample(drs)
65cor(deg,sample(drs))
66
67#write a for loop to permute one of the data sets many times
68c.rep<-vector(length=1000)
69for(n in 1:1000){
70  c.rep[n]<-cor(deg,sample(drs))
71}
72
73#look at what we've created
74c.rep
75hist(c.rep)
76
77#compare to empirical data
78abline(v=c.obs,col="red")
79
80#put it all into a function!
81perm.cor.test<-function(x,y,n=1000){ #Define a simple test function
82  c.obs<-cor(x,y)
83  c.rep<-vector()
84  for(i in 1:n)
85    c.rep[i]<-cor(x,sample(y))
86  c.rep
87}
88
89#look at the results
90new.c.rep<-perm.cor.test(deg,drs)
91hist(new.c.rep)
92abline(v=c.obs,col="red")
93
94#calculate quantiles
95mean(new.c.rep>=c.obs)
96mean(new.c.rep<=c.obs)
97mean(abs(new.c.rep)>=abs(c.obs))
98
99#make the output display these quantiles
100perm.cor.test<-function(x,y,niter=1000){ #Define a simple test function
101  c.obs<-cor(x,y)
102  c.rep<-vector()
103  for(i in 1:niter)
104    c.rep[i]<-cor(x,sample(y),use="complete.obs")
105  cat("Vector Permutation Test:\n\tObserved correlation: ",c.obs,"\tReplicate quantiles
106      (number of iterations = ",n,")\n",sep="")
107  cat("\t\tPr(rho>=obs):",mean(c.rep>=c.obs),"\n")
108  cat("\t\tPr(rho<=obs):",mean(c.rep<=c.obs),"\n")
109  cat("\t\tPr(|rho|>=|obs|):",mean(abs(c.rep)>=abs(c.obs)),"\n")
110  invisible(list(obs=c.obs,rep=c.rep))
111}
112
113#examine the fancy output
114fancy<-perm.cor.test(deg,drs)
115attributes(fancy)
116fancy$obs
117fancy$rep
118
119#For more information....
120?cor.test
121?t.test
122?sample
123
124###########################################################
125#2.3 Using NLIs as regression covariates
126
127pstaff<-emon[[1]]%v%"Paid.Staff" # Get more EMON covariates
128vstaff<-emon[[1]]%v%"Volunteer.Staff"
129govt<-((emon[[1]]%v%"Sponsorship")!="Private")
130
131#Simple model: decision rank is linear in size, degree, and govt status
132mod<-lm(drs~deg+pstaff+vstaff+govt)
133summary(mod)
134anova(mod) #Some useful lm tools
135AIC(mod)
136
137#Try with alternative measures....
138mod2<-lm(drs~bet+pstaff+vstaff+govt) #Betweenness
139summary(mod2)
140mod3<-lm(drs~clo+pstaff+vstaff+govt) #Closeness
141summary(mod3)
142AIC(mod,mod2,mod3) #Closeness wins!
143
144#For more information....
145?lm
146?anova
147?AIC
148
149###########################################################
150#2.4 Graph correlation and bivariate QAP
151# Remember the Florentine families data
152data(florentine)
153gplot(flobusiness) # Examine business ties
154gplot(flomarriage) # Examine marriage ties
155
156# Could the two be related?
157par(mfrow=c(1,2))
158coords<-plot(flobusiness,label=flobusiness%v%"vertex.names",label.cex=.5,pad=1)
159title("Business Ties")
160plot(flomarriage,label=flomarriage%v%"vertex.names",label.cex=.5,pad=1,coord=coords)
161title("Marriage Ties")
162par(mfrow=c(1,1))
163
164# Let's try a graph correlation
165gcor(flobusiness,flomarriage)
166
167# Why can't we use our previous permutation test?
168# instead, use rmperm
169# take a look
170rmperm
171
172par(mfrow=c(1,2))
173gplot(flobusiness[,],label=flobusiness%v%"vertex.names",label.cex=.5,pad=1,coord=coords)
174title("Business Ties")
175gplot(rmperm(flobusiness),label=flobusiness%v%"vertex.names",label.cex=.5,pad=1,coord=coords)
176title("Permuted Business Ties")
177par(mfrow=c(1,1))
178
179#now look at qaptest
180qaptest
181
182#and use it
183qt<-qaptest(list(flobusiness,flomarriage),gcor,g1=1,g2=2)
184summary(qt) # Examine the results
185plot(qt) # Plot the QAP distribution
186
187# Testing against covariate effects
188wealth<-sapply(flomarriage%v%"wealth",rep,network.size(flomarriage))
189wealth
190wealthdiff<-abs(outer(flomarriage%v%"wealth",flomarriage%v%"wealth","-"))
191wealthdiff
192qt1<-qaptest(list(flomarriage,wealth),gcor,g1=1,g2=2)
193qt2<-qaptest(list(flomarriage,wealthdiff),gcor,g1=1,g2=2)
194summary(qt1) # Do wealthy families have more ties?
195summary(qt2) # Is there a wealth difference effect?
196
197# For more information....
198?qaptest
199?gcor
200?outer
201?sapply
202?rep
203
204###########################################################
205#2.5 Network Regression
206
207# Combine the previous tests (takes a while to perform QAP test)
208marriagefit<-netlm(flomarriage,list(flobusiness,wealth,wealthdiff))
209summary(marriagefit) # Examine the results
210
211# Another example: we begin by preparing the response variable. We will use the Cheyenne
212# EMON in valued form, but need to recode the frequency data
213Y<-as.sociomatrix(emon[[1]], "Frequency") # Extract frequencies
214Y[Y>0]<-5-Y[Y>0] # Now, higher -> more frequent
215
216# Extract some vertex attributes
217crk<-emon[[1]]%v% "Command.Rank.Score" # Command rank
218spon<-emon[[1]]%v%"Sponsorship" # Type of organization
219
220# Form some predictor matrices (X variables)
221Xcr<-sapply(crk,rep,length(crk)) # Column effects for command rank
222Xcr
223Xsp<-outer(spon,spon,"!=") # Dyadic effect for type difference
224Xsp
225
226# Fit the model (takes a while to perform QAP test)
227cmfit<-netlm(Y,list(Xcr,Xsp))
228summary(cmfit) # Examine the results
229
230
231cmfitB<-netlm(Y,list(Xcr,Xsp),nullhyp="classical")
232summary(cmfitB) # In this example, pretty similar
233
234# For more information....
235?outer
236?netlm
237
238###########################################################
239#2.6 Network Autocorrelation Models
240
241rownames(parishNet)
242which(parishNet["Rapides",]==1)
243
244#Examine the OLS regression predicting percentage vote for Obama by percentage Black residents and average weekly wage
245
246modLM<-lm(parishAttr[,1]~parishAttr[,2]+parishAttr[,3])
247summary(modLM)
248
249#create some simple weight matrices
250wMat<-apply(parishNet,1,function(x){x/sum(x)})
251wRandMat<-rgraph(64)
252
253#Run the different models with the empirical weights and look at the results
254
255modAR<-lnam(parishAttr[,1],x=parishAttr[,2:3],W1=wMat)
256summary(modAR)
257
258modMA<-lnam(parishAttr[,1],x=parishAttr[,2:3],W2=wMat)
259summary(modMA)
260
261modARMA<-lnam(parishAttr[,1],x=parishAttr[,2:3],W1=wMat,W2=wMat)
262summary(modARMA)
263
264modRandAR<-lnam(parishAttr[,1],x=parishAttr[,2:3],W1=wRandMat)
265summary(modRandAR)
266
267modWIntercept<-lnam(parishAttr[,1],x=cbind(1,parishAttr[,2:3]),W1=wMat)
268
269###########################################################
270###########################################################
271##3 Graph Level Indices and Conditional Uniform Graph Tests
272#3.1 Permutation tests for GLI/graph-level covariate association
273
274# Here we consider the famous Sampson monastery data:
275par(mfrow=c(4,3),mar=c(2,1,4,1))
276for(i in 1:length(sampson))
277  gplot(sampson[[i]],displaylabel=TRUE,boxed.label=FALSE,main=names(sampson)[i])
278par(mfrow=c(1,1),mar=c(5,4,4,2)+.01)
279
280# Are positive relations more reciprocal (relative to density) than negative
281# ones? Let's try a simple permutation test:
282
283# Calculate reciprocity levels for each network
284r4<-grecip(sampson,measure="edgewise.lrr")
285r4
286
287# Categorize relationships by positive and negative
288ispos<-c(TRUE,FALSE,TRUE,FALSE,TRUE,TRUE,TRUE,FALSE,TRUE,FALSE)
289
290# Calculate the empirical relationship
291obs<-sum(r4[ispos])-sum(r4[!ispos])
292obs # But what does this mean?
293
294# Run a permutation test
295reps<-vector()
296for(i in 1:1e4){
297  temp<-sample(ispos)
298  reps[i]<-sum(r4[temp])-sum(r4[!temp])
299}
300
301#calculate statistics and observe
302mean(reps>=obs) #Upper tail p-value
303mean(abs(reps)>=abs(obs)) # Two-sided version
304hist(reps)
305abline(v=obs,col=2,lwd=3) # Visualize it
306
307
308# We can look at transitivity as well. How does transitivity compare to density? (Log-odds method)
309log(gtrans(sampson)/gden(sampson))
310
311# Are positive relations more transitive (relative to density) than negative ones? Let's try another vector permutation test:
312ltr<-log(gtrans(sampson)/gden(sampson))
313obs<-sum(ltr[ispos])-sum(ltr[!ispos])
314reps<-vector()
315for(i in 1:1e4){
316  temp<-sample(ispos)
317  reps[i]<-sum(ltr[temp])-sum(ltr[!temp])
318}
319mean(reps>=obs) # Upper tail p-value
320mean(abs(reps)>=abs(obs)) # Two-sided version
321hist(reps)
322abline(v=obs,col=2,lwd=3) # Visualize it
323
324###########################################################
325##3.2 Comparing graphs via the triad census
326# Let's get the triad census for each network
327tc<-triad.census(sampson)
328tc
329
330# Cool trick: two-way correspondence analysis of graphs and their triad census
331# scores (aka a "Faust diagram"). Networks here appear close to the triad
332# types they contain at excess frequency (distances are chi-squared based;
333# see references in ?corresp for more detail).
334library(MASS) # Requires the MASS package
335plot(corresp(tc,nf=2)) # Plot network/triad association
336
337# What if this data were symmetric? We can symmetrize to illustrate.
338tc<-triad.census(symmetrize(sampson),mode="graph")#Need to use mode="graph" here
339rownames(tc)<-names(sampson)
340plot(corresp(tc,nf=2)) # Plot undirected network/triad association
341
342# For more information....
343?gtrans
344?triad.census
345?corresp
346?symmetrize
347
348###########################################################
349#3.3 Simple univariate conditional uniform graph tests
350# The cug.test function provides a simple interface for univariate CUG tests.
351# Let's try testing some data on trade in complex manufactured goods to see if overall
352# activity (density) is greater then would be expected from size alone.
353cug.test(ctrade,gden) # Call cug.test with the gden (density) function
354
355# Is there more reciprocity than density would suggest? Let's see.
356cug.test(ctrade,grecip,cmode="edges") # Conditioning on edges, calling grecip
357
358# Given biases in density and reciprocity, we might look to see if there is a
359# bias in transitivity, too. This time, let's condition on all of the above.
360cug.test(ctrade,gtrans,cmode="dyad") # Conditioning on dyad census
361
362# We are not limited to simple commands. Let's try indegree centralization:
363ct<-cug.test(ctrade,centralization,cmode="dyad",FUN.arg=list(FUN=degree, cmode="indegree"))
364# Note that we here must pass not only arguments to
365# cug.test, but also to centralization and degree!
366
367ct # Print the result
368plot(ct) # Can also plot it!
369
370###########################################################
371###########################################################
372##4 Multivariate Analysis of Graph Sets
373#4.1 Distance based methods: clustering and scaling
374
375# Start by calculating Hamming distances for the Sampson data
376samphd<-hdist(sampson)
377samphd
378
379# Now, try an MDS solution
380sampmds<-cmdscale(samphd)
381sampmds
382plot(sampmds,type="n") # Plot the results
383text(sampmds,label=names(sampson))
384
385# MDS suggests a three-cluster solution; let's try hclust
386samphc<-hclust(as.dist(samphd))
387plot(samphc,labels=names(sampson)) # Very clear solution
388rect.hclust(samphc,k=3)
389
390# Examine central graphs for each cluster
391sampcg<-gclust.centralgraph(samphc,k=3,sampson)
392par(mfrow=c(2,2))
393gplot(sampcg[1,,],main="Positive CG")
394gplot(sampcg[2,,],main="Negative CG")
395gplot(sampcg[3,,],main="Liking CG")
396par(mfrow=c(1,1))
397
398# More fun - can plot stats by cluster!
399gclust.boxstats(samphc,k=3,grecip(sampson,measure="edgewise.lrr"), names=c("Positive","Negative","Liking"), main="Edgewise LRR Reciprocity by Relational Type")
400gclust.boxstats(samphc,k=3,gtrans(sampson), names=c("Positive","Negative","Liking"), main="Transitivity by Relational Type")
401
402# For more information....
403?hdist
404?cmdscale
405?hclust
406?rect.hclust
407?gclust.centralgraph
408?gdist.plotstats
409
410###########################################################
411#4.2 Network PCA
412# To begin, let's get the graph correlation matrix for the Sampson nets
413sampcor<-gcor(sampson)
414sampcor
415
416# Now, we compute the eigendecomposition (could also have used gcov above)
417sampeig<-eigen(sampcor)
418
419# Eigenvalues contain variance explained, to whit:
420evals<-sampeig$value # Extract eigenvalues
421evals/sum(evals) # Variance explained
422
423# Show this as a scree plot
424barplot(evals/sum(evals),names.arg=1:length(evals))
425
426# Examine loadings (eigenvectors); looks like first 3 are key
427load<-sampeig$vector[,1:3]
428rownames(load)<-names(sampson)
429load
430
431# Can rotate using varimax
432varimax(load)
433
434# Try plotting the first two components
435plot(load[,1:2],type="n",asp=1,xlab="PC 1",ylab="PC 2")
436abline(h=0,v=0,lty=3)
437arrows(0,0,load[,1],load[,2],col=2) # Should be read angularly!
438text(load[,1:2],label=names(sampson))
439
440# Finally, extract scores
441S1<-apply(sweep(as.sociomatrix.sna(sampson),1,load[,1],"*"),c(2,3),sum)
442S2<-apply(sweep(as.sociomatrix.sna(sampson),1,load[,2],"*"),c(2,3),sum)
443S3<-apply(sweep(as.sociomatrix.sna(sampson),1,load[,3],"*"),c(2,3),sum)
444
445#Visualize a score graph (not too helpful in this case!)
446coord<-gplot.layout.fruchtermanreingold(as.edgelist.sna(S1>0),NULL)
447gplot(S1!=0,edge.col=sign(S1)+3,coord=coord)
448
449# For more information....
450?gcor
451?gcov
452?eigen
453?varimax
454?abline
455?arrows
456?sweep
457?gplot.layout
458
459###########################################################
460#4.3 Network CCA
461# For this one, we're going to use the country trade data. Somewhat perversely,
462# we're going to use yacca instead of netcancor (b/c this data set has
463# missing data which netcancor doesn't handle, and b/c yacca currently has
464# better visualizations).
465library(yacca) # You'll need to install this....
466# Perform a canonical correlation analysis between relations and attribute
467# differences
468trade.cca<-cca(gvectorize(trade),gvectorize(tradediff), standardize.scores=FALSE) # Turn off standardization b/c of NAs
469summary(trade.cca)
470
471# Visualize the output
472plot(trade.cca)
473
474# For more information
475?netcancor
476?yacca
477?cca
478?gvectorize
479
480###########################################################
481#4.4 Studying Qualitative Dynamics with Network MDS
482# Johnson collected 'liking' scores for researchers working in the South Pole on a monthly basis over two years
483class(johnsonPolarY1)
484dim(johnsonPolarY1)
485johnsonPolarY1[1,,]
486
487# Visualize the data using weight or color to aid interpretation
488gplot(johnsonPolarY1)
489gplot(johnsonPolarY1[1,,],edge.lwd=johnsonPolarY1[1,,])
490coords<-gplot(johnsonPolarY1[1,,],edge.col=rainbow(10)[johnsonPolarY1[1,,]], vertex.col="black")
491par(ask=TRUE)
492for(i in 1:8)
493  gplot(johnsonPolarY1[i,,],edge.col=rainbow(10)[johnsonPolarY1[i,,]],
494        vertex.col="black",coord=coords)
495par(ask=FALSE)
496
497# Calculate Hamming distances for each of the two Johnson data sets
498jp1d <- hdist(johnsonPolarY1)
499jp2d <- hdist(johnsonPolarY2)
500
501# Now try an MDS solution
502jp1mds<-cmdscale(jp1d)
503jp1mds
504plot(jp1mds,type="l",lty=2) # Plot the results
505text(jp1mds,label=rownames(johnsonPolarY1),font=2)
506jp2mds<-cmdscale(jp2d)
507jp2mds
508plot(jp2mds,type="l",lty=2) # Plot the results
509text(jp2mds,label=rownames(johnsonPolarY2),font=2)