Sunbelt2015: movingBeyondDescriptives.R

File movingBeyondDescriptives.R, 15.8 KB (added by morrism, 4 years ago)
Line 
1##1 Getting Started
2install.packages("statnet")
3library(statnet)
4library(network)  ##this is part of statnet, but might need to be loaded separately to attach data
5
6##these next two packages are only necessary for a few routines
7install.packages("numDeriv")
8library(numDeriv)
9install.packages("yacca")
10library(yacca)
11
12
13###########################################################
14###########################################################
15##2 Node Level Indices, Network Covariates, and Network Regression
16##2.1 Network visualization and basic descriptives
17
18#make sure you're in the correct file directory or change the following command
19load("movingBeyondDescriptives.Rdata")
20
21data(emon) # Load Drabek et al. data
22
23# Begin by examining the 'emon' dataset
24?emon
25class(emon)
26class(emon[[1]])
27emon[[1]]
28emon[[1]]%v%"vertex.names" # Display vertex names
29
30# Extract ties from the Cheyenne EMON communicating at least "every few hours"
31g<-as.sociomatrix(emon[[1]],"Frequency") # Need to get the frequency info
32g<-symmetrize((g>0)&(g<4)) # Note the reverse coding!
33
34#Get some potential covariates
35drs<-emon[[1]]%v%"Decision.Rank.Score" # Get decision rank (see man page)
36crs<-emon[[1]]%v%"Command.Rank.Score" # Get command rank
37
38# Plot Cheyenne EMON communications
39gplot(emon[[1]])
40gplot(emon[[1]], label.cex=0.5, label.col=4, label=network.vertex.names(emon[[1]])) # Basic display, with labels
41
42#Calculate some basic centrality measures
43deg<-degree(g,gmode="graph")
44bet<-betweenness(g,gmode="graph")
45clo<-closeness(g,gmode="graph")
46
47#Raw correlations
48cor(cbind(deg,bet,clo),cbind(drs,crs))
49
50#Classical tests (using asymptotic t distribution)
51cor.test(deg,drs)
52cor.test(bet,drs)
53cor.test(clo,drs)
54
55###########################################################
56#2.2 Testing correlations
57#Lets build a permutation test
58deg
59drs
60c.obs<-cor(deg,drs)
61c.obs
62
63#Permute one of the data sets
64sample(drs)
65cor(deg,sample(drs))
66
67#write a for loop to permute one of the data sets many times
68c.rep<-vector(length=1000)
69for(n in 1:1000){
70  c.rep[n]<-cor(deg,sample(drs))
71}
72
73#look at what we've created
74c.rep
75hist(c.rep)
76
77#compare to empirical data
78abline(v=c.obs,col="red")
79
80#put it all into a function!
81perm.cor.test<-function(x,y,n=1000){ #Define a simple test function
82  c.obs<-cor(x,y)
83  c.rep<-vector()
84  for(i in 1:n)
85    c.rep[i]<-cor(x,sample(y))
86  c.rep
87}
88
89#look at the results
90new.c.rep<-perm.cor.test(deg,drs)
91hist(new.c.rep)
92abline(v=c.obs,col="red")
93
94#calculate quantiles
95mean(new.c.rep>=c.obs)
96mean(new.c.rep<=c.obs)
97mean(abs(new.c.rep)>=abs(c.obs))
98
99#make the output display these quantiles
100perm.cor.test<-function(x,y,niter=1000){ #Define a simple test function
101  c.obs<-cor(x,y)
102  c.rep<-vector()
103  for(i in 1:niter)
104    c.rep[i]<-cor(x,sample(y),use="complete.obs")
105  cat("Vector Permutation Test:\n\tObserved correlation: ",c.obs,"\tReplicate quantiles
106      (number of iterations = ",n,")\n",sep="")
107  cat("\t\tPr(rho>=obs):",mean(c.rep>=c.obs),"\n")
108  cat("\t\tPr(rho<=obs):",mean(c.rep<=c.obs),"\n")
109  cat("\t\tPr(|rho|>=|obs|):",mean(abs(c.rep)>=abs(c.obs)),"\n")
110  invisible(list(obs=c.obs,rep=c.rep))
111}
112
113#examine the fancy output
114fancy<-perm.cor.test(deg,drs)
115attributes(fancy)
116fancy$obs
117fancy$rep
118
119#For more information....
120?cor.test
121?t.test
122?sample
123
124###########################################################
125#2.3 Using NLIs as regression covariates
126
127pstaff<-emon[[1]]%v%"Paid.Staff" # Get more EMON covariates
128vstaff<-emon[[1]]%v%"Volunteer.Staff"
129govt<-((emon[[1]]%v%"Sponsorship")!="Private")
130
131#Simple model: decision rank is linear in size, degree, and govt status
132mod<-lm(drs~deg+pstaff+vstaff+govt)
133summary(mod)
134anova(mod) #Some useful lm tools
135AIC(mod)
136
137#Try with alternative measures....
138mod2<-lm(drs~bet+pstaff+vstaff+govt) #Betweenness
139summary(mod2)
140mod3<-lm(drs~clo+pstaff+vstaff+govt) #Closeness
141summary(mod3)
142AIC(mod,mod2,mod3) #Closeness wins!
143
144#For more information....
145?lm
146?anova
147?AIC
148
149###########################################################
150#2.4 Graph correlation and bivariate QAP
151# Remember the Florentine families data
152data(florentine)
153gplot(flobusiness) # Examine business ties
154gplot(flomarriage) # Examine marriage ties
155
156# Could the two be related?
157par(mfrow=c(1,2))
158coords<-plot(flobusiness,label=flobusiness%v%"vertex.names",label.cex=.5,pad=1)
159title("Business Ties")
160plot(flomarriage,label=flomarriage%v%"vertex.names",label.cex=.5,pad=1,coord=coords)
161title("Marriage Ties")
162par(mfrow=c(1,1))
163
164# Let's try a graph correlation
165gcor(flobusiness,flomarriage)
166
167# Why can't we use our previous permutation test?
168# instead, use rmperm
169# take a look
170rmperm
171
172par(mfrow=c(1,2))
173gplot(flobusiness[,],label=flobusiness%v%"vertex.names",label.cex=.5,pad=1,coord=coords)
174title("Business Ties")
175gplot(rmperm(flobusiness),label=flobusiness%v%"vertex.names",label.cex=.5,pad=1,coord=coords)
176title("Permuted Business Ties")
177par(mfrow=c(1,1))
178
179#now look at qaptest
180qaptest
181
182#and use it
183qt<-qaptest(list(flobusiness,flomarriage),gcor,g1=1,g2=2)
184summary(qt) # Examine the results
185plot(qt) # Plot the QAP distribution
186
187# Testing against covariate effects
188wealth<-sapply(flomarriage%v%"wealth",rep,network.size(flomarriage))
189wealth
190wealthdiff<-abs(outer(flomarriage%v%"wealth",flomarriage%v%"wealth","-"))
191wealthdiff
192qt1<-qaptest(list(flomarriage,wealth),gcor,g1=1,g2=2)
193qt2<-qaptest(list(flomarriage,wealthdiff),gcor,g1=1,g2=2)
194summary(qt1) # Do wealthy families have more ties?
195summary(qt2) # Is there a wealth difference effect?
196
197# For more information....
198?qaptest
199?gcor
200?outer
201?sapply
202?rep
203
204###########################################################
205#2.5 Network Regression
206
207# Combine the previous tests (takes a while to perform QAP test)
208marriagefit<-netlm(flomarriage,list(flobusiness,wealth,wealthdiff))
209summary(marriagefit) # Examine the results
210
211# Another example: we begin by preparing the response variable. We will use the Cheyenne
212# EMON in valued form, but need to recode the frequency data
213Y<-as.sociomatrix(emon[[1]], "Frequency") # Extract frequencies
214Y[Y>0]<-5-Y[Y>0] # Now, higher -> more frequent
215
216# Extract some vertex attributes
217crk<-emon[[1]]%v% "Command.Rank.Score" # Command rank
218spon<-emon[[1]]%v%"Sponsorship" # Type of organization
219
220# Form some predictor matrices (X variables)
221Xcr<-sapply(crk,rep,length(crk)) # Column effects for command rank
222Xcr
223Xsp<-outer(spon,spon,"!=") # Dyadic effect for type difference
224Xsp
225
226# Fit the model (takes a while to perform QAP test)
227cmfit<-netlm(Y,list(Xcr,Xsp))
228summary(cmfit) # Examine the results
229
230
231cmfitB<-netlm(Y,list(Xcr,Xsp),nullhyp="classical")
232summary(cmfitB) # In this example, pretty similar
233
234# For more information....
235?outer
236?netlm
237
238###########################################################
239#2.6 Network Autocorrelation Models
240
241rownames(parishNet)
242which(parishNet["Rapides",]==1)
243
244#Examine the OLS regression predicting percentage vote for Obama by percentage Black residents and average weekly wage
245
246modLM<-lm(parishAttr[,1]~parishAttr[,2]+parishAttr[,3])
247summary(modLM)
248
249#create some simple weight matrices
250wMat<-apply(parishNet,1,function(x){x/sum(x)})
251wRandMat<-rgraph(64)
252
253#Run the different models with the empirical weights and look at the results
254
255modAR<-lnam(parishAttr[,1],x=parishAttr[,2:3],W1=wMat)
256summary(modAR)
257
258modMA<-lnam(parishAttr[,1],x=parishAttr[,2:3],W2=wMat)
259summary(modMA)
260
261modARMA<-lnam(parishAttr[,1],x=parishAttr[,2:3],W1=wMat,W2=wMat)
262summary(modARMA)
263
264modRandAR<-lnam(parishAttr[,1],x=parishAttr[,2:3],W1=wRandMat)
265summary(modRandAR)
266
267###########################################################
268###########################################################
269##3 Graph Level Indices and Conditional Uniform Graph Tests
270#3.1 Permutation tests for GLI/graph-level covariate association
271
272# Here we consider the famous Sampson monastery data:
273par(mfrow=c(4,3),mar=c(2,1,4,1))
274for(i in 1:length(sampson))
275  gplot(sampson[[i]],displaylabel=TRUE,boxed.label=FALSE,main=names(sampson)[i])
276par(mfrow=c(1,1),mar=c(5,4,4,2)+.01)
277
278# Are positive relations more reciprocal (relative to density) than negative
279# ones? Let's try a simple permutation test:
280
281# Calculate reciprocity levels for each network
282r4<-grecip(sampson,measure="edgewise.lrr")
283r4
284
285# Categorize relationships by positive and negative
286ispos<-c(TRUE,FALSE,TRUE,FALSE,TRUE,TRUE,TRUE,FALSE,TRUE,FALSE)
287
288# Calculate the empirical relationship
289obs<-sum(r4[ispos])-sum(r4[!ispos])
290obs # But what does this mean?
291
292# Run a permutation test
293reps<-vector()
294for(i in 1:1e4){
295  temp<-sample(ispos)
296  reps[i]<-sum(r4[temp])-sum(r4[!temp])
297}
298
299#calculate statistics and observe
300mean(reps>=obs) #Upper tail p-value
301mean(abs(reps)>=abs(obs)) # Two-sided version
302hist(reps)
303abline(v=obs,col=2,lwd=3) # Visualize it
304
305
306# We can look at transitivity as well. How does transitivity compare to density? (Log-odds method)
307log(gtrans(sampson)/gden(sampson))
308
309# Are positive relations more transitive (relative to density) than negative ones? Let's try another vector permutation test:
310ltr<-log(gtrans(sampson)/gden(sampson))
311obs<-sum(ltr[ispos])-sum(ltr[!ispos])
312reps<-vector()
313for(i in 1:1e4){
314  temp<-sample(ispos)
315  reps[i]<-sum(ltr[temp])-sum(ltr[!temp])
316}
317mean(reps>=obs) # Upper tail p-value
318mean(abs(reps)>=abs(obs)) # Two-sided version
319hist(reps)
320abline(v=obs,col=2,lwd=3) # Visualize it
321
322###########################################################
323##3.2 Comparing graphs via the triad census
324# Let's get the triad census for each network
325tc<-triad.census(sampson)
326tc
327
328# Cool trick: two-way correspondence analysis of graphs and their triad census
329# scores (aka a "Faust diagram"). Networks here appear close to the triad
330# types they contain at excess frequency (distances are chi-squared based;
331# see references in ?corresp for more detail).
332library(MASS) # Requires the MASS package
333plot(corresp(tc,nf=2)) # Plot network/triad association
334
335# What if this data were symmetric? We can symmetrize to illustrate.
336tc<-triad.census(symmetrize(sampson),mode="graph")#Need to use mode="graph" here
337rownames(tc)<-names(sampson)
338plot(corresp(tc,nf=2)) # Plot undirected network/triad association
339
340# For more information....
341?gtrans
342?triad.census
343?corresp
344?symmetrize
345
346###########################################################
347#3.3 Simple univariate conditional uniform graph tests
348# The cug.test function provides a simple interface for univariate CUG tests.
349# Let's try testing some data on trade in complex manufactured goods to see if overall
350# activity (density) is greater then would be expected from size alone.
351cug.test(ctrade,gden) # Call cug.test with the gden (density) function
352
353# Is there more reciprocity than density would suggest? Let's see.
354cug.test(ctrade,grecip,cmode="edges") # Conditioning on edges, calling grecip
355
356# Given biases in density and reciprocity, we might look to see if there is a
357# bias in transitivity, too. This time, let's condition on all of the above.
358cug.test(ctrade,gtrans,cmode="dyad") # Conditioning on dyad census
359
360# We are not limited to simple commands. Let's try indegree centralization:
361ct<-cug.test(ctrade,centralization,cmode="dyad",FUN.arg=list(FUN=degree, cmode="indegree"))
362# Note that we here must pass not only arguments to
363# cug.test, but also to centralization and degree!
364
365ct # Print the result
366plot(ct) # Can also plot it!
367
368###########################################################
369###########################################################
370##4 Multivariate Analysis of Graph Sets
371#4.1 Distance based methods: clustering and scaling
372
373# Start by calculating Hamming distances for the Sampson data
374samphd<-hdist(sampson)
375samphd
376
377# Now, try an MDS solution
378sampmds<-cmdscale(samphd)
379sampmds
380plot(sampmds,type="n") # Plot the results
381text(sampmds,label=names(sampson))
382
383# MDS suggests a three-cluster solution; let's try hclust
384samphc<-hclust(as.dist(samphd))
385plot(samphc,labels=names(sampson)) # Very clear solution
386rect.hclust(samphc,k=3)
387
388# Examine central graphs for each cluster
389sampcg<-gclust.centralgraph(samphc,k=3,sampson)
390par(mfrow=c(2,2))
391gplot(sampcg[1,,],main="Positive CG")
392gplot(sampcg[2,,],main="Negative CG")
393gplot(sampcg[3,,],main="Liking CG")
394par(mfrow=c(1,1))
395
396# More fun - can plot stats by cluster!
397gclust.boxstats(samphc,k=3,grecip(sampson,measure="edgewise.lrr"), names=c("Positive","Negative","Liking"), main="Edgewise LRR Reciprocity by Relational Type")
398gclust.boxstats(samphc,k=3,gtrans(sampson), names=c("Positive","Negative","Liking"), main="Transitivity by Relational Type")
399
400# For more information....
401?hdist
402?cmdscale
403?hclust
404?rect.hclust
405?gclust.centralgraph
406?gdist.plotstats
407
408###########################################################
409#4.2 Network PCA
410# To begin, let's get the graph correlation matrix for the Sampson nets
411sampcor<-gcor(sampson)
412sampcor
413
414# Now, we compute the eigendecomposition (could also have used gcov above)
415sampeig<-eigen(sampcor)
416
417# Eigenvalues contain variance explained, to whit:
418evals<-sampeig$value # Extract eigenvalues
419evals/sum(evals) # Variance explained
420
421# Show this as a scree plot
422barplot(evals/sum(evals),names.arg=1:length(evals))
423
424# Examine loadings (eigenvectors); looks like first 3 are key
425load<-sampeig$vector[,1:3]
426rownames(load)<-names(sampson)
427load
428
429# Can rotate using varimax
430varimax(load)
431
432# Try plotting the first two components
433plot(load[,1:2],type="n",asp=1,xlab="PC 1",ylab="PC 2")
434abline(h=0,v=0,lty=3)
435arrows(0,0,load[,1],load[,2],col=2) # Should be read angularly!
436text(load[,1:2],label=names(sampson))
437
438# Finally, extract scores
439S1<-apply(sweep(as.sociomatrix.sna(sampson),1,load[,1],"*"),c(2,3),sum)
440S2<-apply(sweep(as.sociomatrix.sna(sampson),1,load[,2],"*"),c(2,3),sum)
441S3<-apply(sweep(as.sociomatrix.sna(sampson),1,load[,3],"*"),c(2,3),sum)
442
443#Visualize a score graph (not too helpful in this case!)
444coord<-gplot.layout.fruchtermanreingold(as.edgelist.sna(S1>0),NULL)
445gplot(S1!=0,edge.col=sign(S1)+3,coord=coord)
446
447# For more information....
448?gcor
449?gcov
450?eigen
451?varimax
452?abline
453?arrows
454?sweep
455?gplot.layout
456
457###########################################################
458#4.3 Network CCA
459# For this one, we're going to use the country trade data. Somewhat perversely,
460# we're going to use yacca instead of netcancor (b/c this data set has
461# missing data which netcancor doesn't handle, and b/c yacca currently has
462# better visualizations).
463library(yacca) # You'll need to install this....
464# Perform a canonical correlation analysis between relations and attribute
465# differences
466trade.cca<-cca(gvectorize(trade),gvectorize(tradediff), standardize.scores=FALSE) # Turn off standardization b/c of NAs
467summary(trade.cca)
468
469# Visualize the output
470plot(trade.cca)
471
472# For more information
473?netcancor
474?yacca
475?cca
476?gvectorize
477
478###########################################################
479#4.4 Studying Qualitative Dynamics with Network MDS
480# Johnson collected 'liking' scores for researchers working in the South Pole on a monthly basis over two years
481class(johnsonPolarY1)
482dim(johnsonPolarY1)
483johnsonPolarY1[1,,]
484
485# Visualize the data using weight or color to aid interpretation
486gplot(johnsonPolarY1)
487gplot(johnsonPolarY1[1,,],edge.lwd=johnsonPolarY1[1,,])
488coords<-gplot(johnsonPolarY1[1,,],edge.col=rainbow(10)[johnsonPolarY1[1,,]], vertex.col="black")
489par(ask=TRUE)
490for(i in 1:8)
491  gplot(johnsonPolarY1[i,,],edge.col=rainbow(10)[johnsonPolarY1[i,,]],
492        vertex.col="black",coord=coords)
493par(ask=FALSE)
494
495# Calculate Hamming distances for each of the two Johnson data sets
496jp1d <- hdist(johnsonPolarY1)
497jp2d <- hdist(johnsonPolarY2)
498
499# Now try an MDS solution
500jp1mds<-cmdscale(jp1d)
501jp1mds
502plot(jp1mds,type="l",lty=2) # Plot the results
503text(jp1mds,label=rownames(johnsonPolarY1),font=2)
504jp2mds<-cmdscale(jp2d)
505jp2mds
506plot(jp2mds,type="l",lty=2) # Plot the results
507text(jp2mds,label=rownames(johnsonPolarY2),font=2)