# Sunbelt2016: movingBeyondDescriptives.R

File movingBeyondDescriptives.R, 15.9 KB (added by morrism, 2 years ago)

1 | ##1 Getting Started |

2 | install.packages("statnet") |

3 | library(statnet) |

4 | library(network) ##this is part of statnet, but might need to be loaded separately to attach data |

5 | |

6 | ##these next two packages are only necessary for a few routines |

7 | install.packages("numDeriv") |

8 | library(numDeriv) |

9 | install.packages("yacca") |

10 | library(yacca) |

11 | |

12 | |

###########################################################
###########################################################
## 2 Node Level Indices, Network Covariates, and Network Regression
## 2.1 Network visualization and basic descriptives

# Make sure the working directory contains the workshop data file
# (or edit the path below).
load("movingBeyondDescriptives.Rdata")

data(emon)  # Drabek et al.'s Emergency Multi-Organizational Networks

# Take a first look at the 'emon' dataset
?emon
class(emon)                   # A list of networks
class(emon[[1]])              # Each element is a network object
emon[[1]]                     # The Cheyenne EMON
emon[[1]] %v% "vertex.names"  # Vertex (organization) names

# Extract a binary graph of ties communicating at least "every few
# hours". Frequency is reverse-coded (1 = most frequent), hence g < 4.
g <- as.sociomatrix(emon[[1]], "Frequency")
g <- symmetrize((g > 0) & (g < 4))

# Pull out two potential covariates (see the emon man page)
drs <- emon[[1]] %v% "Decision.Rank.Score"  # Decision rank
crs <- emon[[1]] %v% "Command.Rank.Score"   # Command rank

# Visualize the Cheyenne EMON, plain and labeled
gplot(emon[[1]])
gplot(emon[[1]], label.cex = 0.5, label.col = 4,
      label = network.vertex.names(emon[[1]]))

# Some basic centrality measures on the symmetrized graph
deg <- degree(g, gmode = "graph")
bet <- betweenness(g, gmode = "graph")
clo <- closeness(g, gmode = "graph")

# Raw correlations between centralities and rank scores
cor(cbind(deg, bet, clo), cbind(drs, crs))

# Classical tests (using the asymptotic t distribution)
cor.test(deg, drs)
cor.test(bet, drs)
cor.test(clo, drs)

54 | |

###########################################################
# 2.2 Testing correlations
# Build a permutation test by hand
deg
drs
c.obs <- cor(deg, drs)
c.obs

# Shuffle one of the two vectors and recompute the correlation
sample(drs)
cor(deg, sample(drs))

# Repeat the permutation many times, storing each replicate
# (note: 'n' is left in the workspace, as in the original script)
c.rep <- vector(length = 1000)
for (n in 1:1000) {
  c.rep[n] <- cor(deg, sample(drs))
}

# Inspect the null distribution we just built
c.rep
hist(c.rep)

# ...and compare it with the observed correlation
abline(v = c.obs, col = "red")

79 | |

# Wrap the procedure up in a function!
perm.cor.test <- function(x, y, n = 1000) {
  # Permutation null distribution: correlations of x with n random
  # shuffles of y. Returns the vector of replicate correlations.
  c.obs <- cor(x, y)
  vapply(seq_len(n), function(rep_i) cor(x, sample(y)), numeric(1))
}

88 | |

# Examine the simulated null distribution
new.c.rep <- perm.cor.test(deg, drs)
hist(new.c.rep)
abline(v = c.obs, col = "red")

# Empirical p-values: upper-tailed, lower-tailed, and two-sided
mean(new.c.rep >= c.obs)
mean(new.c.rep <= c.obs)
mean(abs(new.c.rep) >= abs(c.obs))

98 | |

# Make the function report those quantiles itself
perm.cor.test <- function(x, y, niter = 1000) {
  # Vector permutation test for the correlation of x and y.
  #
  # x, y  : numeric vectors of equal length
  # niter : number of permutation replicates
  #
  # Prints upper-, lower-, and two-tailed permutation p-values and
  # invisibly returns list(obs = observed cor, rep = replicate cors).
  c.obs <- cor(x, y)
  c.rep <- vector(length = niter)  # preallocate instead of growing
  for (i in seq_len(niter)) {
    c.rep[i] <- cor(x, sample(y), use = "complete.obs")
  }
  # Bug fix: the message formerly printed 'n', which is not an argument
  # of this function (after the rename to 'niter') and silently picked
  # up whatever global 'n' happened to exist.
  cat("Vector Permutation Test:\n\tObserved correlation: ", c.obs,
      "\tReplicate quantiles (number of iterations = ", niter, ")\n",
      sep = "")
  cat("\t\tPr(rho>=obs):", mean(c.rep >= c.obs), "\n")
  cat("\t\tPr(rho<=obs):", mean(c.rep <= c.obs), "\n")
  cat("\t\tPr(|rho|>=|obs|):", mean(abs(c.rep) >= abs(c.obs)), "\n")
  invisible(list(obs = c.obs, rep = c.rep))
}

112 | |

# Try out the fancier output
fancy <- perm.cor.test(deg, drs)
attributes(fancy)
fancy$obs
fancy$rep

# For more information....
?cor.test
?t.test
?sample

123 | |

###########################################################
# 2.3 Using NLIs as regression covariates

# Additional EMON covariates
pstaff <- emon[[1]] %v% "Paid.Staff"
vstaff <- emon[[1]] %v% "Volunteer.Staff"
govt <- ((emon[[1]] %v% "Sponsorship") != "Private")

# Simple model: decision rank is linear in size, degree, and govt status
mod <- lm(drs ~ deg + pstaff + vstaff + govt)
summary(mod)
anova(mod)  # Some useful lm tools
AIC(mod)

# Swap in alternative centrality measures
mod2 <- lm(drs ~ bet + pstaff + vstaff + govt)  # Betweenness
summary(mod2)
mod3 <- lm(drs ~ clo + pstaff + vstaff + govt)  # Closeness
summary(mod3)
AIC(mod, mod2, mod3)  # Closeness wins!

# For more information....
?lm
?anova
?AIC

148 | |

###########################################################
# 2.4 Graph correlation and bivariate QAP
# Recall the Florentine families data
data(florentine)
gplot(flobusiness)  # Business ties
gplot(flomarriage)  # Marriage ties

# Could the two relations be related? Plot them side by side.
par(mfrow = c(1, 2))
coords <- plot(flobusiness, label = flobusiness %v% "vertex.names",
               label.cex = .5, pad = 1)
title("Business Ties")
plot(flomarriage, label = flomarriage %v% "vertex.names",
     label.cex = .5, pad = 1, coord = coords)
title("Marriage Ties")
par(mfrow = c(1, 1))

# Graph correlation between the two relations
gcor(flobusiness, flomarriage)

# Our earlier vector permutation test won't do here: rows and columns
# must be permuted together. That is what rmperm does — take a look.
rmperm

par(mfrow = c(1, 2))
gplot(flobusiness[, ], label = flobusiness %v% "vertex.names",
      label.cex = .5, pad = 1, coord = coords)
title("Business Ties")
gplot(rmperm(flobusiness), label = flobusiness %v% "vertex.names",
      label.cex = .5, pad = 1, coord = coords)
title("Permuted Business Ties")
par(mfrow = c(1, 1))

# qaptest automates the whole QAP procedure
qaptest

qt <- qaptest(list(flobusiness, flomarriage), gcor, g1 = 1, g2 = 2)
summary(qt)  # Examine the results
plot(qt)     # Plot the QAP distribution

# Testing against covariate effects
wealth <- sapply(flomarriage %v% "wealth", rep, network.size(flomarriage))
wealth
wealthdiff <- abs(outer(flomarriage %v% "wealth",
                        flomarriage %v% "wealth", "-"))
wealthdiff
qt1 <- qaptest(list(flomarriage, wealth), gcor, g1 = 1, g2 = 2)
qt2 <- qaptest(list(flomarriage, wealthdiff), gcor, g1 = 1, g2 = 2)
summary(qt1)  # Do wealthy families have more ties?
summary(qt2)  # Is there a wealth difference effect?

# For more information....
?qaptest
?gcor
?outer
?sapply
?rep

203 | |

###########################################################
# 2.5 Network Regression

# Combine the previous tests (the QAP test takes a while)
marriagefit <- netlm(flomarriage, list(flobusiness, wealth, wealthdiff))
summary(marriagefit)  # Examine the results

# Another example: use the valued Cheyenne EMON as the response,
# recoding frequency so that higher values mean more frequent contact
Y <- as.sociomatrix(emon[[1]], "Frequency")
Y[Y > 0] <- 5 - Y[Y > 0]

# Extract some vertex attributes
crk <- emon[[1]] %v% "Command.Rank.Score"  # Command rank
spon <- emon[[1]] %v% "Sponsorship"        # Type of organization

# Form some predictor matrices (X variables)
Xcr <- sapply(crk, rep, length(crk))  # Column effects for command rank
Xcr
Xsp <- outer(spon, spon, "!=")        # Dyadic effect for type difference
Xsp

# Fit the model (takes a while to perform the QAP test)
cmfit <- netlm(Y, list(Xcr, Xsp))
summary(cmfit)  # Examine the results


cmfitB <- netlm(Y, list(Xcr, Xsp), nullhyp = "classical")
summary(cmfitB)  # In this example, pretty similar

# For more information....
?outer
?netlm

237 | |

###########################################################
# 2.6 Network Autocorrelation Models

rownames(parishNet)
which(parishNet["Rapides", ] == 1)

# OLS regression predicting percentage vote for Obama by percentage
# Black residents and average weekly wage
modLM <- lm(parishAttr[, 1] ~ parishAttr[, 2] + parishAttr[, 3])
summary(modLM)

# Create some simple weight matrices.
# Bug fix: apply(X, 1, f) returns its per-row results in COLUMNS, so
# the original produced the transpose of the intended row-normalized
# matrix; sweep() keeps the orientation correct.
wMat <- sweep(parishNet, 1, rowSums(parishNet), "/")
wRandMat <- rgraph(64)  # Random weights, for comparison

# Run the different models with the empirical weights
modAR <- lnam(parishAttr[, 1], x = parishAttr[, 2:3], W1 = wMat)
summary(modAR)

modMA <- lnam(parishAttr[, 1], x = parishAttr[, 2:3], W2 = wMat)
summary(modMA)

modARMA <- lnam(parishAttr[, 1], x = parishAttr[, 2:3],
                W1 = wMat, W2 = wMat)
summary(modARMA)

modRandAR <- lnam(parishAttr[, 1], x = parishAttr[, 2:3], W1 = wRandMat)
summary(modRandAR)

modWIntercept <- lnam(parishAttr[, 1], x = cbind(1, parishAttr[, 2:3]),
                      W1 = wMat)

268 | |

###########################################################
###########################################################
## 3 Graph Level Indices and Conditional Uniform Graph Tests
# 3.1 Permutation tests for GLI/graph-level covariate association

# The famous Sampson monastery data: plot every relation at once
par(mfrow = c(4, 3), mar = c(2, 1, 4, 1))
for (i in seq_along(sampson)) {
  # Fix: the sna::gplot argument is 'displaylabels' (plural); the
  # original 'displaylabel' worked only via partial argument matching.
  gplot(sampson[[i]], displaylabels = TRUE, boxed.label = FALSE,
        main = names(sampson)[i])
}
par(mfrow = c(1, 1), mar = c(5, 4, 4, 2) + .01)

# Are positive relations more reciprocal (relative to density) than
# negative ones? Let's try a simple permutation test:

# Edgewise log-reciprocity-ratio for each network
r4 <- grecip(sampson, measure = "edgewise.lrr")
r4

# Categorize relationships as positive or negative
ispos <- c(TRUE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, TRUE, FALSE)

# The observed difference in total reciprocity
obs <- sum(r4[ispos]) - sum(r4[!ispos])
obs  # But what does this mean?

# Permutation test: shuffle the positive/negative labels
# (preallocated rather than grown inside the loop)
reps <- numeric(1e4)
for (i in seq_len(1e4)) {
  temp <- sample(ispos)
  reps[i] <- sum(r4[temp]) - sum(r4[!temp])
}

# Empirical p-values and a picture
mean(reps >= obs)            # Upper tail p-value
mean(abs(reps) >= abs(obs))  # Two-sided version
hist(reps)
abline(v = obs, col = 2, lwd = 3)  # Visualize it


# Transitivity relative to density (log-odds method)
log(gtrans(sampson) / gden(sampson))

# Are positive relations more transitive (relative to density) than
# negative ones? Another vector permutation test:
ltr <- log(gtrans(sampson) / gden(sampson))
obs <- sum(ltr[ispos]) - sum(ltr[!ispos])
reps <- numeric(1e4)
for (i in seq_len(1e4)) {
  temp <- sample(ispos)
  reps[i] <- sum(ltr[temp]) - sum(ltr[!temp])
}
mean(reps >= obs)            # Upper tail p-value
mean(abs(reps) >= abs(obs))  # Two-sided version
hist(reps)
abline(v = obs, col = 2, lwd = 3)  # Visualize it

323 | |

###########################################################
## 3.2 Comparing graphs via the triad census
# The triad census for each Sampson network
tc <- triad.census(sampson)
tc

# Cool trick: two-way correspondence analysis of graphs and their
# triad census scores (a "Faust diagram"). Networks appear close to
# the triad types they contain at excess frequency (chi-squared based
# distances; see the references in ?corresp).
library(MASS)  # corresp() lives in the MASS package
plot(corresp(tc, nf = 2))  # Plot network/triad association

# What if these data were symmetric? Symmetrize to illustrate.
tc <- triad.census(symmetrize(sampson), mode = "graph")  # undirected census
rownames(tc) <- names(sampson)
plot(corresp(tc, nf = 2))  # Plot undirected network/triad association

# For more information....
?gtrans
?triad.census
?corresp
?symmetrize

347 | |

###########################################################
# 3.3 Simple univariate conditional uniform graph tests
# cug.test provides a simple interface for univariate CUG tests.
# Is overall activity (density) in trade of complex manufactured goods
# greater than would be expected from size alone?
cug.test(ctrade, gden)  # CUG test with the gden (density) statistic

# Is there more reciprocity than density would suggest?
cug.test(ctrade, grecip, cmode = "edges")  # Condition on edges

# Given biases in density and reciprocity, is there a bias in
# transitivity too? Condition on all of the above.
cug.test(ctrade, gtrans, cmode = "dyad")  # Condition on the dyad census

# Not limited to simple statistics — try indegree centralization:
ct <- cug.test(ctrade, centralization, cmode = "dyad",
               FUN.arg = list(FUN = degree, cmode = "indegree"))
# Note: arguments are passed not only to cug.test but, via FUN.arg,
# through to centralization and degree as well.

ct        # Print the result
plot(ct)  # Can also plot it!

369 | |

###########################################################
###########################################################
## 4 Multivariate Analysis of Graph Sets
# 4.1 Distance based methods: clustering and scaling

# Hamming distances among the Sampson networks
samphd <- hdist(sampson)
samphd

# Classical MDS of the distance matrix
sampmds <- cmdscale(samphd)
sampmds
plot(sampmds, type = "n")  # Plot the configuration
text(sampmds, label = names(sampson))

# MDS suggests a three-cluster solution; confirm with hclust
samphc <- hclust(as.dist(samphd))
plot(samphc, labels = names(sampson))  # Very clear solution
rect.hclust(samphc, k = 3)

# Central graphs for each cluster
sampcg <- gclust.centralgraph(samphc, k = 3, sampson)
par(mfrow = c(2, 2))
gplot(sampcg[1, , ], main = "Positive CG")
gplot(sampcg[2, , ], main = "Negative CG")
gplot(sampcg[3, , ], main = "Liking CG")
par(mfrow = c(1, 1))

# More fun - can plot stats by cluster!
gclust.boxstats(samphc, k = 3, grecip(sampson, measure = "edgewise.lrr"),
                names = c("Positive", "Negative", "Liking"),
                main = "Edgewise LRR Reciprocity by Relational Type")
gclust.boxstats(samphc, k = 3, gtrans(sampson),
                names = c("Positive", "Negative", "Liking"),
                main = "Transitivity by Relational Type")

# For more information....
?hdist
?cmdscale
?hclust
?rect.hclust
?gclust.centralgraph
?gdist.plotstats

409 | |

###########################################################
# 4.2 Network PCA
# Graph correlation matrix for the Sampson nets
sampcor <- gcor(sampson)
sampcor

# Eigendecomposition (gcov above would work as well)
sampeig <- eigen(sampcor)

# Eigenvalues carry the variance explained
evals <- sampeig$value
evals / sum(evals)  # Proportion of variance explained

# Show this as a scree plot
barplot(evals / sum(evals), names.arg = 1:length(evals))

# Loadings (eigenvectors); the first three look key
load <- sampeig$vector[, 1:3]
rownames(load) <- names(sampson)
load

# Can rotate using varimax
varimax(load)

# Plot the first two components
plot(load[, 1:2], type = "n", asp = 1, xlab = "PC 1", ylab = "PC 2")
abline(h = 0, v = 0, lty = 3)
arrows(0, 0, load[, 1], load[, 2], col = 2)  # Should be read angularly!
text(load[, 1:2], label = names(sampson))

# Finally, extract network scores on each component
S1 <- apply(sweep(as.sociomatrix.sna(sampson), 1, load[, 1], "*"),
            c(2, 3), sum)
S2 <- apply(sweep(as.sociomatrix.sna(sampson), 1, load[, 2], "*"),
            c(2, 3), sum)
S3 <- apply(sweep(as.sociomatrix.sna(sampson), 1, load[, 3], "*"),
            c(2, 3), sum)

# Visualize a score graph (not too helpful in this case!)
coord <- gplot.layout.fruchtermanreingold(as.edgelist.sna(S1 > 0), NULL)
gplot(S1 != 0, edge.col = sign(S1) + 3, coord = coord)

# For more information....
?gcor
?gcov
?eigen
?varimax
?abline
?arrows
?sweep
?gplot.layout

458 | |

###########################################################
# 4.3 Network CCA
# Country trade data. Somewhat perversely, we use yacca instead of
# netcancor (this data set has missing data, which netcancor doesn't
# handle, and yacca currently has better visualizations).
library(yacca)  # You'll need to install this....
# Canonical correlation analysis between relations and attribute
# differences; standardization is turned off because of the NAs.
trade.cca <- cca(gvectorize(trade), gvectorize(tradediff),
                 standardize.scores = FALSE)
summary(trade.cca)

# Visualize the output
plot(trade.cca)

# For more information
?netcancor
?yacca
?cca
?gvectorize

479 | |

###########################################################
# 4.4 Studying Qualitative Dynamics with Network MDS
# Johnson collected 'liking' scores for researchers working in the
# South Pole on a monthly basis over two years
class(johnsonPolarY1)
dim(johnsonPolarY1)
johnsonPolarY1[1, , ]

# Visualize, using line weight or color to aid interpretation
gplot(johnsonPolarY1)
gplot(johnsonPolarY1[1, , ], edge.lwd = johnsonPolarY1[1, , ])
coords <- gplot(johnsonPolarY1[1, , ],
                edge.col = rainbow(10)[johnsonPolarY1[1, , ]],
                vertex.col = "black")
par(ask = TRUE)
for (i in 1:8) {
  gplot(johnsonPolarY1[i, , ],
        edge.col = rainbow(10)[johnsonPolarY1[i, , ]],
        vertex.col = "black", coord = coords)
}
par(ask = FALSE)

# Hamming distances for each of the two Johnson data sets
jp1d <- hdist(johnsonPolarY1)
jp2d <- hdist(johnsonPolarY2)

# MDS solutions, drawn as time trajectories
jp1mds <- cmdscale(jp1d)
jp1mds
plot(jp1mds, type = "l", lty = 2)  # Plot the results
text(jp1mds, label = rownames(johnsonPolarY1), font = 2)
jp2mds <- cmdscale(jp2d)
jp2mds
plot(jp2mds, type = "l", lty = 2)  # Plot the results
text(jp2mds, label = rownames(johnsonPolarY2), font = 2)