@@ -14,6 +14,43 @@ import org.apache.spark.Logging
import java.util.{HashSet => JHashSet, TreeSet => JTreeSet}
// import org.apache.spark.graphx.MakeString
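+ // Serializable accumulator for one-pass article statistics; meant to be the
+ // zero value and running total for RDD.aggregate (see the commented-out
+ // block further down).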
+ class TrackCounts extends Serializable {
+
+   var red: Long = 0
+   var stub: Long = 0
+   var disambig: Long = 0
+   var notFound: Long = 0
+   var titleNull: Long = 0
+   var relevant: Long = 0
+   var total: Long = 0
+
+   // Merge another TrackCounts into this one (combine op for aggregate).
+   def update(o: TrackCounts) {
+     red += o.red
+     stub += o.stub
+     disambig += o.disambig
+     notFound += o.notFound
+     titleNull += o.titleNull
+     relevant += o.relevant
+     total += o.total
+   }
+
+   // Classify one article and bump every matching counter (seq op for aggregate).
+   def addArticle(art: WikiArticle) {
+     if (art.redirect) red += 1
+     if (art.stub) stub += 1
+     if (art.disambig) disambig += 1
+     if (art.title == WikiArticle.notFoundString) notFound += 1
+     if (art.title == null) titleNull += 1
+     if (art.relevant) relevant += 1
+     total += 1
+   }
+
+   override def toString: String = {
+     s"Redirects: $red, Stubs: $stub, Disambig: $disambig, Not Found: $notFound, Null: $titleNull, RELEVANT: $relevant, TOTAL: $total"
+   }
+
+ }
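+ // Sketch of intended use (mirrors the commented-out aggregate below;
+ // assumes allArtsRDD is an RDD[WikiArticle]):
+ //   val counts = allArtsRDD.aggregate(new TrackCounts)(
+ //     (c, art) => { c.addArticle(art); c },
+ //     (c1, c2) => { c1.update(c2); c1 })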
+
object PrePostProcessWikipedia extends Logging {
@@ -104,19 +141,39 @@ object PrePostProcessWikipedia extends Logging {
logWarning(s"XML RDD counted. Found ${xmlRDD.count} raw articles.")
val allArtsRDD = xmlRDD.map { raw => new WikiArticle(raw) }.cache
- val numRedirects = allArtsRDD.filter { art => art.redirect }.count
- val numStubs = allArtsRDD.filter { art => art.stub }.count
- val numDisambig = allArtsRDD.filter { art => art.disambig }.count
- val numTitleNotFound = allArtsRDD.filter { art => art.title == WikiArticle.notFoundString }.count
- logWarning(s"Filter results:\tRedirects: $numRedirects\tStubs: $numStubs\tDisambiguations: $numDisambig\tTitle not found: $numTitleNotFound")
+ // val numRedirects = allArtsRDD.filter { art => art.redirect }.count
+ // val numStubs = allArtsRDD.filter { art => art.stub }.count
+ // val numDisambig = allArtsRDD.filter { art => art.disambig }.count
+ // val numTitleNotFound = allArtsRDD.filter { art => art.title == WikiArticle.notFoundString }.count
+ // logWarning(s"Filter results:\tRedirects: $numRedirects\tStubs: $numStubs\tDisambiguations: $numDisambig\tTitle not found: $numTitleNotFound")
- val wikiRDD = allArtsRDD.filter { art => art.relevant }.repartition(128)
- logWarning(s"wikiRDD counted. Found ${wikiRDD.count} relevant articles.")
+ // NOTE: repartition returns a new RDD rather than mutating the receiver, so
+ // it must be chained before cache; a bare `wikiRDD.repartition(128)` whose
+ // result is discarded is a no-op.
+ val wikiRDD = allArtsRDD.filter { art => art.relevant }.repartition(128).cache
+ // val wikiRDD = allArtsRDD.filter { art => art.relevant }.repartition(128)
+ val wikiRDDCount = wikiRDD.count
+ logWarning(s"wikiRDD counted. Found ${wikiRDDCount} relevant articles.")
+ // logWarning("Counting differently")
+
+ // count: redirects, stubs, disambigs, titlenotfound, titlenull, relevant, total
+ // val zeroCount = new TrackCounts
+ // val countSeqOp = (curCount: TrackCounts, art: WikiArticle) => {
+ //   curCount.addArticle(art)
+ //   curCount
+ // }
+ // val countCombOp = (c1: TrackCounts, c2: TrackCounts) => {
+ //   c1.update(c2)
+ //   c1
+ // }
+ //
+ // val cr = allArtsRDD.aggregate(zeroCount)(countSeqOp, countCombOp)
+ // logWarning(s"Different count results: $cr")
+ // System.exit(0)
+
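+ // The aggregate above would compute all seven counts in a single pass over
+ // allArtsRDD, replacing the five separate filter-and-count jobs removed above.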
val vertices: RDD[(VertexId, String)] = wikiRDD.map { art => (art.vertexID, art.title) }
val edges: RDD[Edge[Double]] = wikiRDD.flatMap { art => art.edges }
logWarning("creating graph")
val g = Graph(vertices, edges)
- val cleanG = g.subgraph(x => true, (vid, vd) => vd != null)
+ val cleanG = g.subgraph(x => true, (vid, vd) => vd != null).cache
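+ // cleanG is counted twice just below and then reused for PageRank, so
+ // caching avoids recomputing the null-title subgraph each time.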
logWarning(s " DIRTY graph has ${g.triplets.count()} EDGES, ${g.vertices.count()} VERTICES " )
121
178
logWarning(s " CLEAN graph has ${cleanG.triplets.count()} EDGES, ${cleanG.vertices.count()} VERTICES " )
122
179
val resultG = pagerankConnComponentsAlt(numIters, cleanG)
@@ -134,12 +191,13 @@ object PrePostProcessWikipedia extends Logging {
var currentGraph = g
logWarning("starting iterations")
for (i <- 0 to numRepetitions) {
+ currentGraph.cache
val startTime = System.currentTimeMillis
logWarning("starting pagerank")
- val pr = PageRank.run(currentGraph, 20)
+ val pr = PageRank.run(currentGraph, 20).cache
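+ // The count below forces evaluation, so the cached ranks are materialized
+ // before the join that follows.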
pr.vertices.count
logWarning("Pagerank completed")
- val prAndTitle = currentGraph.outerJoinVertices(pr.vertices)({ (id: VertexId, title: String, rank: Option[Double]) => (title, rank.getOrElse(0.0)) })
+ val prAndTitle = currentGraph.outerJoinVertices(pr.vertices)({ (id: VertexId, title: String, rank: Option[Double]) => (title, rank.getOrElse(0.0)) }).cache
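+ // The outer join attaches each vertex's rank to its title; vertices with no
+ // entry in pr.vertices default to rank 0.0.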
prAndTitle.vertices.count
logWarning("join completed.")
val top20 = prAndTitle.vertices.top(20)(Ordering.by((entry: (VertexId, (String, Double))) => entry._2._2))
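+ // Ordering.by(entry._2._2) sorts by the rank component, so top(20) yields
+ // the twenty highest-PageRank (VertexId, (title, rank)) pairs.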
@@ -149,8 +207,8 @@ object PrePostProcessWikipedia extends Logging {
val filterTop20 = { (v: VertexId, d: String) =>
  !top20verts.contains(v)
}
- val newGraph = currentGraph.subgraph(x => true, filterTop20)
- val ccGraph = ConnectedComponents.run(newGraph)
+ val newGraph = currentGraph.subgraph(x => true, filterTop20).cache
+ val ccGraph = ConnectedComponents.run(newGraph).cache
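+ // ccGraph labels each remaining vertex with its component once the top-20
+ // ranked vertices are dropped; caching both graphs avoids recomputation in
+ // the aggregation below.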
// val zeroVal = new mutable.HashSet[VertexId]()
// val seqOp = (s: mutable.HashSet[VertexId], vtuple: (VertexId, VertexId)) => {
//   s.add(vtuple._2)