@@ -14,6 +14,43 @@ import org.apache.spark.Logging
 import java.util.{HashSet => JHashSet, TreeSet => JTreeSet}
 // import org.apache.spark.graphx.MakeString
 
+class TrackCounts extends Serializable {
+
+  var red: Long = 0
+  var stub: Long = 0
+  var disambig: Long = 0
+  var notFound: Long = 0
+  var titleNull: Long = 0
+  var relevant: Long = 0
+  var total: Long = 0
+
+  // Merge another TrackCounts into this one (the combine step).
+  def update(o: TrackCounts) {
+    red += o.red
+    stub += o.stub
+    disambig += o.disambig
+    notFound += o.notFound
+    titleNull += o.titleNull
+    relevant += o.relevant
+    total += o.total
+  }
+
+  // Classify a single article into the running counts (the sequence step).
+  def addArticle(art: WikiArticle) {
+    if (art.redirect) red += 1
+    if (art.stub) stub += 1
+    if (art.disambig) disambig += 1
+    if (art.title == WikiArticle.notFoundString) notFound += 1
+    if (art.title == null) titleNull += 1
+    if (art.relevant) relevant += 1
+    total += 1
+  }
+
+  override def toString: String = {
+    s"Redirects: $red, Stubs: $stub, Disambig: $disambig, Not Found: $notFound, Null: $titleNull, RELEVANT: $relevant, TOTAL: $total"
+  }
+
+}
+
 
 object PrePostProcessWikipedia extends Logging {
 
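TrackCounts is a mutable accumulator meant to be driven by RDD.aggregate; the commented-out block in the next hunk shows the intended call. A minimal sketch of that usage, assuming an allArtsRDD: RDD[WikiArticle] as in this file:

    // Each partition folds its articles into a partition-local TrackCounts
    // via addArticle (seqOp); partial counts are then merged pairwise with
    // update (combOp). TrackCounts is Serializable, so the zero value can
    // be shipped to executors.
    val counts = allArtsRDD.aggregate(new TrackCounts)(
      (c, art) => { c.addArticle(art); c },
      (c1, c2) => { c1.update(c2); c1 }
    )
    logWarning(s"Article counts: $counts")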
@@ -104,19 +141,39 @@ object PrePostProcessWikipedia extends Logging {
     logWarning(s"XML RDD counted. Found ${xmlRDD.count} raw articles.")
 
     val allArtsRDD = xmlRDD.map { raw => new WikiArticle(raw) }.cache
-    val numRedirects = allArtsRDD.filter { art => art.redirect }.count
-    val numStubs = allArtsRDD.filter { art => art.stub }.count
-    val numDisambig = allArtsRDD.filter { art => art.disambig }.count
-    val numTitleNotFound = allArtsRDD.filter { art => art.title == WikiArticle.notFoundString }.count
-    logWarning(s"Filter results:\tRedirects: $numRedirects\tStubs: $numStubs\tDisambiguations: $numDisambig\tTitle not found: $numTitleNotFound")
+    // val numRedirects = allArtsRDD.filter { art => art.redirect }.count
+    // val numStubs = allArtsRDD.filter { art => art.stub }.count
+    // val numDisambig = allArtsRDD.filter { art => art.disambig }.count
+    // val numTitleNotFound = allArtsRDD.filter { art => art.title == WikiArticle.notFoundString }.count
+    // logWarning(s"Filter results:\tRedirects: $numRedirects\tStubs: $numStubs\tDisambiguations: $numDisambig\tTitle not found: $numTitleNotFound")
 
-    val wikiRDD = allArtsRDD.filter { art => art.relevant }.repartition(128)
-    logWarning(s"wikiRDD counted. Found ${wikiRDD.count} relevant articles.")
+    // repartition returns a new RDD (RDDs are immutable), so its result must
+    // be kept; a bare wikiRDD.repartition(128) call would be a no-op.
+    val wikiRDD = allArtsRDD.filter { art => art.relevant }.repartition(128).cache
+    val wikiRDDCount = wikiRDD.count
+    logWarning(s"wikiRDD counted. Found ${wikiRDDCount} relevant articles.")
+    // logWarning("Counting differently")
+
+    // count: redirects, stubs, disambigs, titlenotfound, titlenull, relevant, total
+    // val zeroCount = new TrackCounts
+    // val countSeqOp = (curCount: TrackCounts, art: WikiArticle) => {
+    //   curCount.addArticle(art)
+    //   curCount
+    // }
+    // val countCombOp = (c1: TrackCounts, c2: TrackCounts) => {
+    //   c1.update(c2)
+    //   c1
+    // }
+    //
+    // val cr = allArtsRDD.aggregate(zeroCount)(countSeqOp, countCombOp)
+    // logWarning(s"Different count results: $cr")
+    // System.exit(0)
+
     val vertices: RDD[(VertexId, String)] = wikiRDD.map { art => (art.vertexID, art.title) }
     val edges: RDD[Edge[Double]] = wikiRDD.flatMap { art => art.edges }
     logWarning("creating graph")
     val g = Graph(vertices, edges)
-    val cleanG = g.subgraph(x => true, (vid, vd) => vd != null)
+    val cleanG = g.subgraph(x => true, (vid, vd) => vd != null).cache
     logWarning(s"DIRTY graph has ${g.triplets.count()} EDGES, ${g.vertices.count()} VERTICES")
     logWarning(s"CLEAN graph has ${cleanG.triplets.count()} EDGES, ${cleanG.vertices.count()} VERTICES")
     val resultG = pagerankConnComponentsAlt(numIters, cleanG)
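A note on the .cache calls this commit adds: every Spark action re-executes the lineage of an uncached RDD or graph, and cleanG above feeds two count actions plus pagerankConnComponentsAlt. Caching it means the cleaned graph is materialized only once; a sketch using the names from the hunk above:

    // Without .cache, each use of cleanG would re-run the subgraph lineage.
    val cleanG = g.subgraph(x => true, (vid, vd) => vd != null).cache
    cleanG.triplets.count()   // first action materializes and caches the graph
    cleanG.vertices.count()   // subsequent actions reuse the cached data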
@@ -134,12 +191,13 @@ object PrePostProcessWikipedia extends Logging {
     var currentGraph = g
     logWarning("starting iterations")
     for (i <- 0 to numRepetitions) {
+      currentGraph.cache
       val startTime = System.currentTimeMillis
       logWarning("starting pagerank")
-      val pr = PageRank.run(currentGraph, 20)
+      val pr = PageRank.run(currentGraph, 20).cache
       pr.vertices.count
       logWarning("Pagerank completed")
-      val prAndTitle = currentGraph.outerJoinVertices(pr.vertices)({ (id: VertexId, title: String, rank: Option[Double]) => (title, rank.getOrElse(0.0)) })
+      val prAndTitle = currentGraph.outerJoinVertices(pr.vertices)({ (id: VertexId, title: String, rank: Option[Double]) => (title, rank.getOrElse(0.0)) }).cache
       prAndTitle.vertices.count
       logWarning("join completed.")
       val top20 = prAndTitle.vertices.top(20)(Ordering.by((entry: (VertexId, (String, Double))) => entry._2._2))
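The top20verts value used by filterTop20 in the next hunk is defined in context lines this diff does not show; presumably it is the set of the top-20 vertex ids, along the lines of this hypothetical reconstruction:

    // Hypothetical: top20 is an Array[(VertexId, (String, Double))], so the
    // id set used for the membership test would be
    val top20verts = top20.map(_._1).toSet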
@@ -149,8 +207,8 @@ object PrePostProcessWikipedia extends Logging {
       val filterTop20 = { (v: VertexId, d: String) =>
         !top20verts.contains(v)
       }
-      val newGraph = currentGraph.subgraph(x => true, filterTop20)
-      val ccGraph = ConnectedComponents.run(newGraph)
+      val newGraph = currentGraph.subgraph(x => true, filterTop20).cache
+      val ccGraph = ConnectedComponents.run(newGraph).cache
//      val zeroVal = new mutable.HashSet[VertexId]()
//      val seqOp = (s: mutable.HashSet[VertexId], vtuple: (VertexId, VertexId)) => {
//        s.add(vtuple._2)
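The commented-out block is cut off here; it starts aggregating component ids into a HashSet in order to count distinct components. For reference, GraphX's ConnectedComponents labels every vertex with the smallest VertexId in its component, so the same count can be obtained directly (a sketch, not part of the commit):

    // Each vertex attribute in ccGraph is its component's lowest vertex id;
    // counting distinct attributes therefore counts components.
    val numComponents = ccGraph.vertices.map { case (_, cc) => cc }.distinct.count
    logWarning(s"Found $numComponents connected components")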