-package org.apache.spark.examples.graphx
-
-import org.apache.spark._
-import org.apache.spark.graph._
-import org.apache.spark.graph.algorithms._
-import org.apache.spark.rdd.NewHadoopRDD
-import org.apache.hadoop.io.LongWritable
-import org.apache.hadoop.io.Text
-import org.apache.hadoop.conf.Configuration
-import org.apache.mahout.text.wikipedia._
-import org.apache.spark.rdd.RDD
-import java.util.Calendar
-import scala.math.Ordering.Implicits._
-
-
-object AnalyzeWikipedia extends Logging {
-
-  def main(args: Array[String]) = {
-
-
-
-
-    val host = args(0)
-    val fname = args(1)
-    // val numparts = {
-    //   if (args.length >= 3) {
-    //     args(2).toInt
-    //   } else {
-    //     64
-    //   }
-    // }
-    // val preformattedFname = args(2)
-
-    val serializer = "org.apache.spark.serializer.KryoSerializer"
-    System.setProperty("spark.serializer", serializer)
-    System.setProperty("spark.kryo.registrator", "org.apache.spark.graph.GraphKryoRegistrator")
-
-    val sc = new SparkContext(host, "AnalyzeWikipedia")
-    // val top10 = sc.parallelize(1 to 1000, 10).map(x => (x.toString, x)).top(10)(Ordering.by(_._2))
-
-
-    // val conf = new Configuration
-    // conf.set("key.value.separator.in.input.line", " ");
-    // conf.set("xmlinput.start", "<page>");
-    // conf.set("xmlinput.end", "</page>");
-
-    // val xmlRDD = sc.newAPIHadoopFile(fname, classOf[XmlInputFormat], classOf[LongWritable], classOf[Text], conf)
-    //   .map(stringify)
-
-    // println("XML pages: " + xmlRDD.count)
-    // // .repartition(numparts)
-
-    // val wikiRDD = xmlRDD.map { raw => new WikiArticle(raw) }
-    //   .filter { art => art.relevant }
-
-    // println("Relevant pages: " + wikiRDD.count)
-
-    // val vertices: RDD[(Vid, String)] = wikiRDD.map { art => (art.vertexID, art.title) }
-    // val justVids = wikiRDD.map { art => art.vertexID }
-    // // println("taking top vids")
-    // // val topvids = justVids.top(10)
-    // // sc.stop()
-    // // System.exit(0)
-
-    // // val edges: RDD[Edge[Double]] = wikiRDD.flatMap { art => art.edges }
-    // val edges: RDD[Edge[Double]] = wikiRDD.flatMap { art => art.edges }
-    // println("Edges: " + edges.count)
-    // println("Creating graph: " + Calendar.getInstance().getTime())
-
-    // val g = Graph(vertices, edges)
-    // val g = Graph.fromEdges(edges, 1)
-    // val g = Graph(edges, 1)
-    val g = GraphLoader.edgeListAndVertexListFiles(sc, fname + "_edges", fname + "_vertices",
-      minEdgePartitions = 128).cache()
-    println("Triplets: " + g.triplets.count)
-
-    println("starting pagerank " + Calendar.getInstance().getTime())
-    val startTime = System.currentTimeMillis
-    val pr = PageRank.run(g, 20)
-
-    println("PR numvertices: " + pr.vertices.count + "\tOriginal numVertices " + g.vertices.count)
-    println("Pagerank runtime: " + ((System.currentTimeMillis - startTime)/1000.0) + " seconds")
-    val prAndTitle = g.outerJoinVertices(pr.vertices)({(id: Vid, title: String, rank: Option[Double]) => (title, rank.getOrElse(0.0))})
-    println("finished join.")
-
-    val topArticles = prAndTitle.vertices.top(30)(Ordering.by((entry: (Vid, (String, Double))) => entry._2._2))
-    println("Top articles:\n" + topArticles.deep.mkString("\n"))
-    // for(v <- topArticles) {
-    //   println(v)
-    // }
-    val article_name = "JohnsHopkinsUniversity"
-    //
-    // Find relevant vertices
-    g.mapTriplets(e => {
-      if ((e.srcAttr contains article_name) || (e.dstAttr contains article_name)) { 1.0 }
-      else { e.attr }
-    })
-    val coarsenedGraph = g.contractEdges({ e => e.attr == 1.0 }, { et => et.srcAttr + " " + et.dstAttr },
-      { (v1: String, v2: String) => v1 + "\n" + v2 })
-
-    // filter only vertices whose title contains JHU
-    val relevant = coarsenedGraph.vertices.filter({ case (vid: Vid, data: String) => data contains article_name }).collect
-    println("Articles matching " + article_name)
-    println(relevant.deep.mkString("New Article\n"))
-
-    sc.stop()
-  }
-
-
-  def stringify(tup: (org.apache.hadoop.io.LongWritable, org.apache.hadoop.io.Text)): String = {
-    tup._2.toString
-  }
-
-
-
-}
+//package org.apache.spark.examples.graphx
+//
+//import org.apache.spark._
+//import org.apache.spark.graph._
+//import org.apache.spark.graph.algorithms._
+//import org.apache.spark.rdd.NewHadoopRDD
+//import org.apache.hadoop.io.LongWritable
+//import org.apache.hadoop.io.Text
+//import org.apache.hadoop.conf.Configuration
+//import org.apache.mahout.text.wikipedia._
+//import org.apache.spark.rdd.RDD
+//import java.util.Calendar
+//import scala.math.Ordering.Implicits._
+//
+//
+//object AnalyzeWikipedia extends Logging {
+//
+//  def main(args: Array[String]) = {
+//
+//
+//
+//
+//    val host = args(0)
+//    val fname = args(1)
+//    // val numparts = {
+//    //   if (args.length >= 3) {
+//    //     args(2).toInt
+//    //   } else {
+//    //     64
+//    //   }
+//    // }
+//    // val preformattedFname = args(2)
+//
+//    val serializer = "org.apache.spark.serializer.KryoSerializer"
+//    System.setProperty("spark.serializer", serializer)
+//    System.setProperty("spark.kryo.registrator", "org.apache.spark.graph.GraphKryoRegistrator")
+//
+//    val sc = new SparkContext(host, "AnalyzeWikipedia")
+//    // val top10 = sc.parallelize(1 to 1000, 10).map(x => (x.toString, x)).top(10)(Ordering.by(_._2))
+//
+//
+//    // val conf = new Configuration
+//    // conf.set("key.value.separator.in.input.line", " ");
+//    // conf.set("xmlinput.start", "<page>");
+//    // conf.set("xmlinput.end", "</page>");
+//
+//    // val xmlRDD = sc.newAPIHadoopFile(fname, classOf[XmlInputFormat], classOf[LongWritable], classOf[Text], conf)
+//    //   .map(stringify)
+//
+//    // println("XML pages: " + xmlRDD.count)
+//    // // .repartition(numparts)
+//
+//    // val wikiRDD = xmlRDD.map { raw => new WikiArticle(raw) }
+//    //   .filter { art => art.relevant }
+//
+//    // println("Relevant pages: " + wikiRDD.count)
+//
+//    // val vertices: RDD[(Vid, String)] = wikiRDD.map { art => (art.vertexID, art.title) }
+//    // val justVids = wikiRDD.map { art => art.vertexID }
+//    // // println("taking top vids")
+//    // // val topvids = justVids.top(10)
+//    // // sc.stop()
+//    // // System.exit(0)
+//
+//    // // val edges: RDD[Edge[Double]] = wikiRDD.flatMap { art => art.edges }
+//    // val edges: RDD[Edge[Double]] = wikiRDD.flatMap { art => art.edges }
+//    // println("Edges: " + edges.count)
+//    // println("Creating graph: " + Calendar.getInstance().getTime())
+//
+//    // val g = Graph(vertices, edges)
+//    // val g = Graph.fromEdges(edges, 1)
+//    // val g = Graph(edges, 1)
+//    val g = GraphLoader.edgeListAndVertexListFiles(sc, fname + "_edges", fname + "_vertices",
+//      minEdgePartitions = 128).cache()
+//    println("Triplets: " + g.triplets.count)
+//
+//    println("starting pagerank " + Calendar.getInstance().getTime())
+//    val startTime = System.currentTimeMillis
+//    val pr = PageRank.run(g, 20)
+//
+//    println("PR numvertices: " + pr.vertices.count + "\tOriginal numVertices " + g.vertices.count)
+//    println("Pagerank runtime: " + ((System.currentTimeMillis - startTime)/1000.0) + " seconds")
+//    val prAndTitle = g.outerJoinVertices(pr.vertices)({(id: Vid, title: String, rank: Option[Double]) => (title, rank.getOrElse(0.0))})
+//    println("finished join.")
+//
+//    val topArticles = prAndTitle.vertices.top(30)(Ordering.by((entry: (Vid, (String, Double))) => entry._2._2))
+//    println("Top articles:\n" + topArticles.deep.mkString("\n"))
+//    // for(v <- topArticles) {
+//    //   println(v)
+//    // }
+//    val article_name = "JohnsHopkinsUniversity"
+//    //
+//    // Find relevant vertices
+//    g.mapTriplets(e => {
+//      if ((e.srcAttr contains article_name) || (e.dstAttr contains article_name)) { 1.0 }
+//      else { e.attr }
+//    })
+//    val coarsenedGraph = g.contractEdges({ e => e.attr == 1.0 }, { et => et.srcAttr + " " + et.dstAttr },
+//      { (v1: String, v2: String) => v1 + "\n" + v2 })
+//
+//    // filter only vertices whose title contains JHU
+//    val relevant = coarsenedGraph.vertices.filter({ case (vid: Vid, data: String) => data contains article_name }).collect
+//    println("Articles matching " + article_name)
+//    println(relevant.deep.mkString("New Article\n"))
+//
+//    sc.stop()
+//  }
+//
+//
+//  def stringify(tup: (org.apache.hadoop.io.LongWritable, org.apache.hadoop.io.Text)): String = {
+//    tup._2.toString
+//  }
+//
+//
+//
+//}