-
Notifications
You must be signed in to change notification settings - Fork 4
/
build_graph.py
49 lines (40 loc) · 1.25 KB
/
build_graph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
'''
Spark Job Model Production
$ spark-submit --py-files graphframes.zip --jars jars/scala-logging-api_2.11-2.1.2.jar,jars/scala-logging-slf4j_2.11-2.1.2.jar build_graph.py
Marlesson
'''
import os
import sys
import argparse
import time
import importlib
from pyspark.sql import SparkSession
from graphframes import *
if __name__ == '__main__':
# Session Spark
spark = SparkSession\
.builder\
.appName("Graph")\
.getOrCreate()
# Create a Vertex DataFrame with unique ID column "id"
v = spark.createDataFrame([
("a", "Alice", 34),
("b", "Bob", 36),
("c", "Charlie", 30),
], ["id", "name", "age"])
# Create an Edge DataFrame with "src" and "dst" columns
e = spark.createDataFrame([
("a", "b", "friend"),
("b", "c", "follow"),
("c", "b", "follow"),
], ["src", "dst", "relationship"])
# Create a GraphFrame
from graphframes import *
g = GraphFrame(v, e)
# Query: Get in-degree of each vertex.
g.inDegrees.show()
# Query: Count the number of "follow" connections in the graph.
g.edges.filter("relationship = 'follow'").count()
# # Run PageRank algorithm, and show results.
# results = g.pageRank(resetProbability=0.01, maxIter=20)
# results.vertices.select("id", "pagerank").show()