basicjoin.txt (forked from mahmoudparsian/pyspark-tutorial)
# cat > R.txt
k1,v1
k1,v2
k2,v3
k2,v4
k3,v7
k3,v8
k3,v9
# cat > S.txt
k1,v11
k1,v22
k1,v33
k2,v55
k4,v77
k5,v88
# ./pyspark
Python 2.6.9 (unknown, Sep 9 2014, 15:05:12)
[GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.39)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 1.2.0
      /_/
Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12)
SparkContext available as sc.
>>> R = sc.textFile("R.txt");
>>> R.collect()
[u'k1,v1',
u'k1,v2',
u'k2,v3',
u'k2,v4',
u'k3,v7',
u'k3,v8',
u'k3,v9']
>>> S = sc.textFile("S.txt");
>>> S.collect()
[u'k1,v11',
u'k1,v22',
u'k1,v33',
u'k2,v55',
u'k4,v77',
u'k5,v88'
]
>>> r1 = R.map(lambda s: s.split(","))
>>> r1.collect()
[
[u'k1', u'v1'],
[u'k1', u'v2'],
[u'k2', u'v3'],
[u'k2', u'v4'],
[u'k3', u'v7'],
[u'k3', u'v8'],
[u'k3', u'v9']
]
>>> r2 = r1.flatMap(lambda s: [(s[0], s[1])])
>>> r2.collect()
[
(u'k1', u'v1'),
(u'k1', u'v2'),
(u'k2', u'v3'),
(u'k2', u'v4'),
(u'k3', u'v7'),
(u'k3', u'v8'),
(u'k3', u'v9')
]
>>>
>>> s1 = S.map(lambda s: s.split(","))
>>> s1.collect()
[
[u'k1', u'v11'],
[u'k1', u'v22'],
[u'k1', u'v33'],
[u'k2', u'v55'],
[u'k4', u'v77'],
[u'k5', u'v88']
]
>>> s2 = s1.flatMap(lambda s: [(s[0], s[1])])
>>> s2.collect()
[
(u'k1', u'v11'),
(u'k1', u'v22'),
(u'k1', u'v33'),
(u'k2', u'v55'),
(u'k4', u'v77'),
(u'k5', u'v88')
]
>>> RjoinedS = r2.join(s2)
>>> RjoinedS.collect()
[
(u'k2', (u'v3', u'v55')),
(u'k2', (u'v4', u'v55')),
(u'k1', (u'v1', u'v11')),
(u'k1', (u'v1', u'v22')),
(u'k1', (u'v1', u'v33')),
(u'k1', (u'v2', u'v11')),
(u'k1', (u'v2', u'v22')),
(u'k1', (u'v2', u'v33'))
]
>>>