Skip to content

Commit eef5377

Browse files
pyspark examples
1 parent cffd0c2 commit eef5377

File tree

2 files changed

+33
-22
lines changed

2 files changed

+33
-22
lines changed

pyspark-broadcast-dataframe.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,5 +33,5 @@ def state_convert(code):
3333

3434
# Broadcast variable on filter
3535

36-
filter_df= df.where((df['Foo'].isin(broadcastStates.value)))
36+
filterDf = df.where((df['state'].isin(broadcastStates.value)))
3737

pyspark-withcolumn.py

Lines changed: 32 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,34 +2,22 @@
22
"""
33
Created on Sun Jun 14 10:20:19 2020
44
"""
5-
65
import pyspark
76
from pyspark.sql import SparkSession
87
from pyspark.sql.functions import col, lit
9-
from pyspark.sql.types import StructType, StructField, StringType
8+
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
109

1110
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
1211

13-
data = [(("James","","Smith"),"36636","M","3000"), \
14-
(("Michael","Rose",""),"40288","M","4000"), \
15-
(("Robert","","Williams"),"42114","M","4000"), \
16-
(("Maria","Anne","Jones"),"39192","F","4000"), \
17-
(("Jen","Mary","Brown"),"","F","-1") \
12+
data = [('James','','Smith','1991-04-01','M',3000),
13+
('Michael','Rose','','2000-05-19','M',4000),
14+
('Robert','','Williams','1978-09-05','M',4000),
15+
('Maria','Anne','Jones','1967-12-01','F',4000),
16+
('Jen','Mary','Brown','1980-02-17','F',-1)
1817
]
1918

20-
schema = StructType([
21-
StructField('name', StructType([
22-
StructField('firstname', StringType(), True),
23-
StructField('middlename', StringType(), True),
24-
StructField('lastname', StringType(), True)
25-
])),
26-
StructField('dob', StringType(), True),
27-
StructField('gender', StringType(), True),
28-
StructField('salary', StringType(), True)
29-
])
30-
31-
32-
df = spark.createDataFrame(data=data, schema = schema)
19+
columns = ["firstname","middlename","lastname","dob","gender","salary"]
20+
df = spark.createDataFrame(data=data, schema = columns)
3321
df.printSchema()
3422
df.show(truncate=False)
3523

@@ -51,13 +39,36 @@
5139
.withColumn("anotherColumn",lit("anotherValue"))
5240
df6.printSchema()
5341

54-
5542
df.withColumnRenamed("gender","sex") \
5643
.show(truncate=False)
5744

5845
df4.drop("CopiedColumn") \
5946
.show(truncate=False)
6047

48+
dataStruct = [(("James","","Smith"),"36636","M","3000"), \
49+
(("Michael","Rose",""),"40288","M","4000"), \
50+
(("Robert","","Williams"),"42114","M","4000"), \
51+
(("Maria","Anne","Jones"),"39192","F","4000"), \
52+
(("Jen","Mary","Brown"),"","F","-1") \
53+
]
54+
55+
schemaStruct = StructType([
56+
StructField('name', StructType([
57+
StructField('firstname', StringType(), True),
58+
StructField('middlename', StringType(), True),
59+
StructField('lastname', StringType(), True)
60+
])),
61+
StructField('dob', StringType(), True),
62+
StructField('gender', StringType(), True),
63+
StructField('salary', StringType(), True)
64+
])
65+
66+
67+
df7 = spark.createDataFrame(data=dataStruct, schema = schemaStruct)
68+
df7.printSchema()
69+
df7.show(truncate=False)
70+
71+
6172
"""
6273
columns = ["name","address"]
6374
data = [("Robert, Smith", "1 Main st, Newark, NJ, 92537"), \

0 commit comments

Comments
 (0)