|
2 | 2 | """
|
3 | 3 | Created on Sun Jun 14 10:20:19 2020
|
4 | 4 | """
|
5 |
| - |
6 | 5 | import pyspark
|
7 | 6 | from pyspark.sql import SparkSession
|
8 | 7 | from pyspark.sql.functions import col, lit
|
9 |
| -from pyspark.sql.types import StructType, StructField, StringType |
| 8 | +from pyspark.sql.types import StructType, StructField, StringType,IntegerType |
10 | 9 |
|
11 | 10 | spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
|
12 | 11 |
|
13 |
| -data = [(("James","","Smith"),"36636","M","3000"), \ |
14 |
| - (("Michael","Rose",""),"40288","M","4000"), \ |
15 |
| - (("Robert","","Williams"),"42114","M","4000"), \ |
16 |
| - (("Maria","Anne","Jones"),"39192","F","4000"), \ |
17 |
| - (("Jen","Mary","Brown"),"","F","-1") \ |
| 12 | +data = [('James','','Smith','1991-04-01','M',3000), |
| 13 | + ('Michael','Rose','','2000-05-19','M',4000), |
| 14 | + ('Robert','','Williams','1978-09-05','M',4000), |
| 15 | + ('Maria','Anne','Jones','1967-12-01','F',4000), |
| 16 | + ('Jen','Mary','Brown','1980-02-17','F',-1) |
18 | 17 | ]
|
19 | 18 |
|
20 |
| -schema = StructType([ |
21 |
| - StructField('name', StructType([ |
22 |
| - StructField('firstname', StringType(), True), |
23 |
| - StructField('middlename', StringType(), True), |
24 |
| - StructField('lastname', StringType(), True) |
25 |
| - ])), |
26 |
| - StructField('dob', StringType(), True), |
27 |
| - StructField('gender', StringType(), True), |
28 |
| - StructField('salary', StringType(), True) |
29 |
| - ]) |
30 |
| - |
31 |
| - |
32 |
| -df = spark.createDataFrame(data=data, schema = schema) |
| 19 | +columns = ["firstname","middlename","lastname","dob","gender","salary"] |
| 20 | +df = spark.createDataFrame(data=data, schema = columns) |
33 | 21 | df.printSchema()
|
34 | 22 | df.show(truncate=False)
|
35 | 23 |
|
|
51 | 39 | .withColumn("anotherColumn",lit("anotherValue"))
|
52 | 40 | df6.printSchema()
|
53 | 41 |
|
54 |
| - |
55 | 42 | df.withColumnRenamed("gender","sex") \
|
56 | 43 | .show(truncate=False)
|
57 | 44 |
|
58 | 45 | df4.drop("CopiedColumn") \
|
59 | 46 | .show(truncate=False)
|
60 | 47 |
|
| 48 | +dataStruct = [(("James","","Smith"),"36636","M","3000"), \ |
| 49 | + (("Michael","Rose",""),"40288","M","4000"), \ |
| 50 | + (("Robert","","Williams"),"42114","M","4000"), \ |
| 51 | + (("Maria","Anne","Jones"),"39192","F","4000"), \ |
| 52 | + (("Jen","Mary","Brown"),"","F","-1") \ |
| 53 | +] |
| 54 | + |
| 55 | +schemaStruct = StructType([ |
| 56 | + StructField('name', StructType([ |
| 57 | + StructField('firstname', StringType(), True), |
| 58 | + StructField('middlename', StringType(), True), |
| 59 | + StructField('lastname', StringType(), True) |
| 60 | + ])), |
| 61 | + StructField('dob', StringType(), True), |
| 62 | + StructField('gender', StringType(), True), |
| 63 | + StructField('salary', StringType(), True) |
| 64 | + ]) |
| 65 | + |
| 66 | + |
| 67 | +df7 = spark.createDataFrame(data=dataStruct, schema = schemaStruct) |
| 68 | +df7.printSchema() |
| 69 | +df7.show(truncate=False) |
| 70 | + |
| 71 | + |
61 | 72 | """
|
62 | 73 | columns = ["name","address"]
|
63 | 74 | data = [("Robert, Smith", "1 Main st, Newark, NJ, 92537"), \
|
|
0 commit comments