|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | +""" |
| 3 | +author SparkByExamples.com |
| 4 | +""" |
| 5 | + |
| 6 | +from pyspark.sql import SparkSession |
| 7 | + |
# Create (or reuse) the Spark session that every example below runs against.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Sample rows, one tuple per person, in the order given by `columns`.
columns = ["firstname", "lastname", "gender", "salary"]
data = [
    ('James', 'Smith', 'M', 3000),
    ('Anna', 'Rose', 'F', 4100),
    ('Robert', 'Williams', 'M', 6200),
]

df = spark.createDataFrame(data, schema=columns)
df.show()
| 20 | + |
| 21 | + |
# DataFrame.columns is a plain list of column names, so a membership test
# is enough to find out whether a column exists.
salary1_missing = 'salary1' not in df.columns
if salary1_missing:
    print("aa")
| 24 | + |
# Add a new constant column: lit() turns the Python value into a Column
# that holds the same literal for every row.
from pyspark.sql.functions import lit

df.withColumn("bonus_percent", lit(0.3)).show()
| 29 | + |
# Add a column derived from an existing column (30% of salary).
df.withColumn("bonus_amount", df["salary"] * 0.3).show()
| 33 | + |
# Add a column by concatenating existing columns, separated by a comma.
from pyspark.sql.functions import concat_ws

df.withColumn("name", concat_ws(",", "firstname", 'lastname')).show()
| 38 | + |
# Add a column holding the current date.
from pyspark.sql.functions import current_date

df.withColumn("current_date", current_date()).show()
| 43 | + |
| 44 | + |
from pyspark.sql.functions import when

# Add a conditional column: salary below 4000 -> "A",
# 4000 to 5000 inclusive -> "B", anything else -> "C".
df.withColumn(
    "grade",
    when(df["salary"] < 4000, lit("A"))
    .when((df["salary"] >= 4000) & (df["salary"] <= 5000), lit("B"))
    .otherwise(lit("C")),
).show()
| 51 | + |
# Add columns using select: each extra expression becomes a new output column
# next to the explicitly selected ones.
df.select("firstname", "salary", lit(0.3).alias("bonus")).show()
# df.salary * 0.3 is already a Column expression, so it can be aliased
# directly; lit() is only needed to wrap plain Python values.
df.select("firstname", "salary", (df.salary * 0.3).alias("bonus_amount")).show()
df.select("firstname", "salary", current_date().alias("today_date")).show()
| 56 | + |
# Add columns using SQL: expose the DataFrame as temp view "PER" and query it.
df.createOrReplaceTempView("PER")
# NOTE(review): '0.3' is a quoted SQL literal, so `bonus` is a string column
# here; drop the quotes if a numeric bonus is intended - confirm.
spark.sql("select firstname,salary, '0.3' as bonus from PER").show()
spark.sql("select firstname,salary, salary * 0.3 as bonus_amount from PER").show()
spark.sql("select firstname,salary, current_date() as today_date from PER").show()
# Use the searched CASE form (`case when <condition>`). The simple form
# `case salary when <condition>` would compare salary against the boolean
# result of the condition instead of testing the condition itself.
spark.sql(
    "select firstname,salary, "
    "case when salary < 4000 then 'A' "
    "else 'B' END as grade from PER"
).show()
| 65 | + |
| 66 | + |
| 67 | + |
| 68 | + |
| 69 | + |
0 commit comments