Skip to content

Commit 00b2c1e

Browse files
PySpark Date & Timestamp examples
1 parent 9f32d04 commit 00b2c1e

File tree

3 files changed

+133
-1
lines changed

3 files changed

+133
-1
lines changed

pyspark-current-date-timestamp.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com

PySpark examples for current_date() and current_timestamp(), and for
converting them to a custom format with date_format() / to_timestamp().
Each example is shown both with the DataFrame API and with Spark SQL.
"""

from pyspark.sql import SparkSession
# Explicit imports instead of `from pyspark.sql.functions import *`
# so it is clear which functions the examples use.
from pyspark.sql.functions import (
    current_date,
    current_timestamp,
    date_format,
    to_timestamp,
)

# Create SparkSession
spark = SparkSession.builder \
    .appName('SparkByExamples.com') \
    .getOrCreate()

# Single-row DataFrame: the functions below do not read the row's
# contents, they just need at least one row to produce output for.
data = [["1"]]
df = spark.createDataFrame(data, ["id"])

# current_date() & current_timestamp() as new columns
df.withColumn("current_date", current_date()) \
  .withColumn("current_timestamp", current_timestamp()) \
  .show(truncate=False)

# Same result via Spark SQL
spark.sql("select current_date(), current_timestamp()") \
     .show(truncate=False)

# Date & Timestamp into custom format
df.withColumn("date_format", date_format(current_date(), "MM-dd-yyyy")) \
  .withColumn("to_timestamp", to_timestamp(current_timestamp(), "MM-dd-yyyy HH mm ss SSS")) \
  .show(truncate=False)

# Same result via Spark SQL
spark.sql("select date_format(current_date(),'MM-dd-yyyy') as date_format ," + \
          "to_timestamp(current_timestamp(),'MM-dd-yyyy HH mm ss SSS') as to_timestamp") \
     .show(truncate=False)

pyspark-date-timestamp-functions.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com

PySpark date & timestamp function examples: formatting, parsing,
arithmetic (datediff, add_months, date_add/date_sub), truncation,
and extraction of calendar/clock fields.
"""

from pyspark.sql import SparkSession
# Explicit imports instead of `from pyspark.sql.functions import *`
# so it is clear which functions the examples use.
from pyspark.sql.functions import (
    add_months, col, current_date, current_timestamp, date_add,
    date_format, date_sub, datediff, dayofmonth, dayofweek, dayofyear,
    hour, minute, month, months_between, next_day, second, to_date,
    to_timestamp, trunc, weekofyear, year,
)

# Create SparkSession
spark = SparkSession.builder \
    .appName('SparkByExamples.com') \
    .getOrCreate()

# Sample data: "input" holds ISO-formatted date strings.
data = [["1", "2020-02-01"], ["2", "2019-03-01"], ["3", "2021-03-01"]]
df = spark.createDataFrame(data, ["id", "input"])
df.show()

# current_date()
df.select(current_date().alias("current_date")
  ).show(1)

# date_format(): render the date in MM-dd-yyyy form
df.select(col("input"),
    date_format(col("input"), "MM-dd-yyyy").alias("date_format")
  ).show()

# to_date(): parse string to DateType
# (pattern fixed: "yyyy-MM-dd" — was the typo "yyy-MM-dd")
df.select(col("input"),
    to_date(col("input"), "yyyy-MM-dd").alias("to_date")
  ).show()

# datediff(): days between today and the input date
df.select(col("input"),
    datediff(current_date(), col("input")).alias("datediff")
  ).show()

# months_between(): fractional months between today and the input date
df.select(col("input"),
    months_between(current_date(), col("input")).alias("months_between")
  ).show()

# trunc(): truncate the date to the given unit
# (fixed: removed a duplicated "Month" column and corrected the
# mislabelled alias "Month_Year" for the "Year" truncation)
df.select(col("input"),
    trunc(col("input"), "Month").alias("Month_Trunc"),
    trunc(col("input"), "Year").alias("Year_Trunc")
  ).show()

# add_months(), date_add(), date_sub(): date arithmetic
df.select(col("input"),
    add_months(col("input"), 3).alias("add_months"),
    add_months(col("input"), -3).alias("sub_months"),
    date_add(col("input"), 4).alias("date_add"),
    date_sub(col("input"), 4).alias("date_sub")
  ).show()

# year(), month(), next_day(), weekofyear(): calendar fields
df.select(col("input"),
    year(col("input")).alias("year"),
    month(col("input")).alias("month"),
    next_day(col("input"), "Sunday").alias("next_day"),
    weekofyear(col("input")).alias("weekofyear")
  ).show()

# dayofweek(), dayofmonth(), dayofyear(): day-number fields
df.select(col("input"),
    dayofweek(col("input")).alias("dayofweek"),
    dayofmonth(col("input")).alias("dayofmonth"),
    dayofyear(col("input")).alias("dayofyear"),
  ).show()

# Timestamps in a custom "MM-dd-yyyy HH mm ss SSS" text format
data = [["1", "02-01-2020 11 01 19 06"], ["2", "03-01-2019 12 01 19 406"], ["3", "03-01-2021 12 01 19 406"]]
df2 = spark.createDataFrame(data, ["id", "input"])
df2.show(truncate=False)

# current_timestamp()
df2.select(current_timestamp().alias("current_timestamp")
  ).show(1, truncate=False)

# to_timestamp(): parse the custom-format string to TimestampType
df2.select(col("input"),
    to_timestamp(col("input"), "MM-dd-yyyy HH mm ss SSS").alias("to_timestamp")
  ).show(truncate=False)

# hour(), minute(), second(): clock fields from a timestamp string
data = [["1", "2020-02-01 11:01:19.06"], ["2", "2019-03-01 12:01:19.406"], ["3", "2021-03-01 12:01:19.406"]]
df3 = spark.createDataFrame(data, ["id", "input"])

df3.select(col("input"),
    hour(col("input")).alias("hour"),
    minute(col("input")).alias("minute"),
    second(col("input")).alias("second")
  ).show(truncate=False)

python-pandas.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@
1919

2020
#Outputs below data on console
2121

22-
print(pandasDF.count())
22+
pdCount=pandasDF.count()
23+
print(pdCount)
2324

2425
print(pandasDF.max())
2526
print(pandasDF.mean())

0 commit comments

Comments
 (0)