Commit 58037be

Initial Commit
1 parent 07a8ae0 commit 58037be

11 files changed (+134 -0 lines)
Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, to_timestamp, col, expr
from pyspark.sql.types import StructType, StructField, StringType

from lib.logger import Log4j

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .master("local[3]") \
        .appName("Stream Stream Join Demo") \
        .config("spark.streaming.stopGracefullyOnShutdown", "true") \
        .config("spark.sql.shuffle.partitions", 2) \
        .getOrCreate()

    logger = Log4j(spark)

    # Schema for ad impression events (includes the campaigner name)
    impressionSchema = StructType([
        StructField("InventoryID", StringType()),
        StructField("CreatedTime", StringType()),
        StructField("Campaigner", StringType())
    ])

    # Schema for click events (inventory id and created time only)
    clickSchema = StructType([
        StructField("InventoryID", StringType()),
        StructField("CreatedTime", StringType())
    ])

    # Read the impressions topic as a streaming source
    kafka_impression_df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "impressions") \
        .option("startingOffsets", "earliest") \
        .load()

    # Parse the Kafka value as JSON, rename InventoryID to ImpressionID so the
    # join keys have distinct names, and derive a proper event-time column
    impressions_df = kafka_impression_df \
        .select(from_json(col("value").cast("string"), impressionSchema).alias("value")) \
        .selectExpr("value.InventoryID as ImpressionID", "value.CreatedTime", "value.Campaigner") \
        .withColumn("ImpressionTime", to_timestamp(col("CreatedTime"), "yyyy-MM-dd HH:mm:ss")) \
        .drop("CreatedTime")

    # Read the clicks topic as a streaming source
    kafka_click_df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "clicks") \
        .option("startingOffsets", "earliest") \
        .load()

    clicks_df = kafka_click_df \
        .select(from_json(col("value").cast("string"), clickSchema).alias("value")) \
        .selectExpr("value.InventoryID as ClickID", "value.CreatedTime") \
        .withColumn("ClickTime", to_timestamp(col("CreatedTime"), "yyyy-MM-dd HH:mm:ss")) \
        .drop("CreatedTime")

    # Stream-to-stream inner join: a row is emitted only when a click
    # arrives for a matching impression
    join_expr = "ImpressionID == ClickID"
    join_type = "inner"

    joined_df = impressions_df.join(clicks_df, expr(join_expr), join_type)

    output_query = joined_df.writeStream \
        .format("console") \
        .outputMode("append") \
        .option("checkpointLocation", "chk-point-dir") \
        .trigger(processingTime="1 minute") \
        .start()

    logger.info("Waiting for Query")
    output_query.awaitTermination()
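
A note on state: this inner join declares no watermarks, so Spark must retain every impression and every click in the state store indefinitely in case a late match arrives. A minimal sketch of a bounded variant, assuming clicks arrive within 30 minutes of the matching impression (the window length is illustrative, not part of this commit):

    # Sketch only: bound the join state with watermarks on both sides
    impressions_wm = impressions_df.withWatermark("ImpressionTime", "30 minutes")
    clicks_wm = clicks_df.withWatermark("ClickTime", "30 minutes")

    # The time-range condition lets Spark expire state once the watermark passes
    join_expr_wm = "ImpressionID == ClickID AND " \
                   "ClickTime BETWEEN ImpressionTime AND ImpressionTime + interval 30 minutes"

    joined_wm_df = impressions_wm.join(clicks_wm, expr(join_expr_wm), "inner")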
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
{"InventoryID": "100001", "CreatedTime": "2020-09-09 10:00:00", "Campaigner": "ABC Ltd"}
{"InventoryID": "100002", "CreatedTime": "2020-09-09 10:06:00", "Campaigner": "ABC Ltd"}
{"InventoryID": "100003", "CreatedTime": "2020-09-09 10:02:00", "Campaigner": "XYZ Ltd"}
{"InventoryID": "100004", "CreatedTime": "2020-09-09 10:09:00", "Campaigner": "XYZ Ltd"}

{"InventoryID": "100001", "CreatedTime": "2020-09-09 10:18:00"}
{"InventoryID": "100002", "CreatedTime": "2020-09-09 10:18:00"}
{"InventoryID": "100003", "CreatedTime": "2020-09-09 10:18:00"}
{"InventoryID": "100004", "CreatedTime": "2020-09-09 10:18:00"}
{"InventoryID": "100001", "CreatedTime": "2020-09-09 10:18:00"}
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
%KAFKA_HOME%\bin\windows\zookeeper-server-start.bat %KAFKA_HOME%\config\zookeeper.properties
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
%KAFKA_HOME%\bin\windows\kafka-server-start.bat %KAFKA_HOME%\config\server-0.properties
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
%KAFKA_HOME%\bin\windows\kafka-topics.bat --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic impressions
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
%KAFKA_HOME%\bin\windows\kafka-topics.bat --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic clicks
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
%KAFKA_HOME%\bin\windows\kafka-console-producer.bat --broker-list localhost:9092 --topic impressions
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
%KAFKA_HOME%\bin\windows\kafka-console-producer.bat --broker-list localhost:9092 --topic clicks
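
These scripts assume an older Kafka distribution. On recent Kafka releases, kafka-topics.bat takes --bootstrap-server localhost:9092 in place of --zookeeper localhost:2181, and kafka-console-producer.bat takes --bootstrap-server in place of --broker-list.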

12-StreamStreamJoinDemo/lib/__init__.py

Whitespace-only changes.

12-StreamStreamJoinDemo/lib/logger.py

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
# Thin wrapper over the JVM's Log4j, with a logger named after the application
class Log4j:
    def __init__(self, spark):
        log4j = spark._jvm.org.apache.log4j

        root_class = "guru.learningjournal.spark.examples"
        conf = spark.sparkContext.getConf()
        app_name = conf.get("spark.app.name")

        self.logger = log4j.LogManager.getLogger(root_class + "." + app_name)

    def warn(self, message):
        self.logger.warn(message)

    def info(self, message):
        self.logger.info(message)

    def error(self, message):
        self.logger.error(message)

    def debug(self, message):
        self.logger.debug(message)
Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
# Set everything to be logged to the console
log4j.rootCategory=WARN, console

# define console appender
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.out
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

# application log
log4j.logger.guru.learningjournal.spark.examples=INFO, console
log4j.additivity.guru.learningjournal.spark.examples=false

# define the following in the Java system properties:
# -Dlog4j.configuration=file:log4j.properties

# Recommendations from Spark template
log4j.logger.org.apache.spark.repl.Main=WARN
log4j.logger.org.spark_project.jetty=WARN
log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
log4j.logger.org.apache.parquet=ERROR
log4j.logger.parquet=ERROR
log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
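
As the comment above notes, this file only takes effect if -Dlog4j.configuration=file:log4j.properties reaches the driver JVM, e.g. via spark-submit --driver-java-options or the IDE's run configuration; setting it after the JVM has started has no effect.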
