Skip to content

Commit 07a8ae0

Browse files
Initial Commit
1 parent 59c51d9 commit 07a8ae0

10 files changed

Lines changed: 139 additions & 0 deletions

File tree

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
from pyspark.sql import SparkSession
2+
from pyspark.sql.functions import from_json, col, to_timestamp
3+
from pyspark.sql.types import StructType, StructField, StringType
4+
5+
from lib.logger import Log4j
6+
7+
8+
def write_to_cassandra(target_df, batch_id):
    """foreachBatch sink: append one micro-batch to the Cassandra users table.

    Args:
        target_df: the micro-batch DataFrame handed in by Structured Streaming.
        batch_id: monotonically increasing batch identifier (unused here, but
            required by the foreachBatch callback signature).
    """
    writer = (
        target_df.write
        .format("org.apache.spark.sql.cassandra")
        .option("keyspace", "spark_db")
        .option("table", "users")
        .mode("append")
    )
    writer.save()
    # Echo the batch to stdout so progress is visible while the demo runs.
    target_df.show()
16+
17+
18+
if __name__ == "__main__":
    # Local 3-thread session wired to Kafka (source) and Cassandra (sink).
    # NOTE(review): "spark.streaming.stopGracefullyOnShutdown" is the legacy
    # DStream key; Structured Streaming may ignore it — confirm.
    spark = (
        SparkSession.builder
        .master("local[3]")
        .appName("Stream Table Join Demo")
        .config("spark.streaming.stopGracefullyOnShutdown", "true")
        .config("spark.sql.shuffle.partitions", 2)
        .config("spark.cassandra.connection.host", "localhost")
        .config("spark.cassandra.connection.port", "9042")
        .config("spark.sql.extensions",
                "com.datastax.spark.connector.CassandraSparkExtensions")
        .config("spark.sql.catalog.lh",
                "com.datastax.spark.connector.datasource.CassandraCatalog")
        .getOrCreate()
    )

    logger = Log4j(spark)

    # Shape of the JSON payload expected on the "logins" topic.
    login_schema = StructType([
        StructField("created_time", StringType()),
        StructField("login_id", StringType()),
    ])

    # Streaming side: raw Kafka records; the value column is a JSON string.
    kafka_source_df = (
        spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", "localhost:9092")
        .option("subscribe", "logins")
        .option("startingOffsets", "earliest")
        .load()
    )

    parsed_df = kafka_source_df.select(
        from_json(col("value").cast("string"), login_schema).alias("value")
    )
    login_df = parsed_df.select("value.*").withColumn(
        "created_time", to_timestamp(col("created_time"), "yyyy-MM-dd HH:mm:ss")
    )

    # Static side: the users table in Cassandra (stream-static join).
    user_df = (
        spark.read
        .format("org.apache.spark.sql.cassandra")
        .option("keyspace", "spark_db")
        .option("table", "users")
        .load()
    )

    # Inner join on login_id; drop the streaming copy of the key so the
    # select below resolves unambiguously to the Cassandra column.
    joined_df = login_df.join(
        user_df, login_df.login_id == user_df.login_id, "inner"
    ).drop(login_df.login_id)

    output_df = joined_df.select(
        col("login_id"),
        col("user_name"),
        col("created_time").alias("last_login"),
    )

    # Upsert each micro-batch into Cassandra via the foreachBatch callback.
    output_query = (
        output_df.writeStream
        .foreachBatch(write_to_cassandra)
        .outputMode("update")
        .option("checkpointLocation", "chk-point-dir")
        .trigger(processingTime="1 minute")
        .start()
    )

    logger.info("Waiting for Query")
    output_query.awaitTermination()
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
REM Start ZooKeeper with the config shipped in the Kafka distribution; requires %KAFKA_HOME% to be set.
%KAFKA_HOME%\bin\windows\zookeeper-server-start.bat %KAFKA_HOME%\config\zookeeper.properties
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
REM Start a single Kafka broker (broker 0); run after ZooKeeper is up. Requires %KAFKA_HOME%.
%KAFKA_HOME%\bin\windows\kafka-server-start.bat %KAFKA_HOME%\config\server-0.properties
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
REM Create the single-partition "logins" topic the Spark job subscribes to.
REM NOTE(review): --zookeeper was removed in Kafka 3.0; on newer Kafka use --bootstrap-server localhost:9092 instead — confirm the installed version.
%KAFKA_HOME%\bin\windows\kafka-topics.bat --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic logins
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
REM Interactive producer for feeding JSON login events (one per line) into the "logins" topic.
%KAFKA_HOME%\bin\windows\kafka-console-producer.bat --broker-list localhost:9092 --topic logins

11-StreamTableJoinDemo/lib/__init__.py

Whitespace-only changes.
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
class Log4j:
    """Thin wrapper over the JVM log4j logger exposed through py4j.

    Builds a logger named ``<root>.<app name>`` so application log lines can
    be filtered separately from Spark's own output (see log4j.properties).
    """

    _ROOT_CLASS = "guru.learningjournal.spark.examples"

    def __init__(self, spark):
        # The app name is read back from the live Spark conf rather than
        # hard-coded, so the logger name always tracks the session.
        app_name = spark.sparkContext.getConf().get("spark.app.name")
        jvm_log4j = spark._jvm.org.apache.log4j
        self.logger = jvm_log4j.LogManager.getLogger(
            f"{self._ROOT_CLASS}.{app_name}"
        )

    def warn(self, message):
        self.logger.warn(message)

    def info(self, message):
        self.logger.info(message)

    def error(self, message):
        self.logger.error(message)

    def debug(self, message):
        self.logger.debug(message)
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Set everything to be logged to the console
# Root logger: WARN and above only, so Spark's internal chatter stays quiet.
log4j.rootCategory=WARN, console

# define console appender
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.out
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

#application log
# Matches the logger name built by lib/logger.py (root class + app name),
# so application messages log at INFO even though the root is WARN.
log4j.logger.guru.learningjournal.spark.examples=INFO, console
# additivity=false stops app messages from also reaching the root logger (no duplicates).
log4j.additivity.guru.learningjournal.spark.examples=false

#define following in Java System
# -Dlog4j.configuration=file:log4j.properties

# Recommendations from Spark template
log4j.logger.org.apache.spark.repl.Main=WARN
log4j.logger.org.spark_project.jetty=WARN
log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
log4j.logger.org.apache.parquet=ERROR
log4j.logger.parquet=ERROR
log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
27+
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
-- One-time Cassandra setup for the stream-table join demo: keyspace, users table, seed rows.
-- SimpleStrategy / RF=1 is a single-node dev setting; not suitable for production clusters.
CREATE KEYSPACE spark_db WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};
USE spark_db;
-- NOTE(review): unquoted Login_id is folded to lowercase login_id by Cassandra,
-- which is what the Spark job reads/writes — confirm no quoted-identifier usage elsewhere.
CREATE TABLE users(Login_id text PRIMARY KEY, user_name text, last_login timestamp);

INSERT INTO users (Login_id, user_name, last_login) VALUES( '100001', 'Prashant', '2019-02-05 10:05:00');
INSERT INTO users (Login_id, user_name, last_login) VALUES( '100009', 'Alisha', '2019-03-07 11:03:00');
INSERT INTO users (Login_id, user_name, last_login) VALUES( '100087', 'Abdul', '2019-06-12 09:43:00');

INSERT INTO users (Login_id, user_name, last_login) VALUES( '100091', 'New User', '2019-06-12 09:43:00');
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{"login_id": "100001", "created_time": "2020-09-09 10:18:00"}
2+
{"login_id": "100009", "created_time": "2020-09-18 07:15:00"}
3+
{"login_id": "100087", "created_time": "2020-09-18 07:15:00"}
4+
{"login_id": "100091", "created_time": "2020-09-18 07:15:00"}

0 commit comments

Comments
 (0)