Skip to content

Commit

Permalink
Initial Commit
Browse files Browse the repository at this point in the history
  • Loading branch information
LearningJournal committed Aug 27, 2020
1 parent 59c51d9 commit 07a8ae0
Show file tree
Hide file tree
Showing 10 changed files with 139 additions and 0 deletions.
74 changes: 74 additions & 0 deletions 11-StreamTableJoinDemo/StreamTableJoinDemo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType

from lib.logger import Log4j


def write_to_cassandra(target_df, batch_id):
    """Micro-batch sink for foreachBatch.

    Appends the batch DataFrame to the Cassandra ``spark_db.users`` table
    and then prints the batch to stdout for visual confirmation.

    :param target_df: DataFrame holding the rows of this micro-batch
    :param batch_id: monotonically increasing batch identifier (unused)
    """
    cassandra_writer = target_df.write \
        .format("org.apache.spark.sql.cassandra") \
        .option("keyspace", "spark_db") \
        .option("table", "users") \
        .mode("append")
    cassandra_writer.save()
    # Echo the batch so progress is visible on the console.
    target_df.show()


if __name__ == "__main__":
    # Build a local Spark session wired for the Kafka source and the
    # DataStax Cassandra connector (host/port, SQL extensions, catalog).
    builder = SparkSession.builder \
        .master("local[3]") \
        .appName("Stream Table Join Demo")
    for conf_key, conf_value in [
        ("spark.streaming.stopGracefullyOnShutdown", "true"),
        ("spark.sql.shuffle.partitions", 2),
        ("spark.cassandra.connection.host", "localhost"),
        ("spark.cassandra.connection.port", "9042"),
        ("spark.sql.extensions", "com.datastax.spark.connector.CassandraSparkExtensions"),
        ("spark.sql.catalog.lh", "com.datastax.spark.connector.datasource.CassandraCatalog"),
    ]:
        builder = builder.config(conf_key, conf_value)
    spark = builder.getOrCreate()

    logger = Log4j(spark)

    # Shape of each incoming login event (JSON in the Kafka message value).
    login_schema = StructType([
        StructField("created_time", StringType()),
        StructField("login_id", StringType()),
    ])

    # Streaming source: the "logins" topic, replayed from the earliest offset.
    kafka_source_df = (
        spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", "localhost:9092")
        .option("subscribe", "logins")
        .option("startingOffsets", "earliest")
        .load()
    )

    # Deserialize the Kafka value, flatten the JSON fields to columns, and
    # parse created_time into a proper timestamp.
    parsed_df = kafka_source_df.select(
        from_json(col("value").cast("string"), login_schema).alias("value")
    )
    login_df = parsed_df.select("value.*").withColumn(
        "created_time", to_timestamp(col("created_time"), "yyyy-MM-dd HH:mm:ss")
    )

    # Static side of the join: the users table read from Cassandra.
    user_df = (
        spark.read
        .format("org.apache.spark.sql.cassandra")
        .option("keyspace", "spark_db")
        .option("table", "users")
        .load()
    )

    # Stream-static inner join on login_id; drop the streaming side's key so
    # the remaining login_id column comes from the Cassandra table.
    joined_df = login_df.join(
        user_df, login_df.login_id == user_df.login_id, "inner"
    ).drop(login_df.login_id)

    output_df = joined_df.select(
        col("login_id"),
        col("user_name"),
        col("created_time").alias("last_login"),
    )

    # Write each micro-batch back to Cassandra once a minute via foreachBatch.
    output_query = (
        output_df.writeStream
        .foreachBatch(write_to_cassandra)
        .outputMode("update")
        .option("checkpointLocation", "chk-point-dir")
        .trigger(processingTime="1 minute")
        .start()
    )

    logger.info("Waiting for Query")
    output_query.awaitTermination()
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
:: Start a single-node ZooKeeper using the config bundled with Kafka.
%KAFKA_HOME%\bin\windows\zookeeper-server-start.bat %KAFKA_HOME%\config\zookeeper.properties
1 change: 1 addition & 0 deletions 11-StreamTableJoinDemo/kafka-scripts/02-start-kafka.cmd
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
:: Start the Kafka broker. NOTE(review): uses server-0.properties, not the
:: stock server.properties — assumes a customized broker config; verify it exists.
%KAFKA_HOME%\bin\windows\kafka-server-start.bat %KAFKA_HOME%\config\server-0.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
:: Create the "logins" topic (1 partition, replication factor 1) via ZooKeeper.
%KAFKA_HOME%\bin\windows\kafka-topics.bat --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic logins
1 change: 1 addition & 0 deletions 11-StreamTableJoinDemo/kafka-scripts/05-start-producer.cmd
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
:: Console producer for manually feeding JSON login events into the "logins" topic.
%KAFKA_HOME%\bin\windows\kafka-console-producer.bat --broker-list localhost:9092 --topic logins
Empty file.
21 changes: 21 additions & 0 deletions 11-StreamTableJoinDemo/lib/logger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
class Log4j:
    """Thin Python wrapper over Spark's JVM-side log4j logger.

    Obtains a logger named ``guru.learningjournal.spark.examples.<app name>``
    so application log lines can be filtered/routed separately from Spark's
    own logging (see the accompanying log4j.properties).
    """

    def __init__(self, spark):
        """Create the logger from an active SparkSession.

        :param spark: SparkSession whose JVM gateway provides log4j
        """
        jvm_log4j = spark._jvm.org.apache.log4j
        prefix = "guru.learningjournal.spark.examples"
        app_name = spark.sparkContext.getConf().get("spark.app.name")
        # Logger name = fixed prefix + the running application's name.
        self.logger = jvm_log4j.LogManager.getLogger(prefix + "." + app_name)

    def debug(self, message):
        """Forward *message* to the JVM logger at DEBUG level."""
        self.logger.debug(message)

    def info(self, message):
        """Forward *message* to the JVM logger at INFO level."""
        self.logger.info(message)

    def warn(self, message):
        """Forward *message* to the JVM logger at WARN level."""
        self.logger.warn(message)

    def error(self, message):
        """Forward *message* to the JVM logger at ERROR level."""
        self.logger.error(message)
27 changes: 27 additions & 0 deletions 11-StreamTableJoinDemo/log4j.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Set everything to be logged to the console
log4j.rootCategory=WARN, console

# define console appender
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.out
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

# Application-specific logger
log4j.logger.guru.learningjournal.spark.examples=INFO, console
log4j.additivity.guru.learningjournal.spark.examples=false

# To use this file, set the following Java system property:
# -Dlog4j.configuration=file:log4j.properties

# Recommendations from Spark template
log4j.logger.org.apache.spark.repl.Main=WARN
log4j.logger.org.spark_project.jetty=WARN
log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
log4j.logger.org.apache.parquet=ERROR
log4j.logger.parquet=ERROR
log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR

9 changes: 9 additions & 0 deletions 11-StreamTableJoinDemo/static-data/user.cql
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
-- Schema and seed data for the stream-table join demo.
-- SimpleStrategy with replication_factor 1: single-node/dev cluster only.
CREATE KEYSPACE spark_db WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};
USE spark_db;
-- NOTE(review): "Login_id" is unquoted, so Cassandra case-folds it to
-- login_id — which matches the login_id column the Spark job joins on.
CREATE TABLE users(Login_id text PRIMARY KEY, user_name text, last_login timestamp);

-- Seed users that the streaming login events will be joined against.
INSERT INTO users (Login_id, user_name, last_login) VALUES( '100001', 'Prashant', '2019-02-05 10:05:00');
INSERT INTO users (Login_id, user_name, last_login) VALUES( '100009', 'Alisha', '2019-03-07 11:03:00');
INSERT INTO users (Login_id, user_name, last_login) VALUES( '100087', 'Abdul', '2019-06-12 09:43:00');

INSERT INTO users (Login_id, user_name, last_login) VALUES( '100091', 'New User', '2019-06-12 09:43:00');
4 changes: 4 additions & 0 deletions 11-StreamTableJoinDemo/streaming-data/logins.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{"login_id": "100001", "created_time": "2020-09-09 10:18:00"}
{"login_id": "100009", "created_time": "2020-09-18 07:15:00"}
{"login_id": "100087", "created_time": "2020-09-18 07:15:00"}
{"login_id": "100091", "created_time": "2020-09-18 07:15:00"}

0 comments on commit 07a8ae0

Please sign in to comment.