Commit 490da34 (parent 237e990): showing 8 changed files with 279 additions and 90 deletions.
README.md
@@ -1,2 +1,12 @@
# Kafka Spark Streaming
An example project integrating Kafka and Spark Streaming to run streaming SQL queries.

## NetworkQualityStreamingJob
An example Spark Streaming app that consumes network signal data and executes a continuous SQL query.

## NetworkQualityCassandraJob
An example Spark Streaming app that consumes network signal data and writes it to Cassandra with a foreach writer.

## NetworkQualityAnalysisJob
An example Spark DataFrame app that creates a DataFrame from Cassandra and executes an aggregation SQL query.
@@ -0,0 +1,7 @@
spark.master=local[*]
topic.names=network-data
bootstrap.servers=localhost:9092
cassandra.host=localhost
cassandra.keyspace=test
cassandra.table=network_signals
processing.time=10 seconds
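ConfigUtil itself is not shown in this diff; judging from the getConfig("cassandra") calls below, it presumably loads a cassandra.properties file like the one above from the classpath. A minimal sketch of such a helper, as an assumption rather than the repo's actual code:

package com.datapyro.kafka.util

import java.util.Properties

// Hypothetical sketch: loads "<name>.properties" from the classpath
// into a java.util.Properties instance.
object ConfigUtil {
  def getConfig(name: String): Properties = {
    val props = new Properties()
    val in = getClass.getClassLoader.getResourceAsStream(s"$name.properties")
    require(in != null, s"$name.properties not found on classpath")
    try props.load(in) finally in.close()
    props
  }
}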
src/main/scala/com/datapyro/spark/NetworkQualityAnalysisJob.scala (40 additions, 0 deletions)
@@ -0,0 +1,40 @@
package com.datapyro.spark

import com.datapyro.kafka.util.ConfigUtil
import org.apache.spark.sql.SparkSession

/**
 * Cassandra DataFrame Example
 * (Works with Spark 2.2.0)
 */
object NetworkQualityAnalysisJob extends App {

  val config = ConfigUtil.getConfig("cassandra")

  // spark config
  val spark: SparkSession = SparkSession.builder
    .master(config.getProperty("spark.master"))
    .appName(getClass.getSimpleName)
    .getOrCreate()

  // prepare cassandra df
  val df = spark.read
    .format("org.apache.spark.sql.cassandra")
    .options(Map("table" -> "network_signals", "keyspace" -> "test", "cluster" -> "Test Cluster"))
    .load()

  df.printSchema()
  df.createOrReplaceTempView("network_signals")

  // execute sql
  val sql =
    """
      SELECT networkType, COUNT(*), AVG(rxSpeed), AVG(txSpeed), SUM(rxData), SUM(txData)
      FROM network_signals
      GROUP BY networkType
    """
  spark.sql(sql).show()

  spark.close()

}
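As an aside, the same aggregation can be expressed with the DataFrame API instead of a SQL string; a minimal equivalent sketch, assuming the df defined in the job above:

// Same aggregation as the SQL above, via the DataFrame API
import org.apache.spark.sql.functions._

df.groupBy("networkType")
  .agg(count(lit(1)), avg("rxSpeed"), avg("txSpeed"), sum("rxData"), sum("txData"))
  .show()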
src/main/scala/com/datapyro/spark/NetworkQualityCassandraJob.scala (121 additions, 0 deletions)
@@ -0,0 +1,121 @@
package com.datapyro.spark

import java.util.UUID

import org.apache.spark.sql.{ForeachWriter, Row, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.streaming.{OutputMode, Trigger}
import com.datapyro.kafka.util.ConfigUtil
import com.datastax.driver.core._
import com.datastax.driver.core.querybuilder.{QueryBuilder => QB}

/**
 * Cassandra Spark Streaming Foreach Example
 *
 * CREATE KEYSPACE test WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 };
 *
 * CREATE TABLE network_signals (id TEXT PRIMARY KEY, deviceId TEXT, time TIMESTAMP, networkType TEXT, rxSpeed DOUBLE, txSpeed DOUBLE, rxData DOUBLE, txData DOUBLE, latitude DOUBLE, longitude DOUBLE);
 */
object NetworkQualityCassandraJob extends App {

  val config = ConfigUtil.getConfig("cassandra")

  // spark config
  val spark: SparkSession = SparkSession.builder
    .master(config.getProperty("spark.master"))
    .appName(getClass.getSimpleName)
    .getOrCreate()

  import spark.implicits._

  // define schema for json
  val schema = StructType(
    List(
      StructField("deviceId", StringType, true),
      StructField("time", LongType, true),
      StructField("signals", ArrayType(StructType(Array(
        StructField("time", LongType, true),
        StructField("networkType", StringType, true),
        StructField("rxSpeed", DoubleType, true),
        StructField("txSpeed", DoubleType, true),
        StructField("rxData", LongType, true),
        StructField("txData", LongType, true),
        StructField("latitude", DoubleType, true),
        StructField("longitude", DoubleType, true)
      ))))
    )
  )

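  // A hypothetical sample payload this schema would parse (field names taken
  // from the StructType above; the values are invented):
  // {"deviceId":"device-1","time":1514764800000,"signals":[
  //   {"time":1514764800000,"networkType":"LTE","rxSpeed":12.5,"txSpeed":3.1,
  //    "rxData":1024,"txData":256,"latitude":41.01,"longitude":28.97}]}
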
  // create stream
  val df = spark
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", config.getProperty("bootstrap.servers"))
    .option("subscribe", config.getProperty("topic.names"))
    .load()
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    .select(from_json($"value", schema).alias("data"))

  df.createOrReplaceTempView("network_signals")

  val sql = "SELECT x.deviceId, x.signal.* FROM (SELECT data.deviceId, EXPLODE(data.signals) AS signal FROM network_signals) x"
  val generateUUID = udf(() => UUID.randomUUID().toString)
  val query = spark.sql(sql).withColumn("id", generateUUID())

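  // Note: the ForeachWriter below reads these columns positionally, so the order
  // produced here matters: signal.* expands in schema order (time, networkType,
  // rxSpeed, txSpeed, rxData, txData, latitude, longitude) after deviceId, and
  // withColumn appends id as the last column.
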
  // cassandra
  val keyspace = config.getProperty("cassandra.keyspace")
  val table = config.getProperty("cassandra.table")
  val cluster = Cluster.builder()
    .addContactPoint(config.getProperty("cassandra.host"))
    .build()

  val writer = new ForeachWriter[Row] {
    var session: Session = null
    var records: Int = 0
    var start: Long = 0

    override def open(partitionId: Long, version: Long) = {
      start = System.currentTimeMillis()
      session = cluster.connect(keyspace)
      session != null
    }

    override def process(row: Row) = {
      val query = QB.insertInto(table)
        .value("deviceId", row.getString(0))
        .value("time", row.getLong(1))
        .value("networkType", row.getString(2))
        .value("rxSpeed", row.getDouble(3))
        .value("txSpeed", row.getDouble(4))
        .value("rxData", row.getLong(5))
        .value("txData", row.getLong(6))
        .value("latitude", row.getDouble(7))
        .value("longitude", row.getDouble(8))
        .value("id", row.getString(9))
      session.executeAsync(query)
      records += 1
    }

    override def close(errorOrNull: Throwable) = {
      if (session != null) session.close()
      println(records + " records processed, took " + (System.currentTimeMillis() - start) + " ms")
    }
  }

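  // The foreach sink is at-least-once: on recovery rows can be replayed, and
  // since "id" is a random UUID each replay inserts a new Cassandra row. A
  // deterministic key would make replays overwrite the same row instead.
  // Hypothetical sketch, assuming deviceId plus time uniquely identify a signal:
  //
  //   val deterministicId = udf((deviceId: String, time: Long) =>
  //     UUID.nameUUIDFromBytes(s"$deviceId:$time".getBytes("UTF-8")).toString)
  //   val keyed = spark.sql(sql).withColumn("id", deterministicId($"deviceId", $"time"))
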
  val result = query.writeStream
    .trigger(Trigger.ProcessingTime(config.getProperty("processing.time")))
    .outputMode(OutputMode.Append())
    .foreach(writer)
    .start()

  result.awaitTermination()

}
src/main/scala/com/datapyro/spark/NetworkQualitySparkStreamingJob.scala (0 additions, 84 deletions)
This file was deleted.
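For completeness: these jobs assume Spark SQL, the Kafka source, the spark-cassandra-connector, and the DataStax Java driver on the classpath. The repo's build file is not part of this diff; a hedged sbt sketch with assumed versions (matching the Spark 2.2.0 note in the code above):

// build.sbt sketch; artifact versions are assumptions, not taken from this diff
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql" % "2.2.0",
  "org.apache.spark" %% "spark-sql-kafka-0-10" % "2.2.0",
  "com.datastax.spark" %% "spark-cassandra-connector" % "2.0.7",
  "com.datastax.cassandra" % "cassandra-driver-core" % "3.3.0"
)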