Commit e2dd124

better comments for spark scripts
1 parent a7c0cc5 commit e2dd124

6 files changed: +11 -2 lines

spark-scripts/src/main/scala/SimpleApp.scala

+1 -1

@@ -4,7 +4,7 @@
  */
 import org.apache.spark.sql.SparkSession
 
-// level 1: get spark streaming to work
+// level 1: get spark to work from an sbt project
 object SimpleApp {
   def main(args: Array[String]) {
     val logFile = "file:///home/ubuntu/projects/java-podcast-processor/spark-scripts/README.md" // Should be some file on your system

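Level 1 here is about getting Spark to run from an sbt project at all. For context, a minimal build.sbt for that kind of setup could look like the following sketch; the Scala and Spark versions and the Kafka connector coordinates are assumptions, not values taken from this repo:

// build.sbt sketch for a standalone Spark project (versions are assumed)
name := "spark-scripts"

version := "0.1"

scalaVersion := "2.11.12"

libraryDependencies ++= Seq(
  // "provided" because spark-submit supplies Spark itself at runtime
  "org.apache.spark" %% "spark-sql" % "2.4.5" % "provided",
  // needed by the Kafka streaming scripts in this commit
  "org.apache.spark" %% "spark-sql-kafka-0-10" % "2.4.5"
)

With this in place, `sbt package` produces a jar that can be handed to `spark-submit --class SimpleApp`.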
spark-scripts/src/main/scala/SparkAggKafkaStreamingTest.scala

+1
@@ -12,6 +12,7 @@ import org.apache.kafka.common.serialization.StringDeserializer
 object SparkAggKafkaStreamingTest {
   def main (args: Array[String]) {
     /*
+     * Level 4: Run aggregations on these Spark streams that are consuming Kafka topics
      * for each topic, get timestamp of first and last event that occurs.
      * Also get the average time interval between messages (avgDiffSec)
      */

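The Level 4 comment describes the aggregation itself: per topic, the first and last event timestamps plus the average gap between messages. A minimal sketch of that computation with Structured Streaming might look like the following; the bootstrap server and topic name are placeholders, and avgDiffSec is computed here as the total span divided by the number of gaps:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

// Sketch: per-topic first/last event timestamps and average seconds between messages
object SparkAggSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("SparkAggSketch").getOrCreate()
    import spark.implicits._

    // The Kafka source exposes `topic` and `timestamp` columns on every row
    val events = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092") // placeholder
      .option("subscribe", "test")                         // placeholder topic
      .load()

    val stats = events
      .groupBy($"topic")
      .agg(
        min($"timestamp").as("firstEvent"),
        max($"timestamp").as("lastEvent"),
        count("*").as("eventCount")
      )
      // total span divided by the number of gaps (count - 1)
      .withColumn(
        "avgDiffSec",
        (unix_timestamp($"lastEvent") - unix_timestamp($"firstEvent")) / ($"eventCount" - 1)
      )

    stats.writeStream
      .outputMode("complete")
      .format("console")
      .start()
      .awaitTermination()
  }
}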
spark-scripts/src/main/scala/SparkKafkaStreamingAvgTimeDiff.scala

+2
@@ -23,6 +23,8 @@ object SparkKafkaStreamingAvgTimeDiff {
     import spark.implicits._
 
     /*
+     * Level 6: Do it on our actual podcast kafka topics
+     *
      * Currently using the `podcast` topic as the "action" and the `episode` topic as the reaction for our proof of concept
      * This way we can do things like:
      * - test how many episodes were successfully parsed out, by comparing our `episode` kafka topic with the episodeCount returned from the itunes api call

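Since Level 6 points the job at the real `podcast` and `episode` topics, the episodeCount check mentioned above could be sketched as below. This assumes each `episode` message is a JSON value carrying a podcast identifier; the `podcastId` field name and the schema are hypothetical, not taken from this repo:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

// Sketch: count parsed episodes per podcast from the `episode` topic,
// so the running totals can be compared against the episodeCount from the itunes api
object EpisodeCountSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("EpisodeCountSketch").getOrCreate()
    import spark.implicits._

    // Hypothetical message shape: JSON with a podcastId field
    val episodeSchema = new StructType().add("podcastId", StringType)

    val parsedCounts = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092") // placeholder
      .option("subscribe", "episode")
      .load()
      .select(from_json($"value".cast("string"), episodeSchema).as("e"))
      .groupBy($"e.podcastId")
      .count()

    parsedCounts.writeStream
      .outputMode("complete")
      .format("console")
      .start()
      .awaitTermination()
  }
}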
spark-scripts/src/main/scala/SparkKafkaStreamingAvgTimeDiffTest.scala

+5
@@ -12,8 +12,13 @@ import org.apache.kafka.common.serialization.StringDeserializer
 object SparkKafkaStreamingAvgTimeDiffTest {
   def main (args: Array[String]) {
     /*
+     * Level 5: Get the average time between events across two different kafka topics, specifically when the 2nd topic is consuming from the 1st, so we see a "reaction time".
+     * This is done with a `select where` clause that filters on `action_value = reaction_value`.
+     * E.g., if in `test` there is an event with value `event1`, it won't be paired with events in topic `test-reaction` with value `event3`, but only with events with value `event1`.
+     *
      * this is close to what we want to do, but only uses fake topics (`test` as an action, and `test-reaction` as reaction).
      * we can then take a producer running in a terminal session and send events to these, just to make sure that our logic is working correctly, before we run this on our actual kafka topics
+     *
      * For the final product, see spark-scripts/src/main/scala/SparkKafkaStreamingAvgTimeDiff.scala
      */

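The Level 5 pairing logic described in this comment block amounts to a stream-stream join whose condition does the `action_value = reaction_value` filtering. A sketch of that idea follows; the column names, watermark, and 10-minute join window are assumptions added so the join stays bounded, not details from the actual script:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

// Sketch: pair each event in `test` with the matching event in `test-reaction`
// and compute the reaction time in seconds for each pair
object ReactionTimeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("ReactionTimeSketch").getOrCreate()
    import spark.implicits._

    def readTopic(topic: String, prefix: String) =
      spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", "localhost:9092") // placeholder
        .option("subscribe", topic)
        .load()
        .select(
          $"value".cast("string").as(s"${prefix}_value"),
          $"timestamp".as(s"${prefix}_ts")
        )
        .withWatermark(s"${prefix}_ts", "1 minute") // assumed lateness bound

    val actions   = readTopic("test", "action")
    val reactions = readTopic("test-reaction", "reaction")

    // Only pair events with matching payloads (event1 with event1, never event3),
    // within a bounded window so the join state can be cleaned up
    val pairs = actions.join(
      reactions,
      expr("""action_value = reaction_value AND
              reaction_ts BETWEEN action_ts AND action_ts + interval 10 minutes""")
    )

    val diffs = pairs.withColumn(
      "diffSec",
      unix_timestamp($"reaction_ts") - unix_timestamp($"action_ts")
    )

    diffs.writeStream
      .outputMode("append") // stream-stream joins emit in append mode
      .format("console")
      .start()
      .awaitTermination()
  }
}

Averaging diffSec over these pairs is what gives the avgDiffSec the other scripts report.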
spark-scripts/src/main/scala/SparkKafkaStreamingTest.scala

+1 -1

@@ -6,7 +6,7 @@ import org.apache.spark.sql.kafka010._
 import org.apache.kafka.clients.consumer.ConsumerRecord
 import org.apache.kafka.common.serialization.StringDeserializer
 
-// level 2: get Sparks Streaming to work with kafka topics
+// level 3: get Spark Streaming to work with kafka topics
 object SparkKafkaStreamingTest {
   def main (args: Array[String]) {
     /*

spark-scripts/src/main/scala/SparkStreamingTest.scala

+1
@@ -1,6 +1,7 @@
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.SparkSession
 
+// level 2: get spark streaming to work
 // singleton class (our main). Runs a word count over network (localhost:9999)
 object SparkStreamingTest {
   def main (args: Array[String]) {

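The level 2 script is described as a word count over a network socket on localhost:9999. For reference, the standard Structured Streaming version of that exercise looks roughly like this sketch:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

// Sketch: read lines from localhost:9999, split them into words, count each word
object WordCountSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("WordCountSketch").getOrCreate()
    import spark.implicits._

    val lines = spark.readStream
      .format("socket")
      .option("host", "localhost")
      .option("port", 9999)
      .load()

    val counts = lines
      .select(explode(split($"value", "\\s+")).as("word"))
      .groupBy("word")
      .count()

    counts.writeStream
      .outputMode("complete")
      .format("console")
      .start()
      .awaitTermination()
  }
}

Feeding it is as simple as running `nc -lk 9999` in another terminal and typing lines.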