First draft of sessionByKey #28

Open · wants to merge 12 commits into base: master
@@ -44,18 +44,6 @@ object StatefulNetworkWordCount {

     StreamingExamples.setStreamingLogLevels()
 
-    val updateFunc = (values: Seq[Int], state: Option[Int]) => {
-      val currentCount = values.sum
-
-      val previousCount = state.getOrElse(0)
-
-      Some(currentCount + previousCount)
-    }
-
-    val newUpdateFunc = (iterator: Iterator[(String, Seq[Int], Option[Int])]) => {
-      iterator.flatMap(t => updateFunc(t._2, t._3).map(s => (t._1, s)))
-    }
-
     val sparkConf = new SparkConf().setAppName("StatefulNetworkWordCount")
     // Create the context with a 1 second batch size
     val ssc = new StreamingContext(sparkConf, Seconds(1))
@@ -72,8 +60,16 @@ object StatefulNetworkWordCount {

     // Update the cumulative count using updateStateByKey
     // This will give a Dstream made of state (which is the cumulative count of the words)
-    val stateDstream = wordDstream.updateStateByKey[Int](newUpdateFunc,
-      new HashPartitioner (ssc.sparkContext.defaultParallelism), true, initialRDD)
+
+    val trackStateFunc = (word: String, one: Option[Int], state: State[Int]) => {
+      val sum = one.getOrElse(0) + state.getOrElse(0)
+      val output = (word, sum)
+      state.update(sum)
+      Some(output)
+    }
+
+    val stateDstream = wordDstream.trackStateByKey(
+      TrackStateSpec(trackStateFunc).initialState(initialRDD))
     stateDstream.print()
     ssc.start()
     ssc.awaitTermination()
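A note on what this change returns: with updateStateByKey the resulting DStream carries the state itself, whereas trackStateByKey (as implemented later in this PR) carries whatever the tracking function emits, i.e. its Option[T] return values. A minimal sketch exploiting that, assuming the API in this diff; thresholdFunc and the cutoff of 100 are hypothetical, wordDstream is from the example above:

    // Emit a record only when a word's cumulative count reaches 100;
    // the per-word state keeps accumulating regardless of what is emitted.
    val thresholdFunc = (word: String, one: Option[Int], state: State[Int]) => {
      val sum = one.getOrElse(0) + state.getOrElse(0)
      state.update(sum)
      if (sum >= 100) Some((word, sum)) else None
    }

    val frequentWords = wordDstream.trackStateByKey(TrackStateSpec(thresholdFunc))
    frequentWords.print()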
streaming/src/main/scala/org/apache/spark/streaming/State.scala (new file, 138 additions, 0 deletions)
@@ -0,0 +1,138 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.streaming

/**
 * Abstract class for getting and updating the tracked state in the `trackStateByKey` operation
 * of a [[org.apache.spark.streaming.dstream.PairDStreamFunctions pair DStream]] and a
 * [[org.apache.spark.streaming.api.java.JavaPairDStream]]. For example:
 * {{{
 *    val trackStateFunc = (word: String, one: Option[Int], state: State[Int]) => {
 *      val sum = one.getOrElse(0) + state.getOrElse(0)
 *      state.update(sum)
 *      Some((word, sum))
 *    }
 * }}}
 */
sealed abstract class State[S] {

  /** Whether the state already exists */
  def exists(): Boolean

  /**
   * Get the state if it exists, otherwise it will throw an exception.
   * Check with `exists()` whether the state exists or not before calling `get()`.
   */
  def get(): S

  /**
   * Update the state with a new value. Note that you cannot update the state if it is
   * timing out (that is, `isTimingOut()` returns `true`), or if it has already been removed
   * by `remove()`.
   */
  def update(newState: S): Unit

  /** Remove the state if it exists. */
  def remove(): Unit

  /** Whether the state is going to be timed out by the system after the current batch interval */
  def isTimingOut(): Boolean

  /** Get the state as an `Option`, `None` if it does not exist */
  @inline final def getOption(): Option[S] = if (exists) Some(get()) else None

  /** Get the state if it exists, otherwise return the default value */
  @inline final def getOrElse[S1 >: S](default: => S1): S1 = {
    if (exists) this.get else default
  }

  @inline final override def toString(): String = {
    getOption.map { _.toString }.getOrElse("<state not set>")
  }
}

/** Internal implementation of the [[State]] interface */
private[streaming] class StateImpl[S] extends State[S] {

  private var state: S = null.asInstanceOf[S]
  private var defined: Boolean = false
  private var timingOut: Boolean = false
  private var updated: Boolean = false
  private var removed: Boolean = false

  // ========= Public API =========

  def exists(): Boolean = {
    defined
  }

  def get(): S = {
    if (defined) state else throw new NoSuchElementException("State is not set")
  }

  def update(newState: S): Unit = {
    require(!removed, "Cannot update the state after it has been removed")
    require(!timingOut, "Cannot update a state that is timing out")
    state = newState
    defined = true
    updated = true
  }

  def isTimingOut(): Boolean = {
    timingOut
  }

  def remove(): Unit = {
    require(!timingOut, "Cannot remove a state that is timing out")
    removed = true
  }

  // ========= Internal API =========

  /** Whether the state has been marked for removal */
  def isRemoved(): Boolean = {
    removed
  }

  /** Whether the state has been updated */
  def isUpdated(): Boolean = {
    updated
  }

  /**
   * Internal method to update the state data and reset internal flags in `this`.
   * This method allows `this` object to be reused across many state records.
   */
  def wrap(optionalState: Option[S]): Unit = {
    optionalState match {
      case Some(newState) =>
        this.state = newState
        defined = true

      case None =>
        this.state = null.asInstanceOf[S]
        defined = false
    }
    timingOut = false
    removed = false
    updated = false
  }

  /**
   * Internal method to set the state data as timing out and reset the other internal flags,
   * so that `this` object can be reused across many state records.
   */
  def wrapTimingOutState(newState: S): Unit = {
    this.state = newState
    defined = true
    timingOut = true
    removed = false
    updated = false
  }
}
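Before moving to the next file, a usage illustration of the State contract above (a sketch, not code from this PR): a tracking function can combine getOrElse, update, and remove to stop tracking a key once it is fully counted. The cap of 10 and all names here are hypothetical:

    // Count occurrences per key, but stop tracking a key after 10 sightings.
    // remove() marks the state for deletion (the surrounding machinery
    // presumably drops it), so a reappearing key would start counting afresh.
    val cappedCount = (key: String, value: Option[Int], state: State[Int]) => {
      val count = state.getOrElse(0) + value.getOrElse(0)
      if (count >= 10) {
        state.remove()
        Some((key, count)) // emit the final tally for this key
      } else {
        state.update(count)
        None
      }
    }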
@@ -0,0 +1,111 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.streaming

import scala.reflect.ClassTag

import org.apache.spark.{HashPartitioner, Partitioner}
import org.apache.spark.api.java.JavaPairRDD
import org.apache.spark.rdd.RDD


/**
 * Abstract class representing all the specifications of the `DStream.trackStateByKey()`
 * operation. Use `TrackStateSpec.apply()` in Scala or `TrackStateSpec.create()` in Java
 * to create instances of this class.
 *
 * {{{
 *    TrackStateSpec(trackingFunction)        // in Scala
 *    TrackStateSpec.create(trackingFunction) // in Java
 * }}}
 */
sealed abstract class TrackStateSpec[K: ClassTag, V: ClassTag, S: ClassTag, T: ClassTag]
  extends Serializable {

  /** Set the RDD containing the initial states that will be used by `trackStateByKey` */
  def initialState(rdd: RDD[(K, S)]): this.type
  def initialState(javaPairRDD: JavaPairRDD[K, S]): this.type

  /** Set the number of partitions of the generated state RDDs (uses a `HashPartitioner`) */
  def numPartitions(numPartitions: Int): this.type

  /** Set the partitioner of the generated state RDDs */
  def partitioner(partitioner: Partitioner): this.type

  /** Set the duration of inactivity after which a key's state starts timing out */
  def timeout(interval: Duration): this.type
}


/** Builder object for creating instances of [[TrackStateSpec]] */
object TrackStateSpec {

  /** Create a [[TrackStateSpec]] from a tracking function (Scala) */
  def apply[K: ClassTag, V: ClassTag, S: ClassTag, T: ClassTag](
      trackingFunction: (K, Option[V], State[S]) => Option[T]): TrackStateSpec[K, V, S, T] = {
    new TrackStateSpecImpl[K, V, S, T](trackingFunction)
  }

  /** Create a [[TrackStateSpec]] from a tracking function (Java) */
  def create[K: ClassTag, V: ClassTag, S: ClassTag, T: ClassTag](
      trackingFunction: (K, Option[V], State[S]) => Option[T]): TrackStateSpec[K, V, S, T] = {
    apply(trackingFunction)
  }
}


/** Internal implementation of the [[TrackStateSpec]] interface */
private[streaming]
case class TrackStateSpecImpl[K: ClassTag, V: ClassTag, S: ClassTag, T: ClassTag](
    function: (K, Option[V], State[S]) => Option[T]) extends TrackStateSpec[K, V, S, T] {

  require(function != null, "The tracking function cannot be null")

  @volatile private var partitioner: Partitioner = null
  @volatile private var initialStateRDD: RDD[(K, S)] = null
  @volatile private var timeoutInterval: Duration = null

  def initialState(rdd: RDD[(K, S)]): this.type = {
    this.initialStateRDD = rdd
    this
  }

  def initialState(javaPairRDD: JavaPairRDD[K, S]): this.type = {
    this.initialStateRDD = javaPairRDD.rdd
    this
  }

  def numPartitions(numPartitions: Int): this.type = {
    this.partitioner(new HashPartitioner(numPartitions))
    this
  }

  def partitioner(partitioner: Partitioner): this.type = {
    this.partitioner = partitioner
    this
  }

  def timeout(interval: Duration): this.type = {
    this.timeoutInterval = interval
    this
  }

  // ================= Private Methods =================

  private[streaming] def getFunction(): (K, Option[V], State[S]) => Option[T] = function

  private[streaming] def getInitialStateRDD(): Option[RDD[(K, S)]] = Option(initialStateRDD)

  private[streaming] def getPartitioner(): Option[Partitioner] = Option(partitioner)

  private[streaming] def getTimeoutInterval(): Option[Duration] = Option(timeoutInterval)
}
@@ -24,19 +24,18 @@ import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.mapred.{JobConf, OutputFormat}
 import org.apache.hadoop.mapreduce.{OutputFormat => NewOutputFormat}
 
-import org.apache.spark.{HashPartitioner, Partitioner}
 import org.apache.spark.rdd.RDD
-import org.apache.spark.streaming.{Duration, Time}
 import org.apache.spark.streaming.StreamingContext.rddToFileName
+import org.apache.spark.streaming.{Duration, Time, TrackStateSpec, TrackStateSpecImpl}
 import org.apache.spark.util.{SerializableConfiguration, SerializableJobConf}
+import org.apache.spark.{HashPartitioner, Partitioner}
 
 /**
  * Extra functions available on DStream of (key, value) pairs through an implicit conversion.
  */
 class PairDStreamFunctions[K, V](self: DStream[(K, V)])
     (implicit kt: ClassTag[K], vt: ClassTag[V], ord: Ordering[K])
-  extends Serializable
-{
+  extends Serializable {
   private[streaming] def ssc = self.ssc
 
   private[streaming] def sparkContext = self.context.sparkContext
@@ -350,6 +349,16 @@ class PairDStreamFunctions[K, V](self: DStream[(K, V)])
     )
   }
 
+  /**
+   * Return a new DStream of the records emitted by the tracking function in the given
+   * [[org.apache.spark.streaming.TrackStateSpec]], updating a per-key state as the records
+   * of this DStream are processed.
+   */
+  def trackStateByKey[S: ClassTag, T: ClassTag](spec: TrackStateSpec[K, V, S, T]): DStream[T] = {
+    new TrackStateDStream[K, V, S, T](
+      self,
+      spec.asInstanceOf[TrackStateSpecImpl[K, V, S, T]]
+    ).mapPartitions { partitionIter =>
+      partitionIter.flatMap { _.emittedRecords }
+    }
+  }
+
   /**
    * Return a new "state" DStream where the state for each key is updated by applying
    * the given function on the previous state of the key and the new values of each key.
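Finally, given the PR title, a sketch of the sessionization use case this API appears to target. It assumes, which this diff does not show, that the tracking function is also invoked with isTimingOut() returning true when a key expires; clickstream (a DStream[(String, Long)] of user to event timestamp) and every other name here is hypothetical:

    // Track (sessionStart, lastSeen) per user; an idle timeout closes the session.
    val sessionFunc = (user: String, eventTime: Option[Long], state: State[(Long, Long)]) => {
      if (state.isTimingOut()) {
        // The key has expired: emit the session duration. update() and
        // remove() are disallowed here per the State contract above.
        state.getOption().map { case (start, last) => (user, last - start) }
      } else {
        val t = eventTime.getOrElse(0L)
        val (start, _) = state.getOrElse((t, t))
        state.update((start, t))
        None // nothing to emit while the session is still open
      }
    }

    val sessionDurations = clickstream.trackStateByKey(
      TrackStateSpec(sessionFunc).timeout(Minutes(30)))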