First draft of sessionByKey #28

Open · wants to merge 12 commits into base: master
@@ -44,18 +44,6 @@ object StatefulNetworkWordCount {

     StreamingExamples.setStreamingLogLevels()
 
-    val updateFunc = (values: Seq[Int], state: Option[Int]) => {
-      val currentCount = values.sum
-
-      val previousCount = state.getOrElse(0)
-
-      Some(currentCount + previousCount)
-    }
-
-    val newUpdateFunc = (iterator: Iterator[(String, Seq[Int], Option[Int])]) => {
-      iterator.flatMap(t => updateFunc(t._2, t._3).map(s => (t._1, s)))
-    }
-
     val sparkConf = new SparkConf().setAppName("StatefulNetworkWordCount")
     // Create the context with a 1 second batch size
     val ssc = new StreamingContext(sparkConf, Seconds(1))
@@ -72,8 +60,16 @@ object StatefulNetworkWordCount {

     // Update the cumulative count using updateStateByKey
     // This will give a Dstream made of state (which is the cumulative count of the words)
-    val stateDstream = wordDstream.updateStateByKey[Int](newUpdateFunc,
-      new HashPartitioner (ssc.sparkContext.defaultParallelism), true, initialRDD)
+
+    val trackStateFunc = (word: String, one: Option[Int], state: State[Int]) => {
+      val sum = one.getOrElse(0) + state.getOrElse(0)
+      val output = (word, sum)
+      state.update(sum)
+      Some(output)
+    }
+
+    val stateDstream = wordDstream.trackStateByKey(
+      TrackStateSpec(trackStateFunc).initialState(initialRDD))
     stateDstream.print()
     ssc.start()
     ssc.awaitTermination()
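A note on what this change returns: with updateStateByKey the resulting DStream carries the state itself, whereas trackStateByKey (as implemented later in this PR) carries whatever the tracking function emits, i.e. its Option[T] return values. A minimal sketch exploiting that, assuming the API in this diff; thresholdFunc and the cutoff of 100 are hypothetical, wordDstream is from the example above:

    // Emit a record only when a word's cumulative count reaches 100;
    // the per-word state keeps accumulating regardless of what is emitted.
    val thresholdFunc = (word: String, one: Option[Int], state: State[Int]) => {
      val sum = one.getOrElse(0) + state.getOrElse(0)
      state.update(sum)
      if (sum >= 100) Some((word, sum)) else None
    }

    val frequentWords = wordDstream.trackStateByKey(TrackStateSpec(thresholdFunc))
    frequentWords.print()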
streaming/src/main/scala/org/apache/spark/streaming/State.scala (new file, 138 additions, 0 deletions)
@@ -0,0 +1,138 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.streaming

/**
 * Abstract class for getting and updating the tracked state in the `trackStateByKey` operation
 * of a [[org.apache.spark.streaming.dstream.PairDStreamFunctions pair DStream]] and a
 * [[org.apache.spark.streaming.api.java.JavaPairDStream]]. For example:
 * {{{
 *    val trackStateFunc = (word: String, one: Option[Int], state: State[Int]) => {
 *      val sum = one.getOrElse(0) + state.getOrElse(0)
 *      state.update(sum)
 *      Some((word, sum))
 *    }
 * }}}
 */
sealed abstract class State[S] {

  /** Whether the state already exists */
  def exists(): Boolean

  /**
   * Get the state if it exists, otherwise it will throw an exception.
   * Check with `exists()` whether the state exists or not before calling `get()`.
   */
  def get(): S

  /**
   * Update the state with a new value. Note that you cannot update the state if it is
   * timing out (that is, `isTimingOut()` returns `true`), or if it has already been removed
   * by `remove()`.
   */
  def update(newState: S): Unit

  /** Remove the state if it exists. */
  def remove(): Unit

  /** Whether the state is going to be timed out by the system after the current batch interval */
  def isTimingOut(): Boolean

  /** Get the state as an `Option`, `None` if it does not exist */
  @inline final def getOption(): Option[S] = if (exists) Some(get()) else None

  /** Get the state if it exists, otherwise return the default value */
  @inline final def getOrElse[S1 >: S](default: => S1): S1 = {
    if (exists) this.get else default
  }

  @inline final override def toString(): String = {
    getOption.map { _.toString }.getOrElse("<state not set>")
  }
}

/** Internal implementation of the [[State]] interface */
private[streaming] class StateImpl[S] extends State[S] {

  private var state: S = null.asInstanceOf[S]
  private var defined: Boolean = false
  private var timingOut: Boolean = false
  private var updated: Boolean = false
  private var removed: Boolean = false

  // ========= Public API =========

  def exists(): Boolean = {
    defined
  }

  def get(): S = {
    if (defined) state else throw new NoSuchElementException("State is not set")
  }

  def update(newState: S): Unit = {
    require(!removed, "Cannot update the state after it has been removed")
    require(!timingOut, "Cannot update a state that is timing out")
    state = newState
    defined = true
    updated = true
  }

  def isTimingOut(): Boolean = {
    timingOut
  }

  def remove(): Unit = {
    require(!timingOut, "Cannot remove a state that is timing out")
    removed = true
  }

  // ========= Internal API =========

  /** Whether the state has been marked for removal */
  def isRemoved(): Boolean = {
    removed
  }

  /** Whether the state has been updated */
  def isUpdated(): Boolean = {
    updated
  }

  /**
   * Internal method to update the state data and reset internal flags in `this`.
   * This method allows `this` object to be reused across many state records.
   */
  def wrap(optionalState: Option[S]): Unit = {
    optionalState match {
      case Some(newState) =>
        this.state = newState
        defined = true

      case None =>
        this.state = null.asInstanceOf[S]
        defined = false
    }
    timingOut = false
    removed = false
    updated = false
  }

  /**
   * Internal method to set the state data as timing out and reset the other internal flags,
   * so that `this` object can be reused across many state records.
   */
  def wrapTimingOutState(newState: S): Unit = {
    this.state = newState
    defined = true
    timingOut = true
    removed = false
    updated = false
  }
}
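Before moving to the next file, a usage illustration of the State contract above (a sketch, not code from this PR): a tracking function can combine getOrElse, update, and remove to stop tracking a key once it is fully counted. The cap of 10 and all names here are hypothetical:

    // Count occurrences per key, but stop tracking a key after 10 sightings.
    // remove() marks the state for deletion (the surrounding machinery
    // presumably drops it), so a reappearing key would start counting afresh.
    val cappedCount = (key: String, value: Option[Int], state: State[Int]) => {
      val count = state.getOrElse(0) + value.getOrElse(0)
      if (count >= 10) {
        state.remove()
        Some((key, count)) // emit the final tally for this key
      } else {
        state.update(count)
        None
      }
    }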
@@ -0,0 +1,111 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.streaming

import scala.reflect.ClassTag

import org.apache.spark.{HashPartitioner, Partitioner}
import org.apache.spark.api.java.JavaPairRDD
import org.apache.spark.rdd.RDD


/**
 * Abstract class representing all the specifications of the `DStream.trackStateByKey()`
 * operation. Use `TrackStateSpec.apply()` in Scala or `TrackStateSpec.create()` in Java
 * to create instances of this class.
 *
 * {{{
 *    TrackStateSpec(trackingFunction)        // in Scala
 *    TrackStateSpec.create(trackingFunction) // in Java
 * }}}
 */
sealed abstract class TrackStateSpec[K: ClassTag, V: ClassTag, S: ClassTag, T: ClassTag]
  extends Serializable {

  /** Set the RDD containing the initial states that will be used by `trackStateByKey` */
  def initialState(rdd: RDD[(K, S)]): this.type
  def initialState(javaPairRDD: JavaPairRDD[K, S]): this.type

  /** Set the number of partitions of the generated state RDDs (uses a `HashPartitioner`) */
  def numPartitions(numPartitions: Int): this.type

  /** Set the partitioner of the generated state RDDs */
  def partitioner(partitioner: Partitioner): this.type

  /** Set the duration of inactivity after which a key's state starts timing out */
  def timeout(interval: Duration): this.type
}


/** Builder object for creating instances of [[TrackStateSpec]] */
object TrackStateSpec {

  /** Create a [[TrackStateSpec]] from a tracking function (Scala) */
  def apply[K: ClassTag, V: ClassTag, S: ClassTag, T: ClassTag](
      trackingFunction: (K, Option[V], State[S]) => Option[T]): TrackStateSpec[K, V, S, T] = {
    new TrackStateSpecImpl[K, V, S, T](trackingFunction)
  }

  /** Create a [[TrackStateSpec]] from a tracking function (Java) */
  def create[K: ClassTag, V: ClassTag, S: ClassTag, T: ClassTag](
      trackingFunction: (K, Option[V], State[S]) => Option[T]): TrackStateSpec[K, V, S, T] = {
    apply(trackingFunction)
  }
}


/** Internal implementation of the [[TrackStateSpec]] interface */
private[streaming]
case class TrackStateSpecImpl[K: ClassTag, V: ClassTag, S: ClassTag, T: ClassTag](
    function: (K, Option[V], State[S]) => Option[T]) extends TrackStateSpec[K, V, S, T] {

  require(function != null, "The tracking function cannot be null")

  @volatile private var partitioner: Partitioner = null
  @volatile private var initialStateRDD: RDD[(K, S)] = null
  @volatile private var timeoutInterval: Duration = null

  def initialState(rdd: RDD[(K, S)]): this.type = {
    this.initialStateRDD = rdd
    this
  }

  def initialState(javaPairRDD: JavaPairRDD[K, S]): this.type = {
    this.initialStateRDD = javaPairRDD.rdd
    this
  }

  def numPartitions(numPartitions: Int): this.type = {
    this.partitioner(new HashPartitioner(numPartitions))
    this
  }

  def partitioner(partitioner: Partitioner): this.type = {
    this.partitioner = partitioner
    this
  }

  def timeout(interval: Duration): this.type = {
    this.timeoutInterval = interval
    this
  }

  // ================= Private Methods =================

  private[streaming] def getFunction(): (K, Option[V], State[S]) => Option[T] = function

  private[streaming] def getInitialStateRDD(): Option[RDD[(K, S)]] = Option(initialStateRDD)

  private[streaming] def getPartitioner(): Option[Partitioner] = Option(partitioner)

  private[streaming] def getTimeoutInterval(): Option[Duration] = Option(timeoutInterval)
}
@@ -24,19 +24,18 @@ import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.mapred.{JobConf, OutputFormat}
 import org.apache.hadoop.mapreduce.{OutputFormat => NewOutputFormat}
 
-import org.apache.spark.{HashPartitioner, Partitioner}
 import org.apache.spark.rdd.RDD
-import org.apache.spark.streaming.{Duration, Time}
 import org.apache.spark.streaming.StreamingContext.rddToFileName
+import org.apache.spark.streaming.{Duration, Time, TrackStateSpec, TrackStateSpecImpl}
 import org.apache.spark.util.{SerializableConfiguration, SerializableJobConf}
+import org.apache.spark.{HashPartitioner, Partitioner}
 
 /**
  * Extra functions available on DStream of (key, value) pairs through an implicit conversion.
  */
 class PairDStreamFunctions[K, V](self: DStream[(K, V)])
     (implicit kt: ClassTag[K], vt: ClassTag[V], ord: Ordering[K])
-  extends Serializable
-{
+  extends Serializable {
   private[streaming] def ssc = self.ssc
 
   private[streaming] def sparkContext = self.context.sparkContext
@@ -350,6 +349,16 @@ class PairDStreamFunctions[K, V](self: DStream[(K, V)])
     )
   }
 
+  /**
+   * Return a new DStream of the records emitted by the tracking function in the given
+   * [[org.apache.spark.streaming.TrackStateSpec]], updating a per-key state as the records
+   * of this DStream are processed.
+   */
+  def trackStateByKey[S: ClassTag, T: ClassTag](spec: TrackStateSpec[K, V, S, T]): DStream[T] = {
+    new TrackStateDStream[K, V, S, T](
+      self,
+      spec.asInstanceOf[TrackStateSpecImpl[K, V, S, T]]
+    ).mapPartitions { partitionIter =>
+      partitionIter.flatMap { _.emittedRecords }
+    }
+  }
+
   /**
    * Return a new "state" DStream where the state for each key is updated by applying
    * the given function on the previous state of the key and the new values of each key.
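Finally, given the PR title, a sketch of the sessionization use case this API appears to target. It assumes, which this diff does not show, that the tracking function is also invoked with isTimingOut() returning true when a key expires; clickstream (a DStream[(String, Long)] of user to event timestamp) and every other name here is hypothetical:

    // Track (sessionStart, lastSeen) per user; an idle timeout closes the session.
    val sessionFunc = (user: String, eventTime: Option[Long], state: State[(Long, Long)]) => {
      if (state.isTimingOut()) {
        // The key has expired: emit the session duration. update() and
        // remove() are disallowed here per the State contract above.
        state.getOption().map { case (start, last) => (user, last - start) }
      } else {
        val t = eventTime.getOrElse(0L)
        val (start, _) = state.getOrElse((t, t))
        state.update((start, t))
        None // nothing to emit while the session is still open
      }
    }

    val sessionDurations = clickstream.trackStateByKey(
      TrackStateSpec(sessionFunc).timeout(Minutes(30)))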