Skip to content

Commit d05871e

Browse files
committed
remove reuse of PythonRDD
1 parent be5e5ff commit d05871e

File tree

3 files changed

+20
-51
lines changed


python/pyspark/streaming/dstream.py

Lines changed: 11 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -286,13 +286,11 @@ def transform(self, func):
286286
`func` can have one argument of `rdd`, or have two arguments of
287287
(`time`, `rdd`)
288288
"""
289-
resue = False
290289
if func.func_code.co_argcount == 1:
291-
reuse = True
292290
oldfunc = func
293291
func = lambda t, rdd: oldfunc(rdd)
294292
assert func.func_code.co_argcount == 2, "func should take one or two arguments"
295-
return TransformedDStream(self, func, reuse)
293+
return TransformedDStream(self, func)
296294

297295
def transformWith(self, func, other, keepSerializer=False):
298296
"""
@@ -597,34 +595,30 @@ class TransformedDStream(DStream):
597595
Multiple continuous transformations of DStream can be combined into
598596
one transformation.
599597
"""
600-
def __init__(self, prev, func, reuse=False):
598+
def __init__(self, prev, func):
601599
ssc = prev._ssc
602600
self._ssc = ssc
603601
self.ctx = ssc._sc
604602
self._jrdd_deserializer = self.ctx.serializer
605603
self.is_cached = False
606604
self.is_checkpointed = False
605+
self._jdstream_val = None
607606

608607
if (isinstance(prev, TransformedDStream) and
609608
not prev.is_cached and not prev.is_checkpointed):
610609
prev_func = prev.func
611-
old_func = func
612-
func = lambda t, rdd: old_func(t, prev_func(t, rdd))
613-
reuse = reuse and prev.reuse
614-
prev = prev.prev
615-
616-
self.prev = prev
617-
self.func = func
618-
self.reuse = reuse
619-
self._jdstream_val = None
610+
self.func = lambda t, rdd: func(t, prev_func(t, rdd))
611+
self.prev = prev.prev
612+
else:
613+
self.prev = prev
614+
self.func = func
620615

621616
@property
622617
def _jdstream(self):
623618
if self._jdstream_val is not None:
624619
return self._jdstream_val
625620

626621
jfunc = TransformFunction(self.ctx, self.func, self.prev._jrdd_deserializer)
627-
jdstream = self.ctx._jvm.PythonTransformedDStream(self.prev._jdstream.dstream(),
628-
jfunc, self.reuse).asJavaDStream()
629-
self._jdstream_val = jdstream
630-
return jdstream
622+
dstream = self.ctx._jvm.PythonTransformedDStream(self.prev._jdstream.dstream(), jfunc)
623+
self._jdstream_val = dstream.asJavaDStream()
624+
return self._jdstream_val

python/pyspark/streaming/tests.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -504,7 +504,7 @@ def setup():
504504
return ssc
505505

506506
cpd = tempfile.mkdtemp("test_streaming_cps")
507-
ssc = StreamingContext.getOrCreate(cpd, setup)
507+
self.ssc = ssc = StreamingContext.getOrCreate(cpd, setup)
508508
ssc.start()
509509

510510
def check_output(n):
@@ -539,7 +539,7 @@ def check_output(n):
539539
ssc.stop(True, True)
540540

541541
time.sleep(1)
542-
ssc = StreamingContext.getOrCreate(cpd, setup)
542+
self.ssc = ssc = StreamingContext.getOrCreate(cpd, setup)
543543
ssc.start()
544544
check_output(3)
545545

streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala

Lines changed: 7 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -157,43 +157,18 @@ private[python] abstract class PythonDStream(
157157

158158
/**
159159
* Transformed DStream in Python.
160-
*
161-
* If `reuse` is true and the result of the `func` is an PythonRDD, then it will cache it
162-
* as an template for future use, this can reduce the Python callbacks.
163160
*/
164161
private[python] class PythonTransformedDStream (
165162
parent: DStream[_],
166-
@transient pfunc: PythonTransformFunction,
167-
var reuse: Boolean = false)
163+
@transient pfunc: PythonTransformFunction)
168164
extends PythonDStream(parent, pfunc) {
169165

170-
// rdd returned by func
171-
var lastResult: PythonRDD = _
172-
173166
override def compute(validTime: Time): Option[RDD[Array[Byte]]] = {
174167
val rdd = parent.getOrCompute(validTime)
175-
if (rdd.isEmpty) {
176-
return None
177-
}
178-
if (reuse && lastResult != null) {
179-
// use the previous result as the template to generate new RDD
180-
Some(lastResult.copyTo(rdd.get))
168+
if (rdd.isDefined) {
169+
func(rdd, validTime)
181170
} else {
182-
val r = func(rdd, validTime)
183-
if (reuse && r.isDefined && lastResult == null) {
184-
// try to use the result as a template
185-
r.get match {
186-
case pyrdd: PythonRDD =>
187-
if (pyrdd.firstParent == rdd) {
188-
// only one PythonRDD
189-
lastResult = pyrdd
190-
} else {
191-
// maybe have multiple stages, don't check it anymore
192-
reuse = false
193-
}
194-
}
195-
}
196-
r
171+
None
197172
}
198173
}
199174
}
@@ -209,18 +184,18 @@ private[python] class PythonTransformed2DStream(
209184

210185
val func = new TransformFunction(pfunc)
211186

212-
override def slideDuration: Duration = parent.slideDuration
213-
214187
override def dependencies = List(parent, parent2)
215188

189+
override def slideDuration: Duration = parent.slideDuration
190+
216191
override def compute(validTime: Time): Option[RDD[Array[Byte]]] = {
217192
val empty: RDD[_] = ssc.sparkContext.emptyRDD
218193
val rdd1 = parent.getOrCompute(validTime).getOrElse(empty)
219194
val rdd2 = parent2.getOrCompute(validTime).getOrElse(empty)
220195
func(Some(rdd1), Some(rdd2), validTime)
221196
}
222197

223-
val asJavaDStream = JavaDStream.fromDStream(this)
198+
val asJavaDStream = JavaDStream.fromDStream(this)
224199
}
225200

226201
/**

0 commit comments

Comments (0)