Commit e00136b

committed
address comments
1 parent 069a94c commit e00136b

File tree

9 files changed
+245 -221 lines changed

core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala

Lines changed: 1 addition & 0 deletions
@@ -52,6 +52,7 @@ private[spark] class PythonRDD(
     accumulator: Accumulator[JList[Array[Byte]]])
   extends RDD[Array[Byte]](parent) {
 
+  // create a new PythonRDD with same Python setting but different parent.
   def copyTo(rdd: RDD[_]): PythonRDD = {
     new PythonRDD(rdd, command, envVars, pythonIncludes, preservePartitoning,
       pythonExec, broadcastVars, accumulator)
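
The added comment documents the `copyTo` helper, which re-wraps a different parent RDD with the same Python evaluation settings (command, environment, includes, interpreter, broadcasts, accumulator). A rough sketch of the pattern in Python terms; the class and field names here are hypothetical, not Spark's:

    # Hypothetical sketch of the "copy with a new parent" pattern behind
    # PythonRDD.copyTo; names are illustrative, not Spark's.
    class PyFuncDataset:
        def __init__(self, parent, command, env_vars, python_exec):
            self.parent = parent            # upstream dataset
            self.command = command          # serialized Python function
            self.env_vars = env_vars        # worker environment variables
            self.python_exec = python_exec  # interpreter used by workers

        def copy_to(self, new_parent):
            # Same Python settings, different parent.
            return PyFuncDataset(new_parent, self.command,
                                 self.env_vars, self.python_exec)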

python/pyspark/java_gateway.py

Lines changed: 0 additions & 1 deletion
@@ -23,7 +23,6 @@
 import platform
 from subprocess import Popen, PIPE
 from threading import Thread
-
 from py4j.java_gateway import java_import, JavaGateway, GatewayClient
 
 
python/pyspark/streaming/context.py

Lines changed: 21 additions & 11 deletions
@@ -31,6 +31,11 @@
 def _daemonize_callback_server():
     """
     Hack Py4J to daemonize callback server
+
+    The callback server's thread has daemon=False, so it will block the
+    driver from exiting if it is not shut down. The following code replaces
+    `start()` of CallbackServer with a new version that sets daemon=True
+    for this thread.
     """
     # TODO: create a patch for Py4J
     import socket
@@ -47,7 +52,6 @@ def start(self):
             1)
         try:
             self.server_socket.bind((self.address, self.port))
-            # self.port = self.server_socket.getsockname()[1]
         except Exception:
             msg = 'An error occurred while trying to start the callback server'
             logger.exception(msg)
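
The two hunks above sit inside the monkey-patch that the docstring describes. A minimal sketch of that patch, assuming (as the diff context shows) that `CallbackServer.start()` binds `self.server_socket` and runs `self.run` on a thread stored in `self.thread`; error handling is elided:

    import socket
    from threading import Thread

    from py4j.java_gateway import CallbackServer

    def _daemon_start(self):
        # Re-implementation of CallbackServer.start with one change: the
        # server thread is marked as a daemon before it starts, so it can
        # no longer keep the driver process alive at exit.
        self.server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        self.server_socket.bind((self.address, self.port))
        self.thread = Thread(target=self.run)
        self.thread.daemon = True
        self.thread.start()

    CallbackServer.start = _daemon_start

The whole method has to be replaced rather than wrapped, because `Thread.daemon` cannot be changed once the thread is running (doing so raises RuntimeError).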
@@ -63,19 +67,21 @@ def start(self):
 
 class StreamingContext(object):
     """
-    Main entry point for Spark Streaming functionality. A StreamingContext represents the
-    connection to a Spark cluster, and can be used to create L{DStream}s and
-    broadcast variables on that cluster.
+    Main entry point for Spark Streaming functionality. A StreamingContext
+    represents the connection to a Spark cluster, and can be used to create
+    L{DStream}s from various input sources. It can be from an existing L{SparkContext}.
+    After creating and transforming DStreams, the streaming computation can
+    be started and stopped using `context.start()` and `context.stop()`,
+    respectively. `context.awaitTermination()` allows the current thread
+    to wait for the termination of the context by `stop()` or by an exception.
     """
 
     def __init__(self, sparkContext, duration):
         """
-        Create a new StreamingContext. At least the master and app name and duration
-        should be set, either through the named parameters here or through C{conf}.
+        Create a new StreamingContext.
 
         @param sparkContext: L{SparkContext} object.
-        @param duration: seconds for SparkStreaming.
-
+        @param duration: number of seconds.
         """
         self._sc = sparkContext
         self._jvm = self._sc._jvm
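
The rewritten class docstring spells out the lifecycle: build DStreams, then `start()`, `awaitTermination()`, and `stop()`. A short usage sketch of that lifecycle; the local master, 1-second batch duration, and socket source are illustrative assumptions:

    from pyspark import SparkContext
    from pyspark.streaming import StreamingContext

    sc = SparkContext("local[2]", "NetworkWordCount")
    ssc = StreamingContext(sc, 1)  # 1-second batch duration

    # A socket text stream, chosen purely for illustration.
    lines = ssc.socketTextStream("localhost", 9999)
    counts = lines.flatMap(lambda line: line.split(" ")) \
                  .map(lambda word: (word, 1)) \
                  .reduceByKey(lambda a, b: a + b)
    counts.pprint()

    ssc.start()             # begin the streaming computation
    ssc.awaitTermination()  # block until stop() is called or an error occurs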
@@ -127,8 +133,12 @@ def awaitTermination(self, timeout=None):
 
     def stop(self, stopSparkContext=True, stopGraceFully=False):
         """
-        Stop the execution of the streams immediately (does not wait for all received data
-        to be processed).
+        Stop the execution of the streams, with the option of ensuring all
+        received data has been processed.
+
+        @param stopSparkContext: Stop the associated SparkContext or not
+        @param stopGraceFully: Stop gracefully by waiting for the processing
+                               of all received data to be completed
         """
         self._jssc.stop(stopSparkContext, stopGraceFully)
         if stopSparkContext:
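
A one-line usage sketch of the graceful-stop option the new docstring documents, continuing the `ssc` example above:

    # Wait for all received data to be processed, but keep the SparkContext
    # so it can be reused for further jobs.
    ssc.stop(stopSparkContext=False, stopGraceFully=True)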
@@ -140,7 +150,7 @@ def remember(self, duration):
         in the last given duration. DStreams remember RDDs only for a
         limited duration of time and releases them for garbage collection.
         This method allows the developer to specify how long to remember
-        the RDDs ( if the developer wishes to query old data outside the
+        the RDDs (if the developer wishes to query old data outside the
         DStream computation).
 
         @param duration Minimum duration (in seconds) that each DStream
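
Continuing the same example, `remember` widens the window during which each batch's RDDs stay available; the 60-second figure is arbitrary:

    # Keep generated RDDs for at least 60 seconds, so recent batches can
    # still be queried outside the DStream computation.
    ssc.remember(60)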
