 from pyspark.files import SparkFiles
 from pyspark.java_gateway import launch_gateway
 from pyspark.serializers import PickleSerializer, BatchedSerializer, UTF8Deserializer, \
-        PairDeserializer
+    PairDeserializer
 from pyspark.storagelevel import StorageLevel
 from pyspark import rdd
 from pyspark.rdd import RDD
@@ -50,12 +50,11 @@ class SparkContext(object):
     _next_accum_id = 0
     _active_spark_context = None
     _lock = Lock()
-    _python_includes = None # zip and egg files that need to be added to PYTHONPATH
-
+    _python_includes = None  # zip and egg files that need to be added to PYTHONPATH

     def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None,
-        environment=None, batchSize=1024, serializer=PickleSerializer(), conf=None,
-        gateway=None):
+                 environment=None, batchSize=1024, serializer=PickleSerializer(), conf=None,
+                 gateway=None):
         """
         Create a new SparkContext. At least the master and app name should be set,
         either through the named parameters here or through C{conf}.
@@ -138,8 +137,8 @@ def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None,
         self._accumulatorServer = accumulators._start_update_server()
         (host, port) = self._accumulatorServer.server_address
         self._javaAccumulator = self._jsc.accumulator(
-                self._jvm.java.util.ArrayList(),
-                self._jvm.PythonAccumulatorParam(host, port))
+            self._jvm.java.util.ArrayList(),
+            self._jvm.PythonAccumulatorParam(host, port))

         self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')

@@ -165,7 +164,7 @@ def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None,
                 (dirname, filename) = os.path.split(path)
                 self._python_includes.append(filename)
                 sys.path.append(path)
-                if not dirname in sys.path:
+                if dirname not in sys.path:
                     sys.path.append(dirname)

         # Create a temporary directory inside spark.local.dir:
@@ -192,15 +191,19 @@ def _ensure_initialized(cls, instance=None, gateway=None):
                 SparkContext._writeToFile = SparkContext._jvm.PythonRDD.writeToFile

             if instance:
-                if SparkContext._active_spark_context and SparkContext._active_spark_context != instance:
+                if (SparkContext._active_spark_context and
+                        SparkContext._active_spark_context != instance):
                     currentMaster = SparkContext._active_spark_context.master
                     currentAppName = SparkContext._active_spark_context.appName
                     callsite = SparkContext._active_spark_context._callsite

                     # Raise error if there is already a running Spark context
-                    raise ValueError("Cannot run multiple SparkContexts at once; existing SparkContext(app=%s, master=%s)" \
-                        " created by %s at %s:%s " \
-                        % (currentAppName, currentMaster, callsite.function, callsite.file, callsite.linenum))
+                    raise ValueError(
+                        "Cannot run multiple SparkContexts at once; "
+                        "existing SparkContext(app=%s, master=%s)"
+                        " created by %s at %s:%s "
+                        % (currentAppName, currentMaster,
+                            callsite.function, callsite.file, callsite.linenum))
                 else:
                     SparkContext._active_spark_context = instance

@@ -290,7 +293,7 @@ def textFile(self, name, minPartitions=None):
         Read a text file from HDFS, a local file system (available on all
         nodes), or any Hadoop-supported file system URI, and return it as an
         RDD of Strings.
-        
+
         >>> path = os.path.join(tempdir, "sample-text.txt")
         >>> with open(path, "w") as testFile:
         ...    testFile.write("Hello world!")
@@ -584,11 +587,12 @@ def addPyFile(self, path):
         HTTP, HTTPS or FTP URI.
         """
         self.addFile(path)
-        (dirname, filename) = os.path.split(path) # dirname may be directory or HDFS/S3 prefix
+        (dirname, filename) = os.path.split(path)  # dirname may be directory or HDFS/S3 prefix

         if filename.endswith('.zip') or filename.endswith('.ZIP') or filename.endswith('.egg'):
             self._python_includes.append(filename)
-            sys.path.append(os.path.join(SparkFiles.getRootDirectory(), filename)) # for tests in local mode
+            # for tests in local mode
+            sys.path.append(os.path.join(SparkFiles.getRootDirectory(), filename))

     def setCheckpointDir(self, dirName):
         """
@@ -649,9 +653,9 @@ def setJobGroup(self, groupId, description, interruptOnCancel=False):
         Cancelled

         If interruptOnCancel is set to true for the job group, then job cancellation will result
-        in Thread.interrupt() being called on the job's executor threads. This is useful to help ensure
-        that the tasks are actually stopped in a timely manner, but is off by default due to HDFS-1208,
-        where HDFS may respond to Thread.interrupt() by marking nodes as dead.
+        in Thread.interrupt() being called on the job's executor threads. This is useful to help
+        ensure that the tasks are actually stopped in a timely manner, but is off by default due
+        to HDFS-1208, where HDFS may respond to Thread.interrupt() by marking nodes as dead.
         """
         self._jsc.setJobGroup(groupId, description, interruptOnCancel)

@@ -688,7 +692,7 @@ def cancelAllJobs(self):
         """
         self._jsc.sc().cancelAllJobs()

-    def runJob(self, rdd, partitionFunc, partitions = None, allowLocal = False):
+    def runJob(self, rdd, partitionFunc, partitions=None, allowLocal=False):
         """
         Executes the given partitionFunc on the specified set of partitions,
         returning the result as an array of elements.
@@ -703,7 +707,7 @@ def runJob(self, rdd, partitionFunc, partitions = None, allowLocal = False):
         >>> sc.runJob(myRDD, lambda part: [x * x for x in part], [0, 2], True)
         [0, 1, 16, 25]
         """
-        if partitions == None:
+        if partitions is None:
             partitions = range(rdd._jrdd.partitions().size())
         javaPartitions = ListConverter().convert(partitions, self._gateway._gateway_client)

@@ -714,6 +718,7 @@ def runJob(self, rdd, partitionFunc, partitions = None, allowLocal = False):
         it = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, javaPartitions, allowLocal)
         return list(mappedRDD._collect_iterator_through_file(it))

+
 def _test():
     import atexit
     import doctest
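Note on the change from "if partitions == None" to "if partitions is None" above: comparing to None with == defers to the left operand's __eq__, while "is" checks object identity. A minimal standalone sketch, not part of the patch and using a made-up class name, illustrating why the identity test is the safer sentinel check:

class AlwaysEqual(object):
    """Hypothetical value whose __eq__ claims equality with everything."""
    def __eq__(self, other):
        return True

val = AlwaysEqual()
print(val == None)   # True: a custom __eq__ can report equality with None
print(val is None)   # False: identity cannot be overridden, so this is the reliable check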
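For reference, the runJob doctest shown in the hunk above corresponds to roughly the following driver-side usage. This is a sketch that assumes a working local Spark installation; the app name is arbitrary:

from pyspark import SparkContext

sc = SparkContext("local", "runJob-example")   # arbitrary app name, local master
my_rdd = sc.parallelize(range(6), 3)           # three partitions: [0, 1], [2, 3], [4, 5]

# Square the elements of partitions 0 and 2 only; the final True allows local execution.
result = sc.runJob(my_rdd, lambda part: [x * x for x in part], [0, 2], True)
print(result)                                  # [0, 1, 16, 25]

sc.stop()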