
Commit 5d16d5b

nchammas authored and rxin committed
[SPARK-2470] PEP8 fixes to PySpark
This pull request aims to resolve all outstanding PEP8 violations in PySpark.

Author: Nicholas Chammas <nicholas.chammas@gmail.com>
Author: nchammas <nicholas.chammas@gmail.com>

Closes apache#1505 from nchammas/master and squashes the following commits:

98171af [Nicholas Chammas] [SPARK-2470] revert PEP 8 fixes to cloudpickle
cba7768 [Nicholas Chammas] [SPARK-2470] wrap expression list in parentheses
e178dbe [Nicholas Chammas] [SPARK-2470] style - change position of line break
9127d2b [Nicholas Chammas] [SPARK-2470] wrap expression lists in parentheses
22132a4 [Nicholas Chammas] [SPARK-2470] wrap conditionals in parentheses
24639bc [Nicholas Chammas] [SPARK-2470] fix whitespace for doctest
7d557b7 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to tests.py
8f8e4c0 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to storagelevel.py
b3b96cf [Nicholas Chammas] [SPARK-2470] PEP8 fixes to statcounter.py
d644477 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to worker.py
aa3a7b6 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to sql.py
1916859 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to shell.py
95d1d95 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to serializers.py
a0fec2e [Nicholas Chammas] [SPARK-2470] PEP8 fixes to mllib
c85e1e5 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to join.py
d14f2f1 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to __init__.py
81fcb20 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to resultiterable.py
1bde265 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to java_gateway.py
7fc849c [Nicholas Chammas] [SPARK-2470] PEP8 fixes to daemon.py
ca2d28b [Nicholas Chammas] [SPARK-2470] PEP8 fixes to context.py
f4e0039 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to conf.py
a6d5e4b [Nicholas Chammas] [SPARK-2470] PEP8 fixes to cloudpickle.py
f0a7ebf [Nicholas Chammas] [SPARK-2470] PEP8 fixes to rddsampler.py
4dd148f [nchammas] Merge pull request apache#5 from apache/master
f7e4581 [Nicholas Chammas] unrelated pep8 fix
a36eed0 [Nicholas Chammas] name ec2 instances and security groups consistently
de7292a [nchammas] Merge pull request apache#4 from apache/master
2e4fe00 [nchammas] Merge pull request #3 from apache/master
89fde08 [nchammas] Merge pull request #2 from apache/master
69f6e22 [Nicholas Chammas] PEP8 fixes
2627247 [Nicholas Chammas] broke up lines before they hit 100 chars
6544b7e [Nicholas Chammas] [SPARK-2065] give launched instances names
69da6cf [nchammas] Merge pull request #1 from apache/master
1 parent c3462c6 commit 5d16d5b

18 files changed (+127, -97 lines)
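Note: all of the hunks below are mechanical style cleanups. As a point of reference, this kind of check can be reproduced locally with the `pep8` package (later renamed `pycodestyle`); the sketch below is an assumption for illustration, not part of this commit, and the 100-character limit is inferred only from the "broke up lines before they hit 100 chars" commit listed above.

# Sketch only (not from this commit): report PEP 8 violations under python/pyspark.
# The 100-character line limit is an assumption based on the commit history above.
import pep8

checker = pep8.StyleGuide(max_line_length=100)
report = checker.check_files(["python/pyspark"])
print("PEP 8 violations found: %d" % report.total_errors)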

python/pyspark/__init__.py (+2, -1)
@@ -59,4 +59,5 @@
 from pyspark.storagelevel import StorageLevel


-__all__ = ["SparkConf", "SparkContext", "SQLContext", "RDD", "SchemaRDD", "SparkFiles", "StorageLevel", "Row"]
+__all__ = ["SparkConf", "SparkContext", "SQLContext", "RDD", "SchemaRDD",
+           "SparkFiles", "StorageLevel", "Row"]

python/pyspark/conf.py (+5, -4)
@@ -50,7 +50,8 @@
 spark.executorEnv.VAR4=value4
 spark.home=/path
 >>> sorted(conf.getAll(), key=lambda p: p[0])
-[(u'spark.executorEnv.VAR1', u'value1'), (u'spark.executorEnv.VAR3', u'value3'), (u'spark.executorEnv.VAR4', u'value4'), (u'spark.home', u'/path')]
+[(u'spark.executorEnv.VAR1', u'value1'), (u'spark.executorEnv.VAR3', u'value3'), \
+(u'spark.executorEnv.VAR4', u'value4'), (u'spark.home', u'/path')]
 """


@@ -118,9 +119,9 @@ def setExecutorEnv(self, key=None, value=None, pairs=None):
         """Set an environment variable to be passed to executors."""
         if (key is not None and pairs is not None) or (key is None and pairs is None):
             raise Exception("Either pass one key-value pair or a list of pairs")
-        elif key != None:
+        elif key is not None:
             self._jconf.setExecutorEnv(key, value)
-        elif pairs != None:
+        elif pairs is not None:
             for (k, v) in pairs:
                 self._jconf.setExecutorEnv(k, v)
         return self
@@ -137,7 +138,7 @@ def setAll(self, pairs):

     def get(self, key, defaultValue=None):
         """Get the configured value for some key, or return a default otherwise."""
-        if defaultValue == None:  # Py4J doesn't call the right get() if we pass None
+        if defaultValue is None:  # Py4J doesn't call the right get() if we pass None
             if not self._jconf.contains(key):
                 return None
             return self._jconf.get(key)
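Note: the `== None` / `!= None` comparisons replaced above are what pep8 flags as E711. A short illustration of why PEP 8 prefers identity checks against None (names invented for the example, not code from this commit):

# Illustration only: "==" can be redefined by __eq__, "is" cannot.
class AlwaysEqual(object):
    def __eq__(self, other):
        return True  # pathological, but legal

value = AlwaysEqual()
print(value == None)   # True, even though value is not None
print(value is None)   # False: identity comparison is unambiguous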

python/pyspark/context.py (+25, -20)
@@ -29,7 +29,7 @@
 from pyspark.files import SparkFiles
 from pyspark.java_gateway import launch_gateway
 from pyspark.serializers import PickleSerializer, BatchedSerializer, UTF8Deserializer, \
-        PairDeserializer
+    PairDeserializer
 from pyspark.storagelevel import StorageLevel
 from pyspark import rdd
 from pyspark.rdd import RDD
@@ -50,12 +50,11 @@ class SparkContext(object):
     _next_accum_id = 0
     _active_spark_context = None
     _lock = Lock()
-    _python_includes = None # zip and egg files that need to be added to PYTHONPATH
-
+    _python_includes = None  # zip and egg files that need to be added to PYTHONPATH

     def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None,
-            environment=None, batchSize=1024, serializer=PickleSerializer(), conf=None,
-            gateway=None):
+                 environment=None, batchSize=1024, serializer=PickleSerializer(), conf=None,
+                 gateway=None):
         """
         Create a new SparkContext. At least the master and app name should be set,
         either through the named parameters here or through C{conf}.
@@ -138,8 +137,8 @@ def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None,
         self._accumulatorServer = accumulators._start_update_server()
         (host, port) = self._accumulatorServer.server_address
         self._javaAccumulator = self._jsc.accumulator(
-                self._jvm.java.util.ArrayList(),
-                self._jvm.PythonAccumulatorParam(host, port))
+            self._jvm.java.util.ArrayList(),
+            self._jvm.PythonAccumulatorParam(host, port))

         self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')

@@ -165,7 +164,7 @@ def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None,
             (dirname, filename) = os.path.split(path)
             self._python_includes.append(filename)
             sys.path.append(path)
-            if not dirname in sys.path:
+            if dirname not in sys.path:
                 sys.path.append(dirname)

         # Create a temporary directory inside spark.local.dir:
@@ -192,15 +191,19 @@ def _ensure_initialized(cls, instance=None, gateway=None):
                 SparkContext._writeToFile = SparkContext._jvm.PythonRDD.writeToFile

             if instance:
-                if SparkContext._active_spark_context and SparkContext._active_spark_context != instance:
+                if (SparkContext._active_spark_context and
+                        SparkContext._active_spark_context != instance):
                     currentMaster = SparkContext._active_spark_context.master
                     currentAppName = SparkContext._active_spark_context.appName
                     callsite = SparkContext._active_spark_context._callsite

                     # Raise error if there is already a running Spark context
-                    raise ValueError("Cannot run multiple SparkContexts at once; existing SparkContext(app=%s, master=%s)" \
-                        " created by %s at %s:%s " \
-                        % (currentAppName, currentMaster, callsite.function, callsite.file, callsite.linenum))
+                    raise ValueError(
+                        "Cannot run multiple SparkContexts at once; "
+                        "existing SparkContext(app=%s, master=%s)"
+                        " created by %s at %s:%s "
+                        % (currentAppName, currentMaster,
+                           callsite.function, callsite.file, callsite.linenum))
                 else:
                     SparkContext._active_spark_context = instance
@@ -290,7 +293,7 @@ def textFile(self, name, minPartitions=None):
         Read a text file from HDFS, a local file system (available on all
         nodes), or any Hadoop-supported file system URI, and return it as an
         RDD of Strings.
-
+
         >>> path = os.path.join(tempdir, "sample-text.txt")
         >>> with open(path, "w") as testFile:
         ...    testFile.write("Hello world!")
@@ -584,11 +587,12 @@ def addPyFile(self, path):
         HTTP, HTTPS or FTP URI.
         """
        self.addFile(path)
-        (dirname, filename) = os.path.split(path) # dirname may be directory or HDFS/S3 prefix
+        (dirname, filename) = os.path.split(path)  # dirname may be directory or HDFS/S3 prefix

         if filename.endswith('.zip') or filename.endswith('.ZIP') or filename.endswith('.egg'):
             self._python_includes.append(filename)
-            sys.path.append(os.path.join(SparkFiles.getRootDirectory(), filename)) # for tests in local mode
+            # for tests in local mode
+            sys.path.append(os.path.join(SparkFiles.getRootDirectory(), filename))

     def setCheckpointDir(self, dirName):
         """
@@ -649,9 +653,9 @@ def setJobGroup(self, groupId, description, interruptOnCancel=False):
         Cancelled

         If interruptOnCancel is set to true for the job group, then job cancellation will result
-        in Thread.interrupt() being called on the job's executor threads. This is useful to help ensure
-        that the tasks are actually stopped in a timely manner, but is off by default due to HDFS-1208,
-        where HDFS may respond to Thread.interrupt() by marking nodes as dead.
+        in Thread.interrupt() being called on the job's executor threads. This is useful to help
+        ensure that the tasks are actually stopped in a timely manner, but is off by default due
+        to HDFS-1208, where HDFS may respond to Thread.interrupt() by marking nodes as dead.
         """
         self._jsc.setJobGroup(groupId, description, interruptOnCancel)

@@ -688,7 +692,7 @@ def cancelAllJobs(self):
         """
         self._jsc.sc().cancelAllJobs()

-    def runJob(self, rdd, partitionFunc, partitions = None, allowLocal = False):
+    def runJob(self, rdd, partitionFunc, partitions=None, allowLocal=False):
         """
         Executes the given partitionFunc on the specified set of partitions,
         returning the result as an array of elements.
@@ -703,7 +707,7 @@ def runJob(self, rdd, partitionFunc, partitions=None, allowLocal=False):
         >>> sc.runJob(myRDD, lambda part: [x * x for x in part], [0, 2], True)
         [0, 1, 16, 25]
         """
-        if partitions == None:
+        if partitions is None:
             partitions = range(rdd._jrdd.partitions().size())
         javaPartitions = ListConverter().convert(partitions, self._gateway._gateway_client)

@@ -714,6 +718,7 @@ def runJob(self, rdd, partitionFunc, partitions=None, allowLocal=False):
         it = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, javaPartitions, allowLocal)
         return list(mappedRDD._collect_iterator_through_file(it))

+
 def _test():
     import atexit
     import doctest
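Note: several context.py hunks replace backslash continuations and over-long lines with parenthesized expressions, the continuation style PEP 8 recommends. A small illustration of the two styles (invented names, not code from this commit):

# Illustration only: wrapping a long expression in parentheses instead of
# continuing it with backslashes.
app_name = "example-app"
master = "local[2]"

# Backslash continuation (the style being removed):
message = "Cannot run multiple SparkContexts at once; " \
          "existing SparkContext(app=%s, master=%s)" % (app_name, master)

# Parenthesized continuation (the style being introduced):
message = ("Cannot run multiple SparkContexts at once; "
           "existing SparkContext(app=%s, master=%s)" % (app_name, master))

print(message)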

python/pyspark/daemon.py (+6, -6)
@@ -42,12 +42,12 @@ def should_exit():


 def compute_real_exit_code(exit_code):
-  # SystemExit's code can be integer or string, but os._exit only accepts integers
-  import numbers
-  if isinstance(exit_code, numbers.Integral):
-    return exit_code
-  else:
-    return 1
+    # SystemExit's code can be integer or string, but os._exit only accepts integers
+    import numbers
+    if isinstance(exit_code, numbers.Integral):
+        return exit_code
+    else:
+        return 1


 def worker(listen_sock):

python/pyspark/java_gateway.py (+1)
@@ -24,6 +24,7 @@
 from threading import Thread
 from py4j.java_gateway import java_import, JavaGateway, GatewayClient

+
 def launch_gateway():
     SPARK_HOME = os.environ["SPARK_HOME"]

python/pyspark/join.py (+3, -1)
@@ -33,10 +33,11 @@

 from pyspark.resultiterable import ResultIterable

+
 def _do_python_join(rdd, other, numPartitions, dispatch):
     vs = rdd.map(lambda (k, v): (k, (1, v)))
     ws = other.map(lambda (k, v): (k, (2, v)))
-    return vs.union(ws).groupByKey(numPartitions).flatMapValues(lambda x : dispatch(x.__iter__()))
+    return vs.union(ws).groupByKey(numPartitions).flatMapValues(lambda x: dispatch(x.__iter__()))


 def python_join(rdd, other, numPartitions):
@@ -85,6 +86,7 @@ def make_mapper(i):
     vrdds = [rdd.map(make_mapper(i)) for i, rdd in enumerate(rdds)]
     union_vrdds = reduce(lambda acc, other: acc.union(other), vrdds)
     rdd_len = len(vrdds)
+
     def dispatch(seq):
         bufs = [[] for i in range(rdd_len)]
         for (n, v) in seq:

python/pyspark/mllib/_common.py (+3, -1)
@@ -164,7 +164,7 @@ def _deserialize_double_vector(ba, offset=0):
     nb = len(ba) - offset
     if nb < 5:
         raise TypeError("_deserialize_double_vector called on a %d-byte array, "
-                "which is too short" % nb)
+                        "which is too short" % nb)
     if ba[offset] == DENSE_VECTOR_MAGIC:
         return _deserialize_dense_vector(ba, offset)
     elif ba[offset] == SPARSE_VECTOR_MAGIC:
@@ -272,6 +272,7 @@ def _serialize_labeled_point(p):
     header_float[0] = p.label
     return header + serialized_features

+
 def _deserialize_labeled_point(ba, offset=0):
     """Deserialize a LabeledPoint from a mutually understood format."""
     from pyspark.mllib.regression import LabeledPoint
@@ -283,6 +284,7 @@ def _deserialize_labeled_point(ba, offset=0):
     features = _deserialize_double_vector(ba, offset + 9)
     return LabeledPoint(label, features)

+
 def _copyto(array, buffer, offset, shape, dtype):
     """
     Copy the contents of a vector to a destination bytearray at the

python/pyspark/mllib/linalg.py (+1)
@@ -247,6 +247,7 @@ def stringify(vector):
         else:
             return "[" + ",".join([str(v) for v in vector]) + "]"

+
 def _test():
     import doctest
     (failure_count, test_count) = doctest.testmod(optionflags=doctest.ELLIPSIS)

python/pyspark/mllib/util.py (-2)
@@ -24,7 +24,6 @@
 from pyspark.serializers import NoOpSerializer


-
 class MLUtils:
     """
     Helper methods to load, save and pre-process data used in MLlib.
@@ -154,7 +153,6 @@ def saveAsLibSVMFile(data, dir):
         lines = data.map(lambda p: MLUtils._convert_labeled_point_to_libsvm(p))
         lines.saveAsTextFile(dir)

-
     @staticmethod
     def loadLabeledPoints(sc, path, minPartitions=None):
         """

python/pyspark/rddsampler.py (+12, -12)
@@ -18,13 +18,16 @@
 import sys
 import random

+
 class RDDSampler(object):
     def __init__(self, withReplacement, fraction, seed=None):
         try:
             import numpy
             self._use_numpy = True
         except ImportError:
-            print >> sys.stderr, "NumPy does not appear to be installed. Falling back to default random generator for sampling."
+            print >> sys.stderr, (
+                "NumPy does not appear to be installed. "
+                "Falling back to default random generator for sampling.")
             self._use_numpy = False

         self._seed = seed if seed is not None else random.randint(0, sys.maxint)
@@ -61,7 +64,7 @@ def getUniformSample(self, split):
     def getPoissonSample(self, split, mean):
         if not self._rand_initialized or split != self._split:
             self.initRandomGenerator(split)
-
+
         if self._use_numpy:
             return self._random.poisson(mean)
         else:
@@ -80,30 +83,27 @@ def getPoissonSample(self, split, mean):
             num_arrivals += 1

         return (num_arrivals - 1)
-
+
     def shuffle(self, vals):
         if self._random is None:
             self.initRandomGenerator(0)  # this should only ever called on the master so
                                          # the split does not matter
-
+
         if self._use_numpy:
             self._random.shuffle(vals)
         else:
             self._random.shuffle(vals, self._random.random)

     def func(self, split, iterator):
-        if self._withReplacement:
+        if self._withReplacement:
             for obj in iterator:
-                # For large datasets, the expected number of occurrences of each element in a sample with
-                # replacement is Poisson(frac). We use that to get a count for each element.
-                count = self.getPoissonSample(split, mean = self._fraction)
+                # For large datasets, the expected number of occurrences of each element in
+                # a sample with replacement is Poisson(frac). We use that to get a count for
+                # each element.
+                count = self.getPoissonSample(split, mean=self._fraction)
                 for _ in range(0, count):
                     yield obj
         else:
             for obj in iterator:
                 if self.getUniformSample(split) <= self._fraction:
                     yield obj
-
-
-
-

python/pyspark/resultiterable.py (+3)
@@ -19,6 +19,7 @@

 import collections

+
 class ResultIterable(collections.Iterable):
     """
     A special result iterable. This is used because the standard iterator can not be pickled
@@ -27,7 +28,9 @@ def __init__(self, data):
         self.data = data
         self.index = 0
         self.maxindex = len(data)
+
     def __iter__(self):
         return iter(self.data)
+
     def __len__(self):
         return len(self.data)
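Note: most of the remaining additions in this commit are blank lines before top-level and nested definitions, which pep8 reports as E302 and E301. A compact illustration of the expected spacing (example names only, not code from this commit):

# Illustration only: PEP 8 blank-line conventions enforced by E302/E301.
def load_data():
    return [1, 2, 3]


# Two blank lines before each top-level definition (E302).
class ExampleContainer(object):
    def __init__(self, data):
        self.data = data

    # One blank line between methods inside a class (E301).
    def __len__(self):
        return len(self.data)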
