Skip to content

Commit 3134c3f

Browse files
committed
[SPARK-6953] [PySpark] speed up python tests
This PR tries to speed up some python tests: ``` tests.py 144s -> 103s -41s mllib/classification.py 24s -> 17s -7s mllib/regression.py 27s -> 15s -12s mllib/tree.py 27s -> 13s -14s mllib/tests.py 64s -> 31s -33s streaming/tests.py 185s -> 84s -101s ``` Considering python3, the total savings will be 558s (almost 10 minutes) (core, and streaming run three times, mllib runs twice). During testing, it will show the time used for each test file: ``` Run core tests ... Running test: pyspark/rdd.py ... ok (22s) Running test: pyspark/context.py ... ok (16s) Running test: pyspark/conf.py ... ok (4s) Running test: pyspark/broadcast.py ... ok (4s) Running test: pyspark/accumulators.py ... ok (4s) Running test: pyspark/serializers.py ... ok (6s) Running test: pyspark/profiler.py ... ok (5s) Running test: pyspark/shuffle.py ... ok (1s) Running test: pyspark/tests.py ... ok (103s) 144s ``` Author: Reynold Xin <rxin@databricks.com> Author: Xiangrui Meng <meng@databricks.com> Closes #5605 from rxin/python-tests-speed and squashes the following commits: d08542d [Reynold Xin] Merge pull request #14 from mengxr/SPARK-6953 89321ee [Xiangrui Meng] fix seed in tests 3ad2387 [Reynold Xin] Merge pull request #5427 from davies/python_tests
1 parent e72c16e commit 3134c3f

File tree

9 files changed

+182
-127
lines changed

9 files changed

+182
-127
lines changed

python/pyspark/mllib/classification.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ class LogisticRegressionModel(LinearClassificationModel):
8686
... LabeledPoint(0.0, [0.0, 1.0]),
8787
... LabeledPoint(1.0, [1.0, 0.0]),
8888
... ]
89-
>>> lrm = LogisticRegressionWithSGD.train(sc.parallelize(data))
89+
>>> lrm = LogisticRegressionWithSGD.train(sc.parallelize(data), iterations=10)
9090
>>> lrm.predict([1.0, 0.0])
9191
1
9292
>>> lrm.predict([0.0, 1.0])
@@ -95,15 +95,15 @@ class LogisticRegressionModel(LinearClassificationModel):
9595
[1, 0]
9696
>>> lrm.clearThreshold()
9797
>>> lrm.predict([0.0, 1.0])
98-
0.123...
98+
0.279...
9999
100100
>>> sparse_data = [
101101
... LabeledPoint(0.0, SparseVector(2, {0: 0.0})),
102102
... LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
103103
... LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
104104
... LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
105105
... ]
106-
>>> lrm = LogisticRegressionWithSGD.train(sc.parallelize(sparse_data))
106+
>>> lrm = LogisticRegressionWithSGD.train(sc.parallelize(sparse_data), iterations=10)
107107
>>> lrm.predict(array([0.0, 1.0]))
108108
1
109109
>>> lrm.predict(array([1.0, 0.0]))
@@ -129,7 +129,8 @@ class LogisticRegressionModel(LinearClassificationModel):
129129
... LabeledPoint(1.0, [1.0, 0.0, 0.0]),
130130
... LabeledPoint(2.0, [0.0, 0.0, 1.0])
131131
... ]
132-
>>> mcm = LogisticRegressionWithLBFGS.train(data=sc.parallelize(multi_class_data), numClasses=3)
132+
>>> data = sc.parallelize(multi_class_data)
133+
>>> mcm = LogisticRegressionWithLBFGS.train(data, iterations=10, numClasses=3)
133134
>>> mcm.predict([0.0, 0.5, 0.0])
134135
0
135136
>>> mcm.predict([0.8, 0.0, 0.0])
@@ -298,7 +299,7 @@ def train(cls, data, iterations=100, initialWeights=None, regParam=0.01, regType
298299
... LabeledPoint(0.0, [0.0, 1.0]),
299300
... LabeledPoint(1.0, [1.0, 0.0]),
300301
... ]
301-
>>> lrm = LogisticRegressionWithLBFGS.train(sc.parallelize(data))
302+
>>> lrm = LogisticRegressionWithLBFGS.train(sc.parallelize(data), iterations=10)
302303
>>> lrm.predict([1.0, 0.0])
303304
1
304305
>>> lrm.predict([0.0, 1.0])
@@ -330,22 +331,22 @@ class SVMModel(LinearClassificationModel):
330331
... LabeledPoint(1.0, [2.0]),
331332
... LabeledPoint(1.0, [3.0])
332333
... ]
333-
>>> svm = SVMWithSGD.train(sc.parallelize(data))
334+
>>> svm = SVMWithSGD.train(sc.parallelize(data), iterations=10)
334335
>>> svm.predict([1.0])
335336
1
336337
>>> svm.predict(sc.parallelize([[1.0]])).collect()
337338
[1]
338339
>>> svm.clearThreshold()
339340
>>> svm.predict(array([1.0]))
340-
1.25...
341+
1.44...
341342
342343
>>> sparse_data = [
343344
... LabeledPoint(0.0, SparseVector(2, {0: -1.0})),
344345
... LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
345346
... LabeledPoint(0.0, SparseVector(2, {0: 0.0})),
346347
... LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
347348
... ]
348-
>>> svm = SVMWithSGD.train(sc.parallelize(sparse_data))
349+
>>> svm = SVMWithSGD.train(sc.parallelize(sparse_data), iterations=10)
349350
>>> svm.predict(SparseVector(2, {1: 1.0}))
350351
1
351352
>>> svm.predict(SparseVector(2, {0: -1.0}))

python/pyspark/mllib/regression.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,8 @@ class LinearRegressionModel(LinearRegressionModelBase):
108108
... LabeledPoint(3.0, [2.0]),
109109
... LabeledPoint(2.0, [3.0])
110110
... ]
111-
>>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initialWeights=np.array([1.0]))
111+
>>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10,
112+
... initialWeights=np.array([1.0]))
112113
>>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
113114
True
114115
>>> abs(lrm.predict(np.array([1.0])) - 1) < 0.5
@@ -135,12 +136,13 @@ class LinearRegressionModel(LinearRegressionModelBase):
135136
... LabeledPoint(3.0, SparseVector(1, {0: 2.0})),
136137
... LabeledPoint(2.0, SparseVector(1, {0: 3.0}))
137138
... ]
138-
>>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initialWeights=array([1.0]))
139+
>>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10,
140+
... initialWeights=array([1.0]))
139141
>>> abs(lrm.predict(array([0.0])) - 0) < 0.5
140142
True
141143
>>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
142144
True
143-
>>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=100, step=1.0,
145+
>>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10, step=1.0,
144146
... miniBatchFraction=1.0, initialWeights=array([1.0]), regParam=0.1, regType="l2",
145147
... intercept=True, validateData=True)
146148
>>> abs(lrm.predict(array([0.0])) - 0) < 0.5
@@ -238,7 +240,7 @@ class LassoModel(LinearRegressionModelBase):
238240
... LabeledPoint(3.0, [2.0]),
239241
... LabeledPoint(2.0, [3.0])
240242
... ]
241-
>>> lrm = LassoWithSGD.train(sc.parallelize(data), initialWeights=array([1.0]))
243+
>>> lrm = LassoWithSGD.train(sc.parallelize(data), iterations=10, initialWeights=array([1.0]))
242244
>>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
243245
True
244246
>>> abs(lrm.predict(np.array([1.0])) - 1) < 0.5
@@ -265,12 +267,13 @@ class LassoModel(LinearRegressionModelBase):
265267
... LabeledPoint(3.0, SparseVector(1, {0: 2.0})),
266268
... LabeledPoint(2.0, SparseVector(1, {0: 3.0}))
267269
... ]
268-
>>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initialWeights=array([1.0]))
270+
>>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10,
271+
... initialWeights=array([1.0]))
269272
>>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
270273
True
271274
>>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
272275
True
273-
>>> lrm = LassoWithSGD.train(sc.parallelize(data), iterations=100, step=1.0,
276+
>>> lrm = LassoWithSGD.train(sc.parallelize(data), iterations=10, step=1.0,
274277
... regParam=0.01, miniBatchFraction=1.0, initialWeights=array([1.0]), intercept=True,
275278
... validateData=True)
276279
>>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
@@ -321,7 +324,8 @@ class RidgeRegressionModel(LinearRegressionModelBase):
321324
... LabeledPoint(3.0, [2.0]),
322325
... LabeledPoint(2.0, [3.0])
323326
... ]
324-
>>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), initialWeights=array([1.0]))
327+
>>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), iterations=10,
328+
... initialWeights=array([1.0]))
325329
>>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
326330
True
327331
>>> abs(lrm.predict(np.array([1.0])) - 1) < 0.5
@@ -348,12 +352,13 @@ class RidgeRegressionModel(LinearRegressionModelBase):
348352
... LabeledPoint(3.0, SparseVector(1, {0: 2.0})),
349353
... LabeledPoint(2.0, SparseVector(1, {0: 3.0}))
350354
... ]
351-
>>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initialWeights=array([1.0]))
355+
>>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10,
356+
... initialWeights=array([1.0]))
352357
>>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
353358
True
354359
>>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
355360
True
356-
>>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), iterations=100, step=1.0,
361+
>>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), iterations=10, step=1.0,
357362
... regParam=0.01, miniBatchFraction=1.0, initialWeights=array([1.0]), intercept=True,
358363
... validateData=True)
359364
>>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
@@ -396,7 +401,7 @@ def _test():
396401
from pyspark import SparkContext
397402
import pyspark.mllib.regression
398403
globs = pyspark.mllib.regression.__dict__.copy()
399-
globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2)
404+
globs['sc'] = SparkContext('local[2]', 'PythonTest', batchSize=2)
400405
(failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
401406
globs['sc'].stop()
402407
if failure_count:

0 commit comments

Comments
 (0)