Commit 3ad2387

Merge pull request apache#5427 from davies/python_tests

[SPARK-6953] [PySpark] speed up python tests

Signed-off-by: Reynold Xin <rxin@databricks.com>

Conflicts:
    python/pyspark/streaming/tests.py

(cherry picked from commit 21b15f5)
Signed-off-by: Reynold Xin <rxin@databricks.com>
1 parent ab9128f commit 3ad2387
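
The change is mechanical but effective: the MLlib doctests that train by SGD or L-BFGS now cap iterations at 10 instead of the library default of 100, and the doctest driver uses fewer local threads. A rough way to see the per-trainer saving, as a sketch that is not part of the commit, assuming a live SparkContext named sc (timings vary by machine):

    import time
    from pyspark.mllib.classification import LogisticRegressionWithSGD
    from pyspark.mllib.regression import LabeledPoint

    points = sc.parallelize([LabeledPoint(0.0, [0.0, 1.0]),
                             LabeledPoint(1.0, [1.0, 0.0])])
    for n in (100, 10):  # library default vs. the cap used in the doctests
        start = time.time()
        LogisticRegressionWithSGD.train(points, iterations=n)
        print("iterations=%d: %.1fs" % (n, time.time() - start))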

File tree

9 files changed (+182, -127 lines)


python/pyspark/mllib/classification.py

Lines changed: 9 additions & 8 deletions
@@ -86,7 +86,7 @@ class LogisticRegressionModel(LinearClassificationModel):
     ...     LabeledPoint(0.0, [0.0, 1.0]),
     ...     LabeledPoint(1.0, [1.0, 0.0]),
     ... ]
-    >>> lrm = LogisticRegressionWithSGD.train(sc.parallelize(data))
+    >>> lrm = LogisticRegressionWithSGD.train(sc.parallelize(data), iterations=10)
     >>> lrm.predict([1.0, 0.0])
     1
     >>> lrm.predict([0.0, 1.0])
@@ -95,15 +95,15 @@ class LogisticRegressionModel(LinearClassificationModel):
     [1, 0]
     >>> lrm.clearThreshold()
     >>> lrm.predict([0.0, 1.0])
-    0.123...
+    0.279...

     >>> sparse_data = [
     ...     LabeledPoint(0.0, SparseVector(2, {0: 0.0})),
     ...     LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
     ...     LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
     ...     LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
     ... ]
-    >>> lrm = LogisticRegressionWithSGD.train(sc.parallelize(sparse_data))
+    >>> lrm = LogisticRegressionWithSGD.train(sc.parallelize(sparse_data), iterations=10)
     >>> lrm.predict(array([0.0, 1.0]))
     1
     >>> lrm.predict(array([1.0, 0.0]))
@@ -129,7 +129,8 @@ class LogisticRegressionModel(LinearClassificationModel):
     ...     LabeledPoint(1.0, [1.0, 0.0, 0.0]),
     ...     LabeledPoint(2.0, [0.0, 0.0, 1.0])
     ... ]
-    >>> mcm = LogisticRegressionWithLBFGS.train(data=sc.parallelize(multi_class_data), numClasses=3)
+    >>> data = sc.parallelize(multi_class_data)
+    >>> mcm = LogisticRegressionWithLBFGS.train(data, iterations=10, numClasses=3)
     >>> mcm.predict([0.0, 0.5, 0.0])
     0
     >>> mcm.predict([0.8, 0.0, 0.0])
@@ -298,7 +299,7 @@ def train(cls, data, iterations=100, initialWeights=None, regParam=0.01, regType
     ...     LabeledPoint(0.0, [0.0, 1.0]),
     ...     LabeledPoint(1.0, [1.0, 0.0]),
     ... ]
-    >>> lrm = LogisticRegressionWithLBFGS.train(sc.parallelize(data))
+    >>> lrm = LogisticRegressionWithLBFGS.train(sc.parallelize(data), iterations=10)
     >>> lrm.predict([1.0, 0.0])
     1
     >>> lrm.predict([0.0, 1.0])
@@ -330,22 +331,22 @@ class SVMModel(LinearClassificationModel):
     ...     LabeledPoint(1.0, [2.0]),
     ...     LabeledPoint(1.0, [3.0])
     ... ]
-    >>> svm = SVMWithSGD.train(sc.parallelize(data))
+    >>> svm = SVMWithSGD.train(sc.parallelize(data), iterations=10)
     >>> svm.predict([1.0])
     1
     >>> svm.predict(sc.parallelize([[1.0]])).collect()
     [1]
     >>> svm.clearThreshold()
     >>> svm.predict(array([1.0]))
-    1.25...
+    1.44...

     >>> sparse_data = [
     ...     LabeledPoint(0.0, SparseVector(2, {0: -1.0})),
     ...     LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
     ...     LabeledPoint(0.0, SparseVector(2, {0: 0.0})),
     ...     LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
     ... ]
-    >>> svm = SVMWithSGD.train(sc.parallelize(sparse_data))
+    >>> svm = SVMWithSGD.train(sc.parallelize(sparse_data), iterations=10)
     >>> svm.predict(SparseVector(2, {1: 1.0}))
     1
     >>> svm.predict(SparseVector(2, {0: -1.0}))
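
Every hunk in this file applies the same cap; the only expected outputs that change are the unthresholded scores (0.123... becomes 0.279..., 1.25... becomes 1.44...), because ten SGD steps land on different weights than one hundred. The thresholded class predictions are unaffected, which is what keeps the doctests stable. A minimal sketch of that distinction, assuming a live SparkContext named sc:

    from pyspark.mllib.classification import SVMWithSGD
    from pyspark.mllib.regression import LabeledPoint

    data = [LabeledPoint(0.0, [0.0]), LabeledPoint(1.0, [1.0]),
            LabeledPoint(1.0, [2.0]), LabeledPoint(1.0, [3.0])]
    svm = SVMWithSGD.train(sc.parallelize(data), iterations=10)
    print(svm.predict([1.0]))  # thresholded class label: 1, same as with iterations=100
    svm.clearThreshold()
    print(svm.predict([1.0]))  # raw margin (~1.44 here); this is what shifts with fewer iterations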

python/pyspark/mllib/regression.py

Lines changed: 15 additions & 10 deletions
@@ -108,7 +108,8 @@ class LinearRegressionModel(LinearRegressionModelBase):
     ...     LabeledPoint(3.0, [2.0]),
     ...     LabeledPoint(2.0, [3.0])
     ... ]
-    >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initialWeights=np.array([1.0]))
+    >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10,
+    ...     initialWeights=np.array([1.0]))
     >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
     True
     >>> abs(lrm.predict(np.array([1.0])) - 1) < 0.5
@@ -135,12 +136,13 @@ class LinearRegressionModel(LinearRegressionModelBase):
     ...     LabeledPoint(3.0, SparseVector(1, {0: 2.0})),
     ...     LabeledPoint(2.0, SparseVector(1, {0: 3.0}))
     ... ]
-    >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initialWeights=array([1.0]))
+    >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10,
+    ...     initialWeights=array([1.0]))
     >>> abs(lrm.predict(array([0.0])) - 0) < 0.5
     True
     >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
     True
-    >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=100, step=1.0,
+    >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10, step=1.0,
     ...     miniBatchFraction=1.0, initialWeights=array([1.0]), regParam=0.1, regType="l2",
     ...     intercept=True, validateData=True)
     >>> abs(lrm.predict(array([0.0])) - 0) < 0.5
@@ -238,7 +240,7 @@ class LassoModel(LinearRegressionModelBase):
     ...     LabeledPoint(3.0, [2.0]),
     ...     LabeledPoint(2.0, [3.0])
     ... ]
-    >>> lrm = LassoWithSGD.train(sc.parallelize(data), initialWeights=array([1.0]))
+    >>> lrm = LassoWithSGD.train(sc.parallelize(data), iterations=10, initialWeights=array([1.0]))
     >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
     True
     >>> abs(lrm.predict(np.array([1.0])) - 1) < 0.5
@@ -265,12 +267,13 @@ class LassoModel(LinearRegressionModelBase):
     ...     LabeledPoint(3.0, SparseVector(1, {0: 2.0})),
     ...     LabeledPoint(2.0, SparseVector(1, {0: 3.0}))
     ... ]
-    >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initialWeights=array([1.0]))
+    >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10,
+    ...     initialWeights=array([1.0]))
     >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
     True
     >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
     True
-    >>> lrm = LassoWithSGD.train(sc.parallelize(data), iterations=100, step=1.0,
+    >>> lrm = LassoWithSGD.train(sc.parallelize(data), iterations=10, step=1.0,
     ...     regParam=0.01, miniBatchFraction=1.0, initialWeights=array([1.0]), intercept=True,
     ...     validateData=True)
     >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
@@ -321,7 +324,8 @@ class RidgeRegressionModel(LinearRegressionModelBase):
     ...     LabeledPoint(3.0, [2.0]),
     ...     LabeledPoint(2.0, [3.0])
     ... ]
-    >>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), initialWeights=array([1.0]))
+    >>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), iterations=10,
+    ...     initialWeights=array([1.0]))
     >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
     True
     >>> abs(lrm.predict(np.array([1.0])) - 1) < 0.5
@@ -348,12 +352,13 @@ class RidgeRegressionModel(LinearRegressionModelBase):
     ...     LabeledPoint(3.0, SparseVector(1, {0: 2.0})),
     ...     LabeledPoint(2.0, SparseVector(1, {0: 3.0}))
     ... ]
-    >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initialWeights=array([1.0]))
+    >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10,
+    ...     initialWeights=array([1.0]))
     >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
     True
     >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
     True
-    >>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), iterations=100, step=1.0,
+    >>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), iterations=10, step=1.0,
     ...     regParam=0.01, miniBatchFraction=1.0, initialWeights=array([1.0]), intercept=True,
     ...     validateData=True)
     >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
@@ -396,7 +401,7 @@ def _test():
     from pyspark import SparkContext
     import pyspark.mllib.regression
     globs = pyspark.mllib.regression.__dict__.copy()
-    globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2)
+    globs['sc'] = SparkContext('local[2]', 'PythonTest', batchSize=2)
     (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
     globs['sc'].stop()
     if failure_count:
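
The last hunk is the complementary driver-level change: the doctest SparkContext shrinks from four local worker threads to two, cutting startup and scheduling overhead without affecting the tiny test RDDs. The full runner pattern these MLlib modules share looks roughly like this; the tail below the diff's cutoff is reconstructed and may differ in detail:

    import doctest
    from pyspark import SparkContext
    import pyspark.mllib.regression

    def _test():
        globs = pyspark.mllib.regression.__dict__.copy()
        # Two local threads are plenty for doctest-sized RDDs and start
        # faster than four; batchSize=2 exercises batched serialization
        # with minimal data.
        globs['sc'] = SparkContext('local[2]', 'PythonTest', batchSize=2)
        (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
        globs['sc'].stop()
        if failure_count:
            exit(-1)

    if __name__ == "__main__":
        _test()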
