Skip to content

Commit be23e1d

Browse files
author
Davies Liu
committed
speed up python tests
1 parent 9fe4125 commit be23e1d

File tree

7 files changed: 112 additions & 86 deletions

python/pyspark/mllib/classification.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ class LogisticRegressionModel(LinearClassificationModel):
8686
... LabeledPoint(0.0, [0.0, 1.0]),
8787
... LabeledPoint(1.0, [1.0, 0.0]),
8888
... ]
89-
>>> lrm = LogisticRegressionWithSGD.train(sc.parallelize(data))
89+
>>> lrm = LogisticRegressionWithSGD.train(sc.parallelize(data), iterations=10)
9090
>>> lrm.predict([1.0, 0.0])
9191
1
9292
>>> lrm.predict([0.0, 1.0])
@@ -95,15 +95,15 @@ class LogisticRegressionModel(LinearClassificationModel):
9595
[1, 0]
9696
>>> lrm.clearThreshold()
9797
>>> lrm.predict([0.0, 1.0])
98-
0.123...
98+
0.279...
9999
100100
>>> sparse_data = [
101101
... LabeledPoint(0.0, SparseVector(2, {0: 0.0})),
102102
... LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
103103
... LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
104104
... LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
105105
... ]
106-
>>> lrm = LogisticRegressionWithSGD.train(sc.parallelize(sparse_data))
106+
>>> lrm = LogisticRegressionWithSGD.train(sc.parallelize(sparse_data), iterations=10)
107107
>>> lrm.predict(array([0.0, 1.0]))
108108
1
109109
>>> lrm.predict(array([1.0, 0.0]))
@@ -129,7 +129,8 @@ class LogisticRegressionModel(LinearClassificationModel):
129129
... LabeledPoint(1.0, [1.0, 0.0, 0.0]),
130130
... LabeledPoint(2.0, [0.0, 0.0, 1.0])
131131
... ]
132-
>>> mcm = LogisticRegressionWithLBFGS.train(data=sc.parallelize(multi_class_data), numClasses=3)
132+
>>> data = sc.parallelize(multi_class_data)
133+
>>> mcm = LogisticRegressionWithLBFGS.train(data, iterations=10, numClasses=3)
133134
>>> mcm.predict([0.0, 0.5, 0.0])
134135
0
135136
>>> mcm.predict([0.8, 0.0, 0.0])
@@ -298,7 +299,7 @@ def train(cls, data, iterations=100, initialWeights=None, regParam=0.01, regType
298299
... LabeledPoint(0.0, [0.0, 1.0]),
299300
... LabeledPoint(1.0, [1.0, 0.0]),
300301
... ]
301-
>>> lrm = LogisticRegressionWithLBFGS.train(sc.parallelize(data))
302+
>>> lrm = LogisticRegressionWithLBFGS.train(sc.parallelize(data), iterations=10)
302303
>>> lrm.predict([1.0, 0.0])
303304
1
304305
>>> lrm.predict([0.0, 1.0])
@@ -330,22 +331,22 @@ class SVMModel(LinearClassificationModel):
330331
... LabeledPoint(1.0, [2.0]),
331332
... LabeledPoint(1.0, [3.0])
332333
... ]
333-
>>> svm = SVMWithSGD.train(sc.parallelize(data))
334+
>>> svm = SVMWithSGD.train(sc.parallelize(data), iterations=10)
334335
>>> svm.predict([1.0])
335336
1
336337
>>> svm.predict(sc.parallelize([[1.0]])).collect()
337338
[1]
338339
>>> svm.clearThreshold()
339340
>>> svm.predict(array([1.0]))
340-
1.25...
341+
1.44...
341342
342343
>>> sparse_data = [
343344
... LabeledPoint(0.0, SparseVector(2, {0: -1.0})),
344345
... LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
345346
... LabeledPoint(0.0, SparseVector(2, {0: 0.0})),
346347
... LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
347348
... ]
348-
>>> svm = SVMWithSGD.train(sc.parallelize(sparse_data))
349+
>>> svm = SVMWithSGD.train(sc.parallelize(sparse_data), iterations=10)
349350
>>> svm.predict(SparseVector(2, {1: 1.0}))
350351
1
351352
>>> svm.predict(SparseVector(2, {0: -1.0}))

python/pyspark/mllib/regression.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,8 @@ class LinearRegressionModel(LinearRegressionModelBase):
108108
... LabeledPoint(3.0, [2.0]),
109109
... LabeledPoint(2.0, [3.0])
110110
... ]
111-
>>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initialWeights=np.array([1.0]))
111+
>>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10,
112+
... initialWeights=np.array([1.0]))
112113
>>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
113114
True
114115
>>> abs(lrm.predict(np.array([1.0])) - 1) < 0.5
@@ -135,12 +136,13 @@ class LinearRegressionModel(LinearRegressionModelBase):
135136
... LabeledPoint(3.0, SparseVector(1, {0: 2.0})),
136137
... LabeledPoint(2.0, SparseVector(1, {0: 3.0}))
137138
... ]
138-
>>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initialWeights=array([1.0]))
139+
>>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10,
140+
... initialWeights=array([1.0]))
139141
>>> abs(lrm.predict(array([0.0])) - 0) < 0.5
140142
True
141143
>>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
142144
True
143-
>>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=100, step=1.0,
145+
>>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10, step=1.0,
144146
... miniBatchFraction=1.0, initialWeights=array([1.0]), regParam=0.1, regType="l2",
145147
... intercept=True, validateData=True)
146148
>>> abs(lrm.predict(array([0.0])) - 0) < 0.5
@@ -238,7 +240,7 @@ class LassoModel(LinearRegressionModelBase):
238240
... LabeledPoint(3.0, [2.0]),
239241
... LabeledPoint(2.0, [3.0])
240242
... ]
241-
>>> lrm = LassoWithSGD.train(sc.parallelize(data), initialWeights=array([1.0]))
243+
>>> lrm = LassoWithSGD.train(sc.parallelize(data), iterations=10, initialWeights=array([1.0]))
242244
>>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
243245
True
244246
>>> abs(lrm.predict(np.array([1.0])) - 1) < 0.5
@@ -265,12 +267,13 @@ class LassoModel(LinearRegressionModelBase):
265267
... LabeledPoint(3.0, SparseVector(1, {0: 2.0})),
266268
... LabeledPoint(2.0, SparseVector(1, {0: 3.0}))
267269
... ]
268-
>>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initialWeights=array([1.0]))
270+
>>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10,
271+
... initialWeights=array([1.0]))
269272
>>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
270273
True
271274
>>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
272275
True
273-
>>> lrm = LassoWithSGD.train(sc.parallelize(data), iterations=100, step=1.0,
276+
>>> lrm = LassoWithSGD.train(sc.parallelize(data), iterations=10, step=1.0,
274277
... regParam=0.01, miniBatchFraction=1.0, initialWeights=array([1.0]), intercept=True,
275278
... validateData=True)
276279
>>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
@@ -321,7 +324,8 @@ class RidgeRegressionModel(LinearRegressionModelBase):
321324
... LabeledPoint(3.0, [2.0]),
322325
... LabeledPoint(2.0, [3.0])
323326
... ]
324-
>>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), initialWeights=array([1.0]))
327+
>>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), iterations=10,
328+
... initialWeights=array([1.0]))
325329
>>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
326330
True
327331
>>> abs(lrm.predict(np.array([1.0])) - 1) < 0.5
@@ -348,12 +352,13 @@ class RidgeRegressionModel(LinearRegressionModelBase):
348352
... LabeledPoint(3.0, SparseVector(1, {0: 2.0})),
349353
... LabeledPoint(2.0, SparseVector(1, {0: 3.0}))
350354
... ]
351-
>>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initialWeights=array([1.0]))
355+
>>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10,
356+
... initialWeights=array([1.0]))
352357
>>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
353358
True
354359
>>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
355360
True
356-
>>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), iterations=100, step=1.0,
361+
>>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), iterations=10, step=1.0,
357362
... regParam=0.01, miniBatchFraction=1.0, initialWeights=array([1.0]), intercept=True,
358363
... validateData=True)
359364
>>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
@@ -396,7 +401,7 @@ def _test():
396401
from pyspark import SparkContext
397402
import pyspark.mllib.regression
398403
globs = pyspark.mllib.regression.__dict__.copy()
399-
globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2)
404+
globs['sc'] = SparkContext('local[2]', 'PythonTest', batchSize=2)
400405
(failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
401406
globs['sc'].stop()
402407
if failure_count:

python/pyspark/mllib/tests.py

Lines changed: 35 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
else:
3737
import unittest
3838

39+
from pyspark import SparkContext
3940
from pyspark.mllib.common import _to_java_object_rdd
4041
from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector,\
4142
DenseMatrix, Vectors, Matrices
@@ -46,7 +47,6 @@
4647
from pyspark.mllib.feature import IDF
4748
from pyspark.serializers import PickleSerializer
4849
from pyspark.sql import SQLContext
49-
from pyspark.tests import ReusedPySparkTestCase as PySparkTestCase
5050

5151
_have_scipy = False
5252
try:
@@ -57,6 +57,12 @@
5757
pass
5858

5959
ser = PickleSerializer()
60+
sc = SparkContext('local[4]', "MLlib tests")
61+
62+
63+
class MLlibTestCase(unittest.TestCase):
64+
def setUp(self):
65+
self.sc = sc
6066

6167

6268
def _squared_distance(a, b):
@@ -66,7 +72,7 @@ def _squared_distance(a, b):
6672
return b.squared_distance(a)
6773

6874

69-
class VectorTests(PySparkTestCase):
75+
class VectorTests(MLlibTestCase):
7076

7177
def _test_serialize(self, v):
7278
self.assertEqual(v, ser.loads(ser.dumps(v)))
@@ -145,7 +151,7 @@ def test_matrix_indexing(self):
145151
self.assertEquals(mat[i, j], expected[i][j])
146152

147153

148-
class ListTests(PySparkTestCase):
154+
class ListTests(MLlibTestCase):
149155

150156
"""
151157
Test MLlib algorithms on plain lists, to make sure they're passed through
@@ -188,7 +194,7 @@ def test_gmm(self):
188194
[-6, -7],
189195
])
190196
clusters = GaussianMixture.train(data, 2, convergenceTol=0.001,
191-
maxIterations=100, seed=56)
197+
maxIterations=10, seed=56)
192198
labels = clusters.predict(data).collect()
193199
self.assertEquals(labels[0], labels[1])
194200
self.assertEquals(labels[2], labels[3])
@@ -199,9 +205,9 @@ def test_gmm_deterministic(self):
199205
y = range(0, 100, 10)
200206
data = self.sc.parallelize([[a, b] for a, b in zip(x, y)])
201207
clusters1 = GaussianMixture.train(data, 5, convergenceTol=0.001,
202-
maxIterations=100, seed=63)
208+
maxIterations=10, seed=63)
203209
clusters2 = GaussianMixture.train(data, 5, convergenceTol=0.001,
204-
maxIterations=100, seed=63)
210+
maxIterations=10, seed=63)
205211
for c1, c2 in zip(clusters1.weights, clusters2.weights):
206212
self.assertEquals(round(c1, 7), round(c2, 7))
207213

@@ -220,13 +226,13 @@ def test_classification(self):
220226

221227
temp_dir = tempfile.mkdtemp()
222228

223-
lr_model = LogisticRegressionWithSGD.train(rdd)
229+
lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
224230
self.assertTrue(lr_model.predict(features[0]) <= 0)
225231
self.assertTrue(lr_model.predict(features[1]) > 0)
226232
self.assertTrue(lr_model.predict(features[2]) <= 0)
227233
self.assertTrue(lr_model.predict(features[3]) > 0)
228234

229-
svm_model = SVMWithSGD.train(rdd)
235+
svm_model = SVMWithSGD.train(rdd, iterations=10)
230236
self.assertTrue(svm_model.predict(features[0]) <= 0)
231237
self.assertTrue(svm_model.predict(features[1]) > 0)
232238
self.assertTrue(svm_model.predict(features[2]) <= 0)
@@ -240,7 +246,7 @@ def test_classification(self):
240246

241247
categoricalFeaturesInfo = {0: 3} # feature 0 has 3 categories
242248
dt_model = DecisionTree.trainClassifier(
243-
rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
249+
rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
244250
self.assertTrue(dt_model.predict(features[0]) <= 0)
245251
self.assertTrue(dt_model.predict(features[1]) > 0)
246252
self.assertTrue(dt_model.predict(features[2]) <= 0)
@@ -252,7 +258,8 @@ def test_classification(self):
252258
self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString())
253259

254260
rf_model = RandomForest.trainClassifier(
255-
rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
261+
rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
262+
maxBins=4)
256263
self.assertTrue(rf_model.predict(features[0]) <= 0)
257264
self.assertTrue(rf_model.predict(features[1]) > 0)
258265
self.assertTrue(rf_model.predict(features[2]) <= 0)
@@ -264,7 +271,7 @@ def test_classification(self):
264271
self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString())
265272

266273
gbt_model = GradientBoostedTrees.trainClassifier(
267-
rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
274+
rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
268275
self.assertTrue(gbt_model.predict(features[0]) <= 0)
269276
self.assertTrue(gbt_model.predict(features[1]) > 0)
270277
self.assertTrue(gbt_model.predict(features[2]) <= 0)
@@ -293,55 +300,55 @@ def test_regression(self):
293300
rdd = self.sc.parallelize(data)
294301
features = [p.features.tolist() for p in data]
295302

296-
lr_model = LinearRegressionWithSGD.train(rdd)
303+
lr_model = LinearRegressionWithSGD.train(rdd, iterations=10)
297304
self.assertTrue(lr_model.predict(features[0]) <= 0)
298305
self.assertTrue(lr_model.predict(features[1]) > 0)
299306
self.assertTrue(lr_model.predict(features[2]) <= 0)
300307
self.assertTrue(lr_model.predict(features[3]) > 0)
301308

302-
lasso_model = LassoWithSGD.train(rdd)
309+
lasso_model = LassoWithSGD.train(rdd, iterations=10)
303310
self.assertTrue(lasso_model.predict(features[0]) <= 0)
304311
self.assertTrue(lasso_model.predict(features[1]) > 0)
305312
self.assertTrue(lasso_model.predict(features[2]) <= 0)
306313
self.assertTrue(lasso_model.predict(features[3]) > 0)
307314

308-
rr_model = RidgeRegressionWithSGD.train(rdd)
315+
rr_model = RidgeRegressionWithSGD.train(rdd, iterations=10)
309316
self.assertTrue(rr_model.predict(features[0]) <= 0)
310317
self.assertTrue(rr_model.predict(features[1]) > 0)
311318
self.assertTrue(rr_model.predict(features[2]) <= 0)
312319
self.assertTrue(rr_model.predict(features[3]) > 0)
313320

314321
categoricalFeaturesInfo = {0: 2} # feature 0 has 2 categories
315322
dt_model = DecisionTree.trainRegressor(
316-
rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
323+
rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
317324
self.assertTrue(dt_model.predict(features[0]) <= 0)
318325
self.assertTrue(dt_model.predict(features[1]) > 0)
319326
self.assertTrue(dt_model.predict(features[2]) <= 0)
320327
self.assertTrue(dt_model.predict(features[3]) > 0)
321328

322329
rf_model = RandomForest.trainRegressor(
323-
rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100, seed=1)
330+
rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1)
324331
self.assertTrue(rf_model.predict(features[0]) <= 0)
325332
self.assertTrue(rf_model.predict(features[1]) > 0)
326333
self.assertTrue(rf_model.predict(features[2]) <= 0)
327334
self.assertTrue(rf_model.predict(features[3]) > 0)
328335

329336
gbt_model = GradientBoostedTrees.trainRegressor(
330-
rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
337+
rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
331338
self.assertTrue(gbt_model.predict(features[0]) <= 0)
332339
self.assertTrue(gbt_model.predict(features[1]) > 0)
333340
self.assertTrue(gbt_model.predict(features[2]) <= 0)
334341
self.assertTrue(gbt_model.predict(features[3]) > 0)
335342

336343
try:
337-
LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]))
338-
LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]))
339-
RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]))
344+
LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
345+
LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
346+
RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
340347
except ValueError:
341348
self.fail()
342349

343350

344-
class StatTests(PySparkTestCase):
351+
class StatTests(MLlibTestCase):
345352
# SPARK-4023
346353
def test_col_with_different_rdds(self):
347354
# numpy
@@ -364,7 +371,7 @@ def test_col_norms(self):
364371
self.assertEqual(10, len(summary.normL2()))
365372

366373

367-
class VectorUDTTests(PySparkTestCase):
374+
class VectorUDTTests(MLlibTestCase):
368375

369376
dv0 = DenseVector([])
370377
dv1 = DenseVector([1.0, 2.0])
@@ -398,7 +405,7 @@ def test_infer_schema(self):
398405

399406

400407
@unittest.skipIf(not _have_scipy, "SciPy not installed")
401-
class SciPyTests(PySparkTestCase):
408+
class SciPyTests(MLlibTestCase):
402409

403410
"""
404411
Test both vector operations and MLlib algorithms with SciPy sparse matrices,
@@ -539,7 +546,7 @@ def test_regression(self):
539546
self.assertTrue(dt_model.predict(features[3]) > 0)
540547

541548

542-
class ChiSqTestTests(PySparkTestCase):
549+
class ChiSqTestTests(MLlibTestCase):
543550
def test_goodness_of_fit(self):
544551
from numpy import inf
545552

@@ -637,13 +644,13 @@ def test_right_number_of_results(self):
637644
self.assertIsNotNone(chi[1000])
638645

639646

640-
class SerDeTest(PySparkTestCase):
647+
class SerDeTest(MLlibTestCase):
641648
def test_to_java_object_rdd(self): # SPARK-6660
642649
data = RandomRDDs.uniformRDD(self.sc, 10, 5, seed=0L)
643650
self.assertEqual(_to_java_object_rdd(data).count(), 10)
644651

645652

646-
class FeatureTest(PySparkTestCase):
653+
class FeatureTest(MLlibTestCase):
647654
def test_idf_model(self):
648655
data = [
649656
Vectors.dense([1, 2, 6, 0, 2, 3, 1, 1, 0, 0, 3]),
@@ -656,13 +663,8 @@ def test_idf_model(self):
656663
self.assertEqual(len(idf), 11)
657664

658665

659-
class Word2VecTests(PySparkTestCase):
666+
class Word2VecTests(MLlibTestCase):
660667
def test_word2vec_setters(self):
661-
data = [
662-
["I", "have", "a", "pen"],
663-
["I", "like", "soccer", "very", "much"],
664-
["I", "live", "in", "Tokyo"]
665-
]
666668
model = Word2Vec() \
667669
.setVectorSize(2) \
668670
.setLearningRate(0.01) \
@@ -696,3 +698,4 @@ def test_word2vec_get_vectors(self):
696698
unittest.main()
697699
if not _have_scipy:
698700
print "NOTE: SciPy tests were skipped as it does not seem to be installed"
701+
sc.stop()

0 commit comments

Comments
 (0)