36
36
else :
37
37
import unittest
38
38
39
+ from pyspark import SparkContext
39
40
from pyspark .mllib .common import _to_java_object_rdd
40
41
from pyspark .mllib .linalg import Vector , SparseVector , DenseVector , VectorUDT , _convert_to_vector ,\
41
42
DenseMatrix , Vectors , Matrices
46
47
from pyspark .mllib .feature import IDF
47
48
from pyspark .serializers import PickleSerializer
48
49
from pyspark .sql import SQLContext
49
- from pyspark .tests import ReusedPySparkTestCase as PySparkTestCase
50
50
51
51
_have_scipy = False
52
52
try :
57
57
pass
58
58
59
59
ser = PickleSerializer ()
60
+ sc = SparkContext ('local[4]' , "MLlib tests" )
61
+
62
+
63
class MLlibTestCase(unittest.TestCase):
    """
    Common base class for the MLlib test cases in this module.

    Before every test it exposes the module-level SparkContext ``sc`` as
    ``self.sc``, so all test classes run against the same context object.
    """

    def setUp(self):
        # Reuse the SparkContext created once at module level instead of
        # constructing a new one for each test.
        shared_context = sc
        self.sc = shared_context
60
66
61
67
62
68
def _squared_distance (a , b ):
@@ -66,7 +72,7 @@ def _squared_distance(a, b):
66
72
return b .squared_distance (a )
67
73
68
74
69
- class VectorTests (PySparkTestCase ):
75
+ class VectorTests (MLlibTestCase ):
70
76
71
77
def _test_serialize (self , v ):
72
78
self .assertEqual (v , ser .loads (ser .dumps (v )))
@@ -145,7 +151,7 @@ def test_matrix_indexing(self):
145
151
self .assertEquals (mat [i , j ], expected [i ][j ])
146
152
147
153
148
- class ListTests (PySparkTestCase ):
154
+ class ListTests (MLlibTestCase ):
149
155
150
156
"""
151
157
Test MLlib algorithms on plain lists, to make sure they're passed through
@@ -188,7 +194,7 @@ def test_gmm(self):
188
194
[- 6 , - 7 ],
189
195
])
190
196
clusters = GaussianMixture .train (data , 2 , convergenceTol = 0.001 ,
191
- maxIterations = 100 , seed = 56 )
197
+ maxIterations = 10 , seed = 56 )
192
198
labels = clusters .predict (data ).collect ()
193
199
self .assertEquals (labels [0 ], labels [1 ])
194
200
self .assertEquals (labels [2 ], labels [3 ])
@@ -199,9 +205,9 @@ def test_gmm_deterministic(self):
199
205
y = range (0 , 100 , 10 )
200
206
data = self .sc .parallelize ([[a , b ] for a , b in zip (x , y )])
201
207
clusters1 = GaussianMixture .train (data , 5 , convergenceTol = 0.001 ,
202
- maxIterations = 100 , seed = 63 )
208
+ maxIterations = 10 , seed = 63 )
203
209
clusters2 = GaussianMixture .train (data , 5 , convergenceTol = 0.001 ,
204
- maxIterations = 100 , seed = 63 )
210
+ maxIterations = 10 , seed = 63 )
205
211
for c1 , c2 in zip (clusters1 .weights , clusters2 .weights ):
206
212
self .assertEquals (round (c1 , 7 ), round (c2 , 7 ))
207
213
@@ -220,13 +226,13 @@ def test_classification(self):
220
226
221
227
temp_dir = tempfile .mkdtemp ()
222
228
223
- lr_model = LogisticRegressionWithSGD .train (rdd )
229
+ lr_model = LogisticRegressionWithSGD .train (rdd , iterations = 10 )
224
230
self .assertTrue (lr_model .predict (features [0 ]) <= 0 )
225
231
self .assertTrue (lr_model .predict (features [1 ]) > 0 )
226
232
self .assertTrue (lr_model .predict (features [2 ]) <= 0 )
227
233
self .assertTrue (lr_model .predict (features [3 ]) > 0 )
228
234
229
- svm_model = SVMWithSGD .train (rdd )
235
+ svm_model = SVMWithSGD .train (rdd , iterations = 10 )
230
236
self .assertTrue (svm_model .predict (features [0 ]) <= 0 )
231
237
self .assertTrue (svm_model .predict (features [1 ]) > 0 )
232
238
self .assertTrue (svm_model .predict (features [2 ]) <= 0 )
@@ -240,7 +246,7 @@ def test_classification(self):
240
246
241
247
categoricalFeaturesInfo = {0 : 3 } # feature 0 has 3 categories
242
248
dt_model = DecisionTree .trainClassifier (
243
- rdd , numClasses = 2 , categoricalFeaturesInfo = categoricalFeaturesInfo )
249
+ rdd , numClasses = 2 , categoricalFeaturesInfo = categoricalFeaturesInfo , maxBins = 4 )
244
250
self .assertTrue (dt_model .predict (features [0 ]) <= 0 )
245
251
self .assertTrue (dt_model .predict (features [1 ]) > 0 )
246
252
self .assertTrue (dt_model .predict (features [2 ]) <= 0 )
@@ -252,7 +258,8 @@ def test_classification(self):
252
258
self .assertEqual (same_dt_model .toDebugString (), dt_model .toDebugString ())
253
259
254
260
rf_model = RandomForest .trainClassifier (
255
- rdd , numClasses = 2 , categoricalFeaturesInfo = categoricalFeaturesInfo , numTrees = 100 )
261
+ rdd , numClasses = 2 , categoricalFeaturesInfo = categoricalFeaturesInfo , numTrees = 10 ,
262
+ maxBins = 4 )
256
263
self .assertTrue (rf_model .predict (features [0 ]) <= 0 )
257
264
self .assertTrue (rf_model .predict (features [1 ]) > 0 )
258
265
self .assertTrue (rf_model .predict (features [2 ]) <= 0 )
@@ -264,7 +271,7 @@ def test_classification(self):
264
271
self .assertEqual (same_rf_model .toDebugString (), rf_model .toDebugString ())
265
272
266
273
gbt_model = GradientBoostedTrees .trainClassifier (
267
- rdd , categoricalFeaturesInfo = categoricalFeaturesInfo )
274
+ rdd , categoricalFeaturesInfo = categoricalFeaturesInfo , numIterations = 4 )
268
275
self .assertTrue (gbt_model .predict (features [0 ]) <= 0 )
269
276
self .assertTrue (gbt_model .predict (features [1 ]) > 0 )
270
277
self .assertTrue (gbt_model .predict (features [2 ]) <= 0 )
@@ -293,55 +300,55 @@ def test_regression(self):
293
300
rdd = self .sc .parallelize (data )
294
301
features = [p .features .tolist () for p in data ]
295
302
296
- lr_model = LinearRegressionWithSGD .train (rdd )
303
+ lr_model = LinearRegressionWithSGD .train (rdd , iterations = 10 )
297
304
self .assertTrue (lr_model .predict (features [0 ]) <= 0 )
298
305
self .assertTrue (lr_model .predict (features [1 ]) > 0 )
299
306
self .assertTrue (lr_model .predict (features [2 ]) <= 0 )
300
307
self .assertTrue (lr_model .predict (features [3 ]) > 0 )
301
308
302
- lasso_model = LassoWithSGD .train (rdd )
309
+ lasso_model = LassoWithSGD .train (rdd , iterations = 10 )
303
310
self .assertTrue (lasso_model .predict (features [0 ]) <= 0 )
304
311
self .assertTrue (lasso_model .predict (features [1 ]) > 0 )
305
312
self .assertTrue (lasso_model .predict (features [2 ]) <= 0 )
306
313
self .assertTrue (lasso_model .predict (features [3 ]) > 0 )
307
314
308
- rr_model = RidgeRegressionWithSGD .train (rdd )
315
+ rr_model = RidgeRegressionWithSGD .train (rdd , iterations = 10 )
309
316
self .assertTrue (rr_model .predict (features [0 ]) <= 0 )
310
317
self .assertTrue (rr_model .predict (features [1 ]) > 0 )
311
318
self .assertTrue (rr_model .predict (features [2 ]) <= 0 )
312
319
self .assertTrue (rr_model .predict (features [3 ]) > 0 )
313
320
314
321
categoricalFeaturesInfo = {0 : 2 } # feature 0 has 2 categories
315
322
dt_model = DecisionTree .trainRegressor (
316
- rdd , categoricalFeaturesInfo = categoricalFeaturesInfo )
323
+ rdd , categoricalFeaturesInfo = categoricalFeaturesInfo , maxBins = 4 )
317
324
self .assertTrue (dt_model .predict (features [0 ]) <= 0 )
318
325
self .assertTrue (dt_model .predict (features [1 ]) > 0 )
319
326
self .assertTrue (dt_model .predict (features [2 ]) <= 0 )
320
327
self .assertTrue (dt_model .predict (features [3 ]) > 0 )
321
328
322
329
rf_model = RandomForest .trainRegressor (
323
- rdd , categoricalFeaturesInfo = categoricalFeaturesInfo , numTrees = 100 , seed = 1 )
330
+ rdd , categoricalFeaturesInfo = categoricalFeaturesInfo , numTrees = 10 , maxBins = 4 , seed = 1 )
324
331
self .assertTrue (rf_model .predict (features [0 ]) <= 0 )
325
332
self .assertTrue (rf_model .predict (features [1 ]) > 0 )
326
333
self .assertTrue (rf_model .predict (features [2 ]) <= 0 )
327
334
self .assertTrue (rf_model .predict (features [3 ]) > 0 )
328
335
329
336
gbt_model = GradientBoostedTrees .trainRegressor (
330
- rdd , categoricalFeaturesInfo = categoricalFeaturesInfo )
337
+ rdd , categoricalFeaturesInfo = categoricalFeaturesInfo , numIterations = 4 )
331
338
self .assertTrue (gbt_model .predict (features [0 ]) <= 0 )
332
339
self .assertTrue (gbt_model .predict (features [1 ]) > 0 )
333
340
self .assertTrue (gbt_model .predict (features [2 ]) <= 0 )
334
341
self .assertTrue (gbt_model .predict (features [3 ]) > 0 )
335
342
336
343
try :
337
- LinearRegressionWithSGD .train (rdd , initialWeights = array ([1.0 , 1.0 ]))
338
- LassoWithSGD .train (rdd , initialWeights = array ([1.0 , 1.0 ]))
339
- RidgeRegressionWithSGD .train (rdd , initialWeights = array ([1.0 , 1.0 ]))
344
+ LinearRegressionWithSGD .train (rdd , initialWeights = array ([1.0 , 1.0 ]), iterations = 10 )
345
+ LassoWithSGD .train (rdd , initialWeights = array ([1.0 , 1.0 ]), iterations = 10 )
346
+ RidgeRegressionWithSGD .train (rdd , initialWeights = array ([1.0 , 1.0 ]), iterations = 10 )
340
347
except ValueError :
341
348
self .fail ()
342
349
343
350
344
- class StatTests (PySparkTestCase ):
351
+ class StatTests (MLlibTestCase ):
345
352
# SPARK-4023
346
353
def test_col_with_different_rdds (self ):
347
354
# numpy
@@ -364,7 +371,7 @@ def test_col_norms(self):
364
371
self .assertEqual (10 , len (summary .normL2 ()))
365
372
366
373
367
- class VectorUDTTests (PySparkTestCase ):
374
+ class VectorUDTTests (MLlibTestCase ):
368
375
369
376
dv0 = DenseVector ([])
370
377
dv1 = DenseVector ([1.0 , 2.0 ])
@@ -398,7 +405,7 @@ def test_infer_schema(self):
398
405
399
406
400
407
@unittest .skipIf (not _have_scipy , "SciPy not installed" )
401
- class SciPyTests (PySparkTestCase ):
408
+ class SciPyTests (MLlibTestCase ):
402
409
403
410
"""
404
411
Test both vector operations and MLlib algorithms with SciPy sparse matrices,
@@ -539,7 +546,7 @@ def test_regression(self):
539
546
self .assertTrue (dt_model .predict (features [3 ]) > 0 )
540
547
541
548
542
- class ChiSqTestTests (PySparkTestCase ):
549
+ class ChiSqTestTests (MLlibTestCase ):
543
550
def test_goodness_of_fit (self ):
544
551
from numpy import inf
545
552
@@ -637,13 +644,13 @@ def test_right_number_of_results(self):
637
644
self .assertIsNotNone (chi [1000 ])
638
645
639
646
640
- class SerDeTest (PySparkTestCase ):
647
+ class SerDeTest (MLlibTestCase ):
641
648
def test_to_java_object_rdd (self ): # SPARK-6660
642
649
data = RandomRDDs .uniformRDD (self .sc , 10 , 5 , seed = 0L )
643
650
self .assertEqual (_to_java_object_rdd (data ).count (), 10 )
644
651
645
652
646
- class FeatureTest (PySparkTestCase ):
653
+ class FeatureTest (MLlibTestCase ):
647
654
def test_idf_model (self ):
648
655
data = [
649
656
Vectors .dense ([1 , 2 , 6 , 0 , 2 , 3 , 1 , 1 , 0 , 0 , 3 ]),
@@ -656,13 +663,8 @@ def test_idf_model(self):
656
663
self .assertEqual (len (idf ), 11 )
657
664
658
665
659
- class Word2VecTests (PySparkTestCase ):
666
+ class Word2VecTests (MLlibTestCase ):
660
667
def test_word2vec_setters (self ):
661
- data = [
662
- ["I" , "have" , "a" , "pen" ],
663
- ["I" , "like" , "soccer" , "very" , "much" ],
664
- ["I" , "live" , "in" , "Tokyo" ]
665
- ]
666
668
model = Word2Vec () \
667
669
.setVectorSize (2 ) \
668
670
.setLearningRate (0.01 ) \
@@ -696,3 +698,4 @@ def test_word2vec_get_vectors(self):
696
698
unittest .main ()
697
699
if not _have_scipy :
698
700
print "NOTE: SciPy tests were skipped as it does not seem to be installed"
701
+ sc .stop ()
0 commit comments