Commit dbb06c6

[MINOR][ML] Fix some PySpark & SparkR flaky tests
## What changes were proposed in this pull request?

Some PySpark & SparkR tests run with a tiny dataset and a tiny ```maxIter```, which means the models have not converged. I don't think checking intermediate results during iteration makes sense, and these intermediate results may be fragile and unstable, so we should switch to checking the converged result. We hit this issue at #17746 when we upgraded breeze to 0.13.1.

## How was this patch tested?

Existing tests.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #17757 from yanboliang/flaky-test.
1 parent 7fecf51 commit dbb06c6
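As a concrete illustration of the pattern this commit moves toward, here is a minimal PySpark sketch (not part of the commit; the local SparkSession and the four-row dataset are illustrative stand-ins mirroring the new doctest data):

```python
from pyspark.sql import SparkSession, Row
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression

# Illustrative local session; the real doctests get `sc`/`spark` from the test harness.
spark = SparkSession.builder.master("local[2]").appName("converged-check").getOrCreate()

# Tiny dataset mirroring the rows added to the LogisticRegression doctest below.
df = spark.createDataFrame([
    Row(label=1.0, features=Vectors.dense(0.0, 5.0)),
    Row(label=0.0, features=Vectors.dense(1.0, 2.0)),
    Row(label=1.0, features=Vectors.dense(2.0, 1.0)),
    Row(label=0.0, features=Vectors.dense(3.0, 3.0)),
])

# Flaky pattern: maxIter=2 stops the optimizer mid-run, so the coefficients
# reflect solver internals and can shift across upgrades (e.g. breeze 0.13.1).
intermediate = LogisticRegression(maxIter=2, regParam=0.01).fit(df)

# Stable pattern: the default maxIter (100) and convergence tolerance let the
# solver finish, so the asserted values are reproducible.
converged = LogisticRegression(regParam=0.01).fit(df)

print(intermediate.coefficients)  # iteration-count dependent
print(converged.coefficients)     # converged; safe to pin in a test

spark.stop()
```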

File tree

2 files changed: +38 -50 lines changed

R/pkg/inst/tests/testthat/test_mllib_classification.R

Lines changed: 2 additions & 15 deletions
@@ -284,22 +284,11 @@ test_that("spark.mlp", {
                c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
 
   # test initialWeights
-  model <- spark.mlp(df, label ~ features, layers = c(4, 3), maxIter = 2, initialWeights =
+  model <- spark.mlp(df, label ~ features, layers = c(4, 3), initialWeights =
     c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9))
   mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
   expect_equal(head(mlpPredictions$prediction, 10),
-               c("1.0", "1.0", "2.0", "1.0", "2.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
-
-  model <- spark.mlp(df, label ~ features, layers = c(4, 3), maxIter = 2, initialWeights =
-    c(0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 5.0, 5.0, 5.0, 5.0, 9.0, 9.0, 9.0, 9.0, 9.0))
-  mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
-  expect_equal(head(mlpPredictions$prediction, 10),
-               c("1.0", "1.0", "2.0", "1.0", "2.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
-
-  model <- spark.mlp(df, label ~ features, layers = c(4, 3), maxIter = 2)
-  mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
-  expect_equal(head(mlpPredictions$prediction, 10),
-               c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "0.0", "0.0", "1.0", "0.0"))
+               c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
 
   # Test formula works well
   df <- suppressWarnings(createDataFrame(iris))
@@ -310,8 +299,6 @@ test_that("spark.mlp", {
   expect_equal(summary$numOfOutputs, 3)
   expect_equal(summary$layers, c(4, 3))
   expect_equal(length(summary$weights), 15)
-  expect_equal(head(summary$weights, 5), list(-0.5793153, -4.652961, 6.216155, -6.649478,
-               -10.51147), tolerance = 1e-3)
 })
 
 test_that("spark.naiveBayes", {

python/pyspark/ml/classification.py

Lines changed: 36 additions & 35 deletions
@@ -185,34 +185,33 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
     >>> from pyspark.sql import Row
     >>> from pyspark.ml.linalg import Vectors
     >>> bdf = sc.parallelize([
-    ...     Row(label=1.0, weight=2.0, features=Vectors.dense(1.0)),
-    ...     Row(label=0.0, weight=2.0, features=Vectors.sparse(1, [], []))]).toDF()
-    >>> blor = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight")
+    ...     Row(label=1.0, weight=1.0, features=Vectors.dense(0.0, 5.0)),
+    ...     Row(label=0.0, weight=2.0, features=Vectors.dense(1.0, 2.0)),
+    ...     Row(label=1.0, weight=3.0, features=Vectors.dense(2.0, 1.0)),
+    ...     Row(label=0.0, weight=4.0, features=Vectors.dense(3.0, 3.0))]).toDF()
+    >>> blor = LogisticRegression(regParam=0.01, weightCol="weight")
     >>> blorModel = blor.fit(bdf)
     >>> blorModel.coefficients
-    DenseVector([5.4...])
+    DenseVector([-1.080..., -0.646...])
     >>> blorModel.intercept
-    -2.63...
-    >>> mdf = sc.parallelize([
-    ...     Row(label=1.0, weight=2.0, features=Vectors.dense(1.0)),
-    ...     Row(label=0.0, weight=2.0, features=Vectors.sparse(1, [], [])),
-    ...     Row(label=2.0, weight=2.0, features=Vectors.dense(3.0))]).toDF()
-    >>> mlor = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight",
-    ...     family="multinomial")
+    3.112...
+    >>> data_path = "data/mllib/sample_multiclass_classification_data.txt"
+    >>> mdf = spark.read.format("libsvm").load(data_path)
+    >>> mlor = LogisticRegression(regParam=0.1, elasticNetParam=1.0, family="multinomial")
     >>> mlorModel = mlor.fit(mdf)
     >>> mlorModel.coefficientMatrix
-    DenseMatrix(3, 1, [-2.3..., 0.2..., 2.1...], 1)
+    SparseMatrix(3, 4, [0, 1, 2, 3], [3, 2, 1], [1.87..., -2.75..., -0.50...], 1)
     >>> mlorModel.interceptVector
-    DenseVector([2.1..., 0.6..., -2.8...])
-    >>> test0 = sc.parallelize([Row(features=Vectors.dense(-1.0))]).toDF()
+    DenseVector([0.04..., -0.42..., 0.37...])
+    >>> test0 = sc.parallelize([Row(features=Vectors.dense(-1.0, 1.0))]).toDF()
     >>> result = blorModel.transform(test0).head()
     >>> result.prediction
-    0.0
+    1.0
     >>> result.probability
-    DenseVector([0.99..., 0.00...])
+    DenseVector([0.02..., 0.97...])
     >>> result.rawPrediction
-    DenseVector([8.12..., -8.12...])
-    >>> test1 = sc.parallelize([Row(features=Vectors.sparse(1, [0], [1.0]))]).toDF()
+    DenseVector([-3.54..., 3.54...])
+    >>> test1 = sc.parallelize([Row(features=Vectors.sparse(2, [0], [1.0]))]).toDF()
     >>> blorModel.transform(test1).head().prediction
     1.0
     >>> blor.setParams("vector")
@@ -222,8 +221,8 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
     >>> lr_path = temp_path + "/lr"
     >>> blor.save(lr_path)
     >>> lr2 = LogisticRegression.load(lr_path)
-    >>> lr2.getMaxIter()
-    5
+    >>> lr2.getRegParam()
+    0.01
     >>> model_path = temp_path + "/lr_model"
     >>> blorModel.save(model_path)
     >>> model2 = LogisticRegressionModel.load(model_path)
@@ -1480,31 +1479,33 @@ class OneVsRest(Estimator, OneVsRestParams, MLReadable, MLWritable):
 
     >>> from pyspark.sql import Row
     >>> from pyspark.ml.linalg import Vectors
-    >>> df = sc.parallelize([
-    ...     Row(label=0.0, features=Vectors.dense(1.0, 0.8)),
-    ...     Row(label=1.0, features=Vectors.sparse(2, [], [])),
-    ...     Row(label=2.0, features=Vectors.dense(0.5, 0.5))]).toDF()
-    >>> lr = LogisticRegression(maxIter=5, regParam=0.01)
+    >>> data_path = "data/mllib/sample_multiclass_classification_data.txt"
+    >>> df = spark.read.format("libsvm").load(data_path)
+    >>> lr = LogisticRegression(regParam=0.01)
     >>> ovr = OneVsRest(classifier=lr)
     >>> model = ovr.fit(df)
-    >>> [x.coefficients for x in model.models]
-    [DenseVector([4.9791, 2.426]), DenseVector([-4.1198, -5.9326]), DenseVector([-3.314, 5.2423])]
+    >>> model.models[0].coefficients
+    DenseVector([0.5..., -1.0..., 3.4..., 4.2...])
+    >>> model.models[1].coefficients
+    DenseVector([-2.1..., 3.1..., -2.6..., -2.3...])
+    >>> model.models[2].coefficients
+    DenseVector([0.3..., -3.4..., 1.0..., -1.1...])
     >>> [x.intercept for x in model.models]
-    [-5.06544..., 2.30341..., -1.29133...]
-    >>> test0 = sc.parallelize([Row(features=Vectors.dense(-1.0, 0.0))]).toDF()
+    [-2.7..., -2.5..., -1.3...]
+    >>> test0 = sc.parallelize([Row(features=Vectors.dense(-1.0, 0.0, 1.0, 1.0))]).toDF()
     >>> model.transform(test0).head().prediction
-    1.0
-    >>> test1 = sc.parallelize([Row(features=Vectors.sparse(2, [0], [1.0]))]).toDF()
-    >>> model.transform(test1).head().prediction
     0.0
-    >>> test2 = sc.parallelize([Row(features=Vectors.dense(0.5, 0.4))]).toDF()
-    >>> model.transform(test2).head().prediction
+    >>> test1 = sc.parallelize([Row(features=Vectors.sparse(4, [0], [1.0]))]).toDF()
+    >>> model.transform(test1).head().prediction
     2.0
+    >>> test2 = sc.parallelize([Row(features=Vectors.dense(0.5, 0.4, 0.3, 0.2))]).toDF()
+    >>> model.transform(test2).head().prediction
+    0.0
     >>> model_path = temp_path + "/ovr_model"
     >>> model.save(model_path)
     >>> model2 = OneVsRestModel.load(model_path)
     >>> model2.transform(test0).head().prediction
-    1.0
+    0.0
 
     .. versionadded:: 2.0.0
     """
