Commit

fix
huaxingao committed Jul 21, 2020
1 parent 86e0579 commit 1586e30
Showing 9 changed files with 192 additions and 59 deletions.
@@ -64,7 +64,12 @@ trait DefaultReadWriteTest extends TempDirectory { self: Suite =>
       case (Array(values), Array(newValues)) =>
         assert(values === newValues, s"Values do not match on param ${p.name}.")
       case (value, newValue) =>
-        assert(value === newValue, s"Values do not match on param ${p.name}.")
+        if (value.isInstanceOf[Double] && value.asInstanceOf[Double].isNaN) {
+          assert(newValue.isInstanceOf[Double] && newValue.asInstanceOf[Double].isNaN,
+            s"Values do not match on param ${p.name}.")
+        } else {
+          assert(value === newValue, s"Values do not match on param ${p.name}.")
+        }
     }
   } else {
     assert(!newInstance.isDefined(p), s"Param ${p.name} shouldn't be defined.")
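Why this change: `Double.NaN` never compares equal to itself, so a param whose saved default is NaN would fail the plain `===` assertion after a save/load round trip. A minimal Python sketch of the same NaN-aware comparison (the `params_match` helper is illustrative, not part of this patch):

```python
import math

def params_match(value, new_value):
    # NaN compares unequal to everything, including itself, so a value of
    # float("nan") needs an explicit NaN-ness check, mirroring the
    # isInstanceOf[Double]/isNaN branch in the Scala test above.
    if isinstance(value, float) and math.isnan(value):
        return isinstance(new_value, float) and math.isnan(new_value)
    return value == new_value

assert float("nan") != float("nan")   # the motivating quirk
assert params_match(float("nan"), float("nan"))
assert params_match(0.5, 0.5)
```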
57 changes: 41 additions & 16 deletions python/pyspark/ml/classification.py
@@ -513,8 +513,8 @@ class _LinearSVCParams(_ClassifierParams, HasRegParam, HasMaxIter, HasFitInterce
                       " all predictions 0.0 and -Inf will make all predictions 1.0.",
                       typeConverter=TypeConverters.toFloat)
 
-    def __init__(self):
-        super(_LinearSVCParams, self).__init__()
+    def __init__(self, *args):
+        super(_LinearSVCParams, self).__init__(*args)
         self._setDefault(maxIter=100, regParam=0.0, tol=1e-6, fitIntercept=True,
                          standardization=True, threshold=0.0, aggregationDepth=2,
                          blockSize=1)
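This `*args` pattern repeats across every `_*Params` mixin in the diff below: each `__init__` now accepts positional arguments and forwards them through `super().__init__`, so Python's cooperative multiple inheritance can thread constructor arguments down the MRO instead of silently dropping them. A stripped-down sketch of the mechanics (the class bodies are illustrative stand-ins, not pyspark's real hierarchy):

```python
class Params(object):
    def __init__(self, *args):
        super(Params, self).__init__()
        self._defaults = {}

    def _setDefault(self, **kwargs):
        self._defaults.update(kwargs)


class _HasMaxIter(Params):
    def __init__(self, *args):
        # Forward *args so every mixin in the MRO sees the same arguments.
        super(_HasMaxIter, self).__init__(*args)


class _LinearSVCParams(_HasMaxIter):
    def __init__(self, *args):
        super(_LinearSVCParams, self).__init__(*args)
        self._setDefault(maxIter=100, regParam=0.0)


print(_LinearSVCParams()._defaults)  # {'maxIter': 100, 'regParam': 0.0}
```

Without the forwarding, any class that passed positional arguments up through one of these mixins would hit a `TypeError`, since the old `def __init__(self)` accepted none.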
@@ -587,6 +587,8 @@ class LinearSVC(_JavaClassifier, _LinearSVCParams, JavaMLWritable, JavaMLReadabl
     True
     >>> model.intercept == model2.intercept
     True
+    >>> model.transform(test0).take(1) == model2.transform(test0).take(1)
+    True
 
     .. versionadded:: 2.2.0
     """
@@ -820,8 +822,8 @@ class _LogisticRegressionParams(_ProbabilisticClassifierParams, HasRegParam,
                   "classes for multinomial regression.",
                   typeConverter=TypeConverters.toVector)
 
-    def __init__(self):
-        super(_LogisticRegressionParams, self).__init__()
+    def __init__(self, *args):
+        super(_LogisticRegressionParams, self).__init__(*args)
         self._setDefault(maxIter=100, regParam=0.0, tol=1E-6, threshold=0.5, family="auto",
                          blockSize=1)
 
@@ -1017,7 +1019,8 @@ class LogisticRegression(_JavaProbabilisticClassifier, _LogisticRegressionParams
     >>> blorModel.intercept == model2.intercept
     True
     >>> model2
     LogisticRegressionModel: uid=..., numClasses=2, numFeatures=2
+    >>> blorModel.transform(test0).take(1) == model2.transform(test0).take(1)
+    True
 
     .. versionadded:: 1.3.0
     """
@@ -1313,8 +1316,8 @@ class _DecisionTreeClassifierParams(_DecisionTreeParams, _TreeClassifierParams):
     Params for :py:class:`DecisionTreeClassifier` and :py:class:`DecisionTreeClassificationModel`.
     """
 
-    def __init__(self):
-        super(_DecisionTreeClassifierParams, self).__init__()
+    def __init__(self, *args):
+        super(_DecisionTreeClassifierParams, self).__init__(*args)
         self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                          maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
                          impurity="gini", leafCol="", minWeightFractionPerNode=0.0)
@@ -1384,7 +1387,8 @@ class DecisionTreeClassifier(_JavaProbabilisticClassifier, _DecisionTreeClassifi
     >>> model2 = DecisionTreeClassificationModel.load(model_path)
     >>> model.featureImportances == model2.featureImportances
     True
+    >>> model.transform(test0).take(1) == model2.transform(test0).take(1)
+    True
     >>> df3 = spark.createDataFrame([
     ...     (1.0, 0.2, Vectors.dense(1.0)),
     ...     (1.0, 0.8, Vectors.dense(1.0)),
@@ -1550,8 +1554,8 @@ class _RandomForestClassifierParams(_RandomForestParams, _TreeClassifierParams):
     Params for :py:class:`RandomForestClassifier` and :py:class:`RandomForestClassificationModel`.
     """
 
-    def __init__(self):
-        super(_RandomForestClassifierParams, self).__init__()
+    def __init__(self, *args):
+        super(_RandomForestClassifierParams, self).__init__(*args)
         self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                          maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
                          impurity="gini", numTrees=20, featureSubsetStrategy="auto",
@@ -1628,6 +1632,8 @@ class RandomForestClassifier(_JavaProbabilisticClassifier, _RandomForestClassifi
     >>> model2 = RandomForestClassificationModel.load(model_path)
     >>> model.featureImportances == model2.featureImportances
     True
+    >>> model.transform(test0).take(1) == model2.transform(test0).take(1)
+    True
 
     .. versionadded:: 1.4.0
     """
@@ -1893,8 +1899,8 @@ class _GBTClassifierParams(_GBTParams, _HasVarianceImpurity):
                      "Supported options: " + ", ".join(supportedLossTypes),
                      typeConverter=TypeConverters.toString)
 
-    def __init__(self):
-        super(_GBTClassifierParams, self).__init__()
+    def __init__(self, *args):
+        super(_GBTClassifierParams, self).__init__(*args)
         self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                          maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
                          lossType="logistic", maxIter=20, stepSize=0.1, subsamplingRate=1.0,
@@ -1992,6 +1998,8 @@ class GBTClassifier(_JavaProbabilisticClassifier, _GBTClassifierParams,
     True
     >>> model.treeWeights == model2.treeWeights
     True
+    >>> model.transform(test0).take(1) == model2.transform(test0).take(1)
+    True
     >>> model.trees
     [DecisionTreeRegressionModel...depth=..., DecisionTreeRegressionModel...]
     >>> validation = spark.createDataFrame([(0.0, Vectors.dense(-1.0),)],
@@ -2225,8 +2233,8 @@ class _NaiveBayesParams(_PredictorParams, HasWeightCol):
                       "and gaussian.",
                       typeConverter=TypeConverters.toString)
 
-    def __init__(self):
-        super(_NaiveBayesParams, self).__init__()
+    def __init__(self, *args):
+        super(_NaiveBayesParams, self).__init__(*args)
         self._setDefault(smoothing=1.0, modelType="multinomial")
 
     @since("1.5.0")
@@ -2312,6 +2320,8 @@ class NaiveBayes(_JavaProbabilisticClassifier, _NaiveBayesParams, HasThresholds,
     True
     >>> model.theta == model2.theta
     True
+    >>> model.transform(test0).take(1) == model2.transform(test0).take(1)
+    True
     >>> nb = nb.setThresholds([0.01, 10.00])
     >>> model3 = nb.fit(df)
     >>> result = model3.transform(test0).head()
@@ -2438,8 +2448,8 @@ class _MultilayerPerceptronParams(_ProbabilisticClassifierParams, HasSeed, HasMa
     initialWeights = Param(Params._dummy(), "initialWeights", "The initial weights of the model.",
                            typeConverter=TypeConverters.toVector)
 
-    def __init__(self):
-        super(_MultilayerPerceptronParams, self).__init__()
+    def __init__(self, *args):
+        super(_MultilayerPerceptronParams, self).__init__(*args)
         self._setDefault(maxIter=100, tol=1E-6, blockSize=128, stepSize=0.03, solver="l-bfgs")
 
     @since("1.6.0")
@@ -2521,6 +2531,8 @@ class MultilayerPerceptronClassifier(_JavaProbabilisticClassifier, _MultilayerPe
     True
     >>> model.weights == model2.weights
     True
+    >>> model.transform(testDF).take(1) == model2.transform(testDF).take(1)
+    True
     >>> mlp2 = mlp2.setInitialWeights(list(range(0, 12)))
     >>> model3 = mlp2.fit(df)
     >>> model3.weights != model2.weights
@@ -2695,6 +2707,8 @@ class OneVsRest(Estimator, _OneVsRestParams, HasParallelism, JavaMLReadable, Jav
     >>> model2 = OneVsRestModel.load(model_path)
     >>> model2.transform(test0).head().newPrediction
     0.0
+    >>> model.transform(test0).take(1) == model2.transform(test0).take(1)
+    True
     >>> model.transform(test2).columns
     ['features', 'rawPrediction', 'newPrediction']
@@ -3120,6 +3134,17 @@ class FMClassifier(_JavaProbabilisticClassifier, _FactorizationMachinesParams, J
     DenseVector([14.8232])
     >>> model.factors
     DenseMatrix(1, 2, [0.0163, -0.0051], 1)
+    >>> model_path = temp_path + "/fm_model"
+    >>> model.save(model_path)
+    >>> model2 = FMClassificationModel.load(model_path)
+    >>> model2.intercept
+    -7.316665276826291
+    >>> model2.linear
+    DenseVector([14.8232])
+    >>> model2.factors
+    DenseMatrix(1, 2, [0.0163, -0.0051], 1)
+    >>> model.transform(test0).take(1) == model2.transform(test0).take(1)
+    True
 
     .. versionadded:: 3.0.0
     """
30 changes: 20 additions & 10 deletions python/pyspark/ml/clustering.py
@@ -109,8 +109,8 @@ class _GaussianMixtureParams(HasMaxIter, HasFeaturesCol, HasSeed, HasPredictionC
     k = Param(Params._dummy(), "k", "Number of independent Gaussians in the mixture model. " +
               "Must be > 1.", typeConverter=TypeConverters.toInt)
 
-    def __init__(self):
-        super(_GaussianMixtureParams, self).__init__()
+    def __init__(self, *args):
+        super(_GaussianMixtureParams, self).__init__(*args)
         self._setDefault(k=2, tol=0.01, maxIter=100, aggregationDepth=2, blockSize=1)
 
     @since("2.0.0")
@@ -325,6 +325,8 @@ class GaussianMixture(JavaEstimator, _GaussianMixtureParams, JavaMLWritable, Jav
     Row(mean=DenseVector([0.825, 0.8675]))
     >>> model2.gaussiansDF.select("cov").head()
     Row(cov=DenseMatrix(2, 2, [0.0056, -0.0051, -0.0051, 0.0046], False))
+    >>> model.transform(df).take(1) == model2.transform(df).take(1)
+    True
     >>> gm2.setWeightCol("weight")
     GaussianMixture...
@@ -503,8 +505,8 @@ class _KMeansParams(HasMaxIter, HasFeaturesCol, HasSeed, HasPredictionCol, HasTo
     initSteps = Param(Params._dummy(), "initSteps", "The number of steps for k-means|| " +
                       "initialization mode. Must be > 0.", typeConverter=TypeConverters.toInt)
 
-    def __init__(self):
-        super(_KMeansParams, self).__init__()
+    def __init__(self, *args):
+        super(_KMeansParams, self).__init__(*args)
         self._setDefault(k=2, initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20,
                          distanceMeasure="euclidean")
 
@@ -637,6 +639,8 @@ class KMeans(JavaEstimator, _KMeansParams, JavaMLWritable, JavaMLReadable):
     array([ True, True], dtype=bool)
     >>> model.clusterCenters()[1] == model2.clusterCenters()[1]
     array([ True, True], dtype=bool)
+    >>> model.transform(df).take(1) == model2.transform(df).take(1)
+    True
 
     .. versionadded:: 1.5.0
     """
@@ -760,8 +764,8 @@ class _BisectingKMeansParams(HasMaxIter, HasFeaturesCol, HasSeed, HasPredictionC
                       "proportion of points (if < 1.0) of a divisible cluster.",
                       typeConverter=TypeConverters.toFloat)
 
-    def __init__(self):
-        super(_BisectingKMeansParams, self).__init__()
+    def __init__(self, *args):
+        super(_BisectingKMeansParams, self).__init__(*args)
         self._setDefault(maxIter=20, k=4, minDivisibleClusterSize=1.0)
 
     @since("2.0.0")
@@ -914,6 +918,8 @@ class BisectingKMeans(JavaEstimator, _BisectingKMeansParams, JavaMLWritable, Jav
     array([ True, True], dtype=bool)
     >>> model.clusterCenters()[1] == model2.clusterCenters()[1]
     array([ True, True], dtype=bool)
+    >>> model.transform(df).take(1) == model2.transform(df).take(1)
+    True
 
     .. versionadded:: 2.0.0
     """
@@ -1072,8 +1078,8 @@ class _LDAParams(HasMaxIter, HasFeaturesCol, HasSeed, HasCheckpointInterval):
                           " partition is lost, so set this bit with care.",
                           TypeConverters.toBoolean)
 
-    def __init__(self):
-        super(_LDAParams, self).__init__()
+    def __init__(self, *args):
+        super(_LDAParams, self).__init__(*args)
         self._setDefault(maxIter=20, checkpointInterval=10,
                          k=10, optimizer="online", learningOffset=1024.0, learningDecay=0.51,
                          subsamplingRate=0.05, optimizeDocConcentration=True,
@@ -1389,6 +1395,8 @@ class LDA(JavaEstimator, _LDAParams, JavaMLReadable, JavaMLWritable):
     >>> local_model_path = temp_path + "/lda_local_model"
     >>> localModel.save(local_model_path)
     >>> sameLocalModel = LocalLDAModel.load(local_model_path)
+    >>> model.transform(df).take(1) == sameLocalModel.transform(df).take(1)
+    True
 
     .. versionadded:: 2.0.0
     """
@@ -1600,8 +1608,8 @@ class _PowerIterationClusteringParams(HasMaxIter, HasWeightCol):
                    "Name of the input column for destination vertex IDs.",
                    typeConverter=TypeConverters.toString)
 
-    def __init__(self):
-        super(_PowerIterationClusteringParams, self).__init__()
+    def __init__(self, *args):
+        super(_PowerIterationClusteringParams, self).__init__(*args)
         self._setDefault(k=2, maxIter=20, initMode="random", srcCol="src", dstCol="dst")
 
     @since("2.4.0")
@@ -1677,6 +1685,8 @@ class PowerIterationClustering(_PowerIterationClusteringParams, JavaParams, Java
     2
     >>> pic2.getMaxIter()
     40
+    >>> pic2.assignClusters(df).take(6) == assignments.take(6)
+    True
 
     .. versionadded:: 2.4.0
     """
(Diff truncated: the remaining 6 of the 9 changed files are not shown.)