Skip to content

Commit

Permalink
[SPARK-32310][ML][PYSPARK][3.0] ML params default value parity
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?
Backport the changes to 3.0.
Set param default values in trait Params for feature and tuning in both Scala and Python.

### Why are the changes needed?
Make ML use the same default param values for an estimator and its corresponding transformer, and for Scala and Python.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Existing and modified tests

Closes #29159 from huaxingao/set_default_3.0.

Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Huaxin Gao <huaxing@us.ibm.com>
  • Loading branch information
huaxingao committed Jul 24, 2020
1 parent f50432f commit 8a52bda
Show file tree
Hide file tree
Showing 38 changed files with 368 additions and 238 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setFactorSize(value: Int): this.type = set(factorSize, value)
setDefault(factorSize -> 8)

/**
* Set whether to fit intercept term.
Expand All @@ -96,7 +95,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
setDefault(fitIntercept -> true)

/**
* Set whether to fit linear term.
Expand All @@ -106,7 +104,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setFitLinear(value: Boolean): this.type = set(fitLinear, value)
setDefault(fitLinear -> true)

/**
* Set the L2 regularization parameter.
Expand All @@ -116,7 +113,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setRegParam(value: Double): this.type = set(regParam, value)
setDefault(regParam -> 0.0)

/**
* Set the mini-batch fraction parameter.
Expand All @@ -126,7 +122,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setMiniBatchFraction(value: Double): this.type = set(miniBatchFraction, value)
setDefault(miniBatchFraction -> 1.0)

/**
* Set the standard deviation of initial coefficients.
Expand All @@ -136,7 +131,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setInitStd(value: Double): this.type = set(initStd, value)
setDefault(initStd -> 0.01)

/**
* Set the maximum number of iterations.
Expand All @@ -146,7 +140,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setMaxIter(value: Int): this.type = set(maxIter, value)
setDefault(maxIter -> 100)

/**
* Set the initial step size for the first step (like learning rate).
Expand All @@ -156,7 +149,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setStepSize(value: Double): this.type = set(stepSize, value)
setDefault(stepSize -> 1.0)

/**
* Set the convergence tolerance of iterations.
Expand All @@ -166,7 +158,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setTol(value: Double): this.type = set(tol, value)
setDefault(tol -> 1E-6)

/**
* Set the solver algorithm used for optimization.
Expand All @@ -177,7 +168,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setSolver(value: String): this.type = set(solver, value)
setDefault(solver -> AdamW)

/**
* Set the random seed for weight initialization.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ private[classification] trait LinearSVCParams extends ClassifierParams with HasR
*/
final override val threshold: DoubleParam = new DoubleParam(this, "threshold",
"threshold in binary classification prediction applied to rawPrediction")

setDefault(regParam -> 0.0, maxIter -> 100, fitIntercept -> true, tol -> 1E-6,
standardization -> true, threshold -> 0.0, aggregationDepth -> 2)
}

/**
Expand Down Expand Up @@ -81,7 +84,6 @@ class LinearSVC @Since("2.2.0") (
*/
@Since("2.2.0")
def setRegParam(value: Double): this.type = set(regParam, value)
setDefault(regParam -> 0.0)

/**
* Set the maximum number of iterations.
Expand All @@ -91,7 +93,6 @@ class LinearSVC @Since("2.2.0") (
*/
@Since("2.2.0")
def setMaxIter(value: Int): this.type = set(maxIter, value)
setDefault(maxIter -> 100)

/**
* Whether to fit an intercept term.
Expand All @@ -101,7 +102,6 @@ class LinearSVC @Since("2.2.0") (
*/
@Since("2.2.0")
def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
setDefault(fitIntercept -> true)

/**
* Set the convergence tolerance of iterations.
Expand All @@ -112,7 +112,6 @@ class LinearSVC @Since("2.2.0") (
*/
@Since("2.2.0")
def setTol(value: Double): this.type = set(tol, value)
setDefault(tol -> 1E-6)

/**
* Whether to standardize the training features before fitting the model.
Expand All @@ -122,7 +121,6 @@ class LinearSVC @Since("2.2.0") (
*/
@Since("2.2.0")
def setStandardization(value: Boolean): this.type = set(standardization, value)
setDefault(standardization -> true)

/**
* Set the value of param [[weightCol]].
Expand All @@ -141,7 +139,6 @@ class LinearSVC @Since("2.2.0") (
*/
@Since("2.2.0")
def setThreshold(value: Double): this.type = set(threshold, value)
setDefault(threshold -> 0.0)

/**
* Suggested depth for treeAggregate (greater than or equal to 2).
Expand All @@ -153,7 +150,6 @@ class LinearSVC @Since("2.2.0") (
*/
@Since("2.2.0")
def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value)
setDefault(aggregationDepth -> 2)

@Since("2.2.0")
override def copy(extra: ParamMap): LinearSVC = defaultCopy(extra)
Expand Down Expand Up @@ -300,7 +296,6 @@ class LinearSVCModel private[classification] (

@Since("2.2.0")
def setThreshold(value: Double): this.type = set(threshold, value)
setDefault(threshold, 0.0)

private val margin: Vector => Double = (features) => {
BLAS.dot(features, coefficients) + intercept
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,10 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
isSet(lowerBoundsOnIntercepts) || isSet(upperBoundsOnIntercepts)
}

setDefault(regParam -> 0.0, elasticNetParam -> 0.0, maxIter -> 100, tol -> 1E-6,
fitIntercept -> true, family -> "auto", standardization -> true, threshold -> 0.5,
aggregationDepth -> 2)

override protected def validateAndTransformSchema(
schema: StructType,
fitting: Boolean,
Expand Down Expand Up @@ -290,7 +294,6 @@ class LogisticRegression @Since("1.2.0") (
*/
@Since("1.2.0")
def setRegParam(value: Double): this.type = set(regParam, value)
setDefault(regParam -> 0.0)

/**
* Set the ElasticNet mixing parameter.
Expand All @@ -306,7 +309,6 @@ class LogisticRegression @Since("1.2.0") (
*/
@Since("1.4.0")
def setElasticNetParam(value: Double): this.type = set(elasticNetParam, value)
setDefault(elasticNetParam -> 0.0)

/**
* Set the maximum number of iterations.
Expand All @@ -316,7 +318,6 @@ class LogisticRegression @Since("1.2.0") (
*/
@Since("1.2.0")
def setMaxIter(value: Int): this.type = set(maxIter, value)
setDefault(maxIter -> 100)

/**
* Set the convergence tolerance of iterations.
Expand All @@ -327,7 +328,6 @@ class LogisticRegression @Since("1.2.0") (
*/
@Since("1.4.0")
def setTol(value: Double): this.type = set(tol, value)
setDefault(tol -> 1E-6)

/**
* Whether to fit an intercept term.
Expand All @@ -337,7 +337,6 @@ class LogisticRegression @Since("1.2.0") (
*/
@Since("1.4.0")
def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
setDefault(fitIntercept -> true)

/**
* Sets the value of param [[family]].
Expand All @@ -347,7 +346,6 @@ class LogisticRegression @Since("1.2.0") (
*/
@Since("2.1.0")
def setFamily(value: String): this.type = set(family, value)
setDefault(family -> "auto")

/**
* Whether to standardize the training features before fitting the model.
Expand All @@ -361,11 +359,9 @@ class LogisticRegression @Since("1.2.0") (
*/
@Since("1.5.0")
def setStandardization(value: Boolean): this.type = set(standardization, value)
setDefault(standardization -> true)

@Since("1.5.0")
override def setThreshold(value: Double): this.type = super.setThreshold(value)
setDefault(threshold -> 0.5)

@Since("1.5.0")
override def getThreshold: Double = super.getThreshold
Expand Down Expand Up @@ -396,7 +392,6 @@ class LogisticRegression @Since("1.2.0") (
*/
@Since("2.1.0")
def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value)
setDefault(aggregationDepth -> 2)

/**
* Set the lower bounds on coefficients if fitting under bound constrained optimization.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ private[classification] trait NaiveBayesParams extends PredictorParams with HasW

/** @group getParam */
final def getModelType: String = $(modelType)

setDefault(smoothing -> 1.0, modelType -> NaiveBayes.Multinomial)
}

// scalastyle:off line.size.limit
Expand Down Expand Up @@ -106,7 +108,6 @@ class NaiveBayes @Since("1.5.0") (
*/
@Since("1.5.0")
def setSmoothing(value: Double): this.type = set(smoothing, value)
setDefault(smoothing -> 1.0)

/**
* Set the model type using a string (case-sensitive).
Expand All @@ -116,7 +117,6 @@ class NaiveBayes @Since("1.5.0") (
*/
@Since("1.5.0")
def setModelType(value: String): this.type = set(modelType, value)
setDefault(modelType -> Multinomial)

/**
* Sets the value of param [[weightCol]].
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ private[clustering] trait BisectingKMeansParams extends Params with HasMaxIter
@Since("2.0.0")
def getMinDivisibleClusterSize: Double = $(minDivisibleClusterSize)

setDefault(k -> 4, maxIter -> 20, minDivisibleClusterSize -> 1.0)

/**
* Validates and transforms the input schema.
* @param schema input schema
Expand Down Expand Up @@ -225,11 +227,6 @@ class BisectingKMeans @Since("2.0.0") (
@Since("2.0.0") override val uid: String)
extends Estimator[BisectingKMeansModel] with BisectingKMeansParams with DefaultParamsWritable {

setDefault(
k -> 4,
maxIter -> 20,
minDivisibleClusterSize -> 1.0)

@Since("2.0.0")
override def copy(extra: ParamMap): BisectingKMeans = defaultCopy(extra)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ private[clustering] trait GaussianMixtureParams extends Params with HasMaxIter w
@Since("2.0.0")
def getK: Int = $(k)

setDefault(k -> 2, maxIter -> 100, tol -> 0.01)

/**
* Validates and transforms the input schema.
*
Expand Down Expand Up @@ -323,11 +325,6 @@ class GaussianMixture @Since("2.0.0") (
@Since("2.0.0") override val uid: String)
extends Estimator[GaussianMixtureModel] with GaussianMixtureParams with DefaultParamsWritable {

setDefault(
k -> 2,
maxIter -> 100,
tol -> 0.01)

@Since("2.0.0")
override def copy(extra: ParamMap): GaussianMixture = defaultCopy(extra)

Expand Down
11 changes: 3 additions & 8 deletions mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFe
@Since("1.5.0")
def getInitSteps: Int = $(initSteps)

setDefault(k -> 2, maxIter -> 20, initMode -> MLlibKMeans.K_MEANS_PARALLEL, initSteps -> 2,
tol -> 1e-4, distanceMeasure -> DistanceMeasure.EUCLIDEAN)

/**
* Validates and transforms the input schema.
* @param schema input schema
Expand Down Expand Up @@ -270,14 +273,6 @@ class KMeans @Since("1.5.0") (
@Since("1.5.0") override val uid: String)
extends Estimator[KMeansModel] with KMeansParams with DefaultParamsWritable {

setDefault(
k -> 2,
maxIter -> 20,
initMode -> MLlibKMeans.K_MEANS_PARALLEL,
initSteps -> 2,
tol -> 1e-4,
distanceMeasure -> DistanceMeasure.EUCLIDEAN)

@Since("1.5.0")
override def copy(extra: ParamMap): KMeans = defaultCopy(extra)

Expand Down
11 changes: 5 additions & 6 deletions mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,6 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
" with estimates of the topic mixture distribution for each document (often called \"theta\"" +
" in the literature). Returns a vector of zeros for an empty document.")

setDefault(topicDistributionCol -> "topicDistribution")

/** @group getParam */
@Since("1.6.0")
def getTopicDistributionCol: String = $(topicDistributionCol)
Expand Down Expand Up @@ -315,6 +313,11 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
@Since("2.0.0")
def getKeepLastCheckpoint: Boolean = $(keepLastCheckpoint)

setDefault(maxIter -> 20, k -> 10, optimizer -> "online", checkpointInterval -> 10,
learningOffset -> 1024, learningDecay -> 0.51, subsamplingRate -> 0.05,
optimizeDocConcentration -> true, keepLastCheckpoint -> true,
topicDistributionCol -> "topicDistribution")

/**
* Validates and transforms the input schema.
*
Expand Down Expand Up @@ -863,10 +866,6 @@ class LDA @Since("1.6.0") (
@Since("1.6.0")
def this() = this(Identifiable.randomUID("lda"))

setDefault(maxIter -> 20, k -> 10, optimizer -> "online", checkpointInterval -> 10,
learningOffset -> 1024, learningDecay -> 0.51, subsamplingRate -> 0.05,
optimizeDocConcentration -> true, keepLastCheckpoint -> true)

/**
* The features for LDA should be a `Vector` representing the word counts in a document.
* The vector should be of length vocabSize, with counts for each term (word).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ private[clustering] trait PowerIterationClusteringParams extends Params with Has
@Since("2.4.0")
def getDstCol: String = $(dstCol)

setDefault(srcCol -> "src", dstCol -> "dst")
setDefault(srcCol -> "src", dstCol -> "dst", k -> 2, maxIter -> 20, initMode -> "random")
}

/**
Expand All @@ -111,11 +111,6 @@ class PowerIterationClustering private[clustering] (
@Since("2.4.0") override val uid: String)
extends PowerIterationClusteringParams with DefaultParamsWritable {

setDefault(
k -> 2,
maxIter -> 20,
initMode -> "random")

@Since("2.4.0")
def this() = this(Identifiable.randomUID("PowerIterationClustering"))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,6 @@ class BinaryClassificationEvaluator @Since("1.4.0") (@Since("1.4.0") override va
@Since("3.0.0")
def setNumBins(value: Int): this.type = set(numBins, value)

setDefault(numBins -> 1000)

/** @group setParam */
@Since("1.5.0")
def setRawPredictionCol(value: String): this.type = set(rawPredictionCol, value)
Expand All @@ -94,7 +92,7 @@ class BinaryClassificationEvaluator @Since("1.4.0") (@Since("1.4.0") override va
@Since("3.0.0")
def setWeightCol(value: String): this.type = set(weightCol, value)

setDefault(metricName -> "areaUnderROC")
setDefault(metricName -> "areaUnderROC", numBins -> 1000)

@Since("2.0.0")
override def evaluate(dataset: Dataset[_]): Double = {
Expand Down
Loading

0 comments on commit 8a52bda

Please sign in to comment.