Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-32310][ML][PySpark][3.0] ML params default value parity #29159

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setFactorSize(value: Int): this.type = set(factorSize, value)
setDefault(factorSize -> 8)

/**
* Set whether to fit intercept term.
Expand All @@ -96,7 +95,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
setDefault(fitIntercept -> true)

/**
* Set whether to fit linear term.
Expand All @@ -106,7 +104,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setFitLinear(value: Boolean): this.type = set(fitLinear, value)
setDefault(fitLinear -> true)

/**
* Set the L2 regularization parameter.
Expand All @@ -116,7 +113,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setRegParam(value: Double): this.type = set(regParam, value)
setDefault(regParam -> 0.0)

/**
* Set the mini-batch fraction parameter.
Expand All @@ -126,7 +122,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setMiniBatchFraction(value: Double): this.type = set(miniBatchFraction, value)
setDefault(miniBatchFraction -> 1.0)

/**
* Set the standard deviation of initial coefficients.
Expand All @@ -136,7 +131,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setInitStd(value: Double): this.type = set(initStd, value)
setDefault(initStd -> 0.01)

/**
* Set the maximum number of iterations.
Expand All @@ -146,7 +140,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setMaxIter(value: Int): this.type = set(maxIter, value)
setDefault(maxIter -> 100)

/**
* Set the initial step size for the first step (like learning rate).
Expand All @@ -156,7 +149,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setStepSize(value: Double): this.type = set(stepSize, value)
setDefault(stepSize -> 1.0)

/**
* Set the convergence tolerance of iterations.
Expand All @@ -166,7 +158,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setTol(value: Double): this.type = set(tol, value)
setDefault(tol -> 1E-6)

/**
* Set the solver algorithm used for optimization.
Expand All @@ -177,7 +168,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setSolver(value: String): this.type = set(solver, value)
setDefault(solver -> AdamW)

/**
* Set the random seed for weight initialization.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ private[classification] trait LinearSVCParams extends ClassifierParams with HasR
*/
final override val threshold: DoubleParam = new DoubleParam(this, "threshold",
"threshold in binary classification prediction applied to rawPrediction")

setDefault(regParam -> 0.0, maxIter -> 100, fitIntercept -> true, tol -> 1E-6,
standardization -> true, threshold -> 0.0, aggregationDepth -> 2)
}

/**
Expand Down Expand Up @@ -81,7 +84,6 @@ class LinearSVC @Since("2.2.0") (
*/
@Since("2.2.0")
def setRegParam(value: Double): this.type = set(regParam, value)
setDefault(regParam -> 0.0)

/**
* Set the maximum number of iterations.
Expand All @@ -91,7 +93,6 @@ class LinearSVC @Since("2.2.0") (
*/
@Since("2.2.0")
def setMaxIter(value: Int): this.type = set(maxIter, value)
setDefault(maxIter -> 100)

/**
* Whether to fit an intercept term.
Expand All @@ -101,7 +102,6 @@ class LinearSVC @Since("2.2.0") (
*/
@Since("2.2.0")
def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
setDefault(fitIntercept -> true)

/**
* Set the convergence tolerance of iterations.
Expand All @@ -112,7 +112,6 @@ class LinearSVC @Since("2.2.0") (
*/
@Since("2.2.0")
def setTol(value: Double): this.type = set(tol, value)
setDefault(tol -> 1E-6)

/**
* Whether to standardize the training features before fitting the model.
Expand All @@ -122,7 +121,6 @@ class LinearSVC @Since("2.2.0") (
*/
@Since("2.2.0")
def setStandardization(value: Boolean): this.type = set(standardization, value)
setDefault(standardization -> true)

/**
* Set the value of param [[weightCol]].
Expand All @@ -141,7 +139,6 @@ class LinearSVC @Since("2.2.0") (
*/
@Since("2.2.0")
def setThreshold(value: Double): this.type = set(threshold, value)
setDefault(threshold -> 0.0)

/**
* Suggested depth for treeAggregate (greater than or equal to 2).
Expand All @@ -153,7 +150,6 @@ class LinearSVC @Since("2.2.0") (
*/
@Since("2.2.0")
def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value)
setDefault(aggregationDepth -> 2)

@Since("2.2.0")
override def copy(extra: ParamMap): LinearSVC = defaultCopy(extra)
Expand Down Expand Up @@ -300,7 +296,6 @@ class LinearSVCModel private[classification] (

@Since("2.2.0")
def setThreshold(value: Double): this.type = set(threshold, value)
setDefault(threshold, 0.0)

private val margin: Vector => Double = (features) => {
BLAS.dot(features, coefficients) + intercept
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,10 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
isSet(lowerBoundsOnIntercepts) || isSet(upperBoundsOnIntercepts)
}

setDefault(regParam -> 0.0, elasticNetParam -> 0.0, maxIter -> 100, tol -> 1E-6,
fitIntercept -> true, family -> "auto", standardization -> true, threshold -> 0.5,
aggregationDepth -> 2)

override protected def validateAndTransformSchema(
schema: StructType,
fitting: Boolean,
Expand Down Expand Up @@ -290,7 +294,6 @@ class LogisticRegression @Since("1.2.0") (
*/
@Since("1.2.0")
def setRegParam(value: Double): this.type = set(regParam, value)
setDefault(regParam -> 0.0)

/**
* Set the ElasticNet mixing parameter.
Expand All @@ -306,7 +309,6 @@ class LogisticRegression @Since("1.2.0") (
*/
@Since("1.4.0")
def setElasticNetParam(value: Double): this.type = set(elasticNetParam, value)
setDefault(elasticNetParam -> 0.0)

/**
* Set the maximum number of iterations.
Expand All @@ -316,7 +318,6 @@ class LogisticRegression @Since("1.2.0") (
*/
@Since("1.2.0")
def setMaxIter(value: Int): this.type = set(maxIter, value)
setDefault(maxIter -> 100)

/**
* Set the convergence tolerance of iterations.
Expand All @@ -327,7 +328,6 @@ class LogisticRegression @Since("1.2.0") (
*/
@Since("1.4.0")
def setTol(value: Double): this.type = set(tol, value)
setDefault(tol -> 1E-6)

/**
* Whether to fit an intercept term.
Expand All @@ -337,7 +337,6 @@ class LogisticRegression @Since("1.2.0") (
*/
@Since("1.4.0")
def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
setDefault(fitIntercept -> true)

/**
* Sets the value of param [[family]].
Expand All @@ -347,7 +346,6 @@ class LogisticRegression @Since("1.2.0") (
*/
@Since("2.1.0")
def setFamily(value: String): this.type = set(family, value)
setDefault(family -> "auto")

/**
* Whether to standardize the training features before fitting the model.
Expand All @@ -361,11 +359,9 @@ class LogisticRegression @Since("1.2.0") (
*/
@Since("1.5.0")
def setStandardization(value: Boolean): this.type = set(standardization, value)
setDefault(standardization -> true)

@Since("1.5.0")
override def setThreshold(value: Double): this.type = super.setThreshold(value)
setDefault(threshold -> 0.5)

@Since("1.5.0")
override def getThreshold: Double = super.getThreshold
Expand Down Expand Up @@ -396,7 +392,6 @@ class LogisticRegression @Since("1.2.0") (
*/
@Since("2.1.0")
def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value)
setDefault(aggregationDepth -> 2)

/**
* Set the lower bounds on coefficients if fitting under bound constrained optimization.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ private[classification] trait NaiveBayesParams extends PredictorParams with HasW

/** @group getParam */
final def getModelType: String = $(modelType)

setDefault(smoothing -> 1.0, modelType -> NaiveBayes.Multinomial)
}

// scalastyle:off line.size.limit
Expand Down Expand Up @@ -106,7 +108,6 @@ class NaiveBayes @Since("1.5.0") (
*/
@Since("1.5.0")
def setSmoothing(value: Double): this.type = set(smoothing, value)
setDefault(smoothing -> 1.0)

/**
* Set the model type using a string (case-sensitive).
Expand All @@ -116,7 +117,6 @@ class NaiveBayes @Since("1.5.0") (
*/
@Since("1.5.0")
def setModelType(value: String): this.type = set(modelType, value)
setDefault(modelType -> Multinomial)

/**
* Sets the value of param [[weightCol]].
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ private[clustering] trait BisectingKMeansParams extends Params with HasMaxIter
@Since("2.0.0")
def getMinDivisibleClusterSize: Double = $(minDivisibleClusterSize)

setDefault(k -> 4, maxIter -> 20, minDivisibleClusterSize -> 1.0)

/**
* Validates and transforms the input schema.
* @param schema input schema
Expand Down Expand Up @@ -225,11 +227,6 @@ class BisectingKMeans @Since("2.0.0") (
@Since("2.0.0") override val uid: String)
extends Estimator[BisectingKMeansModel] with BisectingKMeansParams with DefaultParamsWritable {

setDefault(
k -> 4,
maxIter -> 20,
minDivisibleClusterSize -> 1.0)

@Since("2.0.0")
override def copy(extra: ParamMap): BisectingKMeans = defaultCopy(extra)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ private[clustering] trait GaussianMixtureParams extends Params with HasMaxIter w
@Since("2.0.0")
def getK: Int = $(k)

setDefault(k -> 2, maxIter -> 100, tol -> 0.01)

/**
* Validates and transforms the input schema.
*
Expand Down Expand Up @@ -323,11 +325,6 @@ class GaussianMixture @Since("2.0.0") (
@Since("2.0.0") override val uid: String)
extends Estimator[GaussianMixtureModel] with GaussianMixtureParams with DefaultParamsWritable {

setDefault(
k -> 2,
maxIter -> 100,
tol -> 0.01)

@Since("2.0.0")
override def copy(extra: ParamMap): GaussianMixture = defaultCopy(extra)

Expand Down
11 changes: 3 additions & 8 deletions mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFe
@Since("1.5.0")
def getInitSteps: Int = $(initSteps)

setDefault(k -> 2, maxIter -> 20, initMode -> MLlibKMeans.K_MEANS_PARALLEL, initSteps -> 2,
tol -> 1e-4, distanceMeasure -> DistanceMeasure.EUCLIDEAN)

/**
* Validates and transforms the input schema.
* @param schema input schema
Expand Down Expand Up @@ -270,14 +273,6 @@ class KMeans @Since("1.5.0") (
@Since("1.5.0") override val uid: String)
extends Estimator[KMeansModel] with KMeansParams with DefaultParamsWritable {

setDefault(
k -> 2,
maxIter -> 20,
initMode -> MLlibKMeans.K_MEANS_PARALLEL,
initSteps -> 2,
tol -> 1e-4,
distanceMeasure -> DistanceMeasure.EUCLIDEAN)

@Since("1.5.0")
override def copy(extra: ParamMap): KMeans = defaultCopy(extra)

Expand Down
11 changes: 5 additions & 6 deletions mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,6 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
" with estimates of the topic mixture distribution for each document (often called \"theta\"" +
" in the literature). Returns a vector of zeros for an empty document.")

setDefault(topicDistributionCol -> "topicDistribution")

/** @group getParam */
@Since("1.6.0")
def getTopicDistributionCol: String = $(topicDistributionCol)
Expand Down Expand Up @@ -315,6 +313,11 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
@Since("2.0.0")
def getKeepLastCheckpoint: Boolean = $(keepLastCheckpoint)

setDefault(maxIter -> 20, k -> 10, optimizer -> "online", checkpointInterval -> 10,
learningOffset -> 1024, learningDecay -> 0.51, subsamplingRate -> 0.05,
optimizeDocConcentration -> true, keepLastCheckpoint -> true,
topicDistributionCol -> "topicDistribution")

/**
* Validates and transforms the input schema.
*
Expand Down Expand Up @@ -863,10 +866,6 @@ class LDA @Since("1.6.0") (
@Since("1.6.0")
def this() = this(Identifiable.randomUID("lda"))

setDefault(maxIter -> 20, k -> 10, optimizer -> "online", checkpointInterval -> 10,
learningOffset -> 1024, learningDecay -> 0.51, subsamplingRate -> 0.05,
optimizeDocConcentration -> true, keepLastCheckpoint -> true)

/**
* The features for LDA should be a `Vector` representing the word counts in a document.
* The vector should be of length vocabSize, with counts for each term (word).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ private[clustering] trait PowerIterationClusteringParams extends Params with Has
@Since("2.4.0")
def getDstCol: String = $(dstCol)

setDefault(srcCol -> "src", dstCol -> "dst")
setDefault(srcCol -> "src", dstCol -> "dst", k -> 2, maxIter -> 20, initMode -> "random")
}

/**
Expand All @@ -111,11 +111,6 @@ class PowerIterationClustering private[clustering] (
@Since("2.4.0") override val uid: String)
extends PowerIterationClusteringParams with DefaultParamsWritable {

setDefault(
k -> 2,
maxIter -> 20,
initMode -> "random")

@Since("2.4.0")
def this() = this(Identifiable.randomUID("PowerIterationClustering"))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,6 @@ class BinaryClassificationEvaluator @Since("1.4.0") (@Since("1.4.0") override va
@Since("3.0.0")
def setNumBins(value: Int): this.type = set(numBins, value)

setDefault(numBins -> 1000)

/** @group setParam */
@Since("1.5.0")
def setRawPredictionCol(value: String): this.type = set(rawPredictionCol, value)
Expand All @@ -94,7 +92,7 @@ class BinaryClassificationEvaluator @Since("1.4.0") (@Since("1.4.0") override va
@Since("3.0.0")
def setWeightCol(value: String): this.type = set(weightCol, value)

setDefault(metricName -> "areaUnderROC")
setDefault(metricName -> "areaUnderROC", numBins -> 1000)

@Since("2.0.0")
override def evaluate(dataset: Dataset[_]): Double = {
Expand Down
Loading