[SPARK-32310][ML][PySpark] ML params default value parity in classification, regression, clustering and fpm #29112

Closed · wants to merge 2 commits
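
The pattern is the same throughout the diff: `setDefault` calls that were repeated in the concrete estimators (and sometimes their models) move into the shared `Params` trait, so each default is declared in exactly one place on the Scala side, which in turn makes it easier to keep the PySpark wrappers' defaults in sync. A minimal compiling sketch of the pattern, using hypothetical `Foo`/`FooParams` names rather than actual Spark sources:

```scala
import org.apache.spark.ml.param.{IntParam, ParamMap, Params}
import org.apache.spark.ml.util.Identifiable

// Hypothetical shared-params trait: the default lives here, once.
trait FooParams extends Params {
  final val maxIter = new IntParam(this, "maxIter", "maximum number of iterations (>= 0)")
  final def getMaxIter: Int = $(maxIter)

  setDefault(maxIter -> 100)  // single source of truth for the default
}

// The concrete estimator keeps only the setter; no setDefault here anymore.
class Foo(override val uid: String) extends FooParams {
  def this() = this(Identifiable.randomUID("foo"))

  def setMaxIter(value: Int): this.type = set(maxIter, value)

  override def copy(extra: ParamMap): Foo = defaultCopy(extra)
}

// new Foo().getMaxIter  // => 100, inherited from FooParams
```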
mllib/src/main/scala/org/apache/spark/ml/classification/FMClassifier.scala
@@ -85,7 +85,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setFactorSize(value: Int): this.type = set(factorSize, value)
- setDefault(factorSize -> 8)
Member: Where do the default params of FMClassifier move?

Contributor (author): I moved the setDefault calls for these params to FactorizationMachinesParams (defined in FMRegressor.scala) when I fixed the solver param last time.
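
For reference, the consolidated trait plausibly looks like this (an abridged sketch, not the verbatim source; the extends clause is assumed from the shared params, while the default values are exactly the ones removed from FMClassifier in this diff):

```scala
// Abridged sketch of FactorizationMachinesParams in FMRegressor.scala.
// FMClassifier and FMRegressor both mix this trait in, so every default
// below is declared once instead of next to each setter.
private[ml] trait FactorizationMachinesParams extends PredictorParams
  with HasMaxIter with HasStepSize with HasTol with HasSolver with HasSeed
  with HasFitIntercept with HasRegParam {

  // ... factorSize, fitLinear, miniBatchFraction, initStd, solver declarations elided ...

  setDefault(factorSize -> 8, fitIntercept -> true, fitLinear -> true,
    regParam -> 0.0, miniBatchFraction -> 1.0, initStd -> 0.01,
    maxIter -> 100, stepSize -> 1.0, tol -> 1E-6, solver -> AdamW)
}
```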


/**
* Set whether to fit intercept term.
@@ -95,7 +94,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
- setDefault(fitIntercept -> true)

/**
* Set whether to fit linear term.
@@ -105,7 +103,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setFitLinear(value: Boolean): this.type = set(fitLinear, value)
- setDefault(fitLinear -> true)

/**
* Set the L2 regularization parameter.
@@ -115,7 +112,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setRegParam(value: Double): this.type = set(regParam, value)
- setDefault(regParam -> 0.0)

/**
* Set the mini-batch fraction parameter.
@@ -125,7 +121,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setMiniBatchFraction(value: Double): this.type = set(miniBatchFraction, value)
- setDefault(miniBatchFraction -> 1.0)

/**
* Set the standard deviation of initial coefficients.
@@ -135,7 +130,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setInitStd(value: Double): this.type = set(initStd, value)
- setDefault(initStd -> 0.01)

/**
* Set the maximum number of iterations.
@@ -145,7 +139,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setMaxIter(value: Int): this.type = set(maxIter, value)
- setDefault(maxIter -> 100)

/**
* Set the initial step size for the first step (like learning rate).
@@ -155,7 +148,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setStepSize(value: Double): this.type = set(stepSize, value)
- setDefault(stepSize -> 1.0)

/**
* Set the convergence tolerance of iterations.
@@ -165,7 +157,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setTol(value: Double): this.type = set(tol, value)
- setDefault(tol -> 1E-6)

/**
* Set the solver algorithm used for optimization.
@@ -176,7 +167,6 @@ class FMClassifier @Since("3.0.0") (
*/
@Since("3.0.0")
def setSolver(value: String): this.type = set(solver, value)
- setDefault(solver -> AdamW)

/**
* Set the random seed for weight initialization.
mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala
@@ -55,6 +55,9 @@ private[classification] trait LinearSVCParams extends ClassifierParams with HasR
*/
final override val threshold: DoubleParam = new DoubleParam(this, "threshold",
"threshold in binary classification prediction applied to rawPrediction")

+ setDefault(regParam -> 0.0, maxIter -> 100, fitIntercept -> true, tol -> 1E-6,
+   standardization -> true, threshold -> 0.0, aggregationDepth -> 2, blockSize -> 1)
}

/**
@@ -82,7 +85,6 @@ class LinearSVC @Since("2.2.0") (
*/
@Since("2.2.0")
def setRegParam(value: Double): this.type = set(regParam, value)
- setDefault(regParam -> 0.0)

/**
* Set the maximum number of iterations.
@@ -92,7 +94,6 @@ class LinearSVC @Since("2.2.0") (
*/
@Since("2.2.0")
def setMaxIter(value: Int): this.type = set(maxIter, value)
- setDefault(maxIter -> 100)

/**
* Whether to fit an intercept term.
@@ -102,7 +103,6 @@ class LinearSVC @Since("2.2.0") (
*/
@Since("2.2.0")
def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
- setDefault(fitIntercept -> true)

/**
* Set the convergence tolerance of iterations.
@@ -113,7 +113,6 @@ class LinearSVC @Since("2.2.0") (
*/
@Since("2.2.0")
def setTol(value: Double): this.type = set(tol, value)
- setDefault(tol -> 1E-6)

/**
* Whether to standardize the training features before fitting the model.
@@ -123,7 +122,6 @@ class LinearSVC @Since("2.2.0") (
*/
@Since("2.2.0")
def setStandardization(value: Boolean): this.type = set(standardization, value)
- setDefault(standardization -> true)

/**
* Set the value of param [[weightCol]].
@@ -142,7 +140,6 @@ class LinearSVC @Since("2.2.0") (
*/
@Since("2.2.0")
def setThreshold(value: Double): this.type = set(threshold, value)
- setDefault(threshold -> 0.0)

/**
* Suggested depth for treeAggregate (greater than or equal to 2).
@@ -154,7 +151,6 @@ class LinearSVC @Since("2.2.0") (
*/
@Since("2.2.0")
def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value)
- setDefault(aggregationDepth -> 2)

/**
* Set block size for stacking input data in matrices.
@@ -173,7 +169,6 @@ class LinearSVC @Since("2.2.0") (
*/
@Since("3.1.0")
def setBlockSize(value: Int): this.type = set(blockSize, value)
- setDefault(blockSize -> 1)

@Since("2.2.0")
override def copy(extra: ParamMap): LinearSVC = defaultCopy(extra)
@@ -381,7 +376,6 @@ class LinearSVCModel private[classification] (

@Since("2.2.0")
def setThreshold(value: Double): this.type = set(threshold, value)
- setDefault(threshold, 0.0)

private val margin: Vector => Double = (features) => {
BLAS.dot(features, coefficients) + intercept
mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -243,6 +243,10 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
@Since("2.2.0")
def getUpperBoundsOnIntercepts: Vector = $(upperBoundsOnIntercepts)

+ setDefault(regParam -> 0.0, elasticNetParam -> 0.0, maxIter -> 100, tol -> 1E-6,
+   fitIntercept -> true, family -> "auto", standardization -> true, threshold -> 0.5,
+   aggregationDepth -> 2, blockSize -> 1)

protected def usingBoundConstrainedOptimization: Boolean = {
isSet(lowerBoundsOnCoefficients) || isSet(upperBoundsOnCoefficients) ||
isSet(lowerBoundsOnIntercepts) || isSet(upperBoundsOnIntercepts)
@@ -290,7 +294,6 @@ class LogisticRegression @Since("1.2.0") (
*/
@Since("1.2.0")
def setRegParam(value: Double): this.type = set(regParam, value)
- setDefault(regParam -> 0.0)

/**
* Set the ElasticNet mixing parameter.
@@ -306,7 +309,6 @@ class LogisticRegression @Since("1.2.0") (
*/
@Since("1.4.0")
def setElasticNetParam(value: Double): this.type = set(elasticNetParam, value)
- setDefault(elasticNetParam -> 0.0)

/**
* Set the maximum number of iterations.
@@ -316,7 +318,6 @@ class LogisticRegression @Since("1.2.0") (
*/
@Since("1.2.0")
def setMaxIter(value: Int): this.type = set(maxIter, value)
- setDefault(maxIter -> 100)

/**
* Set the convergence tolerance of iterations.
@@ -327,7 +328,6 @@ class LogisticRegression @Since("1.2.0") (
*/
@Since("1.4.0")
def setTol(value: Double): this.type = set(tol, value)
- setDefault(tol -> 1E-6)

/**
* Whether to fit an intercept term.
@@ -337,7 +337,6 @@ class LogisticRegression @Since("1.2.0") (
*/
@Since("1.4.0")
def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
- setDefault(fitIntercept -> true)

/**
* Sets the value of param [[family]].
@@ -347,7 +346,6 @@ class LogisticRegression @Since("1.2.0") (
*/
@Since("2.1.0")
def setFamily(value: String): this.type = set(family, value)
- setDefault(family -> "auto")

/**
* Whether to standardize the training features before fitting the model.
@@ -361,11 +359,9 @@ class LogisticRegression @Since("1.2.0") (
*/
@Since("1.5.0")
def setStandardization(value: Boolean): this.type = set(standardization, value)
- setDefault(standardization -> true)

@Since("1.5.0")
override def setThreshold(value: Double): this.type = super.setThreshold(value)
- setDefault(threshold -> 0.5)

@Since("1.5.0")
override def getThreshold: Double = super.getThreshold
@@ -396,7 +392,6 @@ class LogisticRegression @Since("1.2.0") (
*/
@Since("2.1.0")
def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value)
- setDefault(aggregationDepth -> 2)

/**
* Set the lower bounds on coefficients if fitting under bound constrained optimization.
@@ -447,7 +442,6 @@ class LogisticRegression @Since("1.2.0") (
*/
@Since("3.1.0")
def setBlockSize(value: Int): this.type = set(blockSize, value)
- setDefault(blockSize -> 1)

private def assertBoundConstrainedOptimizationParamsValid(
numCoefficientSets: Int,
mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
@@ -64,6 +64,8 @@ private[classification] trait NaiveBayesParams extends PredictorParams with HasW

/** @group getParam */
final def getModelType: String = $(modelType)

+ setDefault(smoothing -> 1.0, modelType -> NaiveBayes.Multinomial)
}

// scalastyle:off line.size.limit
@@ -107,7 +109,6 @@ class NaiveBayes @Since("1.5.0") (
*/
@Since("1.5.0")
def setSmoothing(value: Double): this.type = set(smoothing, value)
- setDefault(smoothing -> 1.0)

/**
* Set the model type using a string (case-sensitive).
@@ -117,7 +118,6 @@ class NaiveBayes @Since("1.5.0") (
*/
@Since("1.5.0")
def setModelType(value: String): this.type = set(modelType, value)
- setDefault(modelType -> Multinomial)

/**
* Sets the value of param [[weightCol]].
mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
@@ -72,6 +72,8 @@ private[clustering] trait BisectingKMeansParams extends Params with HasMaxIter
@Since("2.0.0")
def getMinDivisibleClusterSize: Double = $(minDivisibleClusterSize)

+ setDefault(k -> 4, maxIter -> 20, minDivisibleClusterSize -> 1.0)

/**
* Validates and transforms the input schema.
* @param schema input schema
@@ -226,11 +228,6 @@ class BisectingKMeans @Since("2.0.0") (
@Since("2.0.0") override val uid: String)
extends Estimator[BisectingKMeansModel] with BisectingKMeansParams with DefaultParamsWritable {

- setDefault(
-   k -> 4,
-   maxIter -> 20,
-   minDivisibleClusterSize -> 1.0)

@Since("2.0.0")
override def copy(extra: ParamMap): BisectingKMeans = defaultCopy(extra)

mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -59,6 +59,8 @@ private[clustering] trait GaussianMixtureParams extends Params with HasMaxIter w
@Since("2.0.0")
def getK: Int = $(k)

+ setDefault(k -> 2, maxIter -> 100, tol -> 0.01, blockSize -> 1)

/**
* Validates and transforms the input schema.
*
@@ -328,11 +330,6 @@ class GaussianMixture @Since("2.0.0") (
@Since("2.0.0") override val uid: String)
extends Estimator[GaussianMixtureModel] with GaussianMixtureParams with DefaultParamsWritable {

- setDefault(
-   k -> 2,
-   maxIter -> 100,
-   tol -> 0.01)

@Since("2.0.0")
override def copy(extra: ParamMap): GaussianMixture = defaultCopy(extra)

@@ -392,7 +389,6 @@ class GaussianMixture @Since("2.0.0") (
*/
@Since("3.1.0")
def setBlockSize(value: Int): this.type = set(blockSize, value)
- setDefault(blockSize -> 1)

/**
* Number of samples per cluster to use when initializing Gaussians.
11 changes: 3 additions & 8 deletions mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
@@ -87,6 +87,9 @@ private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFe
@Since("1.5.0")
def getInitSteps: Int = $(initSteps)

+ setDefault(k -> 2, maxIter -> 20, initMode -> MLlibKMeans.K_MEANS_PARALLEL, initSteps -> 2,
+   tol -> 1e-4, distanceMeasure -> DistanceMeasure.EUCLIDEAN)

/**
* Validates and transforms the input schema.
* @param schema input schema
@@ -271,14 +274,6 @@ class KMeans @Since("1.5.0") (
@Since("1.5.0") override val uid: String)
extends Estimator[KMeansModel] with KMeansParams with DefaultParamsWritable {

- setDefault(
-   k -> 2,
-   maxIter -> 20,
-   initMode -> MLlibKMeans.K_MEANS_PARALLEL,
-   initSteps -> 2,
-   tol -> 1e-4,
-   distanceMeasure -> DistanceMeasure.EUCLIDEAN)

@Since("1.5.0")
override def copy(extra: ParamMap): KMeans = defaultCopy(extra)

11 changes: 5 additions & 6 deletions mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
@@ -199,8 +199,6 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
" with estimates of the topic mixture distribution for each document (often called \"theta\"" +
" in the literature). Returns a vector of zeros for an empty document.")

- setDefault(topicDistributionCol -> "topicDistribution")

/** @group getParam */
@Since("1.6.0")
def getTopicDistributionCol: String = $(topicDistributionCol)
@@ -315,6 +313,11 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
@Since("2.0.0")
def getKeepLastCheckpoint: Boolean = $(keepLastCheckpoint)

+ setDefault(maxIter -> 20, k -> 10, optimizer -> "online", checkpointInterval -> 10,
+   learningOffset -> 1024, learningDecay -> 0.51, subsamplingRate -> 0.05,
+   optimizeDocConcentration -> true, keepLastCheckpoint -> true,
+   topicDistributionCol -> "topicDistribution")

/**
* Validates and transforms the input schema.
*
@@ -863,10 +866,6 @@ class LDA @Since("1.6.0") (
@Since("1.6.0")
def this() = this(Identifiable.randomUID("lda"))

- setDefault(maxIter -> 20, k -> 10, optimizer -> "online", checkpointInterval -> 10,
-   learningOffset -> 1024, learningDecay -> 0.51, subsamplingRate -> 0.05,
-   optimizeDocConcentration -> true, keepLastCheckpoint -> true)

/**
* The features for LDA should be a `Vector` representing the word counts in a document.
* The vector should be of length vocabSize, with counts for each term (word).
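
No default value changes in this patch, only where each default is declared, so existing behavior is preserved. A quick sanity check against a build that includes this change (a sketch, assuming spark-mllib on the classpath) would still observe the LinearSVC defaults, now inherited from LinearSVCParams:

```scala
import org.apache.spark.ml.classification.LinearSVC

// These defaults now come from the setDefault(...) in LinearSVCParams
// rather than from calls inside the LinearSVC class body.
val svc = new LinearSVC()
assert(svc.getMaxIter == 100)
assert(svc.getRegParam == 0.0)
assert(svc.getTol == 1e-6)
assert(svc.getThreshold == 0.0)
assert(svc.getAggregationDepth == 2)
```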