[SPARK-10231] [MLLIB] update @Since annotation for mllib.classification #8421

Status: Closed (wants to merge 1 commit).
Changes from all commits:
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala
@@ -30,14 +30,15 @@ import org.apache.spark.rdd.RDD
  * belongs. The categories are represented by double values: 0.0, 1.0, 2.0, etc.
  */
 @Experimental
+@Since("0.8.0")
 trait ClassificationModel extends Serializable {
   /**
    * Predict values for the given data set using the model trained.
    *
    * @param testData RDD representing data points to be predicted
    * @return an RDD[Double] where each entry contains the corresponding prediction
    */
-  @Since("0.8.0")
+  @Since("1.0.0")
   def predict(testData: RDD[Vector]): RDD[Double]
 
   /**
@@ -46,15 +47,15 @@ trait ClassificationModel extends Serializable {
    * @param testData array representing a single data point
    * @return predicted category from the trained model
    */
-  @Since("0.8.0")
+  @Since("1.0.0")
   def predict(testData: Vector): Double
 
   /**
    * Predict values for examples stored in a JavaRDD.
    * @param testData JavaRDD representing data points to be predicted
    * @return a JavaRDD[java.lang.Double] where each entry contains the corresponding prediction
    */
-  @Since("0.8.0")
+  @Since("1.0.0")
   def predict(testData: JavaRDD[Vector]): JavaRDD[java.lang.Double] =
     predict(testData.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Double]]
 }
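Not part of the patch, but as a reading aid: the trait above is the contract all of the annotated classifiers implement. A minimal sketch of a conforming implementation (the `ConstantModel` class is hypothetical, not in Spark):

```scala
import org.apache.spark.mllib.classification.ClassificationModel
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

// Hypothetical model that predicts the same label for every input; the
// JavaRDD overload comes for free from the trait's default implementation.
class ConstantModel(label: Double) extends ClassificationModel {
  override def predict(testData: RDD[Vector]): RDD[Double] =
    testData.map(_ => label)

  override def predict(testData: Vector): Double = label
}
```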
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
@@ -41,11 +41,12 @@ import org.apache.spark.rdd.RDD
  * Multinomial Logistic Regression. By default, it is binary logistic regression
  * so numClasses will be set to 2.
  */
-class LogisticRegressionModel (
-    override val weights: Vector,
-    override val intercept: Double,
-    val numFeatures: Int,
-    val numClasses: Int)
+@Since("0.8.0")
+class LogisticRegressionModel @Since("1.3.0") (
+    @Since("1.0.0") override val weights: Vector,
+    @Since("1.0.0") override val intercept: Double,
+    @Since("1.3.0") val numFeatures: Int,
+    @Since("1.3.0") val numClasses: Int)
   extends GeneralizedLinearModel(weights, intercept) with ClassificationModel with Serializable
   with Saveable with PMMLExportable {
 
@@ -75,6 +76,7 @@ class LogisticRegressionModel (
   /**
    * Constructs a [[LogisticRegressionModel]] with weights and intercept for binary classification.
    */
+  @Since("1.0.0")
   def this(weights: Vector, intercept: Double) = this(weights, intercept, weights.size, 2)
 
   private var threshold: Option[Double] = Some(0.5)
@@ -166,12 +168,12 @@ class LogisticRegressionModel (
 
   override protected def formatVersion: String = "1.0"
 
-  @Since("1.4.0")
   override def toString: String = {
     s"${super.toString}, numClasses = ${numClasses}, threshold = ${threshold.getOrElse("None")}"
   }
 }
 
+@Since("1.3.0")
 object LogisticRegressionModel extends Loader[LogisticRegressionModel] {
 
   @Since("1.3.0")
@@ -207,6 +209,7 @@ object LogisticRegressionModel extends Loader[LogisticRegressionModel] {
  * for k classes multi-label classification problem.
  * Using [[LogisticRegressionWithLBFGS]] is recommended over this.
  */
+@Since("0.8.0")
 class LogisticRegressionWithSGD private[mllib] (
     private var stepSize: Double,
     private var numIterations: Int,
@@ -216,6 +219,7 @@ class LogisticRegressionWithSGD private[mllib] (
 
   private val gradient = new LogisticGradient()
   private val updater = new SquaredL2Updater()
+  @Since("0.8.0")
   override val optimizer = new GradientDescent(gradient, updater)
     .setStepSize(stepSize)
     .setNumIterations(numIterations)
@@ -227,6 +231,7 @@ class LogisticRegressionWithSGD private[mllib] (
    * Construct a LogisticRegression object with default parameters: {stepSize: 1.0,
    * numIterations: 100, regParm: 0.01, miniBatchFraction: 1.0}.
    */
+  @Since("0.8.0")
   def this() = this(1.0, 100, 0.01, 1.0)
 
   override protected[mllib] def createModel(weights: Vector, intercept: Double) = {
@@ -238,6 +243,7 @@ class LogisticRegressionWithSGD private[mllib] (
  * Top-level methods for calling Logistic Regression using Stochastic Gradient Descent.
  * NOTE: Labels used in Logistic Regression should be {0, 1}
 */
+@Since("0.8.0")
 object LogisticRegressionWithSGD {
   // NOTE(shivaram): We use multiple train methods instead of default arguments to support
   // Java programs.
@@ -333,11 +339,13 @@ object LogisticRegressionWithSGD {
  * NOTE: Labels used in Logistic Regression should be {0, 1, ..., k - 1}
  * for k classes multi-label classification problem.
  */
+@Since("1.1.0")
 class LogisticRegressionWithLBFGS
   extends GeneralizedLinearAlgorithm[LogisticRegressionModel] with Serializable {
 
   this.setFeatureScaling(true)
 
+  @Since("1.1.0")
   override val optimizer = new LBFGS(new LogisticGradient, new SquaredL2Updater)
 
   override protected val validators = List(multiLabelValidator)
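Again not in the patch, but a short usage sketch showing which of the annotated entry points a typical caller touches. It assumes an existing SparkContext `sc` and a LIBSVM-format file at the given path:

```scala
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.util.MLUtils

// Load labeled data; `sc` and the data path are assumed to exist.
val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")

// LogisticRegressionWithLBFGS is tagged @Since("1.1.0") above; run() comes
// from GeneralizedLinearAlgorithm and returns a LogisticRegressionModel.
val model = new LogisticRegressionWithLBFGS()
  .setNumClasses(2) // binary by default; > 2 switches to multinomial
  .run(data)

// The two predict overloads retagged @Since("1.0.0") in ClassificationModel:
val allScores = model.predict(data.map(_.features))
val oneScore = model.predict(data.first().features)
```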
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
@@ -41,11 +41,12 @@ import org.apache.spark.sql.{DataFrame, SQLContext}
  *              where D is number of features
  * @param modelType The type of NB model to fit can be "multinomial" or "bernoulli"
  */
+@Since("0.9.0")
 class NaiveBayesModel private[spark] (
-    val labels: Array[Double],
-    val pi: Array[Double],
-    val theta: Array[Array[Double]],
-    val modelType: String)
+    @Since("1.0.0") val labels: Array[Double],
+    @Since("0.9.0") val pi: Array[Double],
+    @Since("0.9.0") val theta: Array[Array[Double]],
+    @Since("1.4.0") val modelType: String)
   extends ClassificationModel with Serializable with Saveable {
 
   import NaiveBayes.{Bernoulli, Multinomial, supportedModelTypes}
@@ -83,6 +84,7 @@ class NaiveBayesModel private[spark] (
       throw new UnknownError(s"Invalid modelType: $modelType.")
   }
 
+  @Since("1.0.0")
   override def predict(testData: RDD[Vector]): RDD[Double] = {
     val bcModel = testData.context.broadcast(this)
     testData.mapPartitions { iter =>
@@ -91,6 +93,7 @@ class NaiveBayesModel private[spark] (
     }
   }
 
+  @Since("1.0.0")
   override def predict(testData: Vector): Double = {
     modelType match {
       case Multinomial =>
@@ -107,6 +110,7 @@ class NaiveBayesModel private[spark] (
    * @return an RDD[Vector] where each entry contains the predicted posterior class probabilities,
    *         in the same order as class labels
    */
+  @Since("1.5.0")
   def predictProbabilities(testData: RDD[Vector]): RDD[Vector] = {
     val bcModel = testData.context.broadcast(this)
     testData.mapPartitions { iter =>
@@ -122,6 +126,7 @@ class NaiveBayesModel private[spark] (
    * @return predicted posterior class probabilities from the trained model,
    *         in the same order as class labels
    */
+  @Since("1.5.0")
   def predictProbabilities(testData: Vector): Vector = {
     modelType match {
       case Multinomial =>
@@ -158,6 +163,7 @@ class NaiveBayesModel private[spark] (
     new DenseVector(scaledProbs.map(_ / probSum))
   }
 
+  @Since("1.3.0")
   override def save(sc: SparkContext, path: String): Unit = {
     val data = NaiveBayesModel.SaveLoadV2_0.Data(labels, pi, theta, modelType)
     NaiveBayesModel.SaveLoadV2_0.save(sc, path, data)
@@ -166,6 +172,7 @@ class NaiveBayesModel private[spark] (
 
   override protected def formatVersion: String = "2.0"
 }
 
+@Since("1.3.0")
 object NaiveBayesModel extends Loader[NaiveBayesModel] {
 
   import org.apache.spark.mllib.util.Loader._
@@ -199,6 +206,7 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] {
       dataRDD.write.parquet(dataPath(path))
     }
 
+    @Since("1.3.0")
     def load(sc: SparkContext, path: String): NaiveBayesModel = {
       val sqlContext = new SQLContext(sc)
       // Load Parquet data.
@@ -301,30 +309,35 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] {
  * document classification. By making every vector a 0-1 vector, it can also be used as
  * Bernoulli NB ([[http://tinyurl.com/p7c96j6]]). The input feature values must be nonnegative.
  */
-
+@Since("0.9.0")
 class NaiveBayes private (
     private var lambda: Double,
     private var modelType: String) extends Serializable with Logging {
 
   import NaiveBayes.{Bernoulli, Multinomial}
 
+  @Since("1.4.0")
   def this(lambda: Double) = this(lambda, NaiveBayes.Multinomial)
 
+  @Since("0.9.0")
   def this() = this(1.0, NaiveBayes.Multinomial)
 
   /** Set the smoothing parameter. Default: 1.0. */
+  @Since("0.9.0")
   def setLambda(lambda: Double): NaiveBayes = {
     this.lambda = lambda
     this
   }
 
   /** Get the smoothing parameter. */
+  @Since("1.4.0")
   def getLambda: Double = lambda
 
   /**
    * Set the model type using a string (case-sensitive).
    * Supported options: "multinomial" (default) and "bernoulli".
    */
+  @Since("1.4.0")
   def setModelType(modelType: String): NaiveBayes = {
     require(NaiveBayes.supportedModelTypes.contains(modelType),
       s"NaiveBayes was created with an unknown modelType: $modelType.")
@@ -333,13 +346,15 @@ class NaiveBayes private (
   }
 
   /** Get the model type. */
+  @Since("1.4.0")
   def getModelType: String = this.modelType
 
   /**
    * Run the algorithm with the configured parameters on an input RDD of LabeledPoint entries.
    *
    * @param data RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
   */
+  @Since("0.9.0")
   def run(data: RDD[LabeledPoint]): NaiveBayesModel = {
     val requireNonnegativeValues: Vector => Unit = (v: Vector) => {
       val values = v match {
@@ -423,6 +438,7 @@ class NaiveBayes private (
 /**
  * Top-level methods for calling naive Bayes.
  */
+@Since("0.9.0")
 object NaiveBayes {
 
   /** String name for multinomial model type. */
@@ -485,7 +501,7 @@ object NaiveBayes {
    * @param modelType The type of NB model to fit from the enumeration NaiveBayesModels, can be
    *                  multinomial or bernoulli
    */
-  @Since("0.9.0")
+  @Since("1.4.0")
   def train(input: RDD[LabeledPoint], lambda: Double, modelType: String): NaiveBayesModel = {
     require(supportedModelTypes.contains(modelType),
       s"NaiveBayes was created with an unknown modelType: $modelType.")
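For illustration only: the `train` overload retagged to 1.4.0 above is the one that takes `modelType`, since that parameter only appeared in 1.4. A sketch, assuming an existing SparkContext `sc`:

```scala
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

// Tiny made-up training set with nonnegative features, as the docs require.
val training = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(1.0, 0.0)),
  LabeledPoint(1.0, Vectors.dense(0.0, 1.0))))

// The 3-argument overload is @Since("1.4.0"); the 1- and 2-argument
// overloads without modelType keep their older tags.
val model = NaiveBayes.train(training, lambda = 1.0, modelType = "bernoulli")

model.predict(Vectors.dense(0.0, 1.0))              // @Since("1.0.0")
model.predictProbabilities(Vectors.dense(0.0, 1.0)) // @Since("1.5.0"), per the diff
```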
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala
@@ -33,9 +33,10 @@ import org.apache.spark.rdd.RDD
  * @param weights Weights computed for every feature.
  * @param intercept Intercept computed for this model.
  */
-class SVMModel (
-    override val weights: Vector,
-    override val intercept: Double)
+@Since("0.8.0")
+class SVMModel @Since("1.1.0") (
+    @Since("1.0.0") override val weights: Vector,
+    @Since("0.8.0") override val intercept: Double)
Inline review comment from the PR author on the constructor above:
This means the class is since 0.8, the constructor is since 1.1, weights is since 1.0, and intercept is since 0.8.
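To make that concrete, a hedged sketch (the `Example` class is hypothetical, not from the patch) of how each annotation position is read independently:

```scala
import org.apache.spark.annotation.Since

@Since("0.8.0")                 // the class itself has existed since 0.8
class Example @Since("1.1.0") ( // this primary constructor shape dates to 1.1
    @Since("1.0.0") val a: Int, // the `a` accessor was added in 1.0
    @Since("0.8.0") val b: Int) // `b` has been public as long as the class
```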

   extends GeneralizedLinearModel(weights, intercept) with ClassificationModel with Serializable
   with Saveable with PMMLExportable {
 
@@ -47,7 +48,7 @@ class SVMModel (
    * with prediction score greater than or equal to this threshold is identified as an positive,
    * and negative otherwise. The default value is 0.0.
    */
-  @Since("1.3.0")
+  @Since("1.0.0")
   @Experimental
   def setThreshold(threshold: Double): this.type = {
     this.threshold = Some(threshold)
@@ -92,12 +93,12 @@ class SVMModel (
 
   override protected def formatVersion: String = "1.0"
 
-  @Since("1.4.0")
   override def toString: String = {
     s"${super.toString}, numClasses = 2, threshold = ${threshold.getOrElse("None")}"
   }
 }
 
+@Since("1.3.0")
 object SVMModel extends Loader[SVMModel] {
 
   @Since("1.3.0")
@@ -132,6 +133,7 @@ object SVMModel extends Loader[SVMModel] {
  * regularization is used, which can be changed via [[SVMWithSGD.optimizer]].
  * NOTE: Labels used in SVM should be {0, 1}.
  */
+@Since("0.8.0")
 class SVMWithSGD private (
     private var stepSize: Double,
     private var numIterations: Int,
@@ -141,6 +143,7 @@ class SVMWithSGD private (
 
   private val gradient = new HingeGradient()
   private val updater = new SquaredL2Updater()
+  @Since("0.8.0")
   override val optimizer = new GradientDescent(gradient, updater)
     .setStepSize(stepSize)
     .setNumIterations(numIterations)
@@ -152,6 +155,7 @@ class SVMWithSGD private (
    * Construct a SVM object with default parameters: {stepSize: 1.0, numIterations: 100,
    * regParm: 0.01, miniBatchFraction: 1.0}.
   */
+  @Since("0.8.0")
   def this() = this(1.0, 100, 0.01, 1.0)
 
   override protected def createModel(weights: Vector, intercept: Double) = {
@@ -162,6 +166,7 @@ class SVMWithSGD private (
 /**
  * Top-level methods for calling SVM. NOTE: Labels used in SVM should be {0, 1}.
  */
+@Since("0.8.0")
 object SVMWithSGD {
 
   /**
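A sketch (not from the patch) of how the annotated SVM entry points line up in use; it assumes an existing `training: RDD[LabeledPoint]` with {0, 1} labels:

```scala
import org.apache.spark.mllib.classification.SVMWithSGD

// SVMWithSGD.train lives in the @Since("0.8.0") companion object above.
val numIterations = 100
val model = SVMWithSGD.train(training, numIterations)

// setThreshold was retagged @Since("1.0.0"); with a threshold set, predict
// returns 0/1 labels, and after clearThreshold it returns raw margins.
model.setThreshold(0.5)
val labels = model.predict(training.map(_.features))

model.clearThreshold()
val margins = model.predict(training.map(_.features))
```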
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionWithSGD.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionWithSGD.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.mllib.classification
 
-import org.apache.spark.annotation.Experimental
+import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.regression.StreamingLinearAlgorithm
 
@@ -44,6 +44,7 @@ import org.apache.spark.mllib.regression.StreamingLinearAlgorithm
  * }}}
  */
 @Experimental
+@Since("1.3.0")
 class StreamingLogisticRegressionWithSGD private[mllib] (
     private var stepSize: Double,
     private var numIterations: Int,
@@ -58,6 +59,7 @@ class StreamingLogisticRegressionWithSGD private[mllib] (
    * Initial weights must be set before using trainOn or predictOn
    * (see `StreamingLinearAlgorithm`)
    */
+  @Since("1.3.0")
   def this() = this(0.1, 50, 1.0, 0.0)
 
   protected val algorithm = new LogisticRegressionWithSGD(
@@ -66,30 +68,35 @@ class StreamingLogisticRegressionWithSGD private[mllib] (
   protected var model: Option[LogisticRegressionModel] = None
 
   /** Set the step size for gradient descent. Default: 0.1. */
+  @Since("1.3.0")
   def setStepSize(stepSize: Double): this.type = {
     this.algorithm.optimizer.setStepSize(stepSize)
     this
   }
 
   /** Set the number of iterations of gradient descent to run per update. Default: 50. */
+  @Since("1.3.0")
   def setNumIterations(numIterations: Int): this.type = {
     this.algorithm.optimizer.setNumIterations(numIterations)
     this
   }
 
   /** Set the fraction of each batch to use for updates. Default: 1.0. */
+  @Since("1.3.0")
   def setMiniBatchFraction(miniBatchFraction: Double): this.type = {
     this.algorithm.optimizer.setMiniBatchFraction(miniBatchFraction)
     this
   }
 
   /** Set the regularization parameter. Default: 0.0. */
+  @Since("1.3.0")
   def setRegParam(regParam: Double): this.type = {
     this.algorithm.optimizer.setRegParam(regParam)
     this
   }
 
   /** Set the initial weights. Default: [0.0, 0.0]. */
+  @Since("1.3.0")
   def setInitialWeights(initialWeights: Vector): this.type = {
     this.model = Some(algorithm.createModel(initialWeights, 0.0))
     this
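Finally, a sketch (not from the patch) of the streaming workflow these setters belong to. It assumes an existing StreamingContext `ssc`, a feature count `numFeatures`, and two DStreams, `trainingStream: DStream[LabeledPoint]` and `testStream: DStream[(Long, Vector)]`:

```scala
import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD
import org.apache.spark.mllib.linalg.Vectors

// Every setter below is one of the members retagged @Since("1.3.0") above;
// initial weights must be set before trainOn, per the class docs.
val model = new StreamingLogisticRegressionWithSGD()
  .setStepSize(0.5)
  .setNumIterations(10)
  .setMiniBatchFraction(1.0)
  .setRegParam(0.0)
  .setInitialWeights(Vectors.zeros(numFeatures))

model.trainOn(trainingStream)             // update the model on each batch
model.predictOnValues(testStream).print() // score the value side of the pairs

ssc.start()
ssc.awaitTermination()
```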