From 01321bc0fec54a1610d0873c17fa7354137d3a6b Mon Sep 17 00:00:00 2001 From: zero323 Date: Wed, 25 Nov 2020 10:24:41 +0900 Subject: [PATCH] [SPARK-33252][PYTHON][DOCS] Migration to NumPy documentation style in MLlib (pyspark.mllib.*) ### What changes were proposed in this pull request? This PR proposes migration of `pyspark.mllib` to NumPy documentation style. ### Why are the changes needed? To improve documentation style. Before: ![old](https://user-images.githubusercontent.com/1554276/100097941-90234980-2e5d-11eb-8b4d-c25d98d85191.png) After: ![new](https://user-images.githubusercontent.com/1554276/100097966-987b8480-2e5d-11eb-9e02-07b18c327624.png) ### Does this PR introduce _any_ user-facing change? Yes, this changes both rendered HTML docs and console representation (SPARK-33243). ### How was this patch tested? `dev/lint-python` and manual inspection. Closes #30413 from zero323/SPARK-33252. Authored-by: zero323 Signed-off-by: HyukjinKwon --- .../docs/source/reference/pyspark.mllib.rst | 3 +- python/pyspark/mllib/classification.py | 353 ++++++----- python/pyspark/mllib/clustering.py | 576 +++++++++++------- python/pyspark/mllib/evaluation.py | 60 +- python/pyspark/mllib/feature.py | 288 ++++++--- python/pyspark/mllib/feature.pyi | 4 +- python/pyspark/mllib/fpm.py | 86 +-- python/pyspark/mllib/fpm.pyi | 4 +- python/pyspark/mllib/linalg/__init__.py | 132 +++- python/pyspark/mllib/linalg/distributed.py | 495 ++++++++++----- python/pyspark/mllib/linalg/distributed.pyi | 6 +- python/pyspark/mllib/random.py | 378 ++++++++---- python/pyspark/mllib/recommendation.py | 116 ++-- python/pyspark/mllib/regression.py | 392 +++++++----- python/pyspark/mllib/stat/KernelDensity.py | 2 + python/pyspark/mllib/stat/__init__.py | 5 +- python/pyspark/mllib/stat/_statistics.py | 115 ++-- python/pyspark/mllib/stat/distribution.py | 2 + python/pyspark/mllib/tree.py | 469 +++++++------- python/pyspark/mllib/util.py | 256 +++++--- 20 files changed, 2375 insertions(+), 1367 deletions(-) diff --git a/python/docs/source/reference/pyspark.mllib.rst b/python/docs/source/reference/pyspark.mllib.rst index acc834c065ac3..df5ea017d0fbf 100644 --- a/python/docs/source/reference/pyspark.mllib.rst +++ b/python/docs/source/reference/pyspark.mllib.rst @@ -216,6 +216,8 @@ Statistics ChiSqTestResult MultivariateGaussian KernelDensity + ChiSqTestResult + KolmogorovSmirnovTestResult Tree @@ -250,4 +252,3 @@ Utilities Loader MLUtils Saveable - diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py index bbca216cce493..bd43e91afd280 100644 --- a/python/pyspark/mllib/classification.py +++ b/python/pyspark/mllib/classification.py @@ -88,20 +88,26 @@ class LogisticRegressionModel(LinearClassificationModel): Classification model trained using Multinomial/Binary Logistic Regression. - :param weights: - Weights computed for every feature. - :param intercept: - Intercept computed for this model. (Only used in Binary Logistic - Regression. In Multinomial Logistic Regression, the intercepts will - not bea single value, so the intercepts will be part of the - weights.) - :param numFeatures: - The dimension of the features. - :param numClasses: - The number of possible outcomes for k classes classification problem - in Multinomial Logistic Regression. By default, it is binary - logistic regression so numClasses will be set to 2. + .. versionadded:: 0.9.0 + Parameters + ---------- + weights : :py:class:`pyspark.mllib.linalg.Vector` + Weights computed for every feature. 
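For readers comparing the two conventions side by side, the following minimal sketch (illustrative only, not part of the patch; the function name is hypothetical) shows the reST `:param:` style being removed and the NumPy documentation style being introduced:

# Before: reST field-list style used throughout pyspark.mllib prior to this patch.
def train_old(data, iterations=100):
    """
    Train a model on the given data.

    :param data:
      The training data, an RDD of LabeledPoint.
    :param iterations:
      The number of iterations.
      (default: 100)
    """

# After: NumPy documentation style adopted by this patch.
def train_new(data, iterations=100):
    """
    Train a model on the given data.

    Parameters
    ----------
    data : :py:class:`pyspark.RDD`
        The training data, an RDD of :py:class:`pyspark.mllib.regression.LabeledPoint`.
    iterations : int, optional
        The number of iterations.
        (default: 100)
    """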
+ intercept : float + Intercept computed for this model. (Only used in Binary Logistic + Regression. In Multinomial Logistic Regression, the intercepts will + not be a single value, so the intercepts will be part of the + weights.) + numFeatures : int + The dimension of the features. + numClasses : int + The number of possible outcomes for k classes classification problem + in Multinomial Logistic Regression. By default, it is binary + logistic regression so numClasses will be set to 2. + + Examples + -------- >>> from pyspark.mllib.linalg import SparseVector >>> data = [ ... LabeledPoint(0.0, [0.0, 1.0]), @@ -159,8 +165,6 @@ class LogisticRegressionModel(LinearClassificationModel): 1 >>> mcm.predict([0.0, 0.0, 0.3]) 2 - - .. versionadded:: 0.9.0 """ def __init__(self, weights, intercept, numFeatures, numClasses): super(LogisticRegressionModel, self).__init__(weights, intercept) @@ -263,54 +267,60 @@ def __repr__(self): class LogisticRegressionWithSGD(object): """ + Train a classification model for Binary Logistic Regression using Stochastic Gradient Descent. + .. versionadded:: 0.9.0 - .. note:: Deprecated in 2.0.0. Use ml.classification.LogisticRegression or - LogisticRegressionWithLBFGS. + .. deprecated:: 2.0.0 + Use ml.classification.LogisticRegression or LogisticRegressionWithLBFGS. """ @classmethod - @since('0.9.0') def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, initialWeights=None, regParam=0.01, regType="l2", intercept=False, validateData=True, convergenceTol=0.001): """ Train a logistic regression model on the given data. - :param data: - The training data, an RDD of LabeledPoint. - :param iterations: - The number of iterations. - (default: 100) - :param step: - The step parameter used in SGD. - (default: 1.0) - :param miniBatchFraction: - Fraction of data to be used for each SGD iteration. - (default: 1.0) - :param initialWeights: - The initial weights. - (default: None) - :param regParam: - The regularizer parameter. - (default: 0.01) - :param regType: - The type of regularizer used for training our model. - Supported values: + .. versionadded:: 0.9.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + The training data, an RDD of :py:class:`pyspark.mllib.regression.LabeledPoint`. + iterations : int, optional + The number of iterations. + (default: 100) + step : float, optional + The step parameter used in SGD. + (default: 1.0) + miniBatchFraction : float, optional + Fraction of data to be used for each SGD iteration. + (default: 1.0) + initialWeights : :py:class:`pyspark.mllib.linalg.Vector` or convertible, optional + The initial weights. + (default: None) + regParam : float, optional + The regularizer parameter. + (default: 0.01) + regType : str, optional + The type of regularizer used for training our model. + Supported values: - "l1" for using L1 regularization - "l2" for using L2 regularization (default) - None for no regularization - :param intercept: - Boolean parameter which indicates the use or not of the - augmented representation for training data (i.e., whether bias - features are activated or not). - (default: False) - :param validateData: - Boolean parameter which indicates if the algorithm should - validate data before training. - (default: True) - :param convergenceTol: - A condition which decides iteration termination. - (default: 0.001) + + intercept : bool, optional + Boolean parameter which indicates the use or not of the + augmented representation for training data (i.e., whether bias + features are activated or not). 
+ (default: False) + validateData : bool, optional + Boolean parameter which indicates if the algorithm should + validate data before training. + (default: True) + convergenceTol : float, optional + A condition which decides iteration termination. + (default: 0.001) """ warnings.warn( "Deprecated in 2.0.0. Use ml.classification.LogisticRegression or " @@ -326,55 +336,65 @@ def train(rdd, i): class LogisticRegressionWithLBFGS(object): """ + Train a classification model for Multinomial/Binary Logistic Regression + using Limited-memory BFGS. + + Standard feature scaling and L2 regularization are used by default. .. versionadded:: 1.2.0 """ @classmethod - @since('1.2.0') def train(cls, data, iterations=100, initialWeights=None, regParam=0.0, regType="l2", intercept=False, corrections=10, tolerance=1e-6, validateData=True, numClasses=2): """ Train a logistic regression model on the given data. - :param data: - The training data, an RDD of LabeledPoint. - :param iterations: - The number of iterations. - (default: 100) - :param initialWeights: - The initial weights. - (default: None) - :param regParam: - The regularizer parameter. - (default: 0.0) - :param regType: - The type of regularizer used for training our model. - Supported values: + .. versionadded:: 1.2.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + The training data, an RDD of :py:class:`pyspark.mllib.regression.LabeledPoint`. + iterations : int, optional + The number of iterations. + (default: 100) + initialWeights : :py:class:`pyspark.mllib.linalg.Vector` or convertible, optional + The initial weights. + (default: None) + regParam : float, optional + The regularizer parameter. + (default: 0.01) + regType : str, optional + The type of regularizer used for training our model. + Supported values: - "l1" for using L1 regularization - "l2" for using L2 regularization (default) - None for no regularization - :param intercept: - Boolean parameter which indicates the use or not of the - augmented representation for training data (i.e., whether bias - features are activated or not). - (default: False) - :param corrections: - The number of corrections used in the LBFGS update. - If a known updater is used for binary classification, - it calls the ml implementation and this parameter will - have no effect. (default: 10) - :param tolerance: - The convergence tolerance of iterations for L-BFGS. - (default: 1e-6) - :param validateData: - Boolean parameter which indicates if the algorithm should - validate data before training. - (default: True) - :param numClasses: - The number of classes (i.e., outcomes) a label can take in - Multinomial Logistic Regression. - (default: 2) + intercept : bool, optional + Boolean parameter which indicates the use or not of the + augmented representation for training data (i.e., whether bias + features are activated or not). + (default: False) + corrections : int, optional + The number of corrections used in the LBFGS update. + If a known updater is used for binary classification, + it calls the ml implementation and this parameter will + have no effect. (default: 10) + tolerance : float, optional + The convergence tolerance of iterations for L-BFGS. + (default: 1e-6) + validateData : bool, optional + Boolean parameter which indicates if the algorithm should + validate data before training. + (default: True) + numClasses : int, optional + The number of classes (i.e., outcomes) a label can take in + Multinomial Logistic Regression. + (default: 2) + + Examples + -------- >>> data = [ ... 
LabeledPoint(0.0, [0.0, 1.0]), ... LabeledPoint(1.0, [1.0, 0.0]), @@ -406,11 +426,17 @@ class SVMModel(LinearClassificationModel): """ Model for Support Vector Machines (SVMs). - :param weights: - Weights computed for every feature. - :param intercept: - Intercept computed for this model. + .. versionadded:: 0.9.0 + + Parameters + ---------- + weights : :py:class:`pyspark.mllib.linalg.Vector` + Weights computed for every feature. + intercept : float + Intercept computed for this model. + Examples + -------- >>> from pyspark.mllib.linalg import SparseVector >>> data = [ ... LabeledPoint(0.0, [0.0]), @@ -451,8 +477,6 @@ class SVMModel(LinearClassificationModel): ... rmtree(path) ... except: ... pass - - .. versionadded:: 0.9.0 """ def __init__(self, weights, intercept): super(SVMModel, self).__init__(weights, intercept) @@ -501,53 +525,59 @@ def load(cls, sc, path): class SVMWithSGD(object): """ + Train a Support Vector Machine (SVM) using Stochastic Gradient Descent. + .. versionadded:: 0.9.0 """ @classmethod - @since('0.9.0') def train(cls, data, iterations=100, step=1.0, regParam=0.01, miniBatchFraction=1.0, initialWeights=None, regType="l2", intercept=False, validateData=True, convergenceTol=0.001): """ Train a support vector machine on the given data. - :param data: - The training data, an RDD of LabeledPoint. - :param iterations: - The number of iterations. - (default: 100) - :param step: - The step parameter used in SGD. - (default: 1.0) - :param regParam: - The regularizer parameter. - (default: 0.01) - :param miniBatchFraction: - Fraction of data to be used for each SGD iteration. - (default: 1.0) - :param initialWeights: - The initial weights. - (default: None) - :param regType: - The type of regularizer used for training our model. - Allowed values: + .. versionadded:: 0.9.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + The training data, an RDD of :py:class:`pyspark.mllib.regression.LabeledPoint`. + iterations : int, optional + The number of iterations. + (default: 100) + step : float, optional + The step parameter used in SGD. + (default: 1.0) + regParam : float, optional + The regularizer parameter. + (default: 0.01) + miniBatchFraction : float, optional + Fraction of data to be used for each SGD iteration. + (default: 1.0) + initialWeights : :py:class:`pyspark.mllib.linalg.Vector` or convertible, optional + The initial weights. + (default: None) + regType : str, optional + The type of regularizer used for training our model. + Allowed values: - "l1" for using L1 regularization - "l2" for using L2 regularization (default) - None for no regularization - :param intercept: - Boolean parameter which indicates the use or not of the - augmented representation for training data (i.e. whether bias - features are activated or not). - (default: False) - :param validateData: - Boolean parameter which indicates if the algorithm should - validate data before training. - (default: True) - :param convergenceTol: - A condition which decides iteration termination. - (default: 0.001) + + intercept : bool, optional + Boolean parameter which indicates the use or not of the + augmented representation for training data (i.e. whether bias + features are activated or not). + (default: False) + validateData : bool, optional + Boolean parameter which indicates if the algorithm should + validate data before training. + (default: True) + convergenceTol : float, optional + A condition which decides iteration termination. 
+ (default: 0.001) """ def train(rdd, i): return callMLlibFunc("trainSVMModelWithSGD", rdd, int(iterations), float(step), @@ -563,14 +593,20 @@ class NaiveBayesModel(Saveable, Loader): """ Model for Naive Bayes classifiers. - :param labels: - List of labels. - :param pi: - Log of class priors, whose dimension is C, number of labels. - :param theta: - Log of class conditional probabilities, whose dimension is C-by-D, - where D is number of features. + .. versionadded:: 0.9.0 + Parameters + ---------- + labels : :py:class:`numpy.ndarray` + List of labels. + pi : :py:class:`numpy.ndarray` + Log of class priors, whose dimension is C, number of labels. + theta : :py:class:`numpy.ndarray` + Log of class conditional probabilities, whose dimension is C-by-D, + where D is number of features. + + Examples + -------- >>> from pyspark.mllib.linalg import SparseVector >>> data = [ ... LabeledPoint(0.0, [0.0, 0.0]), @@ -605,8 +641,6 @@ class NaiveBayesModel(Saveable, Loader): ... rmtree(path) ... except OSError: ... pass - - .. versionadded:: 0.9.0 """ def __init__(self, labels, pi, theta): self.labels = labels @@ -652,11 +686,12 @@ def load(cls, sc, path): class NaiveBayes(object): """ + Train a Multinomial Naive Bayes model. + .. versionadded:: 0.9.0 """ @classmethod - @since('0.9.0') def train(cls, data, lambda_=1.0): """ Train a Naive Bayes model given an RDD of (label, features) @@ -669,11 +704,15 @@ def train(cls, data, lambda_=1.0): it can also be used as `Bernoulli NB `_. The input feature values must be nonnegative. - :param data: - RDD of LabeledPoint. - :param lambda_: - The smoothing parameter. - (default: 1.0) + .. versionadded:: 0.9.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + The training data, an RDD of :py:class:`pyspark.mllib.regression.LabeledPoint`. + lambda\\_ : float, optional + The smoothing parameter. + (default: 1.0) """ first = data.first() if not isinstance(first, LabeledPoint): @@ -694,23 +733,25 @@ class StreamingLogisticRegressionWithSGD(StreamingLinearAlgorithm): of features must be constant. An initial weight vector must be provided. - :param stepSize: - Step size for each iteration of gradient descent. - (default: 0.1) - :param numIterations: - Number of iterations run for each batch of data. - (default: 50) - :param miniBatchFraction: - Fraction of each batch of data to use for updates. - (default: 1.0) - :param regParam: - L2 Regularization parameter. - (default: 0.0) - :param convergenceTol: - Value used to determine when to terminate iterations. - (default: 0.001) - .. versionadded:: 1.5.0 + + Parameters + ---------- + stepSize : float, optional + Step size for each iteration of gradient descent. + (default: 0.1) + numIterations : int, optional + Number of iterations run for each batch of data. + (default: 50) + miniBatchFraction : float, optional + Fraction of each batch of data to use for updates. + (default: 1.0) + regParam : float, optional + L2 Regularization parameter. + (default: 0.0) + convergenceTol : float, optional + Value used to determine when to terminate iterations. + (default: 0.001) """ def __init__(self, stepSize=0.1, numIterations=50, miniBatchFraction=1.0, regParam=0.0, convergenceTol=0.001): diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py index b99a4150c396d..e1a009643c5f2 100644 --- a/python/pyspark/mllib/clustering.py +++ b/python/pyspark/mllib/clustering.py @@ -41,6 +41,10 @@ class BisectingKMeansModel(JavaModelWrapper): """ A clustering model derived from the bisecting k-means method. 
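As a usage-level illustration of the classification trainers whose docstrings are migrated above (a minimal sketch, not part of the patch, assuming an active SparkContext `sc` as in the surrounding doctests):

from pyspark.mllib.classification import NaiveBayes, SVMWithSGD
from pyspark.mllib.regression import LabeledPoint

# Toy, linearly separable training set.
points = sc.parallelize([
    LabeledPoint(0.0, [0.0, 1.0]),
    LabeledPoint(1.0, [1.0, 0.0]),
])

# Keyword arguments follow the defaults documented above.
nb_model = NaiveBayes.train(points, lambda_=1.0)
svm_model = SVMWithSGD.train(points, iterations=100, step=1.0, regParam=0.01)

nb_model.predict([1.0, 0.0])   # predicted label for a single point
svm_model.predict([1.0, 0.0])  # predicted class index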
+ .. versionadded:: 2.0.0 + + Examples + -------- >>> data = array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4, 2) >>> bskm = BisectingKMeans() >>> model = bskm.train(sc.parallelize(data, 2), k=4) @@ -51,8 +55,6 @@ class BisectingKMeansModel(JavaModelWrapper): 4 >>> model.computeCost(p) 0.0 - - .. versionadded:: 2.0.0 """ def __init__(self, java_model): @@ -72,17 +74,25 @@ def k(self): """Get the number of clusters""" return self.call("k") - @since('2.0.0') def predict(self, x): """ Find the cluster that each of the points belongs to in this model. - :param x: - A data point (or RDD of points) to determine cluster index. - :return: - Predicted cluster index or an RDD of predicted cluster indices - if the input is an RDD. + .. versionadded:: 2.0.0 + + Parameters + ---------- + x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + A data point (or RDD of points) to determine cluster index. + :py:class:`pyspark.mllib.linalg.Vector` can be replaced with equivalent + objects (list, tuple, numpy.ndarray). + + Returns + ------- + int or :py:class:`pyspark.RDD` of int + Predicted cluster index or an RDD of predicted cluster indices + if the input is an RDD. """ if isinstance(x, RDD): vecs = x.map(_convert_to_vector) @@ -91,15 +101,20 @@ def predict(self, x): x = _convert_to_vector(x) return self.call("predict", x) - @since('2.0.0') def computeCost(self, x): """ Return the Bisecting K-means cost (sum of squared distances of points to their nearest center) for this model on the given data. If provided with an RDD of points returns the sum. - :param point: - A data point (or RDD of points) to compute the cost(s). + .. versionadded:: 2.0.0 + + Parameters + ---------- + point : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + A data point (or RDD of points) to compute the cost(s). + :py:class:`pyspark.mllib.linalg.Vector` can be replaced with equivalent + objects (list, tuple, numpy.ndarray). """ if isinstance(x, RDD): vecs = x.map(_convert_to_vector) @@ -122,37 +137,43 @@ class BisectingKMeans(object): clusters on the bottom level would result more than `k` leaf clusters, larger clusters get higher priority. - Based on - `Steinbach, Karypis, and Kumar, A comparison of document clustering - techniques, KDD Workshop on Text Mining, 2000 - `_. - .. versionadded:: 2.0.0 + + Notes + ----- + See the original paper [1]_ + + .. [1] Steinbach, M. et al. “A Comparison of Document Clustering Techniques.” (2000). + KDD Workshop on Text Mining, 2000 + http://glaros.dtc.umn.edu/gkhome/fetch/papers/docclusterKDDTMW00.pdf """ @classmethod - @since('2.0.0') def train(self, rdd, k=4, maxIterations=20, minDivisibleClusterSize=1.0, seed=-1888008604): """ Runs the bisecting k-means algorithm return the model. - :param rdd: - Training points as an `RDD` of `Vector` or convertible - sequence types. - :param k: - The desired number of leaf clusters. The actual number could - be smaller if there are no divisible leaf clusters. - (default: 4) - :param maxIterations: - Maximum number of iterations allowed to split clusters. - (default: 20) - :param minDivisibleClusterSize: - Minimum number of points (if >= 1.0) or the minimum proportion - of points (if < 1.0) of a divisible cluster. - (default: 1) - :param seed: - Random seed value for cluster initialization. - (default: -1888008604 from classOf[BisectingKMeans].getName.##) + .. versionadded:: 2.0.0 + + Parameters + ---------- + rdd : :py:class:`pyspark.RDD` + Training points as an `RDD` of `Vector` or convertible + sequence types. 
+ k : int, optional + The desired number of leaf clusters. The actual number could + be smaller if there are no divisible leaf clusters. + (default: 4) + maxIterations : int, optional + Maximum number of iterations allowed to split clusters. + (default: 20) + minDivisibleClusterSize : float, optional + Minimum number of points (if >= 1.0) or the minimum proportion + of points (if < 1.0) of a divisible cluster. + (default: 1) + seed : int, optional + Random seed value for cluster initialization. + (default: -1888008604 from classOf[BisectingKMeans].getName.##) """ java_model = callMLlibFunc( "trainBisectingKMeans", rdd.map(_convert_to_vector), @@ -165,6 +186,10 @@ class KMeansModel(Saveable, Loader): """A clustering model derived from the k-means method. + .. versionadded:: 0.9.0 + + Examples + -------- >>> data = array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4, 2) >>> model = KMeans.train( ... sc.parallelize(data), 2, maxIterations=10, initializationMode="random", @@ -213,8 +238,6 @@ class KMeansModel(Saveable, Loader): ... initialModel = KMeansModel([(-1000.0,-1000.0),(5.0,5.0),(1000.0,1000.0)])) >>> model.clusterCenters [array([-1000., -1000.]), array([ 5., 5.]), array([ 1000., 1000.])] - - .. versionadded:: 0.9.0 """ def __init__(self, centers): @@ -232,17 +255,25 @@ def k(self): """Total number of clusters.""" return len(self.centers) - @since('0.9.0') def predict(self, x): """ Find the cluster that each of the points belongs to in this model. - :param x: - A data point (or RDD of points) to determine cluster index. - :return: - Predicted cluster index or an RDD of predicted cluster indices - if the input is an RDD. + .. versionadded:: 0.9.0 + + Parameters + ---------- + x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + A data point (or RDD of points) to determine cluster index. + :py:class:`pyspark.mllib.linalg.Vector` can be replaced with equivalent + objects (list, tuple, numpy.ndarray). + + Returns + ------- + int or :py:class:`pyspark.RDD` of int + Predicted cluster index or an RDD of predicted cluster indices + if the input is an RDD. """ best = 0 best_distance = float("inf") @@ -257,15 +288,18 @@ def predict(self, x): best_distance = distance return best - @since('1.4.0') def computeCost(self, rdd): """ Return the K-means cost (sum of squared distances of points to their nearest center) for this model on the given data. - :param rdd: - The RDD of points to compute the cost on. + .. versionadded:: 1.4.0 + + Parameters + ---------- + rdd : ::py:class:`pyspark.RDD` + The RDD of points to compute the cost on. """ cost = callMLlibFunc("computeCostKmeansModel", rdd.map(_convert_to_vector), [_convert_to_vector(c) for c in self.centers]) @@ -292,46 +326,51 @@ def load(cls, sc, path): class KMeans(object): """ + K-means clustering. + .. versionadded:: 0.9.0 """ @classmethod - @since('0.9.0') def train(cls, rdd, k, maxIterations=100, initializationMode="k-means||", seed=None, initializationSteps=2, epsilon=1e-4, initialModel=None): """ Train a k-means clustering model. - :param rdd: - Training points as an `RDD` of `Vector` or convertible - sequence types. - :param k: - Number of clusters to create. - :param maxIterations: - Maximum number of iterations allowed. - (default: 100) - :param initializationMode: - The initialization algorithm. This can be either "random" or - "k-means||". - (default: "k-means||") - :param seed: - Random seed value for cluster initialization. Set as None to - generate seed based on system time. 
- (default: None) - :param initializationSteps: - Number of steps for the k-means|| initialization mode. - This is an advanced setting -- the default of 2 is almost - always enough. - (default: 2) - :param epsilon: - Distance threshold within which a center will be considered to - have converged. If all centers move less than this Euclidean - distance, iterations are stopped. - (default: 1e-4) - :param initialModel: - Initial cluster centers can be provided as a KMeansModel object - rather than using the random or k-means|| initializationModel. - (default: None) + .. versionadded:: 0.9.0 + + Parameters + ---------- + rdd : ::py:class:`pyspark.RDD` + Training points as an `RDD` of :py:class:`pyspark.mllib.linalg.Vector` + or convertible sequence types. + k : int + Number of clusters to create. + maxIterations : int, optional + Maximum number of iterations allowed. + (default: 100) + initializationMode : str, optional + The initialization algorithm. This can be either "random" or + "k-means||". + (default: "k-means||") + seed : int, optional + Random seed value for cluster initialization. Set as None to + generate seed based on system time. + (default: None) + initializationSteps : + Number of steps for the k-means|| initialization mode. + This is an advanced setting -- the default of 2 is almost + always enough. + (default: 2) + epsilon : float, optional + Distance threshold within which a center will be considered to + have converged. If all centers move less than this Euclidean + distance, iterations are stopped. + (default: 1e-4) + initialModel : :py:class:`KMeansModel`, optional + Initial cluster centers can be provided as a KMeansModel object + rather than using the random or k-means|| initializationModel. + (default: None) """ clusterInitialModel = [] if initialModel is not None: @@ -352,6 +391,10 @@ class GaussianMixtureModel(JavaModelWrapper, JavaSaveable, JavaLoader): """ A clustering model derived from the Gaussian Mixture Model method. + .. versionadded:: 1.3.0 + + Examples + -------- >>> from pyspark.mllib.linalg import Vectors, DenseMatrix >>> from numpy.testing import assert_equal >>> from shutil import rmtree @@ -410,8 +453,6 @@ class GaussianMixtureModel(JavaModelWrapper, JavaSaveable, JavaLoader): True >>> labels[2]==labels[3]==labels[4] True - - .. versionadded:: 1.3.0 """ @property @@ -440,17 +481,23 @@ def k(self): """Number of gaussians in mixture.""" return len(self.weights) - @since('1.3.0') def predict(self, x): """ Find the cluster to which the point 'x' or each point in RDD 'x' has maximum membership in this model. - :param x: - A feature vector or an RDD of vectors representing data points. - :return: - Predicted cluster label or an RDD of predicted cluster labels - if the input is an RDD. + .. versionadded:: 1.3.0 + + Parameters + ---------- + x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + A feature vector or an RDD of vectors representing data points. + + Returns + ------- + numpy.float64 or :py:class:`pyspark.RDD` of int + Predicted cluster label or an RDD of predicted cluster labels + if the input is an RDD. """ if isinstance(x, RDD): cluster_labels = self.predictSoft(x).map(lambda z: z.index(max(z))) @@ -459,16 +506,22 @@ def predict(self, x): z = self.predictSoft(x) return z.argmax() - @since('1.3.0') def predictSoft(self, x): """ Find the membership of point 'x' or each point in RDD 'x' to all mixture components. - :param x: - A feature vector or an RDD of vectors representing data points. 
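A compact usage sketch for the k-means trainer documented above (not part of the patch; it mirrors the values used in the doctests and assumes an active SparkContext `sc`):

from numpy import array
from pyspark.mllib.clustering import KMeans

data = sc.parallelize(array([0.0, 0.0, 1.0, 1.0, 9.0, 8.0, 8.0, 9.0]).reshape(4, 2))
model = KMeans.train(data, 2, maxIterations=10, initializationMode="random", seed=50)

model.predict([0.5, 0.5])   # cluster index for a single point
model.computeCost(data)     # sum of squared distances to the nearest centers
model.clusterCenters        # list of numpy arrays, one per cluster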
- :return: - The membership value to all mixture components for vector 'x' - or each vector in RDD 'x'. + .. versionadded:: 1.3.0 + + Parameters + ---------- + x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + A feature vector or an RDD of vectors representing data points. + + Returns + ------- + numpy.ndarray or :py:class:`pyspark.RDD` + The membership value to all mixture components for vector 'x' + or each vector in RDD 'x'. """ if isinstance(x, RDD): means, sigmas = zip(*[(g.mu, g.sigma) for g in self.gaussians]) @@ -479,14 +532,16 @@ def predictSoft(self, x): return self.call("predictSoft", _convert_to_vector(x)).toArray() @classmethod - @since('1.5.0') def load(cls, sc, path): """Load the GaussianMixtureModel from disk. - :param sc: - SparkContext. - :param path: - Path to where the model is stored. + .. versionadded:: 1.5.0 + + Parameters + ---------- + sc : :py:class:`SparkContext` + path : str + Path to where the model is stored. """ model = cls._load_java(sc, path) wrapper = sc._jvm.org.apache.spark.mllib.api.python.GaussianMixtureModelWrapper(model) @@ -499,32 +554,36 @@ class GaussianMixture(object): .. versionadded:: 1.3.0 """ + @classmethod - @since('1.3.0') def train(cls, rdd, k, convergenceTol=1e-3, maxIterations=100, seed=None, initialModel=None): """ Train a Gaussian Mixture clustering model. - :param rdd: - Training points as an `RDD` of `Vector` or convertible - sequence types. - :param k: - Number of independent Gaussians in the mixture model. - :param convergenceTol: - Maximum change in log-likelihood at which convergence is - considered to have occurred. - (default: 1e-3) - :param maxIterations: - Maximum number of iterations allowed. - (default: 100) - :param seed: - Random seed for initial Gaussian distribution. Set as None to - generate seed based on system time. - (default: None) - :param initialModel: - Initial GMM starting point, bypassing the random - initialization. - (default: None) + .. versionadded:: 1.3.0 + + Parameters + ---------- + rdd : ::py:class:`pyspark.RDD` + Training points as an `RDD` of :py:class:`pyspark.mllib.linalg.Vector` + or convertible sequence types. + k : int + Number of independent Gaussians in the mixture model. + convergenceTol : float, optional + Maximum change in log-likelihood at which convergence is + considered to have occurred. + (default: 1e-3) + maxIterations : int, optional + Maximum number of iterations allowed. + (default: 100) + seed : int, optional + Random seed for initial Gaussian distribution. Set as None to + generate seed based on system time. + (default: None) + initialModel : GaussianMixtureModel, optional + Initial GMM starting point, bypassing the random + initialization. + (default: None) """ initialModelWeights = None initialModelMu = None @@ -545,8 +604,12 @@ def train(cls, rdd, k, convergenceTol=1e-3, maxIterations=100, seed=None, initia class PowerIterationClusteringModel(JavaModelWrapper, JavaSaveable, JavaLoader): """ - Model produced by [[PowerIterationClustering]]. + Model produced by :py:class:`PowerIterationClustering`. + .. versionadded:: 1.5.0 + + Examples + -------- >>> import math >>> def genCircle(r, n): ... points = [] @@ -589,8 +652,6 @@ class PowerIterationClusteringModel(JavaModelWrapper, JavaSaveable, JavaLoader): ... rmtree(path) ... except OSError: ... pass - - .. 
versionadded:: 1.5.0 """ @property @@ -623,37 +684,48 @@ def load(cls, sc, path): class PowerIterationClustering(object): """ - Power Iteration Clustering (PIC), a scalable graph clustering algorithm - developed by [[http://www.cs.cmu.edu/~frank/papers/icml2010-pic-final.pdf Lin and Cohen]]. - From the abstract: PIC finds a very low-dimensional embedding of a - dataset using truncated power iteration on a normalized pair-wise - similarity matrix of the data. + Power Iteration Clustering (PIC), a scalable graph clustering algorithm. + + + Developed by Lin and Cohen [1]_. From the abstract: + + "PIC finds a very low-dimensional embedding of a + dataset using truncated power iteration on a normalized pair-wise + similarity matrix of the data." .. versionadded:: 1.5.0 + + .. [1] Lin, Frank & Cohen, William. (2010). Power Iteration Clustering. + http://www.cs.cmu.edu/~frank/papers/icml2010-pic-final.pdf """ @classmethod - @since('1.5.0') def train(cls, rdd, k, maxIterations=100, initMode="random"): r""" - :param rdd: - An RDD of (i, j, s\ :sub:`ij`\) tuples representing the - affinity matrix, which is the matrix A in the PIC paper. The - similarity s\ :sub:`ij`\ must be nonnegative. This is a symmetric - matrix and hence s\ :sub:`ij`\ = s\ :sub:`ji`\ For any (i, j) with - nonzero similarity, there should be either (i, j, s\ :sub:`ij`\) or - (j, i, s\ :sub:`ji`\) in the input. Tuples with i = j are ignored, - because it is assumed s\ :sub:`ij`\ = 0.0. - :param k: - Number of clusters. - :param maxIterations: - Maximum number of iterations of the PIC algorithm. - (default: 100) - :param initMode: - Initialization mode. This can be either "random" to use - a random vector as vertex properties, or "degree" to use - normalized sum similarities. - (default: "random") + Train PowerIterationClusteringModel + + .. versionadded:: 1.5.0 + + Parameters + ---------- + rdd : :py:class:`pyspark.RDD` + An RDD of (i, j, s\ :sub:`ij`\) tuples representing the + affinity matrix, which is the matrix A in the PIC paper. The + similarity s\ :sub:`ij`\ must be nonnegative. This is a symmetric + matrix and hence s\ :sub:`ij`\ = s\ :sub:`ji`\ For any (i, j) with + nonzero similarity, there should be either (i, j, s\ :sub:`ij`\) or + (j, i, s\ :sub:`ji`\) in the input. Tuples with i = j are ignored, + because it is assumed s\ :sub:`ij`\ = 0.0. + k : int + Number of clusters. + maxIterations : int, optional + Maximum number of iterations of the PIC algorithm. + (default: 100) + initMode : str, optional + Initialization mode. This can be either "random" to use + a random vector as vertex properties, or "degree" to use + normalized sum similarities. + (default: "random") """ model = callMLlibFunc("trainPowerIterationClusteringModel", rdd.map(_convert_to_vector), int(k), int(maxIterations), initMode) @@ -673,29 +745,37 @@ class StreamingKMeansModel(KMeansModel): The update formula for each centroid is given by - * c_t+1 = ((c_t * n_t * a) + (x_t * m_t)) / (n_t + m_t) - * n_t+1 = n_t * a + m_t + - c_t+1 = ((c_t * n_t * a) + (x_t * m_t)) / (n_t + m_t) + - n_t+1 = n_t * a + m_t where - * c_t: Centroid at the n_th iteration. - * n_t: Number of samples (or) weights associated with the centroid - at the n_th iteration. - * x_t: Centroid of the new data closest to c_t. - * m_t: Number of samples (or) weights of the new data closest to c_t - * c_t+1: New centroid. - * n_t+1: New number of weights. - * a: Decay Factor, which gives the forgetfulness. + - c_t: Centroid at the n_th iteration. 
+ - n_t: Number of samples (or) weights associated with the centroid + at the n_th iteration. + - x_t: Centroid of the new data closest to c_t. + - m_t: Number of samples (or) weights of the new data closest to c_t + - c_t+1: New centroid. + - n_t+1: New number of weights. + - a: Decay Factor, which gives the forgetfulness. - .. note:: If a is set to 1, it is the weighted mean of the previous - and new data. If it set to zero, the old centroids are completely - forgotten. - - :param clusterCenters: - Initial cluster centers. - :param clusterWeights: - List of weights assigned to each cluster. + .. versionadded:: 1.5.0 + Parameters + ---------- + clusterCenters : list of :py:class:`pyspark.mllib.linalg.Vector` or covertible + Initial cluster centers. + clusterWeights : :py:class:`pyspark.mllib.linalg.Vector` or covertible + List of weights assigned to each cluster. + + Notes + ----- + If a is set to 1, it is the weighted mean of the previous + and new data. If it set to zero, the old centroids are completely + forgotten. + + Examples + -------- >>> initCenters = [[0.0, 0.0], [1.0, 1.0]] >>> initWeights = [1.0, 1.0] >>> stkm = StreamingKMeansModel(initCenters, initWeights) @@ -723,8 +803,6 @@ class StreamingKMeansModel(KMeansModel): 0 >>> stkm.predict([1.5, 1.5]) 1 - - .. versionadded:: 1.5.0 """ def __init__(self, clusterCenters, clusterWeights): super(StreamingKMeansModel, self).__init__(centers=clusterCenters) @@ -740,14 +818,18 @@ def clusterWeights(self): def update(self, data, decayFactor, timeUnit): """Update the centroids, according to data - :param data: - RDD with new data for the model update. - :param decayFactor: - Forgetfulness of the previous centroids. - :param timeUnit: - Can be "batches" or "points". If points, then the decay factor - is raised to the power of number of new points and if batches, - then decay factor will be used as is. + .. versionadded:: 1.5.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + RDD with new data for the model update. + decayFactor : float + Forgetfulness of the previous centroids. + timeUnit : str + Can be "batches" or "points". If points, then the decay factor + is raised to the power of number of new points and if batches, + then decay factor will be used as is. """ if not isinstance(data, RDD): raise TypeError("Data should be of an RDD, got %s." % type(data)) @@ -772,19 +854,21 @@ class StreamingKMeans(object): More details on how the centroids are updated are provided under the docs of StreamingKMeansModel. - :param k: - Number of clusters. - (default: 2) - :param decayFactor: - Forgetfulness of the previous centroids. - (default: 1.0) - :param timeUnit: - Can be "batches" or "points". If points, then the decay factor is - raised to the power of number of new points and if batches, then - decay factor will be used as is. - (default: "batches") - .. versionadded:: 1.5.0 + + Parameters + ---------- + k : int, optional + Number of clusters. + (default: 2) + decayFactor : float, optional + Forgetfulness of the previous centroids. + (default: 1.0) + timeUnit : str, optional + Can be "batches" or "points". If points, then the decay factor is + raised to the power of number of new points and if batches, then + decay factor will be used as is. + (default: "batches") """ def __init__(self, k=2, decayFactor=1.0, timeUnit="batches"): self._k = k @@ -887,13 +971,23 @@ class LDAModel(JavaModelWrapper, JavaSaveable, Loader): Latent Dirichlet Allocation (LDA), a topic model designed for text documents. 
Terminology + - "word" = "term": an element of the vocabulary - "token": instance of a term appearing in a document - "topic": multinomial distribution over words representing some concept - References: - - Original LDA paper (journal version): - Blei, Ng, and Jordan. "Latent Dirichlet Allocation." JMLR, 2003. + .. versionadded:: 1.5.0 + + Notes + ----- + See the original LDA paper (journal version) [1]_ + + .. [1] Blei, D. et al. "Latent Dirichlet Allocation." + J. Mach. Learn. Res. 3 (2003): 993-1022. + https://www.jmlr.org/papers/v3/blei03a + + Examples + -------- >>> from pyspark.mllib.linalg import Vectors >>> from numpy.testing import assert_almost_equal, assert_equal >>> data = [ @@ -925,8 +1019,6 @@ class LDAModel(JavaModelWrapper, JavaSaveable, Loader): ... rmtree(path) ... except OSError: ... pass - - .. versionadded:: 1.5.0 """ @since('1.5.0') @@ -939,19 +1031,24 @@ def vocabSize(self): """Vocabulary size (number of terms or terms in the vocabulary)""" return self.call("vocabSize") - @since('1.6.0') def describeTopics(self, maxTermsPerTopic=None): """Return the topics described by weighted terms. - WARNING: If vocabSize and k are large, this can return a large object! - - :param maxTermsPerTopic: - Maximum number of terms to collect for each topic. - (default: vocabulary size) - :return: - Array over topics. Each topic is represented as a pair of - matching arrays: (term indices, term weights in topic). - Each topic's terms are sorted in order of decreasing weight. + .. versionadded:: 1.6.0 + .. warning:: If vocabSize and k are large, this can return a large object! + + Parameters + ---------- + maxTermsPerTopic : int, optional + Maximum number of terms to collect for each topic. + (default: vocabulary size) + + Returns + ------- + list + Array over topics. Each topic is represented as a pair of + matching arrays: (term indices, term weights in topic). + Each topic's terms are sorted in order of decreasing weight. """ if maxTermsPerTopic is None: topics = self.call("describeTopics") @@ -960,14 +1057,16 @@ def describeTopics(self, maxTermsPerTopic=None): return topics @classmethod - @since('1.5.0') def load(cls, sc, path): """Load the LDAModel from disk. - :param sc: - SparkContext. - :param path: - Path to where the model is stored. + .. versionadded:: 1.5.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + path : str + Path to where the model is stored. """ if not isinstance(sc, SparkContext): raise TypeError("sc should be a SparkContext, got type %s" % type(sc)) @@ -979,47 +1078,52 @@ def load(cls, sc, path): class LDA(object): """ + Train Latent Dirichlet Allocation (LDA) model. + .. versionadded:: 1.5.0 """ @classmethod - @since('1.5.0') def train(cls, rdd, k=10, maxIterations=20, docConcentration=-1.0, topicConcentration=-1.0, seed=None, checkpointInterval=10, optimizer="em"): """Train a LDA model. - :param rdd: - RDD of documents, which are tuples of document IDs and term - (word) count vectors. The term count vectors are "bags of - words" with a fixed-size vocabulary (where the vocabulary size - is the length of the vector). Document IDs must be unique - and >= 0. - :param k: - Number of topics to infer, i.e., the number of soft cluster - centers. - (default: 10) - :param maxIterations: - Maximum number of iterations allowed. - (default: 20) - :param docConcentration: - Concentration parameter (commonly named "alpha") for the prior - placed on documents' distributions over topics ("theta"). 
- (default: -1.0) - :param topicConcentration: - Concentration parameter (commonly named "beta" or "eta") for - the prior placed on topics' distributions over terms. - (default: -1.0) - :param seed: - Random seed for cluster initialization. Set as None to generate - seed based on system time. - (default: None) - :param checkpointInterval: - Period (in iterations) between checkpoints. - (default: 10) - :param optimizer: - LDAOptimizer used to perform the actual calculation. Currently - "em", "online" are supported. - (default: "em") + .. versionadded:: 1.5.0 + + Parameters + ---------- + rdd : :py:class:`pyspark.RDD` + RDD of documents, which are tuples of document IDs and term + (word) count vectors. The term count vectors are "bags of + words" with a fixed-size vocabulary (where the vocabulary size + is the length of the vector). Document IDs must be unique + and >= 0. + k : int, optional + Number of topics to infer, i.e., the number of soft cluster + centers. + (default: 10) + maxIterations : int, optional + Maximum number of iterations allowed. + (default: 20) + docConcentration : float, optional + Concentration parameter (commonly named "alpha") for the prior + placed on documents' distributions over topics ("theta"). + (default: -1.0) + topicConcentration : float, optional + Concentration parameter (commonly named "beta" or "eta") for + the prior placed on topics' distributions over terms. + (default: -1.0) + seed : int, optional + Random seed for cluster initialization. Set as None to generate + seed based on system time. + (default: None) + checkpointInterval : int, optional + Period (in iterations) between checkpoints. + (default: 10) + optimizer : str, optional + LDAOptimizer used to perform the actual calculation. Currently + "em", "online" are supported. + (default: "em") """ model = callMLlibFunc("trainLDAModel", rdd, k, maxIterations, docConcentration, topicConcentration, seed, diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py index f3be827fb6e4f..198a9791774a9 100644 --- a/python/pyspark/mllib/evaluation.py +++ b/python/pyspark/mllib/evaluation.py @@ -30,8 +30,15 @@ class BinaryClassificationMetrics(JavaModelWrapper): """ Evaluator for binary classification. - :param scoreAndLabels: an RDD of score, label and optional weight. + .. versionadded:: 1.4.0 + + Parameters + ---------- + scoreAndLabels : :py:class:`pyspark.RDD` + an RDD of score, label and optional weight. + Examples + -------- >>> scoreAndLabels = sc.parallelize([ ... (0.1, 0.0), (0.1, 1.0), (0.4, 0.0), (0.6, 0.0), (0.6, 1.0), (0.6, 1.0), (0.8, 1.0)], 2) >>> metrics = BinaryClassificationMetrics(scoreAndLabels) @@ -48,8 +55,6 @@ class BinaryClassificationMetrics(JavaModelWrapper): 0.79... >>> metrics.areaUnderPR 0.88... - - .. versionadded:: 1.4.0 """ def __init__(self, scoreAndLabels): @@ -95,8 +100,15 @@ class RegressionMetrics(JavaModelWrapper): """ Evaluator for regression. - :param predictionAndObservations: an RDD of prediction, observation and optional weight. + .. versionadded:: 1.4.0 + + Parameters + ---------- + predictionAndObservations : :py:class:`pyspark.RDD` + an RDD of prediction, observation and optional weight. + Examples + -------- >>> predictionAndObservations = sc.parallelize([ ... 
(2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0)]) >>> metrics = RegressionMetrics(predictionAndObservations) @@ -115,8 +127,6 @@ class RegressionMetrics(JavaModelWrapper): >>> metrics = RegressionMetrics(predictionAndObservationsWithOptWeight) >>> metrics.rootMeanSquaredError 0.68... - - .. versionadded:: 1.4.0 """ def __init__(self, predictionAndObservations): @@ -182,9 +192,15 @@ class MulticlassMetrics(JavaModelWrapper): """ Evaluator for multiclass classification. - :param predictionAndLabels: an RDD of prediction, label, optional weight - and optional probability. + .. versionadded:: 1.4.0 + + Parameters + ---------- + predictionAndLabels : :py:class:`pyspark.RDD` + an RDD of prediction, label, optional weight and optional probability. + Examples + -------- >>> predictionAndLabels = sc.parallelize([(0.0, 0.0), (0.0, 1.0), (0.0, 0.0), ... (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)]) >>> metrics = MulticlassMetrics(predictionAndLabels) @@ -246,8 +262,6 @@ class MulticlassMetrics(JavaModelWrapper): >>> metrics = MulticlassMetrics(predictionAndLabelsWithProbabilities) >>> metrics.logLoss() 0.9682... - - .. versionadded:: 1.4.0 """ def __init__(self, predictionAndLabels): @@ -377,9 +391,15 @@ class RankingMetrics(JavaModelWrapper): """ Evaluator for ranking algorithms. - :param predictionAndLabels: an RDD of (predicted ranking, - ground truth set) pairs. + .. versionadded:: 1.4.0 + Parameters + ---------- + predictionAndLabels : :py:class:`pyspark.RDD` + an RDD of (predicted ranking, ground truth set) pairs. + + Examples + -------- >>> predictionAndLabels = sc.parallelize([ ... ([1, 6, 2, 7, 8, 3, 9, 10, 4, 5], [1, 2, 3, 4, 5]), ... ([4, 1, 5, 6, 2, 7, 3, 8, 9, 10], [1, 2, 3]), @@ -407,8 +427,6 @@ class RankingMetrics(JavaModelWrapper): 0.35... >>> metrics.recallAt(15) 0.66... - - .. versionadded:: 1.4.0 """ def __init__(self, predictionAndLabels): @@ -484,10 +502,16 @@ class MultilabelMetrics(JavaModelWrapper): """ Evaluator for multilabel classification. - :param predictionAndLabels: an RDD of (predictions, labels) pairs, - both are non-null Arrays, each with - unique elements. + .. versionadded:: 1.4.0 + + Parameters + ---------- + predictionAndLabels : :py:class:`pyspark.RDD` + an RDD of (predictions, labels) pairs, + both are non-null Arrays, each with unique elements. + Examples + -------- >>> predictionAndLabels = sc.parallelize([([0.0, 1.0], [0.0, 2.0]), ([0.0, 2.0], [0.0, 1.0]), ... ([], [0.0]), ([2.0], [2.0]), ([2.0, 0.0], [2.0, 0.0]), ... ([0.0, 1.0, 2.0], [0.0, 1.0]), ([1.0], [1.0, 2.0])]) @@ -516,8 +540,6 @@ class MultilabelMetrics(JavaModelWrapper): 0.28... >>> metrics.accuracy 0.54... - - .. versionadded:: 1.4.0 """ def __init__(self, predictionAndLabels): diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index d95f9197eaedf..1d37ab815655b 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -41,7 +41,10 @@ def transform(self, vector): """ Applies transformation on a vector. - :param vector: vector to be transformed. + Parameters + ---------- + vector : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + vector or convertible or RDD to be transformed. """ raise NotImplementedError @@ -56,8 +59,15 @@ class Normalizer(VectorTransformer): For `p` = float('inf'), max(abs(vector)) will be used as norm for normalization. - :param p: Normalization in L^p^ space, p = 2 by default. + .. 
versionadded:: 1.2.0 + + Parameters + ---------- + p : float, optional + Normalization in L^p^ space, p = 2 by default. + Examples + -------- >>> from pyspark.mllib.linalg import Vectors >>> v = Vectors.dense(range(3)) >>> nor = Normalizer(1) @@ -71,21 +81,27 @@ class Normalizer(VectorTransformer): >>> nor2 = Normalizer(float("inf")) >>> nor2.transform(v) DenseVector([0.0, 0.5, 1.0]) - - .. versionadded:: 1.2.0 """ def __init__(self, p=2.0): assert p >= 1.0, "p should be greater than 1.0" self.p = float(p) - @since('1.2.0') def transform(self, vector): """ Applies unit length normalization on a vector. - :param vector: vector or RDD of vector to be normalized. - :return: normalized vector. If the norm of the input is zero, it - will return the input vector. + .. versionadded:: 1.2.0 + + Parameters + ---------- + vector : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + vector or RDD of vector to be normalized. + + Returns + ------- + :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + normalized vector(s). If the norm of the input is zero, it + will return the input vector. """ if isinstance(vector, RDD): vector = vector.map(_convert_to_vector) @@ -103,11 +119,16 @@ def transform(self, vector): """ Applies transformation on a vector or an RDD[Vector]. - .. note:: In Python, transform cannot currently be used within - an RDD transformation or action. - Call transform directly on the RDD instead. + Parameters + ---------- + vector : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + Input vector(s) to be transformed. - :param vector: Vector or RDD of Vector to be transformed. + Notes + ----- + In Python, transform cannot currently be used within + an RDD transformation or action. + Call transform directly on the RDD instead. """ if isinstance(vector, RDD): vector = vector.map(_convert_to_vector) @@ -123,19 +144,29 @@ class StandardScalerModel(JavaVectorTransformer): .. versionadded:: 1.2.0 """ - @since('1.2.0') def transform(self, vector): """ Applies standardization transformation on a vector. - .. note:: In Python, transform cannot currently be used within - an RDD transformation or action. - Call transform directly on the RDD instead. + .. versionadded:: 1.2.0 + + Parameters + ---------- + vector : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + Input vector(s) to be standardized. - :param vector: Vector or RDD of Vector to be standardized. - :return: Standardized vector. If the variance of a column is - zero, it will return default `0.0` for the column with - zero variance. + Returns + ------- + :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + Standardized vector(s). If the variance of a column is + zero, it will return default `0.0` for the column with + zero variance. + + Notes + ----- + In Python, transform cannot currently be used within + an RDD transformation or action. + Call transform directly on the RDD instead. """ return JavaVectorTransformer.transform(self, vector) @@ -196,12 +227,20 @@ class StandardScaler(object): variance using column summary statistics on the samples in the training set. - :param withMean: False by default. Centers the data with mean - before scaling. It will build a dense output, so take - care when applying to sparse input. - :param withStd: True by default. Scales the data to unit - standard deviation. + .. versionadded:: 1.2.0 + Parameters + ---------- + withMean : bool, optional + False by default. Centers the data with mean + before scaling. 
It will build a dense output, so take + care when applying to sparse input. + withStd : bool, optional + True by default. Scales the data to unit + standard deviation. + + Examples + -------- >>> vs = [Vectors.dense([-2.0, 2.3, 0]), Vectors.dense([3.8, 0.0, 1.9])] >>> dataset = sc.parallelize(vs) >>> standardizer = StandardScaler(True, True) @@ -218,8 +257,6 @@ class StandardScaler(object): True >>> model.withMean True - - .. versionadded:: 1.2.0 """ def __init__(self, withMean=False, withStd=True): if not (withMean or withStd): @@ -227,15 +264,22 @@ def __init__(self, withMean=False, withStd=True): self.withMean = withMean self.withStd = withStd - @since('1.2.0') def fit(self, dataset): """ Computes the mean and variance and stores as a model to be used for later scaling. - :param dataset: The data used to compute the mean and variance - to build the transformation model. - :return: a StandardScalarModel + .. versionadded:: 1.2.0 + + Parameters + ---------- + dataset : :py:class:`pyspark.RDD` + The data used to compute the mean and variance + to build the transformation model. + + Returns + ------- + :py:class:`StandardScalerModel` """ dataset = dataset.map(_convert_to_vector) jmodel = callMLlibFunc("fitStandardScaler", self.withMean, self.withStd, dataset) @@ -249,13 +293,21 @@ class ChiSqSelectorModel(JavaVectorTransformer): .. versionadded:: 1.4.0 """ - @since('1.4.0') def transform(self, vector): """ Applies transformation on a vector. - :param vector: Vector or RDD of Vector to be transformed. - :return: transformed vector. + .. versionadded:: 1.4.0 + + Examples + -------- + vector : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + Input vector(s) to be transformed. + + Returns + ------- + :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + transformed vector(s). """ return JavaVectorTransformer.transform(self, vector) @@ -284,6 +336,10 @@ class ChiSqSelector(object): By default, the selection method is `numTopFeatures`, with the default number of top features set to 50. + .. versionadded:: 1.4.0 + + Examples + -------- >>> from pyspark.mllib.linalg import SparseVector, DenseVector >>> from pyspark.mllib.regression import LabeledPoint >>> data = sc.parallelize([ @@ -306,8 +362,6 @@ class ChiSqSelector(object): >>> model = ChiSqSelector(selectorType="percentile", percentile=0.34).fit(data) >>> model.transform(DenseVector([7.0, 9.0, 5.0])) DenseVector([7.0]) - - .. versionadded:: 1.4.0 """ def __init__(self, numTopFeatures=50, selectorType="numTopFeatures", percentile=0.1, fpr=0.05, fdr=0.05, fwe=0.05): @@ -372,15 +426,18 @@ def setSelectorType(self, selectorType): self.selectorType = str(selectorType) return self - @since('1.4.0') def fit(self, data): """ Returns a ChiSquared feature selector. - :param data: an `RDD[LabeledPoint]` containing the labeled dataset - with categorical features. Real-valued features will be - treated as categorical for each distinct value. - Apply feature discretizer before using this function. + .. versionadded:: 1.4.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` of :py:class:`pyspark.mllib.regression.LabeledPoint` + containing the labeled dataset with categorical features. + Real-valued features will be treated as categorical for each + distinct value. Apply feature discretizer before using this function. 
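A brief end-to-end sketch of the feature-scaling workflow documented above (illustrative, not part of the patch, assuming an active SparkContext `sc`):

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.feature import StandardScaler

vectors = sc.parallelize([Vectors.dense([-2.0, 2.3, 0.0]),
                          Vectors.dense([3.8, 0.0, 1.9])])

# withMean centers each column (dense output); withStd scales to unit standard deviation.
scaler = StandardScaler(withMean=True, withStd=True)
model = scaler.fit(vectors)            # returns a StandardScalerModel
standardized = model.transform(vectors).collect()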
""" jmodel = callMLlibFunc("fitChiSqSelector", self.selectorType, self.numTopFeatures, self.percentile, self.fpr, self.fdr, self.fwe, data) @@ -399,6 +456,10 @@ class PCA(object): """ A feature transformer that projects vectors to a low-dimensional space using PCA. + .. versionadded:: 1.5.0 + + Examples + -------- >>> data = [Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ... Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ... Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0])] @@ -408,20 +469,26 @@ class PCA(object): 1.648... >>> pcArray[1] -4.013... - - .. versionadded:: 1.5.0 """ def __init__(self, k): """ - :param k: number of principal components. + Parameters + ---------- + k : int + number of principal components. """ self.k = int(k) - @since('1.5.0') def fit(self, data): """ Computes a [[PCAModel]] that contains the principal components of the input vectors. - :param data: source vectors + + .. versionadded:: 1.5.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + source vectors """ jmodel = callMLlibFunc("fitPCA", self.k, data) return PCAModel(jmodel) @@ -432,16 +499,23 @@ class HashingTF(object): Maps a sequence of terms to their term frequencies using the hashing trick. - .. note:: The terms must be hashable (can not be dict/set/list...). + .. versionadded:: 1.2.0 + + Parameters + ---------- + numFeatures : int, optional + number of features (default: 2^20) - :param numFeatures: number of features (default: 2^20) + Notes + ----- + The terms must be hashable (can not be dict/set/list...). + Examples + -------- >>> htf = HashingTF(100) >>> doc = "a a b b c d".split(" ") >>> htf.transform(doc) SparseVector(100, {...}) - - .. versionadded:: 1.2.0 """ def __init__(self, numFeatures=1 << 20): self.numFeatures = numFeatures @@ -485,7 +559,7 @@ class IDFModel(JavaVectorTransformer): .. versionadded:: 1.2.0 """ - @since('1.2.0') + def transform(self, x): """ Transforms term frequency (TF) vectors to TF-IDF vectors. @@ -494,13 +568,24 @@ def transform(self, x): the terms which occur in fewer than `minDocFreq` documents will have an entry of 0. - .. note:: In Python, transform cannot currently be used within - an RDD transformation or action. - Call transform directly on the RDD instead. + .. versionadded:: 1.2.0 + + Parameters + ---------- + x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + an RDD of term frequency vectors or a term frequency + vector - :param x: an RDD of term frequency vectors or a term frequency - vector - :return: an RDD of TF-IDF vectors or a TF-IDF vector + Returns + ------- + :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + an RDD of TF-IDF vectors or a TF-IDF vector + + Notes + ----- + In Python, transform cannot currently be used within + an RDD transformation or action. + Call transform directly on the RDD instead. """ return JavaVectorTransformer.transform(self, x) @@ -539,9 +624,15 @@ class IDF(object): `minDocFreq`). For terms that are not in at least `minDocFreq` documents, the IDF is found as 0, resulting in TF-IDFs of 0. - :param minDocFreq: minimum of documents in which a term - should appear for filtering + .. versionadded:: 1.2.0 + + Parameters + ---------- + minDocFreq : int + minimum of documents in which a term should appear for filtering + Examples + -------- >>> n = 4 >>> freqs = [Vectors.sparse(n, (1, 3), (1.0, 2.0)), ... 
Vectors.dense([0.0, 1.0, 2.0, 3.0]), @@ -560,18 +651,20 @@ class IDF(object): DenseVector([0.0, 0.0, 1.3863, 0.863]) >>> model.transform(Vectors.sparse(n, (1, 3), (1.0, 2.0))) SparseVector(4, {1: 0.0, 3: 0.5754}) - - .. versionadded:: 1.2.0 """ def __init__(self, minDocFreq=0): self.minDocFreq = minDocFreq - @since('1.2.0') def fit(self, dataset): """ Computes the inverse document frequency. - :param dataset: an RDD of term frequency vectors + .. versionadded:: 1.2.0 + + Parameters + ---------- + dataset : :py:class:`pyspark.RDD` + an RDD of term frequency vectors """ if not isinstance(dataset, RDD): raise TypeError("dataset should be an RDD of term frequency vectors") @@ -582,34 +675,55 @@ def fit(self, dataset): class Word2VecModel(JavaVectorTransformer, JavaSaveable, JavaLoader): """ class for Word2Vec model - - .. versionadded:: 1.2.0 """ - @since('1.2.0') + def transform(self, word): """ Transforms a word to its vector representation - .. note:: Local use only + .. versionadded:: 1.2.0 + + Parameters + ---------- + word : str + a word - :param word: a word - :return: vector representation of word(s) + Returns + ------- + :py:class:`pyspark.mllib.linalg.Vector` + vector representation of word(s) + + Notes + ----- + Local use only """ try: return self.call("transform", word) except Py4JJavaError: raise ValueError("%s not found" % word) - @since('1.2.0') def findSynonyms(self, word, num): """ Find synonyms of a word - :param word: a word or a vector representation of word - :param num: number of synonyms to find - :return: array of (word, cosineSimilarity) + .. versionadded:: 1.2.0 + + Parameters + ---------- + + word : str or :py:class:`pyspark.mllib.linalg.Vector` + a word or a vector representation of word + num : int + number of synonyms to find + + Returns + ------- + :py:class:`collections.abc.Iterable` + array of (word, cosineSimilarity) - .. note:: Local use only + Notes + ----- + Local use only """ if not isinstance(word, str): word = _convert_to_vector(word) @@ -653,6 +767,10 @@ class Word2Vec(object): and Distributed Representations of Words and Phrases and their Compositionality. + .. versionadded:: 1.2.0 + + Examples + -------- >>> sentence = "a b " * 100 + "a c " * 10 >>> localDoc = [sentence, sentence] >>> doc = sc.parallelize(localDoc).map(lambda line: line.split(" ")) @@ -686,9 +804,6 @@ class Word2Vec(object): ... rmtree(path) ... except OSError: ... pass - - .. versionadded:: 1.2.0 - """ def __init__(self): """ @@ -761,13 +876,20 @@ def setWindowSize(self, windowSize): self.windowSize = windowSize return self - @since('1.2.0') def fit(self, data): """ Computes the vector representation of each word in vocabulary. - :param data: training data. RDD of list of string - :return: Word2VecModel instance + .. versionadded:: 1.2.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + training data. RDD of list of string + + Returns + ------- + :py:class:`Word2VecModel` """ if not isinstance(data, RDD): raise TypeError("data should be an RDD of list of string") @@ -783,6 +905,10 @@ class ElementwiseProduct(VectorTransformer): Scales each column of the vector, with the supplied weight vector. i.e the elementwise product. + .. 
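Since the HashingTF and IDF docstrings above are meant to be used together, a minimal TF-IDF sketch may be useful (illustrative only; assumes an active SparkContext `sc`):

    from pyspark.mllib.feature import HashingTF, IDF

    docs = sc.parallelize([["spark", "mllib", "tf", "idf"],
                           ["spark", "docs"]])
    tf = HashingTF(numFeatures=100).transform(docs)  # RDD of term-frequency SparseVectors
    tf.cache()                                       # reused by both fit() and transform()
    idfModel = IDF(minDocFreq=1).fit(tf)
    tfidf = idfModel.transform(tf)                   # RDD of TF-IDF vectors
    print(tfidf.collect())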
versionadded:: 1.5.0 + + Examples + -------- >>> weight = Vectors.dense([1.0, 2.0, 3.0]) >>> eprod = ElementwiseProduct(weight) >>> a = Vectors.dense([2.0, 1.0, 3.0]) @@ -792,8 +918,6 @@ class ElementwiseProduct(VectorTransformer): >>> rdd = sc.parallelize([a, b]) >>> eprod.transform(rdd).collect() [DenseVector([2.0, 2.0, 9.0]), DenseVector([9.0, 6.0, 12.0])] - - .. versionadded:: 1.5.0 """ def __init__(self, scalingVector): self.scalingVector = _convert_to_vector(scalingVector) diff --git a/python/pyspark/mllib/feature.pyi b/python/pyspark/mllib/feature.pyi index 9ccec36abd6ff..24a46f6bee798 100644 --- a/python/pyspark/mllib/feature.pyi +++ b/python/pyspark/mllib/feature.pyi @@ -17,7 +17,7 @@ # under the License. from typing import overload -from typing import Iterable, Hashable, List, Tuple +from typing import Iterable, Hashable, List, Tuple, Union from pyspark.mllib._typing import VectorLike from pyspark.context import SparkContext @@ -135,7 +135,7 @@ class IDF: class Word2VecModel(JavaVectorTransformer, JavaSaveable, JavaLoader[Word2VecModel]): def transform(self, word: str) -> Vector: ... # type: ignore - def findSynonyms(self, word: str, num: int) -> Iterable[Tuple[str, float]]: ... + def findSynonyms(self, word: Union[str, VectorLike], num: int) -> Iterable[Tuple[str, float]]: ... def getVectors(self) -> JavaMap: ... @classmethod def load(cls, sc: SparkContext, path: str) -> Word2VecModel: ... diff --git a/python/pyspark/mllib/fpm.py b/python/pyspark/mllib/fpm.py index cbbd7b351b20d..1f87a15cb11c9 100644 --- a/python/pyspark/mllib/fpm.py +++ b/python/pyspark/mllib/fpm.py @@ -32,6 +32,10 @@ class FPGrowthModel(JavaModelWrapper, JavaSaveable, JavaLoader): A FP-Growth model for mining frequent itemsets using the Parallel FP-Growth algorithm. + .. versionadded:: 1.4.0 + + Examples + -------- >>> data = [["a", "b", "c"], ["a", "b", "d", "e"], ["a", "c", "e"], ["a", "c", "f"]] >>> rdd = sc.parallelize(data, 2) >>> model = FPGrowth.train(rdd, 0.6, 2) @@ -42,8 +46,6 @@ class FPGrowthModel(JavaModelWrapper, JavaSaveable, JavaLoader): >>> sameModel = FPGrowthModel.load(sc, model_path) >>> sorted(model.freqItemsets().collect()) == sorted(sameModel.freqItemsets().collect()) True - - .. versionadded:: 1.4.0 """ @since("1.4.0") @@ -72,20 +74,23 @@ class FPGrowth(object): """ @classmethod - @since("1.4.0") def train(cls, data, minSupport=0.3, numPartitions=-1): """ Computes an FP-Growth model that contains frequent itemsets. - :param data: - The input data set, each element contains a transaction. - :param minSupport: - The minimal support level. - (default: 0.3) - :param numPartitions: - The number of partitions used by parallel FP-growth. A value - of -1 will use the same number as input data. - (default: -1) + .. versionadded:: 1.4.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + The input data set, each element contains a transaction. + minSupport : float, optional + The minimal support level. + (default: 0.3) + numPartitions : int, optional + The number of partitions used by parallel FP-growth. A value + of -1 will use the same number as input data. + (default: -1) """ model = callMLlibFunc("trainFPGrowthModel", data, float(minSupport), int(numPartitions)) return FPGrowthModel(model) @@ -103,6 +108,10 @@ class PrefixSpanModel(JavaModelWrapper): """ Model fitted by PrefixSpan + .. versionadded:: 1.6.0 + + Examples + -------- >>> data = [ ... [["a", "b"], ["c"]], ... 
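As a quick illustration of the Word2Vec API whose docstrings are reworked above, a small training sketch (illustrative only; the toy corpus and seed are arbitrary, and an active SparkContext `sc` is assumed):

    from pyspark.mllib.feature import Word2Vec

    corpus = sc.parallelize([["a", "b", "c"] * 10] * 20)   # each element is a tokenized sentence
    model = Word2Vec().setVectorSize(5).setSeed(42).fit(corpus)
    vec = model.transform("a")               # vector for "a"; local (driver-side) use only
    synonyms = model.findSynonyms("a", 2)    # iterable of (word, cosineSimilarity) pairs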
[["a"], ["c", "b"], ["a", "b"]], @@ -112,8 +121,6 @@ class PrefixSpanModel(JavaModelWrapper): >>> model = PrefixSpan.train(rdd) >>> sorted(model.freqSequences().collect()) [FreqSequence(sequence=[['a']], freq=3), FreqSequence(sequence=[['a'], ['a']], freq=1), ... - - .. versionadded:: 1.6.0 """ @since("1.6.0") @@ -125,38 +132,45 @@ def freqSequences(self): class PrefixSpan(object): """ A parallel PrefixSpan algorithm to mine frequent sequential patterns. - The PrefixSpan algorithm is described in J. Pei, et al., PrefixSpan: - Mining Sequential Patterns Efficiently by Prefix-Projected Pattern Growth - ([[https://doi.org/10.1109/ICDE.2001.914830]]). + The PrefixSpan algorithm is described in Jian Pei et al (2001) [1]_ .. versionadded:: 1.6.0 + + .. [1] Jian Pei et al., + "PrefixSpan,: mining sequential patterns efficiently by prefix-projected pattern growth," + Proceedings 17th International Conference on Data Engineering, Heidelberg, + Germany, 2001, pp. 215-224, + doi: https://doi.org/10.1109/ICDE.2001.914830 """ @classmethod - @since("1.6.0") def train(cls, data, minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000): """ Finds the complete set of frequent sequential patterns in the input sequences of itemsets. - :param data: - The input data set, each element contains a sequence of - itemsets. - :param minSupport: - The minimal support level of the sequential pattern, any - pattern that appears more than (minSupport * - size-of-the-dataset) times will be output. - (default: 0.1) - :param maxPatternLength: - The maximal length of the sequential pattern, any pattern - that appears less than maxPatternLength will be output. - (default: 10) - :param maxLocalProjDBSize: - The maximum number of items (including delimiters used in the - internal storage format) allowed in a projected database before - local processing. If a projected database exceeds this size, - another iteration of distributed prefix growth is run. - (default: 32000000) + .. versionadded:: 1.6.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + The input data set, each element contains a sequence of + itemsets. + minSupport : float, optional + The minimal support level of the sequential pattern, any + pattern that appears more than (minSupport * + size-of-the-dataset) times will be output. + (default: 0.1) + maxPatternLength : int, optional + The maximal length of the sequential pattern, any pattern + that appears less than maxPatternLength will be output. + (default: 10) + maxLocalProjDBSize : int, optional + The maximum number of items (including delimiters used in the + internal storage format) allowed in a projected database before + local processing. If a projected database exceeds this size, + another iteration of distributed prefix growth is run. + (default: 32000000) """ model = callMLlibFunc("trainPrefixSpanModel", data, minSupport, maxPatternLength, maxLocalProjDBSize) diff --git a/python/pyspark/mllib/fpm.pyi b/python/pyspark/mllib/fpm.pyi index 880baae1a91a5..c5a6b5f6806c0 100644 --- a/python/pyspark/mllib/fpm.pyi +++ b/python/pyspark/mllib/fpm.pyi @@ -37,8 +37,8 @@ class FPGrowth: cls, data: RDD[List[T]], minSupport: float = ..., numPartitions: int = ... ) -> FPGrowthModel[T]: ... class FreqItemset(Generic[T]): - items = ... # List[T] - freq = ... # int + items: List[T] + freq: int class PrefixSpanModel(JavaModelWrapper, Generic[T]): def freqSequences(self) -> RDD[PrefixSpan.FreqSequence[T]]: ... 
diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index c1402fb98a50d..f20004ab70ab3 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -71,6 +71,8 @@ def _vector_size(v): """ Returns the size of the vector. + Examples + -------- >>> _vector_size([1., 2., 3.]) 3 >>> _vector_size((1., 2., 3.)) @@ -231,7 +233,9 @@ def toArray(self): """ Convert the vector into an numpy.ndarray - :return: numpy.ndarray + Returns + ------- + :py:class:`numpy.ndarray` """ raise NotImplementedError @@ -240,7 +244,9 @@ def asML(self): Convert this vector to the new mllib-local representation. This does NOT copy the data; it copies references. - :return: :py:class:`pyspark.ml.linalg.Vector` + Returns + ------- + :py:class:`pyspark.ml.linalg.Vector` """ raise NotImplementedError @@ -251,6 +257,8 @@ class DenseVector(Vector): storage and arithmetics will be delegated to the underlying numpy array. + Examples + -------- >>> v = Vectors.dense([1.0, 2.0]) >>> u = Vectors.dense([3.0, 4.0]) >>> v + u @@ -282,6 +290,8 @@ def parse(s): """ Parse string representation back into the DenseVector. + Examples + -------- >>> DenseVector.parse(' [ 0.0,1.0,2.0, 3.0]') DenseVector([0.0, 1.0, 2.0, 3.0]) """ @@ -312,6 +322,8 @@ def norm(self, p): """ Calculates the norm of a DenseVector. + Examples + -------- >>> a = DenseVector([0, -1, 2, -3]) >>> a.norm(2) 3.7... @@ -327,6 +339,8 @@ def dot(self, other): and a target NumPy array that is either 1- or 2-dimensional. Equivalent to calling numpy.dot of the two vectors. + Examples + -------- >>> dense = DenseVector(array.array('d', [1., 2.])) >>> dense.dot(dense) 5.0 @@ -367,6 +381,8 @@ def squared_distance(self, other): """ Squared distance of two Vectors. + Examples + -------- >>> dense1 = DenseVector(array.array('d', [1., 2.])) >>> dense1.squared_distance(dense1) 0.0 @@ -412,9 +428,11 @@ def asML(self): Convert this vector to the new mllib-local representation. This does NOT copy the data; it copies references. - :return: :py:class:`pyspark.ml.linalg.DenseVector` - .. versionadded:: 2.0.0 + + Returns + ------- + :py:class:`pyspark.ml.linalg.DenseVector` """ return newlinalg.DenseVector(self.array) @@ -501,12 +519,18 @@ def __init__(self, size, *args): (index, value) pairs, or two separate arrays of indices and values (sorted by index). - :param size: Size of the vector. - :param args: Active entries, as a dictionary {index: value, ...}, - a list of tuples [(index, value), ...], or a list of strictly - increasing indices and a list of corresponding values [index, ...], - [value, ...]. Inactive entries are treated as zeros. - + Parameters + ---------- + size : int + Size of the vector. + args + Active entries, as a dictionary {index: value, ...}, + a list of tuples [(index, value), ...], or a list of strictly + increasing indices and a list of corresponding values [index, ...], + [value, ...]. Inactive entries are treated as zeros. + + Examples + -------- >>> SparseVector(4, {1: 1.0, 3: 5.5}) SparseVector(4, {1: 1.0, 3: 5.5}) >>> SparseVector(4, [(1, 1.0), (3, 5.5)]) @@ -556,6 +580,8 @@ def norm(self, p): """ Calculates the norm of a SparseVector. + Examples + -------- >>> a = SparseVector(4, [0, 1], [3., -4.]) >>> a.norm(1) 7.0 @@ -574,6 +600,8 @@ def parse(s): """ Parse string representation back into the SparseVector. 
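The DenseVector examples above can be exercised locally without a SparkContext; an illustrative recap:

    from pyspark.mllib.linalg import DenseVector

    u = DenseVector([1.0, 2.0])
    v = DenseVector([3.0, 4.0])
    print(u.dot(v))               # 11.0
    print(u.norm(2))              # Euclidean norm, ~2.236
    print(u.squared_distance(v))  # 8.0
    print(u.toArray())            # the underlying numpy.ndarray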
+ Examples + -------- >>> SparseVector.parse(' (4, [0,1 ],[ 4.0,5.0] )') SparseVector(4, {0: 4.0, 1: 5.0}) """ @@ -622,6 +650,8 @@ def dot(self, other): """ Dot product with a SparseVector or 1- or 2-dimensional Numpy array. + Examples + -------- >>> a = SparseVector(4, [1, 3], [3.0, 4.0]) >>> a.dot(a) 25.0 @@ -678,6 +708,8 @@ def squared_distance(self, other): """ Squared distance from a SparseVector or 1-dimensional NumPy array. + Examples + -------- >>> a = SparseVector(4, [1, 3], [3.0, 4.0]) >>> a.squared_distance(a) 0.0 @@ -754,9 +786,11 @@ def asML(self): Convert this vector to the new mllib-local representation. This does NOT copy the data; it copies references. - :return: :py:class:`pyspark.ml.linalg.SparseVector` - .. versionadded:: 2.0.0 + + Returns + ------- + :py:class:`pyspark.ml.linalg.SparseVector` """ return newlinalg.SparseVector(self.size, self.indices, self.values) @@ -828,10 +862,12 @@ class Vectors(object): """ Factory methods for working with vectors. - .. note:: Dense vectors are simply represented as NumPy array objects, - so there is no need to covert them for use in MLlib. For sparse vectors, - the factory methods in this class create an MLlib-compatible type, or users - can pass in SciPy's `scipy.sparse` column vectors. + Notes + ----- + Dense vectors are simply represented as NumPy array objects, + so there is no need to covert them for use in MLlib. For sparse vectors, + the factory methods in this class create an MLlib-compatible type, or users + can pass in SciPy's `scipy.sparse` column vectors. """ @staticmethod @@ -841,10 +877,16 @@ def sparse(size, *args): (index, value) pairs, or two separate arrays of indices and values (sorted by index). - :param size: Size of the vector. - :param args: Non-zero entries, as a dictionary, list of tuples, - or two sorted lists containing indices and values. + Parameters + ---------- + size : int + Size of the vector. + args + Non-zero entries, as a dictionary, list of tuples, + or two sorted lists containing indices and values. + Examples + -------- >>> Vectors.sparse(4, {1: 1.0, 3: 5.5}) SparseVector(4, {1: 1.0, 3: 5.5}) >>> Vectors.sparse(4, [(1, 1.0), (3, 5.5)]) @@ -859,6 +901,8 @@ def dense(*elements): """ Create a dense vector of 64-bit floats from a Python list or numbers. + Examples + -------- >>> Vectors.dense([1, 2, 3]) DenseVector([1.0, 2.0, 3.0]) >>> Vectors.dense(1.0, 2.0) @@ -875,10 +919,15 @@ def fromML(vec): Convert a vector from the new mllib-local representation. This does NOT copy the data; it copies references. - :param vec: a :py:class:`pyspark.ml.linalg.Vector` - :return: a :py:class:`pyspark.mllib.linalg.Vector` - .. versionadded:: 2.0.0 + + Parameters + ---------- + vec : :py:class:`pyspark.ml.linalg.Vector` + + Returns + ------- + :py:class:`pyspark.mllib.linalg.Vector` """ if isinstance(vec, newlinalg.DenseVector): return DenseVector(vec.array) @@ -893,6 +942,8 @@ def stringify(vector): Converts a vector into a string, which can be recognized by Vectors.parse(). + Examples + -------- >>> Vectors.stringify(Vectors.sparse(2, [1], [1.0])) '(2,[1],[1.0])' >>> Vectors.stringify(Vectors.dense([0.0, 1.0])) @@ -907,6 +958,8 @@ def squared_distance(v1, v2): a and b can be of type SparseVector, DenseVector, np.ndarray or array.array. + Examples + -------- >>> a = Vectors.sparse(4, [(0, 1), (3, 4)]) >>> b = Vectors.dense([2, 5, 4, 1]) >>> a.squared_distance(b) @@ -926,6 +979,8 @@ def norm(vector, p): def parse(s): """Parse a string representation back into the Vector. 
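The Vectors factory methods documented above accept several equivalent sparse encodings; a local, illustrative sketch:

    from pyspark.mllib.linalg import Vectors

    a = Vectors.sparse(4, {1: 1.0, 3: 5.5})        # {index: value} dict
    b = Vectors.sparse(4, [(1, 1.0), (3, 5.5)])    # (index, value) pairs
    c = Vectors.sparse(4, [1, 3], [1.0, 5.5])      # parallel index/value lists
    d = Vectors.dense([0.0, 1.0, 2.0, 5.5])
    print(a == b == c)            # True
    print(a.squared_distance(d))  # 4.0 (mixed sparse/dense)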
+ Examples + -------- >>> Vectors.parse('[2,1,2 ]') DenseVector([2.0, 1.0, 2.0]) >>> Vectors.parse(' ( 100, [0], [2])') @@ -1023,6 +1078,8 @@ def __str__(self): """ Pretty printing of a DenseMatrix + Examples + -------- >>> dm = DenseMatrix(2, 2, range(4)) >>> print(dm) DenseMatrix([[ 0., 2.], @@ -1044,6 +1101,8 @@ def __repr__(self): """ Representation of a DenseMatrix + Examples + -------- >>> dm = DenseMatrix(2, 2, range(4)) >>> dm DenseMatrix(2, 2, [0.0, 1.0, 2.0, 3.0], False) @@ -1067,6 +1126,8 @@ def toArray(self): """ Return an numpy.ndarray + Examples + -------- >>> m = DenseMatrix(2, 2, range(4)) >>> m.toArray() array([[ 0., 2.], @@ -1098,9 +1159,11 @@ def asML(self): Convert this matrix to the new mllib-local representation. This does NOT copy the data; it copies references. - :return: :py:class:`pyspark.ml.linalg.DenseMatrix` - .. versionadded:: 2.0.0 + + Returns + ------- + :py:class:`pyspark.ml.linalg.DenseMatrix` """ return newlinalg.DenseMatrix(self.numRows, self.numCols, self.values, self.isTransposed) @@ -1154,6 +1217,8 @@ def __str__(self): """ Pretty printing of a SparseMatrix + Examples + -------- >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) >>> print(sm1) 2 X 2 CSCMatrix @@ -1200,6 +1265,8 @@ def __repr__(self): """ Representation of a SparseMatrix + Examples + -------- >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) >>> sm1 SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2.0, 3.0, 4.0], False) @@ -1281,9 +1348,11 @@ def asML(self): Convert this matrix to the new mllib-local representation. This does NOT copy the data; it copies references. - :return: :py:class:`pyspark.ml.linalg.SparseMatrix` - .. versionadded:: 2.0.0 + + Returns + ------- + :py:class:`pyspark.ml.linalg.SparseMatrix` """ return newlinalg.SparseMatrix(self.numRows, self.numCols, self.colPtrs, self.rowIndices, self.values, self.isTransposed) @@ -1314,10 +1383,15 @@ def fromML(mat): Convert a matrix from the new mllib-local representation. This does NOT copy the data; it copies references. - :param mat: a :py:class:`pyspark.ml.linalg.Matrix` - :return: a :py:class:`pyspark.mllib.linalg.Matrix` - .. versionadded:: 2.0.0 + + Parameters + ---------- + mat : :py:class:`pyspark.ml.linalg.Matrix` + + Returns + ------- + :py:class:`pyspark.mllib.linalg.Matrix` """ if isinstance(mat, newlinalg.DenseMatrix): return DenseMatrix(mat.numRows, mat.numCols, mat.values, mat.isTransposed) diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py index 603d31d3d7b26..f0e889b15bf51 100644 --- a/python/pyspark/mllib/linalg/distributed.py +++ b/python/pyspark/mllib/linalg/distributed.py @@ -55,16 +55,22 @@ class RowMatrix(DistributedMatrix): Represents a row-oriented distributed Matrix with no meaningful row indices. - :param rows: An RDD or DataFrame of vectors. If a DataFrame is provided, it must have a single - vector typed column. - :param numRows: Number of rows in the matrix. A non-positive - value means unknown, at which point the number - of rows will be determined by the number of - records in the `rows` RDD. - :param numCols: Number of columns in the matrix. A non-positive - value means unknown, at which point the number - of columns will be determined by the size of - the first row. + + Parameters + ---------- + rows : :py:class:`pyspark.RDD` or :py:class:`pyspark.sql.DataFrame` + An RDD or DataFrame of vectors. If a DataFrame is provided, it must have a single + vector typed column. + numRows : int, optional + Number of rows in the matrix. 
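For the local matrix types whose docstrings are updated here, an illustrative sketch of the column-major (CSC) layouts shown in the doctests:

    from pyspark.mllib.linalg import DenseMatrix, SparseMatrix

    dm = DenseMatrix(2, 2, [0.0, 1.0, 2.0, 3.0])                    # column-major values
    sm = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2.0, 3.0, 4.0])  # colPtrs, rowIndices, values
    print(dm.toArray())   # [[0., 2.], [1., 3.]]
    print(sm.toArray())   # [[2., 0.], [3., 4.]]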
A non-positive + value means unknown, at which point the number + of rows will be determined by the number of + records in the `rows` RDD. + numCols : int, optional + Number of columns in the matrix. A non-positive + value means unknown, at which point the number + of columns will be determined by the size of + the first row. """ def __init__(self, rows, numRows=0, numCols=0): """ @@ -77,6 +83,8 @@ def __init__(self, rows, numRows=0, numCols=0): object, in which case we can wrap it directly. This assists in clean matrix conversions. + Examples + -------- >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6]]) >>> mat = RowMatrix(rows) @@ -108,6 +116,8 @@ def rows(self): """ Rows of the RowMatrix stored as an RDD of vectors. + Examples + -------- >>> mat = RowMatrix(sc.parallelize([[1, 2, 3], [4, 5, 6]])) >>> rows = mat.rows >>> rows.first() @@ -119,6 +129,8 @@ def numRows(self): """ Get or compute the number of rows. + Examples + -------- >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6], ... [7, 8, 9], [10, 11, 12]]) @@ -136,6 +148,8 @@ def numCols(self): """ Get or compute the number of cols. + Examples + -------- >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6], ... [7, 8, 9], [10, 11, 12]]) @@ -149,14 +163,19 @@ def numCols(self): """ return self._java_matrix_wrapper.call("numCols") - @since('2.0.0') def computeColumnSummaryStatistics(self): """ Computes column-wise summary statistics. - :return: :class:`MultivariateStatisticalSummary` object - containing column-wise summary statistics. + .. versionadded:: 2.0.0 + + Returns + ------- + :py:class:`MultivariateStatisticalSummary` + object containing column-wise summary statistics. + Examples + -------- >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6]]) >>> mat = RowMatrix(rows) @@ -167,14 +186,19 @@ def computeColumnSummaryStatistics(self): java_col_stats = self._java_matrix_wrapper.call("computeColumnSummaryStatistics") return MultivariateStatisticalSummary(java_col_stats) - @since('2.0.0') def computeCovariance(self): """ Computes the covariance matrix, treating each row as an observation. - .. note:: This cannot be computed on matrices with more than 65535 columns. + .. versionadded:: 2.0.0 + + Notes + ----- + This cannot be computed on matrices with more than 65535 columns. + Examples + -------- >>> rows = sc.parallelize([[1, 2], [2, 1]]) >>> mat = RowMatrix(rows) @@ -183,13 +207,18 @@ def computeCovariance(self): """ return self._java_matrix_wrapper.call("computeCovariance") - @since('2.0.0') def computeGramianMatrix(self): """ Computes the Gramian matrix `A^T A`. - .. note:: This cannot be computed on matrices with more than 65535 columns. + .. versionadded:: 2.0.0 + Notes + ----- + This cannot be computed on matrices with more than 65535 columns. + + Examples + -------- >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6]]) >>> mat = RowMatrix(rows) @@ -220,11 +249,12 @@ def columnSimilarities(self, threshold=0.0): similarity threshold. To describe the guarantee, we set some notation: - * Let A be the smallest in magnitude non-zero element of - this matrix. - * Let B be the largest in magnitude non-zero element of - this matrix. - * Let L be the maximum number of non-zeros per row. + + - Let A be the smallest in magnitude non-zero element of + this matrix. + - Let B be the largest in magnitude non-zero element of + this matrix. + - Let L be the maximum number of non-zeros per row. For example, for {0,1} matrices: A=B=1. 
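A brief sketch of the RowMatrix summary-statistics methods documented above (illustrative only; assumes an active SparkContext `sc`):

    from pyspark.mllib.linalg.distributed import RowMatrix

    mat = RowMatrix(sc.parallelize([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]))
    stats = mat.computeColumnSummaryStatistics()
    print(stats.mean())              # column means: [2.5, 3.5, 4.5]
    print(mat.computeCovariance())   # local DenseMatrix; needs <= 65535 columns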
Another example, for the Netflix matrix: A=1, B=5 @@ -236,20 +266,31 @@ def columnSimilarities(self, threshold=0.0): The shuffle size is bounded by the *smaller* of the following two expressions: - * O(n log(n) L / (threshold * A)) - * O(m L^2^) + - O(n log(n) L / (threshold * A)) + - O(m L^2^) The latter is the cost of the brute-force approach, so for non-zero thresholds, the cost is always cheaper than the brute-force approach. - :param: threshold: Set to 0 for deterministic guaranteed - correctness. Similarities above this - threshold are estimated with the cost vs - estimate quality trade-off described above. - :return: An n x n sparse upper-triangular CoordinateMatrix of - cosine similarities between columns of this matrix. + .. versionadded:: 2.0.0 + + Parameters + ---------- + threshold : float, optional + Set to 0 for deterministic guaranteed + correctness. Similarities above this + threshold are estimated with the cost vs + estimate quality trade-off described above. + Returns + ------- + :py:class:`CoordinateMatrix` + An n x n sparse upper-triangular CoordinateMatrix of + cosine similarities between columns of this matrix. + + Examples + -------- >>> rows = sc.parallelize([[1, 2], [1, 5]]) >>> mat = RowMatrix(rows) @@ -260,23 +301,32 @@ def columnSimilarities(self, threshold=0.0): java_sims_mat = self._java_matrix_wrapper.call("columnSimilarities", float(threshold)) return CoordinateMatrix(java_sims_mat) - @since('2.0.0') def tallSkinnyQR(self, computeQ=False): """ Compute the QR decomposition of this RowMatrix. The implementation is designed to optimize the QR decomposition - (factorization) for the RowMatrix of a tall and skinny shape. + (factorization) for the RowMatrix of a tall and skinny shape [1]_. - Reference: - Paul G. Constantine, David F. Gleich. "Tall and skinny QR - factorizations in MapReduce architectures" - ([[https://doi.org/10.1145/1996092.1996103]]) + .. [1] Paul G. Constantine, David F. Gleich. "Tall and skinny QR + factorizations in MapReduce architectures" + https://doi.org/10.1145/1996092.1996103 - :param: computeQ: whether to computeQ - :return: QRDecomposition(Q: RowMatrix, R: Matrix), where - Q = None if computeQ = false. + .. versionadded:: 2.0.0 + Parameters + ---------- + computeQ : bool, optional + whether to computeQ + + Returns + ------- + :py:class:`pyspark.mllib.linalg.QRDecomposition` + QRDecomposition(Q: RowMatrix, R: Matrix), where + Q = None if computeQ = false. + + Examples + -------- >>> rows = sc.parallelize([[3, -6], [4, -8], [0, 1]]) >>> mat = RowMatrix(rows) >>> decomp = mat.tallSkinnyQR(True) @@ -301,7 +351,6 @@ def tallSkinnyQR(self, computeQ=False): R = decomp.call("R") return QRDecomposition(Q, R) - @since('2.2.0') def computeSVD(self, k, computeU=False, rCond=1e-9): """ Computes the singular value decomposition of the RowMatrix. @@ -309,27 +358,39 @@ def computeSVD(self, k, computeU=False, rCond=1e-9): The given row matrix A of dimension (m X n) is decomposed into U * s * V'T where - * U: (m X k) (left singular vectors) is a RowMatrix whose - columns are the eigenvectors of (A X A') - * s: DenseVector consisting of square root of the eigenvalues - (singular values) in descending order. - * v: (n X k) (right singular vectors) is a Matrix whose columns - are the eigenvectors of (A' X A) + - U: (m X k) (left singular vectors) is a RowMatrix whose + columns are the eigenvectors of (A X A') + - s: DenseVector consisting of square root of the eigenvalues + (singular values) in descending order. 
+ - v: (n X k) (right singular vectors) is a Matrix whose columns + are the eigenvectors of (A' X A) For more specific details on implementation, please refer the Scala documentation. - :param k: Number of leading singular values to keep (`0 < k <= n`). - It might return less than k if there are numerically zero singular values - or there are not enough Ritz values converged before the maximum number of - Arnoldi update iterations is reached (in case that matrix A is ill-conditioned). - :param computeU: Whether or not to compute U. If set to be - True, then U is computed by A * V * s^-1 - :param rCond: Reciprocal condition number. All singular values - smaller than rCond * s[0] are treated as zero - where s[0] is the largest singular value. - :returns: :py:class:`SingularValueDecomposition` - + .. versionadded:: 2.2.0 + + Parameters + ---------- + k : int + Number of leading singular values to keep (`0 < k <= n`). + It might return less than k if there are numerically zero singular values + or there are not enough Ritz values converged before the maximum number of + Arnoldi update iterations is reached (in case that matrix A is ill-conditioned). + computeU : bool, optional + Whether or not to compute U. If set to be + True, then U is computed by A * V * s^-1 + rCond : float, optional + Reciprocal condition number. All singular values + smaller than rCond * s[0] are treated as zero + where s[0] is the largest singular value. + + Returns + ------- + :py:class:`SingularValueDecomposition` + + Examples + -------- >>> rows = sc.parallelize([[3, 1, 1], [-1, 3, 1]]) >>> rm = RowMatrix(rows) @@ -345,16 +406,27 @@ def computeSVD(self, k, computeU=False, rCond=1e-9): "computeSVD", int(k), bool(computeU), float(rCond)) return SingularValueDecomposition(j_model) - @since('2.2.0') def computePrincipalComponents(self, k): """ Computes the k principal components of the given row matrix - .. note:: This cannot be computed on matrices with more than 65535 columns. + .. versionadded:: 2.2.0 + + Notes + ----- + This cannot be computed on matrices with more than 65535 columns. - :param k: Number of principal components to keep. - :returns: :py:class:`pyspark.mllib.linalg.DenseMatrix` + Parameters + ---------- + k : int + Number of principal components to keep. + Returns + ------- + :py:class:`pyspark.mllib.linalg.DenseMatrix` + + Examples + -------- >>> rows = sc.parallelize([[1, 2, 3], [2, 4, 5], [3, 6, 1]]) >>> rm = RowMatrix(rows) @@ -370,15 +442,24 @@ def computePrincipalComponents(self, k): """ return self._java_matrix_wrapper.call("computePrincipalComponents", k) - @since('2.2.0') def multiply(self, matrix): """ Multiply this matrix by a local dense matrix on the right. - :param matrix: a local dense matrix whose number of rows must match the number of columns - of this matrix - :returns: :py:class:`RowMatrix` + .. versionadded:: 2.2.0 + + Parameters + ---------- + matrix : :py:class:`pyspark.mllib.linalg.Matrix` + a local dense matrix whose number of rows must match the number of columns + of this matrix + Returns + ------- + :py:class:`RowMatrix` + + Examples + -------- >>> rm = RowMatrix(sc.parallelize([[0, 1], [2, 3]])) >>> rm.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect() [DenseVector([2.0, 3.0]), DenseVector([6.0, 11.0])] @@ -438,8 +519,12 @@ class IndexedRow(object): Just a wrapper over a (int, vector) tuple. - :param index: The index for the given row. - :param vector: The row in the matrix at the given index. + Parameters + ---------- + index : int + The index for the given row. 
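To make the computeSVD contract above concrete, a small illustrative sketch (assumes an active SparkContext `sc`):

    from pyspark.mllib.linalg.distributed import RowMatrix

    rm = RowMatrix(sc.parallelize([[3.0, 1.0, 1.0], [-1.0, 3.0, 1.0]]))
    svd = rm.computeSVD(2, computeU=True)
    print(svd.s)                  # singular values in descending order
    print(svd.V)                  # local n x k matrix of right singular vectors
    print(svd.U.rows.collect())   # distributed left singular vectors (U is None if computeU=False)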
+ vector : :py:class:`pyspark.mllib.linalg.Vector` or convertible + The row in the matrix at the given index. """ def __init__(self, index, vector): self.index = int(index) @@ -462,16 +547,21 @@ class IndexedRowMatrix(DistributedMatrix): """ Represents a row-oriented distributed Matrix with indexed rows. - :param rows: An RDD of IndexedRows or (int, vector) tuples or a DataFrame consisting of a - int typed column of indices and a vector typed column. - :param numRows: Number of rows in the matrix. A non-positive - value means unknown, at which point the number - of rows will be determined by the max row - index plus one. - :param numCols: Number of columns in the matrix. A non-positive - value means unknown, at which point the number - of columns will be determined by the size of - the first row. + Parameters + ---------- + rows : :py:class:`pyspark.RDD` + An RDD of IndexedRows or (int, vector) tuples or a DataFrame consisting of a + int typed column of indices and a vector typed column. + numRows : int, optional + Number of rows in the matrix. A non-positive + value means unknown, at which point the number + of rows will be determined by the max row + index plus one. + numCols : int, optional + Number of columns in the matrix. A non-positive + value means unknown, at which point the number + of columns will be determined by the size of + the first row. """ def __init__(self, rows, numRows=0, numCols=0): """ @@ -484,6 +574,8 @@ def __init__(self, rows, numRows=0, numCols=0): object, in which case we can wrap it directly. This assists in clean matrix conversions. + Examples + -------- >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), ... IndexedRow(1, [4, 5, 6])]) >>> mat = IndexedRowMatrix(rows) @@ -524,6 +616,8 @@ def rows(self): """ Rows of the IndexedRowMatrix stored as an RDD of IndexedRows. + Examples + -------- >>> mat = IndexedRowMatrix(sc.parallelize([IndexedRow(0, [1, 2, 3]), ... IndexedRow(1, [4, 5, 6])])) >>> rows = mat.rows @@ -542,6 +636,8 @@ def numRows(self): """ Get or compute the number of rows. + Examples + -------- >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), ... IndexedRow(1, [4, 5, 6]), ... IndexedRow(2, [7, 8, 9]), @@ -561,6 +657,8 @@ def numCols(self): """ Get or compute the number of cols. + Examples + -------- >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), ... IndexedRow(1, [4, 5, 6]), ... IndexedRow(2, [7, 8, 9]), @@ -580,6 +678,8 @@ def columnSimilarities(self): """ Compute all cosine similarities between columns. + Examples + -------- >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), ... IndexedRow(6, [4, 5, 6])]) >>> mat = IndexedRowMatrix(rows) @@ -590,13 +690,18 @@ def columnSimilarities(self): java_coordinate_matrix = self._java_matrix_wrapper.call("columnSimilarities") return CoordinateMatrix(java_coordinate_matrix) - @since('2.0.0') def computeGramianMatrix(self): """ Computes the Gramian matrix `A^T A`. - .. note:: This cannot be computed on matrices with more than 65535 columns. + .. versionadded:: 2.0.0 + + Notes + ----- + This cannot be computed on matrices with more than 65535 columns. + Examples + -------- >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), ... IndexedRow(1, [4, 5, 6])]) >>> mat = IndexedRowMatrix(rows) @@ -610,6 +715,8 @@ def toRowMatrix(self): """ Convert this matrix to a RowMatrix. + Examples + -------- >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), ... 
IndexedRow(6, [4, 5, 6])]) >>> mat = IndexedRowMatrix(rows).toRowMatrix() @@ -623,6 +730,8 @@ def toCoordinateMatrix(self): """ Convert this matrix to a CoordinateMatrix. + Examples + -------- >>> rows = sc.parallelize([IndexedRow(0, [1, 0]), ... IndexedRow(6, [0, 5])]) >>> mat = IndexedRowMatrix(rows).toCoordinateMatrix() @@ -636,13 +745,19 @@ def toBlockMatrix(self, rowsPerBlock=1024, colsPerBlock=1024): """ Convert this matrix to a BlockMatrix. - :param rowsPerBlock: Number of rows that make up each block. - The blocks forming the final rows are not - required to have the given number of rows. - :param colsPerBlock: Number of columns that make up each block. - The blocks forming the final columns are not - required to have the given number of columns. - + Parameters + ---------- + rowsPerBlock : int, optional + Number of rows that make up each block. + The blocks forming the final rows are not + required to have the given number of rows. + colsPerBlock : int, optional + Number of columns that make up each block. + The blocks forming the final columns are not + required to have the given number of columns. + + Examples + -------- >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), ... IndexedRow(6, [4, 5, 6])]) >>> mat = IndexedRowMatrix(rows).toBlockMatrix() @@ -661,7 +776,6 @@ def toBlockMatrix(self, rowsPerBlock=1024, colsPerBlock=1024): colsPerBlock) return BlockMatrix(java_block_matrix, rowsPerBlock, colsPerBlock) - @since('2.2.0') def computeSVD(self, k, computeU=False, rCond=1e-9): """ Computes the singular value decomposition of the IndexedRowMatrix. @@ -679,17 +793,29 @@ def computeSVD(self, k, computeU=False, rCond=1e-9): For more specific details on implementation, please refer the scala documentation. - :param k: Number of leading singular values to keep (`0 < k <= n`). - It might return less than k if there are numerically zero singular values - or there are not enough Ritz values converged before the maximum number of - Arnoldi update iterations is reached (in case that matrix A is ill-conditioned). - :param computeU: Whether or not to compute U. If set to be - True, then U is computed by A * V * s^-1 - :param rCond: Reciprocal condition number. All singular values - smaller than rCond * s[0] are treated as zero - where s[0] is the largest singular value. - :returns: SingularValueDecomposition object - + .. versionadded:: 2.2.0 + + Parameters + ---------- + k : int + Number of leading singular values to keep (`0 < k <= n`). + It might return less than k if there are numerically zero singular values + or there are not enough Ritz values converged before the maximum number of + Arnoldi update iterations is reached (in case that matrix A is ill-conditioned). + computeU : bool, optional + Whether or not to compute U. If set to be + True, then U is computed by A * V * s^-1 + rCond : float, optional + Reciprocal condition number. All singular values + smaller than rCond * s[0] are treated as zero + where s[0] is the largest singular value. + + Returns + ------- + :py:class:`SingularValueDecomposition` + + Examples + -------- >>> rows = [(0, (3, 1, 1)), (1, (-1, 3, 1))] >>> irm = IndexedRowMatrix(sc.parallelize(rows)) >>> svd_model = irm.computeSVD(2, True) @@ -705,15 +831,24 @@ def computeSVD(self, k, computeU=False, rCond=1e-9): "computeSVD", int(k), bool(computeU), float(rCond)) return SingularValueDecomposition(j_model) - @since('2.2.0') def multiply(self, matrix): """ Multiply this matrix by a local dense matrix on the right. 
- :param matrix: a local dense matrix whose number of rows must match the number of columns - of this matrix - :returns: :py:class:`IndexedRowMatrix` + .. versionadded:: 2.2.0 + + Parameters + ---------- + matrix : :py:class:`pyspark.mllib.linalg.Matrix` + a local dense matrix whose number of rows must match the number of columns + of this matrix + Returns + ------- + :py:class:`IndexedRowMatrix` + + Examples + -------- >>> mat = IndexedRowMatrix(sc.parallelize([(0, (0, 1)), (1, (2, 3))])) >>> mat.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect() [IndexedRow(0, [2.0,3.0]), IndexedRow(1, [6.0,11.0])] @@ -730,9 +865,14 @@ class MatrixEntry(object): Just a wrapper over a (int, int, float) tuple. - :param i: The row index of the matrix. - :param j: The column index of the matrix. - :param value: The (i, j)th entry of the matrix, as a float. + Parameters + ---------- + i : int + The row index of the matrix. + j : int + The column index of the matrix. + value : float + The (i, j)th entry of the matrix, as a float. """ def __init__(self, i, j, value): self.i = int(i) @@ -756,16 +896,21 @@ class CoordinateMatrix(DistributedMatrix): """ Represents a matrix in coordinate format. - :param entries: An RDD of MatrixEntry inputs or - (int, int, float) tuples. - :param numRows: Number of rows in the matrix. A non-positive - value means unknown, at which point the number - of rows will be determined by the max row - index plus one. - :param numCols: Number of columns in the matrix. A non-positive - value means unknown, at which point the number - of columns will be determined by the max row - index plus one. + Parameters + ---------- + entries : :py:class:`pyspark.RDD` + An RDD of MatrixEntry inputs or + (int, int, float) tuples. + numRows : int, optional + Number of rows in the matrix. A non-positive + value means unknown, at which point the number + of rows will be determined by the max row + index plus one. + numCols : int, optional + Number of columns in the matrix. A non-positive + value means unknown, at which point the number + of columns will be determined by the max row + index plus one. """ def __init__(self, entries, numRows=0, numCols=0): """ @@ -778,6 +923,8 @@ def __init__(self, entries, numRows=0, numCols=0): object, in which case we can wrap it directly. This assists in clean matrix conversions. + Examples + -------- >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), ... MatrixEntry(6, 4, 2.1)]) >>> mat = CoordinateMatrix(entries) @@ -817,6 +964,8 @@ def entries(self): Entries of the CoordinateMatrix stored as an RDD of MatrixEntries. + Examples + -------- >>> mat = CoordinateMatrix(sc.parallelize([MatrixEntry(0, 0, 1.2), ... MatrixEntry(6, 4, 2.1)])) >>> entries = mat.entries @@ -835,6 +984,8 @@ def numRows(self): """ Get or compute the number of rows. + Examples + -------- >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), ... MatrixEntry(1, 0, 2), ... MatrixEntry(2, 1, 3.7)]) @@ -853,6 +1004,8 @@ def numCols(self): """ Get or compute the number of cols. + Examples + -------- >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), ... MatrixEntry(1, 0, 2), ... MatrixEntry(2, 1, 3.7)]) @@ -867,11 +1020,14 @@ def numCols(self): """ return self._java_matrix_wrapper.call("numCols") - @since('2.0.0') def transpose(self): """ Transpose this CoordinateMatrix. + .. versionadded:: 2.0.0 + + Examples + -------- >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), ... MatrixEntry(1, 0, 2), ... 
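A short illustrative sketch of building a CoordinateMatrix from entries and converting it, following the docstrings above (assumes an active SparkContext `sc`):

    from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

    entries = sc.parallelize([MatrixEntry(0, 0, 1.2), MatrixEntry(6, 4, 2.1)])
    mat = CoordinateMatrix(entries)
    print(mat.numRows(), mat.numCols())      # 7 5 (max index + 1 when not supplied)
    irm = mat.toIndexedRowMatrix()           # row-oriented view with explicit indices
    blk = mat.toBlockMatrix(rowsPerBlock=4, colsPerBlock=4)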
MatrixEntry(2, 1, 3.7)]) @@ -891,6 +1047,8 @@ def toRowMatrix(self): """ Convert this matrix to a RowMatrix. + Examples + -------- >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), ... MatrixEntry(6, 4, 2.1)]) >>> mat = CoordinateMatrix(entries).toRowMatrix() @@ -915,6 +1073,8 @@ def toIndexedRowMatrix(self): """ Convert this matrix to an IndexedRowMatrix. + Examples + -------- >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), ... MatrixEntry(6, 4, 2.1)]) >>> mat = CoordinateMatrix(entries).toIndexedRowMatrix() @@ -938,13 +1098,19 @@ def toBlockMatrix(self, rowsPerBlock=1024, colsPerBlock=1024): """ Convert this matrix to a BlockMatrix. - :param rowsPerBlock: Number of rows that make up each block. - The blocks forming the final rows are not - required to have the given number of rows. - :param colsPerBlock: Number of columns that make up each block. - The blocks forming the final columns are not - required to have the given number of columns. - + Parameters + ---------- + rowsPerBlock : int, optional + Number of rows that make up each block. + The blocks forming the final rows are not + required to have the given number of rows. + colsPerBlock : int, optional + Number of columns that make up each block. + The blocks forming the final columns are not + required to have the given number of columns. + + Examples + -------- >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), ... MatrixEntry(6, 4, 2.1)]) >>> mat = CoordinateMatrix(entries).toBlockMatrix() @@ -983,26 +1149,33 @@ class BlockMatrix(DistributedMatrix): """ Represents a distributed matrix in blocks of local matrices. - :param blocks: An RDD of sub-matrix blocks - ((blockRowIndex, blockColIndex), sub-matrix) that - form this distributed matrix. If multiple blocks - with the same index exist, the results for - operations like add and multiply will be - unpredictable. - :param rowsPerBlock: Number of rows that make up each block. - The blocks forming the final rows are not - required to have the given number of rows. - :param colsPerBlock: Number of columns that make up each block. - The blocks forming the final columns are not - required to have the given number of columns. - :param numRows: Number of rows of this matrix. If the supplied - value is less than or equal to zero, the number - of rows will be calculated when `numRows` is - invoked. - :param numCols: Number of columns of this matrix. If the supplied - value is less than or equal to zero, the number - of columns will be calculated when `numCols` is - invoked. + Parameters + ---------- + blocks : :py:class:`pyspark.RDD` + An RDD of sub-matrix blocks + ((blockRowIndex, blockColIndex), sub-matrix) that + form this distributed matrix. If multiple blocks + with the same index exist, the results for + operations like add and multiply will be + unpredictable. + rowsPerBlock : int + Number of rows that make up each block. + The blocks forming the final rows are not + required to have the given number of rows. + colsPerBlock : int + Number of columns that make up each block. + The blocks forming the final columns are not + required to have the given number of columns. + numRows : int, optional + Number of rows of this matrix. If the supplied + value is less than or equal to zero, the number + of rows will be calculated when `numRows` is + invoked. + numCols : int, optional + Number of columns of this matrix. If the supplied + value is less than or equal to zero, the number + of columns will be calculated when `numCols` is + invoked. 
""" def __init__(self, blocks, rowsPerBlock, colsPerBlock, numRows=0, numCols=0): """ @@ -1015,6 +1188,8 @@ def __init__(self, blocks, rowsPerBlock, colsPerBlock, numRows=0, numCols=0): object, in which case we can wrap it directly. This assists in clean matrix conversions. + Examples + -------- >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) >>> mat = BlockMatrix(blocks, 3, 2) @@ -1058,6 +1233,8 @@ def blocks(self): ((blockRowIndex, blockColIndex), sub-matrix) that form this distributed matrix. + Examples + -------- >>> mat = BlockMatrix( ... sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]), 3, 2) @@ -1079,6 +1256,8 @@ def rowsPerBlock(self): """ Number of rows that make up each block. + Examples + -------- >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) >>> mat = BlockMatrix(blocks, 3, 2) @@ -1092,6 +1271,8 @@ def colsPerBlock(self): """ Number of columns that make up each block. + Examples + -------- >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) >>> mat = BlockMatrix(blocks, 3, 2) @@ -1105,6 +1286,8 @@ def numRowBlocks(self): """ Number of rows of blocks in the BlockMatrix. + Examples + -------- >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) >>> mat = BlockMatrix(blocks, 3, 2) @@ -1118,6 +1301,8 @@ def numColBlocks(self): """ Number of columns of blocks in the BlockMatrix. + Examples + -------- >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) >>> mat = BlockMatrix(blocks, 3, 2) @@ -1130,6 +1315,8 @@ def numRows(self): """ Get or compute the number of rows. + Examples + -------- >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) @@ -1147,6 +1334,8 @@ def numCols(self): """ Get or compute the number of cols. + Examples + -------- >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) @@ -1197,6 +1386,8 @@ def add(self, other): two dense sub matrix blocks are added, the output block will also be a DenseMatrix. + Examples + -------- >>> dm1 = Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6]) >>> dm2 = Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]) >>> sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 1, 2], [7, 11, 12]) @@ -1220,7 +1411,6 @@ def add(self, other): java_block_matrix = self._java_matrix_wrapper.call("add", other_java_block_matrix) return BlockMatrix(java_block_matrix, self.rowsPerBlock, self.colsPerBlock) - @since('2.0.0') def subtract(self, other): """ Subtracts the given block matrix `other` from this block matrix: @@ -1232,6 +1422,10 @@ def subtract(self, other): If two dense sub matrix blocks are subtracted, the output block will also be a DenseMatrix. + .. 
versionadded:: 2.0.0 + + Examples + -------- >>> dm1 = Matrices.dense(3, 2, [3, 1, 5, 4, 6, 2]) >>> dm2 = Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]) >>> sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 1, 2], [1, 2, 3]) @@ -1265,6 +1459,8 @@ def multiply(self, other): This may cause some performance issues until support for multiplying two sparse matrices is added. + Examples + -------- >>> dm1 = Matrices.dense(2, 3, [1, 2, 3, 4, 5, 6]) >>> dm2 = Matrices.dense(2, 3, [7, 8, 9, 10, 11, 12]) >>> dm3 = Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6]) @@ -1290,12 +1486,15 @@ def multiply(self, other): java_block_matrix = self._java_matrix_wrapper.call("multiply", other_java_block_matrix) return BlockMatrix(java_block_matrix, self.rowsPerBlock, self.colsPerBlock) - @since('2.0.0') def transpose(self): """ Transpose this BlockMatrix. Returns a new BlockMatrix instance sharing the same underlying data. Is a lazy operation. + .. versionadded:: 2.0.0 + + Examples + -------- >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) >>> mat = BlockMatrix(blocks, 3, 2) @@ -1311,6 +1510,8 @@ def toLocalMatrix(self): """ Collect the distributed matrix on the driver as a DenseMatrix. + Examples + -------- >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) >>> mat = BlockMatrix(blocks, 3, 2).toLocalMatrix() @@ -1333,6 +1534,8 @@ def toIndexedRowMatrix(self): """ Convert this matrix to an IndexedRowMatrix. + Examples + -------- >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) >>> mat = BlockMatrix(blocks, 3, 2).toIndexedRowMatrix() @@ -1356,6 +1559,8 @@ def toCoordinateMatrix(self): """ Convert this matrix to a CoordinateMatrix. + Examples + -------- >>> blocks = sc.parallelize([((0, 0), Matrices.dense(1, 2, [1, 2])), ... ((1, 0), Matrices.dense(1, 2, [7, 8]))]) >>> mat = BlockMatrix(blocks, 1, 2).toCoordinateMatrix() diff --git a/python/pyspark/mllib/linalg/distributed.pyi b/python/pyspark/mllib/linalg/distributed.pyi index 238c4ea32e4e8..7ec2d60c5a947 100644 --- a/python/pyspark/mllib/linalg/distributed.pyi +++ b/python/pyspark/mllib/linalg/distributed.pyi @@ -22,6 +22,7 @@ from pyspark.storagelevel import StorageLevel from pyspark.mllib.common import JavaModelWrapper from pyspark.mllib.linalg import Vector, Matrix, QRDecomposition from pyspark.mllib.stat import MultivariateStatisticalSummary +import pyspark.sql.dataframe from numpy import ndarray # noqa: F401 VectorLike = Union[Vector, Sequence[Union[float, int]]] @@ -35,7 +36,10 @@ class DistributedMatrix: class RowMatrix(DistributedMatrix): def __init__( - self, rows: RDD[Vector], numRows: int = ..., numCols: int = ... + self, + rows: Union[RDD[Vector], pyspark.sql.dataframe.DataFrame], + numRows: int = ..., + numCols: int = ..., ) -> None: ... @property def rows(self) -> RDD[Vector]: ... diff --git a/python/pyspark/mllib/random.py b/python/pyspark/mllib/random.py index 6106c58584882..a33dfe26fbad9 100644 --- a/python/pyspark/mllib/random.py +++ b/python/pyspark/mllib/random.py @@ -22,7 +22,6 @@ import sys from functools import wraps -from pyspark import since from pyspark.mllib.common import callMLlibFunc @@ -46,7 +45,6 @@ class RandomRDDs(object): """ @staticmethod - @since("1.1.0") def uniformRDD(sc, size, numPartitions=None, seed=None): """ Generates an RDD comprised of i.i.d. 
samples from the @@ -56,12 +54,26 @@ def uniformRDD(sc, size, numPartitions=None, seed=None): to U(a, b), use ``RandomRDDs.uniformRDD(sc, n, p, seed).map(lambda v: a + (b - a) * v)`` - :param sc: SparkContext used to create the RDD. - :param size: Size of the RDD. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). - :param seed: Random seed (default: a random long integer). - :return: RDD of float comprised of i.i.d. samples ~ `U(0.0, 1.0)`. - + .. versionadded:: 1.1.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + used to create the RDD. + size : int + Size of the RDD. + numPartitions : int, optional + Number of partitions in the RDD (default: `sc.defaultParallelism`). + seed : int, optional + Random seed (default: a random long integer). + + Returns + ------- + :py:class:`pyspark.RDD` + RDD of float comprised of i.i.d. samples ~ `U(0.0, 1.0)`. + + Examples + -------- >>> x = RandomRDDs.uniformRDD(sc, 100).collect() >>> len(x) 100 @@ -76,7 +88,6 @@ def uniformRDD(sc, size, numPartitions=None, seed=None): return callMLlibFunc("uniformRDD", sc._jsc, size, numPartitions, seed) @staticmethod - @since("1.1.0") def normalRDD(sc, size, numPartitions=None, seed=None): """ Generates an RDD comprised of i.i.d. samples from the standard normal @@ -86,12 +97,26 @@ def normalRDD(sc, size, numPartitions=None, seed=None): to some other normal N(mean, sigma^2), use ``RandomRDDs.normal(sc, n, p, seed).map(lambda v: mean + sigma * v)`` - :param sc: SparkContext used to create the RDD. - :param size: Size of the RDD. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). - :param seed: Random seed (default: a random long integer). - :return: RDD of float comprised of i.i.d. samples ~ N(0.0, 1.0). - + .. versionadded:: 1.1.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + used to create the RDD. + size : int + Size of the RDD. + numPartitions : int, optional + Number of partitions in the RDD (default: `sc.defaultParallelism`). + seed : int, optional + Random seed (default: a random long integer). + + Returns + ------- + :py:class:`pyspark.RDD` + RDD of float comprised of i.i.d. samples ~ N(0.0, 1.0). + + Examples + -------- >>> x = RandomRDDs.normalRDD(sc, 1000, seed=1) >>> stats = x.stats() >>> stats.count() @@ -104,20 +129,34 @@ def normalRDD(sc, size, numPartitions=None, seed=None): return callMLlibFunc("normalRDD", sc._jsc, size, numPartitions, seed) @staticmethod - @since("1.3.0") def logNormalRDD(sc, mean, std, size, numPartitions=None, seed=None): """ Generates an RDD comprised of i.i.d. samples from the log normal distribution with the input mean and standard distribution. - :param sc: SparkContext used to create the RDD. - :param mean: mean for the log Normal distribution - :param std: std for the log Normal distribution - :param size: Size of the RDD. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). - :param seed: Random seed (default: a random long integer). - :return: RDD of float comprised of i.i.d. samples ~ log N(mean, std). - + .. versionadded:: 1.3.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + used to create the RDD. + mean : float + mean for the log Normal distribution + std : float + std for the log Normal distribution + size : int + Size of the RDD. + numPartitions : int, optional + Number of partitions in the RDD (default: `sc.defaultParallelism`). 
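For the RandomRDDs generators above, a compact illustrative sketch (assumes an active SparkContext `sc`; the affine map follows the transformation note in the docstrings):

    from pyspark.mllib.random import RandomRDDs

    u = RandomRDDs.uniformRDD(sc, 1000, seed=1)   # i.i.d. samples ~ U(0.0, 1.0)
    n = RandomRDDs.normalRDD(sc, 1000, seed=1)    # i.i.d. samples ~ N(0.0, 1.0)
    shifted = n.map(lambda v: 3.0 + 2.0 * v)      # ~ N(3.0, 4.0) via mean + sigma * v
    print(u.mean(), n.stats())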
+ seed : int, optional + Random seed (default: a random long integer). + + Returns + ------- + RDD of float comprised of i.i.d. samples ~ log N(mean, std). + + Examples + -------- >>> from math import sqrt, exp >>> mean = 0.0 >>> std = 1.0 @@ -137,19 +176,33 @@ def logNormalRDD(sc, mean, std, size, numPartitions=None, seed=None): size, numPartitions, seed) @staticmethod - @since("1.1.0") def poissonRDD(sc, mean, size, numPartitions=None, seed=None): """ Generates an RDD comprised of i.i.d. samples from the Poisson distribution with the input mean. - :param sc: SparkContext used to create the RDD. - :param mean: Mean, or lambda, for the Poisson distribution. - :param size: Size of the RDD. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). - :param seed: Random seed (default: a random long integer). - :return: RDD of float comprised of i.i.d. samples ~ Pois(mean). - + .. versionadded:: 1.1.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + SparkContext used to create the RDD. + mean : float + Mean, or lambda, for the Poisson distribution. + size : int + Size of the RDD. + numPartitions : int, optional + Number of partitions in the RDD (default: `sc.defaultParallelism`). + seed : int, optional + Random seed (default: a random long integer). + + Returns + ------- + :py:class:`pyspark.RDD` + RDD of float comprised of i.i.d. samples ~ Pois(mean). + + Examples + -------- >>> mean = 100.0 >>> x = RandomRDDs.poissonRDD(sc, mean, 1000, seed=2) >>> stats = x.stats() @@ -164,19 +217,33 @@ def poissonRDD(sc, mean, size, numPartitions=None, seed=None): return callMLlibFunc("poissonRDD", sc._jsc, float(mean), size, numPartitions, seed) @staticmethod - @since("1.3.0") def exponentialRDD(sc, mean, size, numPartitions=None, seed=None): """ Generates an RDD comprised of i.i.d. samples from the Exponential distribution with the input mean. - :param sc: SparkContext used to create the RDD. - :param mean: Mean, or 1 / lambda, for the Exponential distribution. - :param size: Size of the RDD. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). - :param seed: Random seed (default: a random long integer). - :return: RDD of float comprised of i.i.d. samples ~ Exp(mean). - + .. versionadded:: 1.3.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + SparkContext used to create the RDD. + mean : float + Mean, or 1 / lambda, for the Exponential distribution. + size : int + Size of the RDD. + numPartitions : int, optional + Number of partitions in the RDD (default: `sc.defaultParallelism`). + seed : int, optional + Random seed (default: a random long integer). + + Returns + ------- + :py:class:`pyspark.RDD` + RDD of float comprised of i.i.d. samples ~ Exp(mean). + + Examples + -------- >>> mean = 2.0 >>> x = RandomRDDs.exponentialRDD(sc, mean, 1000, seed=2) >>> stats = x.stats() @@ -191,20 +258,35 @@ def exponentialRDD(sc, mean, size, numPartitions=None, seed=None): return callMLlibFunc("exponentialRDD", sc._jsc, float(mean), size, numPartitions, seed) @staticmethod - @since("1.3.0") def gammaRDD(sc, shape, scale, size, numPartitions=None, seed=None): """ Generates an RDD comprised of i.i.d. samples from the Gamma distribution with the input shape and scale. - :param sc: SparkContext used to create the RDD. - :param shape: shape (> 0) parameter for the Gamma distribution - :param scale: scale (> 0) parameter for the Gamma distribution - :param size: Size of the RDD. 
- :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). - :param seed: Random seed (default: a random long integer). - :return: RDD of float comprised of i.i.d. samples ~ Gamma(shape, scale). - + .. versionadded:: 1.3.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + SparkContext used to create the RDD. + shape : float + shape (> 0) parameter for the Gamma distribution + scale : float + scale (> 0) parameter for the Gamma distribution + size : int + Size of the RDD. + numPartitions : int, optional + Number of partitions in the RDD (default: `sc.defaultParallelism`). + seed : int, optional + Random seed (default: a random long integer). + + Returns + ------- + :py:class:`pyspark.RDD` + RDD of float comprised of i.i.d. samples ~ Gamma(shape, scale). + + Examples + -------- >>> from math import sqrt >>> shape = 1.0 >>> scale = 2.0 @@ -224,19 +306,33 @@ def gammaRDD(sc, shape, scale, size, numPartitions=None, seed=None): @staticmethod @toArray - @since("1.1.0") def uniformVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None): """ Generates an RDD comprised of vectors containing i.i.d. samples drawn from the uniform distribution U(0.0, 1.0). - :param sc: SparkContext used to create the RDD. - :param numRows: Number of Vectors in the RDD. - :param numCols: Number of elements in each Vector. - :param numPartitions: Number of partitions in the RDD. - :param seed: Seed for the RNG that generates the seed for the generator in each partition. - :return: RDD of Vector with vectors containing i.i.d samples ~ `U(0.0, 1.0)`. - + .. versionadded:: 1.1.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + SparkContext used to create the RDD. + numRows : int + Number of Vectors in the RDD. + numCols : int + Number of elements in each Vector. + numPartitions : int, optional + Number of partitions in the RDD. + seed : int, optional + Seed for the RNG that generates the seed for the generator in each partition. + + Returns + ------- + :py:class:`pyspark.RDD` + RDD of Vector with vectors containing i.i.d samples ~ `U(0.0, 1.0)`. + + Examples + -------- >>> import numpy as np >>> mat = np.matrix(RandomRDDs.uniformVectorRDD(sc, 10, 10).collect()) >>> mat.shape @@ -250,19 +346,33 @@ def uniformVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None): @staticmethod @toArray - @since("1.1.0") def normalVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None): """ Generates an RDD comprised of vectors containing i.i.d. samples drawn from the standard normal distribution. - :param sc: SparkContext used to create the RDD. - :param numRows: Number of Vectors in the RDD. - :param numCols: Number of elements in each Vector. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). - :param seed: Random seed (default: a random long integer). - :return: RDD of Vector with vectors containing i.i.d. samples ~ `N(0.0, 1.0)`. - + .. versionadded:: 1.1.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + SparkContext used to create the RDD. + numRows : int + Number of Vectors in the RDD. + numCols : int + Number of elements in each Vector. + numPartitions : int, optional + Number of partitions in the RDD (default: `sc.defaultParallelism`). + seed : int, optional + Random seed (default: a random long integer). + + Returns + ------- + :py:class:`pyspark.RDD` + RDD of Vector with vectors containing i.i.d. samples ~ `N(0.0, 1.0)`. 
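
To tie the `RandomRDDs` factory methods documented above together, a minimal usage sketch in plain script form (it assumes a live SparkContext `sc`, exactly as the doctests do; sizes, seeds, and distribution parameters are arbitrary):

    import numpy as np
    from pyspark.mllib.random import RandomRDDs

    # Scalar generator: U(0.0, 1.0) samples shifted to U(a, b), as the docstring suggests.
    a, b = -1.0, 3.0
    u = RandomRDDs.uniformRDD(sc, 1000, seed=42).map(lambda v: a + (b - a) * v)
    print(u.stats())                     # count=1000, mean ~ 1.0, stdev ~ (b - a) / sqrt(12)

    # Scalar Gamma generator: expected mean is shape * scale.
    g = RandomRDDs.gammaRDD(sc, shape=2.0, scale=3.0, size=10000, seed=7)
    print(g.mean())                      # roughly 6.0

    # Vector generator: a 100 x 10 matrix of i.i.d. N(0.0, 1.0) entries, one Vector per row.
    mat = np.array(RandomRDDs.normalVectorRDD(sc, numRows=100, numCols=10, seed=7).collect())
    print(mat.shape, round(mat.mean(), 1), round(mat.std(), 1))   # (100, 10), ~0.0, ~1.0
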
+ + Examples + -------- >>> import numpy as np >>> mat = np.matrix(RandomRDDs.normalVectorRDD(sc, 100, 100, seed=1).collect()) >>> mat.shape @@ -276,21 +386,37 @@ def normalVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None): @staticmethod @toArray - @since("1.3.0") def logNormalVectorRDD(sc, mean, std, numRows, numCols, numPartitions=None, seed=None): """ Generates an RDD comprised of vectors containing i.i.d. samples drawn from the log normal distribution. - :param sc: SparkContext used to create the RDD. - :param mean: Mean of the log normal distribution - :param std: Standard Deviation of the log normal distribution - :param numRows: Number of Vectors in the RDD. - :param numCols: Number of elements in each Vector. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). - :param seed: Random seed (default: a random long integer). - :return: RDD of Vector with vectors containing i.i.d. samples ~ log `N(mean, std)`. - + .. versionadded:: 1.3.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + SparkContext used to create the RDD. + mean : float + Mean of the log normal distribution + std : float + Standard Deviation of the log normal distribution + numRows : int + Number of Vectors in the RDD. + numCols : int + Number of elements in each Vector. + numPartitions : int, optional + Number of partitions in the RDD (default: `sc.defaultParallelism`). + seed : int, optional + Random seed (default: a random long integer). + + Returns + ------- + :py:class:`pyspark.RDD` + RDD of Vector with vectors containing i.i.d. samples ~ log `N(mean, std)`. + + Examples + -------- >>> import numpy as np >>> from math import sqrt, exp >>> mean = 0.0 @@ -311,20 +437,35 @@ def logNormalVectorRDD(sc, mean, std, numRows, numCols, numPartitions=None, seed @staticmethod @toArray - @since("1.1.0") def poissonVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None): """ Generates an RDD comprised of vectors containing i.i.d. samples drawn from the Poisson distribution with the input mean. - :param sc: SparkContext used to create the RDD. - :param mean: Mean, or lambda, for the Poisson distribution. - :param numRows: Number of Vectors in the RDD. - :param numCols: Number of elements in each Vector. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`) - :param seed: Random seed (default: a random long integer). - :return: RDD of Vector with vectors containing i.i.d. samples ~ Pois(mean). - + .. versionadded:: 1.1.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + SparkContext used to create the RDD. + mean : float + Mean, or lambda, for the Poisson distribution. + numRows : float + Number of Vectors in the RDD. + numCols : int + Number of elements in each Vector. + numPartitions : int, optional + Number of partitions in the RDD (default: `sc.defaultParallelism`) + seed : int, optional + Random seed (default: a random long integer). + + Returns + ------- + :py:class:`pyspark.RDD` + RDD of Vector with vectors containing i.i.d. samples ~ Pois(mean). + + Examples + -------- >>> import numpy as np >>> mean = 100.0 >>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1) @@ -342,20 +483,35 @@ def poissonVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None): @staticmethod @toArray - @since("1.3.0") def exponentialVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None): """ Generates an RDD comprised of vectors containing i.i.d. 
samples drawn from the Exponential distribution with the input mean. - :param sc: SparkContext used to create the RDD. - :param mean: Mean, or 1 / lambda, for the Exponential distribution. - :param numRows: Number of Vectors in the RDD. - :param numCols: Number of elements in each Vector. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`) - :param seed: Random seed (default: a random long integer). - :return: RDD of Vector with vectors containing i.i.d. samples ~ Exp(mean). - + .. versionadded:: 1.3.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + SparkContext used to create the RDD. + mean : float + Mean, or 1 / lambda, for the Exponential distribution. + numRows : int + Number of Vectors in the RDD. + numCols : int + Number of elements in each Vector. + numPartitions : int, optional + Number of partitions in the RDD (default: `sc.defaultParallelism`) + seed : int, optional + Random seed (default: a random long integer). + + Returns + ------- + :py:class:`pyspark.RDD` + RDD of Vector with vectors containing i.i.d. samples ~ Exp(mean). + + Examples + -------- >>> import numpy as np >>> mean = 0.5 >>> rdd = RandomRDDs.exponentialVectorRDD(sc, mean, 100, 100, seed=1) @@ -373,21 +529,37 @@ def exponentialVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=No @staticmethod @toArray - @since("1.3.0") def gammaVectorRDD(sc, shape, scale, numRows, numCols, numPartitions=None, seed=None): """ Generates an RDD comprised of vectors containing i.i.d. samples drawn from the Gamma distribution. - :param sc: SparkContext used to create the RDD. - :param shape: Shape (> 0) of the Gamma distribution - :param scale: Scale (> 0) of the Gamma distribution - :param numRows: Number of Vectors in the RDD. - :param numCols: Number of elements in each Vector. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). - :param seed: Random seed (default: a random long integer). - :return: RDD of Vector with vectors containing i.i.d. samples ~ Gamma(shape, scale). - + .. versionadded:: 1.3.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + SparkContext used to create the RDD. + shape : float + Shape (> 0) of the Gamma distribution + scale : float + Scale (> 0) of the Gamma distribution + numRows : int + Number of Vectors in the RDD. + numCols : int + Number of elements in each Vector. + numPartitions : int, optional + Number of partitions in the RDD (default: `sc.defaultParallelism`). + seed : int, optional, + Random seed (default: a random long integer). + + Returns + ------- + :py:class:`pyspark.RDD` + RDD of Vector with vectors containing i.i.d. samples ~ Gamma(shape, scale). + + Examples + -------- >>> import numpy as np >>> from math import sqrt >>> shape = 1.0 diff --git a/python/pyspark/mllib/recommendation.py b/python/pyspark/mllib/recommendation.py index 3dd7cb200c280..7a5fb6e6eea9e 100644 --- a/python/pyspark/mllib/recommendation.py +++ b/python/pyspark/mllib/recommendation.py @@ -32,13 +32,15 @@ class Rating(namedtuple("Rating", ["user", "product", "rating"])): """ Represents a (user, product, rating) tuple. + .. versionadded:: 1.2.0 + + Examples + -------- >>> r = Rating(1, 2, 5.0) >>> (r.user, r.product, r.rating) (1, 2, 5.0) >>> (r[0], r[1], r[2]) (1, 2, 5.0) - - .. 
versionadded:: 1.2.0 """ def __reduce__(self): @@ -51,6 +53,10 @@ class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader): """A matrix factorisation model trained by regularized alternating least-squares. + .. versionadded:: 0.9.0 + + Examples + -------- >>> r1 = (1, 1, 1.0) >>> r2 = (1, 2, 2.0) >>> r3 = (2, 1, 2.0) @@ -126,8 +132,6 @@ class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader): ... rmtree(path) ... except OSError: ... pass - - .. versionadded:: 0.9.0 """ @since("0.9.0") def predict(self, user, product): @@ -237,7 +241,6 @@ def _prepare(cls, ratings): return ratings @classmethod - @since("0.9.0") def train(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, nonnegative=False, seed=None): """ @@ -247,35 +250,38 @@ def train(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, nonnegative features). To solve for these features, ALS is run iteratively with a configurable level of parallelism. - :param ratings: - RDD of `Rating` or (userID, productID, rating) tuple. - :param rank: - Number of features to use (also referred to as the number of latent factors). - :param iterations: - Number of iterations of ALS. - (default: 5) - :param lambda_: - Regularization parameter. - (default: 0.01) - :param blocks: - Number of blocks used to parallelize the computation. A value - of -1 will use an auto-configured number of blocks. - (default: -1) - :param nonnegative: - A value of True will solve least-squares with nonnegativity - constraints. - (default: False) - :param seed: - Random seed for initial matrix factorization model. A value - of None will use system time as the seed. - (default: None) + .. versionadded:: 0.9.0 + + Parameters + ---------- + ratings : :py:class:`pyspark.RDD` + RDD of `Rating` or (userID, productID, rating) tuple. + rank : int + Number of features to use (also referred to as the number of latent factors). + iterations : int, optional + Number of iterations of ALS. + (default: 5) + lambda\\_ : float, optional + Regularization parameter. + (default: 0.01) + blocks : int, optional + Number of blocks used to parallelize the computation. A value + of -1 will use an auto-configured number of blocks. + (default: -1) + nonnegative : bool, optional + A value of True will solve least-squares with nonnegativity + constraints. + (default: False) + seed : bool, optional + Random seed for initial matrix factorization model. A value + of None will use system time as the seed. + (default: None) """ model = callMLlibFunc("trainALSModel", cls._prepare(ratings), rank, iterations, lambda_, blocks, nonnegative, seed) return MatrixFactorizationModel(model) @classmethod - @since("0.9.0") def trainImplicit(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, alpha=0.01, nonnegative=False, seed=None): """ @@ -285,31 +291,35 @@ def trainImplicit(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, alp given rank (number of features). To solve for these features, ALS is run iteratively with a configurable level of parallelism. - :param ratings: - RDD of `Rating` or (userID, productID, rating) tuple. - :param rank: - Number of features to use (also referred to as the number of latent factors). - :param iterations: - Number of iterations of ALS. - (default: 5) - :param lambda_: - Regularization parameter. - (default: 0.01) - :param blocks: - Number of blocks used to parallelize the computation. A value - of -1 will use an auto-configured number of blocks. - (default: -1) - :param alpha: - A constant used in computing confidence. 
- (default: 0.01) - :param nonnegative: - A value of True will solve least-squares with nonnegativity - constraints. - (default: False) - :param seed: - Random seed for initial matrix factorization model. A value - of None will use system time as the seed. - (default: None) + .. versionadded:: 0.9.0 + + Parameters + ---------- + ratings : :py:class:`pyspark.RDD` + RDD of `Rating` or (userID, productID, rating) tuple. + rank : int + Number of features to use (also referred to as the number of latent factors). + iterations : int, optional + Number of iterations of ALS. + (default: 5) + lambda\\_ : float, optional + Regularization parameter. + (default: 0.01) + blocks : int, optional + Number of blocks used to parallelize the computation. A value + of -1 will use an auto-configured number of blocks. + (default: -1) + alpha : float, optional + A constant used in computing confidence. + (default: 0.01) + nonnegative : bool, optional + A value of True will solve least-squares with nonnegativity + constraints. + (default: False) + seed : int, optional + Random seed for initial matrix factorization model. A value + of None will use system time as the seed. + (default: None) """ model = callMLlibFunc("trainImplicitALSModel", cls._prepare(ratings), rank, iterations, lambda_, blocks, alpha, nonnegative, seed) diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index 77bca86ac1b27..e549b0ac43721 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -39,15 +39,19 @@ class LabeledPoint(object): """ Class that represents the features and labels of a data point. - :param label: - Label for this data point. - :param features: - Vector of features for this point (NumPy array, list, - pyspark.mllib.linalg.SparseVector, or scipy.sparse column matrix). - - .. note:: 'label' and 'features' are accessible as class attributes. - .. versionadded:: 1.0.0 + + Parameters + ---------- + label : int + Label for this data point. + features : :py:class:`pyspark.mllib.linalg.Vector` or convertible + Vector of features for this point (NumPy array, list, + pyspark.mllib.linalg.SparseVector, or scipy.sparse column matrix). + + Notes + ----- + 'label' and 'features' are accessible as class attributes. """ def __init__(self, label, features): @@ -69,12 +73,14 @@ class LinearModel(object): """ A linear model that has a vector of coefficients and an intercept. - :param weights: - Weights computed for every feature. - :param intercept: - Intercept computed for this model. - .. versionadded:: 0.9.0 + + Parameters + ---------- + weights : :py:class:`pyspark.mllib.linalg.Vector` + Weights computed for every feature. + intercept : float + Intercept computed for this model. """ def __init__(self, weights, intercept): @@ -102,14 +108,16 @@ class LinearRegressionModelBase(LinearModel): """A linear regression model. + .. versionadded:: 0.9.0 + + Examples + -------- >>> from pyspark.mllib.linalg import SparseVector >>> lrmb = LinearRegressionModelBase(np.array([1.0, 2.0]), 0.1) >>> abs(lrmb.predict(np.array([-1.03, 7.777])) - 14.624) < 1e-6 True >>> abs(lrmb.predict(SparseVector(2, {0: -1.03, 1: 7.777})) - 14.624) < 1e-6 True - - .. versionadded:: 0.9.0 """ @since("0.9.0") @@ -129,6 +137,10 @@ class LinearRegressionModel(LinearRegressionModelBase): """A linear regression model derived from a least-squares fit. + .. 
versionadded:: 0.9.0 + + Examples + -------- >>> from pyspark.mllib.linalg import SparseVector >>> from pyspark.mllib.regression import LabeledPoint >>> data = [ @@ -181,8 +193,6 @@ class LinearRegressionModel(LinearRegressionModelBase): True >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5 True - - .. versionadded:: 0.9.0 """ @since("1.4.0") def save(self, sc, path): @@ -224,11 +234,13 @@ def _regression_train_wrapper(train_func, modelClass, data, initial_weights): class LinearRegressionWithSGD(object): """ + Train a linear regression model with no regularization using Stochastic Gradient Descent. + .. versionadded:: 0.9.0 - .. note:: Deprecated in 2.0.0. Use ml.regression.LinearRegression. + .. deprecated:: 2.0.0 + Use :py:class:`pyspark.ml.regression.LinearRegression`. """ @classmethod - @since("0.9.0") def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, initialWeights=None, regParam=0.0, regType=None, intercept=False, validateData=True, convergenceTol=0.001): @@ -244,42 +256,47 @@ def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, corresponding right hand side label y. See also the documentation for the precise formulation. - :param data: - The training data, an RDD of LabeledPoint. - :param iterations: - The number of iterations. - (default: 100) - :param step: - The step parameter used in SGD. - (default: 1.0) - :param miniBatchFraction: - Fraction of data to be used for each SGD iteration. - (default: 1.0) - :param initialWeights: - The initial weights. - (default: None) - :param regParam: - The regularizer parameter. - (default: 0.0) - :param regType: - The type of regularizer used for training our model. - Supported values: + .. versionadded:: 0.9.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + The training data, an RDD of LabeledPoint. + iterations : int, optional + The number of iterations. + (default: 100) + step : float, optional + The step parameter used in SGD. + (default: 1.0) + miniBatchFraction : float, optional + Fraction of data to be used for each SGD iteration. + (default: 1.0) + initialWeights : :py:class:`pyspark.mllib.linalg.Vector` or convertible, optional + The initial weights. + (default: None) + regParam : float, optional + The regularizer parameter. + (default: 0.0) + regType : str, optional + The type of regularizer used for training our model. + Supported values: - "l1" for using L1 regularization - "l2" for using L2 regularization - None for no regularization (default) - :param intercept: - Boolean parameter which indicates the use or not of the - augmented representation for training data (i.e., whether bias - features are activated or not). - (default: False) - :param validateData: - Boolean parameter which indicates if the algorithm should - validate data before training. - (default: True) - :param convergenceTol: - A condition which decides iteration termination. - (default: 0.001) + + intercept : bool, optional + Boolean parameter which indicates the use or not of the + augmented representation for training data (i.e., whether bias + features are activated or not). + (default: False) + validateData : bool, optional + Boolean parameter which indicates if the algorithm should + validate data before training. + (default: True) + convergenceTol : float, optional + A condition which decides iteration termination. + (default: 0.001) """ warnings.warn( "Deprecated in 2.0.0. 
Use ml.regression.LinearRegression.", DeprecationWarning) @@ -299,6 +316,10 @@ class LassoModel(LinearRegressionModelBase): """A linear regression model derived from a least-squares fit with an l_1 penalty term. + .. versionadded:: 0.9.0 + + Examples + -------- >>> from pyspark.mllib.linalg import SparseVector >>> from pyspark.mllib.regression import LabeledPoint >>> data = [ @@ -351,8 +372,6 @@ class LassoModel(LinearRegressionModelBase): True >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5 True - - .. versionadded:: 0.9.0 """ @since("1.4.0") def save(self, sc, path): @@ -375,12 +394,14 @@ def load(cls, sc, path): class LassoWithSGD(object): """ + Train a regression model with L1-regularization using Stochastic Gradient Descent. + .. versionadded:: 0.9.0 - .. note:: Deprecated in 2.0.0. Use ml.regression.LinearRegression with elasticNetParam = 1.0. - Note the default regParam is 0.01 for LassoWithSGD, but is 0.0 for LinearRegression. + .. deprecated:: 2.0.0 + Use :py:class:`pyspark.ml.regression.LinearRegression` with elasticNetParam = 1.0. + Note the default regParam is 0.01 for LassoWithSGD, but is 0.0 for LinearRegression. """ @classmethod - @since("0.9.0") def train(cls, data, iterations=100, step=1.0, regParam=0.01, miniBatchFraction=1.0, initialWeights=None, intercept=False, validateData=True, convergenceTol=0.001): @@ -395,35 +416,39 @@ def train(cls, data, iterations=100, step=1.0, regParam=0.01, of rows of A, each with its corresponding right hand side label y. See also the documentation for the precise formulation. - :param data: - The training data, an RDD of LabeledPoint. - :param iterations: - The number of iterations. - (default: 100) - :param step: - The step parameter used in SGD. - (default: 1.0) - :param regParam: - The regularizer parameter. - (default: 0.01) - :param miniBatchFraction: - Fraction of data to be used for each SGD iteration. - (default: 1.0) - :param initialWeights: - The initial weights. - (default: None) - :param intercept: - Boolean parameter which indicates the use or not of the - augmented representation for training data (i.e. whether bias - features are activated or not). - (default: False) - :param validateData: - Boolean parameter which indicates if the algorithm should - validate data before training. - (default: True) - :param convergenceTol: - A condition which decides iteration termination. - (default: 0.001) + .. versionadded:: 0.9.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + The training data, an RDD of LabeledPoint. + iterations : int, optional + The number of iterations. + (default: 100) + step : float, optional + The step parameter used in SGD. + (default: 1.0) + regParam : float, optional + The regularizer parameter. + (default: 0.01) + miniBatchFraction : float, optional + Fraction of data to be used for each SGD iteration. + (default: 1.0) + initialWeights : :py:class:`pyspark.mllib.linalg.Vector` or convertible, optional + The initial weights. + (default: None) + intercept : bool, optional + Boolean parameter which indicates the use or not of the + augmented representation for training data (i.e. whether bias + features are activated or not). + (default: False) + validateData : bool, optional + Boolean parameter which indicates if the algorithm should + validate data before training. + (default: True) + convergenceTol : float, optional + A condition which decides iteration termination. + (default: 0.001) """ warnings.warn( "Deprecated in 2.0.0. 
Use ml.regression.LinearRegression with elasticNetParam = 1.0. " @@ -444,6 +469,10 @@ class RidgeRegressionModel(LinearRegressionModelBase): """A linear regression model derived from a least-squares fit with an l_2 penalty term. + .. versionadded:: 0.9.0 + + Examples + -------- >>> from pyspark.mllib.linalg import SparseVector >>> from pyspark.mllib.regression import LabeledPoint >>> data = [ @@ -496,8 +525,6 @@ class RidgeRegressionModel(LinearRegressionModelBase): True >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5 True - - .. versionadded:: 0.9.0 """ @since("1.4.0") def save(self, sc, path): @@ -520,13 +547,15 @@ def load(cls, sc, path): class RidgeRegressionWithSGD(object): """ + Train a regression model with L2-regularization using Stochastic Gradient Descent. + .. versionadded:: 0.9.0 - .. note:: Deprecated in 2.0.0. Use ml.regression.LinearRegression with elasticNetParam = 0.0. - Note the default regParam is 0.01 for RidgeRegressionWithSGD, but is 0.0 for - LinearRegression. + .. deprecated:: 2.0.0 + Use :py:class:`pyspark.ml.regression.LinearRegression` with elasticNetParam = 0.0. + Note the default regParam is 0.01 for RidgeRegressionWithSGD, but is 0.0 for + LinearRegression. """ @classmethod - @since("0.9.0") def train(cls, data, iterations=100, step=1.0, regParam=0.01, miniBatchFraction=1.0, initialWeights=None, intercept=False, validateData=True, convergenceTol=0.001): @@ -541,35 +570,39 @@ def train(cls, data, iterations=100, step=1.0, regParam=0.01, of rows of A, each with its corresponding right hand side label y. See also the documentation for the precise formulation. - :param data: - The training data, an RDD of LabeledPoint. - :param iterations: - The number of iterations. - (default: 100) - :param step: - The step parameter used in SGD. - (default: 1.0) - :param regParam: - The regularizer parameter. - (default: 0.01) - :param miniBatchFraction: - Fraction of data to be used for each SGD iteration. - (default: 1.0) - :param initialWeights: - The initial weights. - (default: None) - :param intercept: - Boolean parameter which indicates the use or not of the - augmented representation for training data (i.e. whether bias - features are activated or not). - (default: False) - :param validateData: - Boolean parameter which indicates if the algorithm should - validate data before training. - (default: True) - :param convergenceTol: - A condition which decides iteration termination. - (default: 0.001) + .. versionadded:: 0.9.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + The training data, an RDD of LabeledPoint. + iterations : int, optional + The number of iterations. + (default: 100) + step : float, optional + The step parameter used in SGD. + (default: 1.0) + regParam : float, optional + The regularizer parameter. + (default: 0.01) + miniBatchFraction : float, optional + Fraction of data to be used for each SGD iteration. + (default: 1.0) + initialWeights : :py:class:`pyspark.mllib.linalg.Vector` or convertible, optional + The initial weights. + (default: None) + intercept : bool, optional + Boolean parameter which indicates the use or not of the + augmented representation for training data (i.e. whether bias + features are activated or not). + (default: False) + validateData : bool, optional + Boolean parameter which indicates if the algorithm should + validate data before training. + (default: True) + convergenceTol : float, optional + A condition which decides iteration termination. + (default: 0.001) """ warnings.warn( "Deprecated in 2.0.0. 
Use ml.regression.LinearRegression with elasticNetParam = 0.0. " @@ -589,15 +622,21 @@ class IsotonicRegressionModel(Saveable, Loader): """ Regression model for isotonic regression. - :param boundaries: - Array of boundaries for which predictions are known. Boundaries - must be sorted in increasing order. - :param predictions: - Array of predictions associated to the boundaries at the same - index. Results of isotonic regression and therefore monotone. - :param isotonic: - Indicates whether this is isotonic or antitonic. + .. versionadded:: 1.4.0 + Parameters + ---------- + boundaries : ndarray + Array of boundaries for which predictions are known. Boundaries + must be sorted in increasing order. + predictions : ndarray + Array of predictions associated to the boundaries at the same + index. Results of isotonic regression and therefore monotone. + isotonic : true + Indicates whether this is isotonic or antitonic. + + Examples + -------- >>> data = [(1, 0, 1), (2, 1, 1), (3, 2, 1), (1, 3, 1), (6, 4, 1), (17, 5, 1), (16, 6, 1)] >>> irm = IsotonicRegression.train(sc.parallelize(data)) >>> irm.predict(3) @@ -619,8 +658,6 @@ class IsotonicRegressionModel(Saveable, Loader): ... rmtree(path) ... except OSError: ... pass - - .. versionadded:: 1.4.0 """ def __init__(self, boundaries, predictions, isotonic): @@ -628,7 +665,6 @@ def __init__(self, boundaries, predictions, isotonic): self.predictions = predictions self.isotonic = isotonic - @since("1.4.0") def predict(self, x): """ Predict labels for provided features. @@ -647,8 +683,13 @@ def predict(self, x): values with the same boundary then the same rules as in 2) are used. - :param x: - Feature or RDD of Features to be labeled. + + .. versionadded:: 1.4.0 + + Parameters + ---------- + x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + Feature or RDD of Features to be labeled. """ if isinstance(x, RDD): return x.map(lambda v: self.predict(v)) @@ -680,35 +721,42 @@ class IsotonicRegression(object): Currently implemented using parallelized pool adjacent violators algorithm. Only univariate (single feature) algorithm supported. - Sequential PAV implementation based on: + .. versionadded:: 1.4.0 + + Notes + ----- + Sequential PAV implementation based on + Tibshirani, Ryan J., Holger Hoefling, and Robert Tibshirani (2011) [1]_ - Tibshirani, Ryan J., Holger Hoefling, and Robert Tibshirani. - "Nearly-isotonic regression." Technometrics 53.1 (2011): 54-61. - Available from http://www.stat.cmu.edu/~ryantibs/papers/neariso.pdf + Sequential PAV parallelization based on + Kearsley, Anthony J., Richard A. Tapia, and Michael W. Trosset (1996) [2]_ - Sequential PAV parallelization based on: + See also + `Isotonic regression (Wikipedia) `_. - Kearsley, Anthony J., Richard A. Tapia, and Michael W. Trosset. + .. [1] Tibshirani, Ryan J., Holger Hoefling, and Robert Tibshirani. + "Nearly-isotonic regression." Technometrics 53.1 (2011): 54-61. + Available from http://www.stat.cmu.edu/~ryantibs/papers/neariso.pdf + .. [2] Kearsley, Anthony J., Richard A. Tapia, and Michael W. Trosset "An approach to parallelizing isotonic regression." Applied Mathematics and Parallel Computing. Physica-Verlag HD, 1996. 141-147. Available from http://softlib.rice.edu/pub/CRPC-TRs/reports/CRPC-TR96640.pdf - - See `Isotonic regression (Wikipedia) `_. - - .. versionadded:: 1.4.0 """ @classmethod - @since("1.4.0") def train(cls, data, isotonic=True): """ Train an isotonic regression model on the given data. - :param data: - RDD of (label, feature, weight) tuples. 
- :param isotonic: - Whether this is isotonic (which is default) or antitonic. - (default: True) + .. versionadded:: 1.4.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + RDD of (label, feature, weight) tuples. + isotonic : bool, optional + Whether this is isotonic (which is default) or antitonic. + (default: True) """ boundaries, predictions = callMLlibFunc("trainIsotonicRegressionModel", data.map(_convert_to_vector), bool(isotonic)) @@ -741,26 +789,32 @@ def _validate(self, dstream): raise ValueError( "Model must be intialized using setInitialWeights") - @since("1.5.0") def predictOn(self, dstream): """ Use the model to make predictions on batches of data from a DStream. - :return: - DStream containing predictions. + .. versionadded:: 1.5.0 + + Returns + ------- + :py:class:`pyspark.streaming.DStream` + DStream containing predictions. """ self._validate(dstream) return dstream.map(lambda x: self._model.predict(x)) - @since("1.5.0") def predictOnValues(self, dstream): """ Use the model to make predictions on the values of a DStream and carry over its keys. - :return: - DStream containing the input keys and the predictions as values. + .. versionadded:: 1.5.0 + + Returns + ------- + :py:class:`pyspark.streaming.DStream` + DStream containing predictions. """ self._validate(dstream) return dstream.mapValues(lambda x: self._model.predict(x)) @@ -779,20 +833,22 @@ class StreamingLinearRegressionWithSGD(StreamingLinearAlgorithm): of features must be constant. An initial weight vector must be provided. - :param stepSize: - Step size for each iteration of gradient descent. - (default: 0.1) - :param numIterations: - Number of iterations run for each batch of data. - (default: 50) - :param miniBatchFraction: - Fraction of each batch of data to use for updates. - (default: 1.0) - :param convergenceTol: - Value used to determine when to terminate iterations. - (default: 0.001) - .. versionadded:: 1.5.0 + + Parameters + ---------- + stepSize : float, optional + Step size for each iteration of gradient descent. + (default: 0.1) + numIterations : int, optional + Number of iterations run for each batch of data. + (default: 50) + miniBatchFraction : float, optional + Fraction of each batch of data to use for updates. + (default: 1.0) + convergenceTol : float, optional + Value used to determine when to terminate iterations. + (default: 0.001) """ def __init__(self, stepSize=0.1, numIterations=50, miniBatchFraction=1.0, convergenceTol=0.001): self.stepSize = stepSize diff --git a/python/pyspark/mllib/stat/KernelDensity.py b/python/pyspark/mllib/stat/KernelDensity.py index 56444c152f0ba..1d4d43e53519c 100644 --- a/python/pyspark/mllib/stat/KernelDensity.py +++ b/python/pyspark/mllib/stat/KernelDensity.py @@ -26,6 +26,8 @@ class KernelDensity(object): Estimate probability density at required points given an RDD of samples from the population. 
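
The streaming regression workflow documented above (`predictOn`/`predictOnValues` over DStreams) is easier to picture end to end; a rough sketch, assuming a live SparkContext `sc` and a queue-backed stream for testing (batch interval, initial weights, and data are all made up):

    from pyspark.mllib.linalg import Vectors
    from pyspark.mllib.regression import LabeledPoint, StreamingLinearRegressionWithSGD
    from pyspark.streaming import StreamingContext

    ssc = StreamingContext(sc, 1)        # 1-second micro-batches

    model = StreamingLinearRegressionWithSGD(stepSize=0.1, numIterations=25)
    model.setInitialWeights(Vectors.dense([0.0]))    # required before training/prediction

    # Three training batches drawn from y = 2 * x.
    train_batches = [sc.parallelize([LabeledPoint(2.0 * x, [x]) for x in range(10)])
                     for _ in range(3)]
    model.trainOn(ssc.queueStream(train_batches))

    # Predict on a separate stream of bare feature vectors; keyed data would use predictOnValues.
    test_batches = [sc.parallelize([[1.0], [5.0]]) for _ in range(3)]
    model.predictOn(ssc.queueStream(test_batches)).pprint()

    ssc.start()
    ssc.awaitTerminationOrTimeout(6)
    ssc.stop(stopSparkContext=False)
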
+ Examples + -------- >>> kd = KernelDensity() >>> sample = sc.parallelize([0.0, 1.0]) >>> kd.setSample(sample) diff --git a/python/pyspark/mllib/stat/__init__.py b/python/pyspark/mllib/stat/__init__.py index 0fb33061838af..d3b4ddf7e4c68 100644 --- a/python/pyspark/mllib/stat/__init__.py +++ b/python/pyspark/mllib/stat/__init__.py @@ -21,8 +21,9 @@ from pyspark.mllib.stat._statistics import Statistics, MultivariateStatisticalSummary from pyspark.mllib.stat.distribution import MultivariateGaussian -from pyspark.mllib.stat.test import ChiSqTestResult +from pyspark.mllib.stat.test import ChiSqTestResult, KolmogorovSmirnovTestResult from pyspark.mllib.stat.KernelDensity import KernelDensity -__all__ = ["Statistics", "MultivariateStatisticalSummary", "ChiSqTestResult", +__all__ = ["Statistics", "MultivariateStatisticalSummary", + "ChiSqTestResult", "KolmogorovSmirnovTestResult", "MultivariateGaussian", "KernelDensity"] diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py index 43454ba5187dd..a4b45cf55febe 100644 --- a/python/pyspark/mllib/stat/_statistics.py +++ b/python/pyspark/mllib/stat/_statistics.py @@ -65,11 +65,19 @@ def colStats(rdd): """ Computes column-wise summary statistics for the input RDD[Vector]. - :param rdd: an RDD[Vector] for which column-wise summary statistics - are to be computed. - :return: :class:`MultivariateStatisticalSummary` object containing - column-wise summary statistics. - + Parameters + ---------- + rdd : :py:class:`pyspark.RDD` + an RDD[Vector] for which column-wise summary statistics + are to be computed. + + Returns + ------- + :class:`MultivariateStatisticalSummary` + object containing column-wise summary statistics. + + Examples + -------- >>> from pyspark.mllib.linalg import Vectors >>> rdd = sc.parallelize([Vectors.dense([2, 0, 0, -2]), ... Vectors.dense([4, 5, 0, 3]), @@ -103,13 +111,24 @@ def corr(x, y=None, method=None): to specify the method to be used for single RDD inout. If two RDDs of floats are passed in, a single float is returned. - :param x: an RDD of vector for which the correlation matrix is to be computed, - or an RDD of float of the same cardinality as y when y is specified. - :param y: an RDD of float of the same cardinality as x. - :param method: String specifying the method to use for computing correlation. - Supported: `pearson` (default), `spearman` - :return: Correlation matrix comparing columns in x. - + Parameters + ---------- + x : :py:class:`pyspark.RDD` + an RDD of vector for which the correlation matrix is to be computed, + or an RDD of float of the same cardinality as y when y is specified. + y : :py:class:`pyspark.RDD`, optional + an RDD of float of the same cardinality as x. + method : str, optional + String specifying the method to use for computing correlation. + Supported: `pearson` (default), `spearman` + + Returns + ------- + :py:class:`pyspark.mllib.linalg.Matrix` + Correlation matrix comparing columns in x. + + Examples + -------- >>> x = sc.parallelize([1.0, 0.0, -2.0], 2) >>> y = sc.parallelize([4.0, 5.0, 3.0], 2) >>> zeros = sc.parallelize([0.0, 0.0, 0.0], 2) @@ -172,20 +191,33 @@ def chiSqTest(observed, expected=None): contingency matrix for which the chi-squared statistic is computed. All label and feature values must be categorical. - .. 
note:: `observed` cannot contain negative values - - :param observed: it could be a vector containing the observed categorical - counts/relative frequencies, or the contingency matrix - (containing either counts or relative frequencies), - or an RDD of LabeledPoint containing the labeled dataset - with categorical features. Real-valued features will be - treated as categorical for each distinct value. - :param expected: Vector containing the expected categorical counts/relative - frequencies. `expected` is rescaled if the `expected` sum - differs from the `observed` sum. - :return: ChiSquaredTest object containing the test statistic, degrees - of freedom, p-value, the method used, and the null hypothesis. - + Parameters + ---------- + observed : :py:class:`pyspark.mllib.linalg.Vector` or \ + :py:class:`pyspark.mllib.linalg.Matrix` + it could be a vector containing the observed categorical + counts/relative frequencies, or the contingency matrix + (containing either counts or relative frequencies), + or an RDD of LabeledPoint containing the labeled dataset + with categorical features. Real-valued features will be + treated as categorical for each distinct value. + expected : :py:class:`pyspark.mllib.linalg.Vector` + Vector containing the expected categorical counts/relative + frequencies. `expected` is rescaled if the `expected` sum + differs from the `observed` sum. + + Returns + ------- + :py:class:`pyspark.mllib.stat.ChiSqTestResult` + object containing the test statistic, degrees + of freedom, p-value, the method used, and the null hypothesis. + + Notes + ----- + `observed` cannot contain negative values + + Examples + -------- >>> from pyspark.mllib.linalg import Vectors, Matrices >>> observed = Vectors.dense([4, 6, 5]) >>> pearson = Statistics.chiSqTest(observed) @@ -259,17 +291,28 @@ def kolmogorovSmirnovTest(data, distName="norm", *params): For specific details of the implementation, please have a look at the Scala documentation. - :param data: RDD, samples from the data - :param distName: string, currently only "norm" is supported. - (Normal distribution) to calculate the - theoretical distribution of the data. - :param params: additional values which need to be provided for - a certain distribution. - If not provided, the default values are used. - :return: KolmogorovSmirnovTestResult object containing the test - statistic, degrees of freedom, p-value, - the method used, and the null hypothesis. + Parameters + ---------- + data : :py:class:`pyspark.RDD` + RDD, samples from the data + distName : str, optional + string, currently only "norm" is supported. + (Normal distribution) to calculate the + theoretical distribution of the data. + params + additional values which need to be provided for + a certain distribution. + If not provided, the default values are used. + + Returns + ------- + :py:class:`pyspark.mllib.stat.KolmogorovSmirnovTestResult` + object containing the test statistic, degrees of freedom, p-value, + the method used, and the null hypothesis. 
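
A compact sketch tying the statistics helpers documented above together (a live SparkContext `sc` is assumed, as in the doctests; the sample data are arbitrary):

    from pyspark.mllib.linalg import Vectors
    from pyspark.mllib.stat import Statistics

    rows = sc.parallelize([Vectors.dense([2.0, 0.0, -2.0]),
                           Vectors.dense([4.0, 5.0, 3.0]),
                           Vectors.dense([6.0, 7.0, 8.0])])

    summary = Statistics.colStats(rows)              # column-wise summary statistics
    print(summary.mean(), summary.variance())

    print(Statistics.corr(rows, method="pearson"))   # correlation matrix between columns

    # Pearson chi-squared goodness-of-fit test against a uniform expected distribution.
    observed = Vectors.dense([4.0, 6.0, 5.0])
    print(Statistics.chiSqTest(observed).pValue)

    # One-sample Kolmogorov-Smirnov test against N(0, 1).
    data = sc.parallelize([-0.5, 0.0, 0.6, 1.2, -1.1])
    print(Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0).pValue)
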
+ + Examples + -------- >>> kstest = Statistics.kolmogorovSmirnovTest >>> data = sc.parallelize([-1.0, 0.0, 1.0]) >>> ksmodel = kstest(data, "norm") diff --git a/python/pyspark/mllib/stat/distribution.py b/python/pyspark/mllib/stat/distribution.py index 46f7a1d2f277a..aa35ac6dfdae1 100644 --- a/python/pyspark/mllib/stat/distribution.py +++ b/python/pyspark/mllib/stat/distribution.py @@ -24,6 +24,8 @@ class MultivariateGaussian(namedtuple('MultivariateGaussian', ['mu', 'sigma'])): """Represents a (mu, sigma) tuple + Examples + -------- >>> m = MultivariateGaussian(Vectors.dense([11,12]),DenseMatrix(2, 2, (1.0, 3.0, 5.0, 2.0))) >>> (m.mu, m.sigma.toArray()) (DenseVector([11.0, 12.0]), array([[ 1., 5.],[ 3., 2.]])) diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index e05dfdb953ceb..493dcf8db6fd2 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -33,15 +33,18 @@ class TreeEnsembleModel(JavaModelWrapper, JavaSaveable): .. versionadded:: 1.3.0 """ - @since("1.3.0") def predict(self, x): """ Predict values for a single data point or an RDD of points using the model trained. - .. note:: In Python, predict cannot currently be used within an RDD - transformation or action. - Call predict directly on the RDD instead. + .. versionadded:: 1.3.0 + + Notes + ----- + In Python, predict cannot currently be used within an RDD + transformation or action. + Call predict directly on the RDD instead. """ if isinstance(x, RDD): return self.call("predict", x.map(_convert_to_vector)) @@ -79,18 +82,23 @@ class DecisionTreeModel(JavaModelWrapper, JavaSaveable, JavaLoader): .. versionadded:: 1.1.0 """ - @since("1.1.0") def predict(self, x): """ Predict the label of one or more examples. - .. note:: In Python, predict cannot currently be used within an RDD - transformation or action. - Call predict directly on the RDD instead. + .. versionadded:: 1.1.0 + + Parameters + ---------- + x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + Data point (feature vector), or an RDD of data points (feature + vectors). - :param x: - Data point (feature vector), or an RDD of data points (feature - vectors). + Notes + ----- + In Python, predict cannot currently be used within an RDD + transformation or action. + Call predict directly on the RDD instead. """ if isinstance(x, RDD): return self.call("predict", x.map(_convert_to_vector)) @@ -143,45 +151,50 @@ def _train(cls, data, type, numClasses, features, impurity="gini", maxDepth=5, m return DecisionTreeModel(model) @classmethod - @since("1.1.0") def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, impurity="gini", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0): """ Train a decision tree model for classification. - :param data: - Training data: RDD of LabeledPoint. Labels should take values - {0, 1, ..., numClasses-1}. - :param numClasses: - Number of classes for classification. - :param categoricalFeaturesInfo: - Map storing arity of categorical features. An entry (n -> k) - indicates that feature n is categorical with k categories - indexed from 0: {0, 1, ..., k-1}. - :param impurity: - Criterion used for information gain calculation. - Supported values: "gini" or "entropy". - (default: "gini") - :param maxDepth: - Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 - means 1 internal node + 2 leaf nodes). - (default: 5) - :param maxBins: - Number of bins used for finding splits at each node. 
- (default: 32) - :param minInstancesPerNode: - Minimum number of instances required at child nodes to create - the parent split. - (default: 1) - :param minInfoGain: - Minimum info gain required to create a split. - (default: 0.0) - :return: - DecisionTreeModel. - - Example usage: - + .. versionadded:: 1.1.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + Training data: RDD of LabeledPoint. Labels should take values + {0, 1, ..., numClasses-1}. + numClasses : int + Number of classes for classification. + categoricalFeaturesInfo : dict + Map storing arity of categorical features. An entry (n -> k) + indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. + impurity : str, optional + Criterion used for information gain calculation. + Supported values: "gini" or "entropy". + (default: "gini") + maxDepth : int, optional + Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 + means 1 internal node + 2 leaf nodes). + (default: 5) + maxBins : int, optional + Number of bins used for finding splits at each node. + (default: 32) + minInstancesPerNode : int, optional + Minimum number of instances required at child nodes to create + the parent split. + (default: 1) + minInfoGain : float, optional + Minimum info gain required to create a split. + (default: 0.0) + + Returns + ------- + :py:class:`DecisionTreeModel` + + Examples + -------- >>> from numpy import array >>> from pyspark.mllib.regression import LabeledPoint >>> from pyspark.mllib.tree import DecisionTree @@ -222,35 +235,39 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, """ Train a decision tree model for regression. - :param data: - Training data: RDD of LabeledPoint. Labels are real numbers. - :param categoricalFeaturesInfo: - Map storing arity of categorical features. An entry (n -> k) - indicates that feature n is categorical with k categories - indexed from 0: {0, 1, ..., k-1}. - :param impurity: - Criterion used for information gain calculation. - The only supported value for regression is "variance". - (default: "variance") - :param maxDepth: - Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 - means 1 internal node + 2 leaf nodes). - (default: 5) - :param maxBins: - Number of bins used for finding splits at each node. - (default: 32) - :param minInstancesPerNode: - Minimum number of instances required at child nodes to create - the parent split. - (default: 1) - :param minInfoGain: - Minimum info gain required to create a split. - (default: 0.0) - :return: - DecisionTreeModel. - - Example usage: - + Parameters + ---------- + data : :py:class:`pyspark.RDD` + Training data: RDD of LabeledPoint. Labels are real numbers. + categoricalFeaturesInfo : dict + Map storing arity of categorical features. An entry (n -> k) + indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. + impurity : str, optional + Criterion used for information gain calculation. + The only supported value for regression is "variance". + (default: "variance") + maxDepth : int, optional + Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 + means 1 internal node + 2 leaf nodes). + (default: 5) + maxBins : int, optional + Number of bins used for finding splits at each node. + (default: 32) + minInstancesPerNode : int, optional + Minimum number of instances required at child nodes to create + the parent split. + (default: 1) + minInfoGain : float, optional + Minimum info gain required to create a split. 
+ (default: 0.0) + + Returns + ------- + :py:class:`DecisionTreeModel` + + Examples + -------- >>> from pyspark.mllib.regression import LabeledPoint >>> from pyspark.mllib.tree import DecisionTree >>> from pyspark.mllib.linalg import SparseVector @@ -313,7 +330,6 @@ def _train(cls, data, algo, numClasses, categoricalFeaturesInfo, numTrees, return RandomForestModel(model) @classmethod - @since("1.2.0") def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, featureSubsetStrategy="auto", impurity="gini", maxDepth=4, maxBins=32, seed=None): @@ -321,44 +337,51 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, Train a random forest model for binary or multiclass classification. - :param data: - Training dataset: RDD of LabeledPoint. Labels should take values - {0, 1, ..., numClasses-1}. - :param numClasses: - Number of classes for classification. - :param categoricalFeaturesInfo: - Map storing arity of categorical features. An entry (n -> k) - indicates that feature n is categorical with k categories - indexed from 0: {0, 1, ..., k-1}. - :param numTrees: - Number of trees in the random forest. - :param featureSubsetStrategy: - Number of features to consider for splits at each node. - Supported values: "auto", "all", "sqrt", "log2", "onethird". - If "auto" is set, this parameter is set based on numTrees: - if numTrees == 1, set to "all"; - if numTrees > 1 (forest) set to "sqrt". - (default: "auto") - :param impurity: - Criterion used for information gain calculation. - Supported values: "gini" or "entropy". - (default: "gini") - :param maxDepth: - Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 - means 1 internal node + 2 leaf nodes). - (default: 4) - :param maxBins: - Maximum number of bins used for splitting features. - (default: 32) - :param seed: - Random seed for bootstrapping and choosing feature subsets. - Set as None to generate seed based on system time. - (default: None) - :return: - RandomForestModel that can be used for prediction. - - Example usage: - + .. versionadded:: 1.2.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + Training dataset: RDD of LabeledPoint. Labels should take values + {0, 1, ..., numClasses-1}. + numClasses : int + Number of classes for classification. + categoricalFeaturesInfo : dict + Map storing arity of categorical features. An entry (n -> k) + indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. + numTrees : int + Number of trees in the random forest. + featureSubsetStrategy : str, optional + Number of features to consider for splits at each node. + Supported values: "auto", "all", "sqrt", "log2", "onethird". + If "auto" is set, this parameter is set based on numTrees: + if numTrees == 1, set to "all"; + if numTrees > 1 (forest) set to "sqrt". + (default: "auto") + impurity : str, optional + Criterion used for information gain calculation. + Supported values: "gini" or "entropy". + (default: "gini") + maxDepth : int, optional + Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 + means 1 internal node + 2 leaf nodes). + (default: 4) + maxBins : int, optional + Maximum number of bins used for splitting features. + (default: 32) + seed : int, Optional + Random seed for bootstrapping and choosing feature subsets. + Set as None to generate seed based on system time. + (default: None) + + Returns + ------- + :py:class:`RandomForestModel` + that can be used for prediction. 
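
For the random-forest classifier parameters listed above, a minimal train-and-predict sketch (live SparkContext `sc` assumed; the toy data and seed are arbitrary):

    from pyspark.mllib.regression import LabeledPoint
    from pyspark.mllib.tree import RandomForest

    # Feature 0 is continuous, feature 1 is categorical with 2 categories.
    data = sc.parallelize([
        LabeledPoint(0.0, [0.0, 0.0]),
        LabeledPoint(0.0, [1.0, 0.0]),
        LabeledPoint(1.0, [2.0, 1.0]),
        LabeledPoint(1.0, [3.0, 1.0]),
    ])

    model = RandomForest.trainClassifier(data, numClasses=2,
                                         categoricalFeaturesInfo={1: 2},
                                         numTrees=3, seed=42)

    print(model.numTrees())              # 3
    print(model.predict([2.5, 1.0]))     # predict a single point directly
    # Note: call predict on the RDD itself, not inside an RDD transformation.
    print(model.predict(data.map(lambda lp: lp.features)).collect())
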
+ + Examples + -------- >>> from pyspark.mllib.regression import LabeledPoint >>> from pyspark.mllib.tree import RandomForest >>> @@ -405,47 +428,55 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, maxDepth, maxBins, seed) @classmethod - @since("1.2.0") def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetStrategy="auto", impurity="variance", maxDepth=4, maxBins=32, seed=None): """ Train a random forest model for regression. - :param data: - Training dataset: RDD of LabeledPoint. Labels are real numbers. - :param categoricalFeaturesInfo: - Map storing arity of categorical features. An entry (n -> k) - indicates that feature n is categorical with k categories - indexed from 0: {0, 1, ..., k-1}. - :param numTrees: - Number of trees in the random forest. - :param featureSubsetStrategy: - Number of features to consider for splits at each node. - Supported values: "auto", "all", "sqrt", "log2", "onethird". - If "auto" is set, this parameter is set based on numTrees: - if numTrees == 1, set to "all"; - if numTrees > 1 (forest) set to "onethird" for regression. - (default: "auto") - :param impurity: - Criterion used for information gain calculation. - The only supported value for regression is "variance". - (default: "variance") - :param maxDepth: - Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 - means 1 internal node + 2 leaf nodes). - (default: 4) - :param maxBins: - Maximum number of bins used for splitting features. - (default: 32) - :param seed: - Random seed for bootstrapping and choosing feature subsets. - Set as None to generate seed based on system time. - (default: None) - :return: - RandomForestModel that can be used for prediction. - - Example usage: - + .. versionadded:: 1.2.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + Training dataset: RDD of LabeledPoint. Labels are real numbers. + categoricalFeaturesInfo : dict + Map storing arity of categorical features. An entry (n -> k) + indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. + numTrees : int + Number of trees in the random forest. + featureSubsetStrategy : str, optional + Number of features to consider for splits at each node. + Supported values: "auto", "all", "sqrt", "log2", "onethird". + If "auto" is set, this parameter is set based on numTrees: + + - if numTrees == 1, set to "all"; + - if numTrees > 1 (forest) set to "onethird" for regression. + + (default: "auto") + impurity : str, optional + Criterion used for information gain calculation. + The only supported value for regression is "variance". + (default: "variance") + maxDepth : int, optional + Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 + means 1 internal node + 2 leaf nodes). + (default: 4) + maxBins : int, optional + Maximum number of bins used for splitting features. + (default: 32) + seed : int, optional + Random seed for bootstrapping and choosing feature subsets. + Set as None to generate seed based on system time. + (default: None) + + Returns + ------- + :py:class:`RandomForestModel` + that can be used for prediction. 
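
The regression counterpart just documented, in the same style (live SparkContext `sc` assumed; data are arbitrary):

    from pyspark.mllib.regression import LabeledPoint
    from pyspark.mllib.tree import RandomForest

    data = sc.parallelize([LabeledPoint(float(x), [float(x)]) for x in range(20)])

    model = RandomForest.trainRegressor(data, categoricalFeaturesInfo={},
                                        numTrees=5, maxDepth=3, seed=42)

    print(model.predict([10.5]))                                     # single point
    print(model.predict(sc.parallelize([[0.0], [19.0]])).collect())  # RDD of points
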
+ + Examples + -------- >>> from pyspark.mllib.regression import LabeledPoint >>> from pyspark.mllib.tree import RandomForest >>> from pyspark.mllib.linalg import SparseVector @@ -505,45 +536,51 @@ def _train(cls, data, algo, categoricalFeaturesInfo, return GradientBoostedTreesModel(model) @classmethod - @since("1.3.0") def trainClassifier(cls, data, categoricalFeaturesInfo, loss="logLoss", numIterations=100, learningRate=0.1, maxDepth=3, maxBins=32): """ Train a gradient-boosted trees model for classification. - :param data: - Training dataset: RDD of LabeledPoint. Labels should take values - {0, 1}. - :param categoricalFeaturesInfo: - Map storing arity of categorical features. An entry (n -> k) - indicates that feature n is categorical with k categories - indexed from 0: {0, 1, ..., k-1}. - :param loss: - Loss function used for minimization during gradient boosting. - Supported values: "logLoss", "leastSquaresError", - "leastAbsoluteError". - (default: "logLoss") - :param numIterations: - Number of iterations of boosting. - (default: 100) - :param learningRate: - Learning rate for shrinking the contribution of each estimator. - The learning rate should be between in the interval (0, 1]. - (default: 0.1) - :param maxDepth: - Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 - means 1 internal node + 2 leaf nodes). - (default: 3) - :param maxBins: - Maximum number of bins used for splitting features. DecisionTree - requires maxBins >= max categories. - (default: 32) - :return: - GradientBoostedTreesModel that can be used for prediction. - - Example usage: - + .. versionadded:: 1.3.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + Training dataset: RDD of LabeledPoint. Labels should take values + {0, 1}. + categoricalFeaturesInfo : dict + Map storing arity of categorical features. An entry (n -> k) + indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. + loss : str, optional + Loss function used for minimization during gradient boosting. + Supported values: "logLoss", "leastSquaresError", + "leastAbsoluteError". + (default: "logLoss") + numIterations : int, optional + Number of iterations of boosting. + (default: 100) + learningRate : float, optional + Learning rate for shrinking the contribution of each estimator. + The learning rate should be between in the interval (0, 1]. + (default: 0.1) + maxDepth : int, optional + Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 + means 1 internal node + 2 leaf nodes). + (default: 3) + maxBins : int, optional + Maximum number of bins used for splitting features. DecisionTree + requires maxBins >= max categories. + (default: 32) + + Returns + ------- + :py:class:`GradientBoostedTreesModel` + that can be used for prediction. + + Examples + -------- >>> from pyspark.mllib.regression import LabeledPoint >>> from pyspark.mllib.tree import GradientBoostedTrees >>> @@ -574,44 +611,50 @@ def trainClassifier(cls, data, categoricalFeaturesInfo, loss, numIterations, learningRate, maxDepth, maxBins) @classmethod - @since("1.3.0") def trainRegressor(cls, data, categoricalFeaturesInfo, loss="leastSquaresError", numIterations=100, learningRate=0.1, maxDepth=3, maxBins=32): """ Train a gradient-boosted trees model for regression. - :param data: - Training dataset: RDD of LabeledPoint. Labels are real numbers. - :param categoricalFeaturesInfo: - Map storing arity of categorical features. 
An entry (n -> k) - indicates that feature n is categorical with k categories - indexed from 0: {0, 1, ..., k-1}. - :param loss: - Loss function used for minimization during gradient boosting. - Supported values: "logLoss", "leastSquaresError", - "leastAbsoluteError". - (default: "leastSquaresError") - :param numIterations: - Number of iterations of boosting. - (default: 100) - :param learningRate: - Learning rate for shrinking the contribution of each estimator. - The learning rate should be between in the interval (0, 1]. - (default: 0.1) - :param maxDepth: - Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 - means 1 internal node + 2 leaf nodes). - (default: 3) - :param maxBins: - Maximum number of bins used for splitting features. DecisionTree - requires maxBins >= max categories. - (default: 32) - :return: - GradientBoostedTreesModel that can be used for prediction. - - Example usage: - + .. versionadded:: 1.3.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + Training dataset: RDD of LabeledPoint. Labels are real numbers. + categoricalFeaturesInfo : dict + Map storing arity of categorical features. An entry (n -> k) + indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. + loss : str, optional + Loss function used for minimization during gradient boosting. + Supported values: "logLoss", "leastSquaresError", + "leastAbsoluteError". + (default: "leastSquaresError") + numIterations : int, optional + Number of iterations of boosting. + (default: 100) + learningRate : float, optional + Learning rate for shrinking the contribution of each estimator. + The learning rate should be in the interval (0, 1]. + (default: 0.1) + maxDepth : int, optional + Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 + means 1 internal node + 2 leaf nodes). + (default: 3) + maxBins : int, optional + Maximum number of bins used for splitting features. DecisionTree + requires maxBins >= max categories. + (default: 32) + + Returns + ------- + :py:class:`GradientBoostedTreesModel` + that can be used for prediction. + + Examples + -------- >>> from pyspark.mllib.regression import LabeledPoint >>> from pyspark.mllib.tree import GradientBoostedTrees >>> from pyspark.mllib.linalg import SparseVector diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py index a0be29a82e3dc..68feb9563852c 100644 --- a/python/pyspark/mllib/util.py +++ b/python/pyspark/mllib/util.py @@ -65,7 +65,6 @@ def _convert_labeled_point_to_libsvm(p): return " ".join(items) @staticmethod - @since("1.0.0") def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None): """ Loads labeled data in the LIBSVM format into an RDD of @@ -79,20 +78,33 @@ def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None): method parses each line into a LabeledPoint, where the feature indices are converted to zero-based. - :param sc: Spark context - :param path: file or directory path in any Hadoop-supported file - system URI - :param numFeatures: number of features, which will be determined - from the input data if a nonpositive value - is given. This is useful when the dataset is - already split into multiple files and you - want to load them separately, because some - features may not present in certain files, - which leads to inconsistent feature - dimensions. - :param minPartitions: min number of partitions - :return: labeled data stored as an RDD of LabeledPoint - + ..
versionadded:: 1.0.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + Spark context + path : str + file or directory path in any Hadoop-supported file system URI + numFeatures : int, optional + number of features, which will be determined + from the input data if a nonpositive value + is given. This is useful when the dataset is + already split into multiple files and you + want to load them separately, because some + features may not be present in certain files, + which leads to inconsistent feature + dimensions. + minPartitions : int, optional + min number of partitions + + Returns + ------- + :py:class:`pyspark.RDD` + labeled data stored as an RDD of LabeledPoint + + Examples + -------- >>> from tempfile import NamedTemporaryFile >>> from pyspark.mllib.util import MLUtils >>> from pyspark.mllib.regression import LabeledPoint @@ -118,14 +130,21 @@ def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None): return parsed.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2]))) @staticmethod - @since("1.0.0") def saveAsLibSVMFile(data, dir): """ Save labeled data in LIBSVM format. - :param data: an RDD of LabeledPoint to be saved - :param dir: directory to save the data + .. versionadded:: 1.0.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + an RDD of LabeledPoint to be saved + dir : str + directory to save the data + Examples + -------- >>> from tempfile import NamedTemporaryFile >>> from fileinput import input >>> from pyspark.mllib.regression import LabeledPoint @@ -143,17 +162,28 @@ def saveAsLibSVMFile(data, dir): lines.saveAsTextFile(dir) @staticmethod - @since("1.1.0") def loadLabeledPoints(sc, path, minPartitions=None): """ Load labeled points saved using RDD.saveAsTextFile. - :param sc: Spark context - :param path: file or directory path in any Hadoop-supported file - system URI - :param minPartitions: min number of partitions - :return: labeled data stored as an RDD of LabeledPoint + .. versionadded:: 1.1.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + Spark context + path : str + file or directory path in any Hadoop-supported file system URI + minPartitions : int, optional + min number of partitions + Returns + ------- + :py:class:`pyspark.RDD` + labeled data stored as an RDD of LabeledPoint + + Examples + -------- >>> from tempfile import NamedTemporaryFile >>> from pyspark.mllib.util import MLUtils >>> from pyspark.mllib.regression import LabeledPoint @@ -193,7 +223,6 @@ def loadVectors(sc, path): return callMLlibFunc("loadVectors", sc, path) @staticmethod - @since("2.0.0") def convertVectorColumnsToML(dataset, *cols): """ Converts vector columns in an input DataFrame from the @@ -201,16 +230,26 @@ def convertVectorColumnsToML(dataset, *cols): :py:class:`pyspark.ml.linalg.Vector` type under the `spark.ml` package. - :param dataset: - input dataset - :param cols: - a list of vector columns to be converted. - New vector columns will be ignored. If unspecified, all old - vector columns will be converted excepted nested ones. - :return: - the input dataset with old vector columns converted to the - new vector type + .. versionadded:: 2.0.0 + + Parameters + ---------- + dataset : :py:class:`pyspark.sql.DataFrame` + input dataset + \\*cols : str + Vector columns to be converted. + New vector columns will be ignored. If unspecified, all old + vector columns will be converted except nested ones.
+ + Returns + ------- + :py:class:`pyspark.sql.DataFrame` + the input dataset with old vector columns converted to the + new vector type + + Examples + -------- >>> import pyspark >>> from pyspark.mllib.linalg import Vectors >>> from pyspark.mllib.util import MLUtils @@ -233,7 +272,6 @@ def convertVectorColumnsToML(dataset, *cols): return callMLlibFunc("convertVectorColumnsToML", dataset, list(cols)) @staticmethod - @since("2.0.0") def convertVectorColumnsFromML(dataset, *cols): """ Converts vector columns in an input DataFrame to the @@ -241,16 +279,26 @@ def convertVectorColumnsFromML(dataset, *cols): :py:class:`pyspark.ml.linalg.Vector` type under the `spark.ml` package. - :param dataset: - input dataset - :param cols: - a list of vector columns to be converted. - Old vector columns will be ignored. If unspecified, all new - vector columns will be converted except nested ones. - :return: - the input dataset with new vector columns converted to the - old vector type + .. versionadded:: 2.0.0 + + Parameters + ---------- + dataset : :py:class:`pyspark.sql.DataFrame` + input dataset + \\*cols : str + Vector columns to be converted. + + Old vector columns will be ignored. If unspecified, all new + vector columns will be converted except nested ones. + + Returns + ------- + :py:class:`pyspark.sql.DataFrame` + the input dataset with new vector columns converted to the + old vector type + Examples + -------- >>> import pyspark >>> from pyspark.ml.linalg import Vectors >>> from pyspark.mllib.util import MLUtils @@ -273,7 +321,6 @@ def convertVectorColumnsFromML(dataset, *cols): return callMLlibFunc("convertVectorColumnsFromML", dataset, list(cols)) @staticmethod - @since("2.0.0") def convertMatrixColumnsToML(dataset, *cols): """ Converts matrix columns in an input DataFrame from the @@ -281,16 +328,26 @@ def convertMatrixColumnsToML(dataset, *cols): :py:class:`pyspark.ml.linalg.Matrix` type under the `spark.ml` package. - :param dataset: - input dataset - :param cols: - a list of matrix columns to be converted. - New matrix columns will be ignored. If unspecified, all old - matrix columns will be converted excepted nested ones. - :return: - the input dataset with old matrix columns converted to the - new matrix type + .. versionadded:: 2.0.0 + Parameters + ---------- + dataset : :py:class:`pyspark.sql.DataFrame` + input dataset + \\*cols : str + Matrix columns to be converted. + + New matrix columns will be ignored. If unspecified, all old + matrix columns will be converted except nested ones. + + Returns + ------- + :py:class:`pyspark.sql.DataFrame` + the input dataset with old matrix columns converted to the + new matrix type + + Examples + -------- >>> import pyspark >>> from pyspark.mllib.linalg import Matrices >>> from pyspark.mllib.util import MLUtils @@ -313,7 +370,6 @@ def convertMatrixColumnsToML(dataset, *cols): return callMLlibFunc("convertMatrixColumnsToML", dataset, list(cols)) @staticmethod - @since("2.0.0") def convertMatrixColumnsFromML(dataset, *cols): """ Converts matrix columns in an input DataFrame to the @@ -321,16 +377,26 @@ def convertMatrixColumnsFromML(dataset, *cols): :py:class:`pyspark.ml.linalg.Matrix` type under the `spark.ml` package. - :param dataset: - input dataset - :param cols: - a list of matrix columns to be converted. - Old matrix columns will be ignored. If unspecified, all new - matrix columns will be converted except nested ones. - :return: - the input dataset with new matrix columns converted to the - old matrix type - + ..
versionadded:: 2.0.0 + + Parameters + ---------- + dataset : :py:class:`pyspark.sql.DataFrame` + input dataset + \\*cols : str + Matrix columns to be converted. + + Old matrix columns will be ignored. If unspecified, all new + matrix columns will be converted except nested ones. + Returns + ------- + :py:class:`pyspark.sql.DataFrame` + the input dataset with new matrix columns converted to the + old matrix type + + Examples + -------- >>> import pyspark >>> from pyspark.ml.linalg import Matrices >>> from pyspark.mllib.util import MLUtils @@ -370,10 +436,14 @@ def save(self, sc, path): The model may be loaded using :py:meth:`Loader.load`. - :param sc: Spark context used to save model data. - :param path: Path specifying the directory in which to save - this model. If the directory already exists, - this method throws an exception. + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + Spark context used to save model data. + path : str + Path specifying the directory in which to save + this model. If the directory already exists, + this method throws an exception. """ raise NotImplementedError @@ -410,10 +480,17 @@ def load(cls, sc, path): Load a model from the given path. The model should have been saved using :py:meth:`Saveable.save`. - :param sc: Spark context used for loading model files. - :param path: Path specifying the directory to which the model - was saved. - :return: model instance + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + Spark context used for loading model files. + path : str + Path specifying the directory to which the model was saved. + + Returns + ------- + object + model instance """ raise NotImplementedError @@ -463,20 +540,33 @@ class LinearDataGenerator(object): """ @staticmethod - @since("1.5.0") def generateLinearInput(intercept, weights, xMean, xVariance, nPoints, seed, eps): """ - :param: intercept bias factor, the term c in X'w + c - :param: weights feature vector, the term w in X'w + c - :param: xMean Point around which the data X is centered. - :param: xVariance Variance of the given data - :param: nPoints Number of points to be generated - :param: seed Random Seed - :param: eps Used to scale the noise. If eps is set high, - the amount of gaussian noise added is more. - - Returns a list of LabeledPoints of length nPoints + .. versionadded:: 1.5.0 + + Parameters + ---------- + intercept : float + bias factor, the term c in X'w + c + weights : :py:class:`pyspark.mllib.linalg.Vector` or convertible + feature vector, the term w in X'w + c + xMean : :py:class:`pyspark.mllib.linalg.Vector` or convertible + Point around which the data X is centered. + xVariance : :py:class:`pyspark.mllib.linalg.Vector` or convertible + Variance of the given data + nPoints : int + Number of points to be generated + seed : int + Random seed + eps : float + Used to scale the noise. If eps is set high, + more Gaussian noise is added. + + Returns + ------- + list + of :py:class:`pyspark.mllib.regression.LabeledPoint` of length nPoints """ weights = [float(weight) for weight in weights] xMean = [float(mean) for mean in xMean]
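The Parameters and Returns sections above describe a simple linear generative model: features are centered at xMean with per-dimension variance xVariance, and labels follow X'w + c plus eps-scaled Gaussian noise. The sketch below is a rough NumPy illustration of that description, not the actual MLlib implementation (which runs on the JVM side); generate_linear_input_sketch is a hypothetical name:

import numpy as np

def generate_linear_input_sketch(intercept, weights, xMean, xVariance, nPoints, seed, eps):
    # Features: centered at xMean, scaled by sqrt(xVariance) per dimension.
    rng = np.random.RandomState(seed)
    w = np.asarray(weights, dtype=float)
    x = np.asarray(xMean, dtype=float) + \
        rng.standard_normal((nPoints, len(w))) * np.sqrt(np.asarray(xVariance, dtype=float))
    # Labels: X'w + c with eps-scaled Gaussian noise, as described above.
    y = x.dot(w) + intercept + eps * rng.standard_normal(nPoints)
    return list(zip(y, x))  # stand-ins for LabeledPoint(label, features)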