 import numpy as np
 from numpy import array

+from pyspark import RDD
 from pyspark.mllib.common import callMLlibFunc, _py2java, _java2py, inherit_doc
-from pyspark.mllib.linalg import SparseVector, _convert_to_vector
+from pyspark.mllib.linalg import SparseVector, Vectors, _convert_to_vector
 from pyspark.mllib.util import Saveable, Loader

 __all__ = ['LabeledPoint', 'LinearModel',
            'LinearRegressionModel', 'LinearRegressionWithSGD',
            'RidgeRegressionModel', 'RidgeRegressionWithSGD',
-           'LassoModel', 'LassoWithSGD']
+           'LassoModel', 'LassoWithSGD', 'IsotonicRegressionModel',
+           'IsotonicRegression']


 class LabeledPoint(object):
@@ -396,6 +398,73 @@ def train(rdd, i):
         return _regression_train_wrapper(train, RidgeRegressionModel, data, initialWeights)


+class IsotonicRegressionModel(Saveable, Loader):
+
+    """Regression model for isotonic regression.
+
+    >>> data = [(1, 0, 1), (2, 1, 1), (3, 2, 1), (1, 3, 1), (6, 4, 1), (17, 5, 1), (16, 6, 1)]
+    >>> irm = IsotonicRegression.train(sc.parallelize(data))
+    >>> irm.predict(3)
+    2.0
+    >>> irm.predict(5)
+    16.5
+    >>> irm.predict(sc.parallelize([3, 5])).collect()
+    [2.0, 16.5]
+    >>> import os, tempfile
+    >>> path = tempfile.mkdtemp()
+    >>> irm.save(sc, path)
+    >>> sameModel = IsotonicRegressionModel.load(sc, path)
+    >>> sameModel.predict(3)
+    2.0
+    >>> sameModel.predict(5)
+    16.5
+    >>> try:
+    ...     os.removedirs(path)
+    ... except OSError:
+    ...     pass
+    """
+
+    def __init__(self, boundaries, predictions, isotonic):
+        self.boundaries = boundaries
+        self.predictions = predictions
+        self.isotonic = isotonic
+
+    def predict(self, x):
+        if isinstance(x, RDD):
+            return x.map(lambda v: self.predict(v))
+        return np.interp(x, self.boundaries, self.predictions)
+
+    def save(self, sc, path):
+        java_boundaries = _py2java(sc, self.boundaries.tolist())
+        java_predictions = _py2java(sc, self.predictions.tolist())
+        java_model = sc._jvm.org.apache.spark.mllib.regression.IsotonicRegressionModel(
+            java_boundaries, java_predictions, self.isotonic)
+        java_model.save(sc._jsc.sc(), path)
+
+    @classmethod
+    def load(cls, sc, path):
+        java_model = sc._jvm.org.apache.spark.mllib.regression.IsotonicRegressionModel.load(
+            sc._jsc.sc(), path)
+        py_boundaries = _java2py(sc, java_model.boundaryVector()).toArray()
+        py_predictions = _java2py(sc, java_model.predictionVector()).toArray()
+        return IsotonicRegressionModel(py_boundaries, py_predictions, java_model.isotonic)
+
+
+class IsotonicRegression(object):
+    """
+    Run the IsotonicRegression algorithm to obtain an isotonic regression model.
+
+    :param data:      RDD of (label, feature, weight) tuples.
+    :param isotonic:  Whether this is isotonic (increasing) or antitonic (decreasing).
+    """
+    @classmethod
+    def train(cls, data, isotonic=True):
+        """Train an isotonic regression model on the given data."""
+        boundaries, predictions = callMLlibFunc("trainIsotonicRegressionModel",
+                                                data.map(_convert_to_vector), bool(isotonic))
+        return IsotonicRegressionModel(boundaries.toArray(), predictions.toArray(), isotonic)
+
+
 def _test():
     import doctest
     from pyspark import SparkContext
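
Note on the new `predict`: for a scalar input it is a straight `np.interp` over the model's stored `boundaries`/`predictions` arrays. The sketch below uses made-up boundary and prediction values (not anything produced by the doctest model) to show the resulting behaviour: inputs that fall between two boundaries are linearly interpolated, and inputs outside the boundary range are clamped to the first or last prediction.

```python
import numpy as np

# Made-up stand-ins for model.boundaries / model.predictions (illustration only).
boundaries = np.array([0.0, 1.0, 3.0, 4.0, 6.0])
predictions = np.array([1.0, 2.0, 2.0, 6.0, 16.5])

def predict(x):
    # Same rule as IsotonicRegressionModel.predict for a single value:
    # piecewise-linear interpolation between the stored boundaries.
    return np.interp(x, boundaries, predictions)

print(predict(2.0))   # 2.0   -> between boundaries 1.0 and 3.0, both predict 2.0
print(predict(5.0))   # 11.25 -> halfway between the predictions at 4.0 and 6.0
print(predict(-1.0))  # 1.0   -> below the first boundary, clamped to predictions[0]
print(predict(10.0))  # 16.5  -> above the last boundary, clamped to predictions[-1]
```

When `x` is an `RDD`, `predict` applies the same rule element-wise via `x.map(lambda v: self.predict(v))`.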