
Commit 376db0a

pipeline and parameters
1 parent 5e73138 commit 376db0a

15 files changed: +940 -1 lines changed
Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.ml

import org.apache.spark.sql.SchemaRDD

/**
 * Abstract class for estimators that fit models to data.
 */
abstract class Estimator[M <: Model] extends Identifiable with Params with PipelineStage {

  /**
   * Fits a single model to the input data with default parameters.
   *
   * @param dataset input dataset
   * @return fitted model
   */
  def fit(dataset: SchemaRDD): M = {
    fit(dataset, ParamMap.empty)
  }

  /**
   * Fits a single model to the input data with the provided parameter map.
   *
   * @param dataset input dataset
   * @param paramMap parameters
   * @return fitted model
   */
  def fit(dataset: SchemaRDD, paramMap: ParamMap): M

  /**
   * Fits a single model to the input data with the provided parameter pairs.
   *
   * @param dataset input dataset
   * @param firstParamPair first parameter pair
   * @param otherParamPairs other parameter pairs
   * @return fitted model
   */
  def fit(
      dataset: SchemaRDD,
      firstParamPair: ParamPair[_],
      otherParamPairs: ParamPair[_]*): M = {
    val map = new ParamMap()
    map.put(firstParamPair)
    otherParamPairs.foreach(map.put(_))
    fit(dataset, map)
  }

  /**
   * Fits multiple models to the input data with multiple sets of parameters.
   * The default implementation fits one model per parameter map; subclasses can
   * override this to optimize multi-model training.
   *
   * @param dataset input dataset
   * @param paramMaps an array of parameter maps
   * @return fitted models, matching the input parameter maps
   */
  def fit(dataset: SchemaRDD, paramMaps: Array[ParamMap]): Seq[M] = {
    paramMaps.map(fit(dataset, _))
  }

  /**
   * Parameters for the output model.
   */
  def model: Params = Params.empty
}
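
A usage sketch (not part of this commit) showing how the overloads compose. `MyEstimator`, its `maxIter` param, and direct `new ParamPair(param, value)` construction are assumptions for illustration:

  val estimator = new MyEstimator  // hypothetical concrete Estimator

  // 1. Fit with default parameters.
  val model1 = estimator.fit(trainingData)

  // 2. Fit with an explicit parameter map.
  val map = new ParamMap()
  map.put(new ParamPair(estimator.maxIter, 50))  // assumed ParamPair constructor
  val model2 = estimator.fit(trainingData, map)

  // 3. Fit with varargs parameter pairs; builds the same map internally.
  val model3 = estimator.fit(trainingData, new ParamPair(estimator.maxIter, 50))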
Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.ml

import org.apache.spark.sql.SchemaRDD

/**
 * Abstract class for evaluators that compute metrics from predictions.
 */
abstract class Evaluator extends Identifiable {

  /**
   * Evaluates the output of a model.
   *
   * @param dataset a dataset that contains labels/observations and predictions
   * @param paramMap parameter map that specifies the input columns and output metrics
   * @return metric
   */
  def evaluate(dataset: SchemaRDD, paramMap: ParamMap): Double
}
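
The Evaluator contract is what makes the multi-model `fit` overload on Estimator useful for model selection. A minimal sketch, not part of this commit; `estimator`, `evaluator`, and `candidates: Array[ParamMap]` are hypothetical:

  // Fit one model per candidate parameter map, ...
  val models = estimator.fit(trainingData, candidates)
  // ... score each model on held-out data, and keep the best one.
  val metrics = models.map { model =>
    evaluator.evaluate(model.transform(validationData), ParamMap.empty)
  }
  val bestModel = models(metrics.indices.maxBy(metrics))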
Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.ml

import java.util.UUID

/**
 * Something with a unique id.
 */
trait Identifiable extends Serializable {

  /**
   * A unique id for the object.
   */
  val uid: String = this.getClass.getSimpleName + "-" + Identifiable.randomUid
}

object Identifiable {

  /**
   * Returns a random uid, drawn uniformly from 4+ billion candidates.
   */
  private def randomUid: String = UUID.randomUUID().toString.take(8)
}
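
A quick sketch of the resulting ids (not part of this commit); the hex suffix is random per object, so the value shown is only an example:

  val pipeline = new Pipeline  // defined later in this commit
  println(pipeline.uid)        // e.g. "Pipeline-3fa8b2c1"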
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
package org.apache.spark.ml

/**
 * Abstract class for models fitted by estimators. A model is itself a transformer.
 */
abstract class Model extends Transformer {
  // def parent: Estimator
  // def trainingParameters: ParamMap
}
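
The commented-out members appear to sketch planned additions; for now a Model is used purely as a Transformer produced by an Estimator, as in this hypothetical sketch:

  // `MyEstimator` and `MyModel` are hypothetical stand-ins.
  val model: MyModel = new MyEstimator().fit(trainingData)
  val output = model.transform(newData)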
Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.ml

import scala.collection.mutable.ListBuffer

import org.apache.spark.sql.SchemaRDD

trait PipelineStage extends Identifiable

/**
 * A simple pipeline, which acts as an estimator.
 */
class Pipeline extends Estimator[PipelineModel] {

  /** Param for the stages of the pipeline. */
  val stages: Param[Array[PipelineStage]] =
    new Param[Array[PipelineStage]](this, "stages", "stages of the pipeline")

  override def fit(dataset: SchemaRDD, paramMap: ParamMap): PipelineModel = {
    val theStages = paramMap(stages)
    // Search for the last estimator: stages after it are transformers that need
    // no fitting, so the training data never has to flow past that point.
    var lastIndexOfEstimator = -1
    theStages.view.zipWithIndex.foreach { case (stage, index) =>
      stage match {
        case _: Estimator[_] =>
          lastIndexOfEstimator = index
        case _ =>
      }
    }
    var curDataset = dataset
    val transformers = ListBuffer.empty[Transformer]
    theStages.view.zipWithIndex.foreach { case (stage, index) =>
      stage match {
        case estimator: Estimator[_] =>
          val transformer = estimator.fit(curDataset, paramMap)
          if (index < lastIndexOfEstimator) {
            curDataset = transformer.transform(curDataset, paramMap)
          }
          transformers += transformer
        case transformer: Transformer =>
          if (index < lastIndexOfEstimator) {
            curDataset = transformer.transform(curDataset, paramMap)
          }
          transformers += transformer
        case _ =>
          throw new IllegalArgumentException(
            s"Do not support stage $stage of type ${stage.getClass}.")
      }
    }
    new PipelineModel(transformers.toArray)
  }

  override def params: Array[Param[_]] = Array.empty
}

/**
 * A fitted pipeline, which transforms a dataset by applying its transformers in order.
 */
class PipelineModel(val transformers: Array[Transformer]) extends Model {

  override def transform(dataset: SchemaRDD, paramMap: ParamMap): SchemaRDD = {
    transformers.foldLeft(dataset) { (dataset, transformer) =>
      transformer.transform(dataset, paramMap)
    }
  }
}
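
Note the `lastIndexOfEstimator` scan above: every stage after the last estimator needs no fitting, so the training data is only transformed as far as that stage, and the tail transformers first run when the fitted PipelineModel is applied. A minimal assembly sketch, not part of this commit; `tokenizer`, `hashingTF`, and `lr` are hypothetical stages (two transformers followed by an estimator), and the ParamPair constructor is assumed:

  val pipeline = new Pipeline
  val map = new ParamMap()
  map.put(new ParamPair(pipeline.stages, Array[PipelineStage](tokenizer, hashingTF, lr)))

  // A Pipeline is itself an Estimator, so it is fitted like any single stage.
  val pipelineModel: PipelineModel = pipeline.fit(trainingData, map)
  val predictions: SchemaRDD = pipelineModel.transform(testData)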
Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.ml

import org.apache.spark.sql.SchemaRDD

/**
 * Abstract class for transformers that transform one dataset into another.
 */
abstract class Transformer extends Identifiable with Params with PipelineStage {

  /**
   * Transforms the dataset with default parameters.
   * @param dataset input dataset
   * @return transformed dataset
   */
  def transform(dataset: SchemaRDD): SchemaRDD = {
    transform(dataset, ParamMap.empty)
  }

  /**
   * Transforms the dataset with the provided parameter map.
   * @param dataset input dataset
   * @param paramMap parameters
   * @return transformed dataset
   */
  def transform(dataset: SchemaRDD, paramMap: ParamMap): SchemaRDD

  /**
   * Transforms the dataset with the provided parameter pairs.
   * @param dataset input dataset
   * @param firstParamPair first parameter pair
   * @param otherParamPairs other parameter pairs
   * @return transformed dataset
   */
  def transform(
      dataset: SchemaRDD,
      firstParamPair: ParamPair[_],
      otherParamPairs: ParamPair[_]*): SchemaRDD = {
    val map = new ParamMap()
    map.put(firstParamPair)
    otherParamPairs.foreach(map.put(_))
    transform(dataset, map)
  }

  /**
   * Transforms the dataset with multiple sets of parameters.
   * @param dataset input dataset
   * @param paramMaps an array of parameter maps
   * @return transformed datasets, one per parameter map
   */
  def transform(dataset: SchemaRDD, paramMaps: Array[ParamMap]): Array[SchemaRDD] = {
    paramMaps.map(transform(dataset, _))
  }
}
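
A minimal usage sketch, not part of this commit; `scaler`, its `inputCol` param, and the ParamPair constructor are assumptions for illustration:

  // Transform with default parameters.
  val scaled = scaler.transform(dataset)

  // Override a parameter for this call only, via the varargs overload.
  val rescaled = scaler.transform(dataset, new ParamPair(scaler.inputCol, "features"))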
Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.ml.example

import org.apache.spark.ml._
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.sql.SchemaRDD
import org.apache.spark.sql.catalyst.expressions.Row

/**
 * Evaluator for binary classification, which expects score and label input columns.
 */
class BinaryClassificationEvaluator extends Evaluator with Params with OwnParamMap {

  /** metric to compute in evaluation */
  final val metricName: Param[String] =
    new Param(this, "metricName", "evaluation metric: areaUnderROC or areaUnderPR", "areaUnderROC")

  /** score column name */
  final val scoreCol: Param[String] = new Param(this, "scoreCol", "score column name", "score")

  /** label column name */
  final val labelCol: Param[String] = new Param(this, "labelCol", "label column name", "label")

  override def evaluate(dataset: SchemaRDD, paramMap: ParamMap): Double = {
    import dataset.sqlContext._
    val map = this.paramMap ++ paramMap
    import map.implicitMapping
    val scoreAndLabels = dataset.select((scoreCol: String).attr, (labelCol: String).attr)
      .map { case Row(score: Double, label: Double) =>
        (score, label)
      }.cache()
    val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    (metricName: String) match {
      case "areaUnderROC" =>
        metrics.areaUnderROC()
      case "areaUnderPR" =>
        metrics.areaUnderPR()
      case other =>
        throw new IllegalArgumentException(s"Do not support metric $other.")
    }
  }
}
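
A minimal usage sketch, not part of this commit; `predictions` is assumed to be a SchemaRDD with Double-valued "score" and "label" columns, matching the param defaults above:

  val evaluator = new BinaryClassificationEvaluator
  // With an empty map the param defaults apply, so this computes areaUnderROC;
  // set metricName to "areaUnderPR" for the other supported metric.
  val auc = evaluator.evaluate(predictions, ParamMap.empty)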
