
Commit 32173b7

Stats examples update.

Split RandomAndSampledRDDs into RandomRDDGeneration and SampledRDDs. (The name RandomRDDGeneration avoids a naming conflict with RandomRDDs.) RandomRDDGeneration prints the first 5 samples from each generated RDD. Did the same split for Python: random_rdd_generation.py and sampled_rdds.py. Other small updates based on code review.
1 parent c8c20dc commit 32173b7

7 files changed: +291 −116 lines

examples/src/main/python/mllib/correlations.py

Lines changed: 4 additions & 4 deletions
@@ -39,14 +39,14 @@
     corrType = 'pearson'

     points = MLUtils.loadLibSVMFile(sc, filepath)\
-        .map(lambda lp: LabeledPoint(lp.label, lp.features.toDense()))
+        .map(lambda lp: LabeledPoint(lp.label, lp.features.toArray()))

-    print ''
+    print
     print 'Summary of data file: ' + filepath
     print '%d data points' % points.count()

     # Statistics (correlations)
-    print ''
+    print
     print 'Correlation (%s) between label and each feature' % corrType
     print 'Feature\tCorrelation'
     numFeatures = points.take(1)[0].features.size
@@ -55,6 +55,6 @@
         featureRDD = points.map(lambda lp: lp.features[i])
         corr = Statistics.corr(labelRDD, featureRDD, corrType)
         print '%d\t%g' % (i, corr)
-    print ''
+    print

     sc.stop()
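Context for the toDense() → toArray() change above: pyspark's SparseVector exposes toArray(), which materializes a dense NumPy array, and has no toDense() method as far as we know, which is presumably what this fix addresses. A minimal sketch, in Python 2 style to match the examples:

    from pyspark.mllib.linalg import SparseVector

    sv = SparseVector(4, {0: 1.0, 3: 5.5})  # 4-dim vector with two nonzero entries
    print sv.toArray()                      # dense numpy array: [ 1.   0.   0.   5.5]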
examples/src/main/python/mllib/random_rdd_generation.py

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+Randomly generated RDDs.
+"""
+
+import sys
+
+from pyspark import SparkContext
+from pyspark.mllib.random import RandomRDDs
+
+
+if __name__ == "__main__":
+    if len(sys.argv) not in [1, 2]:
+        print >> sys.stderr, "Usage: random_rdd_generation"
+        exit(-1)
+
+    sc = SparkContext(appName="PythonRandomRDDGeneration")
+
+    numExamples = 10000  # number of examples to generate
+    fraction = 0.1  # fraction of data to sample
+
+    # Example: RandomRDDs.normalRDD
+    normalRDD = RandomRDDs.normalRDD(sc, numExamples)
+    print 'Generated RDD of %d examples sampled from the standard normal distribution'\
+        % normalRDD.count()
+    print '  First 5 samples:'
+    for sample in normalRDD.take(5):
+        print '  ' + str(sample)
+    print
+
+    # Example: RandomRDDs.normalVectorRDD
+    normalVectorRDD = RandomRDDs.normalVectorRDD(sc, numRows = numExamples, numCols = 2)
+    print 'Generated RDD of %d examples of length-2 vectors.' % normalVectorRDD.count()
+    print '  First 5 samples:'
+    for sample in normalVectorRDD.take(5):
+        print '  ' + str(sample)
+    print
+
+    sc.stop()
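For reference, the 1.1-era RandomRDDs API in pyspark.mllib.random also provides uniform and Poisson generators alongside the normal ones used in this new file (method names worth verifying against your Spark version). A minimal sketch:

    from pyspark import SparkContext
    from pyspark.mllib.random import RandomRDDs

    sc = SparkContext(appName="RandomRDDsSketch")
    uniformRDD = RandomRDDs.uniformRDD(sc, 100)                 # 100 draws from U[0, 1]
    poissonRDD = RandomRDDs.poissonRDD(sc, mean=2.0, size=100)  # 100 draws from Poisson(2.0)
    print uniformRDD.take(5)
    print poissonRDD.take(5)
    sc.stop()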

examples/src/main/python/mllib/random_and_sampled_rdds.py renamed to examples/src/main/python/mllib/sampled_rdds.py

Lines changed: 12 additions & 22 deletions
@@ -16,61 +16,49 @@
 #

 """
-Randomly generated and sampled RDDs.
+Randomly sampled RDDs.
 """

 import sys

 from pyspark import SparkContext
-from pyspark.mllib.random import RandomRDDGenerators
 from pyspark.mllib.util import MLUtils


-
 if __name__ == "__main__":
     if len(sys.argv) not in [1, 2]:
-        print >> sys.stderr, "Usage: random_and_sampled_rdds <libsvm data file>"
+        print >> sys.stderr, "Usage: sampled_rdds <libsvm data file>"
         exit(-1)
     if len(sys.argv) == 2:
         datapath = sys.argv[1]
     else:
         datapath = 'data/mllib/sample_binary_classification_data.txt'

-    sc = SparkContext(appName="PythonRandomAndSampledRDDs")
-
-    points = MLUtils.loadLibSVMFile(sc, datapath)
+    sc = SparkContext(appName="PythonSampledRDDs")

-    numExamples = 10000  # number of examples to generate
     fraction = 0.1  # fraction of data to sample

-    # Example: RandomRDDGenerators
-    normalRDD = RandomRDDGenerators.normalRDD(sc, numExamples)
-    print 'Generated RDD of %d examples sampled from the standard normal distribution'\
-        % normalRDD.count()
-    normalVectorRDD = RandomRDDGenerators.normalVectorRDD(sc, numRows = numExamples, numCols = 2)
-    print 'Generated RDD of %d examples of length-2 vectors.' % normalVectorRDD.count()
-
-    print
+    examples = MLUtils.loadLibSVMFile(sc, datapath)
+    numExamples = examples.count()
+    print 'Loaded data with %d examples from file: %s' % (numExamples, datapath)

     # Example: RDD.sample() and RDD.takeSample()
     expectedSampleSize = int(numExamples * fraction)
     print 'Sampling RDD using fraction %g.  Expected sample size = %d.' \
         % (fraction, expectedSampleSize)
-    sampledRDD = normalRDD.sample(withReplacement = True, fraction = fraction)
+    sampledRDD = examples.sample(withReplacement = True, fraction = fraction)
     print '  RDD.sample(): sample has %d examples' % sampledRDD.count()
-    sampledArray = normalRDD.takeSample(withReplacement = True, num = expectedSampleSize)
+    sampledArray = examples.takeSample(withReplacement = True, num = expectedSampleSize)
     print '  RDD.takeSample(): sample has %d examples' % len(sampledArray)

     print

     # Example: RDD.sampleByKey()
-    examples = MLUtils.loadLibSVMFile(sc, datapath)
-    sizeA = examples.count()
-    print 'Loaded data with %d examples from file: %s' % (sizeA, datapath)
     keyedRDD = examples.map(lambda lp: (int(lp.label), lp.features))
     print '  Keyed data using label (Int) as key ==> Orig'
     # Count examples per label in original data.
     keyCountsA = keyedRDD.countByKey()
+
     # Subsample, and count examples per label in sampled data.
     fractions = {}
     for k in keyCountsA.keys():
@@ -80,9 +68,11 @@
     sizeB = sum(keyCountsB.values())
     print '  Sampled %d examples using approximate stratified sampling (by label). ==> Sample' \
         % sizeB
+
+    # Compare samples
     print '  \tFractions of examples with key'
     print 'Key\tOrig\tSample'
     for k in sorted(keyCountsA.keys()):
-        print '%d\t%g\t%g' % (k, keyCountsA[k] / float(sizeA), keyCountsB[k] / float(sizeB))
+        print '%d\t%g\t%g' % (k, keyCountsA[k] / float(numExamples), keyCountsB[k] / float(sizeB))

     sc.stop()
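The sampleByKey() call itself falls between the two hunks above, so it is not shown; for reference, a minimal self-contained sketch of the same stratified-sampling RDD API, with made-up keys and per-key fractions:

    from pyspark import SparkContext

    sc = SparkContext(appName="SampleByKeySketch")
    pairs = sc.parallelize([(0, 'a'), (0, 'b'), (1, 'c'), (1, 'd'), (1, 'e')])
    fractions = {0: 0.5, 1: 0.4}  # expected sampling fraction per key
    sample = pairs.sampleByKey(withReplacement=False, fractions=fractions)
    print sample.countByKey()     # counts are approximate, not exact
    sc.stop()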
examples/src/main/scala/org/apache/spark/examples/mllib/Correlations.scala

Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib
+
+import scopt.OptionParser
+
+import org.apache.spark.mllib.stat.Statistics
+import org.apache.spark.mllib.util.MLUtils
+import org.apache.spark.{SparkConf, SparkContext}
+
+
+/**
+ * An example app for computing correlations. Run with
+ * {{{
+ * bin/run-example org.apache.spark.examples.mllib.Correlations
+ * }}}
+ * By default, this loads a synthetic dataset from `data/mllib/sample_linear_regression_data.txt`.
+ * If you use it as a template to create your own app, please use `spark-submit` to submit your app.
+ */
+object Correlations {
+
+  case class Params(input: String = "data/mllib/sample_linear_regression_data.txt")
+
+  def main(args: Array[String]) {
+
+    val defaultParams = Params()
+
+    val parser = new OptionParser[Params]("Correlations") {
+      head("Correlations: an example app for computing correlations")
+      opt[String]("input")
+        .text(s"Input path to labeled examples in LIBSVM format, default: ${defaultParams.input}")
+        .action((x, c) => c.copy(input = x))
+      note(
+        """
+        |For example, the following command runs this app on a synthetic dataset:
+        |
+        | bin/spark-submit --class org.apache.spark.examples.mllib.Correlations \
+        |  examples/target/scala-*/spark-examples-*.jar \
+        |  --input data/mllib/sample_linear_regression_data.txt
+        """.stripMargin)
+    }
+
+    parser.parse(args, defaultParams).map { params =>
+      run(params)
+    } getOrElse {
+      sys.exit(1)
+    }
+  }
+
+  def run(params: Params) {
+    val conf = new SparkConf().setAppName(s"Correlations with $params")
+    val sc = new SparkContext(conf)
+
+    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()
+
+    println(s"Summary of data file: ${params.input}")
+    println(s"${examples.count()} data points")
+
+    // Calculate label -- feature correlations
+    val labelRDD = examples.map(_.label)
+    val numFeatures = examples.take(1)(0).features.size
+    val corrType = "pearson"
+    println()
+    println(s"Correlation ($corrType) between label and each feature")
+    println(s"Feature\tCorrelation")
+    var feature = 0
+    while (feature < numFeatures) {
+      val featureRDD = examples.map(_.features(feature))
+      val corr = Statistics.corr(labelRDD, featureRDD)
+      println(s"$feature\t$corr")
+      feature += 1
+    }
+    println()
+
+    sc.stop()
+  }
+}
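The label-vs-feature loop in run() has a direct pyspark counterpart; a minimal sketch, assuming a hypothetical points RDD of LabeledPoints (e.g. from MLUtils.loadLibSVMFile):

    from pyspark.mllib.stat import Statistics

    # `points` is assumed to be an RDD[LabeledPoint], e.g. MLUtils.loadLibSVMFile(sc, path)
    labelRDD = points.map(lambda lp: lp.label)
    numFeatures = points.take(1)[0].features.size
    for i in range(numFeatures):
        featureRDD = points.map(lambda lp: lp.features[i])
        print '%d\t%g' % (i, Statistics.corr(labelRDD, featureRDD, 'pearson'))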

examples/src/main/scala/org/apache/spark/examples/mllib/StatisticalSummary.scala renamed to examples/src/main/scala/org/apache/spark/examples/mllib/MultivariateSummarizer.scala

Lines changed: 30 additions & 54 deletions
@@ -17,53 +17,61 @@

 package org.apache.spark.examples.mllib

-import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.rdd.RDD
 import scopt.OptionParser

 import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.mllib.stat.{MultivariateOnlineSummarizer, Statistics}
+import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
 import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.{SparkConf, SparkContext}


 /**
  * An example app for summarizing multivariate data from a file. Run with
  * {{{
- * bin/run-example org.apache.spark.examples.mllib.Statistics
+ * bin/run-example org.apache.spark.examples.mllib.MultivariateSummarizer
  * }}}
  * By default, this loads a synthetic dataset from `data/mllib/sample_linear_regression_data.txt`.
  * If you use it as a template to create your own app, please use `spark-submit` to submit your app.
  */
-object StatisticalSummary extends App {
+object MultivariateSummarizer {

   case class Params(input: String = "data/mllib/sample_linear_regression_data.txt")

-  val defaultParams = Params()
+  def main(args: Array[String]) {

-  val parser = new OptionParser[Params]("StatisticalSummary") {
-    head("StatisticalSummary: an example app for MultivariateOnlineSummarizer and Statistics" +
-      " (correlation)")
-    opt[String]("input")
-      .text(s"Input path to labeled examples in LIBSVM format, default: ${defaultParams.input}")
-      .action((x, c) => c.copy(input = x))
-    note(
-      """
+    val defaultParams = Params()
+
+    val parser = new OptionParser[Params]("MultivariateSummarizer") {
+      head("MultivariateSummarizer: an example app for MultivariateOnlineSummarizer")
+      opt[String]("input")
+        .text(s"Input path to labeled examples in LIBSVM format, default: ${defaultParams.input}")
+        .action((x, c) => c.copy(input = x))
+      note(
+        """
         |For example, the following command runs this app on a synthetic dataset:
         |
-        | bin/spark-submit --class org.apache.spark.examples.mllib.StatisticalSummary \
+        | bin/spark-submit --class org.apache.spark.examples.mllib.MultivariateSummarizer \
         |  examples/target/scala-*/spark-examples-*.jar \
         |  --input data/mllib/sample_linear_regression_data.txt
-      """.stripMargin)
-  }
+        """.stripMargin)
+    }

-  parser.parse(args, defaultParams).map { params =>
-    run(params)
-  } getOrElse {
-    sys.exit(1)
+    parser.parse(args, defaultParams).map { params =>
+      run(params)
+    } getOrElse {
+      sys.exit(1)
+    }
   }

-  def runStatisticalSummary(examples: RDD[LabeledPoint], params: Params) {
+  def run(params: Params) {
+    val conf = new SparkConf().setAppName(s"MultivariateSummarizer with $params")
+    val sc = new SparkContext(conf)
+
+    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()
+
+    println(s"Summary of data file: ${params.input}")
+    println(s"${examples.count()} data points")
+
     // Summarize labels
     val labelSummary = examples.aggregate(new MultivariateOnlineSummarizer())(
       (summary, lp) => summary.add(Vectors.dense(lp.label)),
@@ -84,38 +92,6 @@ object StatisticalSummary extends App {
     println(s"max\t${labelSummary.max(0)}\t${featureSummary.max.toArray.mkString("\t")}")
     println(s"min\t${labelSummary.min(0)}\t${featureSummary.min.toArray.mkString("\t")}")
     println()
-  }
-
-  def runCorrelations(examples: RDD[LabeledPoint], params: Params) {
-    // Calculate label -- feature correlations
-    val labelRDD = examples.map(_.label)
-    val numFeatures = examples.take(1)(0).features.size
-    val corrType = "pearson"
-    println()
-    println(s"Correlation ($corrType) between label and each feature")
-    println(s"Feature\tCorrelation")
-    var feature = 0
-    while (feature < numFeatures) {
-      val featureRDD = examples.map(_.features(feature))
-      val corr = Statistics.corr(labelRDD, featureRDD)
-      println(s"$feature\t$corr")
-      feature += 1
-    }
-    println()
-  }
-
-  def run(params: Params) {
-    val conf = new SparkConf().setAppName(s"StatisticalSummary with $params")
-    val sc = new SparkContext(conf)
-
-    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()
-
-    println(s"Summary of data file: ${params.input}")
-    println(s"${examples.count} data points")
-
-    runStatisticalSummary(examples, params)
-
-    runCorrelations(examples, params)

     sc.stop()
   }
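MultivariateOnlineSummarizer is used here from Scala; the closest Python route to the same column-wise summary is Statistics.colStats in pyspark.mllib.stat, which appeared in a later release than this commit, so verify it exists in your version. A minimal sketch, assuming an existing SparkContext sc:

    from pyspark.mllib.stat import Statistics

    # `sc` is assumed to be an existing SparkContext; list rows become dense vectors
    mat = sc.parallelize([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
    summary = Statistics.colStats(mat)
    print summary.mean()         # per-column means
    print summary.variance()     # per-column variances
    print summary.numNonzeros()  # per-column nonzero counts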
