Skip to content

Commit a0ff6d1

Browse files
yanboliang
authored and jkbradley committed
[SPARK-11978][ML] Move dataset_example.py to examples/ml and rename to dataframe_example.py
Since ```Dataset``` has a new meaning in Spark 1.6, we should rename it to avoid confusion. #9873 finished the work of Scala example, here we focus on the Python one. Move dataset_example.py to ```examples/ml``` and rename to ```dataframe_example.py```. BTW, fix minor missing issues of #9873. cc mengxr Author: Yanbo Liang <ybliang8@gmail.com> Closes #9957 from yanboliang/SPARK-11978.
1 parent aea676c commit a0ff6d1

File tree

2 files changed

+38
-26
lines changed

2 files changed

+38
-26
lines changed

examples/src/main/python/mllib/dataset_example.py renamed to examples/src/main/python/ml/dataframe_example.py

Lines changed: 34 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@
1616
#
1717

1818
"""
19-
An example of how to use DataFrame as a dataset for ML. Run with::
20-
bin/spark-submit examples/src/main/python/mllib/dataset_example.py
19+
An example of how to use DataFrame for ML. Run with::
20+
bin/spark-submit examples/src/main/python/ml/dataframe_example.py <input>
2121
"""
2222
from __future__ import print_function
2323

@@ -28,36 +28,48 @@
2828

2929
from pyspark import SparkContext
3030
from pyspark.sql import SQLContext
31-
from pyspark.mllib.util import MLUtils
3231
from pyspark.mllib.stat import Statistics
3332

34-
35-
def summarize(dataset):
36-
print("schema: %s" % dataset.schema().json())
37-
labels = dataset.map(lambda r: r.label)
38-
print("label average: %f" % labels.mean())
39-
features = dataset.map(lambda r: r.features)
40-
summary = Statistics.colStats(features)
41-
print("features average: %r" % summary.mean())
42-
4333
if __name__ == "__main__":
4434
if len(sys.argv) > 2:
45-
print("Usage: dataset_example.py <libsvm file>", file=sys.stderr)
35+
print("Usage: dataframe_example.py <libsvm file>", file=sys.stderr)
4636
exit(-1)
47-
sc = SparkContext(appName="DatasetExample")
37+
sc = SparkContext(appName="DataFrameExample")
4838
sqlContext = SQLContext(sc)
4939
if len(sys.argv) == 2:
5040
input = sys.argv[1]
5141
else:
5242
input = "data/mllib/sample_libsvm_data.txt"
53-
points = MLUtils.loadLibSVMFile(sc, input)
54-
dataset0 = sqlContext.inferSchema(points).setName("dataset0").cache()
55-
summarize(dataset0)
43+
44+
# Load input data
45+
print("Loading LIBSVM file with UDT from " + input + ".")
46+
df = sqlContext.read.format("libsvm").load(input).cache()
47+
print("Schema from LIBSVM:")
48+
df.printSchema()
49+
print("Loaded training data as a DataFrame with " +
50+
str(df.count()) + " records.")
51+
52+
# Show statistical summary of labels.
53+
labelSummary = df.describe("label")
54+
labelSummary.show()
55+
56+
# Convert features column to an RDD of vectors.
57+
features = df.select("features").map(lambda r: r.features)
58+
summary = Statistics.colStats(features)
59+
print("Selected features column with average values:\n" +
60+
str(summary.mean()))
61+
62+
# Save the records in a parquet file.
5663
tempdir = tempfile.NamedTemporaryFile(delete=False).name
5764
os.unlink(tempdir)
58-
print("Save dataset as a Parquet file to %s." % tempdir)
59-
dataset0.saveAsParquetFile(tempdir)
60-
print("Load it back and summarize it again.")
61-
dataset1 = sqlContext.parquetFile(tempdir).setName("dataset1").cache()
62-
summarize(dataset1)
65+
print("Saving to " + tempdir + " as Parquet file.")
66+
df.write.parquet(tempdir)
67+
68+
# Load the records back.
69+
print("Loading Parquet file with UDT from " + tempdir)
70+
newDF = sqlContext.read.parquet(tempdir)
71+
print("Schema from Parquet:")
72+
newDF.printSchema()
6373
shutil.rmtree(tempdir)
74+
75+
sc.stop()

examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,10 +44,10 @@ object DataFrameExample {
4444
def main(args: Array[String]) {
4545
val defaultParams = Params()
4646

47-
val parser = new OptionParser[Params]("DatasetExample") {
48-
head("Dataset: an example app using DataFrame as a Dataset for ML.")
47+
val parser = new OptionParser[Params]("DataFrameExample") {
48+
head("DataFrameExample: an example app using DataFrame for ML.")
4949
opt[String]("input")
50-
.text(s"input path to dataset")
50+
.text(s"input path to dataframe")
5151
.action((x, c) => c.copy(input = x))
5252
checkConfig { params =>
5353
success
@@ -88,7 +88,7 @@ object DataFrameExample {
8888
// Save the records in a parquet file.
8989
val tmpDir = Files.createTempDir()
9090
tmpDir.deleteOnExit()
91-
val outputDir = new File(tmpDir, "dataset").toString
91+
val outputDir = new File(tmpDir, "dataframe").toString
9292
println(s"Saving to $outputDir as Parquet file.")
9393
df.write.parquet(outputDir)
9494

0 commit comments

Comments
 (0)