Skip to content

Commit

Permalink
[SPARK-15449][MLLIB][EXAMPLE] Wrong Data Format - Documentation Issue
Browse files Browse the repository at this point in the history
## What changes were proposed in this pull request?

(Please fill in changes proposed in this fix)
In the MLlib Naive Bayes example, the Scala and Python examples don't use libsvm data, but the Java example does.

This change updates the Scala and Python examples to load the same libsvm data as the Java example.

## How was this patch tested?

Manual tests

Author: wm624@hotmail.com <wm624@hotmail.com>

Closes apache#13301 from wangmiao1981/example.
  • Loading branch information
wangmiao1981 authored and srowen committed May 28, 2016
1 parent 4a2fb8b commit 5d4dafe
Show file tree
Hide file tree
Showing 4 changed files with 10 additions and 33 deletions.
12 changes: 0 additions & 12 deletions data/mllib/sample_naive_bayes_data.txt

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ public static void main(String[] args) {
SparkConf sparkConf = new SparkConf().setAppName("JavaNaiveBayesExample");
JavaSparkContext jsc = new JavaSparkContext(sparkConf);
// $example on$
String path = "data/mllib/sample_naive_bayes_data.txt";
String path = "data/mllib/sample_libsvm_data.txt";
JavaRDD<LabeledPoint> inputData = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();
JavaRDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{0.6, 0.4}, 12345);
JavaRDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{0.6, 0.4});
JavaRDD<LabeledPoint> training = tmp[0]; // training set
JavaRDD<LabeledPoint> test = tmp[1]; // test set
final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);
Expand Down
13 changes: 4 additions & 9 deletions examples/src/main/python/mllib/naive_bayes_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,26 +29,21 @@
from pyspark import SparkContext
# $example on$
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import MLUtils


def parseLine(line):
parts = line.split(',')
label = float(parts[0])
features = Vectors.dense([float(x) for x in parts[1].split(' ')])
return LabeledPoint(label, features)
# $example off$

if __name__ == "__main__":

sc = SparkContext(appName="PythonNaiveBayesExample")

# $example on$
data = sc.textFile('data/mllib/sample_naive_bayes_data.txt').map(parseLine)
# Load and parse the data file.
data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")

# Split data approximately into training (60%) and test (40%)
training, test = data.randomSplit([0.6, 0.4], seed=0)
training, test = data.randomSplit([0.6, 0.4])

# Train a naive Bayes model.
model = NaiveBayes.train(training, 1.0)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ package org.apache.spark.examples.mllib
import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
// $example off$

object NaiveBayesExample {
Expand All @@ -31,16 +30,11 @@ object NaiveBayesExample {
val conf = new SparkConf().setAppName("NaiveBayesExample")
val sc = new SparkContext(conf)
// $example on$
val data = sc.textFile("data/mllib/sample_naive_bayes_data.txt")
val parsedData = data.map { line =>
val parts = line.split(',')
LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
}
// Load and parse the data file.
val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")

// Split data into training (60%) and test (40%).
val splits = parsedData.randomSplit(Array(0.6, 0.4), seed = 11L)
val training = splits(0)
val test = splits(1)
val Array(training, test) = data.randomSplit(Array(0.6, 0.4))

val model = NaiveBayes.train(training, lambda = 1.0, modelType = "multinomial")

Expand Down

0 comments on commit 5d4dafe

Please sign in to comment.