Skip to content

Commit 1cdd7b5

Browse files
committed
Add parse function
1 parent 65bbbe9 commit 1cdd7b5

File tree

1 file changed

+10
-13
lines changed

1 file changed

+10
-13
lines changed

docs/mllib-naive-bayes.md

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -120,19 +120,16 @@ from pyspark.mllib.classification import NaiveBayes
120120
from pyspark.mllib.linalg import Vectors
121121
from pyspark.mllib.regression import LabeledPoint
122122

123-
data = sc.textFile("data/mllib/sample_naive_bayes_data.txt")
124-
125-
# Preprocessing
126-
splitData = data.map(lambda line: line.split(','))
127-
parsedData = splitData.map(
128-
lambda parts: LabeledPoint(
129-
float(parts[0]),
130-
Vectors.dense(map(lambda x: float(x), parts[1].split(' ')))
131-
)
132-
)
133-
134-
# Split data into training (60%) and test (40%)
135-
training, test = parsedData.randomSplit([0.6, 0.4], seed = 0)
123+
def parseLine(line):
124+
parts = line.split(',')
125+
label = float(parts[0])
126+
features = Vectors.dense([float(x) for x in parts[1].split(' ')])
127+
return LabeledPoint(label, features)
128+
129+
data = sc.textFile('data/mllib/sample_naive_bayes_data.txt').map(parseLine)
130+
131+
# Split data approximately into training (60%) and test (40%)
132+
training, test = data.randomSplit([0.6, 0.4], seed = 0)
136133

137134
# Train a naive Bayes model.
138135
model = NaiveBayes.train(training, 1.0)

0 commit comments

Comments
 (0)