
Commit 615e91c

Author: Ram Sriharsha
Commit message: cleanup
1 parent 204c4e3, commit 615e91c

File tree: 2 files changed (+4, -4 lines)

examples/src/main/python/ml/cross_validator.py
2 additions, 2 deletions

@@ -48,7 +48,7 @@
     # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr.
     tokenizer = Tokenizer(inputCol="text", outputCol="words")
     hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
-    lr = LogisticRegression(maxIter=10, regParam=0.001)
+    lr = LogisticRegression(maxIter=10)
     pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

     # We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
@@ -65,7 +65,7 @@
     crossval = CrossValidator(estimator=pipeline,
                               estimatorParamMaps=paramGrid,
                               evaluator=BinaryClassificationEvaluator(),
-                              numFolds=2)
+                              numFolds=2)  # use 3+ folds in practice

     # Run cross-validation, and choose the best set of parameters.
     cvModel = crossval.fit(training)
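The second hunk adds a comment recommending 3+ folds for `CrossValidator`. To see why the fold count matters, here is a minimal pure-Python sketch of k-fold index splitting (an illustration only, not Spark's implementation; `k_fold_indices` is a hypothetical helper): each fold is held out once for evaluation while the model trains on the rest, so with `numFolds=2` every candidate model trains on only half the data.

```python
def k_fold_indices(n, num_folds):
    """Yield (train, validation) index lists for each of num_folds folds."""
    # Round-robin assignment of the n indices to num_folds folds.
    folds = [list(range(i, n, num_folds)) for i in range(num_folds)]
    for held_out in range(num_folds):
        validation = folds[held_out]
        # Train on the indices of every other fold.
        train = [i for f, fold in enumerate(folds) if f != held_out
                 for i in fold]
        yield train, validation

# With 2 folds each model trains on n/2 points; with 3 folds it trains
# on 2n/3, which is why the diff suggests 3+ folds in practice.
for train, val in k_fold_indices(6, 2):
    print(len(train), len(val))
```

Each index appears in exactly one validation fold, so every data point is used for evaluation exactly once across the folds.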

examples/src/main/python/ml/simple_params_example.py
2 additions, 2 deletions

@@ -41,8 +41,8 @@

     # prepare training data.
     # We create an RDD of LabeledPoints and convert them into a DataFrame.
-    # Spark DataFrames can automatically infer the schema from named tuples
-    # and LabeledPoint implements __reduce__ to behave like a named tuple.
+    # A LabeledPoint is an Object with two fields named label and features
+    # and Spark SQL identifies these fields and creates the schema appropriately.
     training = sc.parallelize([
         LabeledPoint(1.0, DenseVector([0.0, 1.1, 0.1])),
         LabeledPoint(0.0, DenseVector([2.0, 1.0, -1.0])),
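The rewritten comment says Spark SQL derives the DataFrame schema from `LabeledPoint`'s two fields, `label` and `features`. The sketch below is a toy analogy in plain Python, not Spark's actual inference code: a simplified stand-in class and a hypothetical `infer_schema` helper show how named object fields are enough to derive column names and types.

```python
class LabeledPoint:
    """Simplified stand-in for pyspark.mllib.regression.LabeledPoint."""
    def __init__(self, label, features):
        self.label = label
        self.features = features

def infer_schema(point):
    # Map each public field name to its value's type name, mimicking
    # how a schema (column name -> column type) can be read off an object.
    return {name: type(value).__name__
            for name, value in vars(point).items()}

schema = infer_schema(LabeledPoint(1.0, [0.0, 1.1, 0.1]))
print(schema)  # {'label': 'float', 'features': 'list'}
```

In real PySpark the inference is richer (it handles `DenseVector`, nullability, and nested types), but the principle the comment describes is the same: field names become column names.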
