Commit: cleanup
Ram Sriharsha committed May 31, 2015
1 parent 204c4e3 commit 615e91c
Showing 2 changed files with 4 additions and 4 deletions.
4 changes: 2 additions & 2 deletions examples/src/main/python/ml/cross_validator.py
@@ -48,7 +48,7 @@
# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
-lr = LogisticRegression(maxIter=10, regParam=0.001)
+lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
@@ -65,7 +65,7 @@
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
-                          numFolds=2)
+                          numFolds=2)  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(training)
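For context, the paramGrid passed to the CrossValidator above is built earlier in the example with ParamGridBuilder; with regParam no longer hard-coded on the estimator, the grid is the natural place to search over it. A minimal sketch (the grid values here are illustrative, not taken from this diff):

from pyspark.ml.tuning import ParamGridBuilder

# Treat numFeatures and regParam as hyperparameters to search over;
# CrossValidator will try every combination across its folds.
paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()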
4 changes: 2 additions & 2 deletions examples/src/main/python/ml/simple_params_example.py
@@ -41,8 +41,8 @@

# prepare training data.
# We create an RDD of LabeledPoints and convert them into a DataFrame.
-# Spark DataFrames can automatically infer the schema from named tuples
-# and LabeledPoint implements __reduce__ to behave like a named tuple.
+# A LabeledPoint is an object with two fields named label and features,
+# and Spark SQL identifies these fields and creates the schema appropriately.
training = sc.parallelize([
    LabeledPoint(1.0, DenseVector([0.0, 1.1, 0.1])),
    LabeledPoint(0.0, DenseVector([2.0, 1.0, -1.0])),
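To illustrate the schema inference described in the new comment, a minimal sketch (assumes a SQLContext named sqlContext is in scope, as in the full example):

# Spark SQL reads the label and features fields off each LabeledPoint
# and infers the column types from them.
df = sqlContext.createDataFrame(training)
df.printSchema()
# root
#  |-- label: double (nullable = true)
#  |-- features: vector (nullable = true)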
