Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-13019][Docs] fix for scala-2.10 build: Replace example code in mllib-statistics.md using include_example #11901

Closed
wants to merge 28 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
49b7012
[SPARK-13019] raplce for summary staticstics, scala code
keypointt Feb 2, 2016
83592bc
[SPARK-13019] test out on/off, for import part
keypointt Feb 2, 2016
069341b
[SPARK-13019] create separate example files, but cannot compile yet
keypointt Feb 3, 2016
2058b16
[SPARK-13019] move new files into mllib folder
keypointt Feb 3, 2016
b328542
[SPARK-13019] remote python init files
keypointt Feb 3, 2016
12fda2b
[SPARK-13019] comment broken code to pass complie process
keypointt Feb 3, 2016
2abfaa9
[SPARK-13019] remove code block tag
keypointt Feb 4, 2016
157da53
[SPARK-13019] make commented code explicit in html content
keypointt Feb 4, 2016
323304f
[SPARK-13019] Stratified Sampling working
keypointt Feb 4, 2016
3692d30
[SPARK-13019] hypothesis testing working
keypointt Feb 5, 2016
89c3d2e
[SPARK-13019] Hypothesis Testing Kolmogorov Smirnov Test Example is w…
keypointt Feb 5, 2016
4dbbc6d
[SPARK-13019] remove empty lines
keypointt Feb 5, 2016
f024fc3
[SPARK-13019] random data generation example working
keypointt Feb 5, 2016
6f949cd
[SPARK-13019] Kernel Density Estimation Example is working
keypointt Feb 7, 2016
a4dd0fb
[SPARK-13019] code style check
keypointt Feb 7, 2016
3a11802
[SPARK-13019] fix python style
keypointt Feb 12, 2016
0df3e65
[SPARK-13019] remove setMaster, change java to 2-indent
keypointt Feb 16, 2016
d817d0b
[SPARK-13019] more java style fix
keypointt Feb 20, 2016
f945222
[SPARK-13019] mainly re-organize java import
keypointt Feb 21, 2016
aec10ca
[SPARK-13019] re-organize python import
keypointt Feb 23, 2016
e2737ee
[SPARK-13019] code review improvement
keypointt Mar 6, 2016
3329394
[SPARK-13019] sorry, forget to delete python file
keypointt Mar 6, 2016
acf7096
[SPARK-13019] removing '-'s
keypointt Mar 7, 2016
a4eb28d
[SPARK-13019] use asList() for concise code
keypointt Mar 17, 2016
fd6d786
Merge remote-tracking branch 'spark/master' into SPARK-13019
keypointt Mar 22, 2016
892fe60
[SPARK-13019] fix arguments passing for 2.10
keypointt Mar 22, 2016
87b2c56
Merge remote-tracking branch 'spark/master' into SPARK-13019
keypointt Mar 22, 2016
ceebd36
[SPARK-13019] remove variable 'seed'
keypointt Mar 22, 2016
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
438 changes: 56 additions & 382 deletions docs/mllib-statistics.md

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.examples.mllib;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import java.util.Arrays;

import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.linalg.Matrix;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.stat.Statistics;
// $example off$

/**
 * Example program that computes correlations with {@code Statistics.corr}: first between two
 * series of doubles, then across the columns of an RDD of vectors. Both calls request the
 * "pearson" method and print their result to standard output.
 */
public class JavaCorrelationsExample {
  public static void main(String[] args) {

    SparkConf sparkConf = new SparkConf().setAppName("JavaCorrelationsExample");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);

    // $example on$
    // the first series
    JavaDoubleRDD seriesX = jsc.parallelizeDoubles(Arrays.asList(1.0, 2.0, 3.0, 3.0, 5.0));

    // the second series; it must have the same number of partitions and cardinality as seriesX
    JavaDoubleRDD seriesY = jsc.parallelizeDoubles(Arrays.asList(11.0, 22.0, 33.0, 33.0, 555.0));

    // Correlate the two series with Pearson's method; pass "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    double pearson = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
    System.out.println("Correlation is: " + pearson);

    // note that each Vector is a row and not a column
    JavaRDD<Vector> rows = jsc.parallelize(
      Arrays.asList(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(5.0, 33.0, 366.0)
      )
    );

    // Build the correlation matrix with Pearson's method; pass "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    Matrix correlationMatrix = Statistics.corr(rows.rdd(), "pearson");
    System.out.println(correlationMatrix);
    // $example off$

    jsc.stop();
  }
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.examples.mllib;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

// $example on$
import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.linalg.Matrices;
import org.apache.spark.mllib.linalg.Matrix;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.stat.Statistics;
import org.apache.spark.mllib.stat.test.ChiSqTestResult;
// $example off$

/**
 * Example program that calls {@code Statistics.chiSqTest} on three kinds of input: a vector of
 * event frequencies, a contingency matrix, and an RDD of labeled points. Every test result is
 * printed to standard output.
 */
public class JavaHypothesisTestingExample {
  public static void main(String[] args) {

    SparkConf sparkConf = new SparkConf().setAppName("JavaHypothesisTestingExample");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);

    // $example on$
    // a vector composed of the frequencies of events
    Vector frequencies = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25);

    // Goodness-of-fit test. When no second vector to test against is supplied,
    // the test runs against a uniform distribution.
    ChiSqTestResult goodnessOfFit = Statistics.chiSqTest(frequencies);
    // The printed summary includes the p-value, degrees of freedom, test statistic,
    // the method used, and the null hypothesis.
    System.out.println(goodnessOfFit + "\n");

    // Contingency matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)): 3 rows, 2 columns,
    // with the values listed one column after the other.
    double[] contingencyValues = {1.0, 3.0, 5.0, 2.0, 4.0, 6.0};
    Matrix contingency = Matrices.dense(3, 2, contingencyValues);

    // Pearson's independence test on the contingency matrix
    ChiSqTestResult independence = Statistics.chiSqTest(contingency);
    // summary of the test including the p-value, degrees of freedom...
    System.out.println(independence + "\n");

    // an RDD of labeled points
    JavaRDD<LabeledPoint> observations = jsc.parallelize(
      Arrays.asList(
        new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
        new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
        new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))
      )
    );

    // The contingency table is constructed from the raw (feature, label) pairs and used to
    // conduct the independence test. The returned array holds one ChiSqTestResult per feature,
    // each tested against the label.
    ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(observations.rdd());
    for (int column = 0; column < featureTestResults.length; column++) {
      // columns are reported 1-based
      System.out.println("Column " + (column + 1) + ":");
      System.out.println(featureTestResults[column] + "\n");  // summary of the test
    }
    // $example off$

    jsc.stop();
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.examples.mllib;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import java.util.Arrays;

import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.mllib.stat.Statistics;
import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult;
// $example off$

/**
 * Example program that runs {@code Statistics.kolmogorovSmirnovTest} on a small sample of
 * doubles against the "norm" distribution and prints the test result.
 */
public class JavaHypothesisTestingKolmogorovSmirnovTestExample {
  public static void main(String[] args) {

    String appName = "JavaHypothesisTestingKolmogorovSmirnovTestExample";
    JavaSparkContext jsc = new JavaSparkContext(new SparkConf().setAppName(appName));

    // $example on$
    JavaDoubleRDD sample = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25));
    // test the sample against the "norm" distribution; 0.0 and 1.0 are passed as its parameters
    KolmogorovSmirnovTestResult ksResult =
      Statistics.kolmogorovSmirnovTest(sample, "norm", 0.0, 1.0);
    // The printed summary includes the p-value, test statistic, and null hypothesis.
    // If the p-value indicates significance, the null hypothesis can be rejected.
    System.out.println(ksResult);
    // $example off$

    jsc.stop();
  }
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.examples.mllib;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.stat.KernelDensity;
// $example off$

/**
 * Example program that builds a {@code KernelDensity} estimator from an RDD of sample values
 * and prints the density estimates at a few evaluation points.
 */
public class JavaKernelDensityEstimationExample {
  public static void main(String[] args) {

    SparkConf sparkConf = new SparkConf().setAppName("JavaKernelDensityEstimationExample");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);

    // $example on$
    // an RDD of sample data
    JavaRDD<Double> samples = jsc.parallelize(
      Arrays.asList(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0));

    // Configure the density estimator with the sample data
    // and a standard deviation for the Gaussian kernels
    KernelDensity estimator = new KernelDensity()
      .setSample(samples)
      .setBandwidth(3.0);

    // the values at which density estimates are requested
    double[] evaluationPoints = {-1.0, 2.0, 5.0};
    double[] densities = estimator.estimate(evaluationPoints);

    System.out.println(Arrays.toString(densities));
    // $example off$

    jsc.stop();
  }
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.examples.mllib;

import com.google.common.collect.ImmutableMap;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

// $example on$
import java.util.*;

import scala.Tuple2;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.VoidFunction;
// $example off$

/**
 * Example program that draws stratified samples from a pair RDD with
 * {@code sampleByKey} and {@code sampleByKeyExact}, using a per-key fraction map, and prints
 * the size and contents of each sample.
 */
public class JavaStratifiedSamplingExample {
  public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaStratifiedSamplingExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // Typed Tuple2 constructors (instead of raw "new Tuple2(...)") keep the list free of
    // unchecked conversions.
    List<Tuple2<Integer, Character>> list = Arrays.asList(
      new Tuple2<Integer, Character>(1, 'a'),
      new Tuple2<Integer, Character>(1, 'b'),
      new Tuple2<Integer, Character>(2, 'c'),
      new Tuple2<Integer, Character>(2, 'd'),
      new Tuple2<Integer, Character>(2, 'e'),
      new Tuple2<Integer, Character>(3, 'f')
    );

    JavaPairRDD<Integer, Character> data = jsc.parallelizePairs(list);

    // specify the exact fraction desired from each key Map<K, Object>
    ImmutableMap<Integer, Object> fractions =
      ImmutableMap.of(1, (Object) 0.1, 2, (Object) 0.6, 3, (Object) 0.3);

    // Get an approximate sample from each stratum
    JavaPairRDD<Integer, Character> approxSample = data.sampleByKey(false, fractions);
    // Get an exact sample from each stratum
    JavaPairRDD<Integer, Character> exactSample = data.sampleByKeyExact(false, fractions);
    // $example off$

    printSample("approxSample", approxSample);
    printSample("exactSample", exactSample);

    jsc.stop();
  }

  /**
   * Prints the number of rows in {@code sample} followed by one "key value" line per row.
   *
   * @param label name printed in front of " size is "
   * @param sample the sampled pair RDD to print
   */
  private static void printSample(String label, JavaPairRDD<Integer, Character> sample) {
    // Collect once and reuse the list for both the size and the rows, rather than calling
    // collect() a second time just to iterate.
    List<Tuple2<Integer, Character>> rows = sample.collect();
    System.out.println(label + " size is " + rows.size());
    for (Tuple2<Integer, Character> row : rows) {
      System.out.println(row._1() + " " + row._2());
    }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.examples.mllib;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.stat.MultivariateStatisticalSummary;
import org.apache.spark.mllib.stat.Statistics;
// $example off$

/**
 * Example program that computes column summary statistics for an RDD of vectors with
 * {@code Statistics.colStats} and prints the mean, variance, and nonzero count per column.
 */
public class JavaSummaryStatisticsExample {
  public static void main(String[] args) {

    SparkConf sparkConf = new SparkConf().setAppName("JavaSummaryStatisticsExample");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);

    // $example on$
    // an RDD of Vectors
    JavaRDD<Vector> rows = jsc.parallelize(
      Arrays.asList(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(3.0, 30.0, 300.0)
      )
    );

    // Compute column summary statistics.
    MultivariateStatisticalSummary stats = Statistics.colStats(rows.rdd());

    // a dense vector containing the mean value for each column
    Vector columnMeans = stats.mean();
    System.out.println(columnMeans);

    // column-wise variance
    Vector columnVariances = stats.variance();
    System.out.println(columnVariances);

    // number of nonzeros in each column
    Vector columnNonzeros = stats.numNonzeros();
    System.out.println(columnNonzeros);
    // $example off$

    jsc.stop();
  }
}
Loading