Skip to content

Commit

Permalink
[SPARK-2514] [mllib] Random RDD generator
Browse files Browse the repository at this point in the history
Utilities for generating random RDDs.

RandomRDD and RandomVectorRDD are created instead of using `sc.parallelize(range:Range)` because `Range` objects in Scala can only have `size <= Int.MaxValue`.

The object `RandomRDDGenerators` can be transformed into a generator class to reduce the number of auxiliary methods for optional arguments.

Author: Doris Xin <[email protected]>

Closes apache#1520 from dorx/randomRDD and squashes the following commits:

01121ac [Doris Xin] reviewer comments
6bf27d8 [Doris Xin] Merge branch 'master' into randomRDD
a8ea92d [Doris Xin] Reviewer comments
063ea0b [Doris Xin] Merge branch 'master' into randomRDD
aec68eb [Doris Xin] newline
bc90234 [Doris Xin] units passed.
d56cacb [Doris Xin] impl with RandomRDD
92d6f1c [Doris Xin] solution for Cloneable
df5bcff [Doris Xin] Merge branch 'generator' into randomRDD
f46d928 [Doris Xin] WIP
49ed20d [Doris Xin] alternative poisson distribution generator
7cb0e40 [Doris Xin] fix for data inconsistency
8881444 [Doris Xin] RandomRDDGenerator: initial design
  • Loading branch information
dorx authored and conviva-zz committed Sep 4, 2014
1 parent 239b174 commit 9c682ad
Show file tree
Hide file tree
Showing 5 changed files with 940 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.mllib.random

import cern.jet.random.Poisson
import cern.jet.random.engine.DRand

import org.apache.spark.annotation.Experimental
import org.apache.spark.util.random.{XORShiftRandom, Pseudorandom}

/**
 * :: Experimental ::
 * Contract for random number generators whose values are independent and identically
 * distributed (i.i.d.) draws from one underlying distribution. Implementations override the
 * inherited `setSeed` to seed their rng.
 */
@Experimental
trait DistributionGenerator extends Pseudorandom with Serializable {

  /**
   * Draws the next i.i.d. sample from the underlying distribution.
   *
   * @return the sample, as a Double
   */
  def nextValue(): Double

  /**
   * Creates another generator of the same kind that owns a new instance of the rng object
   * (when the implementation has one), so concurrent users do not need to lock a shared rng.
   *
   * @return the new generator
   */
  def copy(): DistributionGenerator
}

/**
 * :: Experimental ::
 * Generates i.i.d. samples from U[0.0, 1.0]
 */
@Experimental
class UniformGenerator extends DistributionGenerator {

  // XORShiftRandom for better performance. Thread safety isn't necessary here.
  private val random = new XORShiftRandom()

  /** Returns the next sample by delegating to the rng's `nextDouble()`. */
  override def nextValue(): Double = random.nextDouble()

  /**
   * Re-seeds the underlying rng.
   *
   * @param seed seed forwarded unchanged to the rng
   */
  override def setSeed(seed: Long): Unit = random.setSeed(seed)

  /**
   * Returns a new UniformGenerator that owns its own rng. The copy is built with the
   * no-argument constructor, so a seed previously set on this instance is not carried over.
   */
  override def copy(): UniformGenerator = new UniformGenerator()
}

/**
 * :: Experimental ::
 * Generates i.i.d. samples from the standard normal distribution.
 */
@Experimental
class StandardNormalGenerator extends DistributionGenerator {

  // XORShiftRandom for better performance. Thread safety isn't necessary here.
  private val random = new XORShiftRandom()

  /** Returns the next sample by delegating to the rng's `nextGaussian()`. */
  override def nextValue(): Double = random.nextGaussian()

  /**
   * Re-seeds the underlying rng.
   *
   * @param seed seed forwarded unchanged to the rng
   */
  override def setSeed(seed: Long): Unit = random.setSeed(seed)

  /**
   * Returns a new StandardNormalGenerator that owns its own rng. The copy is built with the
   * no-argument constructor, so a seed previously set on this instance is not carried over.
   */
  override def copy(): StandardNormalGenerator = new StandardNormalGenerator()
}

/**
 * :: Experimental ::
 * Generates i.i.d. samples from the Poisson distribution with the given mean.
 *
 * @param mean mean for the Poisson distribution.
 */
@Experimental
class PoissonGenerator(val mean: Double) extends DistributionGenerator {

  // Held in a var because re-seeding swaps in a whole new Poisson built on a new DRand.
  private var rng = new Poisson(mean, new DRand)

  /** Returns the next Poisson sample via the sampler's `nextDouble()`. */
  override def nextValue(): Double = rng.nextDouble()

  /**
   * Re-seeds by replacing the sampler with one driven by a newly constructed DRand engine.
   *
   * @param seed narrowed with `toInt` before being handed to DRand, so only its low 32 bits
   *             influence the generated sequence
   */
  override def setSeed(seed: Long): Unit = {
    rng = new Poisson(mean, new DRand(seed.toInt))
  }

  /**
   * Returns a new PoissonGenerator with the same mean and its own sampler. A seed previously
   * set on this instance is not carried over.
   */
  override def copy(): PoissonGenerator = new PoissonGenerator(mean)
}
Loading

0 comments on commit 9c682ad

Please sign in to comment.