Skip to content

Commit

Permalink
[SPARK-4348] [PySpark] [MLlib] rename random.py to rand.py
Browse files Browse the repository at this point in the history
This PR renames random.py to rand.py to avoid the side effects of conflicting with the random module, but still keeps the same interface as before.

```
>>> from pyspark.mllib.random import RandomRDDs
```

```
$ pydoc pyspark.mllib.random
Help on module random in pyspark.mllib:
NAME
    random - Python package for random data generation.

FILE
    /Users/davies/work/spark/python/pyspark/mllib/rand.py

CLASSES
    __builtin__.object
        pyspark.mllib.random.RandomRDDs

    class RandomRDDs(__builtin__.object)
     |  Generator methods for creating RDDs comprised of i.i.d samples from
     |  some distribution.
     |
     |  Static methods defined here:
     |
     |  normalRDD(sc, size, numPartitions=None, seed=None)
```

cc mengxr

reference link: http://xion.org.pl/2012/05/06/hacking-python-imports/

Author: Davies Liu <[email protected]>

Closes #3216 from davies/random and squashes the following commits:

7ac4e8b [Davies Liu] rename random.py to rand.py

(cherry picked from commit ce0333f)
Signed-off-by: Xiangrui Meng <[email protected]>
  • Loading branch information
Davies Liu authored and mengxr committed Nov 13, 2014
1 parent ad872a5 commit c502e08
Show file tree
Hide file tree
Showing 6 changed files with 38 additions and 20 deletions.
10 changes: 0 additions & 10 deletions python/pyspark/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,6 @@
"""

# The following block allows us to import python's random instead of mllib.random for scripts in
# mllib that depend on top level pyspark packages, which transitively depend on python's random.
# Since Python's import logic looks for modules in the current package first, we eliminate
# mllib.random as a candidate for C{import random} by removing the first search path, the script's
# location, in order to force the loader to look in Python's top-level modules for C{random}.
import sys
s = sys.path.pop(0)     # temporarily drop the script's own directory from the search path...
import random           # ...so this binds the stdlib random, not mllib's random module
sys.path.insert(0, s)   # restore the original search path for all later imports

from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.rdd import RDD
Expand Down
34 changes: 34 additions & 0 deletions python/pyspark/mllib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,37 @@
import numpy
# Require NumPy 1.4+.  Compare numeric version components rather than raw
# strings: a plain string comparison misorders multi-digit components
# (e.g. "1.10" < "1.4" is True lexicographically, which would wrongly
# reject NumPy 1.10 and newer).
if tuple(int(x) for x in numpy.version.version.split('.')[:2]) < (1, 4):
    raise Exception("MLlib requires NumPy 1.4+")

# Public submodules of pyspark.mllib; note 'random' is listed even though
# the file on disk is rand.py (see the aliasing below).
__all__ = ['classification', 'clustering', 'feature', 'linalg', 'random',
           'recommendation', 'regression', 'stat', 'tree', 'util']

import sys
# The implementation lives in rand.py (renamed from random.py so it no
# longer shadows Python's stdlib `random`), but users keep importing it as
# pyspark.mllib.random.  Patch the module/class metadata so repr(), pydoc
# and pickling all report the public name instead of the file name.
import rand as random
random.__name__ = 'random'
random.RandomRDDs.__module__ = __name__ + '.random'


class RandomModuleHook(object):
    """
    PEP 302 meta-path import hook that serves ``pyspark.mllib.random``
    even though the implementing file is named ``rand.py`` (renamed so it
    does not shadow Python's stdlib ``random`` module).
    """
    # The virtual module name this hook answers for, e.g. 'pyspark.mllib.random'.
    fullname = __name__ + '.random'

    def find_module(self, name, path=None):
        # Claim only our virtual module (and dotted names beneath it);
        # every other import falls through to the normal machinery.
        if not name.startswith(self.fullname):
            return None
        return self

    def load_module(self, name):
        if name == self.fullname:
            # PEP 302: a loader must publish the module in sys.modules so
            # repeated imports reuse one object and name-based lookups
            # (e.g. by pickle) succeed; reuse any existing entry.
            return sys.modules.setdefault(name, random)

        # 'pyspark.mllib.random.X' -> attribute X of the rand module
        # (e.g. RandomRDDs); surface a proper ImportError otherwise.
        attr = name.rsplit('.', 1)[-1]
        try:
            return getattr(random, attr)
        except AttributeError:
            raise ImportError("cannot import name %s" % attr)


sys.meta_path.append(RandomModuleHook())
8 changes: 3 additions & 5 deletions python/pyspark/mllib/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@
"""
Python package for feature in MLlib.
"""
from __future__ import absolute_import

import sys
import warnings
import random

from py4j.protocol import Py4JJavaError

Expand Down Expand Up @@ -341,8 +344,6 @@ def __init__(self):
"""
Construct Word2Vec instance
"""
import random # this can't be on the top because of mllib.random

self.vectorSize = 100
self.learningRate = 0.025
self.numPartitions = 1
Expand Down Expand Up @@ -411,8 +412,5 @@ def _test():
exit(-1)

if __name__ == "__main__":
# remove current path from list of search paths to avoid importing mllib.random
# for C{import random}, which is done in an external dependency of pyspark during doctests.
import sys
sys.path.pop(0)
_test()
4 changes: 0 additions & 4 deletions python/pyspark/mllib/linalg.py
Original file line number Diff line number Diff line change
Expand Up @@ -614,8 +614,4 @@ def _test():
exit(-1)

if __name__ == "__main__":
# remove current path from list of search paths to avoid importing mllib.random
# for C{import random}, which is done in an external dependency of pyspark during doctests.
import sys
sys.path.pop(0)
_test()
File renamed without changes.
2 changes: 1 addition & 1 deletion python/run-tests
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ function run_mllib_tests() {
run_test "pyspark/mllib/clustering.py"
run_test "pyspark/mllib/feature.py"
run_test "pyspark/mllib/linalg.py"
run_test "pyspark/mllib/random.py"
run_test "pyspark/mllib/rand.py"
run_test "pyspark/mllib/recommendation.py"
run_test "pyspark/mllib/regression.py"
run_test "pyspark/mllib/stat.py"
Expand Down

0 comments on commit c502e08

Please sign in to comment.