Skip to content

Commit

Permalink
ENH: update joblib
Browse files Browse the repository at this point in the history
2012-05-07
Vlad Niculae

    ENH: controlled randomness in tests and doctest fix

2012-02-21
GaelVaroquaux

    ENH: add verbosity in memory

2012-02-21
GaelVaroquaux

    BUG: non-reproducible hashing: order of kwargs The ordering of a
    dictionary is random. As a result the function hashing was not
    reproducible.
  • Loading branch information
GaelVaroquaux committed May 7, 2012
1 parent 2b287c3 commit 9d6a0b0
Show file tree
Hide file tree
Showing 6 changed files with 60 additions and 12 deletions.
3 changes: 2 additions & 1 deletion sklearn/externals/joblib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
inputs and outputs: Python functions. Joblib can save their
computation to disk and rerun it only if necessary::
>>> import numpy as np
>>> from sklearn.externals.joblib import Memory
>>> mem = Memory(cachedir='/tmp/joblib')
>>> import numpy as np
Expand Down Expand Up @@ -101,7 +102,7 @@
"""

__version__ = '0.6.3'
__version__ = '0.6.4'


from .memory import Memory
Expand Down
2 changes: 1 addition & 1 deletion sklearn/externals/joblib/func_inspect.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ def filter_args(func, ignore_lst, args=(), kwargs=dict()):
)

varkwargs = dict()
for arg_name, arg_value in kwargs.iteritems():
for arg_name, arg_value in sorted(kwargs.items()):
if arg_name in arg_dict:
arg_dict[arg_name] = arg_value
elif arg_keywords is not None:
Expand Down
25 changes: 21 additions & 4 deletions sklearn/externals/joblib/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,10 +159,15 @@ def __init__(self, func, cachedir, ignore=None, mmap_mode=None,
def __call__(self, *args, **kwargs):
# Compare the function code with the previous to see if the
# function code has changed
output_dir, _ = self.get_output_dir(*args, **kwargs)
output_dir, argument_hash = self.get_output_dir(*args, **kwargs)
# FIXME: The statements below should be try/excepted
if not (self._check_previous_func_code(stacklevel=3) and
os.path.exists(output_dir)):
if self._verbose > 10:
_, name = get_func_name(self.func)
self.warn('Computing func %s, argument hash %s in '
'directory %s'
% (name, argument_hash, output_dir))
return self.call(*args, **kwargs)
else:
try:
Expand Down Expand Up @@ -287,6 +292,10 @@ def _check_previous_func_code(self, stacklevel=2):

# The function has changed, wipe the cache directory.
# XXX: Should be using warnings, and giving stacklevel
if self._verbose > 10:
_, func_name = get_func_name(self.func, resolv_alias=False)
self.warn("Function %s (stored in %s) has changed." %
(func_name, func_dir))
self.clear(warn=True)
return False

Expand All @@ -308,12 +317,11 @@ def call(self, *args, **kwargs):
persist the output values.
"""
start_time = time.time()
output_dir, argument_hash = self.get_output_dir(*args, **kwargs)
if self._verbose:
print self.format_call(*args, **kwargs)
output_dir, argument_hash = self.get_output_dir(*args, **kwargs)
output = self.func(*args, **kwargs)
self._persist_output(output, output_dir)
input_repr = self._persist_input(output_dir, *args, **kwargs)
duration = time.time() - start_time
if self._verbose:
_, name = get_func_name(self.func)
Expand Down Expand Up @@ -368,6 +376,8 @@ def _persist_output(self, output, dir):
mkdirp(dir)
filename = os.path.join(dir, 'output.pkl')
numpy_pickle.dump(output, filename, compress=self.compress)
if self._verbose > 10:
print 'Persisting in %s' % dir
except OSError:
" Race condition in the creation of the directory "

Expand Down Expand Up @@ -398,10 +408,17 @@ def load_output(self, output_dir):
"""
if self._verbose > 1:
t = time.time() - self.timestamp
print '[Memory]% 16s: Loading %s...' % (
if self._verbose < 10:
print '[Memory]% 16s: Loading %s...' % (
format_time(t),
self.format_signature(self.func)[0]
)
else:
print '[Memory]% 16s: Loading %s from %s' % (
format_time(t),
self.format_signature(self.func)[0],
output_dir
)
filename = os.path.join(output_dir, 'output.pkl')
return numpy_pickle.load(filename,
mmap_mode=self.mmap_mode)
Expand Down
6 changes: 4 additions & 2 deletions sklearn/externals/joblib/test/test_hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ def test_hash_methods():
def test_hash_numpy():
""" Test hashing with numpy arrays.
"""
arr1 = np.random.random((10, 10))
rnd = np.random.RandomState(0)
arr1 = rnd.random_sample((10, 10))
arr2 = arr1.copy()
arr3 = arr2.copy()
arr3[0] += 1
Expand Down Expand Up @@ -160,7 +161,8 @@ def test_hash_numpy_performance():
In [26]: %timeit hash(a)
100 loops, best of 3: 20.8 ms per loop
"""
a = np.random.random(1000000)
rnd = np.random.RandomState(0)
a = rnd.random_sample(1000000)
md5_hash = lambda x: hashlib.md5(np.getbuffer(x)).hexdigest()

relative_diff = relative_time(md5_hash, hash, a)
Expand Down
4 changes: 3 additions & 1 deletion sklearn/externals/joblib/test/test_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,8 +325,10 @@ def n(l=None):
verbose=0)
memory.clear(warn=False)
cached_n = memory.cache(n)

rnd = np.random.RandomState(0)
for i in range(3):
a = np.random.random((10, 10))
a = rnd.random_sample((10, 10))
for _ in range(3):
yield nose.tools.assert_true, np.all(cached_n(a) == a)
yield nose.tools.assert_equal, len(accumulator), i + 1
Expand Down
32 changes: 29 additions & 3 deletions sklearn/externals/joblib/test/test_numpy_pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,8 @@ def test_value_error():
@with_numpy
def test_numpy_persistence():
filename = env['filename']
a = np.random.random((10, 2))
rnd = np.random.RandomState(0)
a = rnd.random_sample((10, 2))
for compress, cache_size in ((0, 0), (1, 0), (1, 10)):
# We use 'a.T' to have a non C-contiguous array.
for index, obj in enumerate(((a,), (a.T,), (a, a), [a, a, a])):
Expand Down Expand Up @@ -183,7 +184,8 @@ def test_numpy_persistence():

@with_numpy
def test_memmap_persistence():
a = np.random.random(10)
rnd = np.random.RandomState(0)
a = rnd.random_sample(10)
filename = env['filename'] + str(random.randint(0, 1000))
numpy_pickle.dump(a, filename)
b = numpy_pickle.load(filename, mmap_mode='r')
Expand All @@ -195,7 +197,8 @@ def test_memmap_persistence():
def test_masked_array_persistence():
# The special-case picker fails, because saving masked_array
# not implemented, but it just delegates to the standard pickler.
a = np.random.random(10)
rnd = np.random.RandomState(0)
a = rnd.random_sample(10)
a = np.ma.masked_greater(a, 0.5)
filename = env['filename'] + str(random.randint(0, 1000))
numpy_pickle.dump(a, filename)
Expand All @@ -210,3 +213,26 @@ def test_z_file():
numpy_pickle.write_zfile(file(filename, 'wb'), data)
data_read = numpy_pickle.read_zfile(file(filename, 'rb'))
nose.tools.assert_equal(data, data_read)

################################################################################
# Test dumping array subclasses
if np is not None:

class SubArray(np.ndarray):

def __reduce__(self):
return (_load_sub_array, (np.asarray(self), ))


def _load_sub_array(arr):
d = SubArray(arr.shape)
d[:] = arr
return d

@with_numpy
def test_numpy_subclass():
filename = env['filename']
a = SubArray((10,))
numpy_pickle.dump(a, filename)
c = numpy_pickle.load(filename)
nose.tools.assert_true(isinstance(c, SubArray))

0 comments on commit 9d6a0b0

Please sign in to comment.