Skip to content

Commit

Permalink
ENH: update joblib
Browse files Browse the repository at this point in the history
2012-05-07
Vlad Niculae

    ENH: controlled randomness in tests and doctest fix

2012-02-21
GaelVaroquaux

    ENH: add verbosity in memory

2012-02-21
GaelVaroquaux

    BUG: non-reproducible hashing: order of kwargs The ordering of a
    dictionary is random. As a result the function hashing was not
    reproducible.
  • Loading branch information
GaelVaroquaux committed May 7, 2012
1 parent 2b287c3 commit 9d6a0b0
Show file tree
Hide file tree
Showing 6 changed files with 60 additions and 12 deletions.
3 changes: 2 additions & 1 deletion sklearn/externals/joblib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
inputs and outputs: Python functions. Joblib can save their
computation to disk and rerun it only if necessary::
>>> import numpy as np
>>> from sklearn.externals.joblib import Memory
>>> mem = Memory(cachedir='/tmp/joblib')
>>> import numpy as np
Expand Down Expand Up @@ -101,7 +102,7 @@
"""

__version__ = '0.6.3'
__version__ = '0.6.4'


from .memory import Memory
Expand Down
2 changes: 1 addition & 1 deletion sklearn/externals/joblib/func_inspect.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ def filter_args(func, ignore_lst, args=(), kwargs=dict()):
)

varkwargs = dict()
for arg_name, arg_value in kwargs.iteritems():
for arg_name, arg_value in sorted(kwargs.items()):
if arg_name in arg_dict:
arg_dict[arg_name] = arg_value
elif arg_keywords is not None:
Expand Down
25 changes: 21 additions & 4 deletions sklearn/externals/joblib/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,10 +159,15 @@ def __init__(self, func, cachedir, ignore=None, mmap_mode=None,
def __call__(self, *args, **kwargs):
# Compare the function code with the previous to see if the
# function code has changed
output_dir, _ = self.get_output_dir(*args, **kwargs)
output_dir, argument_hash = self.get_output_dir(*args, **kwargs)
# FIXME: The statements below should be try/excepted
if not (self._check_previous_func_code(stacklevel=3) and
os.path.exists(output_dir)):
if self._verbose > 10:
_, name = get_func_name(self.func)
self.warn('Computing func %s, argument hash %s in '
'directory %s'
% (name, argument_hash, output_dir))
return self.call(*args, **kwargs)
else:
try:
Expand Down Expand Up @@ -287,6 +292,10 @@ def _check_previous_func_code(self, stacklevel=2):

# The function has changed, wipe the cache directory.
# XXX: Should be using warnings, and giving stacklevel
if self._verbose > 10:
_, func_name = get_func_name(self.func, resolv_alias=False)
self.warn("Function %s (stored in %s) has changed." %
(func_name, func_dir))
self.clear(warn=True)
return False

Expand All @@ -308,12 +317,11 @@ def call(self, *args, **kwargs):
persist the output values.
"""
start_time = time.time()
output_dir, argument_hash = self.get_output_dir(*args, **kwargs)
if self._verbose:
print self.format_call(*args, **kwargs)
output_dir, argument_hash = self.get_output_dir(*args, **kwargs)
output = self.func(*args, **kwargs)
self._persist_output(output, output_dir)
input_repr = self._persist_input(output_dir, *args, **kwargs)
duration = time.time() - start_time
if self._verbose:
_, name = get_func_name(self.func)
Expand Down Expand Up @@ -368,6 +376,8 @@ def _persist_output(self, output, dir):
mkdirp(dir)
filename = os.path.join(dir, 'output.pkl')
numpy_pickle.dump(output, filename, compress=self.compress)
if self._verbose > 10:
print 'Persisting in %s' % dir
except OSError:
" Race condition in the creation of the directory "

Expand Down Expand Up @@ -398,10 +408,17 @@ def load_output(self, output_dir):
"""
if self._verbose > 1:
t = time.time() - self.timestamp
print '[Memory]% 16s: Loading %s...' % (
if self._verbose < 10:
print '[Memory]% 16s: Loading %s...' % (
format_time(t),
self.format_signature(self.func)[0]
)
else:
print '[Memory]% 16s: Loading %s from %s' % (
format_time(t),
self.format_signature(self.func)[0],
output_dir
)
filename = os.path.join(output_dir, 'output.pkl')
return numpy_pickle.load(filename,
mmap_mode=self.mmap_mode)
Expand Down
6 changes: 4 additions & 2 deletions sklearn/externals/joblib/test/test_hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ def test_hash_methods():
def test_hash_numpy():
""" Test hashing with numpy arrays.
"""
arr1 = np.random.random((10, 10))
rnd = np.random.RandomState(0)
arr1 = rnd.random_sample((10, 10))
arr2 = arr1.copy()
arr3 = arr2.copy()
arr3[0] += 1
Expand Down Expand Up @@ -160,7 +161,8 @@ def test_hash_numpy_performance():
In [26]: %timeit hash(a)
100 loops, best of 3: 20.8 ms per loop
"""
a = np.random.random(1000000)
rnd = np.random.RandomState(0)
a = rnd.random_sample(1000000)
md5_hash = lambda x: hashlib.md5(np.getbuffer(x)).hexdigest()

relative_diff = relative_time(md5_hash, hash, a)
Expand Down
4 changes: 3 additions & 1 deletion sklearn/externals/joblib/test/test_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,8 +325,10 @@ def n(l=None):
verbose=0)
memory.clear(warn=False)
cached_n = memory.cache(n)

rnd = np.random.RandomState(0)
for i in range(3):
a = np.random.random((10, 10))
a = rnd.random_sample((10, 10))
for _ in range(3):
yield nose.tools.assert_true, np.all(cached_n(a) == a)
yield nose.tools.assert_equal, len(accumulator), i + 1
Expand Down
32 changes: 29 additions & 3 deletions sklearn/externals/joblib/test/test_numpy_pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,8 @@ def test_value_error():
@with_numpy
def test_numpy_persistence():
filename = env['filename']
a = np.random.random((10, 2))
rnd = np.random.RandomState(0)
a = rnd.random_sample((10, 2))
for compress, cache_size in ((0, 0), (1, 0), (1, 10)):
# We use 'a.T' to have a non C-contiguous array.
for index, obj in enumerate(((a,), (a.T,), (a, a), [a, a, a])):
Expand Down Expand Up @@ -183,7 +184,8 @@ def test_numpy_persistence():

@with_numpy
def test_memmap_persistence():
a = np.random.random(10)
rnd = np.random.RandomState(0)
a = rnd.random_sample(10)
filename = env['filename'] + str(random.randint(0, 1000))
numpy_pickle.dump(a, filename)
b = numpy_pickle.load(filename, mmap_mode='r')
Expand All @@ -195,7 +197,8 @@ def test_memmap_persistence():
def test_masked_array_persistence():
# The special-case picker fails, because saving masked_array
# not implemented, but it just delegates to the standard pickler.
a = np.random.random(10)
rnd = np.random.RandomState(0)
a = rnd.random_sample(10)
a = np.ma.masked_greater(a, 0.5)
filename = env['filename'] + str(random.randint(0, 1000))
numpy_pickle.dump(a, filename)
Expand All @@ -210,3 +213,26 @@ def test_z_file():
numpy_pickle.write_zfile(file(filename, 'wb'), data)
data_read = numpy_pickle.read_zfile(file(filename, 'rb'))
nose.tools.assert_equal(data, data_read)

################################################################################
# Test dumping array subclasses
if np is not None:

class SubArray(np.ndarray):

def __reduce__(self):
return (_load_sub_array, (np.asarray(self), ))


def _load_sub_array(arr):
d = SubArray(arr.shape)
d[:] = arr
return d

@with_numpy
def test_numpy_subclass():
filename = env['filename']
a = SubArray((10,))
numpy_pickle.dump(a, filename)
c = numpy_pickle.load(filename)
nose.tools.assert_true(isinstance(c, SubArray))

0 comments on commit 9d6a0b0

Please sign in to comment.