Merge pull request biolab#1337 from kernc/bottlechest

Bottlechest, bottleneck
astaric · Jul 4, 2016 · fa9b426 · fa9b426
2 parents ecc474d + 219847e
commit fa9b426
Show file tree

Hide file tree

Showing 14 changed files with 217 additions and 35 deletions.
diff --git a/Orange/__init__.py b/Orange/__init__.py
@@ -1,5 +1,9 @@
 from .misc.lazy_module import _LazyModule
 from .misc.datasets import _DatasetInfo
+
+# bottlechest patches bottleneck
+from .misc import bottlechest  # pylint: disable=unused-import
+
 from .version import \
     short_version as __version__, git_revision as __git_version__
 

diff --git a/Orange/base.py b/Orange/base.py
@@ -2,12 +2,12 @@
 
 import numpy as np
 import scipy
-import bottlechest as bn
 
 from Orange.data import Table, Storage, Instance, Value
 from Orange.preprocess import (RemoveNaNClasses, Continuize,
                                RemoveNaNColumns, SklImpute)
 from Orange.misc.wrapper_meta import WrapperMeta
+from Orange.util import one_hot
 
 __all__ = ["Learner", "Model", "SklLearner", "SklModel"]
 
@@ -157,11 +157,9 @@ def __call__(self, data, ret=Value):
                                for c in self.domain.class_vars)
                 probs = np.zeros(value.shape + (max_card,), float)
                 for i, cvar in enumerate(self.domain.class_vars):
-                    probs[:, i, :], _ = bn.bincount(np.atleast_2d(value[:, i]),
-                                                    max_card - 1)
+                    probs[:, i, :] = one_hot(value[:, i])
             else:
-                probs, _ = bn.bincount(np.atleast_2d(value),
-                                       len(self.domain.class_var.values) - 1)
+                probs = one_hot(value)
             if ret == Model.ValueProbs:
                 return value, probs
             else:

diff --git a/Orange/data/filter.py b/Orange/data/filter.py
@@ -5,7 +5,7 @@
 
 from ..misc.enum import Enum
 import numpy as np
-import bottlechest as bn
+import bottleneck as bn
 from Orange.data import Instance, Storage, Variable
 
 

diff --git a/Orange/data/io.py b/Orange/data/io.py
@@ -18,7 +18,7 @@
 from urllib.parse import urlparse, unquote as urlunquote
 from urllib.request import urlopen
 
-import bottlechest as bn
+import bottleneck as bn
 import numpy as np
 from chardet.universaldetector import UniversalDetector
 

diff --git a/Orange/data/table.py b/Orange/data/table.py
@@ -13,7 +13,7 @@
 from urllib.request import urlopen
 from urllib.error import URLError
 
-import bottlechest as bn
+import bottleneck as bn
 from scipy import sparse as sp
 
 from .instance import *
@@ -935,12 +935,7 @@ def __determine_density(data):
         if data is None:
             return Storage.Missing
         if data is not None and sp.issparse(data):
-            try:
-                if bn.bincount(data.data, 1)[0][0] == 0:
-                    return Storage.SPARSE_BOOL
-            except ValueError as e:
-                pass
-            return Storage.SPARSE
+            return Storage.SPARSE_BOOL if (data.data == 1).all() else Storage.SPARSE
         else:
             return Storage.DENSE
 

diff --git a/Orange/misc/bottlechest.py b/Orange/misc/bottlechest.py
@@ -0,0 +1,179 @@
+"""
+This module provides replacements for the few extra functions that Orange
+once used from the bottlechest package (a fork of bottleneck).
+
+Importing it also patches these functions onto the bottleneck module.
+"""
+import warnings
+
+import numpy as np
+from scipy.sparse import issparse
+import bottleneck as bn
+
+
+def bincount(X, max_val=None, weights=None, minlength=None):
+    """Return counts of values in array X, plus the number of nans.
+
+    Counting is delegated to np.bincount(). Unlike np.bincount(), floating
+    point arrays that contain nans are supported: nans are dropped before
+    counting and reported separately.
+
+    Parameters
+    ----------
+    X : array_like
+        Values to count; they are cast to int32 before counting.
+    max_val : int, optional
+        Largest expected value. Used only when `minlength` is not given (or
+        is 0), in which case the result has at least ``max_val + 1`` bins.
+    weights : array_like, optional
+        Weights of the elements of X, passed on to np.bincount().
+    minlength : int, optional
+        Minimal number of bins in the result.
+
+    Returns
+    -------
+    counts : 1d array
+        The result of np.bincount() on the non-nan values.
+    nans : int or array
+        Number of nans, summed along axis 0 (a scalar for 1d X).
+    """
+    X = np.asanyarray(X)
+    if X.dtype.kind == 'f' and bn.anynan(X):
+        nonnan = ~np.isnan(X)
+        nans = (~nonnan).sum(axis=0)
+        X = X[nonnan]
+        if weights is not None:
+            # asanyarray so that plain sequences can be mask-indexed too
+            weights = np.asanyarray(weights)[nonnan]
+    else:
+        nans = 0 if X.ndim == 1 else np.zeros(X.shape[1])
+    # `minlength` takes precedence over `max_val`. With neither given the
+    # old expression evaluated ``None + 1`` and raised TypeError.
+    if not minlength and max_val is not None:
+        minlength = max_val + 1
+    X = X.astype(np.int32, copy=False)
+    if minlength:
+        return np.bincount(X, weights=weights, minlength=minlength), nans
+    # No size requested: let np.bincount() size the result from the data
+    # rather than passing None/0, which not every NumPy release accepts.
+    return np.bincount(X, weights=weights), nans
+
+
+def countnans(X, weights=None, axis=None, dtype=None, keepdims=False):
+    """
+    Count the nan elements of X along the given axis.
+
+    Parameters
+    ----------
+    X : array_like
+    weights : array, optional
+        If shaped like X, every nan is weighted individually before
+        summing; otherwise the summed counts are multiplied by the weights.
+    axis, dtype, keepdims
+        Passed on to the sum over the nan mask.
+
+    Returns
+    -------
+    counts
+        The (weighted) number of nans.
+    """
+    X = np.asanyarray(X)
+    nan_mask = np.isnan(X)
+    if weights is None:
+        return nan_mask.sum(axis=axis, dtype=dtype, keepdims=keepdims)
+    if weights.shape == X.shape:
+        # Element-wise weights: apply them before reducing
+        return (nan_mask * weights).sum(axis=axis, dtype=dtype,
+                                        keepdims=keepdims)
+    # Any other shape: scale the already reduced counts
+    return nan_mask.sum(axis=axis, dtype=dtype, keepdims=keepdims) * weights
+
+
+def contingency(X, y, max_X=None, max_y=None, weights=None, mask=None):
+    """
+    Compute the contingency matrices for each column of X (excluding the masked)
+    versus the vector y.
+
+    If the array is 1-dimensional, a 2d contingency matrix is returned. If the
+    array is 2d, the function returns a 3d array, with the first dimension
+    corresponding to column index (variable in the input array).
+
+    The rows of each contingency matrix correspond to values in vector `y`,
+    the columns correspond to values of the variable (column of X).
+
+    A subset of columns can be selected by additional argument `mask`;
+    columns that are masked out get an all-zero matrix. Weighting of rows
+    (argument `weights`) is not implemented: only unit weights are accepted.
+
+    The function also returns a count of NaN values per each value of `y`.
+
+    Parameters
+    ----------
+    X : array_like
+        With values in columns.
+    y : 1d array
+        Vector of true values.
+    max_X : int
+        The maximal value in the array. If None, the number of distinct
+        non-nan values of each column is used as the number of its values.
+    max_y : int
+        The maximal value in `y`. If None, the number of distinct values in
+        `y` is used.
+    weights : array_like, optional
+        Row weights; anything but unit weights raises ValueError.
+    mask : sequence
+        Discrete columns of X; columns with a false entry are skipped.
+
+    Returns
+    -------
+    contingencies: (m × ny × nx) array
+        m number of columns of X (skipped columns are zero-filled),
+        ny number of values in y,
+        nx number of values in column of X.
+    nans : array_like
+        Number of nans in each column of X for each unique value of y.
+
+    Raises
+    ------
+    ValueError
+        If non-unit weights are given.
+    TypeError
+        If a column is masked out but `max_X` is not given.
+    """
+    # Refuse every non-unit weight. The old check looked only at the
+    # smallest unique weight, so e.g. [1, 2] was silently ignored.
+    # All-zero weights are still let through, as before.
+    if weights is not None and np.any(weights) \
+            and np.any(np.asanyarray(weights) != 1):
+        raise ValueError('weights not yet supported')
+
+    was_1d = False
+    if X.ndim == 1:
+        X = X[..., np.newaxis]
+        was_1d = True
+
+    contingencies, nans = [], []
+    ny = np.unique(y).size if max_y is None else max_y + 1
+    for i in range(X.shape[1]):
+        if mask is not None and not mask[i]:
+            if max_X is None:
+                # The width of the zero matrix cannot be derived from a
+                # column we are told to skip.
+                raise TypeError('max_X is required when mask excludes columns')
+            contingencies.append(np.zeros((ny, max_X + 1)))
+            nans.append(np.zeros(ny))
+            continue
+        col = X[..., i]
+        # Densify before any NumPy call; np.isnan() cannot take sparse input
+        if issparse(col):
+            col = np.ravel(col.todense())
+        col_nans = np.isnan(col)
+        nx = np.unique(col[~col_nans]).size if max_X is None else max_X + 1
+        # Encode each (x, y) pair as a single index x * ny + y; rows with a
+        # nan in col yield nan indices, which bincount() drops.
+        contingencies.append(
+            bincount(y + ny * col,
+                     minlength=ny * nx)[0].reshape(nx, ny).T)
+        nans.append(
+            bincount(y[col_nans], minlength=ny)[0])
+    if was_1d:
+        return contingencies[0], nans[0]
+    return np.array(contingencies), np.array(nans)
+
+
+def stats(X, weights=None, compute_variance=False):
+    """
+    Compute min, max, #nans, mean and variance.
+
+    Result is an array with one row per column of X and the six columns
+    (min, max, mean, variance, #nans, #non-nans); a 1d X gives a single row.
+
+    The mean, the variance and the number of nans and non-nans are weighted;
+    min and max are not.
+
+    Computation of variance requires an additional pass and is not enabled
+    by default. Zeros are filled in instead of variance.
+
+    For non-numeric arrays, falsy elements count as missing and
+    (inf, -inf, 0, 0) is reported for min, max, mean and variance.
+
+    Parameters
+    ----------
+    X : array, 1 or 2 dimensions
+        Input array.
+    weights : array_like, optional
+        Weights, one per row of `X`.
+    compute_variance : bool, optional
+        If set to True, the function also computes variance.
+
+    Returns
+    -------
+    out : array of shape (number of columns of X, 6)
+        Computed (min, max, mean, variance or 0, #nans, #non-nans)
+
+    Raises
+    ------
+    ValueError
+        If the length of the weight vector does not match the length of the
+        array
+    """
+    n_rows, col_shape = X.shape[0], X.shape[1:]
+    is_numeric = np.issubdtype(X.dtype, np.number)
+    # In non-numeric data the falsy elements are the missing ones
+    missing = np.isnan(X) if is_numeric else ~X.astype(bool)
+
+    if weights is None:
+        w = None
+        nans = missing.sum(axis=0)
+        non_nans = n_rows - nans
+    else:
+        w = np.asanyarray(weights, dtype=float).ravel()
+        if w.size != n_rows:
+            raise ValueError(
+                'length of weights ({}) does not match the length of '
+                'the array ({})'.format(w.size, n_rows))
+        # One weight per row, broadcast over the columns (if any)
+        w = w.reshape((-1,) + (1,) * len(col_shape))
+        nans = (missing * w).sum(axis=0)
+        non_nans = w.sum() - nans
+
+    if not is_numeric:
+        # Per-column placeholders; scalars would not stack with the counts
+        return np.column_stack((np.full(col_shape, np.inf),
+                                np.full(col_shape, -np.inf),
+                                np.zeros(col_shape),
+                                np.zeros(col_shape),
+                                nans,
+                                non_nans))
+
+    variance = np.zeros(col_shape)
+    if w is None:
+        mean = np.nanmean(X, axis=0)
+        if compute_variance:
+            variance = np.nanvar(X, axis=0)
+    else:
+        # Columns without any weighted non-nan value yield nan, like nanmean
+        with np.errstate(invalid='ignore', divide='ignore'):
+            mean = np.nansum(X * w, axis=0) / non_nans
+            if compute_variance:
+                variance = np.nansum(w * (X - mean) ** 2, axis=0) / non_nans
+    # The weights must not touch min and max, hence X is never rescaled.
+    # Column order follows the documented layout: variance precedes #nans.
+    return np.column_stack((np.nanmin(X, axis=0),
+                            np.nanmax(X, axis=0),
+                            mean,
+                            variance,
+                            nans,
+                            non_nans))
+
+
+# Patch bottleneck to contain these additions
+for func in (bincount, countnans, contingency, stats):
+    # getattr() falls back to our own bincount, so a name bottleneck does
+    # not define (or one we already patched in) compares equal here.
+    if getattr(bn, func.__name__, bincount).__module__ != func.__module__:
+        # Only warn: raising here would make the whole package unimportable
+        # as soon as bottleneck ships a function of the same name.
+        warnings.warn('bottleneck got its own {}(); '
+                      'consider deprecating our own.'.format(func.__name__),
+                      DeprecationWarning)
+    setattr(bn, func.__name__, func)
diff --git a/Orange/preprocess/preprocess.py b/Orange/preprocess/preprocess.py
@@ -5,7 +5,7 @@
 """
 import numpy as np
 import sklearn.preprocessing as skl_preprocessing
-import bottlechest
+import bottleneck as bn
 
 import Orange.data
 from Orange.data import Table
@@ -198,8 +198,8 @@ def __call__(self, data):
         data : an input data set
         """
 
-        oks = bottlechest.nanmin(data.X, axis=0) != \
-              bottlechest.nanmax(data.X, axis=0)
+        oks = bn.nanmin(data.X, axis=0) != \
+              bn.nanmax(data.X, axis=0)
         atts = [data.domain.attributes[i] for i, ok in enumerate(oks) if ok]
         domain = Orange.data.Domain(atts, data.domain.class_vars,
                                     data.domain.metas)

diff --git a/Orange/tests/test_contingency.py b/Orange/tests/test_contingency.py
@@ -163,7 +163,7 @@ def _construct_sparse():
     def test_sparse(self):
         d = self._construct_sparse()
         cont = contingency.Discrete(d, 5)
-        np.testing.assert_almost_equal(cont[0], [1, 0, 0])
+        np.testing.assert_almost_equal(cont[0], [2, 0, 0])
         np.testing.assert_almost_equal(cont["b"], [0, 1, 1])
         np.testing.assert_almost_equal(cont[2], [1, 0, 0])
 
@@ -193,7 +193,7 @@ def test_get_contingency(self):
         d = self._construct_sparse()
         cont = contingency.get_contingency(d, 5)
         self.assertIsInstance(cont, contingency.Discrete)
-        np.testing.assert_almost_equal(cont[0], [1, 0, 0])
+        np.testing.assert_almost_equal(cont[0], [2, 0, 0])
         np.testing.assert_almost_equal(cont["b"], [0, 1, 1])
         np.testing.assert_almost_equal(cont[2], [1, 0, 0])
 
@@ -218,7 +218,7 @@ def test_get_contingencies(self):
 
         cont = conts[5]
         self.assertIsInstance(cont, contingency.Discrete)
-        np.testing.assert_almost_equal(cont[0], [1, 0, 0])
+        np.testing.assert_almost_equal(cont[0], [2, 0, 0])
         np.testing.assert_almost_equal(cont["b"], [0, 1, 1])
         np.testing.assert_almost_equal(cont[2], [1, 0, 0])
 
@@ -240,7 +240,7 @@ def test_get_contingencies(self):
         self.assertEqual(len(conts), 10)
         cont = conts[5]
         self.assertIsInstance(cont, contingency.Discrete)
-        np.testing.assert_almost_equal(cont[0], [1, 0, 0])
+        np.testing.assert_almost_equal(cont[0], [2, 0, 0])
         np.testing.assert_almost_equal(cont["b"], [0, 1, 1])
         np.testing.assert_almost_equal(cont[2], [1, 0, 0])
 

diff --git a/Orange/util.py b/Orange/util.py
@@ -90,6 +90,22 @@ def scale(values, min=0, max=1):
     return (-np.nanmin(values) + values) / ptp * (max - min) + min
 
 
+def one_hot(values, dtype=float, dim=None):
+    """Return a one-hot transform of values
+
+    Parameters
+    ----------
+    values : 1d array
+        Integer values (hopefully 0-max); floats are cast to int.
+    dtype
+        dtype of result array
+    dim : int, optional
+        Number of indicator columns. Defaults to ``max(values) + 1``; pass
+        it explicitly when the result must have a fixed width regardless of
+        which values actually occur.
+
+    Returns
+    -------
+    result
+        2d array with ones in respective indicator columns.
+
+    Raises
+    ------
+    ValueError
+        If `dim` is too small to hold the largest value.
+    """
+    values = np.asanyarray(values, dtype=int)
+    if values.size == 0:
+        # np.max() of an empty array raises; return an empty result instead
+        return np.zeros(values.shape + (dim or 0,), dtype=dtype)
+    # Taken after the int cast, so np.eye() never receives a float size
+    needed = int(values.max()) + 1
+    if dim is None:
+        dim = needed
+    elif dim < needed:
+        raise ValueError('dim={} is too small for values up to {}'
+                         .format(dim, needed - 1))
+    return np.eye(dim, dtype=dtype)[values]
+
+
 class Registry(type):
     """Metaclass that registers subtypes."""
     def __new__(cls, name, bases, attrs):

diff --git a/Orange/widgets/classify/tests/test_owlogisticregression.py b/Orange/widgets/classify/tests/test_owlogisticregression.py
@@ -1,6 +1,6 @@
 import unittest
 
-import bottlechest as bn
+import bottleneck as bn
 
 from Orange.data import Table
 from Orange.classification import LogisticRegressionLearner

diff --git a/codecov.yml b/codecov.yml
@@ -4,7 +4,7 @@ coverage:
   status:
     patch:
       default:
-        target: '95'
+        target: '90'
     project:
       default:
         target: auto

diff --git a/requirements-core.txt b/requirements-core.txt
@@ -1,7 +1,7 @@
 numpy>=1.9.0
 scipy>=0.11.0
 scikit-learn>=0.17
-bottlechest>=0.7.1
+bottleneck>=1.0.0
 # Reading Excel files
 xlrd>=0.9.2
 # Encoding detection

diff --git a/scripts/build-osx-app.sh b/scripts/build-osx-app.sh
@@ -113,12 +113,6 @@ echo "Installing/updating setuptools and pip"
 echo "======================================"
 "$PIP" install 'setuptools==18.*' 'pip==7.*'
 
-echo "Installing Bottlechest"
-echo "======================"
-"$PIP" install --find-links http://orange.biolab.si/download/files/wheelhouse/ \
-               --use-wheel --trusted-host orange.biolab.si \
-               Bottlechest
-
 echo "Installing orangeqt"
 echo "==================="
 FDIR=$TEMPLATE/Contents/Frameworks

diff --git a/scripts/windows/build-win-application.sh b/scripts/windows/build-win-application.sh
@@ -120,10 +120,6 @@ touch "$BUILDBASE"/requirements.txt
 echo "
 #:wheel: scikit-learn https://pypi.python.org/packages/b8/9a/02d5d76be66c57aaa9f917c87007b9b0bf486992cc7701512464d1ce11e9/scikit_learn-0.17.1-cp34-cp34m-win32.whl#md5=ab00daed7cdac4cb16ad0613b91be07e
 scikit-learn==0.17.1
-
-#:wheel: Bottlecheset https://dl.dropboxusercontent.com/u/100248799/Bottlechest-0.7.1-cp34-none-win32.whl#md5=629ba2a148dfa784d0e6817497d42e97
---find-links https://dl.dropboxusercontent.com/u/100248799/Bottlechest-0.7.1-cp34-none-win32.whl
-Bottlechest==0.7.1
 " > "$BUILDBASE"/requirements.txt
 
 function __download_url {
@@ -291,7 +287,7 @@ function prepare_orange {
         bdist_wheel -d "$BUILDBASE/wheelhouse"
 
     # Ensure all install dependencies are available in the wheelhouse
-    prepare_req --only-binary numpy,scipy,scikit-learn,bottlechest .
+    prepare_req --only-binary numpy,scipy,scikit-learn .
 
     echo "# Orange " >> "$BUILDBASE/requirements.txt"
     echo "$name==$version" >> "$BUILDBASE/requirements.txt"