Skip to content

Commit

Permalink
Merge pull request biolab#1337 from kernc/bottlechest
Browse files Browse the repository at this point in the history
Bottlechest, bottleneck
  • Loading branch information
astaric authored Jul 4, 2016
2 parents ecc474d + 219847e commit fa9b426
Show file tree
Hide file tree
Showing 14 changed files with 217 additions and 35 deletions.
4 changes: 4 additions & 0 deletions Orange/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
from .misc.lazy_module import _LazyModule
from .misc.datasets import _DatasetInfo

# bottlechest patches bottleneck
from .misc import bottlechest # pylint: disable=unused-import

from .version import \
short_version as __version__, git_revision as __git_version__

Expand Down
8 changes: 3 additions & 5 deletions Orange/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@

import numpy as np
import scipy
import bottlechest as bn

from Orange.data import Table, Storage, Instance, Value
from Orange.preprocess import (RemoveNaNClasses, Continuize,
RemoveNaNColumns, SklImpute)
from Orange.misc.wrapper_meta import WrapperMeta
from Orange.util import one_hot

__all__ = ["Learner", "Model", "SklLearner", "SklModel"]

Expand Down Expand Up @@ -157,11 +157,9 @@ def __call__(self, data, ret=Value):
for c in self.domain.class_vars)
probs = np.zeros(value.shape + (max_card,), float)
for i, cvar in enumerate(self.domain.class_vars):
probs[:, i, :], _ = bn.bincount(np.atleast_2d(value[:, i]),
max_card - 1)
probs[:, i, :] = one_hot(value[:, i])
else:
probs, _ = bn.bincount(np.atleast_2d(value),
len(self.domain.class_var.values) - 1)
probs = one_hot(value)
if ret == Model.ValueProbs:
return value, probs
else:
Expand Down
2 changes: 1 addition & 1 deletion Orange/data/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from ..misc.enum import Enum
import numpy as np
import bottlechest as bn
import bottleneck as bn
from Orange.data import Instance, Storage, Variable


Expand Down
2 changes: 1 addition & 1 deletion Orange/data/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from urllib.parse import urlparse, unquote as urlunquote
from urllib.request import urlopen

import bottlechest as bn
import bottleneck as bn
import numpy as np
from chardet.universaldetector import UniversalDetector

Expand Down
9 changes: 2 additions & 7 deletions Orange/data/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from urllib.request import urlopen
from urllib.error import URLError

import bottlechest as bn
import bottleneck as bn
from scipy import sparse as sp

from .instance import *
Expand Down Expand Up @@ -935,12 +935,7 @@ def __determine_density(data):
if data is None:
return Storage.Missing
if data is not None and sp.issparse(data):
try:
if bn.bincount(data.data, 1)[0][0] == 0:
return Storage.SPARSE_BOOL
except ValueError as e:
pass
return Storage.SPARSE
return Storage.SPARSE_BOOL if (data.data == 1).all() else Storage.SPARSE
else:
return Storage.DENSE

Expand Down
179 changes: 179 additions & 0 deletions Orange/misc/bottlechest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
"""
This module provides alternatives for the few additional functions found in
and once used from the bottlechest package (fork of bottleneck).
It also patches bottleneck to contain these functions.
"""
import numpy as np
from scipy.sparse import issparse
import bottleneck as bn


def bincount(X, max_val=None, weights=None, minlength=None):
    """Return counts of values in array X, plus the number of NaNs.

    Works like np.bincount(), except that it also supports non-integer
    arrays with NaNs: NaNs are dropped before counting and reported
    separately.

    Parameters
    ----------
    X : array_like
        Values to count. They are cast to int32 before counting.
    max_val : int, optional
        Largest expected value; the result then has at least `max_val + 1`
        bins. Only used when `minlength` is not given (or is 0).
    weights : array_like, optional
        Per-element weights, passed on to np.bincount().
    minlength : int, optional
        Minimal number of bins in the result.

    Returns
    -------
    counts : ndarray
        Occurrence counts (or sums of weights) per value.
    nans : int or ndarray
        Number of NaNs found; summed over axis 0, so it is a scalar for
        1-d input and a per-column array for 2-d input.
    """
    X = np.asanyarray(X)
    if X.dtype.kind == 'f' and bn.anynan(X):
        nonnan = ~np.isnan(X)
        nans = (~nonnan).sum(axis=0)
        # Boolean-mask indexing drops the NaNs (and flattens X)
        X = X[nonnan]
        if weights is not None:
            # Accept any array_like weights; a plain list cannot be
            # indexed with a boolean mask
            weights = np.asanyarray(weights)[nonnan]
    else:
        nans = 0 if X.ndim == 1 else np.zeros(X.shape[1])

    # Fall back to max_val only when it was actually given; the previous
    # `minlength or max_val + 1` raised TypeError when both were None.
    if not minlength and max_val is not None:
        minlength = max_val + 1
    # Pass minlength only when set, so np.bincount uses its own default
    # (None is not accepted by recent numpy, 0 not by old numpy)
    kwargs = {'minlength': minlength} if minlength else {}
    return (np.bincount(X.astype(np.int32, copy=False),
                        weights=weights, **kwargs),
            nans)


def countnans(X, weights=None, axis=None, dtype=None, keepdims=False):
    """
    Count the NaN elements of `X` along the given axis.

    Parameters
    ----------
    X : array_like
    weights : ndarray, optional
        If its shape equals the shape of `X`, every NaN is weighted
        individually before summation. Otherwise the summed counts are
        multiplied by `weights` afterwards.
    axis, dtype, keepdims
        Passed on to the `sum` that does the counting.

    Returns
    -------
    counts
        (Weighted) number of NaNs.
    """
    data = np.asanyarray(X)
    nan_mask = np.isnan(data)
    if weights is None:
        return nan_mask.sum(axis=axis, dtype=dtype, keepdims=keepdims)

    per_element = weights.shape == data.shape
    if per_element:
        # Weight each NaN before counting
        nan_mask = nan_mask * weights
    totals = nan_mask.sum(axis=axis, dtype=dtype, keepdims=keepdims)
    if not per_element:
        # Weight the already-summed counts
        totals = totals * weights
    return totals


def contingency(X, y, max_X=None, max_y=None, weights=None, mask=None):
    """
    Compute the contingency matrices for each column of X (excluding the
    masked ones) versus the vector y.

    If the array is 1-dimensional, a 2d contingency matrix is returned. If
    the array is 2d, the function returns a 3d array, with the first
    dimension corresponding to column index (variable in the input array).

    The rows of each contingency matrix correspond to values in vector `y`,
    the columns correspond to values of the variable (column of `X`).

    A subset of columns can be selected by additional argument `mask`;
    columns that are masked out get all-zero placeholders in the result.

    The function also returns a count of NaN values per each value of `y`.

    Parameters
    ----------
    X : array_like
        With values in columns.
    y : 1d array
        Vector of true values.
    max_X : int
        The maximal value in the array. If None, the number of distinct
        non-NaN values in each column is used as the number of its values.
    max_y : int
        The maximal value in `y`. If None, the number of distinct values
        in `y` is used.
    weights : array_like
        Row weights. Not implemented: anything other than all-ones (or
        all-zeros) raises ValueError.
    mask : sequence
        Discrete columns of X.

    Returns
    -------
    contingencies: (m × ny × nx) array
        m number of columns of X (masked-out ones are zero-filled),
        ny number of values of y,
        nx number of values in a column of X.
    nans : array_like
        Number of nans in each column of X for each unique value of y.

    Raises
    ------
    ValueError
        If non-trivial `weights` are given.
    """
    # Reject every weight vector that would change the result. Checking
    # only the smallest weight let e.g. [1, 2] through, silently ignored.
    if weights is not None and np.any(weights) \
            and np.any(np.asanyarray(weights) != 1):
        raise ValueError('weights not yet supported')

    was_1d = False
    if X.ndim == 1:
        X = X[..., np.newaxis]
        was_1d = True

    contingencies, nans = [], []
    ny = np.unique(y).size if max_y is None else max_y + 1
    for i in range(X.shape[1]):
        if mask is not None and not mask[i]:
            # Placeholder; its width is only known after the loop when
            # max_X is not given
            contingencies.append(None)
            nans.append(np.zeros(ny))
            continue
        col = X[..., i]
        # Densify before touching the values: np.isnan() does not work on
        # sparse matrices
        if issparse(col):
            col = np.ravel(col.todense())
        col_nans = np.isnan(col)
        nx = np.unique(col[~col_nans]).size if max_X is None else max_X + 1
        # Encode each (x, y) pair as a single integer y + ny * x, count
        # them, and unfold the counts into a (ny, nx) matrix. Pairs with
        # NaN x become NaN and are dropped by bincount.
        contingencies.append(
            bincount(y + ny * col,
                     minlength=ny * nx)[0].reshape(nx, ny).T)
        nans.append(
            bincount(y[col_nans], minlength=ny)[0])

    # Fill in the placeholders of masked-out columns
    if max_X is not None:
        fill_width = max_X + 1
    else:
        fill_width = max((cont.shape[1] for cont in contingencies
                          if cont is not None), default=0)
    contingencies = [np.zeros((ny, fill_width)) if cont is None else cont
                     for cont in contingencies]

    if was_1d:
        return contingencies[0], nans[0]
    return np.array(contingencies), np.array(nans)


def stats(X, weights=None, compute_variance=False):
    """
    Compute min, max, #nans, mean and variance.

    Result is an array of shape (number of columns, 6) whose columns are
    (min, max, mean, variance, #nans, #non-nans). A 1-dimensional input is
    treated as a single column, giving a result of shape (1, 6).

    The mean and the number of nans and non-nans are weighted. Min, max
    and variance are computed on the unweighted data.

    Computation of variance requires an additional pass and is not enabled
    by default. Zeros are filled in instead of variance.

    For non-numeric data, falsy elements (e.g. empty strings) are counted
    as missing; min, max, mean and variance are reported as inf, -inf, 0
    and 0, respectively.

    Parameters
    ----------
    X : array_like, 1 or 2 dimensions
        Input array.
    weights : array_like, optional
        Row weights, array of the same length as `X`.
    compute_variance : bool, optional
        If set to True, the function also computes variance.

    Returns
    -------
    out : an array of shape (number of columns, 6)
        Computed (min, max, mean, variance or 0, #nans, #non-nans)

    Raises
    ------
    ValueError
        If the length of the weight vector does not match the length of the
        array
    """
    X = np.asanyarray(X)
    if X.ndim == 1:
        X = X[:, np.newaxis]
    n_rows, n_cols = X.shape

    if weights is not None:
        weights = np.asarray(weights, dtype=float).reshape(-1)
        if weights.size != n_rows:
            raise ValueError(
                'length of weights ({}) does not match the length of the '
                'array ({})'.format(weights.size, n_rows))

    is_numeric = np.issubdtype(X.dtype, np.number)
    # Non-numeric data has no NaN; a falsy element stands for a missing one
    missing = np.isnan(X) if is_numeric else ~X.astype(bool)
    if weights is None:
        nans = missing.sum(axis=0)
        non_nans = n_rows - nans
    else:
        nans = weights.dot(missing)
        non_nans = weights.sum() - nans

    if is_numeric:
        minima = np.nanmin(X, axis=0)
        maxima = np.nanmax(X, axis=0)
        if weights is None:
            means = np.nanmean(X, axis=0)
        else:
            # Weight the rows in the sums only; multiplying X itself by
            # the weights would also distort min and max
            defined = np.where(missing, 0, X)
            with np.errstate(divide='ignore', invalid='ignore'):
                means = weights.dot(defined) / non_nans
        variances = (np.nanvar(X, axis=0) if compute_variance
                     else np.zeros(n_cols))
    else:
        # Full-length columns: column_stack cannot combine scalars with
        # arrays of a different length
        minima = np.full(n_cols, np.inf)
        maxima = np.full(n_cols, -np.inf)
        means = np.zeros(n_cols)
        variances = np.zeros(n_cols)

    # Order matters: variance comes before the nan counts
    return np.column_stack((minima, maxima, means,
                            variances, nans, non_nans))


# Patch bottleneck to contain these additions
for func in (bincount, countnans, contingency, stats):
    # Fail fast if bottleneck already has an attribute of this name that was
    # not defined in this module, rather than silently shadowing it. The
    # getattr default (`bincount`, which lives in this module) makes a
    # missing attribute compare as "ours", so re-running the patch is fine.
    if getattr(bn, func.__name__, bincount).__module__ != func.__module__:
        raise DeprecationWarning('bottleneck got its own {}(); '
                                 'consider deprecating our own.'
                                 .format(func.__name__))
    setattr(bn, func.__name__, func)
6 changes: 3 additions & 3 deletions Orange/preprocess/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"""
import numpy as np
import sklearn.preprocessing as skl_preprocessing
import bottlechest
import bottleneck as bn

import Orange.data
from Orange.data import Table
Expand Down Expand Up @@ -198,8 +198,8 @@ def __call__(self, data):
data : an input data set
"""

oks = bottlechest.nanmin(data.X, axis=0) != \
bottlechest.nanmax(data.X, axis=0)
oks = bn.nanmin(data.X, axis=0) != \
bn.nanmax(data.X, axis=0)
atts = [data.domain.attributes[i] for i, ok in enumerate(oks) if ok]
domain = Orange.data.Domain(atts, data.domain.class_vars,
data.domain.metas)
Expand Down
8 changes: 4 additions & 4 deletions Orange/tests/test_contingency.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def _construct_sparse():
def test_sparse(self):
d = self._construct_sparse()
cont = contingency.Discrete(d, 5)
np.testing.assert_almost_equal(cont[0], [1, 0, 0])
np.testing.assert_almost_equal(cont[0], [2, 0, 0])
np.testing.assert_almost_equal(cont["b"], [0, 1, 1])
np.testing.assert_almost_equal(cont[2], [1, 0, 0])

Expand Down Expand Up @@ -193,7 +193,7 @@ def test_get_contingency(self):
d = self._construct_sparse()
cont = contingency.get_contingency(d, 5)
self.assertIsInstance(cont, contingency.Discrete)
np.testing.assert_almost_equal(cont[0], [1, 0, 0])
np.testing.assert_almost_equal(cont[0], [2, 0, 0])
np.testing.assert_almost_equal(cont["b"], [0, 1, 1])
np.testing.assert_almost_equal(cont[2], [1, 0, 0])

Expand All @@ -218,7 +218,7 @@ def test_get_contingencies(self):

cont = conts[5]
self.assertIsInstance(cont, contingency.Discrete)
np.testing.assert_almost_equal(cont[0], [1, 0, 0])
np.testing.assert_almost_equal(cont[0], [2, 0, 0])
np.testing.assert_almost_equal(cont["b"], [0, 1, 1])
np.testing.assert_almost_equal(cont[2], [1, 0, 0])

Expand All @@ -240,7 +240,7 @@ def test_get_contingencies(self):
self.assertEqual(len(conts), 10)
cont = conts[5]
self.assertIsInstance(cont, contingency.Discrete)
np.testing.assert_almost_equal(cont[0], [1, 0, 0])
np.testing.assert_almost_equal(cont[0], [2, 0, 0])
np.testing.assert_almost_equal(cont["b"], [0, 1, 1])
np.testing.assert_almost_equal(cont[2], [1, 0, 0])

Expand Down
16 changes: 16 additions & 0 deletions Orange/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,22 @@ def scale(values, min=0, max=1):
return (-np.nanmin(values) + values) / ptp * (max - min) + min


def one_hot(values, dtype=float, dim=None):
    """Return a one-hot transform of values

    Parameters
    ----------
    values : 1d array
        Integer values (or floats holding integers) in range 0..max.
    dtype
        dtype of the result.
    dim : int, optional
        Number of indicator columns. If None, it is `max(values) + 1`.
        Indexing fails with IndexError if `dim` is too small for `values`.

    Returns
    -------
    result
        2d array with ones in respective indicator columns. For empty
        `values`, an array with zero rows (and `dim` or 0 columns).
    """
    values = np.asanyarray(values)
    if values.size == 0:
        # np.max() has no value to return for an empty array
        return np.zeros((0, dim or 0), dtype=dtype)
    # Cast before computing the width: np.eye() requires an integer size,
    # while class values often arrive as floats
    indices = values.astype(int)
    width = int(indices.max()) + 1 if dim is None else dim
    return np.eye(width, dtype=dtype)[indices]


class Registry(type):
"""Metaclass that registers subtypes."""
def __new__(cls, name, bases, attrs):
Expand Down
2 changes: 1 addition & 1 deletion Orange/widgets/classify/tests/test_owlogisticregression.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import unittest

import bottlechest as bn
import bottleneck as bn

from Orange.data import Table
from Orange.classification import LogisticRegressionLearner
Expand Down
2 changes: 1 addition & 1 deletion codecov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ coverage:
status:
patch:
default:
target: '95'
target: '90'
project:
default:
target: auto
Expand Down
2 changes: 1 addition & 1 deletion requirements-core.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
numpy>=1.9.0
scipy>=0.11.0
scikit-learn>=0.17
bottlechest>=0.7.1
bottleneck>=1.0.0
# Reading Excel files
xlrd>=0.9.2
# Encoding detection
Expand Down
6 changes: 0 additions & 6 deletions scripts/build-osx-app.sh
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,6 @@ echo "Installing/updating setuptools and pip"
echo "======================================"
"$PIP" install 'setuptools==18.*' 'pip==7.*'

echo "Installing Bottlechest"
echo "======================"
"$PIP" install --find-links http://orange.biolab.si/download/files/wheelhouse/ \
--use-wheel --trusted-host orange.biolab.si \
Bottlechest

echo "Installing orangeqt"
echo "==================="
FDIR=$TEMPLATE/Contents/Frameworks
Expand Down
6 changes: 1 addition & 5 deletions scripts/windows/build-win-application.sh
Original file line number Diff line number Diff line change
Expand Up @@ -120,10 +120,6 @@ touch "$BUILDBASE"/requirements.txt
echo "
#:wheel: scikit-learn https://pypi.python.org/packages/b8/9a/02d5d76be66c57aaa9f917c87007b9b0bf486992cc7701512464d1ce11e9/scikit_learn-0.17.1-cp34-cp34m-win32.whl#md5=ab00daed7cdac4cb16ad0613b91be07e
scikit-learn==0.17.1
#:wheel: Bottlecheset https://dl.dropboxusercontent.com/u/100248799/Bottlechest-0.7.1-cp34-none-win32.whl#md5=629ba2a148dfa784d0e6817497d42e97
--find-links https://dl.dropboxusercontent.com/u/100248799/Bottlechest-0.7.1-cp34-none-win32.whl
Bottlechest==0.7.1
" > "$BUILDBASE"/requirements.txt

function __download_url {
Expand Down Expand Up @@ -291,7 +287,7 @@ function prepare_orange {
bdist_wheel -d "$BUILDBASE/wheelhouse"

# Ensure all install dependencies are available in the wheelhouse
prepare_req --only-binary numpy,scipy,scikit-learn,bottlechest .
prepare_req --only-binary numpy,scipy,scikit-learn .

echo "# Orange " >> "$BUILDBASE/requirements.txt"
echo "$name==$version" >> "$BUILDBASE/requirements.txt"
Expand Down

0 comments on commit fa9b426

Please sign in to comment.