diff --git a/requirements.txt b/requirements.txt index 2369ea96..a3f14ef4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -numpy>=1.13 +numpy<=1.16 scipy>=0.19 numba>=0.39 diff --git a/sparse/__init__.py b/sparse/__init__.py index 986371ff..45542843 100644 --- a/sparse/__init__.py +++ b/sparse/__init__.py @@ -1,4 +1,5 @@ from .coo import * +from .compressed import GXCS from .dok import DOK from .sparse_array import SparseArray from .utils import random @@ -13,4 +14,4 @@ _AUTO_DENSIFICATION_ENABLED = bool(int(os.environ.get('SPARSE_AUTO_DENSIFY', '0'))) _AUTO_WARN_ON_TOO_DENSE = bool(int(os.environ.get('SPARSE_WARN_ON_TOO_DENSE', '0'))) -del os \ No newline at end of file +del os diff --git a/sparse/compressed/__init__.py b/sparse/compressed/__init__.py new file mode 100644 index 00000000..e25afe3d --- /dev/null +++ b/sparse/compressed/__init__.py @@ -0,0 +1 @@ +from .compressed import GXCS diff --git a/sparse/compressed/compressed.py b/sparse/compressed/compressed.py new file mode 100644 index 00000000..2860269b --- /dev/null +++ b/sparse/compressed/compressed.py @@ -0,0 +1,389 @@ +import numpy as np +from numpy.lib.mixins import NDArrayOperatorsMixin +from functools import reduce +from operator import mul +from collections.abc import Iterable +import scipy.sparse as ss + +from ..sparse_array import SparseArray +from ..coo.common import linear_loc +from ..utils import normalize_axis, check_zero_fill_value +from ..coo.core import COO +from .convert import uncompress_dimension +from .indexing import getitem + + +def _from_coo(x, compressed_axes=None): + + if x.ndim == 0: + if compressed_axes is not None: + raise ValueError('no axes to compress for 0d array') + return ( + x.data, x.coords, []), x.shape, None, (), None, None, None, x.fill_value + + if x.ndim == 1: + if compressed_axes is not None: + raise ValueError('no axes to compress for 1d array') + return ( + x.data, x.coords[0], []), x.shape, None, None, None, None, None, x.fill_value + + compressed_axes = normalize_axis(compressed_axes, x.ndim) + if compressed_axes is None: + # defaults to best compression ratio + compressed_axes = (np.argmin(x.shape),) + + if not isinstance(compressed_axes, Iterable): + raise ValueError('compressed_axes must be an iterable') + if len(compressed_axes) == len(x.shape): + raise ValueError('cannot compress all axes') + if not np.array_equal( + np.unique(compressed_axes), sorted( + np.array(compressed_axes))): + raise ValueError('repeated axis in compressed_axes') + + axis_order = list(compressed_axes) + # array location where the uncompressed dimensions start + axisptr = len(compressed_axes) + axis_order.extend(np.setdiff1d(np.arange(len(x.shape)), compressed_axes)) + new_shape = np.array(x.shape)[axis_order] + row_size = np.prod(new_shape[:axisptr]) + col_size = np.prod(new_shape[axisptr:]) + compressed_shape = (row_size, col_size) + shape = x.shape + + x = x.transpose(axis_order) + linear = linear_loc(x.coords, new_shape) + order = np.argsort(linear) + # linearizing twice is unnecessary, fix needed + coords = x.reshape((compressed_shape)).coords + indptr = np.empty(row_size + 1, dtype=np.intp) + indptr[0] = 0 + np.cumsum(np.bincount(coords[0], minlength=row_size), out=indptr[1:]) + indices = coords[1] + data = x.data[order] + return ((data, indices, indptr), shape, compressed_shape, compressed_axes, axis_order, + new_shape, axisptr, x.fill_value) + + +class GXCS(SparseArray, NDArrayOperatorsMixin): + + __array_priority__ = 12 + + def __init__(self, arg, shape=None, 
compressed_axes=None, fill_value=0): + + if isinstance(arg, np.ndarray): + arg, shape, compressed_shape, compressed_axes, axis_order, reordered_shape, axisptr, fill_value = _from_coo( + COO(arg), compressed_axes) + + elif isinstance(arg, COO): + arg, shape, compressed_shape, compressed_axes, axis_order, reordered_shape, axisptr, fill_value = _from_coo( + arg, compressed_axes) + + if shape is None: + raise ValueError('missing `shape` argument') + + if len(shape) != 1: + + # if initializing directly with (data,indices,indptr) + compressed_axes = normalize_axis(compressed_axes, len(shape)) + + if compressed_axes is None: + raise ValueError('missing `compressed_axes` argument') + elif compressed_axes != () and len(compressed_axes) >= len(shape): + raise ValueError('cannot compress all axes') + if not np.array_equal( + np.unique(compressed_axes), sorted( + np.array(compressed_axes))): + raise ValueError('repeated axis in compressed_axes') + + axis_order = list(compressed_axes) + # array location where the uncompressed dimensions start + axisptr = len(compressed_axes) + axis_order.extend( + np.setdiff1d( + np.arange( + len(shape)), + compressed_axes)) + reordered_shape = np.array(shape)[axis_order] + row_size = np.prod(reordered_shape[:axisptr]) + col_size = np.prod(reordered_shape[axisptr:]) + compressed_shape = (row_size, col_size) + else: + compressed_axes = compressed_shape = axis_order = reordered_shape = axisptr = None + + self.data, self.indices, self.indptr = arg + self.shape = shape + self.compressed_shape = compressed_shape + self.compressed_axes = compressed_axes + self.axis_order = axis_order + self.axisptr = axisptr + self.reordered_shape = reordered_shape + self.fill_value = fill_value + self.dtype = self.data.dtype + + @classmethod + def from_numpy(cls, x, compressed_axes=None, fill_value=0): + coo = COO(x, fill_value=fill_value) + return cls.from_coo(coo, compressed_axes) + + @classmethod + def from_coo(cls, x, compressed_axes=None): + arg, shape, compressed_shape, compressed_axes, axis_order, reordered_shape, axisptr, fill_value = _from_coo( + x, compressed_axes) + return cls( + arg, + shape=shape, + compressed_axes=compressed_axes, + fill_value=fill_value) + + @classmethod + def from_scipy_sparse(cls, x): + if x.format == 'csc': + return cls((x.data, x.indices, x.indptr), + shape=x.shape, compressed_axes=(1,)) + else: + x = x.asformat('csr') + return cls((x.data, x.indices, x.indptr), + shape=x.shape, compressed_axes=(0,)) + + @classmethod + def from_iter(cls, x, shape=None, compressed_axes=None, fill_value=None): + return cls.from_coo( + COO.from_iter( + x, + shape, + fill_value), + compressed_axes=compressed_axes) + + @property + def nnz(self): + return self.data.shape[0] + + @property + def nbytes(self): + return self.data.nbytes + self.indices.nbytes + self.indptr.nbytes + + @property + def density(self): + return self.nnz / reduce(mul, self.shape, 1) + + @property + def ndim(self): + return len(self.shape) + + def __str__(self): + return ''.format( + self.shape, self.dtype, self.nnz, self.fill_value, self.compressed_axes) + + __repr__ = __str__ + + __getitem__ = getitem + + def change_compressed_axes(self, new_compressed_axes): + """ + changes the compressed axes in-place. 
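        A rough usage sketch (assuming only the ``'gxcs'`` format hook that this
        change set adds to :obj:`sparse.random`)::

            x = sparse.random((4, 5, 6), density=0.5, format='gxcs',
                              compressed_axes=(0,))
            x.change_compressed_axes((1, 2))
            assert x.compressed_axes == (1, 2)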
+ right now the space complexity feels a little high + """ + if self.ndim == 1: + raise NotImplementedError('no axes to compress for 1d array') + + new_compressed_axes = tuple(normalize_axis(new_compressed_axes[i], + self.ndim) for i in range(len(new_compressed_axes))) + + if len(new_compressed_axes) >= len(self.shape): + raise ValueError('cannot compress all axes') + if len(set(new_compressed_axes)) != len(new_compressed_axes): + raise ValueError('repeated axis in compressed_axes') + coo = self.tocoo() + arg, shape, compressed_shape, compressed_axes, axis_order, reordered_shape, axisptr, fill_value = _from_coo( + coo, new_compressed_axes) + self.data, self.indices, self.indptr = arg + self.compressed_shape = compressed_shape + self.compressed_axes = new_compressed_axes + self.axis_order = axis_order + self.reordered_shape = reordered_shape + self.axisptr = axisptr + + def tocoo(self): + if self.ndim == 1: + return COO(self.indices[None, :], self.data, + shape=self.shape, fill_value=self.fill_value) + uncompressed = uncompress_dimension(self.indptr) + coords = np.vstack((uncompressed, self.indices)) + order = np.argsort(self.axis_order) + return COO( + coords, + self.data, + shape=self.compressed_shape, + fill_value=self.fill_value).reshape( + self.reordered_shape).transpose(order) + + def todense(self): + if self.compressed_axes == (): + return np.full(self.shape, self.fill_value, self.dtype) + return self.tocoo().todense() + + def todok(self): + + from ..dok import DOK + return DOK.from_coo(self.tocoo()) # probably a temporary solution + + def to_scipy_sparse(self): + """ + Converts this :obj:`CSD` object into a :obj:`scipy.sparse.csr_matrix` or `scipy.sparse.csc_matrix`. + Returns + ------- + :obj:`scipy.sparse.csr_matrix` or `scipy.sparse.csc_matrix` + The converted Scipy sparse matrix. + Raises + ------ + ValueError + If the array is not two-dimensional. + ValueError + If all the array doesn't zero fill-values. + """ + + check_zero_fill_value(self) + + if self.ndim != 2: + raise ValueError( + "Can only convert a 2-dimensional array to a Scipy sparse matrix.") + + if 0 in self.compressed_axes: + return ss.csr_matrix( + (self.data, self.indices, self.indptr), shape=self.shape) + else: + return ss.csc_matrix( + (self.data, self.indices, self.indptr), shape=self.shape) + + def asformat(self, format): + """ + Convert this sparse array to a given format. + Parameters + ---------- + format : str + A format string. + Returns + ------- + out : SparseArray + The converted array. + Raises + ------ + NotImplementedError + If the format isn't supported. + """ + + if format == 'coo': + return self.tocoo() + elif format == 'dok': + return self.todok() + + raise NotImplementedError('The given format is not supported.') + + def maybe_densify(self, max_size=1000, min_density=0.25): + """ + Converts this :obj:`CSR` or `CSC` array to a :obj:`numpy.ndarray` if not too + costly. + Parameters + ---------- + max_size : int + Maximum number of elements in output + min_density : float + Minimum density of output + Returns + ------- + numpy.ndarray + The dense array. + Raises + ------- + ValueError + If the returned array would be too large. + """ + + if self.size <= max_size or self.density >= min_density: + return self.todense() + else: + raise ValueError("Operation would require converting " + "large sparse array to dense") + + def reshape(self, shape, order='C', compressed_axes=None): + """ + Returns a new :obj:`CSR` or `CSC` array that is a reshaped version of this array. 
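        A minimal sketch of the intended call (hypothetical shapes; as the
        comment in the body notes, the current implementation round-trips
        through :obj:`COO`)::

            s = sparse.random((3, 4), density=0.5, format='gxcs')
            t = s.reshape((2, 6))
            assert t.shape == (2, 6)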
+ Parameters + ---------- + shape : tuple[int] + The desired shape of the output array. + Returns + ------- + CSR or CSC + The reshaped output array. + See Also + -------- + numpy.ndarray.reshape : The equivalent Numpy function. + sparse.COO.reshape: The equivalent COO function. + Notes + ----- + The :code:`order` parameter is provided just for compatibility with + Numpy and isn't actually supported. + + """ + + if order not in {'C', None}: + raise NotImplementedError("The 'order' parameter is not supported") + if any(d == -1 for d in shape): + extra = int(self.size / + np.prod([d for d in shape if d != -1])) + shape = tuple([d if d != -1 else extra for d in shape]) + + if self.shape == shape: + return self + + if self.size != reduce(mul, shape, 1): + raise ValueError( + 'cannot reshape array of size {} into shape {}'.format( + self.size, shape)) + + # there's likely a way to do this without decompressing to COO + coo = self.tocoo().reshape(shape) + return GXCS.from_coo(coo, compressed_axes) + + def resize(self, *args, refcheck=True, compressed_axes=None): + """ + This method changes the shape and size of an array in-place. + + Parameters + ---------- + args : tuple, or series of integers + The desired shape of the output array. + + See Also + -------- + numpy.ndarray.resize : The equivalent Numpy function. + sparse.COO.resize : The equivalent COO function. + """ + + if len(args) == 1 and isinstance(args[0], tuple): + shape = args[0] + elif all(isinstance(arg, int) for arg in args): + shape = tuple(args) + else: + raise ValueError('Invalid input') + + if any(d < 0 for d in shape): + raise ValueError('negative dimensions not allowed') + + if self.shape == shape: + return + + # there's likely a way to do this without decompressing to COO + coo = self.tocoo() + coo.resize(shape) + arg, shape, compressed_shape, compressed_axes, axis_order, reordered_shape, axisptr, fill_value = _from_coo( + coo, compressed_axes) + self.data, self.indices, self.indptr = arg + self.shape = shape + self.compressed_shape = compressed_shape + self.compressed_axes = compressed_axes + self.axis_order = axis_order + self.reordered_shape = reordered_shape + self.axisptr = axisptr diff --git a/sparse/compressed/convert.py b/sparse/compressed/convert.py new file mode 100644 index 00000000..86991e41 --- /dev/null +++ b/sparse/compressed/convert.py @@ -0,0 +1,62 @@ +import numpy as np +import numba +from numba.typed import List + + +def convert_to_flat(inds, shape, axisptr): + inds = [np.array(ind) for ind in inds] + if any(ind.ndim > 1 for ind in inds): + raise IndexError('Only one-dimensional iterable indices supported.') + uncompressed_inds = inds[axisptr:] + cols = np.empty( + np.prod([ind.size for ind in uncompressed_inds]), dtype=np.intp) + shape_bins = transform_shape(shape[axisptr:]) + increments = List() + for i in range(len(uncompressed_inds)): + increments.append( + (uncompressed_inds[i] * shape_bins[i]).astype(np.int32)) + operations = np.prod([ind.shape[0] for ind in increments[:-1]]) + return compute_flat(increments, cols, operations) + + +@numba.jit(nopython=True, nogil=True) +def compute_flat(increments, cols, operations): # pragma: no cover + start = 0 + end = increments[-1].shape[0] + positions = np.zeros(len(increments)-1, dtype=np.intp) + pos = len(increments)-2 + for i in range(operations): + if i != 0 and positions[pos] == increments[pos].shape[0]: + positions[pos] = 0 + pos -= 1 + positions[pos] += 1 + pos += 1 + to_add = np.array([increments[i][positions[i]] + for i in 
range(len(increments)-1)]).sum() + cols[start:end] = increments[-1] + to_add + positions[pos] += 1 + start += increments[-1].shape[0] + end += increments[-1].shape[0] + return cols + + +def transform_shape(shape): + """ + turns a shape into the linearized increments that + it represents. For example, given (5,5,5), it returns + np.array([25,5,1]). + """ + shape_bins = np.empty(len(shape), dtype=np.intp) + shape_bins[-1] = 1 + for i in range(len(shape)-2, -1, -1): + shape_bins[i] = np.prod(shape[i+1:]) + return shape_bins + + +@numba.jit(nopython=True, nogil=True) +def uncompress_dimension(indptr): # pragma: no cover + """converts an index pointer array into an array of coordinates""" + uncompressed = np.empty(indptr[-1], dtype=np.intp) + for i in range(len(indptr) - 1): + uncompressed[indptr[i]:indptr[i + 1]] = i + return uncompressed diff --git a/sparse/compressed/indexing.py b/sparse/compressed/indexing.py new file mode 100644 index 00000000..96a32a1a --- /dev/null +++ b/sparse/compressed/indexing.py @@ -0,0 +1,187 @@ +import numpy as np +import numba +from numbers import Integral +from itertools import zip_longest +from collections.abc import Iterable +from ..slicing import normalize_index +from .convert import convert_to_flat, uncompress_dimension + + +def getitem(x, key): + """ + + + """ + from .compressed import GXCS + + if x.ndim == 1: + coo = x.tocoo()[key] + return GXCS.from_coo(coo) + + key = list(normalize_index(key, x.shape)) + + # zip_longest so things like x[..., None] are picked up. + if len(key) != 0 and all(isinstance(k, slice) and k == slice(0, dim, 1) + for k, dim in zip_longest(key, x.shape)): + return x + + # return a single element + if all(isinstance(k, int) for k in key): # indexing for a single element + key = np.array(key)[x.axis_order] # reordering the input + ind = np.ravel_multi_index(key, x.reordered_shape) + row, col = np.unravel_index(ind, x.compressed_shape) + current_row = x.indices[x.indptr[row]:x.indptr[row + 1]] + item = np.searchsorted(current_row, col) + if not (item >= current_row.size or current_row[item] != col): + item += x.indptr[row] + return x.data[item] + return x.fill_value + + shape = [] + compressed_inds = np.zeros(len(x.shape), dtype=np.bool) + uncompressed_inds = np.zeros(len(x.shape), dtype=np.bool) + shape_key = np.zeros(len(x.shape), dtype=np.intp) + + Nones_removed = [k for k in key if k is not None] + count = 0 + for i, ind in enumerate(Nones_removed): + if isinstance(ind, Integral): + continue + elif ind is None: + # handle the None cases at the end + continue + elif isinstance(ind, slice): + shape_key[i] = count + shape.append(len(range(ind.start, ind.stop, ind.step))) + if i in x.compressed_axes: + compressed_inds[i] = True + else: + uncompressed_inds[i] = True + elif isinstance(ind, Iterable): + shape_key[i] = count + shape.append(len(ind)) + if i in x.compressed_axes: + compressed_inds[i] = True + else: + uncompressed_inds[i] = True + count += 1 + + reordered_key = [Nones_removed[i] for i in x.axis_order] + + # prepare for converting to flat indices + for i, ind in enumerate(reordered_key[:x.axisptr]): + if isinstance(ind, slice): + reordered_key[i] = range(ind.start, ind.stop, ind.step) + for i, ind in enumerate(reordered_key[x.axisptr:]): + if isinstance(ind, Integral): + reordered_key[i + x.axisptr] = [ind] + elif isinstance(ind, slice): + reordered_key[i + + x.axisptr] = np.arange(ind.start, ind.stop, ind.step) + + # find starts and ends of rows + a = x.indptr[:-1].reshape(x.reordered_shape[:x.axisptr]) + b = 
x.indptr[1:].reshape(x.reordered_shape[:x.axisptr]) + starts = a[tuple(reordered_key[:x.axisptr])].flatten() + ends = b[tuple(reordered_key[:x.axisptr])].flatten() + + shape = np.array(shape) + + cols = convert_to_flat(reordered_key, x.reordered_shape, x.axisptr) + + if np.any(compressed_inds): + compressed_axes = shape_key[compressed_inds] + + if len(compressed_axes) == 1: + row_size = shape[compressed_axes] + else: + row_size = np.prod(shape[compressed_axes]) + + else: # only uncompressed axes + compressed_axes = (0,) # defaults to 0 + row_size = 1 # this doesn't matter + + if not np.any(uncompressed_inds): # only indexing compressed axes + compressed_axes = (0,) # defaults to 0 + row_size = starts.size + + indptr = np.empty(row_size + 1, dtype=np.intp) + indptr[0] = 0 + arg = get_array_selection(x.data, x.indices, indptr, starts, ends, cols) + + data, indices, indptr = arg + size = np.prod(shape[1:]) + + if not np.any(uncompressed_inds): # only indexing compressed axes + uncompressed = uncompress_dimension(indptr) + if len(shape) == 1: + indices = uncompressed + indptr = None + else: + indices = uncompressed % size + indptr = np.empty(shape[0] + 1, dtype=np.intp) + indptr[0] = 0 + np.cumsum(np.bincount(uncompressed // + size, minlength=shape[0]), out=indptr[1:]) + if not np.any(compressed_inds): + + if len(shape) == 1: + indptr = None + else: + uncompressed = indices // size + indptr = np.empty(shape[0] + 1, dtype=np.intp) + indptr[0] = 0 + np.cumsum(np.bincount(uncompressed, + minlength=shape[0]), out=indptr[1:]) + indices = indices % size + + arg = (data, indices, indptr) + + compressed_axes = np.array(compressed_axes) + shape = shape.tolist() + for i in range(len(key)): + if key[i] is None: + shape.insert(i, 1) + compressed_axes[compressed_axes >= i] += 1 + + compressed_axes = tuple(compressed_axes) + shape = tuple(shape) + + if len(shape) == 1: + compressed_axes = None + + return GXCS( + arg, + shape=shape, + compressed_axes=compressed_axes, + fill_value=x.fill_value) + + +@numba.jit(nopython=True, nogil=True) +def get_array_selection(arr_data, arr_indices, indptr, starts, ends, col): # pragma: no cover + """ + This is a very general algorithm to be used when more optimized methods don't apply. + It performs a binary search for each of the requested elements. + Consequently it roughly scales by O(n log nnz per row) where n is the number of requested elements and + nnz per row is the number of nonzero elements in that row. 
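    For example (illustrative values only): with ``arr_data=[10, 20, 30]``,
    ``arr_indices=[0, 2, 3]`` and row pointers ``[0, 2, 2, 3]``, selecting the
    rows described by ``starts=[0, 2]`` and ``ends=[2, 3]`` with flat columns
    ``col=[2, 3]`` yields ``data=[20, 30]``, ``indices=[0, 1]`` and
    ``indptr=[0, 1, 2]`` (with ``indptr[0]`` pre-set to 0 by the caller, as
    ``getitem`` does).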
+ """ + indices = [] + ind_list = [] + for i, (start, end) in enumerate(zip(starts, ends)): + inds = [] + current_row = arr_indices[start:end] + if len(current_row) == 0: + indptr[i + 1] = indptr[i] + continue + for c in range(len(col)): + s = np.searchsorted(current_row, col[c]) + if not (s >= current_row.size or current_row[s] != col[c]): + s += start + inds.append(s) + indices.append(c) + ind_list.extend(inds) + indptr[i + 1] = indptr[i] + len(inds) + ind_list = np.array(ind_list, dtype=np.int64) + indices = np.array(indices) + data = arr_data[ind_list] + return (data, indices, indptr) diff --git a/sparse/coo/core.py b/sparse/coo/core.py index 2b260968..e4ac97b9 100644 --- a/sparse/coo/core.py +++ b/sparse/coo/core.py @@ -8,7 +8,6 @@ import numpy as np import scipy.sparse from numpy.lib.mixins import NDArrayOperatorsMixin - import numba from .common import dot, matmul @@ -222,7 +221,8 @@ def __init__(self, coords, data=None, shape=None, has_duplicates=True, self.data = np.broadcast_to(self.data, self.coords.shape[1]) if shape and not self.coords.size: - self.coords = np.zeros((len(shape) if isinstance(shape, Iterable) else 1, 0), dtype=np.uint64) + self.coords = np.zeros((len(shape) if isinstance( + shape, Iterable) else 1, 0), dtype=np.uint64) if shape is None: if self.coords.nbytes: @@ -693,7 +693,8 @@ def reduce(self, method, axis=(0,), keepdims=False, **kwargs): """ axis = normalize_axis(axis, self.ndim) - zero_reduce_result = method.reduce([self.fill_value, self.fill_value], **kwargs) + zero_reduce_result = method.reduce( + [self.fill_value, self.fill_value], **kwargs) reduce_super_ufunc = None if not equivalent(zero_reduce_result, self.fill_value): @@ -717,7 +718,8 @@ def reduce(self, method, axis=(0,), keepdims=False, **kwargs): a = a.reshape((np.prod([self.shape[d] for d in neg_axis], dtype=np.intp), np.prod([self.shape[d] for d in axis], dtype=np.intp))) - result, inv_idx, counts = _grouped_reduce(a.data, a.coords[0], method, **kwargs) + result, inv_idx, counts = _grouped_reduce( + a.data, a.coords[0], method, **kwargs) result_fill_value = self.fill_value @@ -726,7 +728,8 @@ def reduce(self, method, axis=(0,), keepdims=False, **kwargs): result[missing_counts] = method(result[missing_counts], self.fill_value, **kwargs) else: - result = method(result, reduce_super_ufunc(self.fill_value, a.shape[1] - counts)).astype(result.dtype) + result = method(result, reduce_super_ufunc( + self.fill_value, a.shape[1] - counts)).astype(result.dtype) result_fill_value = reduce_super_ufunc(self.fill_value, a.shape[1]) coords = a.coords[0:1, inv_idx] @@ -1656,26 +1659,21 @@ def linear_loc(self): def reshape(self, shape, order='C'): """ Returns a new :obj:`COO` array that is a reshaped version of this array. - Parameters ---------- shape : tuple[int] The desired shape of the output array. - Returns ------- COO The reshaped output array. - See Also -------- numpy.ndarray.reshape : The equivalent Numpy function. - Notes ----- The :code:`order` parameter is provided just for compatibility with Numpy and isn't actually supported. 
- Examples -------- >>> s = COO.from_numpy(np.arange(25)) @@ -1706,7 +1704,8 @@ def reshape(self, shape, order='C'): return self if self.size != reduce(operator.mul, shape, 1): - raise ValueError('cannot reshape array of size {} into shape {}'.format(self.size, shape)) + raise ValueError( + 'cannot reshape array of size {} into shape {}'.format(self.size, shape)) if self._cache is not None: for sh, value in self._cache['reshape']: @@ -1734,7 +1733,6 @@ def reshape(self, shape, order='C'): def resize(self, *args, refcheck=True): """ This method changes the shape and size of an array in-place. - Parameters ---------- args : tuple, or series of integers @@ -1799,7 +1797,8 @@ def to_scipy_sparse(self): check_zero_fill_value(self) if self.ndim != 2: - raise ValueError("Can only convert a 2-dimensional array to a Scipy sparse matrix.") + raise ValueError( + "Can only convert a 2-dimensional array to a Scipy sparse matrix.") result = scipy.sparse.coo_matrix((self.data, (self.coords[0], @@ -2155,7 +2154,7 @@ def nonzero(self): check_zero_fill_value(self) return tuple(self.coords) - def asformat(self, format): + def asformat(self, format, compressed_axes=None): """ Convert this sparse array to a given format. @@ -2174,6 +2173,13 @@ def asformat(self, format): NotImplementedError If the format isn't supported. """ + from ..compressed import GXCS + if format == 'gxcs' or format is GXCS: + return GXCS.from_coo(self, compressed_axes=compressed_axes) + elif compressed_axes is not None: + raise ValueError( + 'compressed_axes is not supported for {} format'.format(format)) + if format == 'coo' or format is COO: return self diff --git a/sparse/utils.py b/sparse/utils.py index 0ed9bc3f..439ba5e8 100644 --- a/sparse/utils.py +++ b/sparse/utils.py @@ -43,7 +43,8 @@ def assert_eq(x, y, check_nnz=True, compare_dtype=True, **kwargs): def assert_nnz(s, x): - fill_value = s.fill_value if hasattr(s, 'fill_value') else _zero_of_dtype(s.dtype) + fill_value = s.fill_value if hasattr( + s, 'fill_value') else _zero_of_dtype(s.dtype) assert np.sum(~equivalent(x, fill_value)) == s.nnz @@ -74,6 +75,7 @@ def random( random_state=None, data_rvs=None, format='coo', + compressed_axes=None, fill_value=None ): """ Generate a random sparse multidimensional array @@ -135,6 +137,10 @@ def random( nnz = int(elements * density) + if format != 'gxcs' and compressed_axes is not None: + raise ValueError( + 'compressed_axes is not supported for {} format'.format(format)) + if random_state is None: random_state = np.random elif isinstance(random_state, Integral): @@ -157,9 +163,10 @@ def random( data = data_rvs(nnz) - ar = COO(ind[None, :], data, shape=elements, fill_value=fill_value).reshape(shape) + ar = COO(ind[None, :], data, shape=elements, + fill_value=fill_value).reshape(shape) - return ar.asformat(format) + return ar.asformat(format, compressed_axes=compressed_axes) def isscalar(x): @@ -205,7 +212,8 @@ def normalize_axis(axis, ndim): axis += ndim if axis >= ndim or axis < 0: - raise ValueError('Invalid axis index %d for ndim=%d' % (axis, ndim)) + raise ValueError( + 'Invalid axis index %d for ndim=%d' % (axis, ndim)) return axis @@ -255,7 +263,8 @@ def equivalent(x, y): # Can contain NaNs # FIXME: Complex floats and np.void with multiple values can't be compared properly. 
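    # A tiny illustration of the NaN-aware check below (values assumed):
    #   equivalent(np.array([np.nan, 1.0]), np.array([np.nan, 2.0]))
    #   -> array([ True, False])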
- return (x == y) | ((x != x) & (y != y)) # lgtm [py/comparison-of-identical-expressions] + # lgtm [py/comparison-of-identical-expressions] + return (x == y) | ((x != x) & (y != y)) def check_zero_fill_value(*args): diff --git a/tests/test_compressed.py b/tests/test_compressed.py new file mode 100644 index 00000000..04d718e0 --- /dev/null +++ b/tests/test_compressed.py @@ -0,0 +1,181 @@ +import sparse +import pytest +import numpy as np +import scipy + +from sparse import GXCS +from sparse.utils import assert_eq + + +@pytest.mark.parametrize('a,b', [ + [(3, 4), (5, 5)], + [(12,), (3, 4)], + [(12,), (3, 6)], + [(5, 5, 5), (6, 6, 6)], + [(3, 4), (9, 4)], + [(5,), (4,)], + [(2, 3, 4, 5), (2, 3, 4, 5, 6)], + [(100,), (5, 5)], + [(2, 3, 4, 5), (20, 6)], + [(), ()], +]) +def test_resize(a, b): + s = sparse.random(a, density=0.5, format='gxcs') + orig_size = s.size + x = s.todense() + x = np.resize(x, b) + s.resize(b) + temp = x.reshape(x.size) + temp[orig_size:] = s.fill_value + assert isinstance(s, sparse.SparseArray) + assert_eq(x, s) + + +@pytest.mark.parametrize('a,b', [ + [(3, 4), (3, 4)], + [(12,), (3, 4)], + [(12,), (3, -1)], + [(3, 4), (12,)], + [(3, 4), (-1, 4)], + [(3, 4), (3, -1)], + [(2, 3, 4, 5), (8, 15)], + [(2, 3, 4, 5), (24, 5)], + [(2, 3, 4, 5), (20, 6)], + [(), ()], +]) +def test_reshape(a, b): + s = sparse.random(a, density=0.5, format='gxcs') + x = s.todense() + + assert_eq(x.reshape(b), s.reshape(b)) + + +def test_reshape_same(): + s = sparse.random((3, 5), density=0.5, format='gxcs') + + assert s.reshape(s.shape) is s + + +def test_to_scipy_sparse(): + s = sparse.random((3, 5), density=0.5, format='gxcs', compressed_axes=(0,)) + a = s.to_scipy_sparse() + b = scipy.sparse.csr_matrix(s.todense()) + + assert_eq(a, b) + + +def test_tocoo(): + coo = sparse.random((5, 6), density=.5) + b = GXCS.from_coo(coo) + + assert_eq(b.tocoo(), coo) + + +@pytest.mark.parametrize('index', [ + # Integer + 0, + 1, + -1, + (1, 1, 1), + # Pure slices + (slice(0, 2),), + (slice(None, 2), slice(None, 2)), + (slice(1, None), slice(1, None)), + (slice(None, None),), + (slice(None, None, -1),), + (slice(None, 2, -1), slice(None, 2, -1)), + (slice(1, None, 2), slice(1, None, 2)), + (slice(None, None, 2),), + (slice(None, 2, -1), slice(None, 2, -2)), + (slice(1, None, 2), slice(1, None, 1)), + (slice(None, None, -2),), + # Combinations + (0, slice(0, 2),), + (slice(0, 1), 0), + (None, slice(1, 3), 0), + (slice(0, 3), None, 0), + (slice(1, 2), slice(2, 4)), + (slice(1, 2), slice(None, None)), + (slice(1, 2), slice(None, None), 2), + (slice(1, 2, 2), slice(None, None), 2), + (slice(1, 2, None), slice(None, None, 2), 2), + (slice(1, 2, -2), slice(None, None), -2), + (slice(1, 2, None), slice(None, None, -2), 2), + (slice(1, 2, -1), slice(None, None), -1), + (slice(1, 2, None), slice(None, None, -1), 2), + (slice(2, 0, -1), slice(None, None), -1), + (slice(-2, None, None),), + (slice(-1, None, None), slice(-2, None, None)), + # With ellipsis + (Ellipsis, slice(1, 3)), + (1, Ellipsis, slice(1, 3)), + (slice(0, 1), Ellipsis), + (Ellipsis, None), + (None, Ellipsis), + (1, Ellipsis), + (1, Ellipsis, None), + (1, 1, 1, Ellipsis), + (Ellipsis, 1, None), + # Pathological - Slices larger than array + (slice(None, 1000)), + (slice(None), slice(None, 1000)), + (slice(None), slice(1000, -1000, -1)), + (slice(None), slice(1000, -1000, -50)), + # Pathological - Wrong ordering of start/stop + (slice(5, 0),), + (slice(0, 5, -1),), +]) +def test_slicing(index): + s = sparse.random((2, 3, 4), density=0.5, format='gxcs') + x 
= s.todense() + + assert_eq(x[index], s[index]) + + +@pytest.mark.parametrize('index', [ + ([1, 0], 0), + (1, [0, 2]), + (0, [1, 0], 0), + (1, [2, 0], 0), + ([True, False], slice(1, None), slice(-2, None)), + (slice(1, None), slice(-2, None), [True, False, True, False]), + ([1, 0],), + (Ellipsis, [2, 1, 3],), + (slice(None), [2, 1, 2],), + (1, [2, 0, 1],), +]) +def test_advanced_indexing(index): + s = sparse.random((2, 3, 4), density=0.5, format='gxcs') + x = s.todense() + + assert_eq(x[index], s[index]) + + +@pytest.mark.parametrize('index', [ + (Ellipsis, Ellipsis), + (1, 1, 1, 1), + (slice(None),) * 4, + 5, + -5, + 'foo', + [True, False, False], + 0.5, + [0.5], + {'potato': 'kartoffel'}, + ([[0, 1]],), +]) +def test_slicing_errors(index): + s = sparse.random((2, 3, 4), density=0.5, format='gxcs') + + with pytest.raises(IndexError): + s[index] + + +def test_change_compressed_axes(): + coo = sparse.random((3, 4, 5), density=.5) + s = GXCS.from_coo(coo, compressed_axes=(0, 1)) + b = GXCS.from_coo(coo, compressed_axes=(1, 2)) + + s.change_compressed_axes((1, 2)) + + assert_eq(s, b)
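
A short end-to-end sketch tying the new pieces together. It is not part of this
change set's test suite and only uses APIs introduced above (``sparse.random``
with ``format='gxcs'``, ``todense``, ``tocoo``, ``to_scipy_sparse`` and the
``indptr`` attribute); treat it as illustrative rather than definitive.

def test_gxcs_roundtrip_sketch():
    # 3-d case: densifying a random GXCS array reproduces the same values.
    s = sparse.random((4, 5, 6), density=0.3, format='gxcs',
                      compressed_axes=(0, 1))
    assert_eq(s.todense(), s)

    # The CSR-style index pointer satisfies
    # indptr[i + 1] - indptr[i] == number of stored entries in compressed row i.
    assert np.diff(s.indptr).sum() == s.nnz

    # 2-d case with axis 0 compressed behaves like a scipy CSR matrix.
    m = sparse.random((4, 6), density=0.3, format='gxcs',
                      compressed_axes=(0,))
    assert_eq(m.to_scipy_sparse(), scipy.sparse.csr_matrix(m.todense()))
    assert_eq(m.tocoo(), sparse.COO.from_numpy(m.todense()))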