From f1fdf099386ed8ef2cf5b1c8a5ab848ac5683647 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 24 Mar 2021 20:49:31 -0700 Subject: [PATCH 1/7] REF: SingleBlockManager dont subclass BlockManager --- pandas/_typing.py | 2 +- pandas/core/frame.py | 20 +++ pandas/core/generic.py | 20 --- pandas/core/internals/managers.py | 251 +++++++++++++++++------------- pandas/core/series.py | 8 +- 5 files changed, 162 insertions(+), 139 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index f90ef33434773..7ab37beff165c 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -185,5 +185,5 @@ ] # internals -Manager = Union["ArrayManager", "BlockManager"] +Manager = Union["ArrayManager", "BlockManager", "SingleBlockManager"] SingleManager = Union["SingleArrayManager", "SingleBlockManager"] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 484b01f2c04f0..77845ef11c7ca 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -556,6 +556,7 @@ class DataFrame(NDFrame, OpsMixin): _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray) _accessors: Set[str] = {"sparse"} _hidden_attrs: FrozenSet[str] = NDFrame._hidden_attrs | frozenset([]) + _mgr: Union[BlockManager, ArrayManager] @property def _constructor(self) -> Type[DataFrame]: @@ -3530,6 +3531,25 @@ def _get_value(self, index, col, takeable: bool = False) -> Scalar: index = self.index.get_loc(index) return self._get_value(index, col, takeable=True) + def _get_item_cache(self, item): + """Return the cached item, item represents a label indexer.""" + cache = self._item_cache + res = cache.get(item) + if res is None: + # All places that call _get_item_cache have unique columns, + # pending resolution of GH#33047 + + loc = self.columns.get_loc(item) + values = self._mgr.iget(loc) + res = self._box_col_values(values, loc).__finalize__(self) + + cache[item] = res + res._set_as_cached(item, self) + + # for a chain + res._is_copy = self._is_copy + return res + def __setitem__(self, key, value): key = com.apply_if_callable(key, self) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6b4e3c7caef50..dcb90cf7437bd 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3893,26 +3893,6 @@ class animal locomotion def __getitem__(self, item): raise AbstractMethodError(self) - @final - def _get_item_cache(self, item): - """Return the cached item, item represents a label indexer.""" - cache = self._item_cache - res = cache.get(item) - if res is None: - # All places that call _get_item_cache have unique columns, - # pending resolution of GH#33047 - - loc = self.columns.get_loc(item) - values = self._mgr.iget(loc) - res = self._box_col_values(values, loc).__finalize__(self) - - cache[item] = res - res._set_as_cached(item, self) - - # for a chain - res._is_copy = self._is_copy - return res - def _slice(self: FrameOrSeries, slobj: slice, axis=0) -> FrameOrSeries: """ Construct a slice of this container. diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 3c8d942554575..a567410684bb0 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -12,6 +12,7 @@ Optional, Sequence, Tuple, + Type, TypeVar, Union, ) @@ -86,10 +87,10 @@ # TODO: flexible with index=None and/or items=None -T = TypeVar("T", bound="BlockManager") +T = TypeVar("T", bound="_BlockManager") -class BlockManager(DataManager): +class _BlockManager(DataManager): """ Core internal data structure to implement DataFrame, Series, etc. @@ -151,59 +152,18 @@ class BlockManager(DataManager): _blknos: np.ndarray _blklocs: np.ndarray + blocks: tuple[Block, ...] + axes: List[Index] # Non-trivially faster than a property - ndim = 2 # overridden by SingleBlockManager + ndim: int - def __init__( - self, - blocks: Sequence[Block], - axes: Sequence[Index], - verify_integrity: bool = True, - ): - self.axes = [ensure_index(ax) for ax in axes] - self.blocks: Tuple[Block, ...] = tuple(blocks) - - for block in blocks: - if self.ndim != block.ndim: - raise AssertionError( - f"Number of Block dimensions ({block.ndim}) must equal " - f"number of axes ({self.ndim})" - ) - - if verify_integrity: - self._verify_integrity() - - # Populate known_consolidate, blknos, and blklocs lazily - self._known_consolidated = False - # error: Incompatible types in assignment (expression has type "None", - # variable has type "ndarray") - self._blknos = None # type: ignore[assignment] - # error: Incompatible types in assignment (expression has type "None", - # variable has type "ndarray") - self._blklocs = None # type: ignore[assignment] + def __init__(self, blocks, axes, verify_integrity=True): + raise NotImplementedError @classmethod - def _simple_new(cls, blocks: Tuple[Block, ...], axes: List[Index]): - """ - Fastpath constructor; does NO validation. - """ - obj = cls.__new__(cls) - obj.axes = axes - obj.blocks = blocks - - # Populate known_consolidate, blknos, and blklocs lazily - obj._known_consolidated = False - obj._blknos = None - obj._blklocs = None - return obj - - @classmethod - def from_blocks(cls, blocks: List[Block], axes: List[Index]): - """ - Constructor for BlockManager and SingleBlockManager with same signature. - """ - return cls(blocks, axes, verify_integrity=False) + def from_blocks(cls: Type[T], blocks: List[Block], axes: List[Index]) -> T: + raise NotImplementedError @property def blknos(self): @@ -342,7 +302,7 @@ def __getstate__(self): return axes_array, block_values, block_items, extra_state def __setstate__(self, state): - def unpickle_block(values, mgr_locs, ndim: int): + def unpickle_block(values, mgr_locs, ndim: int) -> Block: # TODO(EA2D): ndim would be unnecessary with 2D EAs # older pickles may store e.g. DatetimeIndex instead of DatetimeArray values = extract_array(values, extract_numpy=True) @@ -465,12 +425,6 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: return type(self).from_blocks(result_blocks, [self.axes[0], index]) - def operate_blockwise(self, other: BlockManager, array_op) -> BlockManager: - """ - Apply array_op blockwise with another (aligned) BlockManager. - """ - return operate_blockwise(self, other, array_op) - def apply( self: T, f, @@ -538,12 +492,12 @@ def apply( return type(self).from_blocks(result_blocks, self.axes) def quantile( - self, + self: T, *, qs: Float64Index, axis: int = 0, interpolation="linear", - ) -> BlockManager: + ) -> T: """ Iterate over blocks applying quantile reduction. This routine is intended for reduction type operations and @@ -577,7 +531,7 @@ def quantile( return type(self)(blocks, new_axes) - def where(self, other, cond, align: bool, errors: str) -> BlockManager: + def where(self: T, other, cond, align: bool, errors: str) -> T: if align: align_keys = ["other", "cond"] else: @@ -592,7 +546,7 @@ def where(self, other, cond, align: bool, errors: str) -> BlockManager: errors=errors, ) - def setitem(self, indexer, value) -> BlockManager: + def setitem(self: T, indexer, value) -> T: return self.apply("setitem", indexer=indexer, value=value) def putmask(self, mask, new, align: bool = True): @@ -610,38 +564,38 @@ def putmask(self, mask, new, align: bool = True): new=new, ) - def diff(self, n: int, axis: int) -> BlockManager: + def diff(self: T, n: int, axis: int) -> T: axis = self._normalize_axis(axis) return self.apply("diff", n=n, axis=axis) - def interpolate(self, **kwargs) -> BlockManager: + def interpolate(self: T, **kwargs) -> T: return self.apply("interpolate", **kwargs) - def shift(self, periods: int, axis: int, fill_value) -> BlockManager: + def shift(self: T, periods: int, axis: int, fill_value) -> T: axis = self._normalize_axis(axis) if fill_value is lib.no_default: fill_value = None return self.apply("shift", periods=periods, axis=axis, fill_value=fill_value) - def fillna(self, value, limit, inplace: bool, downcast) -> BlockManager: + def fillna(self: T, value, limit, inplace: bool, downcast) -> T: return self.apply( "fillna", value=value, limit=limit, inplace=inplace, downcast=downcast ) - def downcast(self) -> BlockManager: + def downcast(self: T) -> T: return self.apply("downcast") - def astype(self, dtype, copy: bool = False, errors: str = "raise") -> BlockManager: + def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T: return self.apply("astype", dtype=dtype, copy=copy, errors=errors) def convert( - self, + self: T, copy: bool = True, datetime: bool = True, numeric: bool = True, timedelta: bool = True, - ) -> BlockManager: + ) -> T: return self.apply( "convert", copy=copy, @@ -650,7 +604,7 @@ def convert( timedelta=timedelta, ) - def replace(self, to_replace, value, inplace: bool, regex: bool) -> BlockManager: + def replace(self: T, to_replace, value, inplace: bool, regex: bool) -> T: assert np.ndim(value) == 0, value return self.apply( "replace", to_replace=to_replace, value=value, inplace=inplace, regex=regex @@ -676,7 +630,7 @@ def replace_list( bm._consolidate_inplace() return bm - def to_native_types(self, **kwargs) -> BlockManager: + def to_native_types(self: T, **kwargs) -> T: """ Convert values to native types (strings / python objects) that are used in formatting (repr / csv). @@ -720,7 +674,7 @@ def is_view(self) -> bool: return False - def get_bool_data(self, copy: bool = False) -> BlockManager: + def get_bool_data(self: T, copy: bool = False) -> T: """ Select blocks that are bool-dtype and columns from object-dtype blocks that are all-bool. @@ -745,7 +699,7 @@ def get_bool_data(self, copy: bool = False) -> BlockManager: return self._combine(new_blocks, copy) - def get_numeric_data(self, copy: bool = False) -> BlockManager: + def get_numeric_data(self: T, copy: bool = False) -> T: """ Parameters ---------- @@ -778,21 +732,6 @@ def _combine( return type(self).from_blocks(new_blocks, axes) - def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager: - assert isinstance(slobj, slice), type(slobj) - - if axis == 0: - new_blocks = self._slice_take_blocks_ax0(slobj) - elif axis == 1: - new_blocks = [blk.getitem_block_index(slobj) for blk in self.blocks] - else: - raise IndexError("Requested axis not found in manager") - - new_axes = list(self.axes) - new_axes[axis] = new_axes[axis]._getitem_slice(slobj) - - return type(self)._simple_new(tuple(new_blocks), new_axes) - @property def nblocks(self) -> int: return len(self.blocks) @@ -1006,7 +945,7 @@ def fast_xs(self, loc: int) -> ArrayLike: return result - def consolidate(self) -> BlockManager: + def consolidate(self: T) -> T: """ Join together blocks having same dtype @@ -1029,19 +968,6 @@ def _consolidate_inplace(self) -> None: self._known_consolidated = True self._rebuild_blknos_and_blklocs() - def iget(self, i: int) -> SingleBlockManager: - """ - Return the data as a SingleBlockManager. - """ - block = self.blocks[self.blknos[i]] - values = block.iget(self.blklocs[i]) - - # shortcut for select a single-dim from a 2-dim BM - bp = BlockPlacement(slice(0, len(values))) - values = maybe_coerce_values(values) - nb = type(block)(values, placement=bp, ndim=1) - return SingleBlockManager(nb, self.axes[1]) - def iget_values(self, i: int) -> ArrayLike: """ Return the data for column i as the values (ndarray or ExtensionArray). @@ -1479,21 +1405,110 @@ def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T: consolidate=False, ) - def _equal_values(self: T, other: T) -> bool: + +class BlockManager(_BlockManager): + """ + _BlockManager that holds 2D blocks. + """ + + ndim = 2 + + def __init__( + self, + blocks: Sequence[Block], + axes: Sequence[Index], + verify_integrity: bool = True, + ): + self.axes = [ensure_index(ax) for ax in axes] + self.blocks: Tuple[Block, ...] = tuple(blocks) + + for block in blocks: + if self.ndim != block.ndim: + raise AssertionError( + f"Number of Block dimensions ({block.ndim}) must equal " + f"number of axes ({self.ndim})" + ) + + if verify_integrity: + self._verify_integrity() + + # Populate known_consolidate, blknos, and blklocs lazily + self._known_consolidated = False + # error: Incompatible types in assignment (expression has type "None", + # variable has type "ndarray") + self._blknos = None # type: ignore[assignment] + # error: Incompatible types in assignment (expression has type "None", + # variable has type "ndarray") + self._blklocs = None # type: ignore[assignment] + + @classmethod + def _simple_new(cls, blocks: Tuple[Block, ...], axes: List[Index]): + """ + Fastpath constructor; does NO validation. + """ + obj = cls.__new__(cls) + obj.axes = axes + obj.blocks = blocks + + # Populate known_consolidate, blknos, and blklocs lazily + obj._known_consolidated = False + obj._blknos = None + obj._blklocs = None + return obj + + @classmethod + def from_blocks(cls, blocks: List[Block], axes: List[Index]) -> BlockManager: + """ + Constructor for BlockManager and SingleBlockManager with same signature. + """ + return cls(blocks, axes, verify_integrity=False) + + def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager: + assert isinstance(slobj, slice), type(slobj) + + if axis == 0: + new_blocks = self._slice_take_blocks_ax0(slobj) + elif axis == 1: + new_blocks = [blk.getitem_block_index(slobj) for blk in self.blocks] + else: + raise IndexError("Requested axis not found in manager") + + new_axes = list(self.axes) + new_axes[axis] = new_axes[axis]._getitem_slice(slobj) + + return type(self)._simple_new(tuple(new_blocks), new_axes) + + def iget(self, i: int) -> SingleBlockManager: + """ + Return the data as a SingleBlockManager. + """ + block = self.blocks[self.blknos[i]] + values = block.iget(self.blklocs[i]) + + # shortcut for select a single-dim from a 2-dim BM + bp = BlockPlacement(slice(0, len(values))) + values = maybe_coerce_values(values) + nb = type(block)(values, placement=bp, ndim=1) + return SingleBlockManager(nb, self.axes[1]) + + # ---------------------------------------------------------------- + # Block-wise Operation + + def operate_blockwise(self, other: BlockManager, array_op) -> BlockManager: + """ + Apply array_op blockwise with another (aligned) BlockManager. + """ + return operate_blockwise(self, other, array_op) + + def _equal_values(self: BlockManager, other: BlockManager) -> bool: """ Used in .equals defined in base class. Only check the column values assuming shape and indexes have already been checked. """ - if self.ndim == 1: - # For SingleBlockManager (i.e.Series) - if other.ndim != 1: - return False - left = self.blocks[0].values - right = other.blocks[0].values - return array_equals(left, right) - return blockwise_all(self, other, array_equals) + # ---------------------------------------------------------------- + def unstack(self, unstacker, fill_value) -> BlockManager: """ Return a BlockManager with all blocks unstacked.. @@ -1532,7 +1547,7 @@ def unstack(self, unstacker, fill_value) -> BlockManager: return bm -class SingleBlockManager(BlockManager, SingleDataManager): +class SingleBlockManager(_BlockManager, SingleDataManager): """ manage a single block with """ ndim = 1 @@ -1685,6 +1700,18 @@ def set_values(self, values: ArrayLike): self.blocks[0].values = values self.blocks[0]._mgr_locs = BlockPlacement(slice(len(values))) + def _equal_values(self: T, other: T) -> bool: + """ + Used in .equals defined in base class. Only check the column values + assuming shape and indexes have already been checked. + """ + # For SingleBlockManager (i.e.Series) + if other.ndim != 1: + return False + left = self.blocks[0].values + right = other.blocks[0].values + return array_equals(left, right) + # -------------------------------------------------------------------- # Constructor Helpers diff --git a/pandas/core/series.py b/pandas/core/series.py index 4ade9992e9e3e..0029ed0dd20f3 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1110,12 +1110,8 @@ def _set_labels(self, key, value): def _set_values(self, key, value): if isinstance(key, Series): key = key._values - # error: Incompatible types in assignment (expression has type "Union[Any, - # BlockManager]", variable has type "Union[SingleArrayManager, - # SingleBlockManager]") - self._mgr = self._mgr.setitem( # type: ignore[assignment] - indexer=key, value=value - ) + + self._mgr = self._mgr.setitem(indexer=key, value=value) self._maybe_update_cacher() def _set_value(self, label, value, takeable: bool = False): From 304615240ea010ba59b331d443d95884e9a9b294 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 25 Mar 2021 15:30:05 -0700 Subject: [PATCH 2/7] pass tuple of blocks to BlockManager --- pandas/_libs/reduction.pyx | 2 +- pandas/core/internals/concat.py | 2 +- pandas/core/internals/ops.py | 2 +- pandas/tests/internals/test_internals.py | 9 +++++---- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 9acff1cac305c..f911993d41838 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -488,6 +488,6 @@ cdef class BlockSlider: Ensure that we have the original blocks, blknos, and blklocs. """ mgr = self.dummy._mgr - mgr.blocks = self.blocks + mgr.blocks = tuple(self.blocks) mgr._blklocs = self.orig_blklocs mgr._blknos = self.orig_blknos diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index a5c1f3985e70e..0811760be4da0 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -157,7 +157,7 @@ def concatenate_managers( b = new_block(new_values, placement=placement, ndim=len(axes)) blocks.append(b) - return BlockManager(blocks, axes) + return BlockManager(tuple(blocks), axes) def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: Dict[int, np.ndarray]): diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py index be5224fe32ae1..3a65924f42123 100644 --- a/pandas/core/internals/ops.py +++ b/pandas/core/internals/ops.py @@ -78,7 +78,7 @@ def operate_blockwise( # assert len(slocs) == nlocs, (len(slocs), nlocs) # assert slocs == set(range(nlocs)), slocs - new_mgr = type(right)(res_blks, axes=right.axes, verify_integrity=False) + new_mgr = type(right)(tuple(res_blks), axes=right.axes, verify_integrity=False) return new_mgr diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index c67ef9177ca96..218de8f4c276f 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -230,9 +230,10 @@ def create_mgr(descr, item_shape=None): ) num_offset += len(placement) + sblocks = sorted(blocks, key=lambda b: b.mgr_locs[0]) return BlockManager( - sorted(blocks, key=lambda b: b.mgr_locs[0]), - [mgr_items] + [np.arange(n) for n in item_shape], + tuple(sblocks), + [mgr_items] + [Index(np.arange(n)) for n in item_shape], ) @@ -405,7 +406,7 @@ def test_iget(self): block = new_block( values=values.copy(), placement=np.arange(3), ndim=values.ndim ) - mgr = BlockManager(blocks=[block], axes=[cols, np.arange(3)]) + mgr = BlockManager(blocks=(block,), axes=[cols, Index(np.arange(3))]) tm.assert_almost_equal(mgr.iget(0).internal_values(), values[0]) tm.assert_almost_equal(mgr.iget(1).internal_values(), values[1]) @@ -812,7 +813,7 @@ def test_equals_block_order_different_dtypes(self, mgr_string): bm = create_mgr(mgr_string) block_perms = itertools.permutations(bm.blocks) for bm_perm in block_perms: - bm_this = BlockManager(bm_perm, bm.axes) + bm_this = BlockManager(tuple(bm_perm), bm.axes) assert bm.equals(bm_this) assert bm_this.equals(bm) From 7614613d8af02eca92d45cdb51a69cebf35896ca Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 25 Mar 2021 18:08:21 -0700 Subject: [PATCH 3/7] PERF: implement libinternals.BlockManager --- pandas/_libs/internals.pyx | 73 +++++++++++++++++++++++++++++++ pandas/compat/pickle_compat.py | 3 ++ pandas/core/internals/managers.py | 51 ++++----------------- 3 files changed, 84 insertions(+), 43 deletions(-) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 4295db9d1613c..55812701f4d0a 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -505,3 +505,76 @@ cdef class Block: ndim = maybe_infer_ndim(self.values, self.mgr_locs) self.ndim = ndim + + +@cython.freelist(64) +cdef class BlockManager: + cdef: + public tuple blocks + public list axes + public bint _known_consolidated, _is_consolidated + public ndarray _blknos, _blklocs + + def __cinit__(self, blocks, axes, verify_integrity=True): + if isinstance(blocks, list): + # Backward compat for e.g. pyarrow + blocks = tuple(blocks) + + self.blocks = blocks + self.axes = axes.copy() # copy to make sure we are not remotely-mutable + + # Populate known_consolidate, blknos, and blklocs lazily + self._known_consolidated = False + self._is_consolidated = False + # error: Incompatible types in assignment (expression has type "None", + # variable has type "ndarray") + self._blknos = None # type: ignore[assignment] + # error: Incompatible types in assignment (expression has type "None", + # variable has type "ndarray") + self._blklocs = None # type: ignore[assignment] + + # ------------------------------------------------------------------- + # Pickle + + cpdef __reduce__(self): + if len(self.axes) == 1: + # SingleBlockManager, __init__ expects Block, axis + args = (self.blocks[0], self.axes[0]) + else: + args = (self.blocks, self.axes) + return type(self), args + + cpdef __setstate__(self, state): + from pandas.core.construction import extract_array + from pandas.core.internals.blocks import new_block + from pandas.core.internals.managers import ensure_index + + if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]: + state = state[3]["0.14.1"] + axes = [ensure_index(ax) for ax in state["axes"]] + ndim = len(axes) + # extract_array bc older pickles may store e.g. DatetimeIndex + # instead of DatetimeArray + nbs = [ + new_block( + extract_array(blk["values"], extract_numpy=True), + blk["mgr_locs"], + ndim=ndim + ) + for blk in state["blocks"] + ] + blocks = tuple(nbs) + self.axes = axes + self.blocks = blocks + + else: + raise NotImplementedError("pre-0.14.1 pickles are no longer supported") + + self._post_setstate() + + def _post_setstate(self) -> None: + self._is_consolidated = False + self._known_consolidated = False + self._rebuild_blknos_and_blklocs() + + # ------------------------------------------------------------------- diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 25ebd3d3ddc62..22189b597f522 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -16,6 +16,7 @@ from pandas._libs.tslibs import BaseOffset from pandas import Index +from pandas.core.internals import BlockManager if TYPE_CHECKING: from pandas import ( @@ -207,6 +208,8 @@ def load_newobj(self): # compat if issubclass(cls, Index): obj = object.__new__(cls) + elif cls is BlockManager and not args: + obj = cls.__new__(cls, (), [], False) else: obj = cls.__new__(cls, *args) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index a567410684bb0..863ebc4f6a839 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -141,14 +141,7 @@ class _BlockManager(DataManager): This is *not* a public API class """ - __slots__ = [ - "axes", - "blocks", - "_known_consolidated", - "_is_consolidated", - "_blknos", - "_blklocs", - ] + __slots__ = () _blknos: np.ndarray _blklocs: np.ndarray @@ -302,7 +295,7 @@ def __getstate__(self): return axes_array, block_values, block_items, extra_state def __setstate__(self, state): - def unpickle_block(values, mgr_locs, ndim: int) -> Block: + def unpickle_block(values, mgr_locs, ndim: int): # TODO(EA2D): ndim would be unnecessary with 2D EAs # older pickles may store e.g. DatetimeIndex instead of DatetimeArray values = extract_array(values, extract_numpy=True) @@ -321,11 +314,6 @@ def unpickle_block(values, mgr_locs, ndim: int) -> Block: self._post_setstate() - def _post_setstate(self) -> None: - self._is_consolidated = False - self._known_consolidated = False - self._rebuild_blknos_and_blklocs() - def __repr__(self) -> str: output = type(self).__name__ for i, ax in enumerate(self.axes): @@ -489,7 +477,7 @@ def apply( if len(result_blocks) == 0: return self.make_empty(self.axes) - return type(self).from_blocks(result_blocks, self.axes) + return type(self).from_blocks(tuple(result_blocks), self.axes) def quantile( self: T, @@ -1129,6 +1117,8 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: block = new_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1)) + self.blknos # ensure initialized + for blkno, count in _fast_count_smallints(self.blknos[loc:]): blk = self.blocks[blkno] if count == len(blk.mgr_locs): @@ -1406,7 +1396,7 @@ def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T: ) -class BlockManager(_BlockManager): +class BlockManager(libinternals.BlockManager, _BlockManager): """ _BlockManager that holds 2D blocks. """ @@ -1419,8 +1409,7 @@ def __init__( axes: Sequence[Index], verify_integrity: bool = True, ): - self.axes = [ensure_index(ax) for ax in axes] - self.blocks: Tuple[Block, ...] = tuple(blocks) + assert all(isinstance(x, Index) for x in axes) for block in blocks: if self.ndim != block.ndim: @@ -1432,30 +1421,6 @@ def __init__( if verify_integrity: self._verify_integrity() - # Populate known_consolidate, blknos, and blklocs lazily - self._known_consolidated = False - # error: Incompatible types in assignment (expression has type "None", - # variable has type "ndarray") - self._blknos = None # type: ignore[assignment] - # error: Incompatible types in assignment (expression has type "None", - # variable has type "ndarray") - self._blklocs = None # type: ignore[assignment] - - @classmethod - def _simple_new(cls, blocks: Tuple[Block, ...], axes: List[Index]): - """ - Fastpath constructor; does NO validation. - """ - obj = cls.__new__(cls) - obj.axes = axes - obj.blocks = blocks - - # Populate known_consolidate, blknos, and blklocs lazily - obj._known_consolidated = False - obj._blknos = None - obj._blklocs = None - return obj - @classmethod def from_blocks(cls, blocks: List[Block], axes: List[Index]) -> BlockManager: """ @@ -1476,7 +1441,7 @@ def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager: new_axes = list(self.axes) new_axes[axis] = new_axes[axis]._getitem_slice(slobj) - return type(self)._simple_new(tuple(new_blocks), new_axes) + return type(self)(tuple(new_blocks), new_axes) def iget(self, i: int) -> SingleBlockManager: """ From 4cdba6b83b3af1290a49a92539c7bd26ddec95d9 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 25 Mar 2021 19:58:49 -0700 Subject: [PATCH 4/7] REF: move BM/SBM-only methods there --- pandas/core/internals/managers.py | 109 ++++++++++++++++-------------- 1 file changed, 57 insertions(+), 52 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 863ebc4f6a839..aad8845f059f2 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -275,45 +275,6 @@ def arrays(self) -> List[ArrayLike]: """ return [blk.values for blk in self.blocks] - def __getstate__(self): - block_values = [b.values for b in self.blocks] - block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks] - axes_array = list(self.axes) - - extra_state = { - "0.14.1": { - "axes": axes_array, - "blocks": [ - {"values": b.values, "mgr_locs": b.mgr_locs.indexer} - for b in self.blocks - ], - } - } - - # First three elements of the state are to maintain forward - # compatibility with 0.13.1. - return axes_array, block_values, block_items, extra_state - - def __setstate__(self, state): - def unpickle_block(values, mgr_locs, ndim: int): - # TODO(EA2D): ndim would be unnecessary with 2D EAs - # older pickles may store e.g. DatetimeIndex instead of DatetimeArray - values = extract_array(values, extract_numpy=True) - return new_block(values, placement=mgr_locs, ndim=ndim) - - if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]: - state = state[3]["0.14.1"] - self.axes = [ensure_index(ax) for ax in state["axes"]] - ndim = len(self.axes) - self.blocks = tuple( - unpickle_block(b["values"], b["mgr_locs"], ndim=ndim) - for b in state["blocks"] - ) - else: - raise NotImplementedError("pre-0.14.1 pickles are no longer supported") - - self._post_setstate() - def __repr__(self) -> str: output = type(self).__name__ for i, ax in enumerate(self.axes): @@ -326,19 +287,6 @@ def __repr__(self) -> str: output += f"\n{block}" return output - def _verify_integrity(self) -> None: - mgr_shape = self.shape - tot_items = sum(len(x.mgr_locs) for x in self.blocks) - for block in self.blocks: - if block.shape[1:] != mgr_shape[1:]: - raise construction_error(tot_items, block.shape[1:], self.axes) - if len(self.items) != tot_items: - raise AssertionError( - "Number of manager items must equal union of " - f"block items\n# manager items: {len(self.items)}, # " - f"tot_items: {tot_items}" - ) - def reduce( self: T, func: Callable, ignore_failures: bool = False ) -> Tuple[T, np.ndarray]: @@ -1403,6 +1351,9 @@ class BlockManager(libinternals.BlockManager, _BlockManager): ndim = 2 + # ------------------------------------------------------------------- + # Constructors + def __init__( self, blocks: Sequence[Block], @@ -1428,6 +1379,21 @@ def from_blocks(cls, blocks: List[Block], axes: List[Index]) -> BlockManager: """ return cls(blocks, axes, verify_integrity=False) + def _verify_integrity(self) -> None: + mgr_shape = self.shape + tot_items = sum(len(x.mgr_locs) for x in self.blocks) + for block in self.blocks: + if block.shape[1:] != mgr_shape[1:]: + raise construction_error(tot_items, block.shape[1:], self.axes) + if len(self.items) != tot_items: + raise AssertionError( + "Number of manager items must equal union of " + f"block items\n# manager items: {len(self.items)}, # " + f"tot_items: {tot_items}" + ) + + # ------------------------------------------------------------------- + def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager: assert isinstance(slobj, slice), type(slobj) @@ -1559,6 +1525,45 @@ def from_array(cls, array: ArrayLike, index: Index) -> SingleBlockManager: block = new_block(array, placement=slice(0, len(index)), ndim=1) return cls(block, index) + def __getstate__(self): + block_values = [b.values for b in self.blocks] + block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks] + axes_array = list(self.axes) + + extra_state = { + "0.14.1": { + "axes": axes_array, + "blocks": [ + {"values": b.values, "mgr_locs": b.mgr_locs.indexer} + for b in self.blocks + ], + } + } + + # First three elements of the state are to maintain forward + # compatibility with 0.13.1. + return axes_array, block_values, block_items, extra_state + + def __setstate__(self, state): + def unpickle_block(values, mgr_locs, ndim: int) -> Block: + # TODO(EA2D): ndim would be unnecessary with 2D EAs + # older pickles may store e.g. DatetimeIndex instead of DatetimeArray + values = extract_array(values, extract_numpy=True) + return new_block(values, placement=mgr_locs, ndim=ndim) + + if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]: + state = state[3]["0.14.1"] + self.axes = [ensure_index(ax) for ax in state["axes"]] + ndim = len(self.axes) + self.blocks = tuple( + unpickle_block(b["values"], b["mgr_locs"], ndim=ndim) + for b in state["blocks"] + ) + else: + raise NotImplementedError("pre-0.14.1 pickles are no longer supported") + + self._post_setstate() + def _post_setstate(self): pass From bf89843929bb7809db2cd730bb092040c50f8ff5 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 26 Mar 2021 19:48:43 -0700 Subject: [PATCH 5/7] PERF: do validaton only if verify_integrity=True --- pandas/core/internals/managers.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index aad8845f059f2..b8f26ef6c53ac 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -892,7 +892,7 @@ def consolidate(self: T) -> T: if self.is_consolidated(): return self - bm = type(self)(self.blocks, self.axes) + bm = type(self)(self.blocks, self.axes, verify_integrity=False) bm._is_consolidated = False bm._consolidate_inplace() return bm @@ -1360,16 +1360,17 @@ def __init__( axes: Sequence[Index], verify_integrity: bool = True, ): - assert all(isinstance(x, Index) for x in axes) - - for block in blocks: - if self.ndim != block.ndim: - raise AssertionError( - f"Number of Block dimensions ({block.ndim}) must equal " - f"number of axes ({self.ndim})" - ) if verify_integrity: + assert all(isinstance(x, Index) for x in axes) + + for block in blocks: + if self.ndim != block.ndim: + raise AssertionError( + f"Number of Block dimensions ({block.ndim}) must equal " + f"number of axes ({self.ndim})" + ) + self._verify_integrity() @classmethod @@ -1407,7 +1408,7 @@ def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager: new_axes = list(self.axes) new_axes[axis] = new_axes[axis]._getitem_slice(slobj) - return type(self)(tuple(new_blocks), new_axes) + return type(self)(tuple(new_blocks), new_axes, verify_integrity=False) def iget(self, i: int) -> SingleBlockManager: """ From ed55afc14d3e2df2361f91441134e7a156b3ba0b Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 1 Apr 2021 10:55:50 -0700 Subject: [PATCH 6/7] remove simple_new usage --- pandas/core/internals/managers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b8f26ef6c53ac..cad34cd4a4bcc 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -923,7 +923,7 @@ def idelete(self, indexer) -> BlockManager: nbs = self._slice_take_blocks_ax0(taker, only_slice=True) new_columns = self.items[~is_deleted] axes = [new_columns, self.axes[1]] - return type(self)._simple_new(tuple(nbs), axes) + return type(self)(tuple(nbs), axes) def iset(self, loc: Union[int, slice, np.ndarray], value): """ From 671a0aacdcd54e99363da7807e620a713711463d Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 14 Apr 2021 08:56:24 -0700 Subject: [PATCH 7/7] mypy fixup --- pandas/io/parsers/c_parser_wrapper.py | 70 ++++++++++++++++++--------- 1 file changed, 46 insertions(+), 24 deletions(-) diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index dd3790470f607..abf6128699a21 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -65,10 +65,11 @@ def __init__(self, src: FilePathOrBuffer, **kwds): else: if len(self._reader.header) > 1: # we have a multi index in the columns + # error: Cannot determine type of 'names' # error: Cannot determine type of 'index_names' # error: Cannot determine type of 'col_names' ( - self.names, + self.names, # type: ignore[has-type] self.index_names, self.col_names, passed_names, @@ -79,15 +80,21 @@ def __init__(self, src: FilePathOrBuffer, **kwds): passed_names, ) else: - self.names = list(self._reader.header[0]) + # error: Cannot determine type of 'names' + self.names = list(self._reader.header[0]) # type: ignore[has-type] - if self.names is None: + # error: Cannot determine type of 'names' + if self.names is None: # type: ignore[has-type] if self.prefix: - self.names = [ + # error: Cannot determine type of 'names' + self.names = [ # type: ignore[has-type] f"{self.prefix}{i}" for i in range(self._reader.table_width) ] else: - self.names = list(range(self._reader.table_width)) + # error: Cannot determine type of 'names' + self.names = list( # type: ignore[has-type] + range(self._reader.table_width) + ) # gh-9755 # @@ -97,7 +104,8 @@ def __init__(self, src: FilePathOrBuffer, **kwds): # # once names has been filtered, we will # then set orig_names again to names - self.orig_names = self.names[:] + # error: Cannot determine type of 'names' + self.orig_names = self.names[:] # type: ignore[has-type] if self.usecols: usecols = self._evaluate_usecols(self.usecols, self.orig_names) @@ -110,20 +118,30 @@ def __init__(self, src: FilePathOrBuffer, **kwds): ): self._validate_usecols_names(usecols, self.orig_names) - if len(self.names) > len(usecols): - self.names = [ + # error: Cannot determine type of 'names' + if len(self.names) > len(usecols): # type: ignore[has-type] + # error: Cannot determine type of 'names' + self.names = [ # type: ignore[has-type] n - for i, n in enumerate(self.names) + # error: Cannot determine type of 'names' + for i, n in enumerate(self.names) # type: ignore[has-type] if (i in usecols or n in usecols) ] - if len(self.names) < len(usecols): - self._validate_usecols_names(usecols, self.names) + # error: Cannot determine type of 'names' + if len(self.names) < len(usecols): # type: ignore[has-type] + # error: Cannot determine type of 'names' + self._validate_usecols_names( + usecols, + self.names, # type: ignore[has-type] + ) - self._validate_parse_dates_presence(self.names) + # error: Cannot determine type of 'names' + self._validate_parse_dates_presence(self.names) # type: ignore[has-type] self._set_noconvert_columns() - self.orig_names = self.names + # error: Cannot determine type of 'names' + self.orig_names = self.names # type: ignore[has-type] if not self._has_complex_date_col: # error: Cannot determine type of 'index_col' @@ -132,9 +150,15 @@ def __init__(self, src: FilePathOrBuffer, **kwds): ): self._name_processed = True - # error: Cannot determine type of 'index_col' - (index_names, self.names, self.index_col) = self._clean_index_names( - self.names, + ( + index_names, + # error: Cannot determine type of 'names' + self.names, # type: ignore[has-type] + self.index_col, + ) = self._clean_index_names( + # error: Cannot determine type of 'names' + self.names, # type: ignore[has-type] + # error: Cannot determine type of 'index_col' self.index_col, # type: ignore[has-type] self.unnamed_cols, ) @@ -165,17 +189,14 @@ def _set_noconvert_columns(self): undergo such conversions. """ assert self.orig_names is not None - # error: Item "None" of "Optional[Any]" has no attribute "__iter__" - # (not iterable) + # error: Cannot determine type of 'names' col_indices = [ - self.orig_names.index(x) for x in self.names # type: ignore[union-attr] + self.orig_names.index(x) for x in self.names # type: ignore[has-type] ] - # error: Argument 2 to "_set_noconvert_dtype_columns" of "ParserBase" - # has incompatible type "Optional[Any]"; expected - # "List[Union[int, str, Tuple[Any, ...]]]" + # error: Cannot determine type of 'names' noconvert_columns = self._set_noconvert_dtype_columns( col_indices, - self.names, # type: ignore[arg-type] + self.names, # type: ignore[has-type] ) for col in noconvert_columns: self._reader.set_noconvert(col) @@ -213,7 +234,8 @@ def read(self, nrows=None): # Done with first read, next time raise StopIteration self._first_chunk = False - names = self.names + # error: Cannot determine type of 'names' + names = self.names # type: ignore[has-type] if self._reader.leading_cols: if self._has_complex_date_col: