diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py
index 23d35b412e1ae..ea9f9abc4a4c7 100644
--- a/pandas/core/internals/__init__.py
+++ b/pandas/core/internals/__init__.py
@@ -1,3 +1,4 @@
+from pandas.core.internals.api import make_block  # pseudo-public version
 from pandas.core.internals.array_manager import (
     ArrayManager,
     SingleArrayManager,
@@ -16,7 +17,6 @@
     NumericBlock,
     ObjectBlock,
     TimeDeltaBlock,
-    make_block,
 )
 from pandas.core.internals.concat import concatenate_managers
 from pandas.core.internals.managers import (
diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py
new file mode 100644
index 0000000000000..3fbe324417c60
--- /dev/null
+++ b/pandas/core/internals/api.py
@@ -0,0 +1,86 @@
+"""
+This is a pseudo-public API for downstream libraries. We ask that downstream
+authors
+
+1) Try to avoid using internals directly altogether, and failing that,
+2) Use only functions exposed here (or in core.internals)
+
+"""
+from typing import Optional
+
+import numpy as np
+
+from pandas._typing import Dtype
+
+from pandas.core.dtypes.common import is_datetime64tz_dtype
+from pandas.core.dtypes.dtypes import PandasDtype
+from pandas.core.dtypes.generic import ABCPandasArray
+
+from pandas.core.arrays import DatetimeArray
+from pandas.core.internals.blocks import (
+    Block,
+    DatetimeTZBlock,
+    get_block_type,
+)
+
+
+def make_block(
+    values, placement, klass=None, ndim=None, dtype: Optional[Dtype] = None
+) -> Block:
+    """
+    Construct a Block; pseudo-public analogue to blocks.new_block.
+
+    We ask that downstream libraries use this rather than any fully-internal
+    APIs, including but not limited to:
+
+    - core.internals.blocks.new_block (named make_block before this change)
+    - Block.make_block
+    - Block.make_block_same_class
+    - Block.__init__
+
+    Parameters
+    ----------
+    values : array-like
+        Data for the block. A PandasArray is unwrapped to its ndarray.
+    placement : object
+        Passed through unchanged to the Block constructor.
+    klass : type, optional
+        Block subclass to instantiate. Inferred via get_block_type when
+        omitted.
+    ndim : int, optional
+        Passed through to the Block constructor. When greater than 1, an
+        unwrapped PandasArray is promoted to at least 2D.
+    dtype : Dtype, optional
+        Used for Block-type inference when klass is omitted (defaulting to
+        values.dtype) and for wrapping values when klass is DatetimeTZBlock.
+        A PandasDtype is replaced by its numpy_dtype.
+
+    Returns
+    -------
+    Block
+    """
+    if isinstance(values, ABCPandasArray):
+        # Ensure that we don't allow PandasArray / PandasDtype in internals.
+        # For now, blocks should be backed by ndarrays when possible.
+        values = values.to_numpy()
+        if ndim and ndim > 1:
+            # TODO(EA2D): special case not needed with 2D EAs
+            values = np.atleast_2d(values)
+
+    if isinstance(dtype, PandasDtype):
+        dtype = dtype.numpy_dtype
+
+    if klass is None:
+        if dtype is None:
+            # Explicit None check rather than `dtype or values.dtype`: an
+            # unstructured np.dtype is falsy (its len() is 0), so truthiness
+            # would silently discard a dtype the caller passed in.
+            dtype = values.dtype
+        klass = get_block_type(values, dtype)
+
+    elif klass is DatetimeTZBlock and not is_datetime64tz_dtype(values.dtype):
+        # TODO: This is no longer hit internally; does it need to be retained
+        # for e.g. pyarrow?
+        values = DatetimeArray._simple_new(values, dtype=dtype)
+
+    return klass(values, ndim=ndim, placement=placement)
diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py
index 0449be84bdcf7..f82cd93e5402a 100644
--- a/pandas/core/internals/array_manager.py
+++ b/pandas/core/internals/array_manager.py
@@ -82,7 +82,7 @@
     DataManager,
     SingleDataManager,
 )
-from pandas.core.internals.blocks import make_block
+from pandas.core.internals.blocks import new_block
 
 if TYPE_CHECKING:
     from pandas import Float64Index
@@ -439,9 +439,9 @@ def apply_with_block(self: T, f, align_keys=None, **kwargs) -> T:
             if self.ndim == 2:
                 if isinstance(arr, np.ndarray):
                     arr = np.atleast_2d(arr)
-                block = make_block(arr, placement=slice(0, 1, 1), ndim=2)
+                block = new_block(arr, placement=slice(0, 1, 1), ndim=2)
             else:
-                block = make_block(arr, placement=slice(0, len(self), 1), ndim=1)
+                block = new_block(arr, placement=slice(0, len(self), 1), ndim=1)
 
             applied = getattr(block, f)(**kwargs)
             if isinstance(applied, list):
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 59b5a151497ff..597023cb5b000 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -316,7 +316,7 @@ def make_block(self, values, placement=None) -> Block:
         if self.is_extension:
             values = ensure_block_shape(values, ndim=self.ndim)
 
-        return make_block(values, placement=placement, ndim=self.ndim)
+        return new_block(values, placement=placement, ndim=self.ndim)
 
     @final
     def
make_block_same_class(self, values, placement=None) -> Block: @@ -1431,7 +1431,7 @@ def _unstack(self, unstacker, fill_value, new_placement): new_values = new_values.T[mask] new_placement = new_placement[mask] - blocks = [make_block(new_values, placement=new_placement, ndim=2)] + blocks = [new_block(new_values, placement=new_placement, ndim=2)] return blocks, mask def quantile( @@ -1460,7 +1460,7 @@ def quantile( result = quantile_compat(self.values, qs, interpolation, axis) - return make_block(result, placement=self.mgr_locs, ndim=2) + return new_block(result, placement=self.mgr_locs, ndim=2) class ExtensionBlock(Block): @@ -2301,7 +2301,7 @@ def get_block_type(values, dtype: Optional[Dtype] = None): return cls -def make_block( +def new_block( values, placement, klass=None, ndim=None, dtype: Optional[Dtype] = None ) -> Block: # Ensure that we don't allow PandasArray / PandasDtype in internals. diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index a71fdff043212..0803e40a219be 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -42,7 +42,7 @@ ExtensionArray, ) from pandas.core.internals.array_manager import ArrayManager -from pandas.core.internals.blocks import make_block +from pandas.core.internals.blocks import new_block from pandas.core.internals.managers import BlockManager if TYPE_CHECKING: @@ -144,10 +144,10 @@ def concatenate_managers( # Fast-path b = blk.make_block_same_class(values, placement=placement) else: - b = make_block(values, placement=placement, ndim=blk.ndim) + b = new_block(values, placement=placement, ndim=blk.ndim) else: new_values = _concatenate_join_units(join_units, concat_axis, copy=copy) - b = make_block(new_values, placement=placement, ndim=len(axes)) + b = new_block(new_values, placement=placement, ndim=len(axes)) blocks.append(b) return BlockManager(blocks, axes) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 
8774c11859fcd..c314673f609f3 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -75,7 +75,7 @@ from pandas.core.internals.array_manager import ArrayManager from pandas.core.internals.blocks import ( ensure_block_shape, - make_block, + new_block, ) from pandas.core.internals.managers import ( BlockManager, @@ -300,7 +300,7 @@ def ndarray_to_mgr( # TODO: What about re-joining object columns? dvals_list = [maybe_squeeze_dt64tz(x) for x in dvals_list] block_values = [ - make_block(dvals_list[n], placement=[n], ndim=2) + new_block(dvals_list[n], placement=n, ndim=2) for n in range(len(dvals_list)) ] diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index dcd58c703619f..aebaef27b484d 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -74,7 +74,7 @@ ensure_block_shape, extend_blocks, get_block_type, - make_block, + new_block, ) from pandas.core.internals.ops import ( blockwise_all, @@ -322,7 +322,7 @@ def unpickle_block(values, mgr_locs, ndim: int): # TODO(EA2D): ndim would be unnecessary with 2D EAs # older pickles may store e.g. DatetimeIndex instead of DatetimeArray values = extract_array(values, extract_numpy=True) - return make_block(values, placement=mgr_locs, ndim=ndim) + return new_block(values, placement=mgr_locs, ndim=ndim) if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]: state = state[3]["0.14.1"] @@ -1148,7 +1148,7 @@ def value_getitem(placement): # one item. 
# TODO(EA2D): special casing unnecessary with 2D EAs new_blocks.extend( - make_block( + new_block( values=value, ndim=self.ndim, placement=slice(mgr_loc, mgr_loc + 1), @@ -1164,7 +1164,7 @@ def value_getitem(placement): unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:]) new_blocks.append( - make_block( + new_block( values=value_getitem(unfit_val_items), ndim=self.ndim, placement=unfit_mgr_locs, @@ -1209,7 +1209,7 @@ def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False value = ensure_block_shape(value, ndim=2) # TODO: type value as ArrayLike - block = make_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1)) + block = new_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1)) for blkno, count in _fast_count_smallints(self.blknos[loc:]): blk = self.blocks[blkno] @@ -1436,7 +1436,7 @@ def _make_na_block(self, placement, fill_value=None): dtype, fill_value = infer_dtype_from_scalar(fill_value) block_values = np.empty(block_shape, dtype=dtype) block_values.fill(fill_value) - return make_block(block_values, placement=placement, ndim=block_values.ndim) + return new_block(block_values, placement=placement, ndim=block_values.ndim) def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True): """ @@ -1562,7 +1562,7 @@ def from_array(cls, array: ArrayLike, index: Index) -> SingleBlockManager: """ Constructor for if we have an array that is not yet a Block. """ - block = make_block(array, placement=slice(0, len(index)), ndim=1) + block = new_block(array, placement=slice(0, len(index)), ndim=1) return cls(block, index) def _post_setstate(self): @@ -1669,7 +1669,7 @@ def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager: # is basically "all items", but if there're many, don't bother # converting, it's an error anyway. 
blocks = [ - make_block( + new_block( values=blocks[0], placement=slice(0, len(axes[0])), ndim=2 ) ] @@ -1780,7 +1780,7 @@ def _form_blocks( if len(items_dict["DatetimeTZBlock"]): dttz_blocks = [ - make_block(array, klass=DatetimeTZBlock, placement=i, ndim=2) + new_block(array, klass=DatetimeTZBlock, placement=i, ndim=2) for i, array in items_dict["DatetimeTZBlock"] ] blocks.extend(dttz_blocks) @@ -1791,14 +1791,14 @@ def _form_blocks( if len(items_dict["CategoricalBlock"]) > 0: cat_blocks = [ - make_block(array, klass=CategoricalBlock, placement=i, ndim=2) + new_block(array, klass=CategoricalBlock, placement=i, ndim=2) for i, array in items_dict["CategoricalBlock"] ] blocks.extend(cat_blocks) if len(items_dict["ExtensionBlock"]): external_blocks = [ - make_block(array, klass=ExtensionBlock, placement=i, ndim=2) + new_block(array, klass=ExtensionBlock, placement=i, ndim=2) for i, array in items_dict["ExtensionBlock"] ] @@ -1806,7 +1806,7 @@ def _form_blocks( if len(items_dict["ObjectValuesExtensionBlock"]): external_blocks = [ - make_block(array, klass=ObjectValuesExtensionBlock, placement=i, ndim=2) + new_block(array, klass=ObjectValuesExtensionBlock, placement=i, ndim=2) for i, array in items_dict["ObjectValuesExtensionBlock"] ] @@ -1819,7 +1819,7 @@ def _form_blocks( block_values = np.empty(shape, dtype=object) block_values.fill(np.nan) - na_block = make_block(block_values, placement=extra_locs, ndim=2) + na_block = new_block(block_values, placement=extra_locs, ndim=2) blocks.append(na_block) return blocks @@ -1836,7 +1836,7 @@ def _simple_blockify(tuples, dtype) -> List[Block]: if dtype is not None and values.dtype != dtype: # pragma: no cover values = values.astype(dtype) - block = make_block(values, placement=placement, ndim=2) + block = new_block(values, placement=placement, ndim=2) return [block] @@ -1850,7 +1850,7 @@ def _multi_blockify(tuples, dtype: Optional[Dtype] = None): values, placement = _stack_arrays(list(tup_block), dtype) - block = 
make_block(values, placement=placement, ndim=2)
+        block = new_block(values, placement=placement, ndim=2)
         new_blocks.append(block)
 
     return new_blocks
@@ -1928,7 +1928,7 @@ def _merge_blocks(
         new_values = new_values[argsort]
         new_mgr_locs = new_mgr_locs[argsort]
 
-        return [make_block(new_values, placement=new_mgr_locs, ndim=2)]
+        return [new_block(new_values, placement=new_mgr_locs, ndim=2)]
 
     # can't consolidate --> no merge
     return blocks
diff --git a/pandas/tests/internals/test_api.py b/pandas/tests/internals/test_api.py
new file mode 100644
index 0000000000000..d4630b20db85f
--- /dev/null
+++ b/pandas/tests/internals/test_api.py
@@ -0,0 +1,52 @@
+"""
+Tests for the pseudo-public API implemented in internals/api.py and exposed
+in core.internals
+"""
+
+from pandas.core import internals
+from pandas.core.internals import api
+
+
+def test_internals_api():
+    """The re-exported make_block is the very object defined in api."""
+    assert api.make_block is internals.make_block
+
+
+def test_namespace():
+    """Pin the exact set of non-dunder names visible on core.internals."""
+    # SUBJECT TO CHANGE
+
+    submodules = {
+        "blocks",
+        "concat",
+        "managers",
+        "construction",
+        "array_manager",
+        "base",
+        "api",
+        "ops",
+    }
+    public_names = {
+        "Block",
+        "CategoricalBlock",
+        "NumericBlock",
+        "DatetimeBlock",
+        "DatetimeTZBlock",
+        "ExtensionBlock",
+        "FloatBlock",
+        "ObjectBlock",
+        "TimeDeltaBlock",
+        "make_block",
+        "DataManager",
+        "ArrayManager",
+        "BlockManager",
+        "SingleDataManager",
+        "SingleBlockManager",
+        "SingleArrayManager",
+        "concatenate_managers",
+        "create_block_manager_from_arrays",
+        "create_block_manager_from_blocks",
+    }
+
+    visible = {name for name in dir(internals) if not name.startswith("__")}
+    assert visible == submodules | public_names
diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py
index 54130bb075666..683006d9b3b9c 100644
--- a/pandas/tests/internals/test_internals.py
+++ b/pandas/tests/internals/test_internals.py
@@ -36,6 +36,15 @@
     SingleBlockManager,
     make_block,
 )
+from pandas.core.internals.blocks import new_block
+
+
+@pytest.fixture(params=[new_block, make_block]) +def block_maker(request): + """ + Fixture to test both the internal new_block and pseudo-public make_block. + """ + return request.param @pytest.fixture @@ -65,7 +74,7 @@ def get_numeric_mat(shape): N = 10 -def create_block(typestr, placement, item_shape=None, num_offset=0): +def create_block(typestr, placement, item_shape=None, num_offset=0, maker=new_block): """ Supported typestr: @@ -147,7 +156,7 @@ def create_block(typestr, placement, item_shape=None, num_offset=0): else: raise ValueError(f'Unsupported typestr: "{typestr}"') - return make_block(values, placement=placement, ndim=len(shape)) + return maker(values, placement=placement, ndim=len(shape)) def create_single_mgr(typestr, num_rows=None): @@ -290,7 +299,7 @@ def test_delete(self): def test_split(self): # GH#37799 values = np.random.randn(3, 4) - blk = make_block(values, placement=[3, 1, 6], ndim=2) + blk = new_block(values, placement=[3, 1, 6], ndim=2) result = blk._split() # check that we get views, not copies @@ -299,9 +308,9 @@ def test_split(self): assert len(result) == 3 expected = [ - make_block(values[[0]], placement=[3], ndim=2), - make_block(values[[1]], placement=[1], ndim=2), - make_block(values[[2]], placement=[6], ndim=2), + new_block(values[[0]], placement=[3], ndim=2), + new_block(values[[1]], placement=[1], ndim=2), + new_block(values[[2]], placement=[6], ndim=2), ] for res, exp in zip(result, expected): assert_block_equal(res, exp) @@ -365,7 +374,7 @@ def test_categorical_block_pickle(self): def test_iget(self): cols = Index(list("abc")) values = np.random.rand(3, 3) - block = make_block( + block = new_block( values=values.copy(), placement=np.arange(3), ndim=values.ndim ) mgr = BlockManager(blocks=[block], axes=[cols, np.arange(3)]) @@ -1149,7 +1158,7 @@ def test_datetime_block_can_hold_element(self): def test_interval_can_hold_element_emptylist(self, dtype, element): arr = np.array([1, 3, 4], dtype=dtype) ii = 
IntervalIndex.from_breaks(arr) - blk = make_block(ii._data, [1], ndim=2) + blk = new_block(ii._data, [1], ndim=2) assert blk._can_hold_element([]) # TODO: check this holds for all blocks @@ -1158,7 +1167,7 @@ def test_interval_can_hold_element_emptylist(self, dtype, element): def test_interval_can_hold_element(self, dtype, element): arr = np.array([1, 3, 4, 9], dtype=dtype) ii = IntervalIndex.from_breaks(arr) - blk = make_block(ii._data, [1], ndim=2) + blk = new_block(ii._data, [1], ndim=2) elem = element(ii) self.check_series_setitem(elem, ii, True) @@ -1183,7 +1192,7 @@ def test_interval_can_hold_element(self, dtype, element): def test_period_can_hold_element_emptylist(self): pi = period_range("2016", periods=3, freq="A") - blk = make_block(pi._data, [1], ndim=2) + blk = new_block(pi._data, [1], ndim=2) assert blk._can_hold_element([]) @@ -1278,18 +1287,18 @@ def test_should_store_categorical(self): ("sparse", SparseArray), ], ) -def test_holder(typestr, holder): - blk = create_block(typestr, [1]) +def test_holder(typestr, holder, block_maker): + blk = create_block(typestr, [1], maker=block_maker) assert blk._holder is holder -def test_validate_ndim(): +def test_validate_ndim(block_maker): values = np.array([1.0, 2.0]) placement = slice(2) msg = r"Wrong number of dimensions. 
values.ndim != ndim \[1 != 2\]" with pytest.raises(ValueError, match=msg): - make_block(values, placement, ndim=2) + block_maker(values, placement, ndim=2) def test_block_shape(): @@ -1300,22 +1309,24 @@ def test_block_shape(): assert a._mgr.blocks[0].mgr_locs.indexer == b._mgr.blocks[0].mgr_locs.indexer -def test_make_block_no_pandas_array(): +def test_make_block_no_pandas_array(block_maker): # https://github.com/pandas-dev/pandas/pull/24866 arr = pd.arrays.PandasArray(np.array([1, 2])) # PandasArray, no dtype - result = make_block(arr, slice(len(arr)), ndim=arr.ndim) + result = block_maker(arr, slice(len(arr)), ndim=arr.ndim) assert result.dtype.kind in ["i", "u"] assert result.is_extension is False # PandasArray, PandasDtype - result = make_block(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim) + result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim) assert result.dtype.kind in ["i", "u"] assert result.is_extension is False # ndarray, PandasDtype - result = make_block(arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim) + result = block_maker( + arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim + ) assert result.dtype.kind in ["i", "u"] assert result.is_extension is False