Skip to content

Commit

Permalink
down to 322 fails
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel committed Sep 16, 2022
1 parent 7f73a89 commit e8ce5e8
Show file tree
Hide file tree
Showing 7 changed files with 58 additions and 34 deletions.
13 changes: 4 additions & 9 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4760,7 +4760,8 @@ def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool:
and not is_bool_dtype(dtype)
)

def predicate(dtype: DtypeObj) -> bool:
def predicate(arr: ArrayLike) -> bool:
dtype = arr.dtype
if include:
if not dtype_predicate(dtype, include):
return False
Expand All @@ -4771,14 +4772,8 @@ def predicate(dtype: DtypeObj) -> bool:

return True

def arr_predicate(arr: ArrayLike) -> bool:
dtype = arr.dtype
return predicate(dtype)

mgr, taker = self._mgr._get_data_subset(arr_predicate).copy(deep=None)
# FIXME: get axes without mgr.axes
# FIXME: return taker from _get_data_subset, this is really slow
#taker = self.dtypes.apply(predicate).values.nonzero()[0]
mgr, taker = self._mgr._get_data_subset(predicate)
mgr = mgr.copy(deep=None)
columns = self.columns.take(taker)
return type(self)(mgr, columns=columns, index=self.index).__finalize__(self)

Expand Down
13 changes: 9 additions & 4 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -840,7 +840,7 @@ def _validate_set_axis(self, axis: int, new_labels: Index) -> None:
old_len = self.shape[axis]
new_len = len(new_labels)

if axis == 1 and len(self.columns) == 0:
if self.ndim > 1 and axis == 0 and len(self.columns) == 0:
# If we are setting the index on a DataFrame with no columns,
# it is OK to change the length.
pass
Expand Down Expand Up @@ -3933,6 +3933,14 @@ def _take(
convert_indices=convert_indices,
)

# We have 6 tests that get here with a slice; TODO: maybe avoid?
# TODO: de-duplicate with similar inside BlockManager.take
indices = (
np.arange(indices.start, indices.stop, indices.step, dtype=np.intp)
if isinstance(indices, slice)
else np.asanyarray(indices, dtype=np.intp) # <- converts some cases with empty float64
)

axes_dict = self._construct_axes_dict()
if convert_indices and isinstance(indices, np.ndarray):
# i.e. exclude slice, which in principle shouldn't be in a _take
Expand Down Expand Up @@ -5490,9 +5498,6 @@ def _reindex_with_indexers(
if copy and new_data is self._mgr:
new_data = new_data.copy()

# FIXME: get axes without mgr.axes
#axes_dict = self._get_axes_from_mgr(new_data)

return self._constructor(new_data, **axes_dict).__finalize__(self)

def filter(
Expand Down
19 changes: 16 additions & 3 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ def _wrap_agged_manager(self, mgr: Manager) -> Series:
else:
mgr = cast(Manager2D, mgr)
single = mgr.iget(0)
#breakpoint()
# FIXME: get axes without mgr.axes
index = single.axes[0]
ser = self.obj._constructor(single, index=index, name=self.obj.name)
Expand Down Expand Up @@ -1329,14 +1330,26 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike:

# We could use `mgr.apply` here and not have to set_axis, but
# we would have to do shape gymnastics for ArrayManager compat
res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=True)
res_mgr, taker = mgr.grouped_reduce(arr_func, ignore_failures=True)
res_mgr.set_axis(1, mgr.axes[1])

if len(res_mgr) < orig_mgr_len:
warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only)

# FIXME: get axes without mgr.axes
res_df = self.obj._constructor(res_mgr, index=res_mgr.axes[1], columns=res_mgr.axes[0])
columns = mgr.axes[0]
index = res_mgr.axes[1] # FIXME: get index without res_mgr.axes
if self.axis == 0:

pass#index = self._obj_with_exclusions.index
#columns = columns[taker]
#breakpoint()
else:
#columns = self._obj_with_exclusions.index
pass#index = self._obj_with_exclusions.columns
#breakpoint()

columns = columns[taker]
res_df = self.obj._constructor(res_mgr, index=index, columns=columns)
if self.axis == 1:
res_df = res_df.T
return res_df
Expand Down
18 changes: 12 additions & 6 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1780,7 +1780,7 @@ def array_func(values: ArrayLike) -> ArrayLike:

# TypeError -> we may have an exception in trying to aggregate
# continue and exclude the block
new_mgr = data.grouped_reduce(array_func, ignore_failures=ignore_failures)
new_mgr, taker = data.grouped_reduce(array_func, ignore_failures=ignore_failures)

if not is_ser and len(new_mgr) < orig_len:
warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only)
Expand Down Expand Up @@ -2055,7 +2055,7 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike:
return counted[0]
return counted

new_mgr = data.grouped_reduce(hfunc)
new_mgr, taker = data.grouped_reduce(hfunc)

# If we are grouping on categoricals we want unobserved categories to
# return zero, rather than the default of NaN which the reindexing in
Expand Down Expand Up @@ -3374,7 +3374,7 @@ def blk_func(values: ArrayLike) -> ArrayLike:
mgr = self._get_data_to_aggregate()
data = mgr.get_numeric_data()[0] if numeric_only_bool else mgr
ignore_failures = numeric_only_bool
res_mgr = data.grouped_reduce(blk_func, ignore_failures=ignore_failures)
res_mgr, taker = data.grouped_reduce(blk_func, ignore_failures=ignore_failures)

if (
numeric_only is lib.no_default
Expand All @@ -3401,6 +3401,7 @@ def blk_func(values: ArrayLike) -> ArrayLike:
axes_dict["index"] = res_mgr.axes[-1]
if res_mgr.ndim == 2:
axes_dict["columns"] = res_mgr.axes[0]
#breakpoint()
res = obj._constructor(res_mgr, **axes_dict)

if orig_scalar:
Expand Down Expand Up @@ -3693,7 +3694,7 @@ def cummin(self, axis=0, numeric_only=False, **kwargs) -> NDFrameT:
skipna = kwargs.get("skipna", True)
if axis != 0:
f = lambda x: np.minimum.accumulate(x, axis)
numeric_only_bool = self._resolve_numeric_only("cummax", numeric_only, axis)
numeric_only_bool = self._resolve_numeric_only("cummax", numeric_only, axis) # TODO: "cummin"?
obj = self._selected_obj
if numeric_only_bool:
obj = obj._get_numeric_data()
Expand Down Expand Up @@ -3853,7 +3854,7 @@ def blk_func(values: ArrayLike) -> ArrayLike:
if numeric_only_bool:
mgr = mgr.get_numeric_data()[0]

res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True)
res_mgr, taker = mgr.grouped_reduce(blk_func, ignore_failures=True)

if not is_ser and len(res_mgr.items) != orig_mgr_len:
howstr = how.replace("group_", "")
Expand All @@ -3871,7 +3872,12 @@ def blk_func(values: ArrayLike) -> ArrayLike:
out = self._wrap_agged_manager(res_mgr)
else:
# FIXME: get axes without mgr.axes
out = obj._constructor(res_mgr, index=res_mgr.axes[1], columns=res_mgr.axes[0])
if self.axis == 0 and not numeric_only_bool:
columns = self._obj_with_exclusions.columns[taker]
else:
#breakpoint()
columns = res_mgr.axes[0]
out = obj._constructor(res_mgr, index=res_mgr.axes[1], columns=columns)

return self._wrap_aggregated_output(out)

Expand Down
18 changes: 10 additions & 8 deletions pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,7 +464,7 @@ def is_view(self) -> bool:
def is_single_block(self) -> bool:
return len(self.arrays) == 1

def _get_data_subset(self: T, predicate: Callable) -> T:
def _get_data_subset(self: T, predicate: Callable) -> tuple[T, npt.NDArray[np.intp]]:
indices = [i for i, arr in enumerate(self.arrays) if predicate(arr)]
arrays = [self.arrays[i] for i in indices]
# TODO copy?
Expand All @@ -473,9 +473,9 @@ def _get_data_subset(self: T, predicate: Callable) -> T:
taker = np.array(indices, dtype="intp")
new_cols = self._axes[1].take(taker)
new_axes = [self._axes[0], new_cols]
return type(self)(arrays, new_axes, verify_integrity=False)
return type(self)(arrays, new_axes, verify_integrity=False), taker

def get_bool_data(self: T, copy: bool = False) -> T:
def get_bool_data(self: T, copy: bool = False) -> tuple[T, npt.NDArray[np.intp]]:
"""
Select columns that are bool-dtype and object-dtype columns that are all-bool.
Expand All @@ -485,9 +485,8 @@ def get_bool_data(self: T, copy: bool = False) -> T:
Whether to copy the blocks
"""
return self._get_data_subset(is_inferred_bool_dtype)
# FIXME: return indexer

def get_numeric_data(self: T, copy: bool = False) -> T:
def get_numeric_data(self: T, copy: bool = False) -> tuple[T, npt.NDArray[np.intp]]:
"""
Select columns that have a numeric dtype.
Expand Down Expand Up @@ -935,7 +934,7 @@ def idelete(self, indexer) -> ArrayManager:
# --------------------------------------------------------------------
# Array-wise Operation

def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T:
def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> tuple[T, npt.NDArray[np.intp]]:
"""
Apply grouped reduction function columnwise, returning a new ArrayManager.
Expand All @@ -948,6 +947,7 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T:
Returns
-------
ArrayManager
np.ndarray[intp]
"""
result_arrays: list[np.ndarray] = []
result_indices: list[int] = []
Expand Down Expand Up @@ -975,14 +975,16 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T:
else:
index = Index(range(result_arrays[0].shape[0]))

taker = None
if ignore_failures:
columns = self.items[np.array(result_indices, dtype="int64")]
taker = np.array(result_indices, dtype=np.intp)
columns = self.items[taker]
else:
columns = self.items

# error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]";
# expected "List[Union[ndarray, ExtensionArray]]"
return type(self)(result_arrays, [index, columns]) # type: ignore[arg-type]
return type(self)(result_arrays, [index, columns]), taker # type: ignore[arg-type]

def reduce(
self: T, func: Callable, ignore_failures: bool = False
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/internals/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ def grouped_reduce(self, func, ignore_failures: bool = False):
index = default_index(len(res))

mgr = type(self).from_array(res, index)
return mgr
return mgr, np.arange(len(res), dtype=np.intp) # TODO: is taker meaningful here?

@classmethod
def from_array(cls, arr: ArrayLike, index: Index):
Expand Down
9 changes: 6 additions & 3 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1475,7 +1475,7 @@ def idelete(self, indexer) -> BlockManager:
# ----------------------------------------------------------------
# Block-wise Operation

def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T:
def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> tuple[T, npt.NDArray[np.intp]]:
"""
Apply grouped reduction function blockwise, returning a new BlockManager.
Expand All @@ -1488,6 +1488,7 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T:
Returns
-------
BlockManager
np.ndarray[intp]
"""
result_blocks: list[Block] = []
dropped_any = False
Expand Down Expand Up @@ -1522,9 +1523,10 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T:

if dropped_any:
# faster to skip _combine if we haven't dropped any blocks
return self._combine(result_blocks, copy=False, index=index)[0]
return self._combine(result_blocks, copy=False, index=index)

return type(self).from_blocks(result_blocks, [self.axes[0], index])
taker = np.arange(len(self), dtype=np.intp)
return type(self).from_blocks(result_blocks, [self.axes[0], index]), taker

def reduce(
self: T, func: Callable, ignore_failures: bool = False
Expand Down Expand Up @@ -2055,6 +2057,7 @@ def array_values(self):

def get_numeric_data(self, copy: bool = False):
    """
    Return the numeric subset of this single-block manager.

    Parameters
    ----------
    copy : bool, default False
        Whether to copy the manager when its block is numeric.

    Returns
    -------
    manager
        ``self`` (optionally copied) when the sole block is numeric,
        otherwise an empty manager of the same type.
    taker : np.ndarray[intp]
        Positional indexer of the retained columns: all positions when
        the block is numeric, empty otherwise.
    """
    if self._block.is_numeric:
        # Single numeric block -> every column is kept.
        taker = np.arange(len(self.items), dtype=np.intp)
        return self.copy(deep=copy), taker
    # Nothing numeric: empty manager paired with an empty taker.
    taker = np.array([], dtype=np.intp)
    return self.make_empty(), taker
Expand Down

0 comments on commit e8ce5e8

Please sign in to comment.