Skip to content

Commit

Permalink
down to 322 fails
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel committed Sep 16, 2022
1 parent 7f73a89 commit e8ce5e8
Show file tree
Hide file tree
Showing 7 changed files with 58 additions and 34 deletions.
13 changes: 4 additions & 9 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4760,7 +4760,8 @@ def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool:
and not is_bool_dtype(dtype)
)

def predicate(dtype: DtypeObj) -> bool:
def predicate(arr: ArrayLike) -> bool:
dtype = arr.dtype
if include:
if not dtype_predicate(dtype, include):
return False
Expand All @@ -4771,14 +4772,8 @@ def predicate(dtype: DtypeObj) -> bool:

return True

def arr_predicate(arr: ArrayLike) -> bool:
dtype = arr.dtype
return predicate(dtype)

mgr, taker = self._mgr._get_data_subset(arr_predicate).copy(deep=None)
# FIXME: get axes without mgr.axes
# FIXME: return taker from _get_data_subset, this is really slow
#taker = self.dtypes.apply(predicate).values.nonzero()[0]
mgr, taker = self._mgr._get_data_subset(predicate)
mgr = mgr.copy(deep=None)
columns = self.columns.take(taker)
return type(self)(mgr, columns=columns, index=self.index).__finalize__(self)

Expand Down
13 changes: 9 additions & 4 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -840,7 +840,7 @@ def _validate_set_axis(self, axis: int, new_labels: Index) -> None:
old_len = self.shape[axis]
new_len = len(new_labels)

if axis == 1 and len(self.columns) == 0:
if self.ndim > 1 and axis == 0 and len(self.columns) == 0:
# If we are setting the index on a DataFrame with no columns,
# it is OK to change the length.
pass
Expand Down Expand Up @@ -3933,6 +3933,14 @@ def _take(
convert_indices=convert_indices,
)

# We have 6 tests that get here with a slice; TODO: maybe avoid?
# TODO: de-duplicate with similar inside BlockManager.take
indices = (
np.arange(indices.start, indices.stop, indices.step, dtype=np.intp)
if isinstance(indices, slice)
else np.asanyarray(indices, dtype=np.intp) # <- converts some cases with empty float64
)

axes_dict = self._construct_axes_dict()
if convert_indices and isinstance(indices, np.ndarray):
# i.e. exclude slice, which in principle shouldn't be in a _take
Expand Down Expand Up @@ -5490,9 +5498,6 @@ def _reindex_with_indexers(
if copy and new_data is self._mgr:
new_data = new_data.copy()

# FIXME: get axes without mgr.axes
#axes_dict = self._get_axes_from_mgr(new_data)

return self._constructor(new_data, **axes_dict).__finalize__(self)

def filter(
Expand Down
19 changes: 16 additions & 3 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ def _wrap_agged_manager(self, mgr: Manager) -> Series:
else:
mgr = cast(Manager2D, mgr)
single = mgr.iget(0)
#breakpoint()
# FIXME: get axes without mgr.axes
index = single.axes[0]
ser = self.obj._constructor(single, index=index, name=self.obj.name)
Expand Down Expand Up @@ -1329,14 +1330,26 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike:

# We could use `mgr.apply` here and not have to set_axis, but
# we would have to do shape gymnastics for ArrayManager compat
res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=True)
res_mgr, taker = mgr.grouped_reduce(arr_func, ignore_failures=True)
res_mgr.set_axis(1, mgr.axes[1])

if len(res_mgr) < orig_mgr_len:
warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only)

# FIXME: get axes without mgr.axes
res_df = self.obj._constructor(res_mgr, index=res_mgr.axes[1], columns=res_mgr.axes[0])
columns = mgr.axes[0]
index = res_mgr.axes[1] # FIXME: get index without res_mgr.axes
if self.axis == 0:

pass#index = self._obj_with_exclusions.index
#columns = columns[taker]
#breakpoint()
else:
#columns = self._obj_with_exclusions.index
pass#index = self._obj_with_exclusions.columns
#breakpoint()

columns = columns[taker]
res_df = self.obj._constructor(res_mgr, index=index, columns=columns)
if self.axis == 1:
res_df = res_df.T
return res_df
Expand Down
18 changes: 12 additions & 6 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1780,7 +1780,7 @@ def array_func(values: ArrayLike) -> ArrayLike:

# TypeError -> we may have an exception in trying to aggregate
# continue and exclude the block
new_mgr = data.grouped_reduce(array_func, ignore_failures=ignore_failures)
new_mgr, taker = data.grouped_reduce(array_func, ignore_failures=ignore_failures)

if not is_ser and len(new_mgr) < orig_len:
warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only)
Expand Down Expand Up @@ -2055,7 +2055,7 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike:
return counted[0]
return counted

new_mgr = data.grouped_reduce(hfunc)
new_mgr, taker = data.grouped_reduce(hfunc)

# If we are grouping on categoricals we want unobserved categories to
# return zero, rather than the default of NaN which the reindexing in
Expand Down Expand Up @@ -3374,7 +3374,7 @@ def blk_func(values: ArrayLike) -> ArrayLike:
mgr = self._get_data_to_aggregate()
data = mgr.get_numeric_data()[0] if numeric_only_bool else mgr
ignore_failures = numeric_only_bool
res_mgr = data.grouped_reduce(blk_func, ignore_failures=ignore_failures)
res_mgr, taker = data.grouped_reduce(blk_func, ignore_failures=ignore_failures)

if (
numeric_only is lib.no_default
Expand All @@ -3401,6 +3401,7 @@ def blk_func(values: ArrayLike) -> ArrayLike:
axes_dict["index"] = res_mgr.axes[-1]
if res_mgr.ndim == 2:
axes_dict["columns"] = res_mgr.axes[0]
#breakpoint()
res = obj._constructor(res_mgr, **axes_dict)

if orig_scalar:
Expand Down Expand Up @@ -3693,7 +3694,7 @@ def cummin(self, axis=0, numeric_only=False, **kwargs) -> NDFrameT:
skipna = kwargs.get("skipna", True)
if axis != 0:
f = lambda x: np.minimum.accumulate(x, axis)
numeric_only_bool = self._resolve_numeric_only("cummax", numeric_only, axis)
numeric_only_bool = self._resolve_numeric_only("cummax", numeric_only, axis) # TODO: "cummin"?
obj = self._selected_obj
if numeric_only_bool:
obj = obj._get_numeric_data()
Expand Down Expand Up @@ -3853,7 +3854,7 @@ def blk_func(values: ArrayLike) -> ArrayLike:
if numeric_only_bool:
mgr = mgr.get_numeric_data()[0]

res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True)
res_mgr, taker = mgr.grouped_reduce(blk_func, ignore_failures=True)

if not is_ser and len(res_mgr.items) != orig_mgr_len:
howstr = how.replace("group_", "")
Expand All @@ -3871,7 +3872,12 @@ def blk_func(values: ArrayLike) -> ArrayLike:
out = self._wrap_agged_manager(res_mgr)
else:
# FIXME: get axes without mgr.axes
out = obj._constructor(res_mgr, index=res_mgr.axes[1], columns=res_mgr.axes[0])
if self.axis == 0 and not numeric_only_bool:
columns = self._obj_with_exclusions.columns[taker]
else:
#breakpoint()
columns = res_mgr.axes[0]
out = obj._constructor(res_mgr, index=res_mgr.axes[1], columns=columns)

return self._wrap_aggregated_output(out)

Expand Down
18 changes: 10 additions & 8 deletions pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,7 +464,7 @@ def is_view(self) -> bool:
def is_single_block(self) -> bool:
return len(self.arrays) == 1

def _get_data_subset(self: T, predicate: Callable) -> T:
def _get_data_subset(self: T, predicate: Callable) -> tuple[T, npt.NDArray[np.intp]]:
indices = [i for i, arr in enumerate(self.arrays) if predicate(arr)]
arrays = [self.arrays[i] for i in indices]
# TODO copy?
Expand All @@ -473,9 +473,9 @@ def _get_data_subset(self: T, predicate: Callable) -> T:
taker = np.array(indices, dtype="intp")
new_cols = self._axes[1].take(taker)
new_axes = [self._axes[0], new_cols]
return type(self)(arrays, new_axes, verify_integrity=False)
return type(self)(arrays, new_axes, verify_integrity=False), taker

def get_bool_data(self: T, copy: bool = False) -> T:
def get_bool_data(self: T, copy: bool = False) -> tuple[T, npt.NDArray[np.intp]]:
"""
Select columns that are bool-dtype and object-dtype columns that are all-bool.
Expand All @@ -485,9 +485,8 @@ def get_bool_data(self: T, copy: bool = False) -> T:
Whether to copy the blocks
"""
return self._get_data_subset(is_inferred_bool_dtype)
# FIXME: return indexer

def get_numeric_data(self: T, copy: bool = False) -> T:
def get_numeric_data(self: T, copy: bool = False) -> tuple[T, npt.NDArray[np.intp]]:
"""
Select columns that have a numeric dtype.
Expand Down Expand Up @@ -935,7 +934,7 @@ def idelete(self, indexer) -> ArrayManager:
# --------------------------------------------------------------------
# Array-wise Operation

def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T:
def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> tuple[T, npt.NDArray[np.intp]]:
"""
Apply grouped reduction function columnwise, returning a new ArrayManager.
Expand All @@ -948,6 +947,7 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T:
Returns
-------
ArrayManager
np.ndarray[intp]
"""
result_arrays: list[np.ndarray] = []
result_indices: list[int] = []
Expand Down Expand Up @@ -975,14 +975,16 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T:
else:
index = Index(range(result_arrays[0].shape[0]))

taker = None
if ignore_failures:
columns = self.items[np.array(result_indices, dtype="int64")]
taker = np.array(result_indices, dtype=np.intp)
columns = self.items[taker]
else:
columns = self.items

# error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]";
# expected "List[Union[ndarray, ExtensionArray]]"
return type(self)(result_arrays, [index, columns]) # type: ignore[arg-type]
return type(self)(result_arrays, [index, columns]), taker # type: ignore[arg-type]

def reduce(
self: T, func: Callable, ignore_failures: bool = False
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/internals/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ def grouped_reduce(self, func, ignore_failures: bool = False):
index = default_index(len(res))

mgr = type(self).from_array(res, index)
return mgr
return mgr, np.arange(len(res), dtype=np.intp) # TODO: is taker meaningful here?

@classmethod
def from_array(cls, arr: ArrayLike, index: Index):
Expand Down
9 changes: 6 additions & 3 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1475,7 +1475,7 @@ def idelete(self, indexer) -> BlockManager:
# ----------------------------------------------------------------
# Block-wise Operation

def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T:
def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> tuple[T, npt.NDArray[np.intp]]:
"""
Apply grouped reduction function blockwise, returning a new BlockManager.
Expand All @@ -1488,6 +1488,7 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T:
Returns
-------
BlockManager
np.ndarray[intp]
"""
result_blocks: list[Block] = []
dropped_any = False
Expand Down Expand Up @@ -1522,9 +1523,10 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T:

if dropped_any:
# faster to skip _combine if we haven't dropped any blocks
return self._combine(result_blocks, copy=False, index=index)[0]
return self._combine(result_blocks, copy=False, index=index)

return type(self).from_blocks(result_blocks, [self.axes[0], index])
taker = np.arange(len(self), dtype=np.intp)
return type(self).from_blocks(result_blocks, [self.axes[0], index]), taker

def reduce(
self: T, func: Callable, ignore_failures: bool = False
Expand Down Expand Up @@ -2055,6 +2057,7 @@ def array_values(self):

def get_numeric_data(self, copy: bool = False):
    """
    Return the numeric subset of this single-block manager.

    Parameters
    ----------
    copy : bool, default False
        Whether to copy the manager when its block is numeric.

    Returns
    -------
    manager
        ``self`` (optionally copied) when the sole block is numeric,
        otherwise an empty manager of the same type.
    taker : np.ndarray[intp]
        Positional indexer of the retained columns: all positions when
        the block is numeric, empty otherwise.
    """
    if self._block.is_numeric:
        # Single numeric block -> every column is kept.
        taker = np.arange(len(self.items), dtype=np.intp)
        return self.copy(deep=copy), taker
    # Nothing numeric: empty manager paired with an empty taker.
    taker = np.array([], dtype=np.intp)
    return self.make_empty(), taker
Expand Down

0 comments on commit e8ce5e8

Please sign in to comment.