Commit: Reduce pandas dataframe overhead. (#11058)
trivialfis authored Dec 5, 2024
1 parent fcb9c2f commit 96952fc

Showing 1 changed file with 80 additions and 64 deletions.
python-package/xgboost/data.py: 80 additions & 64 deletions
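The core of the change is process-wide memoization of version-dependent pandas checks: each is_pd_*_dtype wrapper now obtains its predicate from a functools.cache'd loader, so pd.__version__ is probed and parsed once per process rather than on every dtype test. A minimal sketch of the pattern, assuming pandas is installed (illustrative only, not code from this commit):

import functools
from typing import Any, Callable


@functools.cache
def _load_is_cat() -> Callable[[Any], bool]:
    # The body runs once per process: pick a predicate suited to the
    # installed pandas, then cache the chosen callable.
    from pandas import CategoricalDtype

    def is_cat(dtype: Any) -> bool:
        return isinstance(dtype, CategoricalDtype)

    return is_cat


def is_pd_cat(dtype: Any) -> bool:
    # Hot path after the first call: a cached lookup plus one isinstance.
    return _load_is_cat()(dtype)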
@@ -2,6 +2,7 @@
 # pylint: disable=too-many-return-statements
 """Data dispatching for DMatrix."""
 import ctypes
+import functools
 import json
 import os
 import warnings
@@ -21,7 +22,9 @@
     TransformedData,
     c_bst_ulong,
 )
-from .compat import DataFrame, lazy_isinstance
+from .compat import DataFrame
+from .compat import Series as PdSeries
+from .compat import lazy_isinstance
 from .core import (
     _LIB,
     DataIter,
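PdSeries is pulled from xgboost's compat layer instead of pandas itself so this module still imports when pandas is absent. A sketch of that guarded-import pattern; the placeholder classes are an assumption for illustration, not the compat module's exact contents:

try:
    from pandas import DataFrame, Series

    PANDAS_INSTALLED = True
except ImportError:
    PANDAS_INSTALLED = False

    class DataFrame:  # type: ignore[no-redef]
        """Stand-in so annotations and isinstance checks still resolve."""

    class Series:  # type: ignore[no-redef]
        """Stand-in used when pandas is not installed."""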
@@ -377,23 +380,39 @@ def pandas_feature_info(
     else:
         feature_names = list(data.columns.map(str))
 
-    # handle feature types
+    # handle feature types and dtype validation
+    new_feature_types = []
+    need_sparse_extension_warn = True
+    for dtype in data.dtypes:
+        if is_pd_sparse_dtype(dtype):
+            new_feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
+            if need_sparse_extension_warn:
+                warnings.warn("Sparse arrays from pandas are converted into dense.")
+                need_sparse_extension_warn = False
+        elif (
+            is_pd_cat_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
+        ) and enable_categorical:
+            new_feature_types.append(CAT_T)
+        else:
+            try:
+                new_feature_types.append(_pandas_dtype_mapper[dtype.name])
+            except KeyError:
+                _invalid_dataframe_dtype(data)
+
     if feature_types is None and meta is None:
-        feature_types = []
-        for dtype in data.dtypes:
-            if is_pd_sparse_dtype(dtype):
-                feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
-            elif (
-                is_pd_cat_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
-            ) and enable_categorical:
-                feature_types.append(CAT_T)
-            else:
-                feature_types.append(_pandas_dtype_mapper[dtype.name])
+        feature_types = new_feature_types
 
     return feature_names, feature_types
 
 
 def is_nullable_dtype(dtype: PandasDType) -> bool:
     """Whether dtype is a pandas nullable type."""
+
+    from pandas.api.extensions import ExtensionDtype
+
+    if not isinstance(dtype, ExtensionDtype):
+        return False
+
     from pandas.api.types import is_bool_dtype, is_float_dtype, is_integer_dtype
 
     is_int = is_integer_dtype(dtype) and dtype.name in pandas_nullable_mapper
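With validation folded into this loop, the separate pandas_check_dtypes pass (deleted below) becomes redundant: the same dict lookup that maps a dtype name to a feature type now doubles as the validity check via try/except KeyError, and the sparse-conversion warning fires once at the first sparse column instead of after a second full scan of data.dtypes. A self-contained sketch of that single-pass shape; MAPPER and the TypeError are stand-ins for _pandas_dtype_mapper and _invalid_dataframe_dtype:

from typing import Dict, List

MAPPER: Dict[str, str] = {"int64": "int", "float64": "float", "bool": "i"}


def map_and_validate(dtype_names: List[str]) -> List[str]:
    feature_types: List[str] = []
    for name in dtype_names:
        try:
            # One lookup both maps the dtype and proves it is supported.
            feature_types.append(MAPPER[name])
        except KeyError:
            raise TypeError(f"Unsupported dtype: {name}") from None
    return feature_types


assert map_and_validate(["int64", "bool"]) == ["int", "i"]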
@@ -415,35 +434,53 @@ def is_pa_ext_categorical_dtype(dtype: Any) -> bool:
     )
 
 
-def is_pd_cat_dtype(dtype: PandasDType) -> bool:
-    """Wrapper for testing pandas category type."""
+@functools.cache
+def _lazy_load_pd_is_cat() -> Callable[[PandasDType], bool]:
     import pandas as pd
 
     if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"):
         Version = pd.util.version.Version
         if Version(pd.__version__) >= Version("2.1.0"):
             from pandas import CategoricalDtype
 
-            return isinstance(dtype, CategoricalDtype)
+            def pd_is_cat_210(dtype: PandasDType) -> bool:
+                return isinstance(dtype, CategoricalDtype)
+
+            return pd_is_cat_210
 
     from pandas.api.types import is_categorical_dtype  # type: ignore
 
-    return is_categorical_dtype(dtype)
+    return is_categorical_dtype
+
+
+def is_pd_cat_dtype(dtype: PandasDType) -> bool:
+    """Wrapper for testing pandas category type."""
+    is_cat = _lazy_load_pd_is_cat()
+    return is_cat(dtype)
 
 
-def is_pd_sparse_dtype(dtype: PandasDType) -> bool:
-    """Wrapper for testing pandas sparse type."""
+@functools.cache
+def _lazy_load_pd_is_sparse() -> Callable[[PandasDType], bool]:
     import pandas as pd
 
     if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"):
         Version = pd.util.version.Version
         if Version(pd.__version__) >= Version("2.1.0"):
             from pandas import SparseDtype
 
-            return isinstance(dtype, SparseDtype)
+            def pd_is_sparse_210(dtype: PandasDType) -> bool:
+                return isinstance(dtype, SparseDtype)
+
+            return pd_is_sparse_210
 
     from pandas.api.types import is_sparse  # type: ignore
 
-    return is_sparse(dtype)
+    return is_sparse
+
+
+def is_pd_sparse_dtype(dtype: PandasDType) -> bool:
+    """Wrapper for testing pandas sparse type."""
+    is_sparse = _lazy_load_pd_is_sparse()
+
+    return is_sparse(dtype)
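Since functools.cache memoizes these zero-argument loaders, the pd.util.version probing and the Version(pd.__version__) parse run exactly once per process; afterwards is_pd_cat_dtype and is_pd_sparse_dtype cost a cached lookup plus one isinstance. A tiny, pandas-free demonstration of the once-only behavior (illustrative assumption, not commit code):

import functools

load_count = 0


@functools.cache
def _load_predicate():
    global load_count
    load_count += 1  # stands in for the expensive version parse
    return lambda x: isinstance(x, str)


def is_str(x) -> bool:
    return _load_predicate()(x)


assert [is_str(v) for v in ("a", 1, "b")] == [True, False, True]
assert load_count == 1  # the loader body executed exactly once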
@@ -474,33 +511,34 @@ def pandas_pa_type(ser: Any) -> np.ndarray:
     return arr
 
 
+@functools.cache
+def _lazy_has_npdtypes() -> bool:
+    return np.lib.NumpyVersion(np.__version__) > np.lib.NumpyVersion("1.25.0")
+
+
+@functools.cache
+def _lazy_load_pd_floats() -> tuple:
+    from pandas import Float32Dtype, Float64Dtype
+
+    return Float32Dtype, Float64Dtype
+
+
 def pandas_transform_data(data: DataFrame) -> List[np.ndarray]:
     """Handle categorical dtype and extension types from pandas."""
-    import pandas as pd
-    from pandas import Float32Dtype, Float64Dtype
+    Float32Dtype, Float64Dtype = _lazy_load_pd_floats()
 
     result: List[np.ndarray] = []
+    np_dtypes = _lazy_has_npdtypes()
 
-    def cat_codes(ser: pd.Series) -> np.ndarray:
-        if is_pd_cat_dtype(ser.dtype):
-            return _ensure_np_dtype(
-                ser.cat.codes.astype(np.float32)
-                .replace(-1.0, np.nan)
-                .to_numpy(na_value=np.nan),
-                np.float32,
-            )[0]
-        # Not yet supported, the index is not ordered for some reason. Alternately:
-        # `combine_chunks().to_pandas().cat.codes`. The result is the same.
-        assert is_pa_ext_categorical_dtype(ser.dtype)
-        return (
-            ser.array.__arrow_array__()
-            .combine_chunks()
-            .dictionary_encode()
-            .indices.astype(np.float32)
-            .to_numpy()
-        )
+    def cat_codes(ser: PdSeries) -> np.ndarray:
+        return _ensure_np_dtype(
+            ser.cat.codes.astype(np.float32)
+            .replace(-1.0, np.nan)
+            .to_numpy(na_value=np.nan),
+            np.float32,
+        )[0]
 
-    def nu_type(ser: pd.Series) -> np.ndarray:
+    def nu_type(ser: PdSeries) -> np.ndarray:
         # Avoid conversion when possible
         if isinstance(dtype, Float32Dtype):
             res_dtype: NumpyDType = np.float32
@@ -512,10 +550,9 @@ def nu_type(ser: pd.Series) -> np.ndarray:
             ser.to_numpy(dtype=res_dtype, na_value=np.nan), res_dtype
         )[0]
 
-    def oth_type(ser: pd.Series) -> np.ndarray:
+    def oth_type(ser: PdSeries) -> np.ndarray:
         # The dtypes module is added in 1.25.
-        npdtypes = np.lib.NumpyVersion(np.__version__) > np.lib.NumpyVersion("1.25.0")
-        npdtypes = npdtypes and isinstance(
+        npdtypes = np_dtypes and isinstance(
             ser.dtype,
             (
                 # pylint: disable=no-member
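The old oth_type constructed two np.lib.NumpyVersion objects per column; the comparison now happens once in the cached _lazy_has_npdtypes() and reaches the helper as the plain bool np_dtypes. The hoisting in isolation, as a sketch with a trivial stand-in for the per-column work:

import numpy as np

# Parse the version strings once, outside any per-column helper.
np_dtypes = np.lib.NumpyVersion(np.__version__) > np.lib.NumpyVersion("1.25.0")


def transform_columns(columns):
    # Inside the loop only a cheap bool test remains.
    return [("dtypes-aware" if np_dtypes else "legacy", col) for col in columns]


print(transform_columns(["a", "b"]))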
@@ -545,7 +582,7 @@ def oth_type(ser: pd.Series) -> np.ndarray:
         elif is_nullable_dtype(dtype):
             result.append(nu_type(data[col]))
         elif is_pd_sparse_dtype(dtype):
-            arr = cast(pd.arrays.SparseArray, data[col].values)
+            arr = data[col].values
             arr = arr.to_dense()
             if _is_np_array_like(arr):
                 arr, _ = _ensure_np_dtype(arr, arr.dtype)
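Dropping the cast(pd.arrays.SparseArray, ...) wrapper changes nothing semantically: typing.cast exists for type checkers only and returns its second argument unchanged at runtime, so the old code paid one extra function call per sparse column. Roughly its runtime definition:

from typing import Any


def cast(typ: Any, val: Any) -> Any:
    """typing.cast at runtime: a no-op that returns val unchanged."""
    return val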
@@ -559,26 +596,6 @@ def oth_type(ser: pd.Series) -> np.ndarray:
     return result
 
 
-def pandas_check_dtypes(data: DataFrame, enable_categorical: bool) -> None:
-    """Validate the input types, returns True if the dataframe is backed by arrow."""
-    sparse_extension = False
-
-    for dtype in data.dtypes:
-        if not (
-            (dtype.name in _pandas_dtype_mapper)
-            or is_pd_sparse_dtype(dtype)
-            or (is_pd_cat_dtype(dtype) and enable_categorical)
-            or is_pa_ext_dtype(dtype)
-        ):
-            _invalid_dataframe_dtype(data)
-
-        if is_pd_sparse_dtype(dtype):
-            sparse_extension = True
-
-    if sparse_extension:
-        warnings.warn("Sparse arrays from pandas are converted into dense.")
-
-
 class PandasTransformed:
     """A storage class for transformed pandas DataFrame."""
 
@@ -604,7 +621,6 @@ def _transform_pandas_df(
     feature_types: Optional[FeatureTypes] = None,
     meta: Optional[str] = None,
 ) -> Tuple[PandasTransformed, Optional[FeatureNames], Optional[FeatureTypes]]:
-    pandas_check_dtypes(data, enable_categorical)
     if meta and len(data.columns) > 1 and meta not in _matrix_meta:
         raise ValueError(f"DataFrame for {meta} cannot have multiple columns")
 