Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support Series and Python primitives in inplace_predict and QDM #8547

Merged
merged 7 commits into from
Dec 16, 2022
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 30 additions & 23 deletions python-package/xgboost/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -2220,17 +2220,15 @@ def inplace_predict(
preds = ctypes.POINTER(ctypes.c_float)()

# once caching is supported, we can pass id(data) as cache id.
args = {
"type": 0,
"training": False,
"iteration_begin": iteration_range[0],
"iteration_end": iteration_range[1],
"missing": missing,
"strict_shape": strict_shape,
"cache_id": 0,
}
if predict_type == "margin":
args["type"] = 1
args = make_jcargs(
type=1 if predict_type == "margin" else 0,
training=False,
iteration_begin=iteration_range[0],
iteration_end=iteration_range[1],
missing=missing,
strict_shape=strict_shape,
cache_id=0,
)
shape = ctypes.POINTER(c_bst_ulong)()
dims = c_bst_ulong()

Expand All @@ -2243,6 +2241,18 @@ def inplace_predict(
proxy = None
p_handle = ctypes.c_void_p()
assert proxy is None or isinstance(proxy, _ProxyDMatrix)

from .data import (
_array_interface,
_is_cudf_df,
_is_cupy_array,
_is_list,
_is_pandas_df,
_is_pandas_series,
_is_tuple,
_transform_pandas_df,
)

if validate_features:
if not hasattr(data, "shape"):
raise TypeError(
Expand All @@ -2254,19 +2264,16 @@ def inplace_predict(
f"got {data.shape[1]}"
)

from .data import (
_array_interface,
_is_cudf_df,
_is_cupy_array,
_is_pandas_df,
_transform_pandas_df,
)

enable_categorical = True
if _is_pandas_series(data):
import pandas as pd
data = pd.DataFrame(data)
if _is_pandas_df(data):
data, fns, _ = _transform_pandas_df(data, enable_categorical)
if validate_features:
self._validate_features(fns)
if _is_list(data) or _is_tuple(data):
data = np.array(data)

if isinstance(data, np.ndarray):
from .data import _ensure_np_dtype
Expand All @@ -2276,7 +2283,7 @@ def inplace_predict(
_LIB.XGBoosterPredictFromDense(
self.handle,
_array_interface(data),
from_pystr_to_cstr(json.dumps(args)),
args,
p_handle,
ctypes.byref(shape),
ctypes.byref(dims),
Expand All @@ -2293,7 +2300,7 @@ def inplace_predict(
_array_interface(csr.indices),
_array_interface(csr.data),
c_bst_ulong(csr.shape[1]),
from_pystr_to_cstr(json.dumps(args)),
args,
p_handle,
ctypes.byref(shape),
ctypes.byref(dims),
Expand All @@ -2310,7 +2317,7 @@ def inplace_predict(
_LIB.XGBoosterPredictFromCudaArray(
self.handle,
interface_str,
from_pystr_to_cstr(json.dumps(args)),
args,
p_handle,
ctypes.byref(shape),
ctypes.byref(dims),
Expand All @@ -2331,7 +2338,7 @@ def inplace_predict(
_LIB.XGBoosterPredictFromCudaColumnar(
self.handle,
interfaces_str,
from_pystr_to_cstr(json.dumps(args)),
args,
p_handle,
ctypes.byref(shape),
ctypes.byref(dims),
Expand Down
13 changes: 8 additions & 5 deletions python-package/xgboost/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -958,12 +958,12 @@ def dispatch_data_backend(
return _from_list(data, missing, threads, feature_names, feature_types)
if _is_tuple(data):
return _from_tuple(data, missing, threads, feature_names, feature_types)
if _is_pandas_df(data):
return _from_pandas_df(data, enable_categorical, missing, threads,
feature_names, feature_types)
if _is_pandas_series(data):
return _from_pandas_series(
data, missing, threads, enable_categorical, feature_names, feature_types
import pandas as pd
data = pd.DataFrame(data)
if _is_pandas_df(data):
return _from_pandas_df(
data, enable_categorical, missing, threads, feature_names, feature_types
)
if _is_cudf_df(data) or _is_cudf_ser(data):
return _from_cudf_df(
Expand Down Expand Up @@ -1205,6 +1205,9 @@ def _proxy_transform(
return data, None, feature_names, feature_types
if _is_scipy_csr(data):
return data, None, feature_names, feature_types
if _is_pandas_series(data):
import pandas as pd
data = pd.DataFrame(data)
if _is_pandas_df(data):
arr, feature_names, feature_types = _transform_pandas_df(
data, enable_categorical, feature_names, feature_types
Expand Down
6 changes: 6 additions & 0 deletions python-package/xgboost/testing/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def np_dtypes(
for dtype in dtypes:
X = np.array(orig, dtype=dtype)
yield orig, X
yield orig.tolist(), X.tolist()

for dtype in dtypes:
X = np.array(orig, dtype=dtype)
Expand Down Expand Up @@ -101,6 +102,11 @@ def pd_dtypes() -> Generator:
{"f0": [1.0, 2.0, Null, 3.0], "f1": [3.0, 2.0, Null, 1.0]}, dtype=dtype
)
yield orig, df
ser_orig = orig["f0"]
ser = df["f0"]
assert isinstance(ser, pd.Series)
assert isinstance(ser_orig, pd.Series)
yield ser_orig, ser

# Categorical
orig = orig.astype("category")
Expand Down
17 changes: 16 additions & 1 deletion tests/python/test_predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pandas as pd
import pytest
from scipy import sparse
from xgboost.testing.data import np_dtypes
from xgboost.testing.data import np_dtypes, pd_dtypes
from xgboost.testing.shared import validate_leaf_output

import xgboost as xgb
Expand Down Expand Up @@ -231,6 +231,7 @@ def test_base_margin(self):
from_dmatrix = booster.predict(dtrain)
np.testing.assert_allclose(from_dmatrix, from_inplace)

@pytest.mark.skipif(**tm.no_pandas())
hcho3 marked this conversation as resolved.
Show resolved Hide resolved
def test_dtypes(self) -> None:
for orig, x in np_dtypes(self.rows, self.cols):
predt_orig = self.booster.inplace_predict(orig)
Expand All @@ -246,3 +247,17 @@ def test_dtypes(self) -> None:
X: np.ndarray = np.array(orig, dtype=dtype)
with pytest.raises(ValueError):
self.booster.inplace_predict(X)

@pytest.mark.skipif(**tm.no_pandas())
def test_pd_dtypes(self) -> None:
    """Check that ``inplace_predict`` agrees across equivalent pandas inputs.

    For every ``(orig, x)`` pair yielded by ``pd_dtypes()``, a one-round
    ``hist`` booster is trained on ``orig`` and the in-place predictions for
    ``orig`` and ``x`` are required to match.
    """
    from pandas.api.types import is_bool_dtype

    for orig, x in pd_dtypes():
        # Only DataFrames are screened for a bool first column; any other
        # input goes straight to the comparison below.  Use ``.iloc`` for
        # the positional lookup: ``orig.dtypes`` is indexed by column label,
        # and integer ``[]`` access on a label-indexed Series relies on a
        # deprecated positional fallback.
        # NOTE(review): presumably bool frames are skipped because plain
        # ``bool`` cannot represent missing values the way
        # ``pd.BooleanDtype`` does -- confirm against ``pd_dtypes``.
        if isinstance(orig, pd.DataFrame) and is_bool_dtype(orig.dtypes.iloc[0]):
            continue

        # Labels are just the row indices; only prediction parity matters.
        y = np.arange(x.shape[0])
        Xy = xgb.DMatrix(orig, y, enable_categorical=True)
        booster = xgb.train({"tree_method": "hist"}, Xy, num_boost_round=1)

        predt_orig = booster.inplace_predict(orig)
        predt = booster.inplace_predict(x)
        np.testing.assert_allclose(predt, predt_orig)
33 changes: 20 additions & 13 deletions tests/python/test_with_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,22 +298,29 @@ def test_cv_as_pandas(self):
assert 'auc' not in cv.columns[0]
assert 'error' in cv.columns[0]

def test_nullable_type(self) -> None:
@pytest.mark.parametrize("DMatrixT", [xgb.DMatrix, xgb.QuantileDMatrix])
def test_nullable_type(self, DMatrixT) -> None:
from pandas.api.types import is_categorical

for DMatrixT in (xgb.DMatrix, xgb.QuantileDMatrix):
for orig, df in pd_dtypes():
for orig, df in pd_dtypes():
if hasattr(df.dtypes, "__iter__"):
enable_categorical = any(is_categorical for dtype in df.dtypes)

m_orig = DMatrixT(orig, enable_categorical=enable_categorical)
# extension types
m_etype = DMatrixT(df, enable_categorical=enable_categorical)
# different from pd.BooleanDtype(), None is converted to False with bool
if any(dtype == "bool" for dtype in orig.dtypes):
assert not tm.predictor_equal(m_orig, m_etype)
else:
assert tm.predictor_equal(m_orig, m_etype)

else:
# series
enable_categorical = is_categorical(df.dtype)

m_orig = DMatrixT(orig, enable_categorical=enable_categorical)
# extension types
m_etype = DMatrixT(df, enable_categorical=enable_categorical)
# different from pd.BooleanDtype(), None is converted to False with bool
if hasattr(orig.dtypes, "__iter__") and any(
dtype == "bool" for dtype in orig.dtypes
):
assert not tm.predictor_equal(m_orig, m_etype)
else:
assert tm.predictor_equal(m_orig, m_etype)

if isinstance(df, pd.DataFrame):
f0 = df["f0"]
with pytest.raises(ValueError, match="Label contains NaN"):
xgb.DMatrix(df, f0, enable_categorical=enable_categorical)