Skip to content

Commit

Permalink
Support Series and Python primitives in inplace_predict and QDM (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
trivialfis authored Dec 16, 2022
1 parent a10e4cb commit f6effa1
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 44 deletions.
57 changes: 32 additions & 25 deletions python-package/xgboost/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -2220,17 +2220,15 @@ def inplace_predict(
preds = ctypes.POINTER(ctypes.c_float)()

# once caching is supported, we can pass id(data) as cache id.
args = {
"type": 0,
"training": False,
"iteration_begin": iteration_range[0],
"iteration_end": iteration_range[1],
"missing": missing,
"strict_shape": strict_shape,
"cache_id": 0,
}
if predict_type == "margin":
args["type"] = 1
args = make_jcargs(
type=1 if predict_type == "margin" else 0,
training=False,
iteration_begin=iteration_range[0],
iteration_end=iteration_range[1],
missing=missing,
strict_shape=strict_shape,
cache_id=0,
)
shape = ctypes.POINTER(c_bst_ulong)()
dims = c_bst_ulong()

Expand All @@ -2243,30 +2241,39 @@ def inplace_predict(
proxy = None
p_handle = ctypes.c_void_p()
assert proxy is None or isinstance(proxy, _ProxyDMatrix)
if validate_features:
if not hasattr(data, "shape"):
raise TypeError(
"`shape` attribute is required when `validate_features` is True."
)
if len(data.shape) != 1 and self.num_features() != data.shape[1]:
raise ValueError(
f"Feature shape mismatch, expected: {self.num_features()}, "
f"got {data.shape[1]}"
)

from .data import (
_array_interface,
_is_cudf_df,
_is_cupy_array,
_is_list,
_is_pandas_df,
_is_pandas_series,
_is_tuple,
_transform_pandas_df,
)

enable_categorical = True
if _is_pandas_series(data):
import pandas as pd
data = pd.DataFrame(data)
if _is_pandas_df(data):
data, fns, _ = _transform_pandas_df(data, enable_categorical)
if validate_features:
self._validate_features(fns)
if _is_list(data) or _is_tuple(data):
data = np.array(data)

if validate_features:
if not hasattr(data, "shape"):
raise TypeError(
"`shape` attribute is required when `validate_features` is True."
)
if len(data.shape) != 1 and self.num_features() != data.shape[1]:
raise ValueError(
f"Feature shape mismatch, expected: {self.num_features()}, "
f"got {data.shape[1]}"
)

if isinstance(data, np.ndarray):
from .data import _ensure_np_dtype
Expand All @@ -2276,7 +2283,7 @@ def inplace_predict(
_LIB.XGBoosterPredictFromDense(
self.handle,
_array_interface(data),
from_pystr_to_cstr(json.dumps(args)),
args,
p_handle,
ctypes.byref(shape),
ctypes.byref(dims),
Expand All @@ -2293,7 +2300,7 @@ def inplace_predict(
_array_interface(csr.indices),
_array_interface(csr.data),
c_bst_ulong(csr.shape[1]),
from_pystr_to_cstr(json.dumps(args)),
args,
p_handle,
ctypes.byref(shape),
ctypes.byref(dims),
Expand All @@ -2310,7 +2317,7 @@ def inplace_predict(
_LIB.XGBoosterPredictFromCudaArray(
self.handle,
interface_str,
from_pystr_to_cstr(json.dumps(args)),
args,
p_handle,
ctypes.byref(shape),
ctypes.byref(dims),
Expand All @@ -2331,7 +2338,7 @@ def inplace_predict(
_LIB.XGBoosterPredictFromCudaColumnar(
self.handle,
interfaces_str,
from_pystr_to_cstr(json.dumps(args)),
args,
p_handle,
ctypes.byref(shape),
ctypes.byref(dims),
Expand Down
13 changes: 8 additions & 5 deletions python-package/xgboost/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -958,12 +958,12 @@ def dispatch_data_backend(
return _from_list(data, missing, threads, feature_names, feature_types)
if _is_tuple(data):
return _from_tuple(data, missing, threads, feature_names, feature_types)
if _is_pandas_df(data):
return _from_pandas_df(data, enable_categorical, missing, threads,
feature_names, feature_types)
if _is_pandas_series(data):
return _from_pandas_series(
data, missing, threads, enable_categorical, feature_names, feature_types
import pandas as pd
data = pd.DataFrame(data)
if _is_pandas_df(data):
return _from_pandas_df(
data, enable_categorical, missing, threads, feature_names, feature_types
)
if _is_cudf_df(data) or _is_cudf_ser(data):
return _from_cudf_df(
Expand Down Expand Up @@ -1205,6 +1205,9 @@ def _proxy_transform(
return data, None, feature_names, feature_types
if _is_scipy_csr(data):
return data, None, feature_names, feature_types
if _is_pandas_series(data):
import pandas as pd
data = pd.DataFrame(data)
if _is_pandas_df(data):
arr, feature_names, feature_types = _transform_pandas_df(
data, enable_categorical, feature_names, feature_types
Expand Down
6 changes: 6 additions & 0 deletions python-package/xgboost/testing/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def np_dtypes(
for dtype in dtypes:
X = np.array(orig, dtype=dtype)
yield orig, X
yield orig.tolist(), X.tolist()

for dtype in dtypes:
X = np.array(orig, dtype=dtype)
Expand Down Expand Up @@ -101,6 +102,11 @@ def pd_dtypes() -> Generator:
{"f0": [1.0, 2.0, Null, 3.0], "f1": [3.0, 2.0, Null, 1.0]}, dtype=dtype
)
yield orig, df
ser_orig = orig["f0"]
ser = df["f0"]
assert isinstance(ser, pd.Series)
assert isinstance(ser_orig, pd.Series)
yield ser_orig, ser

# Categorical
orig = orig.astype("category")
Expand Down
17 changes: 16 additions & 1 deletion tests/python/test_predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pandas as pd
import pytest
from scipy import sparse
from xgboost.testing.data import np_dtypes
from xgboost.testing.data import np_dtypes, pd_dtypes
from xgboost.testing.shared import validate_leaf_output

import xgboost as xgb
Expand Down Expand Up @@ -231,6 +231,7 @@ def test_base_margin(self):
from_dmatrix = booster.predict(dtrain)
np.testing.assert_allclose(from_dmatrix, from_inplace)

@pytest.mark.skipif(**tm.no_pandas())
def test_dtypes(self) -> None:
for orig, x in np_dtypes(self.rows, self.cols):
predt_orig = self.booster.inplace_predict(orig)
Expand All @@ -246,3 +247,17 @@ def test_dtypes(self) -> None:
X: np.ndarray = np.array(orig, dtype=dtype)
with pytest.raises(ValueError):
self.booster.inplace_predict(X)

@pytest.mark.skipif(**tm.no_pandas())
def test_pd_dtypes(self) -> None:
from pandas.api.types import is_bool_dtype
for orig, x in pd_dtypes():
dtypes = orig.dtypes if isinstance(orig, pd.DataFrame) else [orig.dtypes]
if isinstance(orig, pd.DataFrame) and is_bool_dtype(dtypes[0]):
continue
y = np.arange(x.shape[0])
Xy = xgb.DMatrix(orig, y, enable_categorical=True)
booster = xgb.train({"tree_method": "hist"}, Xy, num_boost_round=1)
predt_orig = booster.inplace_predict(orig)
predt = booster.inplace_predict(x)
np.testing.assert_allclose(predt, predt_orig)
33 changes: 20 additions & 13 deletions tests/python/test_with_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,22 +298,29 @@ def test_cv_as_pandas(self):
assert 'auc' not in cv.columns[0]
assert 'error' in cv.columns[0]

def test_nullable_type(self) -> None:
@pytest.mark.parametrize("DMatrixT", [xgb.DMatrix, xgb.QuantileDMatrix])
def test_nullable_type(self, DMatrixT) -> None:
from pandas.api.types import is_categorical

for DMatrixT in (xgb.DMatrix, xgb.QuantileDMatrix):
for orig, df in pd_dtypes():
for orig, df in pd_dtypes():
if hasattr(df.dtypes, "__iter__"):
enable_categorical = any(is_categorical for dtype in df.dtypes)

m_orig = DMatrixT(orig, enable_categorical=enable_categorical)
# extension types
m_etype = DMatrixT(df, enable_categorical=enable_categorical)
# different from pd.BooleanDtype(), None is converted to False with bool
if any(dtype == "bool" for dtype in orig.dtypes):
assert not tm.predictor_equal(m_orig, m_etype)
else:
assert tm.predictor_equal(m_orig, m_etype)

else:
# series
enable_categorical = is_categorical(df.dtype)

m_orig = DMatrixT(orig, enable_categorical=enable_categorical)
# extension types
m_etype = DMatrixT(df, enable_categorical=enable_categorical)
# different from pd.BooleanDtype(), None is converted to False with bool
if hasattr(orig.dtypes, "__iter__") and any(
dtype == "bool" for dtype in orig.dtypes
):
assert not tm.predictor_equal(m_orig, m_etype)
else:
assert tm.predictor_equal(m_orig, m_etype)

if isinstance(df, pd.DataFrame):
f0 = df["f0"]
with pytest.raises(ValueError, match="Label contains NaN"):
xgb.DMatrix(df, f0, enable_categorical=enable_categorical)

0 comments on commit f6effa1

Please sign in to comment.