Skip to content

Commit

Permalink
[dask] [docs] Fix inaccuracies in API docs for Dask module (fixes #3871
Browse files Browse the repository at this point in the history
…) (#3930)

* got fit() working

* add predict()

* predict_proba()

* remove custom objective docs

* Apply suggestions from code review

Co-authored-by: Nikita Titov <[email protected]>

* fix capitalization

* Update tests/python_package_test/test_dask.py

Co-authored-by: Nikita Titov <[email protected]>

Co-authored-by: Nikita Titov <[email protected]>
  • Loading branch information
jameslamb and StrikerRUS authored Feb 9, 2021
1 parent 846b512 commit 06ed433
Show file tree
Hide file tree
Showing 3 changed files with 314 additions and 215 deletions.
142 changes: 120 additions & 22 deletions python-package/lightgbm/dask.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# coding: utf-8
"""Distributed training with LightGBM and Dask.distributed.
"""Distributed training with LightGBM and dask.distributed.
This module enables you to perform distributed training with LightGBM on
Dask.Array and Dask.DataFrame collections.
dask.Array and dask.DataFrame collections.
It is based on dask-lightgbm, which was based on dask-xgboost.
"""
Expand All @@ -19,7 +19,14 @@
from .compat import (PANDAS_INSTALLED, pd_DataFrame, pd_Series, concat,
SKLEARN_INSTALLED, LGBMNotFittedError,
DASK_INSTALLED, dask_DataFrame, dask_Array, dask_Series, delayed, Client, default_client, get_worker, wait)
from .sklearn import LGBMClassifier, LGBMModel, LGBMRegressor, LGBMRanker
from .sklearn import (
_lgbmmodel_doc_fit,
_lgbmmodel_doc_predict,
LGBMClassifier,
LGBMModel,
LGBMRegressor,
LGBMRanker
)

_DaskCollection = Union[dask_Array, dask_DataFrame, dask_Series]
_DaskMatrixLike = Union[dask_Array, dask_DataFrame]
Expand Down Expand Up @@ -216,17 +223,17 @@ def _train(
----------
client : dask.distributed.Client
Dask client.
data : dask Array or dask DataFrame of shape = [n_samples, n_features]
data : Dask Array or Dask DataFrame of shape = [n_samples, n_features]
Input feature matrix.
label : dask Array, dask DataFrame or dask Series of shape = [n_samples]
label : Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]
The target values (class labels in classification, real numbers in regression).
params : dict
Parameters passed to constructor of the local underlying model.
model_factory : lightgbm.LGBMClassifier, lightgbm.LGBMRegressor, or lightgbm.LGBMRanker class
Class of the local underlying model.
sample_weight : dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
sample_weight : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
Weights of training data.
group : dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
group : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
Group/query data.
Only used in the learning-to-rank task.
sum(group) = n_samples.
Expand Down Expand Up @@ -396,7 +403,7 @@ def _predict(
----------
model : lightgbm.LGBMClassifier, lightgbm.LGBMRegressor, or lightgbm.LGBMRanker class
Fitted underlying model.
data : dask Array or dask DataFrame of shape = [n_samples, n_features]
data : Dask Array or Dask DataFrame of shape = [n_samples, n_features]
Input feature matrix.
raw_score : bool, optional (default=False)
Whether to predict raw scores.
Expand All @@ -413,11 +420,11 @@ def _predict(
Returns
-------
predicted_result : dask Array of shape = [n_samples] or shape = [n_samples, n_classes]
predicted_result : Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]
The predicted values.
X_leaves : dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]
X_leaves : Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]
If ``pred_leaf=True``, the predicted leaf of every tree for each sample.
X_SHAP_values : dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]
X_SHAP_values : Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]
If ``pred_contrib=True``, the feature contributions for each sample.
"""
if not all((DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED)):
Expand Down Expand Up @@ -448,7 +455,7 @@ def _predict(
**kwargs
)
else:
raise TypeError('Data must be either dask Array or dask DataFrame. Got %s.' % str(type(data)))
raise TypeError('Data must be either Dask Array or Dask DataFrame. Got %s.' % str(type(data)))


class _DaskLGBMModel:
Expand Down Expand Up @@ -578,13 +585,17 @@ def __init__(

_base_doc = LGBMClassifier.__init__.__doc__
_before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
__init__.__doc__ = (
_base_doc = (
_before_kwargs
+ 'client : dask.distributed.Client or None, optional (default=None)\n'
+ ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n'
+ ' ' * 8 + _kwargs + _after_kwargs
)

# the note on custom objective functions in LGBMModel.__init__ is not
# currently relevant for the Dask estimators
__init__.__doc__ = _base_doc[:_base_doc.find('Note\n')]

def __getstate__(self) -> Dict[Any, Any]:
return self._lgb_getstate()

Expand All @@ -604,7 +615,23 @@ def fit(
**kwargs
)

fit.__doc__ = LGBMClassifier.fit.__doc__
_base_doc = _lgbmmodel_doc_fit.format(
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
)

# DaskLGBMClassifier does not support init_score, evaluation data, or early stopping
_base_doc = (_base_doc[:_base_doc.find('init_score :')]
+ _base_doc[_base_doc.find('verbose :'):])

# DaskLGBMClassifier support for callbacks and init_model is not tested
fit.__doc__ = (
_base_doc[:_base_doc.find('callbacks :')]
+ '**kwargs\n'
+ ' ' * 12 + 'Other parameters passed through to ``LGBMClassifier.fit()``\n'
)

def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
"""Docstring is inherited from the lightgbm.LGBMClassifier.predict."""
Expand All @@ -615,7 +642,14 @@ def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
**kwargs
)

predict.__doc__ = LGBMClassifier.predict.__doc__
predict.__doc__ = _lgbmmodel_doc_predict.format(
description="Return the predicted value for each sample.",
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
output_name="predicted_result",
predicted_result_shape="Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]",
X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]"
)

def predict_proba(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
"""Docstring is inherited from the lightgbm.LGBMClassifier.predict_proba."""
Expand All @@ -626,7 +660,14 @@ def predict_proba(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
**kwargs
)

predict_proba.__doc__ = LGBMClassifier.predict_proba.__doc__
predict_proba.__doc__ = _lgbmmodel_doc_predict.format(
description="Return the predicted probability for each class for each sample.",
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
output_name="predicted_probability",
predicted_result_shape="Dask Array of shape = [n_samples, n_classes]",
X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]"
)

def to_local(self) -> LGBMClassifier:
"""Create regular version of lightgbm.LGBMClassifier from the distributed version.
Expand Down Expand Up @@ -695,13 +736,17 @@ def __init__(

_base_doc = LGBMRegressor.__init__.__doc__
_before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
__init__.__doc__ = (
_base_doc = (
_before_kwargs
+ 'client : dask.distributed.Client or None, optional (default=None)\n'
+ ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n'
+ ' ' * 8 + _kwargs + _after_kwargs
)

# the note on custom objective functions in LGBMModel.__init__ is not
# currently relevant for the Dask estimators
__init__.__doc__ = _base_doc[:_base_doc.find('Note\n')]

def __getstate__(self) -> Dict[Any, Any]:
return self._lgb_getstate()

Expand All @@ -721,7 +766,23 @@ def fit(
**kwargs
)

fit.__doc__ = LGBMRegressor.fit.__doc__
_base_doc = _lgbmmodel_doc_fit.format(
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
)

# DaskLGBMRegressor does not support init_score, evaluation data, or early stopping
_base_doc = (_base_doc[:_base_doc.find('init_score :')]
+ _base_doc[_base_doc.find('verbose :'):])

# DaskLGBMRegressor support for callbacks and init_model is not tested
fit.__doc__ = (
_base_doc[:_base_doc.find('callbacks :')]
+ '**kwargs\n'
+ ' ' * 12 + 'Other parameters passed through to ``LGBMRegressor.fit()``\n'
)

def predict(self, X: _DaskMatrixLike, **kwargs) -> dask_Array:
"""Docstring is inherited from the lightgbm.LGBMRegressor.predict."""
Expand All @@ -731,7 +792,14 @@ def predict(self, X: _DaskMatrixLike, **kwargs) -> dask_Array:
**kwargs
)

predict.__doc__ = LGBMRegressor.predict.__doc__
predict.__doc__ = _lgbmmodel_doc_predict.format(
description="Return the predicted value for each sample.",
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
output_name="predicted_result",
predicted_result_shape="Dask Array of shape = [n_samples]",
X_leaves_shape="Dask Array of shape = [n_samples, n_trees]",
X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1]"
)

def to_local(self) -> LGBMRegressor:
"""Create regular version of lightgbm.LGBMRegressor from the distributed version.
Expand Down Expand Up @@ -800,13 +868,17 @@ def __init__(

_base_doc = LGBMRanker.__init__.__doc__
_before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
__init__.__doc__ = (
_base_doc = (
_before_kwargs
+ 'client : dask.distributed.Client or None, optional (default=None)\n'
+ ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n'
+ ' ' * 8 + _kwargs + _after_kwargs
)

# the note on custom objective functions in LGBMModel.__init__ is not
# currently relevant for the Dask estimators
__init__.__doc__ = _base_doc[:_base_doc.find('Note\n')]

def __getstate__(self) -> Dict[Any, Any]:
return self._lgb_getstate()

Expand All @@ -832,13 +904,39 @@ def fit(
**kwargs
)

fit.__doc__ = LGBMRanker.fit.__doc__
_base_doc = _lgbmmodel_doc_fit.format(
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
)

# DaskLGBMRanker does not support init_score, evaluation data, or early stopping
_base_doc = (_base_doc[:_base_doc.find('init_score :')]
+ _base_doc[_base_doc.find('init_score :'):])

_base_doc = (_base_doc[:_base_doc.find('eval_set :')]
+ _base_doc[_base_doc.find('verbose :'):])

# DaskLGBMRanker support for callbacks and init_model is not tested
fit.__doc__ = (
_base_doc[:_base_doc.find('callbacks :')]
+ '**kwargs\n'
+ ' ' * 12 + 'Other parameters passed through to ``LGBMRanker.fit()``\n'
)

def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
"""Docstring is inherited from the lightgbm.LGBMRanker.predict."""
return _predict(self.to_local(), X, **kwargs)

predict.__doc__ = LGBMRanker.predict.__doc__
predict.__doc__ = _lgbmmodel_doc_predict.format(
description="Return the predicted value for each sample.",
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
output_name="predicted_result",
predicted_result_shape="Dask Array of shape = [n_samples]",
X_leaves_shape="Dask Array of shape = [n_samples, n_trees]",
X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1]"
)

def to_local(self) -> LGBMRanker:
"""Create regular version of lightgbm.LGBMRanker from the distributed version.
Expand Down
Loading

0 comments on commit 06ed433

Please sign in to comment.