[dask] [docs] Fix inaccuracies in API docs for Dask module (fixes #3871…

…) (#3930) * got fit() working * add predict() * predict_proba() * remove custom objective docs * Apply suggestions from code review Co-authored-by: Nikita Titov <[email protected]> * fix capitalization * Update tests/python_package_test/test_dask.py Co-authored-by: Nikita Titov <[email protected]> Co-authored-by: Nikita Titov <[email protected]>
microsoft · Feb 9, 2021 · 06ed433 · 06ed433
1 parent 846b512
commit 06ed433
Show file tree

Hide file tree

Showing 3 changed files with 314 additions and 215 deletions.
diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py
@@ -1,8 +1,8 @@
 # coding: utf-8
-"""Distributed training with LightGBM and Dask.distributed.
+"""Distributed training with LightGBM and dask.distributed.
 
 This module enables you to perform distributed training with LightGBM on
-Dask.Array and Dask.DataFrame collections.
+dask.Array and dask.DataFrame collections.
 
 It is based on dask-lightgbm, which was based on dask-xgboost.
 """
@@ -19,7 +19,14 @@
 from .compat import (PANDAS_INSTALLED, pd_DataFrame, pd_Series, concat,
                      SKLEARN_INSTALLED, LGBMNotFittedError,
                      DASK_INSTALLED, dask_DataFrame, dask_Array, dask_Series, delayed, Client, default_client, get_worker, wait)
-from .sklearn import LGBMClassifier, LGBMModel, LGBMRegressor, LGBMRanker
+from .sklearn import (
+    _lgbmmodel_doc_fit,
+    _lgbmmodel_doc_predict,
+    LGBMClassifier,
+    LGBMModel,
+    LGBMRegressor,
+    LGBMRanker
+)
 
 _DaskCollection = Union[dask_Array, dask_DataFrame, dask_Series]
 _DaskMatrixLike = Union[dask_Array, dask_DataFrame]
@@ -216,17 +223,17 @@ def _train(
     ----------
     client : dask.distributed.Client
         Dask client.
-    data : dask Array or dask DataFrame of shape = [n_samples, n_features]
+    data : Dask Array or Dask DataFrame of shape = [n_samples, n_features]
         Input feature matrix.
-    label : dask Array, dask DataFrame or dask Series of shape = [n_samples]
+    label : Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]
         The target values (class labels in classification, real numbers in regression).
     params : dict
         Parameters passed to constructor of the local underlying model.
     model_factory : lightgbm.LGBMClassifier, lightgbm.LGBMRegressor, or lightgbm.LGBMRanker class
         Class of the local underlying model.
-    sample_weight : dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
+    sample_weight : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
         Weights of training data.
-    group : dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
+    group : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
         Group/query data.
         Only used in the learning-to-rank task.
         sum(group) = n_samples.
@@ -396,7 +403,7 @@ def _predict(
     ----------
     model : lightgbm.LGBMClassifier, lightgbm.LGBMRegressor, or lightgbm.LGBMRanker class
         Fitted underlying model.
-    data : dask Array or dask DataFrame of shape = [n_samples, n_features]
+    data : Dask Array or Dask DataFrame of shape = [n_samples, n_features]
         Input feature matrix.
     raw_score : bool, optional (default=False)
         Whether to predict raw scores.
@@ -413,11 +420,11 @@ def _predict(
 
     Returns
     -------
-    predicted_result : dask Array of shape = [n_samples] or shape = [n_samples, n_classes]
+    predicted_result : Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]
         The predicted values.
-    X_leaves : dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]
+    X_leaves : Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]
         If ``pred_leaf=True``, the predicted leaf of every tree for each sample.
-    X_SHAP_values : dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]
+    X_SHAP_values : Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]
         If ``pred_contrib=True``, the feature contributions for each sample.
     """
     if not all((DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED)):
@@ -448,7 +455,7 @@ def _predict(
             **kwargs
         )
     else:
-        raise TypeError('Data must be either dask Array or dask DataFrame. Got %s.' % str(type(data)))
+        raise TypeError('Data must be either Dask Array or Dask DataFrame. Got %s.' % str(type(data)))
 
 
 class _DaskLGBMModel:
@@ -578,13 +585,17 @@ def __init__(
 
     _base_doc = LGBMClassifier.__init__.__doc__
     _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
-    __init__.__doc__ = (
+    _base_doc = (
         _before_kwargs
         + 'client : dask.distributed.Client or None, optional (default=None)\n'
         + ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n'
         + ' ' * 8 + _kwargs + _after_kwargs
     )
 
+    # the note on custom objective functions in LGBMModel.__init__ is not
+    # currently relevant for the Dask estimators
+    __init__.__doc__ = _base_doc[:_base_doc.find('Note\n')]
+
     def __getstate__(self) -> Dict[Any, Any]:
         return self._lgb_getstate()
 
@@ -604,7 +615,23 @@ def fit(
             **kwargs
         )
 
-    fit.__doc__ = LGBMClassifier.fit.__doc__
+    _base_doc = _lgbmmodel_doc_fit.format(
+        X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
+        y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
+        sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
+        group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
+    )
+
+    # DaskLGBMClassifier does not support init_score, evaluation data, or early stopping
+    _base_doc = (_base_doc[:_base_doc.find('init_score :')]
+                 + _base_doc[_base_doc.find('verbose :'):])
+
+    # DaskLGBMClassifier support for callbacks and init_model is not tested
+    fit.__doc__ = (
+        _base_doc[:_base_doc.find('callbacks :')]
+        + '**kwargs\n'
+        + ' ' * 12 + 'Other parameters passed through to ``LGBMClassifier.fit()``\n'
+    )
 
     def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
         """Docstring is inherited from the lightgbm.LGBMClassifier.predict."""
@@ -615,7 +642,14 @@ def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
             **kwargs
         )
 
-    predict.__doc__ = LGBMClassifier.predict.__doc__
+    predict.__doc__ = _lgbmmodel_doc_predict.format(
+        description="Return the predicted value for each sample.",
+        X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
+        output_name="predicted_result",
+        predicted_result_shape="Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]",
+        X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
+        X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]"
+    )
 
     def predict_proba(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
         """Docstring is inherited from the lightgbm.LGBMClassifier.predict_proba."""
@@ -626,7 +660,14 @@ def predict_proba(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
             **kwargs
         )
 
-    predict_proba.__doc__ = LGBMClassifier.predict_proba.__doc__
+    predict_proba.__doc__ = _lgbmmodel_doc_predict.format(
+        description="Return the predicted probability for each class for each sample.",
+        X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
+        output_name="predicted_probability",
+        predicted_result_shape="Dask Array of shape = [n_samples, n_classes]",
+        X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
+        X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]"
+    )
 
     def to_local(self) -> LGBMClassifier:
         """Create regular version of lightgbm.LGBMClassifier from the distributed version.
@@ -695,13 +736,17 @@ def __init__(
 
     _base_doc = LGBMRegressor.__init__.__doc__
     _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
-    __init__.__doc__ = (
+    _base_doc = (
         _before_kwargs
         + 'client : dask.distributed.Client or None, optional (default=None)\n'
         + ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n'
         + ' ' * 8 + _kwargs + _after_kwargs
     )
 
+    # the note on custom objective functions in LGBMModel.__init__ is not
+    # currently relevant for the Dask estimators
+    __init__.__doc__ = _base_doc[:_base_doc.find('Note\n')]
+
     def __getstate__(self) -> Dict[Any, Any]:
         return self._lgb_getstate()
 
@@ -721,7 +766,23 @@ def fit(
             **kwargs
         )
 
-    fit.__doc__ = LGBMRegressor.fit.__doc__
+    _base_doc = _lgbmmodel_doc_fit.format(
+        X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
+        y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
+        sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
+        group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
+    )
+
+    # DaskLGBMRegressor does not support init_score, evaluation data, or early stopping
+    _base_doc = (_base_doc[:_base_doc.find('init_score :')]
+                 + _base_doc[_base_doc.find('verbose :'):])
+
+    # DaskLGBMRegressor support for callbacks and init_model is not tested
+    fit.__doc__ = (
+        _base_doc[:_base_doc.find('callbacks :')]
+        + '**kwargs\n'
+        + ' ' * 12 + 'Other parameters passed through to ``LGBMRegressor.fit()``\n'
+    )
 
     def predict(self, X: _DaskMatrixLike, **kwargs) -> dask_Array:
         """Docstring is inherited from the lightgbm.LGBMRegressor.predict."""
@@ -731,7 +792,14 @@ def predict(self, X: _DaskMatrixLike, **kwargs) -> dask_Array:
             **kwargs
         )
 
-    predict.__doc__ = LGBMRegressor.predict.__doc__
+    predict.__doc__ = _lgbmmodel_doc_predict.format(
+        description="Return the predicted value for each sample.",
+        X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
+        output_name="predicted_result",
+        predicted_result_shape="Dask Array of shape = [n_samples]",
+        X_leaves_shape="Dask Array of shape = [n_samples, n_trees]",
+        X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1]"
+    )
 
     def to_local(self) -> LGBMRegressor:
         """Create regular version of lightgbm.LGBMRegressor from the distributed version.
@@ -800,13 +868,17 @@ def __init__(
 
     _base_doc = LGBMRanker.__init__.__doc__
     _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
-    __init__.__doc__ = (
+    _base_doc = (
         _before_kwargs
         + 'client : dask.distributed.Client or None, optional (default=None)\n'
         + ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n'
         + ' ' * 8 + _kwargs + _after_kwargs
     )
 
+    # the note on custom objective functions in LGBMModel.__init__ is not
+    # currently relevant for the Dask estimators
+    __init__.__doc__ = _base_doc[:_base_doc.find('Note\n')]
+
     def __getstate__(self) -> Dict[Any, Any]:
         return self._lgb_getstate()
 
@@ -832,13 +904,39 @@ def fit(
             **kwargs
         )
 
-    fit.__doc__ = LGBMRanker.fit.__doc__
+    _base_doc = _lgbmmodel_doc_fit.format(
+        X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
+        y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
+        sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
+        group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
+    )
+
+    # DaskLGBMRanker does not support init_score, evaluation data, or early stopping
+    _base_doc = (_base_doc[:_base_doc.find('init_score :')]
+                 + _base_doc[_base_doc.find('init_score :'):])
+
+    _base_doc = (_base_doc[:_base_doc.find('eval_set :')]
+                 + _base_doc[_base_doc.find('verbose :'):])
+
+    # DaskLGBMRanker support for callbacks and init_model is not tested
+    fit.__doc__ = (
+        _base_doc[:_base_doc.find('callbacks :')]
+        + '**kwargs\n'
+        + ' ' * 12 + 'Other parameters passed through to ``LGBMRanker.fit()``\n'
+    )
 
     def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
         """Docstring is inherited from the lightgbm.LGBMRanker.predict."""
         return _predict(self.to_local(), X, **kwargs)
 
-    predict.__doc__ = LGBMRanker.predict.__doc__
+    predict.__doc__ = _lgbmmodel_doc_predict.format(
+        description="Return the predicted value for each sample.",
+        X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
+        output_name="predicted_result",
+        predicted_result_shape="Dask Array of shape = [n_samples]",
+        X_leaves_shape="Dask Array of shape = [n_samples, n_trees]",
+        X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1]"
+    )
 
     def to_local(self) -> LGBMRanker:
         """Create regular version of lightgbm.LGBMRanker from the distributed version.