diff --git a/README.md b/README.md index 3839ee5af..fd2c47884 100644 --- a/README.md +++ b/README.md @@ -87,8 +87,6 @@ ML.NET (.NET/C#-package): https://github.com/dotnet/machinelearning LightGBM.NET (.NET/C#-package): https://github.com/rca22/LightGBM.Net -Dask-LightGBM (distributed and parallel Python-package): https://github.com/dask/dask-lightgbm - Ruby gem: https://github.com/ankane/lightgbm LightGBM4j (Java high-level binding): https://github.com/metarank/lightgbm4j diff --git a/docs/FAQ.rst b/docs/FAQ.rst index e2c7ad9aa..e90776347 100644 --- a/docs/FAQ.rst +++ b/docs/FAQ.rst @@ -24,7 +24,7 @@ You may also ping a member of the core team according to the relevant area of ex - `@chivee `__ **Qiwei Ye** (C++ code / Python-package) - `@btrotta `__ **Belinda Trotta** (C++ code) - `@Laurae2 `__ **Damien Soukhavong** (R-package) -- `@jameslamb `__ **James Lamb** (R-package) +- `@jameslamb `__ **James Lamb** (R-package / Dask-package) - `@wxchan `__ **Wenxuan Chen** (Python-package) - `@henry0312 `__ **Tsukasa Omoto** (Python-package) - `@StrikerRUS `__ **Nikita Titov** (Python-package) diff --git a/docs/Parallel-Learning-Guide.rst b/docs/Parallel-Learning-Guide.rst index 3cd570864..6dde5d8fb 100644 --- a/docs/Parallel-Learning-Guide.rst +++ b/docs/Parallel-Learning-Guide.rst @@ -7,7 +7,7 @@ Follow the `Quick Start <./Quick-Start.rst>`__ to know how to use LightGBM first **List of external libraries in which LightGBM can be used in a distributed fashion** -- `Dask-LightGBM`_ allows to create ML workflow on Dask distributed data structures. +- `Dask API of LightGBM <./Python-API.rst#dask-api>`__ (formerly a separate package) allows you to create ML workflows on Dask distributed data structures. - `MMLSpark`_ integrates LightGBM into Apache Spark ecosystem. `The following example`_ demonstrates how easy it's possible to utilize the great power of Spark. @@ -134,8 +134,6 @@ Example - `A simple parallel example`_ -.. 
_Dask-LightGBM: https://github.com/dask/dask-lightgbm - .. _MMLSpark: https://aka.ms/spark .. _The following example: https://github.com/Azure/mmlspark/blob/master/notebooks/samples/LightGBM%20-%20Quantile%20Regression%20for%20Drug%20Discovery.ipynb diff --git a/docs/Python-API.rst b/docs/Python-API.rst index ef249ad4c..5dee4583b 100644 --- a/docs/Python-API.rst +++ b/docs/Python-API.rst @@ -33,6 +33,16 @@ Scikit-learn API LGBMRegressor LGBMRanker +Dask API +-------- + +.. autosummary:: + :toctree: pythonapi/ + + DaskLGBMClassifier + DaskLGBMRegressor + DaskLGBMRanker + Callbacks --------- diff --git a/docs/conf.py b/docs/conf.py index b84321b95..a66be6df8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -39,7 +39,7 @@ INTERNAL_REF_REGEX = compile(r"(?P\.\/.+)(?P\.rst)(?P$|# # -- mock out modules MOCK_MODULES = ['numpy', 'scipy', 'scipy.sparse', - 'sklearn', 'matplotlib', 'pandas', 'graphviz'] + 'sklearn', 'matplotlib', 'pandas', 'graphviz', 'dask', 'dask.distributed'] for mod_name in MOCK_MODULES: sys.modules[mod_name] = Mock() diff --git a/python-package/README.rst b/python-package/README.rst index 265f9d470..121def9b1 100644 --- a/python-package/README.rst +++ b/python-package/README.rst @@ -183,12 +183,22 @@ Run ``python setup.py install --bit32``, if you want to use 32-bit version. All If you get any errors during installation or due to any other reasons, you may want to build dynamic library from sources by any method you prefer (see `Installation Guide `__) and then just run ``python setup.py install --precompile``. - Build Wheel File **************** You can use ``python setup.py bdist_wheel`` instead of ``python setup.py install`` to build wheel file and use it for installation later. This might be useful for systems with restricted or completely without network access. +Install Dask-package +'''''''''''''''''''' + +To install all additional dependencies required for the Dask-package, you can append ``[dask]`` to the LightGBM package name: + +.. 
code:: sh + + pip install lightgbm[dask] + +Or replace ``python setup.py install`` with ``pip install -e .[dask]`` if you are installing the package from source files. + Troubleshooting --------------- diff --git a/python-package/lightgbm/__init__.py b/python-package/lightgbm/__init__.py index 44f2e5667..c8bbb8484 100644 --- a/python-package/lightgbm/__init__.py +++ b/python-package/lightgbm/__init__.py @@ -19,6 +19,10 @@ try: plot_tree, create_tree_digraph) except ImportError: pass +try: + from .dask import DaskLGBMRegressor, DaskLGBMClassifier, DaskLGBMRanker +except ImportError: + pass dir_path = os.path.dirname(os.path.realpath(__file__)) @@ -31,5 +35,6 @@ __all__ = ['Dataset', 'Booster', 'CVBooster', 'register_logger', 'train', 'cv', 'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker', + 'DaskLGBMRegressor', 'DaskLGBMClassifier', 'DaskLGBMRanker', 'print_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping', 'plot_importance', 'plot_split_value_histogram', 'plot_metric', 'plot_tree', 'create_tree_digraph'] diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index fa12ae2c9..e11807577 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -105,3 +105,12 @@ except ImportError: _LGBMAssertAllFinite = None _LGBMCheckClassificationTargets = None _LGBMComputeSampleWeight = None + +"""dask""" +try: + from dask import array + from dask import dataframe + from dask.distributed import Client + DASK_INSTALLED = True +except ImportError: + DASK_INSTALLED = False diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index 3fbb6183d..4acbf1070 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -21,7 +21,8 @@ from dask import dataframe as dd from dask import delayed from dask.distributed import Client, default_client, get_worker, wait -from .basic import _ConfigAliases, _LIB, _log_warning, _safe_call +from .basic import 
_ConfigAliases, _LIB, _log_warning, _safe_call, LightGBMError +from .compat import DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED from .sklearn import LGBMClassifier, LGBMRegressor, LGBMRanker @@ -393,6 +394,9 @@ def _predict(model, data, raw_score=False, pred_proba=False, pred_leaf=False, pr class _LGBMModel: + def __init__(self): + if not all((DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED)): + raise LightGBMError('dask, pandas and scikit-learn are required for lightgbm.dask') def _fit(self, model_factory, X, y=None, sample_weight=None, group=None, client=None, **kwargs): """Docstring is inherited from the LGBMModel.""" @@ -431,7 +435,7 @@ class _LGBMModel: setattr(dest, name, attributes[name]) -class DaskLGBMClassifier(_LGBMModel, LGBMClassifier): +class DaskLGBMClassifier(LGBMClassifier, _LGBMModel): """Distributed version of lightgbm.LGBMClassifier.""" def fit(self, X, y=None, sample_weight=None, client=None, **kwargs): @@ -479,7 +483,7 @@ class DaskLGBMClassifier(_LGBMModel, LGBMClassifier): return self._to_local(LGBMClassifier) -class DaskLGBMRegressor(_LGBMModel, LGBMRegressor): +class DaskLGBMRegressor(LGBMRegressor, _LGBMModel): """Docstring is inherited from the lightgbm.LGBMRegressor.""" def fit(self, X, y=None, sample_weight=None, client=None, **kwargs): @@ -515,7 +519,7 @@ class DaskLGBMRegressor(_LGBMModel, LGBMRegressor): return self._to_local(LGBMRegressor) -class DaskLGBMRanker(_LGBMModel, LGBMRanker): +class DaskLGBMRanker(LGBMRanker, _LGBMModel): """Docstring is inherited from the lightgbm.LGBMRanker.""" def fit(self, X, y=None, sample_weight=None, init_score=None, group=None, client=None, **kwargs): diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index 51f1b7e6e..478b1efac 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -334,7 +334,7 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi "xe_ndcg", "xe_ndcg_mart", "xendcg_mart"} 
for obj_alias in _ConfigAliases.get("objective")): if not SKLEARN_INSTALLED: - raise LightGBMError('Scikit-learn is required for ranking cv.') + raise LightGBMError('scikit-learn is required for ranking cv') # ranking task, split according to groups group_info = np.array(full_data.get_group(), dtype=np.int32, copy=False) flatted_group = np.repeat(range(len(group_info)), repeats=group_info) @@ -342,7 +342,7 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi folds = group_kfold.split(X=np.zeros(num_data), groups=flatted_group) elif stratified: if not SKLEARN_INSTALLED: - raise LightGBMError('Scikit-learn is required for stratified cv.') + raise LightGBMError('scikit-learn is required for stratified cv') skf = _LGBMStratifiedKFold(n_splits=nfold, shuffle=shuffle, random_state=seed) folds = skf.split(X=np.zeros(num_data), y=full_data.get_label()) else: diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 9fa930c90..96efef17d 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -289,7 +289,7 @@ class LGBMModel(_LGBMModelBase): and you should group grad and hess in this way as well. """ if not SKLEARN_INSTALLED: - raise LightGBMError('Scikit-learn is required for this module') + raise LightGBMError('scikit-learn is required for lightgbm.sklearn') self.boosting_type = boosting_type self.objective = objective diff --git a/python-package/setup.py b/python-package/setup.py index 3e545c490..08cc7d0c0 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -344,7 +344,7 @@ if __name__ == "__main__": extras_require={ 'dask': [ 'dask[array]>=2.0.0', - 'dask[dataframe]>=2.0.0' + 'dask[dataframe]>=2.0.0', 'dask[distributed]>=2.0.0', 'pandas', ],