[ci] [python-package] enable ruff-format on all Python code (#6336)

James Lamb 2024-02-27 10:53:12 -06:00 committed by GitHub
Parent 2a08565513
Commit dd31208ab7
No key matching this signature was found
GPG key ID: B5690EEEBB952194
10 changed files: 1986 additions and 1617 deletions
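For context, ruff-format applies Black-style rewrites, and that is exactly what the per-file hunks below show: single quotes become double quotes, long collections and call sites are exploded onto one element per line with a trailing comma, and manually wrapped expressions are re-wrapped. A small illustrative before/after sketch (this snippet is made up for illustration, not copied from the repository):

# before ruff-format: single quotes, manual wrapping
__all__ = ['Dataset', 'Booster',
           'train', 'cv']

# after ruff-format: double quotes, one element per line, trailing comma
__all__ = [
    "Dataset",
    "Booster",
    "train",
    "cv",
]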

View file

@ -23,14 +23,33 @@ except ImportError:
pass
_version_path = Path(__file__).absolute().parent / 'VERSION.txt'
_version_path = Path(__file__).absolute().parent / "VERSION.txt"
if _version_path.is_file():
__version__ = _version_path.read_text(encoding='utf-8').strip()
__version__ = _version_path.read_text(encoding="utf-8").strip()
__all__ = ['Dataset', 'Booster', 'CVBooster', 'Sequence',
'register_logger',
'train', 'cv',
'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker',
'DaskLGBMRegressor', 'DaskLGBMClassifier', 'DaskLGBMRanker',
'log_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping', 'EarlyStopException',
'plot_importance', 'plot_split_value_histogram', 'plot_metric', 'plot_tree', 'create_tree_digraph']
__all__ = [
"Dataset",
"Booster",
"CVBooster",
"Sequence",
"register_logger",
"train",
"cv",
"LGBMModel",
"LGBMRegressor",
"LGBMClassifier",
"LGBMRanker",
"DaskLGBMRegressor",
"DaskLGBMClassifier",
"DaskLGBMRanker",
"log_evaluation",
"record_evaluation",
"reset_parameter",
"early_stopping",
"EarlyStopException",
"plot_importance",
"plot_split_value_histogram",
"plot_metric",
"plot_tree",
"create_tree_digraph",
]

File diff not shown because of its large size.

View file

@ -18,21 +18,21 @@ if TYPE_CHECKING:
from .engine import CVBooster
__all__ = [
'EarlyStopException',
'early_stopping',
'log_evaluation',
'record_evaluation',
'reset_parameter',
"EarlyStopException",
"early_stopping",
"log_evaluation",
"record_evaluation",
"reset_parameter",
]
_EvalResultDict = Dict[str, Dict[str, List[Any]]]
_EvalResultTuple = Union[
_LGBM_BoosterEvalMethodResultType,
_LGBM_BoosterEvalMethodResultWithStandardDeviationType
_LGBM_BoosterEvalMethodResultWithStandardDeviationType,
]
_ListOfEvalResultTuples = Union[
List[_LGBM_BoosterEvalMethodResultType],
List[_LGBM_BoosterEvalMethodResultWithStandardDeviationType]
List[_LGBM_BoosterEvalMethodResultWithStandardDeviationType],
]
@ -95,8 +95,8 @@ class _LogEvaluationCallback:
def __call__(self, env: CallbackEnv) -> None:
if self.period > 0 and env.evaluation_result_list and (env.iteration + 1) % self.period == 0:
result = '\t'.join([_format_eval_result(x, self.show_stdv) for x in env.evaluation_result_list])
_log_info(f'[{env.iteration + 1}]\t{result}')
result = "\t".join([_format_eval_result(x, self.show_stdv) for x in env.evaluation_result_list])
_log_info(f"[{env.iteration + 1}]\t{result}")
def log_evaluation(period: int = 1, show_stdv: bool = True) -> _LogEvaluationCallback:
@ -133,7 +133,7 @@ class _RecordEvaluationCallback:
self.before_iteration = False
if not isinstance(eval_result, dict):
raise TypeError('eval_result should be a dictionary')
raise TypeError("eval_result should be a dictionary")
self.eval_result = eval_result
def _init(self, env: CallbackEnv) -> None:
@ -152,8 +152,8 @@ class _RecordEvaluationCallback:
if len(item) == 4:
self.eval_result[data_name].setdefault(eval_name, [])
else:
self.eval_result[data_name].setdefault(f'{eval_name}-mean', [])
self.eval_result[data_name].setdefault(f'{eval_name}-stdv', [])
self.eval_result[data_name].setdefault(f"{eval_name}-mean", [])
self.eval_result[data_name].setdefault(f"{eval_name}-stdv", [])
def __call__(self, env: CallbackEnv) -> None:
if env.iteration == env.begin_iteration:
@ -171,8 +171,8 @@ class _RecordEvaluationCallback:
data_name, eval_name = item[1].split()
res_mean = item[2]
res_stdv = item[4] # type: ignore[misc]
self.eval_result[data_name][f'{eval_name}-mean'].append(res_mean)
self.eval_result[data_name][f'{eval_name}-stdv'].append(res_stdv)
self.eval_result[data_name][f"{eval_name}-mean"].append(res_mean)
self.eval_result[data_name][f"{eval_name}-stdv"].append(res_stdv)
def record_evaluation(eval_result: Dict[str, Dict[str, List[Any]]]) -> Callable:
@ -230,8 +230,10 @@ class _ResetParameterCallback:
elif callable(value):
new_param = value(env.iteration - env.begin_iteration)
else:
raise ValueError("Only list and callable values are supported "
"as a mapping from boosting round index to new parameter value.")
raise ValueError(
"Only list and callable values are supported "
"as a mapping from boosting round index to new parameter value."
)
if new_param != env.params.get(key, None):
new_parameters[key] = new_param
if new_parameters:
@ -276,9 +278,8 @@ class _EarlyStoppingCallback:
stopping_rounds: int,
first_metric_only: bool = False,
verbose: bool = True,
min_delta: Union[float, List[float]] = 0.0
min_delta: Union[float, List[float]] = 0.0,
) -> None:
if not isinstance(stopping_rounds, int) or stopping_rounds <= 0:
raise ValueError(f"stopping_rounds should be an integer and greater than 0. got: {stopping_rounds}")
@ -298,7 +299,7 @@ class _EarlyStoppingCallback:
self.best_iter: List[int] = []
self.best_score_list: List[_ListOfEvalResultTuples] = []
self.cmp_op: List[Callable[[float, float], bool]] = []
self.first_metric = ''
self.first_metric = ""
def _gt_delta(self, curr_score: float, best_score: float, delta: float) -> bool:
return curr_score > best_score + delta
@ -321,29 +322,24 @@ class _EarlyStoppingCallback:
def _init(self, env: CallbackEnv) -> None:
if env.evaluation_result_list is None or env.evaluation_result_list == []:
raise ValueError(
"For early stopping, at least one dataset and eval metric is required for evaluation"
)
raise ValueError("For early stopping, at least one dataset and eval metric is required for evaluation")
is_dart = any(env.params.get(alias, "") == 'dart' for alias in _ConfigAliases.get("boosting"))
is_dart = any(env.params.get(alias, "") == "dart" for alias in _ConfigAliases.get("boosting"))
if is_dart:
self.enabled = False
_log_warning('Early stopping is not available in dart mode')
_log_warning("Early stopping is not available in dart mode")
return
# validation sets are guaranteed to not be identical to the training data in cv()
if isinstance(env.model, Booster):
only_train_set = (
len(env.evaluation_result_list) == 1
and self._is_train_set(
ds_name=env.evaluation_result_list[0][0],
eval_name=env.evaluation_result_list[0][1].split(" ")[0],
env=env
)
only_train_set = len(env.evaluation_result_list) == 1 and self._is_train_set(
ds_name=env.evaluation_result_list[0][0],
eval_name=env.evaluation_result_list[0][1].split(" ")[0],
env=env,
)
if only_train_set:
self.enabled = False
_log_warning('Only training set found, disabling early stopping.')
_log_warning("Only training set found, disabling early stopping.")
return
if self.verbose:
@ -355,26 +351,26 @@ class _EarlyStoppingCallback:
n_datasets = len(env.evaluation_result_list) // n_metrics
if isinstance(self.min_delta, list):
if not all(t >= 0 for t in self.min_delta):
raise ValueError('Values for early stopping min_delta must be non-negative.')
raise ValueError("Values for early stopping min_delta must be non-negative.")
if len(self.min_delta) == 0:
if self.verbose:
_log_info('Disabling min_delta for early stopping.')
_log_info("Disabling min_delta for early stopping.")
deltas = [0.0] * n_datasets * n_metrics
elif len(self.min_delta) == 1:
if self.verbose:
_log_info(f'Using {self.min_delta[0]} as min_delta for all metrics.')
_log_info(f"Using {self.min_delta[0]} as min_delta for all metrics.")
deltas = self.min_delta * n_datasets * n_metrics
else:
if len(self.min_delta) != n_metrics:
raise ValueError('Must provide a single value for min_delta or as many as metrics.')
raise ValueError("Must provide a single value for min_delta or as many as metrics.")
if self.first_metric_only and self.verbose:
_log_info(f'Using only {self.min_delta[0]} as early stopping min_delta.')
_log_info(f"Using only {self.min_delta[0]} as early stopping min_delta.")
deltas = self.min_delta * n_datasets
else:
if self.min_delta < 0:
raise ValueError('Early stopping min_delta must be non-negative.')
raise ValueError("Early stopping min_delta must be non-negative.")
if self.min_delta > 0 and n_metrics > 1 and not self.first_metric_only and self.verbose:
_log_info(f'Using {self.min_delta} as min_delta for all metrics.')
_log_info(f"Using {self.min_delta} as min_delta for all metrics.")
deltas = [self.min_delta] * n_datasets * n_metrics
# split is needed for "<dataset type> <metric>" case (e.g. "train l1")
@ -382,18 +378,19 @@ class _EarlyStoppingCallback:
for eval_ret, delta in zip(env.evaluation_result_list, deltas):
self.best_iter.append(0)
if eval_ret[3]: # greater is better
self.best_score.append(float('-inf'))
self.best_score.append(float("-inf"))
self.cmp_op.append(partial(self._gt_delta, delta=delta))
else:
self.best_score.append(float('inf'))
self.best_score.append(float("inf"))
self.cmp_op.append(partial(self._lt_delta, delta=delta))
def _final_iteration_check(self, env: CallbackEnv, eval_name_splitted: List[str], i: int) -> None:
if env.iteration == env.end_iteration - 1:
if self.verbose:
best_score_str = '\t'.join([_format_eval_result(x, show_stdv=True) for x in self.best_score_list[i]])
_log_info('Did not meet early stopping. '
f'Best iteration is:\n[{self.best_iter[i] + 1}]\t{best_score_str}')
best_score_str = "\t".join([_format_eval_result(x, show_stdv=True) for x in self.best_score_list[i]])
_log_info(
"Did not meet early stopping. " f"Best iteration is:\n[{self.best_iter[i] + 1}]\t{best_score_str}"
)
if self.first_metric_only:
_log_info(f"Evaluated only: {eval_name_splitted[-1]}")
raise EarlyStopException(self.best_iter[i], self.best_score_list[i])
@ -409,7 +406,7 @@ class _EarlyStoppingCallback:
"Please report it at https://github.com/microsoft/LightGBM/issues"
)
# self.best_score_list is initialized to an empty list
first_time_updating_best_score_list = (self.best_score_list == [])
first_time_updating_best_score_list = self.best_score_list == []
for i in range(len(env.evaluation_result_list)):
score = env.evaluation_result_list[i][2]
if first_time_updating_best_score_list or self.cmp_op[i](score, self.best_score[i]):
@ -426,12 +423,14 @@ class _EarlyStoppingCallback:
if self._is_train_set(
ds_name=env.evaluation_result_list[i][0],
eval_name=eval_name_splitted[0],
env=env
env=env,
):
continue # train data for lgb.cv or sklearn wrapper (underlying lgb.train)
elif env.iteration - self.best_iter[i] >= self.stopping_rounds:
if self.verbose:
eval_result_str = '\t'.join([_format_eval_result(x, show_stdv=True) for x in self.best_score_list[i]])
eval_result_str = "\t".join(
[_format_eval_result(x, show_stdv=True) for x in self.best_score_list[i]]
)
_log_info(f"Early stopping, best iteration is:\n[{self.best_iter[i] + 1}]\t{eval_result_str}")
if self.first_metric_only:
_log_info(f"Evaluated only: {eval_name_splitted[-1]}")
@ -439,7 +438,12 @@ class _EarlyStoppingCallback:
self._final_iteration_check(env, eval_name_splitted, i)
def early_stopping(stopping_rounds: int, first_metric_only: bool = False, verbose: bool = True, min_delta: Union[float, List[float]] = 0.0) -> _EarlyStoppingCallback:
def early_stopping(
stopping_rounds: int,
first_metric_only: bool = False,
verbose: bool = True,
min_delta: Union[float, List[float]] = 0.0,
) -> _EarlyStoppingCallback:
"""Create a callback that activates early stopping.
Activates early stopping.
@ -473,4 +477,9 @@ def early_stopping(stopping_rounds: int, first_metric_only: bool = False, verbos
callback : _EarlyStoppingCallback
The callback that activates early stopping.
"""
return _EarlyStoppingCallback(stopping_rounds=stopping_rounds, first_metric_only=first_metric_only, verbose=verbose, min_delta=min_delta)
return _EarlyStoppingCallback(
stopping_rounds=stopping_rounds,
first_metric_only=first_metric_only,
verbose=verbose,
min_delta=min_delta,
)
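As a usage note, the callbacks touched in this file (log_evaluation, record_evaluation, early_stopping) are passed to lgb.train() through its callbacks argument. A minimal sketch with synthetic data; the parameter values are arbitrary examples, not recommendations:

import numpy as np

import lightgbm as lgb

rng = np.random.default_rng(0)
X = rng.normal(size=(1_000, 10))
y = X[:, 0] + rng.normal(scale=0.1, size=1_000)

train_set = lgb.Dataset(X[:800], label=y[:800])
valid_set = lgb.Dataset(X[800:], label=y[800:], reference=train_set)

eval_results = {}
booster = lgb.train(
    params={"objective": "regression", "verbosity": -1},
    train_set=train_set,
    num_boost_round=200,
    valid_sets=[valid_set],
    callbacks=[
        lgb.early_stopping(stopping_rounds=10, min_delta=0.0),  # stop after 10 rounds without improvement
        lgb.log_evaluation(period=20),  # print evaluation results every 20 iterations
        lgb.record_evaluation(eval_results),  # fill eval_results in place
    ],
)
print(booster.best_iteration, list(eval_results))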

View file

@ -8,6 +8,7 @@ try:
from pandas import DataFrame as pd_DataFrame
from pandas import Series as pd_Series
from pandas import concat
try:
from pandas import CategoricalDtype as pd_CategoricalDtype
except ImportError:
@ -40,15 +41,18 @@ except ImportError:
try:
from numpy.random import Generator as np_random_Generator
except ImportError:
class np_random_Generator: # type: ignore
"""Dummy class for np.random.Generator."""
def __init__(self, *args, **kwargs):
pass
"""matplotlib"""
try:
import matplotlib # noqa: F401
MATPLOTLIB_INSTALLED = True
except ImportError:
MATPLOTLIB_INSTALLED = False
@ -56,6 +60,7 @@ except ImportError:
"""graphviz"""
try:
import graphviz # noqa: F401
GRAPHVIZ_INSTALLED = True
except ImportError:
GRAPHVIZ_INSTALLED = False
@ -63,6 +68,7 @@ except ImportError:
"""datatable"""
try:
import datatable
if hasattr(datatable, "Frame"):
dt_DataTable = datatable.Frame
else:
@ -85,6 +91,7 @@ try:
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import assert_all_finite, check_array, check_X_y
try:
from sklearn.exceptions import NotFittedError
from sklearn.model_selection import BaseCrossValidator, GroupKFold, StratifiedKFold
@ -155,6 +162,7 @@ try:
from dask.dataframe import DataFrame as dask_DataFrame
from dask.dataframe import Series as dask_Series
from dask.distributed import Client, Future, default_client, wait
DASK_INSTALLED = True
except ImportError:
DASK_INSTALLED = False
@ -195,6 +203,7 @@ except ImportError:
def __init__(self, *args, **kwargs):
pass
"""pyarrow"""
try:
import pyarrow.compute as pa_compute
@ -205,6 +214,7 @@ try:
from pyarrow.cffi import ffi as arrow_cffi
from pyarrow.types import is_floating as arrow_is_floating
from pyarrow.types import is_integer as arrow_is_integer
PYARROW_INSTALLED = True
except ImportError:
PYARROW_INSTALLED = False
@ -266,4 +276,5 @@ except ImportError:
def _LGBMCpuCount(only_physical_cores: bool = True) -> int:
return cpu_count()
__all__: List[str] = []
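The hunks above follow compat.py's guarded-import idiom: try to import an optional dependency, set an *_INSTALLED flag, and fall back to a dummy stand-in class so annotations and isinstance() checks keep working without the package. A generic sketch of the same pattern; some_optional_pkg and pkg_Thing are placeholders, not real LightGBM dependencies:

try:
    from some_optional_pkg import Thing as pkg_Thing  # noqa: F401

    PKG_INSTALLED = True
except ImportError:
    PKG_INSTALLED = False

    class pkg_Thing:  # type: ignore
        """Dummy class standing in for some_optional_pkg.Thing."""

        def __init__(self, *args, **kwargs):
            pass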

Просмотреть файл

@ -51,9 +51,9 @@ from .sklearn import (
)
__all__ = [
'DaskLGBMClassifier',
'DaskLGBMRanker',
'DaskLGBMRegressor',
"DaskLGBMClassifier",
"DaskLGBMRanker",
"DaskLGBMRegressor",
]
_DaskCollection = Union[dask_Array, dask_DataFrame, dask_Series]
@ -67,7 +67,7 @@ class _RemoteSocket:
def acquire(self) -> int:
self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
self.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
self.socket.bind(('', 0))
self.socket.bind(("", 0))
return self.socket.getsockname()[1]
def release(self) -> None:
@ -153,9 +153,11 @@ def _concat(seq: List[_DaskPart]) -> _DaskPart:
elif isinstance(seq[0], (pd_DataFrame, pd_Series)):
return concat(seq, axis=0)
elif isinstance(seq[0], ss.spmatrix):
return ss.vstack(seq, format='csr')
return ss.vstack(seq, format="csr")
else:
raise TypeError(f'Data must be one of: numpy arrays, pandas dataframes, sparse matrices (from scipy). Got {type(seq[0]).__name__}.')
raise TypeError(
f"Data must be one of: numpy arrays, pandas dataframes, sparse matrices (from scipy). Got {type(seq[0]).__name__}."
)
def _remove_list_padding(*args: Any) -> List[List[Any]]:
@ -186,41 +188,41 @@ def _train_part(
return_model: bool,
time_out: int,
remote_socket: _RemoteSocket,
**kwargs: Any
**kwargs: Any,
) -> Optional[LGBMModel]:
network_params = {
'machines': machines,
'local_listen_port': local_listen_port,
'time_out': time_out,
'num_machines': num_machines
"machines": machines,
"local_listen_port": local_listen_port,
"time_out": time_out,
"num_machines": num_machines,
}
params.update(network_params)
is_ranker = issubclass(model_factory, LGBMRanker)
# Concatenate many parts into one
data = _concat([x['data'] for x in list_of_parts])
label = _concat([x['label'] for x in list_of_parts])
data = _concat([x["data"] for x in list_of_parts])
label = _concat([x["label"] for x in list_of_parts])
if 'weight' in list_of_parts[0]:
weight = _concat([x['weight'] for x in list_of_parts])
if "weight" in list_of_parts[0]:
weight = _concat([x["weight"] for x in list_of_parts])
else:
weight = None
if 'group' in list_of_parts[0]:
group = _concat([x['group'] for x in list_of_parts])
if "group" in list_of_parts[0]:
group = _concat([x["group"] for x in list_of_parts])
else:
group = None
if 'init_score' in list_of_parts[0]:
init_score = _concat([x['init_score'] for x in list_of_parts])
if "init_score" in list_of_parts[0]:
init_score = _concat([x["init_score"] for x in list_of_parts])
else:
init_score = None
# construct local eval_set data.
n_evals = max(len(x.get('eval_set', [])) for x in list_of_parts)
eval_names = kwargs.pop('eval_names', None)
eval_class_weight = kwargs.get('eval_class_weight')
n_evals = max(len(x.get("eval_set", [])) for x in list_of_parts)
eval_names = kwargs.pop("eval_names", None)
eval_class_weight = kwargs.get("eval_class_weight")
local_eval_set = None
local_eval_names = None
local_eval_sample_weight = None
@ -228,8 +230,8 @@ def _train_part(
local_eval_group = None
if n_evals:
has_eval_sample_weight = any(x.get('eval_sample_weight') is not None for x in list_of_parts)
has_eval_init_score = any(x.get('eval_init_score') is not None for x in list_of_parts)
has_eval_sample_weight = any(x.get("eval_sample_weight") is not None for x in list_of_parts)
has_eval_init_score = any(x.get("eval_init_score") is not None for x in list_of_parts)
local_eval_set = []
evals_result_names = []
@ -251,7 +253,7 @@ def _train_part(
init_score_e = []
g_e = []
for part in list_of_parts:
if not part.get('eval_set'):
if not part.get("eval_set"):
continue
# require that eval_name exists in evaluated result data in case dropped due to padding.
@ -259,12 +261,12 @@ def _train_part(
if eval_names:
evals_result_name = eval_names[i]
else:
evals_result_name = f'valid_{i}'
evals_result_name = f"valid_{i}"
eval_set = part['eval_set'][i]
eval_set = part["eval_set"][i]
if eval_set is _DatasetNames.TRAINSET:
x_e.append(part['data'])
y_e.append(part['label'])
x_e.append(part["data"])
y_e.append(part["label"])
else:
x_e.extend(eval_set[0])
y_e.extend(eval_set[1])
@ -272,24 +274,24 @@ def _train_part(
if evals_result_name not in evals_result_names:
evals_result_names.append(evals_result_name)
eval_weight = part.get('eval_sample_weight')
eval_weight = part.get("eval_sample_weight")
if eval_weight:
if eval_weight[i] is _DatasetNames.SAMPLE_WEIGHT:
w_e.append(part['weight'])
w_e.append(part["weight"])
else:
w_e.extend(eval_weight[i])
eval_init_score = part.get('eval_init_score')
eval_init_score = part.get("eval_init_score")
if eval_init_score:
if eval_init_score[i] is _DatasetNames.INIT_SCORE:
init_score_e.append(part['init_score'])
init_score_e.append(part["init_score"])
else:
init_score_e.extend(eval_init_score[i])
eval_group = part.get('eval_group')
eval_group = part.get("eval_group")
if eval_group:
if eval_group[i] is _DatasetNames.GROUP:
g_e.append(part['group'])
g_e.append(part["group"])
else:
g_e.extend(eval_group[i])
@ -313,7 +315,7 @@ def _train_part(
if eval_names:
local_eval_names = [eval_names[i] for i in eval_component_idx]
if eval_class_weight:
kwargs['eval_class_weight'] = [eval_class_weight[i] for i in eval_component_idx]
kwargs["eval_class_weight"] = [eval_class_weight[i] for i in eval_component_idx]
model = model_factory(**params)
if remote_socket is not None:
@ -331,7 +333,7 @@ def _train_part(
eval_init_score=local_eval_init_score,
eval_group=local_eval_group,
eval_names=local_eval_names,
**kwargs
**kwargs,
)
else:
model.fit(
@ -343,7 +345,7 @@ def _train_part(
eval_sample_weight=local_eval_sample_weight,
eval_init_score=local_eval_init_score,
eval_names=local_eval_names,
**kwargs
**kwargs,
)
finally:
@ -389,7 +391,9 @@ def _machines_to_worker_map(machines: str, worker_addresses: Iterable[str]) -> D
machine_addresses = machines.split(",")
if len(set(machine_addresses)) != len(machine_addresses):
raise ValueError(f"Found duplicates in 'machines' ({machines}). Each entry in 'machines' must be a unique IP-port combination.")
raise ValueError(
f"Found duplicates in 'machines' ({machines}). Each entry in 'machines' must be a unique IP-port combination."
)
machine_to_port = defaultdict(set)
for address in machine_addresses:
@ -423,7 +427,7 @@ def _train(
eval_group: Optional[List[_DaskVectorLike]] = None,
eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
eval_at: Optional[Union[List[int], Tuple[int, ...]]] = None,
**kwargs: Any
**kwargs: Any,
) -> LGBMModel:
"""Inner train routine.
@ -512,36 +516,34 @@ def _train(
params = deepcopy(params)
# capture whether local_listen_port or its aliases were provided
listen_port_in_params = any(
alias in params for alias in _ConfigAliases.get("local_listen_port")
)
listen_port_in_params = any(alias in params for alias in _ConfigAliases.get("local_listen_port"))
# capture whether machines or its aliases were provided
machines_in_params = any(
alias in params for alias in _ConfigAliases.get("machines")
)
machines_in_params = any(alias in params for alias in _ConfigAliases.get("machines"))
params = _choose_param_value(
main_param_name="tree_learner",
params=params,
default_value="data"
default_value="data",
)
allowed_tree_learners = {
'data',
'data_parallel',
'feature',
'feature_parallel',
'voting',
'voting_parallel'
"data",
"data_parallel",
"feature",
"feature_parallel",
"voting",
"voting_parallel",
}
if params["tree_learner"] not in allowed_tree_learners:
_log_warning(f'Parameter tree_learner set to {params["tree_learner"]}, which is not allowed. Using "data" as default')
params['tree_learner'] = 'data'
_log_warning(
f'Parameter tree_learner set to {params["tree_learner"]}, which is not allowed. Using "data" as default'
)
params["tree_learner"] = "data"
# Some passed-in parameters can be removed:
# * 'num_machines': set automatically from Dask worker list
# * 'num_threads': overridden to match nthreads on each Dask process
for param_alias in _ConfigAliases.get('num_machines', 'num_threads'):
for param_alias in _ConfigAliases.get("num_machines", "num_threads"):
if param_alias in params:
_log_warning(f"Parameter {param_alias} will be ignored.")
params.pop(param_alias)
@ -549,23 +551,23 @@ def _train(
# Split arrays/dataframes into parts. Arrange parts into dicts to enforce co-locality
data_parts = _split_to_parts(data=data, is_matrix=True)
label_parts = _split_to_parts(data=label, is_matrix=False)
parts = [{'data': x, 'label': y} for (x, y) in zip(data_parts, label_parts)]
parts = [{"data": x, "label": y} for (x, y) in zip(data_parts, label_parts)]
n_parts = len(parts)
if sample_weight is not None:
weight_parts = _split_to_parts(data=sample_weight, is_matrix=False)
for i in range(n_parts):
parts[i]['weight'] = weight_parts[i]
parts[i]["weight"] = weight_parts[i]
if group is not None:
group_parts = _split_to_parts(data=group, is_matrix=False)
for i in range(n_parts):
parts[i]['group'] = group_parts[i]
parts[i]["group"] = group_parts[i]
if init_score is not None:
init_score_parts = _split_to_parts(data=init_score, is_matrix=False)
for i in range(n_parts):
parts[i]['init_score'] = init_score_parts[i]
parts[i]["init_score"] = init_score_parts[i]
# evals_set will to be re-constructed into smaller lists of (X, y) tuples, where
# X and y are each delayed sub-lists of original eval dask Collections.
@ -575,47 +577,16 @@ def _train(
n_largest_eval_parts = max(x[0].npartitions for x in eval_set)
eval_sets: Dict[
int,
List[
Union[
_DatasetNames,
Tuple[
List[Optional[_DaskMatrixLike]],
List[Optional[_DaskVectorLike]]
]
]
]
int, List[Union[_DatasetNames, Tuple[List[Optional[_DaskMatrixLike]], List[Optional[_DaskVectorLike]]]]]
] = defaultdict(list)
if eval_sample_weight:
eval_sample_weights: Dict[
int,
List[
Union[
_DatasetNames,
List[Optional[_DaskVectorLike]]
]
]
] = defaultdict(list)
eval_sample_weights: Dict[int, List[Union[_DatasetNames, List[Optional[_DaskVectorLike]]]]] = defaultdict(
list
)
if eval_group:
eval_groups: Dict[
int,
List[
Union[
_DatasetNames,
List[Optional[_DaskVectorLike]]
]
]
] = defaultdict(list)
eval_groups: Dict[int, List[Union[_DatasetNames, List[Optional[_DaskVectorLike]]]]] = defaultdict(list)
if eval_init_score:
eval_init_scores: Dict[
int,
List[
Union[
_DatasetNames,
List[Optional[_DaskMatrixLike]]
]
]
] = defaultdict(list)
eval_init_scores: Dict[int, List[Union[_DatasetNames, List[Optional[_DaskMatrixLike]]]]] = defaultdict(list)
for i, (X_eval, y_eval) in enumerate(eval_set):
n_this_eval_parts = X_eval.npartitions
@ -704,13 +675,13 @@ def _train(
# assign sub-eval_set components to worker parts.
for parts_idx, e_set in eval_sets.items():
parts[parts_idx]['eval_set'] = e_set
parts[parts_idx]["eval_set"] = e_set
if eval_sample_weight:
parts[parts_idx]['eval_sample_weight'] = eval_sample_weights[parts_idx]
parts[parts_idx]["eval_sample_weight"] = eval_sample_weights[parts_idx]
if eval_init_score:
parts[parts_idx]['eval_init_score'] = eval_init_scores[parts_idx]
parts[parts_idx]["eval_init_score"] = eval_init_scores[parts_idx]
if eval_group:
parts[parts_idx]['eval_group'] = eval_groups[parts_idx]
parts[parts_idx]["eval_group"] = eval_groups[parts_idx]
# Start computation in the background
parts = list(map(delayed, parts))
@ -718,7 +689,7 @@ def _train(
wait(parts)
for part in parts:
if part.status == 'error': # type: ignore
if part.status == "error": # type: ignore
# trigger error locally
return part # type: ignore[return-value]
@ -735,7 +706,7 @@ def _train(
for worker in worker_map:
has_eval_set = False
for part in worker_map[worker]:
if 'eval_set' in part.result(): # type: ignore[attr-defined]
if "eval_set" in part.result(): # type: ignore[attr-defined]
has_eval_set = True
break
@ -747,13 +718,13 @@ def _train(
# assign general validation set settings to fit kwargs.
if eval_names:
kwargs['eval_names'] = eval_names
kwargs["eval_names"] = eval_names
if eval_class_weight:
kwargs['eval_class_weight'] = eval_class_weight
kwargs["eval_class_weight"] = eval_class_weight
if eval_metric:
kwargs['eval_metric'] = eval_metric
kwargs["eval_metric"] = eval_metric
if eval_at:
kwargs['eval_at'] = eval_at
kwargs["eval_at"] = eval_at
master_worker = next(iter(worker_map))
worker_ncores = client.ncores()
@ -763,14 +734,14 @@ def _train(
params = _choose_param_value(
main_param_name="local_listen_port",
params=params,
default_value=12400
default_value=12400,
)
local_listen_port = params.pop("local_listen_port")
params = _choose_param_value(
main_param_name="machines",
params=params,
default_value=None
default_value=None,
)
machines = params.pop("machines")
@ -781,7 +752,7 @@ def _train(
_log_info("Using passed-in 'machines' parameter")
worker_address_to_port = _machines_to_worker_map(
machines=machines,
worker_addresses=worker_addresses
worker_addresses=worker_addresses,
)
else:
if listen_port_in_params:
@ -795,19 +766,16 @@ def _train(
)
raise LightGBMError(msg)
worker_address_to_port = {
address: local_listen_port
for address in worker_addresses
}
worker_address_to_port = {address: local_listen_port for address in worker_addresses}
else:
_log_info("Finding random open ports for workers")
worker_to_socket_future, worker_address_to_port = _assign_open_ports_to_workers(client, list(worker_map.keys()))
worker_to_socket_future, worker_address_to_port = _assign_open_ports_to_workers(
client, list(worker_map.keys())
)
machines = ','.join([
f'{urlparse(worker_address).hostname}:{port}'
for worker_address, port
in worker_address_to_port.items()
])
machines = ",".join(
[f"{urlparse(worker_address).hostname}:{port}" for worker_address, port in worker_address_to_port.items()]
)
num_machines = len(worker_address_to_port)
@ -823,18 +791,18 @@ def _train(
client.submit(
_train_part,
model_factory=model_factory,
params={**params, 'num_threads': worker_ncores[worker]},
params={**params, "num_threads": worker_ncores[worker]},
list_of_parts=list_of_parts,
machines=machines,
local_listen_port=worker_address_to_port[worker],
num_machines=num_machines,
time_out=params.get('time_out', 120),
time_out=params.get("time_out", 120),
remote_socket=worker_to_socket_future.get(worker, None),
return_model=(worker == master_worker),
workers=[worker],
allow_other_workers=False,
pure=False,
**kwargs
**kwargs,
)
for worker, list_of_parts in worker_map.items()
]
@ -848,14 +816,14 @@ def _train(
# on the Dask cluster you're connected to and which workers have pieces of
# the training data
if not listen_port_in_params:
for param in _ConfigAliases.get('local_listen_port'):
for param in _ConfigAliases.get("local_listen_port"):
model._other_params.pop(param, None)
if not machines_in_params:
for param in _ConfigAliases.get('machines'):
for param in _ConfigAliases.get("machines"):
model._other_params.pop(param, None)
for param in _ConfigAliases.get('num_machines', 'timeout'):
for param in _ConfigAliases.get("num_machines", "timeout"):
model._other_params.pop(param, None)
return model
@ -868,9 +836,8 @@ def _predict_part(
pred_proba: bool,
pred_leaf: bool,
pred_contrib: bool,
**kwargs: Any
**kwargs: Any,
) -> _DaskPart:
result: _DaskPart
if part.shape[0] == 0:
result = np.array([])
@ -880,7 +847,7 @@ def _predict_part(
raw_score=raw_score,
pred_leaf=pred_leaf,
pred_contrib=pred_contrib,
**kwargs
**kwargs,
)
else:
result = model.predict(
@ -888,7 +855,7 @@ def _predict_part(
raw_score=raw_score,
pred_leaf=pred_leaf,
pred_contrib=pred_contrib,
**kwargs
**kwargs,
)
# dask.DataFrame.map_partitions() expects each call to return a pandas DataFrame or Series
@ -896,7 +863,7 @@ def _predict_part(
if len(result.shape) == 2:
result = pd_DataFrame(result, index=part.index)
else:
result = pd_Series(result, index=part.index, name='predictions')
result = pd_Series(result, index=part.index, name="predictions")
return result
@ -910,7 +877,7 @@ def _predict(
pred_leaf: bool = False,
pred_contrib: bool = False,
dtype: _PredictionDtype = np.float32,
**kwargs: Any
**kwargs: Any,
) -> Union[dask_Array, List[dask_Array]]:
"""Inner predict routine.
@ -943,7 +910,7 @@ def _predict(
If ``pred_contrib=True``, the feature contributions for each sample.
"""
if not all((DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED)):
raise LightGBMError('dask, pandas and scikit-learn are required for lightgbm.dask')
raise LightGBMError("dask, pandas and scikit-learn are required for lightgbm.dask")
if isinstance(data, dask_DataFrame):
return data.map_partitions(
_predict_part,
@ -952,19 +919,14 @@ def _predict(
pred_proba=pred_proba,
pred_leaf=pred_leaf,
pred_contrib=pred_contrib,
**kwargs
**kwargs,
).values
elif isinstance(data, dask_Array):
# for multi-class classification with sparse matrices, pred_contrib predictions
# are returned as a list of sparse matrices (one per class)
num_classes = model._n_classes
if (
num_classes > 2
and pred_contrib
and isinstance(data._meta, ss.spmatrix)
):
if num_classes > 2 and pred_contrib and isinstance(data._meta, ss.spmatrix):
predict_function = partial(
_predict_part,
model=model,
@ -972,7 +934,7 @@ def _predict(
pred_proba=pred_proba,
pred_leaf=False,
pred_contrib=True,
**kwargs
**kwargs,
)
delayed_chunks = data.to_delayed()
@ -999,16 +961,16 @@ def _predict(
part = dask_array_from_delayed(
value=_extract(partition, i),
shape=(nrows_per_chunk[j], num_cols),
meta=pred_meta
meta=pred_meta,
)
out[i].append(part)
# by default, dask.array.concatenate() concatenates sparse arrays into a COO matrix
# the code below is used instead to ensure that the sparse type is preserved during concatentation
if isinstance(pred_meta, ss.csr_matrix):
concat_fn = partial(ss.vstack, format='csr')
concat_fn = partial(ss.vstack, format="csr")
elif isinstance(pred_meta, ss.csc_matrix):
concat_fn = partial(ss.vstack, format='csc')
concat_fn = partial(ss.vstack, format="csc")
else:
concat_fn = ss.vstack
@ -1020,7 +982,7 @@ def _predict(
dask_array_from_delayed(
value=delayed(concat_fn)(out[i]),
shape=(data.shape[0], num_cols),
meta=pred_meta
meta=pred_meta,
)
)
@ -1042,7 +1004,7 @@ def _predict(
if len(pred_row.shape) > 1:
chunks += (pred_row.shape[1],)
else:
map_blocks_kwargs['drop_axis'] = 1
map_blocks_kwargs["drop_axis"] = 1
return data.map_blocks(
predict_fn,
chunks=chunks,
@ -1051,11 +1013,10 @@ def _predict(
**map_blocks_kwargs,
)
else:
raise TypeError(f'Data must be either Dask Array or Dask DataFrame. Got {type(data).__name__}.')
raise TypeError(f"Data must be either Dask Array or Dask DataFrame. Got {type(data).__name__}.")
class _DaskLGBMModel:
@property
def client_(self) -> Client:
""":obj:`dask.distributed.Client`: Dask client.
@ -1064,7 +1025,7 @@ class _DaskLGBMModel:
with ``model.set_params(client=client)``.
"""
if not getattr(self, "fitted_", False):
raise LGBMNotFittedError('Cannot access property client_ before calling fit().')
raise LGBMNotFittedError("Cannot access property client_ before calling fit().")
return _get_dask_client(client=self.client)
@ -1093,12 +1054,12 @@ class _DaskLGBMModel:
eval_group: Optional[List[_DaskVectorLike]] = None,
eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
eval_at: Optional[Union[List[int], Tuple[int, ...]]] = None,
**kwargs: Any
**kwargs: Any,
) -> "_DaskLGBMModel":
if not DASK_INSTALLED:
raise LightGBMError('dask is required for lightgbm.dask')
raise LightGBMError("dask is required for lightgbm.dask")
if not all((DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED)):
raise LightGBMError('dask, pandas and scikit-learn are required for lightgbm.dask')
raise LightGBMError("dask, pandas and scikit-learn are required for lightgbm.dask")
params = self.get_params(True) # type: ignore[attr-defined]
params.pop("client", None)
@ -1120,7 +1081,7 @@ class _DaskLGBMModel:
eval_group=eval_group,
eval_metric=eval_metric,
eval_at=eval_at,
**kwargs
**kwargs,
)
self.set_params(**model.get_params()) # type: ignore[attr-defined]
@ -1137,7 +1098,10 @@ class _DaskLGBMModel:
return model
@staticmethod
def _lgb_dask_copy_extra_params(source: Union["_DaskLGBMModel", LGBMModel], dest: Union["_DaskLGBMModel", LGBMModel]) -> None:
def _lgb_dask_copy_extra_params(
source: Union["_DaskLGBMModel", LGBMModel],
dest: Union["_DaskLGBMModel", LGBMModel],
) -> None:
params = source.get_params() # type: ignore[union-attr]
attributes = source.__dict__
extra_param_names = set(attributes.keys()).difference(params.keys())
@ -1150,7 +1114,7 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
def __init__(
self,
boosting_type: str = 'gbdt',
boosting_type: str = "gbdt",
num_leaves: int = 31,
max_depth: int = -1,
learning_rate: float = 0.1,
@ -1158,19 +1122,19 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
subsample_for_bin: int = 200000,
objective: Optional[Union[str, _LGBM_ScikitCustomObjectiveFunction]] = None,
class_weight: Optional[Union[dict, str]] = None,
min_split_gain: float = 0.,
min_split_gain: float = 0.0,
min_child_weight: float = 1e-3,
min_child_samples: int = 20,
subsample: float = 1.,
subsample: float = 1.0,
subsample_freq: int = 0,
colsample_bytree: float = 1.,
reg_alpha: float = 0.,
reg_lambda: float = 0.,
random_state: Optional[Union[int, np.random.RandomState, 'np.random.Generator']] = None,
colsample_bytree: float = 1.0,
reg_alpha: float = 0.0,
reg_lambda: float = 0.0,
random_state: Optional[Union[int, np.random.RandomState, "np.random.Generator"]] = None,
n_jobs: Optional[int] = None,
importance_type: str = 'split',
importance_type: str = "split",
client: Optional[Client] = None,
**kwargs: Any
**kwargs: Any,
):
"""Docstring is inherited from the lightgbm.LGBMClassifier.__init__."""
self.client = client
@ -1194,11 +1158,11 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
random_state=random_state,
n_jobs=n_jobs,
importance_type=importance_type,
**kwargs
**kwargs,
)
_base_doc = LGBMClassifier.__init__.__doc__
_before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs') # type: ignore
_before_kwargs, _kwargs, _after_kwargs = _base_doc.partition("**kwargs") # type: ignore
__init__.__doc__ = f"""
{_before_kwargs}client : dask.distributed.Client or None, optional (default=None)
{' ':4}Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.
@ -1220,7 +1184,7 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
eval_class_weight: Optional[List[Union[dict, str]]] = None,
eval_init_score: Optional[List[_DaskCollection]] = None,
eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
**kwargs: Any
**kwargs: Any,
) -> "DaskLGBMClassifier":
"""Docstring is inherited from the lightgbm.LGBMClassifier.fit."""
self._lgb_dask_fit(
@ -1235,7 +1199,7 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
eval_class_weight=eval_class_weight,
eval_init_score=eval_init_score,
eval_metric=eval_metric,
**kwargs
**kwargs,
)
return self
@ -1247,15 +1211,13 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
group_shape="Dask Array or Dask Series or None, optional (default=None)",
eval_sample_weight_shape="list of Dask Array or Dask Series, or None, optional (default=None)",
eval_init_score_shape="list of Dask Array, Dask Series or Dask DataFrame (for multi-class task), or None, optional (default=None)",
eval_group_shape="list of Dask Array or Dask Series, or None, optional (default=None)"
eval_group_shape="list of Dask Array or Dask Series, or None, optional (default=None)",
)
# DaskLGBMClassifier does not support group, eval_group.
_base_doc = (_base_doc[:_base_doc.find('group :')]
+ _base_doc[_base_doc.find('eval_set :'):])
_base_doc = _base_doc[: _base_doc.find("group :")] + _base_doc[_base_doc.find("eval_set :") :]
_base_doc = (_base_doc[:_base_doc.find('eval_group :')]
+ _base_doc[_base_doc.find('eval_metric :'):])
_base_doc = _base_doc[: _base_doc.find("eval_group :")] + _base_doc[_base_doc.find("eval_metric :") :]
# DaskLGBMClassifier support for callbacks and init_model is not tested
fit.__doc__ = f"""{_base_doc[:_base_doc.find('callbacks :')]}**kwargs
@ -1278,7 +1240,7 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
pred_leaf: bool = False,
pred_contrib: bool = False,
validate_features: bool = False,
**kwargs: Any
**kwargs: Any,
) -> dask_Array:
"""Docstring is inherited from the lightgbm.LGBMClassifier.predict."""
return _predict(
@ -1292,7 +1254,7 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
pred_leaf=pred_leaf,
pred_contrib=pred_contrib,
validate_features=validate_features,
**kwargs
**kwargs,
)
predict.__doc__ = _lgbmmodel_doc_predict.format(
@ -1301,7 +1263,7 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
output_name="predicted_result",
predicted_result_shape="Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]",
X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or (if multi-class and using sparse inputs) a list of ``n_classes`` Dask Arrays of shape = [n_samples, n_features + 1]"
X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or (if multi-class and using sparse inputs) a list of ``n_classes`` Dask Arrays of shape = [n_samples, n_features + 1]",
)
def predict_proba(
@ -1313,7 +1275,7 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
pred_leaf: bool = False,
pred_contrib: bool = False,
validate_features: bool = False,
**kwargs: Any
**kwargs: Any,
) -> dask_Array:
"""Docstring is inherited from the lightgbm.LGBMClassifier.predict_proba."""
return _predict(
@ -1327,7 +1289,7 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
pred_leaf=pred_leaf,
pred_contrib=pred_contrib,
validate_features=validate_features,
**kwargs
**kwargs,
)
predict_proba.__doc__ = _lgbmmodel_doc_predict.format(
@ -1336,7 +1298,7 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
output_name="predicted_probability",
predicted_result_shape="Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]",
X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or (if multi-class and using sparse inputs) a list of ``n_classes`` Dask Arrays of shape = [n_samples, n_features + 1]"
X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or (if multi-class and using sparse inputs) a list of ``n_classes`` Dask Arrays of shape = [n_samples, n_features + 1]",
)
def to_local(self) -> LGBMClassifier:
@ -1355,7 +1317,7 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
def __init__(
self,
boosting_type: str = 'gbdt',
boosting_type: str = "gbdt",
num_leaves: int = 31,
max_depth: int = -1,
learning_rate: float = 0.1,
@ -1363,19 +1325,19 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
subsample_for_bin: int = 200000,
objective: Optional[Union[str, _LGBM_ScikitCustomObjectiveFunction]] = None,
class_weight: Optional[Union[dict, str]] = None,
min_split_gain: float = 0.,
min_split_gain: float = 0.0,
min_child_weight: float = 1e-3,
min_child_samples: int = 20,
subsample: float = 1.,
subsample: float = 1.0,
subsample_freq: int = 0,
colsample_bytree: float = 1.,
reg_alpha: float = 0.,
reg_lambda: float = 0.,
random_state: Optional[Union[int, np.random.RandomState, 'np.random.Generator']] = None,
colsample_bytree: float = 1.0,
reg_alpha: float = 0.0,
reg_lambda: float = 0.0,
random_state: Optional[Union[int, np.random.RandomState, "np.random.Generator"]] = None,
n_jobs: Optional[int] = None,
importance_type: str = 'split',
importance_type: str = "split",
client: Optional[Client] = None,
**kwargs: Any
**kwargs: Any,
):
"""Docstring is inherited from the lightgbm.LGBMRegressor.__init__."""
self.client = client
@ -1399,11 +1361,11 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
random_state=random_state,
n_jobs=n_jobs,
importance_type=importance_type,
**kwargs
**kwargs,
)
_base_doc = LGBMRegressor.__init__.__doc__
_before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs') # type: ignore
_before_kwargs, _kwargs, _after_kwargs = _base_doc.partition("**kwargs") # type: ignore
__init__.__doc__ = f"""
{_before_kwargs}client : dask.distributed.Client or None, optional (default=None)
{' ':4}Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.
@ -1424,7 +1386,7 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
eval_sample_weight: Optional[List[_DaskVectorLike]] = None,
eval_init_score: Optional[List[_DaskVectorLike]] = None,
eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
**kwargs: Any
**kwargs: Any,
) -> "DaskLGBMRegressor":
"""Docstring is inherited from the lightgbm.LGBMRegressor.fit."""
self._lgb_dask_fit(
@ -1438,7 +1400,7 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
eval_sample_weight=eval_sample_weight,
eval_init_score=eval_init_score,
eval_metric=eval_metric,
**kwargs
**kwargs,
)
return self
@ -1450,18 +1412,15 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
group_shape="Dask Array or Dask Series or None, optional (default=None)",
eval_sample_weight_shape="list of Dask Array or Dask Series, or None, optional (default=None)",
eval_init_score_shape="list of Dask Array or Dask Series, or None, optional (default=None)",
eval_group_shape="list of Dask Array or Dask Series, or None, optional (default=None)"
eval_group_shape="list of Dask Array or Dask Series, or None, optional (default=None)",
)
# DaskLGBMRegressor does not support group, eval_class_weight, eval_group.
_base_doc = (_base_doc[:_base_doc.find('group :')]
+ _base_doc[_base_doc.find('eval_set :'):])
_base_doc = _base_doc[: _base_doc.find("group :")] + _base_doc[_base_doc.find("eval_set :") :]
_base_doc = (_base_doc[:_base_doc.find('eval_class_weight :')]
+ _base_doc[_base_doc.find('eval_init_score :'):])
_base_doc = _base_doc[: _base_doc.find("eval_class_weight :")] + _base_doc[_base_doc.find("eval_init_score :") :]
_base_doc = (_base_doc[:_base_doc.find('eval_group :')]
+ _base_doc[_base_doc.find('eval_metric :'):])
_base_doc = _base_doc[: _base_doc.find("eval_group :")] + _base_doc[_base_doc.find("eval_metric :") :]
# DaskLGBMRegressor support for callbacks and init_model is not tested
fit.__doc__ = f"""{_base_doc[:_base_doc.find('callbacks :')]}**kwargs
@ -1484,7 +1443,7 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
pred_leaf: bool = False,
pred_contrib: bool = False,
validate_features: bool = False,
**kwargs: Any
**kwargs: Any,
) -> dask_Array:
"""Docstring is inherited from the lightgbm.LGBMRegressor.predict."""
return _predict(
@ -1497,7 +1456,7 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
pred_leaf=pred_leaf,
pred_contrib=pred_contrib,
validate_features=validate_features,
**kwargs
**kwargs,
)
predict.__doc__ = _lgbmmodel_doc_predict.format(
@ -1506,7 +1465,7 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
output_name="predicted_result",
predicted_result_shape="Dask Array of shape = [n_samples]",
X_leaves_shape="Dask Array of shape = [n_samples, n_trees]",
X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1]"
X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1]",
)
def to_local(self) -> LGBMRegressor:
@ -1525,7 +1484,7 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):
def __init__(
self,
boosting_type: str = 'gbdt',
boosting_type: str = "gbdt",
num_leaves: int = 31,
max_depth: int = -1,
learning_rate: float = 0.1,
@ -1533,19 +1492,19 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):
subsample_for_bin: int = 200000,
objective: Optional[Union[str, _LGBM_ScikitCustomObjectiveFunction]] = None,
class_weight: Optional[Union[dict, str]] = None,
min_split_gain: float = 0.,
min_split_gain: float = 0.0,
min_child_weight: float = 1e-3,
min_child_samples: int = 20,
subsample: float = 1.,
subsample: float = 1.0,
subsample_freq: int = 0,
colsample_bytree: float = 1.,
reg_alpha: float = 0.,
reg_lambda: float = 0.,
random_state: Optional[Union[int, np.random.RandomState, 'np.random.Generator']] = None,
colsample_bytree: float = 1.0,
reg_alpha: float = 0.0,
reg_lambda: float = 0.0,
random_state: Optional[Union[int, np.random.RandomState, "np.random.Generator"]] = None,
n_jobs: Optional[int] = None,
importance_type: str = 'split',
importance_type: str = "split",
client: Optional[Client] = None,
**kwargs: Any
**kwargs: Any,
):
"""Docstring is inherited from the lightgbm.LGBMRanker.__init__."""
self.client = client
@ -1569,11 +1528,11 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):
random_state=random_state,
n_jobs=n_jobs,
importance_type=importance_type,
**kwargs
**kwargs,
)
_base_doc = LGBMRanker.__init__.__doc__
_before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs') # type: ignore
_before_kwargs, _kwargs, _after_kwargs = _base_doc.partition("**kwargs") # type: ignore
__init__.__doc__ = f"""
{_before_kwargs}client : dask.distributed.Client or None, optional (default=None)
{' ':4}Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.
@ -1597,7 +1556,7 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):
eval_group: Optional[List[_DaskVectorLike]] = None,
eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
eval_at: Union[List[int], Tuple[int, ...]] = (1, 2, 3, 4, 5),
**kwargs: Any
**kwargs: Any,
) -> "DaskLGBMRanker":
"""Docstring is inherited from the lightgbm.LGBMRanker.fit."""
self._lgb_dask_fit(
@ -1614,7 +1573,7 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):
eval_group=eval_group,
eval_metric=eval_metric,
eval_at=eval_at,
**kwargs
**kwargs,
)
return self
@ -1626,17 +1585,18 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):
group_shape="Dask Array or Dask Series or None, optional (default=None)",
eval_sample_weight_shape="list of Dask Array or Dask Series, or None, optional (default=None)",
eval_init_score_shape="list of Dask Array or Dask Series, or None, optional (default=None)",
eval_group_shape="list of Dask Array or Dask Series, or None, optional (default=None)"
eval_group_shape="list of Dask Array or Dask Series, or None, optional (default=None)",
)
# DaskLGBMRanker does not support eval_class_weight or early stopping
_base_doc = (_base_doc[:_base_doc.find('eval_class_weight :')]
+ _base_doc[_base_doc.find('eval_init_score :'):])
_base_doc = _base_doc[: _base_doc.find("eval_class_weight :")] + _base_doc[_base_doc.find("eval_init_score :") :]
_base_doc = (_base_doc[:_base_doc.find('feature_name :')]
+ "eval_at : list or tuple of int, optional (default=(1, 2, 3, 4, 5))\n"
+ f"{' ':8}The evaluation positions of the specified metric.\n"
+ f"{' ':4}{_base_doc[_base_doc.find('feature_name :'):]}")
_base_doc = (
_base_doc[: _base_doc.find("feature_name :")]
+ "eval_at : list or tuple of int, optional (default=(1, 2, 3, 4, 5))\n"
+ f"{' ':8}The evaluation positions of the specified metric.\n"
+ f"{' ':4}{_base_doc[_base_doc.find('feature_name :'):]}"
)
# DaskLGBMRanker support for callbacks and init_model is not tested
fit.__doc__ = f"""{_base_doc[:_base_doc.find('callbacks :')]}**kwargs
@ -1659,7 +1619,7 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):
pred_leaf: bool = False,
pred_contrib: bool = False,
validate_features: bool = False,
**kwargs: Any
**kwargs: Any,
) -> dask_Array:
"""Docstring is inherited from the lightgbm.LGBMRanker.predict."""
return _predict(
@ -1672,7 +1632,7 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):
pred_leaf=pred_leaf,
pred_contrib=pred_contrib,
validate_features=validate_features,
**kwargs
**kwargs,
)
predict.__doc__ = _lgbmmodel_doc_predict.format(
@ -1681,7 +1641,7 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):
output_name="predicted_result",
predicted_result_shape="Dask Array of shape = [n_samples]",
X_leaves_shape="Dask Array of shape = [n_samples, n_trees]",
X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1]"
X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1]",
)
def to_local(self) -> LGBMRanker:
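For orientation, the Dask estimators reformatted in this file are used like their scikit-learn counterparts, except that X and y are Dask collections and a dask.distributed client must be reachable. A minimal local-cluster sketch; the cluster size and hyperparameters are arbitrary examples:

import dask.array as da
from distributed import Client, LocalCluster

import lightgbm as lgb

if __name__ == "__main__":
    with Client(LocalCluster(n_workers=2)) as client:
        # synthetic data, chunked so that parts land on different workers
        X = da.random.random((10_000, 20), chunks=(2_500, 20))
        y = (X[:, 0] > 0.5).astype(int)

        model = lgb.DaskLGBMClassifier(client=client, n_estimators=50)
        model.fit(X, y)
        preds = model.predict(X)  # lazy Dask Array of predictions
        local_model = model.to_local()  # plain LGBMClassifier for single-machine use
        print(preds.compute()[:5], type(local_model).__name__)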

View file

@ -28,9 +28,9 @@ from .basic import (
from .compat import SKLEARN_INSTALLED, _LGBMBaseCrossValidator, _LGBMGroupKFold, _LGBMStratifiedKFold
__all__ = [
'cv',
'CVBooster',
'train',
"cv",
"CVBooster",
"train",
]
@ -41,13 +41,13 @@ _LGBM_CustomMetricFunction = Union[
],
Callable[
[np.ndarray, Dataset],
List[_LGBM_EvalFunctionResultType]
List[_LGBM_EvalFunctionResultType],
],
]
_LGBM_PreprocFunction = Callable[
[Dataset, Dataset, Dict[str, Any]],
Tuple[Dataset, Dataset, Dict[str, Any]]
Tuple[Dataset, Dataset, Dict[str, Any]],
]
@ -59,10 +59,10 @@ def train(
valid_names: Optional[List[str]] = None,
feval: Optional[Union[_LGBM_CustomMetricFunction, List[_LGBM_CustomMetricFunction]]] = None,
init_model: Optional[Union[str, Path, Booster]] = None,
feature_name: _LGBM_FeatureNameConfiguration = 'auto',
categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
feature_name: _LGBM_FeatureNameConfiguration = "auto",
categorical_feature: _LGBM_CategoricalFeatureConfiguration = "auto",
keep_training_booster: bool = False,
callbacks: Optional[List[Callable]] = None
callbacks: Optional[List[Callable]] = None,
) -> Booster:
"""Perform the training with given parameters.
@ -169,14 +169,14 @@ def train(
# create predictor first
params = copy.deepcopy(params)
params = _choose_param_value(
main_param_name='objective',
main_param_name="objective",
params=params,
default_value=None
default_value=None,
)
fobj: Optional[_LGBM_CustomObjectiveFunction] = None
if callable(params["objective"]):
fobj = params["objective"]
params["objective"] = 'none'
params["objective"] = "none"
for alias in _ConfigAliases.get("num_iterations"):
if alias in params:
num_boost_round = params.pop(alias)
@ -186,33 +186,26 @@ def train(
params = _choose_param_value(
main_param_name="early_stopping_round",
params=params,
default_value=None
default_value=None,
)
if params["early_stopping_round"] is None:
params.pop("early_stopping_round")
first_metric_only = params.get('first_metric_only', False)
first_metric_only = params.get("first_metric_only", False)
predictor: Optional[_InnerPredictor] = None
if isinstance(init_model, (str, Path)):
predictor = _InnerPredictor.from_model_file(
model_file=init_model,
pred_parameter=params
)
predictor = _InnerPredictor.from_model_file(model_file=init_model, pred_parameter=params)
elif isinstance(init_model, Booster):
predictor = _InnerPredictor.from_booster(
booster=init_model,
pred_parameter=dict(init_model.params, **params)
)
predictor = _InnerPredictor.from_booster(booster=init_model, pred_parameter=dict(init_model.params, **params))
if predictor is not None:
init_iteration = predictor.current_iteration()
else:
init_iteration = 0
train_set._update_params(params) \
._set_predictor(predictor) \
.set_feature_name(feature_name) \
.set_categorical_feature(categorical_feature)
train_set._update_params(params)._set_predictor(predictor).set_feature_name(feature_name).set_categorical_feature(
categorical_feature
)
is_valid_contain_train = False
train_data_name = "training"
@ -234,13 +227,13 @@ def train(
if valid_names is not None and len(valid_names) > i:
name_valid_sets.append(valid_names[i])
else:
name_valid_sets.append(f'valid_{i}')
name_valid_sets.append(f"valid_{i}")
# process callbacks
if callbacks is None:
callbacks_set = set()
else:
for i, cb in enumerate(callbacks):
cb.__dict__.setdefault('order', i - len(callbacks))
cb.__dict__.setdefault("order", i - len(callbacks))
callbacks_set = set(callbacks)
if "early_stopping_round" in params:
@ -251,15 +244,16 @@ def train(
verbose=_choose_param_value(
main_param_name="verbosity",
params=params,
default_value=1
).pop("verbosity") > 0
default_value=1,
).pop("verbosity")
> 0,
)
)
callbacks_before_iter_set = {cb for cb in callbacks_set if getattr(cb, 'before_iteration', False)}
callbacks_before_iter_set = {cb for cb in callbacks_set if getattr(cb, "before_iteration", False)}
callbacks_after_iter_set = callbacks_set - callbacks_before_iter_set
callbacks_before_iter = sorted(callbacks_before_iter_set, key=attrgetter('order'))
callbacks_after_iter = sorted(callbacks_after_iter_set, key=attrgetter('order'))
callbacks_before_iter = sorted(callbacks_before_iter_set, key=attrgetter("order"))
callbacks_after_iter = sorted(callbacks_after_iter_set, key=attrgetter("order"))
# construct booster
try:
@ -277,12 +271,16 @@ def train(
# start training
for i in range(init_iteration, init_iteration + num_boost_round):
for cb in callbacks_before_iter:
cb(callback.CallbackEnv(model=booster,
params=params,
iteration=i,
begin_iteration=init_iteration,
end_iteration=init_iteration + num_boost_round,
evaluation_result_list=None))
cb(
callback.CallbackEnv(
model=booster,
params=params,
iteration=i,
begin_iteration=init_iteration,
end_iteration=init_iteration + num_boost_round,
evaluation_result_list=None,
)
)
booster.update(fobj=fobj)
@ -294,12 +292,16 @@ def train(
evaluation_result_list.extend(booster.eval_valid(feval))
try:
for cb in callbacks_after_iter:
cb(callback.CallbackEnv(model=booster,
params=params,
iteration=i,
begin_iteration=init_iteration,
end_iteration=init_iteration + num_boost_round,
evaluation_result_list=evaluation_result_list))
cb(
callback.CallbackEnv(
model=booster,
params=params,
iteration=i,
begin_iteration=init_iteration,
end_iteration=init_iteration + num_boost_round,
evaluation_result_list=evaluation_result_list,
)
)
except callback.EarlyStopException as earlyStopException:
booster.best_iteration = earlyStopException.best_iteration + 1
evaluation_result_list = earlyStopException.best_score
@ -334,7 +336,7 @@ class CVBooster:
def __init__(
self,
model_file: Optional[Union[str, Path]] = None
model_file: Optional[Union[str, Path]] = None,
):
"""Initialize the CVBooster.
@ -361,18 +363,23 @@ class CVBooster:
"""Serialize CVBooster to dict."""
models_str = []
for booster in self.boosters:
models_str.append(booster.model_to_string(num_iteration=num_iteration, start_iteration=start_iteration,
importance_type=importance_type))
models_str.append(
booster.model_to_string(
num_iteration=num_iteration, start_iteration=start_iteration, importance_type=importance_type
)
)
return {"boosters": models_str, "best_iteration": self.best_iteration}
def __getattr__(self, name: str) -> Callable[[Any, Any], List[Any]]:
"""Redirect methods call of CVBooster."""
def handler_function(*args: Any, **kwargs: Any) -> List[Any]:
"""Call methods with each booster, and concatenate their results."""
ret = []
for booster in self.boosters:
ret.append(getattr(booster, name)(*args, **kwargs))
return ret
return handler_function
def __getstate__(self) -> Dict[str, Any]:
@ -401,7 +408,7 @@ class CVBooster:
self,
num_iteration: Optional[int] = None,
start_iteration: int = 0,
importance_type: str = 'split'
importance_type: str = "split",
) -> str:
"""Save CVBooster to JSON string.
@ -430,7 +437,7 @@ class CVBooster:
filename: Union[str, Path],
num_iteration: Optional[int] = None,
start_iteration: int = 0,
importance_type: str = 'split'
importance_type: str = "split",
) -> "CVBooster":
"""Save CVBooster to a file as JSON text.
@ -469,16 +476,18 @@ def _make_n_folds(
fpreproc: Optional[_LGBM_PreprocFunction],
stratified: bool,
shuffle: bool,
eval_train_metric: bool
eval_train_metric: bool,
) -> CVBooster:
"""Make a n-fold list of Booster from random indices."""
full_data = full_data.construct()
num_data = full_data.num_data()
if folds is not None:
if not hasattr(folds, '__iter__') and not hasattr(folds, 'split'):
raise AttributeError("folds should be a generator or iterator of (train_idx, test_idx) tuples "
"or scikit-learn splitter object with split method")
if hasattr(folds, 'split'):
if not hasattr(folds, "__iter__") and not hasattr(folds, "split"):
raise AttributeError(
"folds should be a generator or iterator of (train_idx, test_idx) tuples "
"or scikit-learn splitter object with split method"
)
if hasattr(folds, "split"):
group_info = full_data.get_group()
if group_info is not None:
group_info = np.array(group_info, dtype=np.int32, copy=False)
@ -487,11 +496,13 @@ def _make_n_folds(
flatted_group = np.zeros(num_data, dtype=np.int32)
folds = folds.split(X=np.empty(num_data), y=full_data.get_label(), groups=flatted_group)
else:
if any(params.get(obj_alias, "") in {"lambdarank", "rank_xendcg", "xendcg",
"xe_ndcg", "xe_ndcg_mart", "xendcg_mart"}
for obj_alias in _ConfigAliases.get("objective")):
if any(
params.get(obj_alias, "")
in {"lambdarank", "rank_xendcg", "xendcg", "xe_ndcg", "xe_ndcg_mart", "xendcg_mart"}
for obj_alias in _ConfigAliases.get("objective")
):
if not SKLEARN_INSTALLED:
raise LightGBMError('scikit-learn is required for ranking cv')
raise LightGBMError("scikit-learn is required for ranking cv")
# ranking task, split according to groups
group_info = np.array(full_data.get_group(), dtype=np.int32, copy=False)
flatted_group = np.repeat(range(len(group_info)), repeats=group_info)
@ -499,7 +510,7 @@ def _make_n_folds(
folds = group_kfold.split(X=np.empty(num_data), groups=flatted_group)
elif stratified:
if not SKLEARN_INSTALLED:
raise LightGBMError('scikit-learn is required for stratified cv')
raise LightGBMError("scikit-learn is required for stratified cv")
skf = _LGBMStratifiedKFold(n_splits=nfold, shuffle=shuffle, random_state=seed)
folds = skf.split(X=np.empty(num_data), y=full_data.get_label())
else:
@ -508,7 +519,7 @@ def _make_n_folds(
else:
randidx = np.arange(num_data)
kstep = int(num_data / nfold)
test_id = [randidx[i: i + kstep] for i in range(0, num_data, kstep)]
test_id = [randidx[i : i + kstep] for i in range(0, num_data, kstep)]
train_id = [np.concatenate([test_id[i] for i in range(nfold) if k != i]) for k in range(nfold)]
folds = zip(train_id, test_id)
@ -523,14 +534,14 @@ def _make_n_folds(
tparam = params
booster_for_fold = Booster(tparam, train_set)
if eval_train_metric:
booster_for_fold.add_valid(train_set, 'train')
booster_for_fold.add_valid(valid_set, 'valid')
booster_for_fold.add_valid(train_set, "train")
booster_for_fold.add_valid(valid_set, "valid")
ret.boosters.append(booster_for_fold)
return ret
def _agg_cv_result(
raw_results: List[List[_LGBM_BoosterEvalMethodResultType]]
raw_results: List[List[_LGBM_BoosterEvalMethodResultType]],
) -> List[_LGBM_BoosterEvalMethodResultWithStandardDeviationType]:
"""Aggregate cross-validation results."""
cvmap: Dict[str, List[float]] = OrderedDict()
@ -541,7 +552,7 @@ def _agg_cv_result(
metric_type[key] = one_line[3]
cvmap.setdefault(key, [])
cvmap[key].append(one_line[2])
return [('cv_agg', k, float(np.mean(v)), metric_type[k], float(np.std(v))) for k, v in cvmap.items()]
return [("cv_agg", k, float(np.mean(v)), metric_type[k], float(np.std(v))) for k, v in cvmap.items()]
def cv(
@ -555,13 +566,13 @@ def cv(
metrics: Optional[Union[str, List[str]]] = None,
feval: Optional[Union[_LGBM_CustomMetricFunction, List[_LGBM_CustomMetricFunction]]] = None,
init_model: Optional[Union[str, Path, Booster]] = None,
feature_name: _LGBM_FeatureNameConfiguration = 'auto',
categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
feature_name: _LGBM_FeatureNameConfiguration = "auto",
categorical_feature: _LGBM_CategoricalFeatureConfiguration = "auto",
fpreproc: Optional[_LGBM_PreprocFunction] = None,
seed: int = 0,
callbacks: Optional[List[Callable]] = None,
eval_train_metric: bool = False,
return_cvbooster: bool = False
return_cvbooster: bool = False,
) -> Dict[str, Union[List[float], CVBooster]]:
"""Perform the cross-validation with given parameters.
@ -683,14 +694,14 @@ def cv(
params = copy.deepcopy(params)
params = _choose_param_value(
main_param_name='objective',
main_param_name="objective",
params=params,
default_value=None
default_value=None,
)
fobj: Optional[_LGBM_CustomObjectiveFunction] = None
if callable(params["objective"]):
fobj = params["objective"]
params["objective"] = 'none'
params["objective"] = "none"
for alias in _ConfigAliases.get("num_iterations"):
if alias in params:
_log_warning(f"Found '{alias}' in params. Will use it instead of 'num_boost_round' argument")
@ -700,21 +711,21 @@ def cv(
params = _choose_param_value(
main_param_name="early_stopping_round",
params=params,
default_value=None
default_value=None,
)
if params["early_stopping_round"] is None:
params.pop("early_stopping_round")
first_metric_only = params.get('first_metric_only', False)
first_metric_only = params.get("first_metric_only", False)
if isinstance(init_model, (str, Path)):
predictor = _InnerPredictor.from_model_file(
model_file=init_model,
pred_parameter=params
pred_parameter=params,
)
elif isinstance(init_model, Booster):
predictor = _InnerPredictor.from_booster(
booster=init_model,
pred_parameter=dict(init_model.params, **params)
pred_parameter=dict(init_model.params, **params),
)
else:
predictor = None
@ -722,25 +733,31 @@ def cv(
if metrics is not None:
for metric_alias in _ConfigAliases.get("metric"):
params.pop(metric_alias, None)
params['metric'] = metrics
params["metric"] = metrics
train_set._update_params(params) \
._set_predictor(predictor) \
.set_feature_name(feature_name) \
.set_categorical_feature(categorical_feature)
train_set._update_params(params)._set_predictor(predictor).set_feature_name(feature_name).set_categorical_feature(
categorical_feature
)
results = defaultdict(list)
cvfolds = _make_n_folds(full_data=train_set, folds=folds, nfold=nfold,
params=params, seed=seed, fpreproc=fpreproc,
stratified=stratified, shuffle=shuffle,
eval_train_metric=eval_train_metric)
cvfolds = _make_n_folds(
full_data=train_set,
folds=folds,
nfold=nfold,
params=params,
seed=seed,
fpreproc=fpreproc,
stratified=stratified,
shuffle=shuffle,
eval_train_metric=eval_train_metric,
)
# setup callbacks
if callbacks is None:
callbacks_set = set()
else:
for i, cb in enumerate(callbacks):
cb.__dict__.setdefault('order', i - len(callbacks))
cb.__dict__.setdefault("order", i - len(callbacks))
callbacks_set = set(callbacks)
if "early_stopping_round" in params:
@ -751,46 +768,55 @@ def cv(
verbose=_choose_param_value(
main_param_name="verbosity",
params=params,
default_value=1
).pop("verbosity") > 0
default_value=1,
).pop("verbosity")
> 0,
)
)
callbacks_before_iter_set = {cb for cb in callbacks_set if getattr(cb, 'before_iteration', False)}
callbacks_before_iter_set = {cb for cb in callbacks_set if getattr(cb, "before_iteration", False)}
callbacks_after_iter_set = callbacks_set - callbacks_before_iter_set
callbacks_before_iter = sorted(callbacks_before_iter_set, key=attrgetter('order'))
callbacks_after_iter = sorted(callbacks_after_iter_set, key=attrgetter('order'))
callbacks_before_iter = sorted(callbacks_before_iter_set, key=attrgetter("order"))
callbacks_after_iter = sorted(callbacks_after_iter_set, key=attrgetter("order"))
for i in range(num_boost_round):
for cb in callbacks_before_iter:
cb(callback.CallbackEnv(model=cvfolds,
params=params,
iteration=i,
begin_iteration=0,
end_iteration=num_boost_round,
evaluation_result_list=None))
cb(
callback.CallbackEnv(
model=cvfolds,
params=params,
iteration=i,
begin_iteration=0,
end_iteration=num_boost_round,
evaluation_result_list=None,
)
)
cvfolds.update(fobj=fobj) # type: ignore[call-arg]
res = _agg_cv_result(cvfolds.eval_valid(feval)) # type: ignore[call-arg]
for _, key, mean, _, std in res:
results[f'{key}-mean'].append(mean)
results[f'{key}-stdv'].append(std)
results[f"{key}-mean"].append(mean)
results[f"{key}-stdv"].append(std)
try:
for cb in callbacks_after_iter:
cb(callback.CallbackEnv(model=cvfolds,
params=params,
iteration=i,
begin_iteration=0,
end_iteration=num_boost_round,
evaluation_result_list=res))
cb(
callback.CallbackEnv(
model=cvfolds,
params=params,
iteration=i,
begin_iteration=0,
end_iteration=num_boost_round,
evaluation_result_list=res,
)
)
except callback.EarlyStopException as earlyStopException:
cvfolds.best_iteration = earlyStopException.best_iteration + 1
for bst in cvfolds.boosters:
bst.best_iteration = cvfolds.best_iteration
for k in results:
results[k] = results[k][:cvfolds.best_iteration]
results[k] = results[k][: cvfolds.best_iteration]
break
if return_cvbooster:
results['cvbooster'] = cvfolds # type: ignore[assignment]
results["cvbooster"] = cvfolds # type: ignore[assignment]
return dict(results)
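A minimal usage sketch of the cv() interface reformatted above, assuming a small synthetic regression dataset (the data and parameter values are illustrative, not part of the diff): the returned dict holds per-iteration "-mean" / "-stdv" lists and, with return_cvbooster=True, the CVBooster under the "cvbooster" key.

# Illustrative sketch only; synthetic data and parameter choices are assumptions.
import numpy as np
import lightgbm as lgb

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 5))
y = X[:, 0] + rng.normal(scale=0.1, size=500)

results = lgb.cv(
    params={"objective": "regression", "metric": "l2", "verbosity": -1},
    train_set=lgb.Dataset(X, label=y),
    num_boost_round=50,
    nfold=3,
    stratified=False,  # stratified folds only make sense for classification labels
    return_cvbooster=True,
)
cvbooster = results["cvbooster"]  # one Booster per fold
print({k: v[-1] for k, v in results.items() if k != "cvbooster"})  # final mean/stdv per metric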


@ -16,17 +16,19 @@ def find_lib_path() -> List[str]:
List of all found library paths to LightGBM.
"""
curr_path = Path(__file__).absolute()
dll_path = [curr_path.parents[1],
curr_path.parents[0] / 'bin',
curr_path.parents[0] / 'lib']
if system() in ('Windows', 'Microsoft'):
dll_path.append(curr_path.parents[1] / 'Release')
dll_path.append(curr_path.parents[1] / 'windows' / 'x64' / 'DLL')
dll_path = [p / 'lib_lightgbm.dll' for p in dll_path]
dll_path = [
curr_path.parents[1],
curr_path.parents[0] / "bin",
curr_path.parents[0] / "lib",
]
if system() in ("Windows", "Microsoft"):
dll_path.append(curr_path.parents[1] / "Release")
dll_path.append(curr_path.parents[1] / "windows" / "x64" / "DLL")
dll_path = [p / "lib_lightgbm.dll" for p in dll_path]
else:
dll_path = [p / 'lib_lightgbm.so' for p in dll_path]
dll_path = [p / "lib_lightgbm.so" for p in dll_path]
lib_path = [str(p) for p in dll_path if p.is_file()]
if not lib_path:
dll_path_joined = '\n'.join(map(str, dll_path))
raise Exception(f'Cannot find lightgbm library file in following paths:\n{dll_path_joined}')
dll_path_joined = "\n".join(map(str, dll_path))
raise Exception(f"Cannot find lightgbm library file in following paths:\n{dll_path_joined}")
return lib_path
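The search performed by find_lib_path() above can be mirrored in a few lines; the package directory below is an assumed example, and only the .dll/.so names shown in the diff are probed.

# Illustrative sketch of the path probing in find_lib_path() above.
# The package directory is an assumed example path, not a real installation.
from pathlib import Path
from platform import system

def probe_lightgbm_library(pkg_dir: Path) -> list:
    candidate_dirs = [pkg_dir.parent, pkg_dir / "bin", pkg_dir / "lib"]
    lib_name = "lib_lightgbm.dll" if system() in ("Windows", "Microsoft") else "lib_lightgbm.so"
    return [str(d / lib_name) for d in candidate_dirs if (d / lib_name).is_file()]

print(probe_lightgbm_library(Path("site-packages") / "lightgbm"))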


@ -12,11 +12,11 @@ from .compat import GRAPHVIZ_INSTALLED, MATPLOTLIB_INSTALLED, pd_DataFrame
from .sklearn import LGBMModel
__all__ = [
'create_tree_digraph',
'plot_importance',
'plot_metric',
'plot_split_value_histogram',
'plot_tree',
"create_tree_digraph",
"plot_importance",
"plot_metric",
"plot_split_value_histogram",
"plot_tree",
]
@ -27,9 +27,7 @@ def _check_not_tuple_of_2_elements(obj: Any, obj_name: str) -> None:
def _float2str(value: float, precision: Optional[int]) -> str:
return (f"{value:.{precision}f}"
if precision is not None and not isinstance(value, str)
else str(value))
return f"{value:.{precision}f}" if precision is not None and not isinstance(value, str) else str(value)
def plot_importance(
@ -38,17 +36,17 @@ def plot_importance(
height: float = 0.2,
xlim: Optional[Tuple[float, float]] = None,
ylim: Optional[Tuple[float, float]] = None,
title: Optional[str] = 'Feature importance',
xlabel: Optional[str] = 'Feature importance',
ylabel: Optional[str] = 'Features',
importance_type: str = 'auto',
title: Optional[str] = "Feature importance",
xlabel: Optional[str] = "Feature importance",
ylabel: Optional[str] = "Features",
importance_type: str = "auto",
max_num_features: Optional[int] = None,
ignore_zero: bool = True,
figsize: Optional[Tuple[float, float]] = None,
dpi: Optional[int] = None,
grid: bool = True,
precision: Optional[int] = 3,
**kwargs: Any
**kwargs: Any,
) -> Any:
"""Plot model's feature importances.
@ -104,7 +102,7 @@ def plot_importance(
if MATPLOTLIB_INSTALLED:
import matplotlib.pyplot as plt
else:
raise ImportError('You must install matplotlib and restart your session to plot importance.')
raise ImportError("You must install matplotlib and restart your session to plot importance.")
if isinstance(booster, LGBMModel):
if importance_type == "auto":
@ -114,7 +112,7 @@ def plot_importance(
if importance_type == "auto":
importance_type = "split"
else:
raise TypeError('booster must be Booster or LGBMModel.')
raise TypeError("booster must be Booster or LGBMModel.")
importance = booster.feature_importance(importance_type=importance_type)
feature_name = booster.feature_name()
@ -131,28 +129,26 @@ def plot_importance(
if ax is None:
if figsize is not None:
_check_not_tuple_of_2_elements(figsize, 'figsize')
_check_not_tuple_of_2_elements(figsize, "figsize")
_, ax = plt.subplots(1, 1, figsize=figsize, dpi=dpi)
ylocs = np.arange(len(values))
ax.barh(ylocs, values, align='center', height=height, **kwargs)
ax.barh(ylocs, values, align="center", height=height, **kwargs)
for x, y in zip(values, ylocs):
ax.text(x + 1, y,
_float2str(x, precision) if importance_type == 'gain' else x,
va='center')
ax.text(x + 1, y, _float2str(x, precision) if importance_type == "gain" else x, va="center")
ax.set_yticks(ylocs)
ax.set_yticklabels(labels)
if xlim is not None:
_check_not_tuple_of_2_elements(xlim, 'xlim')
_check_not_tuple_of_2_elements(xlim, "xlim")
else:
xlim = (0, max(values) * 1.1)
ax.set_xlim(xlim)
if ylim is not None:
_check_not_tuple_of_2_elements(ylim, 'ylim')
_check_not_tuple_of_2_elements(ylim, "ylim")
else:
ylim = (-1, len(values))
ax.set_ylim(ylim)
@ -160,7 +156,7 @@ def plot_importance(
if title is not None:
ax.set_title(title)
if xlabel is not None:
xlabel = xlabel.replace('@importance_type@', importance_type)
xlabel = xlabel.replace("@importance_type@", importance_type)
ax.set_xlabel(xlabel)
if ylabel is not None:
ax.set_ylabel(ylabel)
@ -176,13 +172,13 @@ def plot_split_value_histogram(
width_coef: float = 0.8,
xlim: Optional[Tuple[float, float]] = None,
ylim: Optional[Tuple[float, float]] = None,
title: Optional[str] = 'Split value histogram for feature with @index/name@ @feature@',
xlabel: Optional[str] = 'Feature split value',
ylabel: Optional[str] = 'Count',
title: Optional[str] = "Split value histogram for feature with @index/name@ @feature@",
xlabel: Optional[str] = "Feature split value",
ylabel: Optional[str] = "Count",
figsize: Optional[Tuple[float, float]] = None,
dpi: Optional[int] = None,
grid: bool = True,
**kwargs: Any
**kwargs: Any,
) -> Any:
"""Plot split value histogram for the specified feature of the model.
@ -238,29 +234,28 @@ def plot_split_value_histogram(
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
else:
raise ImportError('You must install matplotlib and restart your session to plot split value histogram.')
raise ImportError("You must install matplotlib and restart your session to plot split value histogram.")
if isinstance(booster, LGBMModel):
booster = booster.booster_
elif not isinstance(booster, Booster):
raise TypeError('booster must be Booster or LGBMModel.')
raise TypeError("booster must be Booster or LGBMModel.")
hist, split_bins = booster.get_split_value_histogram(feature=feature, bins=bins, xgboost_style=False)
if np.count_nonzero(hist) == 0:
raise ValueError('Cannot plot split value histogram, '
f'because feature {feature} was not used in splitting')
raise ValueError("Cannot plot split value histogram, " f"because feature {feature} was not used in splitting")
width = width_coef * (split_bins[1] - split_bins[0])
centred = (split_bins[:-1] + split_bins[1:]) / 2
if ax is None:
if figsize is not None:
_check_not_tuple_of_2_elements(figsize, 'figsize')
_check_not_tuple_of_2_elements(figsize, "figsize")
_, ax = plt.subplots(1, 1, figsize=figsize, dpi=dpi)
ax.bar(centred, hist, align='center', width=width, **kwargs)
ax.bar(centred, hist, align="center", width=width, **kwargs)
if xlim is not None:
_check_not_tuple_of_2_elements(xlim, 'xlim')
_check_not_tuple_of_2_elements(xlim, "xlim")
else:
range_result = split_bins[-1] - split_bins[0]
xlim = (split_bins[0] - range_result * 0.2, split_bins[-1] + range_result * 0.2)
@ -268,14 +263,14 @@ def plot_split_value_histogram(
ax.yaxis.set_major_locator(MaxNLocator(integer=True))
if ylim is not None:
_check_not_tuple_of_2_elements(ylim, 'ylim')
_check_not_tuple_of_2_elements(ylim, "ylim")
else:
ylim = (0, max(hist) * 1.1)
ax.set_ylim(ylim)
if title is not None:
title = title.replace('@feature@', str(feature))
title = title.replace('@index/name@', ('name' if isinstance(feature, str) else 'index'))
title = title.replace("@feature@", str(feature))
title = title.replace("@index/name@", ("name" if isinstance(feature, str) else "index"))
ax.set_title(title)
if xlabel is not None:
ax.set_xlabel(xlabel)
@ -292,12 +287,12 @@ def plot_metric(
ax=None,
xlim: Optional[Tuple[float, float]] = None,
ylim: Optional[Tuple[float, float]] = None,
title: Optional[str] = 'Metric during training',
xlabel: Optional[str] = 'Iterations',
ylabel: Optional[str] = '@metric@',
title: Optional[str] = "Metric during training",
xlabel: Optional[str] = "Iterations",
ylabel: Optional[str] = "@metric@",
figsize: Optional[Tuple[float, float]] = None,
dpi: Optional[int] = None,
grid: bool = True
grid: bool = True,
) -> Any:
"""Plot one metric during training.
@ -345,31 +340,33 @@ def plot_metric(
if MATPLOTLIB_INSTALLED:
import matplotlib.pyplot as plt
else:
raise ImportError('You must install matplotlib and restart your session to plot metric.')
raise ImportError("You must install matplotlib and restart your session to plot metric.")
if isinstance(booster, LGBMModel):
eval_results = deepcopy(booster.evals_result_)
elif isinstance(booster, dict):
eval_results = deepcopy(booster)
elif isinstance(booster, Booster):
raise TypeError("booster must be dict or LGBMModel. To use plot_metric with Booster type, first record the metrics using record_evaluation callback then pass that to plot_metric as argument `booster`")
raise TypeError(
"booster must be dict or LGBMModel. To use plot_metric with Booster type, first record the metrics using record_evaluation callback then pass that to plot_metric as argument `booster`"
)
else:
raise TypeError('booster must be dict or LGBMModel.')
raise TypeError("booster must be dict or LGBMModel.")
num_data = len(eval_results)
if not num_data:
raise ValueError('eval results cannot be empty.')
raise ValueError("eval results cannot be empty.")
if ax is None:
if figsize is not None:
_check_not_tuple_of_2_elements(figsize, 'figsize')
_check_not_tuple_of_2_elements(figsize, "figsize")
_, ax = plt.subplots(1, 1, figsize=figsize, dpi=dpi)
if dataset_names is None:
dataset_names_iter = iter(eval_results.keys())
elif not isinstance(dataset_names, (list, tuple, set)) or not dataset_names:
raise ValueError('dataset_names should be iterable and cannot be empty')
raise ValueError("dataset_names should be iterable and cannot be empty")
else:
dataset_names_iter = iter(dataset_names)
@ -382,7 +379,7 @@ def plot_metric(
metric, results = metrics_for_one.popitem()
else:
if metric not in metrics_for_one:
raise KeyError('No given metric in eval results.')
raise KeyError("No given metric in eval results.")
results = metrics_for_one[metric]
num_iteration = len(results)
max_result = max(results)
@ -397,16 +394,16 @@ def plot_metric(
min_result = min(*results, min_result)
ax.plot(x_, results, label=name)
ax.legend(loc='best')
ax.legend(loc="best")
if xlim is not None:
_check_not_tuple_of_2_elements(xlim, 'xlim')
_check_not_tuple_of_2_elements(xlim, "xlim")
else:
xlim = (0, num_iteration)
ax.set_xlim(xlim)
if ylim is not None:
_check_not_tuple_of_2_elements(ylim, 'ylim')
_check_not_tuple_of_2_elements(ylim, "ylim")
else:
range_result = max_result - min_result
ylim = (min_result - range_result * 0.2, max_result + range_result * 0.2)
@ -417,7 +414,7 @@ def plot_metric(
if xlabel is not None:
ax.set_xlabel(xlabel)
if ylabel is not None:
ylabel = ylabel.replace('@metric@', metric)
ylabel = ylabel.replace("@metric@", metric)
ax.set_ylabel(ylabel)
ax.grid(grid)
return ax
@ -432,19 +429,20 @@ def _determine_direction_for_numeric_split(
missing_type = _MissingType(missing_type_str)
if math.isnan(fval) and missing_type != _MissingType.NAN:
fval = 0.0
if ((missing_type == _MissingType.ZERO and _is_zero(fval))
or (missing_type == _MissingType.NAN and math.isnan(fval))):
direction = 'left' if default_left else 'right'
if (missing_type == _MissingType.ZERO and _is_zero(fval)) or (
missing_type == _MissingType.NAN and math.isnan(fval)
):
direction = "left" if default_left else "right"
else:
direction = 'left' if fval <= threshold else 'right'
direction = "left" if fval <= threshold else "right"
return direction
def _determine_direction_for_categorical_split(fval: float, thresholds: str) -> str:
if math.isnan(fval) or int(fval) < 0:
return 'right'
int_thresholds = {int(t) for t in thresholds.split('||')}
return 'left' if int(fval) in int_thresholds else 'right'
return "right"
int_thresholds = {int(t) for t in thresholds.split("||")}
return "left" if int(fval) in int_thresholds else "right"
def _to_graphviz(
@ -456,7 +454,7 @@ def _to_graphviz(
constraints: Optional[List[int]],
example_case: Optional[Union[np.ndarray, pd_DataFrame]],
max_category_values: int,
**kwargs: Any
**kwargs: Any,
) -> Any:
"""Convert specified tree to graphviz instance.
@ -466,120 +464,124 @@ def _to_graphviz(
if GRAPHVIZ_INSTALLED:
from graphviz import Digraph
else:
raise ImportError('You must install graphviz and restart your session to plot tree.')
raise ImportError("You must install graphviz and restart your session to plot tree.")
def add(
root: Dict[str, Any],
total_count: int,
parent: Optional[str],
decision: Optional[str],
highlight: bool
root: Dict[str, Any], total_count: int, parent: Optional[str], decision: Optional[str], highlight: bool
) -> None:
"""Recursively add node or edge."""
fillcolor = 'white'
style = ''
fillcolor = "white"
style = ""
tooltip = None
if highlight:
color = 'blue'
penwidth = '3'
color = "blue"
penwidth = "3"
else:
color = 'black'
penwidth = '1'
if 'split_index' in root: # non-leaf
color = "black"
penwidth = "1"
if "split_index" in root: # non-leaf
shape = "rectangle"
l_dec = 'yes'
r_dec = 'no'
threshold = root['threshold']
if root['decision_type'] == '<=':
l_dec = "yes"
r_dec = "no"
threshold = root["threshold"]
if root["decision_type"] == "<=":
operator = "&#8804;"
elif root['decision_type'] == '==':
elif root["decision_type"] == "==":
operator = "="
else:
raise ValueError('Invalid decision type in tree model.')
raise ValueError("Invalid decision type in tree model.")
name = f"split{root['split_index']}"
split_feature = root['split_feature']
split_feature = root["split_feature"]
if feature_names is not None:
label = f"<B>{feature_names[split_feature]}</B> {operator}"
else:
label = f"feature <B>{split_feature}</B> {operator} "
direction = None
if example_case is not None:
if root['decision_type'] == '==':
if root["decision_type"] == "==":
direction = _determine_direction_for_categorical_split(
fval=example_case[split_feature],
thresholds=root['threshold']
fval=example_case[split_feature], thresholds=root["threshold"]
)
else:
direction = _determine_direction_for_numeric_split(
fval=example_case[split_feature],
threshold=root['threshold'],
missing_type_str=root['missing_type'],
default_left=root['default_left']
threshold=root["threshold"],
missing_type_str=root["missing_type"],
default_left=root["default_left"],
)
if root['decision_type'] == '==':
category_values = root['threshold'].split('||')
if root["decision_type"] == "==":
category_values = root["threshold"].split("||")
if len(category_values) > max_category_values:
tooltip = root['threshold']
threshold = '||'.join(category_values[:2]) + '||...||' + category_values[-1]
tooltip = root["threshold"]
threshold = "||".join(category_values[:2]) + "||...||" + category_values[-1]
label += f"<B>{_float2str(threshold, precision)}</B>"
for info in ['split_gain', 'internal_value', 'internal_weight', "internal_count", "data_percentage"]:
for info in ["split_gain", "internal_value", "internal_weight", "internal_count", "data_percentage"]:
if info in show_info:
output = info.split('_')[-1]
if info in {'split_gain', 'internal_value', 'internal_weight'}:
output = info.split("_")[-1]
if info in {"split_gain", "internal_value", "internal_weight"}:
label += f"<br/>{_float2str(root[info], precision)} {output}"
elif info == 'internal_count':
elif info == "internal_count":
label += f"<br/>{output}: {root[info]}"
elif info == "data_percentage":
label += f"<br/>{_float2str(root['internal_count'] / total_count * 100, 2)}% of data"
if constraints:
if constraints[root['split_feature']] == 1:
if constraints[root["split_feature"]] == 1:
fillcolor = "#ddffdd" # light green
if constraints[root['split_feature']] == -1:
if constraints[root["split_feature"]] == -1:
fillcolor = "#ffdddd" # light red
style = "filled"
label = f"<{label}>"
add(
root=root['left_child'],
root=root["left_child"],
total_count=total_count,
parent=name,
decision=l_dec,
highlight=highlight and direction == "left"
highlight=highlight and direction == "left",
)
add(
root=root['right_child'],
root=root["right_child"],
total_count=total_count,
parent=name,
decision=r_dec,
highlight=highlight and direction == "right"
highlight=highlight and direction == "right",
)
else: # leaf
shape = "ellipse"
name = f"leaf{root['leaf_index']}"
label = f"leaf {root['leaf_index']}: "
label += f"<B>{_float2str(root['leaf_value'], precision)}</B>"
if 'leaf_weight' in show_info:
if "leaf_weight" in show_info:
label += f"<br/>{_float2str(root['leaf_weight'], precision)} weight"
if 'leaf_count' in show_info:
if "leaf_count" in show_info:
label += f"<br/>count: {root['leaf_count']}"
if "data_percentage" in show_info:
label += f"<br/>{_float2str(root['leaf_count'] / total_count * 100, 2)}% of data"
label = f"<{label}>"
graph.node(name, label=label, shape=shape, style=style, fillcolor=fillcolor, color=color, penwidth=penwidth, tooltip=tooltip)
graph.node(
name,
label=label,
shape=shape,
style=style,
fillcolor=fillcolor,
color=color,
penwidth=penwidth,
tooltip=tooltip,
)
if parent is not None:
graph.edge(parent, name, decision, color=color, penwidth=penwidth)
graph = Digraph(**kwargs)
rankdir = "LR" if orientation == "horizontal" else "TB"
graph.attr("graph", nodesep="0.05", ranksep="0.3", rankdir=rankdir)
if "internal_count" in tree_info['tree_structure']:
if "internal_count" in tree_info["tree_structure"]:
add(
root=tree_info['tree_structure'],
total_count=tree_info['tree_structure']["internal_count"],
root=tree_info["tree_structure"],
total_count=tree_info["tree_structure"]["internal_count"],
parent=None,
decision=None,
highlight=example_case is not None
highlight=example_case is not None,
)
else:
raise Exception("Cannot plot trees with no split")
@ -610,10 +612,10 @@ def create_tree_digraph(
tree_index: int = 0,
show_info: Optional[List[str]] = None,
precision: Optional[int] = 3,
orientation: str = 'horizontal',
orientation: str = "horizontal",
example_case: Optional[Union[np.ndarray, pd_DataFrame]] = None,
max_category_values: int = 10,
**kwargs: Any
**kwargs: Any,
) -> Any:
"""Create a digraph representation of specified tree.
@ -689,32 +691,32 @@ def create_tree_digraph(
if isinstance(booster, LGBMModel):
booster = booster.booster_
elif not isinstance(booster, Booster):
raise TypeError('booster must be Booster or LGBMModel.')
raise TypeError("booster must be Booster or LGBMModel.")
model = booster.dump_model()
tree_infos = model['tree_info']
feature_names = model.get('feature_names', None)
monotone_constraints = model.get('monotone_constraints', None)
tree_infos = model["tree_info"]
feature_names = model.get("feature_names", None)
monotone_constraints = model.get("monotone_constraints", None)
if tree_index < len(tree_infos):
tree_info = tree_infos[tree_index]
else:
raise IndexError('tree_index is out of range.')
raise IndexError("tree_index is out of range.")
if show_info is None:
show_info = []
if example_case is not None:
if not isinstance(example_case, (np.ndarray, pd_DataFrame)) or example_case.ndim != 2:
raise ValueError('example_case must be a numpy 2-D array or a pandas DataFrame')
raise ValueError("example_case must be a numpy 2-D array or a pandas DataFrame")
if example_case.shape[0] != 1:
raise ValueError('example_case must have a single row.')
raise ValueError("example_case must have a single row.")
if isinstance(example_case, pd_DataFrame):
example_case = _data_from_pandas(
data=example_case,
feature_name="auto",
categorical_feature="auto",
pandas_categorical=booster.pandas_categorical
pandas_categorical=booster.pandas_categorical,
)[0]
example_case = example_case[0]
@ -727,7 +729,7 @@ def create_tree_digraph(
constraints=monotone_constraints,
example_case=example_case,
max_category_values=max_category_values,
**kwargs
**kwargs,
)
@ -739,9 +741,9 @@ def plot_tree(
dpi: Optional[int] = None,
show_info: Optional[List[str]] = None,
precision: Optional[int] = 3,
orientation: str = 'horizontal',
orientation: str = "horizontal",
example_case: Optional[Union[np.ndarray, pd_DataFrame]] = None,
**kwargs: Any
**kwargs: Any,
) -> Any:
"""Plot specified tree.
@ -807,22 +809,28 @@ def plot_tree(
import matplotlib.image
import matplotlib.pyplot as plt
else:
raise ImportError('You must install matplotlib and restart your session to plot tree.')
raise ImportError("You must install matplotlib and restart your session to plot tree.")
if ax is None:
if figsize is not None:
_check_not_tuple_of_2_elements(figsize, 'figsize')
_check_not_tuple_of_2_elements(figsize, "figsize")
_, ax = plt.subplots(1, 1, figsize=figsize, dpi=dpi)
graph = create_tree_digraph(booster=booster, tree_index=tree_index,
show_info=show_info, precision=precision,
orientation=orientation, example_case=example_case, **kwargs)
graph = create_tree_digraph(
booster=booster,
tree_index=tree_index,
show_info=show_info,
precision=precision,
orientation=orientation,
example_case=example_case,
**kwargs,
)
s = BytesIO()
s.write(graph.pipe(format='png'))
s.write(graph.pipe(format="png"))
s.seek(0)
img = matplotlib.image.imread(s)
ax.imshow(img)
ax.axis('off')
ax.axis("off")
return ax
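Tying together the plot_metric() guidance above (a Booster alone cannot be plotted; its metrics must first be captured with the record_evaluation callback), a minimal sketch with assumed synthetic data and parameter values:

# Illustrative sketch only; synthetic data and parameter values are assumptions.
import numpy as np
import lightgbm as lgb

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 5))
y = X[:, 0] + rng.normal(scale=0.1, size=500)
train_set = lgb.Dataset(X[:400], label=y[:400])
valid_set = lgb.Dataset(X[400:], label=y[400:], reference=train_set)

eval_result = {}  # filled in-place by the record_evaluation callback
booster = lgb.train(
    params={"objective": "regression", "metric": "l2", "verbosity": -1},
    train_set=train_set,
    num_boost_round=50,
    valid_sets=[valid_set],
    valid_names=["valid"],
    callbacks=[lgb.record_evaluation(eval_result)],
)
ax = lgb.plot_metric(eval_result, metric="l2")  # requires matplotlib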


@ -46,10 +46,10 @@ from .compat import (
from .engine import train
__all__ = [
'LGBMClassifier',
'LGBMModel',
'LGBMRanker',
'LGBMRegressor',
"LGBMClassifier",
"LGBMModel",
"LGBMRanker",
"LGBMRegressor",
]
_LGBM_ScikitMatrixLike = Union[
@ -57,58 +57,58 @@ _LGBM_ScikitMatrixLike = Union[
List[Union[List[float], List[int]]],
np.ndarray,
pd_DataFrame,
scipy.sparse.spmatrix
scipy.sparse.spmatrix,
]
_LGBM_ScikitCustomObjectiveFunction = Union[
# f(labels, preds)
Callable[
[Optional[np.ndarray], np.ndarray],
Tuple[np.ndarray, np.ndarray]
Tuple[np.ndarray, np.ndarray],
],
# f(labels, preds, weights)
Callable[
[Optional[np.ndarray], np.ndarray, Optional[np.ndarray]],
Tuple[np.ndarray, np.ndarray]
Tuple[np.ndarray, np.ndarray],
],
# f(labels, preds, weights, group)
Callable[
[Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]],
Tuple[np.ndarray, np.ndarray]
Tuple[np.ndarray, np.ndarray],
],
]
_LGBM_ScikitCustomEvalFunction = Union[
# f(labels, preds)
Callable[
[Optional[np.ndarray], np.ndarray],
_LGBM_EvalFunctionResultType
_LGBM_EvalFunctionResultType,
],
Callable[
[Optional[np.ndarray], np.ndarray],
List[_LGBM_EvalFunctionResultType]
List[_LGBM_EvalFunctionResultType],
],
# f(labels, preds, weights)
Callable[
[Optional[np.ndarray], np.ndarray, Optional[np.ndarray]],
_LGBM_EvalFunctionResultType
_LGBM_EvalFunctionResultType,
],
Callable[
[Optional[np.ndarray], np.ndarray, Optional[np.ndarray]],
List[_LGBM_EvalFunctionResultType]
List[_LGBM_EvalFunctionResultType],
],
# f(labels, preds, weights, group)
Callable[
[Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]],
_LGBM_EvalFunctionResultType
_LGBM_EvalFunctionResultType,
],
Callable[
[Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]],
List[_LGBM_EvalFunctionResultType]
]
List[_LGBM_EvalFunctionResultType],
],
]
_LGBM_ScikitEvalMetricType = Union[
str,
_LGBM_ScikitCustomEvalFunction,
List[Union[str, _LGBM_ScikitCustomEvalFunction]]
List[Union[str, _LGBM_ScikitCustomEvalFunction]],
]
_LGBM_ScikitValidSet = Tuple[_LGBM_ScikitMatrixLike, _LGBM_LabelType]
@ -119,7 +119,7 @@ def _get_group_from_constructed_dataset(dataset: Dataset) -> Optional[np.ndarray
"Estimators in lightgbm.sklearn should only retrieve query groups from a constructed Dataset. "
"If you're seeing this message, it's a bug in lightgbm. Please report it at https://github.com/microsoft/LightGBM/issues."
)
assert (group is None or isinstance(group, np.ndarray)), error_msg
assert group is None or isinstance(group, np.ndarray), error_msg
return group
@ -139,7 +139,7 @@ def _get_weight_from_constructed_dataset(dataset: Dataset) -> Optional[np.ndarra
"Estimators in lightgbm.sklearn should only retrieve weights from a constructed Dataset. "
"If you're seeing this message, it's a bug in lightgbm. Please report it at https://github.com/microsoft/LightGBM/issues."
)
assert (weight is None or isinstance(weight, np.ndarray)), error_msg
assert weight is None or isinstance(weight, np.ndarray), error_msg
return weight
@ -189,7 +189,11 @@ class _ObjectiveFunctionWrapper:
"""
self.func = func
def __call__(self, preds: np.ndarray, dataset: Dataset) -> Tuple[np.ndarray, np.ndarray]:
def __call__(
self,
preds: np.ndarray,
dataset: Dataset,
) -> Tuple[np.ndarray, np.ndarray]:
"""Call passed function with appropriate arguments.
Parameters
@ -271,7 +275,7 @@ class _EvalFunctionWrapper:
def __call__(
self,
preds: np.ndarray,
dataset: Dataset
dataset: Dataset,
) -> Union[_LGBM_EvalFunctionResultType, List[_LGBM_EvalFunctionResultType]]:
"""Call passed function with appropriate arguments.
@ -310,8 +314,7 @@ class _EvalFunctionWrapper:
# documentation templates for LGBMModel methods are shared between the classes in
# this module and those in the ``dask`` module
_lgbmmodel_doc_fit = (
"""
_lgbmmodel_doc_fit = """
Build a gradient boosting model from the training set (X, y).
Parameters
@ -372,7 +375,6 @@ _lgbmmodel_doc_fit = (
self : LGBMModel
Returns self.
"""
)
_lgbmmodel_doc_custom_eval_note = """
Note
@ -405,8 +407,7 @@ _lgbmmodel_doc_custom_eval_note = """
Is eval result higher better, e.g. AUC is ``is_higher_better``.
"""
_lgbmmodel_doc_predict = (
"""
_lgbmmodel_doc_predict = """
{description}
Parameters
@ -451,7 +452,6 @@ _lgbmmodel_doc_predict = (
X_SHAP_values : {X_SHAP_values_shape}
If ``pred_contrib=True``, the feature contributions for each sample.
"""
)
class LGBMModel(_LGBMModelBase):
@ -459,7 +459,7 @@ class LGBMModel(_LGBMModelBase):
def __init__(
self,
boosting_type: str = 'gbdt',
boosting_type: str = "gbdt",
num_leaves: int = 31,
max_depth: int = -1,
learning_rate: float = 0.1,
@ -467,18 +467,18 @@ class LGBMModel(_LGBMModelBase):
subsample_for_bin: int = 200000,
objective: Optional[Union[str, _LGBM_ScikitCustomObjectiveFunction]] = None,
class_weight: Optional[Union[Dict, str]] = None,
min_split_gain: float = 0.,
min_split_gain: float = 0.0,
min_child_weight: float = 1e-3,
min_child_samples: int = 20,
subsample: float = 1.,
subsample: float = 1.0,
subsample_freq: int = 0,
colsample_bytree: float = 1.,
reg_alpha: float = 0.,
reg_lambda: float = 0.,
random_state: Optional[Union[int, np.random.RandomState, 'np.random.Generator']] = None,
colsample_bytree: float = 1.0,
reg_alpha: float = 0.0,
reg_lambda: float = 0.0,
random_state: Optional[Union[int, np.random.RandomState, "np.random.Generator"]] = None,
n_jobs: Optional[int] = None,
importance_type: str = 'split',
**kwargs
importance_type: str = "split",
**kwargs,
):
r"""Construct a gradient boosting model.
@ -598,8 +598,10 @@ class LGBMModel(_LGBMModelBase):
and grad and hess should be returned in the same format.
"""
if not SKLEARN_INSTALLED:
raise LightGBMError('scikit-learn is required for lightgbm.sklearn. '
'You must install scikit-learn and restart your session to use this module.')
raise LightGBMError(
"scikit-learn is required for lightgbm.sklearn. "
"You must install scikit-learn and restart your session to use this module."
)
self.boosting_type = boosting_type
self.objective = objective
@ -636,14 +638,13 @@ class LGBMModel(_LGBMModelBase):
def _more_tags(self) -> Dict[str, Any]:
return {
'allow_nan': True,
'X_types': ['2darray', 'sparse', '1dlabels'],
'_xfail_checks': {
'check_no_attributes_set_in_init':
'scikit-learn incorrectly asserts that private attributes '
'cannot be set in __init__: '
'(see https://github.com/microsoft/LightGBM/issues/2628)'
}
"allow_nan": True,
"X_types": ["2darray", "sparse", "1dlabels"],
"_xfail_checks": {
"check_no_attributes_set_in_init": "scikit-learn incorrectly asserts that private attributes "
"cannot be set in __init__: "
"(see https://github.com/microsoft/LightGBM/issues/2628)"
},
}
def __sklearn_is_fitted__(self) -> bool:
@ -703,8 +704,8 @@ class LGBMModel(_LGBMModelBase):
assert stage in {"fit", "predict"}
params = self.get_params()
params.pop('objective', None)
for alias in _ConfigAliases.get('objective'):
params.pop("objective", None)
for alias in _ConfigAliases.get("objective"):
if alias in params:
obj = params.pop(alias)
_log_warning(f"Found '{alias}' in params. Will use it instead of 'objective' argument")
@ -725,33 +726,31 @@ class LGBMModel(_LGBMModelBase):
raise ValueError("Unknown LGBMModel type.")
if callable(self._objective):
if stage == "fit":
params['objective'] = _ObjectiveFunctionWrapper(self._objective)
params["objective"] = _ObjectiveFunctionWrapper(self._objective)
else:
params['objective'] = 'None'
params["objective"] = "None"
else:
params['objective'] = self._objective
params["objective"] = self._objective
params.pop('importance_type', None)
params.pop('n_estimators', None)
params.pop('class_weight', None)
params.pop("importance_type", None)
params.pop("n_estimators", None)
params.pop("class_weight", None)
if isinstance(params['random_state'], np.random.RandomState):
params['random_state'] = params['random_state'].randint(np.iinfo(np.int32).max)
elif isinstance(params['random_state'], np_random_Generator):
params['random_state'] = int(
params['random_state'].integers(np.iinfo(np.int32).max)
)
if isinstance(params["random_state"], np.random.RandomState):
params["random_state"] = params["random_state"].randint(np.iinfo(np.int32).max)
elif isinstance(params["random_state"], np_random_Generator):
params["random_state"] = int(params["random_state"].integers(np.iinfo(np.int32).max))
if self._n_classes > 2:
for alias in _ConfigAliases.get('num_class'):
for alias in _ConfigAliases.get("num_class"):
params.pop(alias, None)
params['num_class'] = self._n_classes
if hasattr(self, '_eval_at'):
params["num_class"] = self._n_classes
if hasattr(self, "_eval_at"):
eval_at = self._eval_at
for alias in _ConfigAliases.get('eval_at'):
for alias in _ConfigAliases.get("eval_at"):
if alias in params:
_log_warning(f"Found '{alias}' in params. Will use it instead of 'eval_at' argument")
eval_at = params.pop(alias)
params['eval_at'] = eval_at
params["eval_at"] = eval_at
# register default metric for consistency with callable eval_metric case
original_metric = self._objective if isinstance(self._objective, str) else None
@ -809,10 +808,10 @@ class LGBMModel(_LGBMModelBase):
eval_init_score: Optional[List[_LGBM_InitScoreType]] = None,
eval_group: Optional[List[_LGBM_GroupType]] = None,
eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
feature_name: _LGBM_FeatureNameConfiguration = 'auto',
categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
feature_name: _LGBM_FeatureNameConfiguration = "auto",
categorical_feature: _LGBM_CategoricalFeatureConfiguration = "auto",
callbacks: Optional[List[Callable]] = None,
init_model: Optional[Union[str, Path, Booster, "LGBMModel"]] = None
init_model: Optional[Union[str, Path, Booster, "LGBMModel"]] = None,
) -> "LGBMModel":
"""Docstring is set after definition, using a template."""
params = self._process_params(stage="fit")
@ -832,9 +831,9 @@ class LGBMModel(_LGBMModelBase):
eval_metrics_builtin = [m for m in eval_metric_list if isinstance(m, str)]
# concatenate metric from params (or default if not provided in params) and eval_metric
params['metric'] = [params['metric']] if isinstance(params['metric'], (str, type(None))) else params['metric']
params['metric'] = [e for e in eval_metrics_builtin if e not in params['metric']] + params['metric']
params['metric'] = [metric for metric in params['metric'] if metric is not None]
params["metric"] = [params["metric"]] if isinstance(params["metric"], (str, type(None))) else params["metric"]
params["metric"] = [e for e in eval_metrics_builtin if e not in params["metric"]] + params["metric"]
params["metric"] = [metric for metric in params["metric"] if metric is not None]
if not isinstance(X, (pd_DataFrame, dt_DataTable)):
_X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, ensure_min_samples=2)
@ -856,9 +855,15 @@ class LGBMModel(_LGBMModelBase):
# copy for consistency
self._n_features_in = self._n_features
train_set = Dataset(data=_X, label=_y, weight=sample_weight, group=group,
init_score=init_score, categorical_feature=categorical_feature,
params=params)
train_set = Dataset(
data=_X,
label=_y,
weight=sample_weight,
group=group,
init_score=init_score,
categorical_feature=categorical_feature,
params=params,
)
valid_sets: List[Dataset] = []
if eval_set is not None:
@ -880,8 +885,8 @@ class LGBMModel(_LGBMModelBase):
if valid_data[0] is X and valid_data[1] is y:
valid_set = train_set
else:
valid_weight = _get_meta_data(eval_sample_weight, 'eval_sample_weight', i)
valid_class_weight = _get_meta_data(eval_class_weight, 'eval_class_weight', i)
valid_weight = _get_meta_data(eval_sample_weight, "eval_sample_weight", i)
valid_class_weight = _get_meta_data(eval_class_weight, "eval_class_weight", i)
if valid_class_weight is not None:
if isinstance(valid_class_weight, dict) and self._class_map is not None:
valid_class_weight = {self._class_map[k]: v for k, v in valid_class_weight.items()}
@ -890,11 +895,17 @@ class LGBMModel(_LGBMModelBase):
valid_weight = valid_class_sample_weight
else:
valid_weight = np.multiply(valid_weight, valid_class_sample_weight)
valid_init_score = _get_meta_data(eval_init_score, 'eval_init_score', i)
valid_group = _get_meta_data(eval_group, 'eval_group', i)
valid_set = Dataset(data=valid_data[0], label=valid_data[1], weight=valid_weight,
group=valid_group, init_score=valid_init_score,
categorical_feature='auto', params=params)
valid_init_score = _get_meta_data(eval_init_score, "eval_init_score", i)
valid_group = _get_meta_data(eval_group, "eval_group", i)
valid_set = Dataset(
data=valid_data[0],
label=valid_data[1],
weight=valid_weight,
group=valid_group,
init_score=valid_init_score,
categorical_feature="auto",
params=params,
)
valid_sets.append(valid_set)
@ -918,7 +929,7 @@ class LGBMModel(_LGBMModelBase):
feval=eval_metrics_callable, # type: ignore[arg-type]
init_model=init_model,
feature_name=feature_name,
callbacks=callbacks
callbacks=callbacks,
)
self._evals_result = evals_result
@ -932,16 +943,20 @@ class LGBMModel(_LGBMModelBase):
del train_set, valid_sets
return self
fit.__doc__ = _lgbmmodel_doc_fit.format(
X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame , scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
y_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples]",
sample_weight_shape="numpy array, pandas Series, list of int or float of shape = [n_samples] or None, optional (default=None)",
init_score_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)",
group_shape="numpy array, pandas Series, list of int or float, or None, optional (default=None)",
eval_sample_weight_shape="list of array (same types as ``sample_weight`` supports), or None, optional (default=None)",
eval_init_score_shape="list of array (same types as ``init_score`` supports), or None, optional (default=None)",
eval_group_shape="list of array (same types as ``group`` supports), or None, optional (default=None)"
) + "\n\n" + _lgbmmodel_doc_custom_eval_note
fit.__doc__ = (
_lgbmmodel_doc_fit.format(
X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame , scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
y_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples]",
sample_weight_shape="numpy array, pandas Series, list of int or float of shape = [n_samples] or None, optional (default=None)",
init_score_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)",
group_shape="numpy array, pandas Series, list of int or float, or None, optional (default=None)",
eval_sample_weight_shape="list of array (same types as ``sample_weight`` supports), or None, optional (default=None)",
eval_init_score_shape="list of array (same types as ``init_score`` supports), or None, optional (default=None)",
eval_group_shape="list of array (same types as ``group`` supports), or None, optional (default=None)",
)
+ "\n\n"
+ _lgbmmodel_doc_custom_eval_note
)
def predict(
self,
@ -952,7 +967,7 @@ class LGBMModel(_LGBMModelBase):
pred_leaf: bool = False,
pred_contrib: bool = False,
validate_features: bool = False,
**kwargs: Any
**kwargs: Any,
):
"""Docstring is set after definition, using a template."""
if not self.__sklearn_is_fitted__():
@ -961,9 +976,11 @@ class LGBMModel(_LGBMModelBase):
X = _LGBMCheckArray(X, accept_sparse=True, force_all_finite=False)
n_features = X.shape[1]
if self._n_features != n_features:
raise ValueError("Number of features of the model must "
f"match the input. Model n_features_ is {self._n_features} and "
f"input n_features is {n_features}")
raise ValueError(
"Number of features of the model must "
f"match the input. Model n_features_ is {self._n_features} and "
f"input n_features is {n_features}"
)
# retrive original params that possibly can be used in both training and prediction
# and then overwrite them (considering aliases) with params that were passed directly in prediction
predict_params = self._process_params(stage="predict")
@ -975,7 +992,7 @@ class LGBMModel(_LGBMModelBase):
"num_iteration",
"pred_leaf",
"pred_contrib",
*kwargs.keys()
*kwargs.keys(),
):
predict_params.pop(alias, None)
predict_params.update(kwargs)
@ -986,9 +1003,14 @@ class LGBMModel(_LGBMModelBase):
predict_params["num_threads"] = self._process_n_jobs(predict_params["num_threads"])
return self._Booster.predict( # type: ignore[union-attr]
X, raw_score=raw_score, start_iteration=start_iteration, num_iteration=num_iteration,
pred_leaf=pred_leaf, pred_contrib=pred_contrib, validate_features=validate_features,
**predict_params
X,
raw_score=raw_score,
start_iteration=start_iteration,
num_iteration=num_iteration,
pred_leaf=pred_leaf,
pred_contrib=pred_contrib,
validate_features=validate_features,
**predict_params,
)
predict.__doc__ = _lgbmmodel_doc_predict.format(
@ -997,42 +1019,44 @@ class LGBMModel(_LGBMModelBase):
output_name="predicted_result",
predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]",
X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
X_SHAP_values_shape="array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects"
X_SHAP_values_shape="array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects",
)
@property
def n_features_(self) -> int:
""":obj:`int`: The number of features of fitted model."""
if not self.__sklearn_is_fitted__():
raise LGBMNotFittedError('No n_features found. Need to call fit beforehand.')
raise LGBMNotFittedError("No n_features found. Need to call fit beforehand.")
return self._n_features
@property
def n_features_in_(self) -> int:
""":obj:`int`: The number of features of fitted model."""
if not self.__sklearn_is_fitted__():
raise LGBMNotFittedError('No n_features_in found. Need to call fit beforehand.')
raise LGBMNotFittedError("No n_features_in found. Need to call fit beforehand.")
return self._n_features_in
@property
def best_score_(self) -> _LGBM_BoosterBestScoreType:
""":obj:`dict`: The best score of fitted model."""
if not self.__sklearn_is_fitted__():
raise LGBMNotFittedError('No best_score found. Need to call fit beforehand.')
raise LGBMNotFittedError("No best_score found. Need to call fit beforehand.")
return self._best_score
@property
def best_iteration_(self) -> int:
""":obj:`int`: The best iteration of fitted model if ``early_stopping()`` callback has been specified."""
if not self.__sklearn_is_fitted__():
raise LGBMNotFittedError('No best_iteration found. Need to call fit with early_stopping callback beforehand.')
raise LGBMNotFittedError(
"No best_iteration found. Need to call fit with early_stopping callback beforehand."
)
return self._best_iteration
@property
def objective_(self) -> Union[str, _LGBM_ScikitCustomObjectiveFunction]:
""":obj:`str` or :obj:`callable`: The concrete objective used while fitting this model."""
if not self.__sklearn_is_fitted__():
raise LGBMNotFittedError('No objective found. Need to call fit beforehand.')
raise LGBMNotFittedError("No objective found. Need to call fit beforehand.")
return self._objective # type: ignore[return-value]
@property
@ -1041,11 +1065,11 @@ class LGBMModel(_LGBMModelBase):
This might be less than parameter ``n_estimators`` if early stopping was enabled or
if boosting stopped early due to limits on complexity like ``min_gain_to_split``.
.. versionadded:: 4.0.0
"""
if not self.__sklearn_is_fitted__():
raise LGBMNotFittedError('No n_estimators found. Need to call fit beforehand.')
raise LGBMNotFittedError("No n_estimators found. Need to call fit beforehand.")
return self._Booster.current_iteration() # type: ignore
@property
@ -1054,25 +1078,25 @@ class LGBMModel(_LGBMModelBase):
This might be less than parameter ``n_estimators`` if early stopping was enabled or
if boosting stopped early due to limits on complexity like ``min_gain_to_split``.
.. versionadded:: 4.0.0
"""
if not self.__sklearn_is_fitted__():
raise LGBMNotFittedError('No n_iter found. Need to call fit beforehand.')
raise LGBMNotFittedError("No n_iter found. Need to call fit beforehand.")
return self._Booster.current_iteration() # type: ignore
@property
def booster_(self) -> Booster:
"""Booster: The underlying Booster of this model."""
if not self.__sklearn_is_fitted__():
raise LGBMNotFittedError('No booster found. Need to call fit beforehand.')
raise LGBMNotFittedError("No booster found. Need to call fit beforehand.")
return self._Booster # type: ignore[return-value]
@property
def evals_result_(self) -> _EvalResultDict:
""":obj:`dict`: The evaluation results if validation sets have been specified."""
if not self.__sklearn_is_fitted__():
raise LGBMNotFittedError('No results found. Need to call fit with eval_set beforehand.')
raise LGBMNotFittedError("No results found. Need to call fit with eval_set beforehand.")
return self._evals_result
@property
@ -1085,14 +1109,14 @@ class LGBMModel(_LGBMModelBase):
to configure the type of importance values to be extracted.
"""
if not self.__sklearn_is_fitted__():
raise LGBMNotFittedError('No feature_importances found. Need to call fit beforehand.')
raise LGBMNotFittedError("No feature_importances found. Need to call fit beforehand.")
return self._Booster.feature_importance(importance_type=self.importance_type) # type: ignore[union-attr]
@property
def feature_name_(self) -> List[str]:
""":obj:`list` of shape = [n_features]: The names of features."""
if not self.__sklearn_is_fitted__():
raise LGBMNotFittedError('No feature_name found. Need to call fit beforehand.')
raise LGBMNotFittedError("No feature_name found. Need to call fit beforehand.")
return self._Booster.feature_name() # type: ignore[union-attr]
@ -1110,10 +1134,10 @@ class LGBMRegressor(_LGBMRegressorBase, LGBMModel):
eval_sample_weight: Optional[List[_LGBM_WeightType]] = None,
eval_init_score: Optional[List[_LGBM_InitScoreType]] = None,
eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
feature_name: _LGBM_FeatureNameConfiguration = 'auto',
categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
feature_name: _LGBM_FeatureNameConfiguration = "auto",
categorical_feature: _LGBM_CategoricalFeatureConfiguration = "auto",
callbacks: Optional[List[Callable]] = None,
init_model: Optional[Union[str, Path, Booster, LGBMModel]] = None
init_model: Optional[Union[str, Path, Booster, LGBMModel]] = None,
) -> "LGBMRegressor":
"""Docstring is inherited from the LGBMModel."""
super().fit(
@ -1129,17 +1153,17 @@ class LGBMRegressor(_LGBMRegressorBase, LGBMModel):
feature_name=feature_name,
categorical_feature=categorical_feature,
callbacks=callbacks,
init_model=init_model
init_model=init_model,
)
return self
_base_doc = LGBMModel.fit.__doc__.replace("self : LGBMModel", "self : LGBMRegressor") # type: ignore
_base_doc = (_base_doc[:_base_doc.find('group :')] # type: ignore
+ _base_doc[_base_doc.find('eval_set :'):]) # type: ignore
_base_doc = (_base_doc[:_base_doc.find('eval_class_weight :')]
+ _base_doc[_base_doc.find('eval_init_score :'):])
fit.__doc__ = (_base_doc[:_base_doc.find('eval_group :')]
+ _base_doc[_base_doc.find('eval_metric :'):])
_base_doc = (
_base_doc[: _base_doc.find("group :")] # type: ignore
+ _base_doc[_base_doc.find("eval_set :") :]
) # type: ignore
_base_doc = _base_doc[: _base_doc.find("eval_class_weight :")] + _base_doc[_base_doc.find("eval_init_score :") :]
fit.__doc__ = _base_doc[: _base_doc.find("eval_group :")] + _base_doc[_base_doc.find("eval_metric :") :]
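A minimal usage sketch of the scikit-learn estimator interface reformatted above, shown for LGBMRegressor; the synthetic data, hyperparameters, and early-stopping settings are assumptions, not part of the diff.

# Illustrative sketch only; data and parameter choices are assumptions.
import numpy as np
from lightgbm import LGBMRegressor, early_stopping

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 5))
y = X[:, 0] + rng.normal(scale=0.1, size=500)

model = LGBMRegressor(n_estimators=200, learning_rate=0.05, verbosity=-1)
model.fit(
    X[:400], y[:400],
    eval_set=[(X[400:], y[400:])],
    eval_metric="l2",
    callbacks=[early_stopping(stopping_rounds=10, verbose=False)],
)
print(model.best_iteration_, model.predict(X[400:])[:3])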
class LGBMClassifier(_LGBMClassifierBase, LGBMModel):
@ -1157,10 +1181,10 @@ class LGBMClassifier(_LGBMClassifierBase, LGBMModel):
eval_class_weight: Optional[List[float]] = None,
eval_init_score: Optional[List[_LGBM_InitScoreType]] = None,
eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
feature_name: _LGBM_FeatureNameConfiguration = 'auto',
categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
feature_name: _LGBM_FeatureNameConfiguration = "auto",
categorical_feature: _LGBM_CategoricalFeatureConfiguration = "auto",
callbacks: Optional[List[Callable]] = None,
init_model: Optional[Union[str, Path, Booster, LGBMModel]] = None
init_model: Optional[Union[str, Path, Booster, LGBMModel]] = None,
) -> "LGBMClassifier":
"""Docstring is inherited from the LGBMModel."""
_LGBMAssertAllFinite(y)
@ -1187,16 +1211,16 @@ class LGBMClassifier(_LGBMClassifierBase, LGBMModel):
eval_metric_list = []
if self._n_classes > 2:
for index, metric in enumerate(eval_metric_list):
if metric in {'logloss', 'binary_logloss'}:
if metric in {"logloss", "binary_logloss"}:
eval_metric_list[index] = "multi_logloss"
elif metric in {'error', 'binary_error'}:
elif metric in {"error", "binary_error"}:
eval_metric_list[index] = "multi_error"
else:
for index, metric in enumerate(eval_metric_list):
if metric in {'logloss', 'multi_logloss'}:
eval_metric_list[index] = 'binary_logloss'
elif metric in {'error', 'multi_error'}:
eval_metric_list[index] = 'binary_error'
if metric in {"logloss", "multi_logloss"}:
eval_metric_list[index] = "binary_logloss"
elif metric in {"error", "multi_error"}:
eval_metric_list[index] = "binary_error"
eval_metric = eval_metric_list
# do not modify args, as it causes errors in model selection tools
@ -1225,15 +1249,16 @@ class LGBMClassifier(_LGBMClassifierBase, LGBMModel):
feature_name=feature_name,
categorical_feature=categorical_feature,
callbacks=callbacks,
init_model=init_model
init_model=init_model,
)
return self
_base_doc = LGBMModel.fit.__doc__.replace("self : LGBMModel", "self : LGBMClassifier") # type: ignore
_base_doc = (_base_doc[:_base_doc.find('group :')] # type: ignore
+ _base_doc[_base_doc.find('eval_set :'):]) # type: ignore
fit.__doc__ = (_base_doc[:_base_doc.find('eval_group :')]
+ _base_doc[_base_doc.find('eval_metric :'):])
_base_doc = (
_base_doc[: _base_doc.find("group :")] # type: ignore
+ _base_doc[_base_doc.find("eval_set :") :]
) # type: ignore
fit.__doc__ = _base_doc[: _base_doc.find("eval_group :")] + _base_doc[_base_doc.find("eval_metric :") :]
def predict(
self,
@ -1244,7 +1269,7 @@ class LGBMClassifier(_LGBMClassifierBase, LGBMModel):
pred_leaf: bool = False,
pred_contrib: bool = False,
validate_features: bool = False,
**kwargs: Any
**kwargs: Any,
):
"""Docstring is inherited from the LGBMModel."""
result = self.predict_proba(
@ -1255,7 +1280,7 @@ class LGBMClassifier(_LGBMClassifierBase, LGBMModel):
pred_leaf=pred_leaf,
pred_contrib=pred_contrib,
validate_features=validate_features,
**kwargs
**kwargs,
)
if callable(self._objective) or raw_score or pred_leaf or pred_contrib:
return result
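In this hunk, predict() delegates to predict_proba() and hands back the raw result unchanged for custom objectives or when raw scores, leaf indices, or SHAP contributions were requested; converting probabilities to labels happens later in the method, outside this hunk. A generic, standalone illustration of that final step (plain NumPy, not the method's exact code):

import numpy as np

# Each row holds per-class probabilities for one sample.
proba = np.array([[0.1, 0.7, 0.2],
                  [0.6, 0.3, 0.1]])

# The most likely class index per row.
labels = np.argmax(proba, axis=1)
print(labels)  # [1 0]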
@ -1274,7 +1299,7 @@ class LGBMClassifier(_LGBMClassifierBase, LGBMModel):
pred_leaf: bool = False,
pred_contrib: bool = False,
validate_features: bool = False,
**kwargs: Any
**kwargs: Any,
):
"""Docstring is set after definition, using a template."""
result = super().predict(
@ -1285,17 +1310,19 @@ class LGBMClassifier(_LGBMClassifierBase, LGBMModel):
pred_leaf=pred_leaf,
pred_contrib=pred_contrib,
validate_features=validate_features,
**kwargs
**kwargs,
)
if callable(self._objective) and not (raw_score or pred_leaf or pred_contrib):
_log_warning("Cannot compute class probabilities or labels "
"due to the usage of customized objective function.\n"
"Returning raw scores instead.")
_log_warning(
"Cannot compute class probabilities or labels "
"due to the usage of customized objective function.\n"
"Returning raw scores instead."
)
return result
elif self._n_classes > 2 or raw_score or pred_leaf or pred_contrib: # type: ignore [operator]
return result
else:
return np.vstack((1. - result, result)).transpose()
return np.vstack((1.0 - result, result)).transpose()
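For a binary model the underlying booster returns only the positive-class probability, so the two-column output is assembled with np.vstack as in the last line above. A quick standalone illustration of that reshaping:

import numpy as np

# Positive-class probabilities, as produced for a binary model.
result = np.array([0.2, 0.9, 0.5])

# Column 0 is P(class 0) = 1 - p, column 1 is P(class 1) = p; shape becomes [n_samples, 2].
proba = np.vstack((1.0 - result, result)).transpose()
print(proba)
# [[0.8 0.2]
#  [0.1 0.9]
#  [0.5 0.5]]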
predict_proba.__doc__ = _lgbmmodel_doc_predict.format(
description="Return the predicted probability for each class for each sample.",
@ -1303,21 +1330,21 @@ class LGBMClassifier(_LGBMClassifierBase, LGBMModel):
output_name="predicted_probability",
predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]",
X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
X_SHAP_values_shape="array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects"
X_SHAP_values_shape="array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects",
)
@property
def classes_(self) -> np.ndarray:
""":obj:`array` of shape = [n_classes]: The class label array."""
if not self.__sklearn_is_fitted__():
raise LGBMNotFittedError('No classes found. Need to call fit beforehand.')
raise LGBMNotFittedError("No classes found. Need to call fit beforehand.")
return self._classes # type: ignore[return-value]
@property
def n_classes_(self) -> int:
""":obj:`int`: The number of classes."""
if not self.__sklearn_is_fitted__():
raise LGBMNotFittedError('No classes found. Need to call fit beforehand.')
raise LGBMNotFittedError("No classes found. Need to call fit beforehand.")
return self._n_classes
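Both properties guard attribute access with the estimator's fitted check. A bare-bones sketch of that guard pattern in an unrelated toy class (illustrative only, not LightGBM code):

class ToyEstimator:
    def __init__(self):
        self._classes = None

    def fit(self, classes):
        self._classes = list(classes)
        return self

    def __sklearn_is_fitted__(self):
        return self._classes is not None

    @property
    def classes_(self):
        if not self.__sklearn_is_fitted__():
            raise ValueError("No classes found. Need to call fit beforehand.")
        return self._classes

print(ToyEstimator().fit(["a", "b"]).classes_)  # ['a', 'b']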
@ -1345,10 +1372,10 @@ class LGBMRanker(LGBMModel):
eval_group: Optional[List[_LGBM_GroupType]] = None,
eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
eval_at: Union[List[int], Tuple[int, ...]] = (1, 2, 3, 4, 5),
feature_name: _LGBM_FeatureNameConfiguration = 'auto',
categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
feature_name: _LGBM_FeatureNameConfiguration = "auto",
categorical_feature: _LGBM_CategoricalFeatureConfiguration = "auto",
callbacks: Optional[List[Callable]] = None,
init_model: Optional[Union[str, Path, Booster, LGBMModel]] = None
init_model: Optional[Union[str, Path, Booster, LGBMModel]] = None,
) -> "LGBMRanker":
"""Docstring is inherited from the LGBMModel."""
# check group data
@ -1360,12 +1387,16 @@ class LGBMRanker(LGBMModel):
raise ValueError("Eval_group cannot be None when eval_set is not None")
elif len(eval_group) != len(eval_set):
raise ValueError("Length of eval_group should be equal to eval_set")
elif (isinstance(eval_group, dict)
and any(i not in eval_group or eval_group[i] is None for i in range(len(eval_group)))
or isinstance(eval_group, list)
and any(group is None for group in eval_group)):
raise ValueError("Should set group for all eval datasets for ranking task; "
"if you use dict, the index should start from 0")
elif (
isinstance(eval_group, dict)
and any(i not in eval_group or eval_group[i] is None for i in range(len(eval_group)))
or isinstance(eval_group, list)
and any(group is None for group in eval_group)
):
raise ValueError(
"Should set group for all eval datasets for ranking task; "
"if you use dict, the index should start from 0"
)
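The validation just above requires one group array per eval dataset when fitting a ranker. A hedged usage sketch with synthetic data (shapes and query sizes are illustrative; each group list must sum to the row count of its dataset):

import numpy as np
import lightgbm as lgb

rng = np.random.default_rng(0)
X_train, y_train = rng.normal(size=(60, 4)), rng.integers(0, 3, size=60)
X_val, y_val = rng.normal(size=(30, 4)), rng.integers(0, 3, size=30)

ranker = lgb.LGBMRanker(n_estimators=5)
ranker.fit(
    X_train,
    y_train,
    group=[20, 20, 20],      # three training queries of 20 rows each
    eval_set=[(X_val, y_val)],
    eval_group=[[15, 15]],   # one group list per eval dataset
    eval_at=(1, 3),
)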
self._eval_at = eval_at
super().fit(
@ -1383,15 +1414,17 @@ class LGBMRanker(LGBMModel):
feature_name=feature_name,
categorical_feature=categorical_feature,
callbacks=callbacks,
init_model=init_model
init_model=init_model,
)
return self
_base_doc = LGBMModel.fit.__doc__.replace("self : LGBMModel", "self : LGBMRanker") # type: ignore
fit.__doc__ = (_base_doc[:_base_doc.find('eval_class_weight :')] # type: ignore
+ _base_doc[_base_doc.find('eval_init_score :'):]) # type: ignore
fit.__doc__ = (
_base_doc[: _base_doc.find("eval_class_weight :")] # type: ignore
+ _base_doc[_base_doc.find("eval_init_score :") :]
) # type: ignore
_base_doc = fit.__doc__
_before_feature_name, _feature_name, _after_feature_name = _base_doc.partition('feature_name :')
_before_feature_name, _feature_name, _after_feature_name = _base_doc.partition("feature_name :")
fit.__doc__ = f"""{_before_feature_name}eval_at : list or tuple of int, optional (default=(1, 2, 3, 4, 5))
The evaluation positions of the specified metric.
{_feature_name}{_after_feature_name}"""
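Finally, the eval_at description is spliced into the inherited docstring with str.partition, which splits on the first occurrence of the "feature_name :" marker. A tiny standalone sketch of the same insertion trick, again with made-up docstring text:

# Illustrative docstring text; partition() returns (before, marker, after).
base_doc = "eval_init_score : ...\nfeature_name : ...\ncallbacks : ...\n"
before, marker, after = base_doc.partition("feature_name :")

new_block = "eval_at : tuple of int\n    Evaluation positions.\n"
print(f"{before}{new_block}{marker}{after}")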

View file

@ -114,7 +114,6 @@ exclude = [
"compile/*.py",
"external_libs/*.py",
"lightgbm-python/*.py",
"python-package/*.py",
]
indent-style = "space"
quote-style = "double"