[ci] [python-package] enable ruff-format on tests and examples (#6317)

James Lamb 2024-02-21 12:15:38 -06:00 committed by GitHub
Parent b60068c810
Commit 1b792e7166
No known key found for this signature
GPG key ID: B5690EEEBB952194
30 changed files with 3222 additions and 3849 deletions

View File

@ -7,6 +7,12 @@ exclude: |
)$
repos:
- repo: https://github.com/pycqa/isort
rev: 5.13.2
hooks:
- id: isort
name: isort (python)
args: ["--settings-path", "python-package/pyproject.toml"]
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.2.1
@ -14,12 +20,8 @@ repos:
# Run the linter.
- id: ruff
args: ["--config", "python-package/pyproject.toml"]
types_or: [python, jupyter]
# Run the formatter.
- id: ruff-format
args: ["--config", "python-package/pyproject.toml"]
- repo: https://github.com/pycqa/isort
rev: 5.13.2
hooks:
- id: isort
name: isort (python)
args: ["--settings-path", "python-package/pyproject.toml"]
types_or: [python, jupyter]
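With this reordering, isort rewrites imports first and the ruff hooks then lint and format both Python files and notebooks, all configured from python-package/pyproject.toml. As a rough illustration only (not part of this commit), the same hooks could be run locally along these lines, assuming the pre-commit tool is installed and the command is issued from the repository root:

```python
# Hedged sketch: run the hooks configured above over the whole repository.
# Assumes `pre-commit` is installed; hook ids match the `id:` entries above.
import subprocess

for hook in ("isort", "ruff", "ruff-format"):
    subprocess.run(["pre-commit", "run", hook, "--all-files"], check=False)
```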

View File

@ -10,13 +10,13 @@ from sklearn.metrics import roc_auc_score
import lightgbm as lgb
print('Loading data...')
print("Loading data...")
# load or create your dataset
binary_example_dir = Path(__file__).absolute().parents[1] / 'binary_classification'
df_train = pd.read_csv(str(binary_example_dir / 'binary.train'), header=None, sep='\t')
df_test = pd.read_csv(str(binary_example_dir / 'binary.test'), header=None, sep='\t')
W_train = pd.read_csv(str(binary_example_dir / 'binary.train.weight'), header=None)[0]
W_test = pd.read_csv(str(binary_example_dir / 'binary.test.weight'), header=None)[0]
binary_example_dir = Path(__file__).absolute().parents[1] / "binary_classification"
df_train = pd.read_csv(str(binary_example_dir / "binary.train"), header=None, sep="\t")
df_test = pd.read_csv(str(binary_example_dir / "binary.test"), header=None, sep="\t")
W_train = pd.read_csv(str(binary_example_dir / "binary.train.weight"), header=None)[0]
W_test = pd.read_csv(str(binary_example_dir / "binary.test.weight"), header=None)[0]
y_train = df_train[0]
y_test = df_test[0]
@ -27,72 +27,72 @@ num_train, num_feature = X_train.shape
# create dataset for lightgbm
# if you want to re-use data, remember to set free_raw_data=False
lgb_train = lgb.Dataset(X_train, y_train,
weight=W_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
weight=W_test, free_raw_data=False)
lgb_train = lgb.Dataset(X_train, y_train, weight=W_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, weight=W_test, free_raw_data=False)
# specify your configurations as a dict
params = {
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': 'binary_logloss',
'num_leaves': 31,
'learning_rate': 0.05,
'feature_fraction': 0.9,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'verbose': 0
"boosting_type": "gbdt",
"objective": "binary",
"metric": "binary_logloss",
"num_leaves": 31,
"learning_rate": 0.05,
"feature_fraction": 0.9,
"bagging_fraction": 0.8,
"bagging_freq": 5,
"verbose": 0,
}
# generate feature names
feature_name = [f'feature_{col}' for col in range(num_feature)]
feature_name = [f"feature_{col}" for col in range(num_feature)]
print('Starting training...')
print("Starting training...")
# feature_name and categorical_feature
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
valid_sets=lgb_train, # eval training data
feature_name=feature_name,
categorical_feature=[21])
gbm = lgb.train(
params,
lgb_train,
num_boost_round=10,
valid_sets=lgb_train, # eval training data
feature_name=feature_name,
categorical_feature=[21],
)
print('Finished first 10 rounds...')
print("Finished first 10 rounds...")
# check feature name
print(f'7th feature name is: {lgb_train.feature_name[6]}')
print(f"7th feature name is: {lgb_train.feature_name[6]}")
print('Saving model...')
print("Saving model...")
# save model to file
gbm.save_model('model.txt')
gbm.save_model("model.txt")
print('Dumping model to JSON...')
print("Dumping model to JSON...")
# dump model to JSON (and save to file)
model_json = gbm.dump_model()
with open('model.json', 'w+') as f:
with open("model.json", "w+") as f:
json.dump(model_json, f, indent=4)
# feature names
print(f'Feature names: {gbm.feature_name()}')
print(f"Feature names: {gbm.feature_name()}")
# feature importances
print(f'Feature importances: {list(gbm.feature_importance())}')
print(f"Feature importances: {list(gbm.feature_importance())}")
print('Loading model to predict...')
print("Loading model to predict...")
# load model to predict
bst = lgb.Booster(model_file='model.txt')
bst = lgb.Booster(model_file="model.txt")
# can only predict with the best iteration (or the saving iteration)
y_pred = bst.predict(X_test)
# eval with loaded model
auc_loaded_model = roc_auc_score(y_test, y_pred)
print(f"The ROC AUC of loaded model's prediction is: {auc_loaded_model}")
print('Dumping and loading model with pickle...')
print("Dumping and loading model with pickle...")
# dump model with pickle
with open('model.pkl', 'wb') as fout:
with open("model.pkl", "wb") as fout:
pickle.dump(gbm, fout)
# load model with pickle to predict
with open('model.pkl', 'rb') as fin:
with open("model.pkl", "rb") as fin:
pkl_bst = pickle.load(fin)
# can predict with any iteration when loaded in pickle way
y_pred = pkl_bst.predict(X_test, num_iteration=7)
@ -104,36 +104,36 @@ print(f"The ROC AUC of pickled model's prediction is: {auc_pickled_model}")
# init_model accepts:
# 1. model file name
# 2. Booster()
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
init_model='model.txt',
valid_sets=lgb_eval)
gbm = lgb.train(params, lgb_train, num_boost_round=10, init_model="model.txt", valid_sets=lgb_eval)
print('Finished 10 - 20 rounds with model file...')
print("Finished 10 - 20 rounds with model file...")
# decay learning rates
# reset_parameter callback accepts:
# 1. list with length = num_boost_round
# 2. function(curr_iter)
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
init_model=gbm,
valid_sets=lgb_eval,
callbacks=[lgb.reset_parameter(learning_rate=lambda iter: 0.05 * (0.99 ** iter))])
gbm = lgb.train(
params,
lgb_train,
num_boost_round=10,
init_model=gbm,
valid_sets=lgb_eval,
callbacks=[lgb.reset_parameter(learning_rate=lambda iter: 0.05 * (0.99**iter))],
)
print('Finished 20 - 30 rounds with decay learning rates...')
print("Finished 20 - 30 rounds with decay learning rates...")
# change other parameters during training
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
init_model=gbm,
valid_sets=lgb_eval,
callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])
gbm = lgb.train(
params,
lgb_train,
num_boost_round=10,
init_model=gbm,
valid_sets=lgb_eval,
callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)],
)
print('Finished 30 - 40 rounds with changing bagging_fraction...')
print("Finished 30 - 40 rounds with changing bagging_fraction...")
# self-defined objective function
@ -141,9 +141,9 @@ print('Finished 30 - 40 rounds with changing bagging_fraction...')
# log likelihood loss
def loglikelihood(preds, train_data):
labels = train_data.get_label()
preds = 1. / (1. + np.exp(-preds))
preds = 1.0 / (1.0 + np.exp(-preds))
grad = preds - labels
hess = preds * (1. - preds)
hess = preds * (1.0 - preds)
return grad, hess
@ -156,22 +156,19 @@ def loglikelihood(preds, train_data):
# Keep this in mind when you use the customization
def binary_error(preds, train_data):
labels = train_data.get_label()
preds = 1. / (1. + np.exp(-preds))
return 'error', np.mean(labels != (preds > 0.5)), False
preds = 1.0 / (1.0 + np.exp(-preds))
return "error", np.mean(labels != (preds > 0.5)), False
# Pass custom objective function through params
params_custom_obj = copy.deepcopy(params)
params_custom_obj['objective'] = loglikelihood
params_custom_obj["objective"] = loglikelihood
gbm = lgb.train(params_custom_obj,
lgb_train,
num_boost_round=10,
init_model=gbm,
feval=binary_error,
valid_sets=lgb_eval)
gbm = lgb.train(
params_custom_obj, lgb_train, num_boost_round=10, init_model=gbm, feval=binary_error, valid_sets=lgb_eval
)
print('Finished 40 - 50 rounds with self-defined objective function and eval metric...')
print("Finished 40 - 50 rounds with self-defined objective function and eval metric...")
# another self-defined eval metric
@ -183,24 +180,26 @@ print('Finished 40 - 50 rounds with self-defined objective function and eval met
# Keep this in mind when you use the customization
def accuracy(preds, train_data):
labels = train_data.get_label()
preds = 1. / (1. + np.exp(-preds))
return 'accuracy', np.mean(labels == (preds > 0.5)), True
preds = 1.0 / (1.0 + np.exp(-preds))
return "accuracy", np.mean(labels == (preds > 0.5)), True
# Pass custom objective function through params
params_custom_obj = copy.deepcopy(params)
params_custom_obj['objective'] = loglikelihood
params_custom_obj["objective"] = loglikelihood
gbm = lgb.train(params_custom_obj,
lgb_train,
num_boost_round=10,
init_model=gbm,
feval=[binary_error, accuracy],
valid_sets=lgb_eval)
gbm = lgb.train(
params_custom_obj,
lgb_train,
num_boost_round=10,
init_model=gbm,
feval=[binary_error, accuracy],
valid_sets=lgb_eval,
)
print('Finished 50 - 60 rounds with self-defined objective function and multiple self-defined eval metrics...')
print("Finished 50 - 60 rounds with self-defined objective function and multiple self-defined eval metrics...")
print('Starting a new training job...')
print("Starting a new training job...")
# callback
@ -208,17 +207,14 @@ def reset_metrics():
def callback(env):
lgb_eval_new = lgb.Dataset(X_test, y_test, reference=lgb_train)
if env.iteration - env.begin_iteration == 5:
print('Add a new valid dataset at iteration 5...')
env.model.add_valid(lgb_eval_new, 'new_valid')
print("Add a new valid dataset at iteration 5...")
env.model.add_valid(lgb_eval_new, "new_valid")
callback.before_iteration = True
callback.order = 0
return callback
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
valid_sets=lgb_train,
callbacks=[reset_metrics()])
gbm = lgb.train(params, lgb_train, num_boost_round=10, valid_sets=lgb_train, callbacks=[reset_metrics()])
print('Finished first 10 rounds with callback function...')
print("Finished first 10 rounds with callback function...")
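The callback wired in above is just a callable returned by reset_metrics(); LightGBM invokes it with a CallbackEnv exposing the booster and iteration counters, and honours the optional before_iteration and order attributes. A minimal self-contained sketch of that pattern, using synthetic data rather than the example's dataset:

```python
import numpy as np

import lightgbm as lgb


def log_iteration():
    def _callback(env):
        # env is LightGBM's CallbackEnv: it carries the model, iteration,
        # begin_iteration, end_iteration and evaluation_result_list
        print(f"starting iteration {env.iteration - env.begin_iteration}")

    _callback.before_iteration = True  # run before, not after, each boosting round
    _callback.order = 0
    return _callback


X = np.random.rand(500, 5)
y = np.random.randint(0, 2, size=500)
lgb.train(
    {"objective": "binary", "verbose": -1},
    lgb.Dataset(X, y),
    num_boost_round=3,
    callbacks=[log_iteration()],
)
```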

View File

@ -10,9 +10,9 @@ import lightgbm as lgb
if __name__ == "__main__":
print("loading data")
rank_example_dir = Path(__file__).absolute().parents[2] / 'lambdarank'
X, y = load_svmlight_file(str(rank_example_dir / 'rank.train'))
group = np.loadtxt(str(rank_example_dir / 'rank.train.query'))
rank_example_dir = Path(__file__).absolute().parents[2] / "lambdarank"
X, y = load_svmlight_file(str(rank_example_dir / "rank.train"))
group = np.loadtxt(str(rank_example_dir / "rank.train.query"))
print("initializing a Dask cluster")
@ -32,25 +32,14 @@ if __name__ == "__main__":
# a sparse boundary to partition the data
X = X.toarray()
dX = da.from_array(
x=X,
chunks=[
(rows_in_part1, rows_in_part2),
(num_features,)
]
)
dX = da.from_array(x=X, chunks=[(rows_in_part1, rows_in_part2), (num_features,)])
dy = da.from_array(
x=y,
chunks=[
(rows_in_part1, rows_in_part2),
]
)
dg = da.from_array(
x=group,
chunks=[
(100, group.size - 100)
]
],
)
dg = da.from_array(x=group, chunks=[(100, group.size - 100)])
print("beginning training")
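The training step itself falls outside this hunk. For context, here is a rough self-contained sketch of how chunked Dask arrays like dX, dy and dg are typically handed to LightGBM's Dask ranking estimator; the cluster setup and parameters below are assumptions for illustration, not the example's actual values:

```python
import dask.array as da
import numpy as np
from distributed import Client, LocalCluster

import lightgbm as lgb

if __name__ == "__main__":
    # tiny synthetic ranking problem: two queries of 50 documents each
    X = np.random.rand(100, 4)
    y = np.random.randint(0, 3, size=100)
    group = np.array([50, 50])

    with LocalCluster(n_workers=2, threads_per_worker=1) as cluster, Client(cluster) as client:
        # partition boundaries must line up with whole query groups
        dX = da.from_array(X, chunks=(50, 4))
        dy = da.from_array(y, chunks=(50,))
        dg = da.from_array(group, chunks=(1,))

        ranker = lgb.DaskLGBMRanker(n_estimators=5, tree_learner="data", client=client)
        ranker.fit(dX, dy, group=dg)
        print("trained", ranker.booster_.num_trees(), "trees")
```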

View File

@ -34,13 +34,13 @@ def create_dataset_from_multiple_hdf(input_flist, batch_size):
data = []
ylist = []
for f in input_flist:
f = h5py.File(f, 'r')
data.append(HDFSequence(f['X'], batch_size))
ylist.append(f['Y'][:])
f = h5py.File(f, "r")
data.append(HDFSequence(f["X"], batch_size))
ylist.append(f["Y"][:])
params = {
'bin_construct_sample_cnt': 200000,
'max_bin': 255,
"bin_construct_sample_cnt": 200000,
"max_bin": 255,
}
y = np.concatenate(ylist)
dataset = lgb.Dataset(data, label=y, params=params)
@ -51,7 +51,7 @@ def create_dataset_from_multiple_hdf(input_flist, batch_size):
# The reason is that DataFrame column names will be used in Dataset. For a DataFrame with Int64Index
# as columns, Dataset will use column names like ["0", "1", "2", ...]. While for numpy array, column names
# are using the default one assigned in C++ code (dataset_loader.cpp), like ["Column_0", "Column_1", ...].
dataset.save_binary('regression.train.from_hdf.bin')
dataset.save_binary("regression.train.from_hdf.bin")
def save2hdf(input_data, fname, batch_size):
@ -59,7 +59,7 @@ def save2hdf(input_data, fname, batch_size):
Please note chunk size settings in the implementation for I/O performance optimization.
"""
with h5py.File(fname, 'w') as f:
with h5py.File(fname, "w") as f:
for name, data in input_data.items():
nrow, ncol = data.shape
if ncol == 1:
@ -75,12 +75,12 @@ def save2hdf(input_data, fname, batch_size):
# Also note that the data is stored in row major order to avoid extra copy when passing to
# lightgbm Dataset.
chunk = (batch_size, ncol)
f.create_dataset(name, data=data, chunks=chunk, compression='lzf')
f.create_dataset(name, data=data, chunks=chunk, compression="lzf")
def generate_hdf(input_fname, output_basename, batch_size):
# Save to 2 HDF5 files for demonstration.
df = pd.read_csv(input_fname, header=None, sep='\t')
df = pd.read_csv(input_fname, header=None, sep="\t")
mid = len(df) // 2
df1 = df.iloc[:mid]
@ -88,25 +88,23 @@ def generate_hdf(input_fname, output_basename, batch_size):
# We can store multiple datasets inside a single HDF5 file.
# Separating X and Y for choosing best chunk size for data loading.
fname1 = f'{output_basename}1.h5'
fname2 = f'{output_basename}2.h5'
save2hdf({'Y': df1.iloc[:, :1], 'X': df1.iloc[:, 1:]}, fname1, batch_size)
save2hdf({'Y': df2.iloc[:, :1], 'X': df2.iloc[:, 1:]}, fname2, batch_size)
fname1 = f"{output_basename}1.h5"
fname2 = f"{output_basename}2.h5"
save2hdf({"Y": df1.iloc[:, :1], "X": df1.iloc[:, 1:]}, fname1, batch_size)
save2hdf({"Y": df2.iloc[:, :1], "X": df2.iloc[:, 1:]}, fname2, batch_size)
return [fname1, fname2]
def main():
batch_size = 64
output_basename = 'regression'
output_basename = "regression"
hdf_files = generate_hdf(
str(Path(__file__).absolute().parents[1] / 'regression' / 'regression.train'),
output_basename,
batch_size
str(Path(__file__).absolute().parents[1] / "regression" / "regression.train"), output_basename, batch_size
)
create_dataset_from_multiple_hdf(hdf_files, batch_size=batch_size)
if __name__ == '__main__':
if __name__ == "__main__":
main()
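HDFSequence above is one implementation of the lgb.Sequence interface, which lets a Dataset be built from batched random access instead of a fully materialised array. A minimal sketch of that interface with an in-memory array standing in for the HDF5 handle (ArraySequence is a hypothetical name; the test changes further down use the same pattern as NumpySequence):

```python
import numpy as np

import lightgbm as lgb


class ArraySequence(lgb.Sequence):
    """Toy Sequence: random access into a numpy array, read in batches."""

    def __init__(self, data, batch_size):
        self.data = data
        self.batch_size = batch_size  # rows LightGBM pulls per read

    def __getitem__(self, idx):
        # LightGBM indexes with ints, slices and lists of row indices
        return self.data[idx]

    def __len__(self):
        return len(self.data)


X = np.random.rand(200, 10)
y = np.random.rand(200)
dataset = lgb.Dataset(ArraySequence(X, batch_size=32), label=y)
dataset.construct()
```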

View File

@ -24,23 +24,19 @@ import lightgbm as lgb
# single continuous predictor
np.random.seed(0)
N = 1000
X = pd.DataFrame({
'continuous': range(N),
'categorical': np.repeat([0, 1, 2, 3, 4], N / 5)
})
X = pd.DataFrame({"continuous": range(N), "categorical": np.repeat([0, 1, 2, 3, 4], N / 5)})
CATEGORICAL_EFFECTS = [-1, -1, -2, -2, 2]
LINEAR_TERM = np.array([
-0.5 + 0.01 * X['continuous'][k]
+ CATEGORICAL_EFFECTS[X['categorical'][k]] for k in range(X.shape[0])
]) + np.random.normal(0, 1, X.shape[0])
LINEAR_TERM = np.array(
[-0.5 + 0.01 * X["continuous"][k] + CATEGORICAL_EFFECTS[X["categorical"][k]] for k in range(X.shape[0])]
) + np.random.normal(0, 1, X.shape[0])
TRUE_PROB = expit(LINEAR_TERM)
Y = np.random.binomial(1, TRUE_PROB, size=N)
DATA = {
'X': X,
'probability_labels': TRUE_PROB,
'binary_labels': Y,
'lgb_with_binary_labels': lgb.Dataset(X, Y),
'lgb_with_probability_labels': lgb.Dataset(X, TRUE_PROB),
"X": X,
"probability_labels": TRUE_PROB,
"binary_labels": Y,
"lgb_with_binary_labels": lgb.Dataset(X, Y),
"lgb_with_probability_labels": lgb.Dataset(X, TRUE_PROB),
}
@ -72,34 +68,25 @@ def experiment(objective, label_type, data):
np.random.seed(0)
nrounds = 5
lgb_data = data[f"lgb_with_{label_type}_labels"]
params = {
'objective': objective,
'feature_fraction': 1,
'bagging_fraction': 1,
'verbose': -1
}
params = {"objective": objective, "feature_fraction": 1, "bagging_fraction": 1, "verbose": -1}
time_zero = time.time()
gbm = lgb.train(params, lgb_data, num_boost_round=nrounds)
y_fitted = gbm.predict(data['X'])
y_fitted = gbm.predict(data["X"])
y_true = data[f"{label_type}_labels"]
duration = time.time() - time_zero
return {
'time': duration,
'correlation': np.corrcoef(y_fitted, y_true)[0, 1],
'logloss': log_loss(y_fitted, y_true)
}
return {"time": duration, "correlation": np.corrcoef(y_fitted, y_true)[0, 1], "logloss": log_loss(y_fitted, y_true)}
#################
# Observe the behavior of `binary` and `xentropy` objectives
print('Performance of `binary` objective with binary labels:')
print(experiment('binary', label_type='binary', data=DATA))
print("Performance of `binary` objective with binary labels:")
print(experiment("binary", label_type="binary", data=DATA))
print('Performance of `xentropy` objective with binary labels:')
print(experiment('xentropy', label_type='binary', data=DATA))
print("Performance of `xentropy` objective with binary labels:")
print(experiment("xentropy", label_type="binary", data=DATA))
print('Performance of `xentropy` objective with probability labels:')
print(experiment('xentropy', label_type='probability', data=DATA))
print("Performance of `xentropy` objective with probability labels:")
print(experiment("xentropy", label_type="probability", data=DATA))
# Trying this throws an error on non-binary values of y:
# experiment('binary', label_type='probability', DATA)
@ -109,9 +96,7 @@ print(experiment('xentropy', label_type='probability', data=DATA))
# there are reasons to suspect that `binary` should run faster when the
# label is an integer instead of a float
K = 10
A = [experiment('binary', label_type='binary', data=DATA)['time']
for k in range(K)]
B = [experiment('xentropy', label_type='binary', data=DATA)['time']
for k in range(K)]
A = [experiment("binary", label_type="binary", data=DATA)["time"] for k in range(K)]
B = [experiment("xentropy", label_type="binary", data=DATA)["time"] for k in range(K)]
print(f"Best `binary` time: {min(A)}")
print(f"Best `xentropy` time: {min(B)}")

File diff suppressed because one or more lines are too long

View File

@ -8,13 +8,13 @@ import lightgbm as lgb
if lgb.compat.MATPLOTLIB_INSTALLED:
import matplotlib.pyplot as plt
else:
raise ImportError('You need to install matplotlib and restart your session for plot_example.py.')
raise ImportError("You need to install matplotlib and restart your session for plot_example.py.")
print('Loading data...')
print("Loading data...")
# load or create your dataset
regression_example_dir = Path(__file__).absolute().parents[1] / 'regression'
df_train = pd.read_csv(str(regression_example_dir / 'regression.train'), header=None, sep='\t')
df_test = pd.read_csv(str(regression_example_dir / 'regression.test'), header=None, sep='\t')
regression_example_dir = Path(__file__).absolute().parents[1] / "regression"
df_train = pd.read_csv(str(regression_example_dir / "regression.train"), header=None, sep="\t")
df_test = pd.read_csv(str(regression_example_dir / "regression.test"), header=None, sep="\t")
y_train = df_train[0]
y_test = df_test[0]
@ -26,45 +26,38 @@ lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)
# specify your configurations as a dict
params = {
'num_leaves': 5,
'metric': ('l1', 'l2'),
'verbose': 0
}
params = {"num_leaves": 5, "metric": ("l1", "l2"), "verbose": 0}
evals_result = {} # to record eval results for plotting
print('Starting training...')
print("Starting training...")
# train
gbm = lgb.train(
params,
lgb_train,
num_boost_round=100,
valid_sets=[lgb_train, lgb_test],
feature_name=[f'f{i + 1}' for i in range(X_train.shape[-1])],
feature_name=[f"f{i + 1}" for i in range(X_train.shape[-1])],
categorical_feature=[21],
callbacks=[
lgb.log_evaluation(10),
lgb.record_evaluation(evals_result)
]
callbacks=[lgb.log_evaluation(10), lgb.record_evaluation(evals_result)],
)
print('Plotting metrics recorded during training...')
ax = lgb.plot_metric(evals_result, metric='l1')
print("Plotting metrics recorded during training...")
ax = lgb.plot_metric(evals_result, metric="l1")
plt.show()
print('Plotting feature importances...')
print("Plotting feature importances...")
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()
print('Plotting split value histogram...')
ax = lgb.plot_split_value_histogram(gbm, feature='f26', bins='auto')
print("Plotting split value histogram...")
ax = lgb.plot_split_value_histogram(gbm, feature="f26", bins="auto")
plt.show()
print('Plotting 54th tree...') # one tree use categorical feature to split
ax = lgb.plot_tree(gbm, tree_index=53, figsize=(15, 15), show_info=['split_gain'])
print("Plotting 54th tree...") # one tree use categorical feature to split
ax = lgb.plot_tree(gbm, tree_index=53, figsize=(15, 15), show_info=["split_gain"])
plt.show()
print('Plotting 54th tree with graphviz...')
graph = lgb.create_tree_digraph(gbm, tree_index=53, name='Tree54')
print("Plotting 54th tree with graphviz...")
graph = lgb.create_tree_digraph(gbm, tree_index=53, name="Tree54")
graph.render(view=True)

View File

@ -6,11 +6,11 @@ from sklearn.metrics import mean_squared_error
import lightgbm as lgb
print('Loading data...')
print("Loading data...")
# load or create your dataset
regression_example_dir = Path(__file__).absolute().parents[1] / 'regression'
df_train = pd.read_csv(str(regression_example_dir / 'regression.train'), header=None, sep='\t')
df_test = pd.read_csv(str(regression_example_dir / 'regression.test'), header=None, sep='\t')
regression_example_dir = Path(__file__).absolute().parents[1] / "regression"
df_train = pd.read_csv(str(regression_example_dir / "regression.train"), header=None, sep="\t")
df_test = pd.read_csv(str(regression_example_dir / "regression.test"), header=None, sep="\t")
y_train = df_train[0]
y_test = df_test[0]
@ -23,32 +23,30 @@ lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# specify your configurations as a dict
params = {
'boosting_type': 'gbdt',
'objective': 'regression',
'metric': {'l2', 'l1'},
'num_leaves': 31,
'learning_rate': 0.05,
'feature_fraction': 0.9,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'verbose': 0
"boosting_type": "gbdt",
"objective": "regression",
"metric": {"l2", "l1"},
"num_leaves": 31,
"learning_rate": 0.05,
"feature_fraction": 0.9,
"bagging_fraction": 0.8,
"bagging_freq": 5,
"verbose": 0,
}
print('Starting training...')
print("Starting training...")
# train
gbm = lgb.train(params,
lgb_train,
num_boost_round=20,
valid_sets=lgb_eval,
callbacks=[lgb.early_stopping(stopping_rounds=5)])
gbm = lgb.train(
params, lgb_train, num_boost_round=20, valid_sets=lgb_eval, callbacks=[lgb.early_stopping(stopping_rounds=5)]
)
print('Saving model...')
print("Saving model...")
# save model to file
gbm.save_model('model.txt')
gbm.save_model("model.txt")
print('Starting predicting...')
print("Starting predicting...")
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
rmse_test = mean_squared_error(y_test, y_pred) ** 0.5
print(f'The RMSE of prediction is: {rmse_test}')
print(f"The RMSE of prediction is: {rmse_test}")

View File

@ -8,85 +8,71 @@ from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
print('Loading data...')
print("Loading data...")
# load or create your dataset
regression_example_dir = Path(__file__).absolute().parents[1] / 'regression'
df_train = pd.read_csv(str(regression_example_dir / 'regression.train'), header=None, sep='\t')
df_test = pd.read_csv(str(regression_example_dir / 'regression.test'), header=None, sep='\t')
regression_example_dir = Path(__file__).absolute().parents[1] / "regression"
df_train = pd.read_csv(str(regression_example_dir / "regression.train"), header=None, sep="\t")
df_test = pd.read_csv(str(regression_example_dir / "regression.test"), header=None, sep="\t")
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
print('Starting training...')
print("Starting training...")
# train
gbm = lgb.LGBMRegressor(num_leaves=31,
learning_rate=0.05,
n_estimators=20)
gbm.fit(X_train, y_train,
eval_set=[(X_test, y_test)],
eval_metric='l1',
callbacks=[lgb.early_stopping(5)])
gbm = lgb.LGBMRegressor(num_leaves=31, learning_rate=0.05, n_estimators=20)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric="l1", callbacks=[lgb.early_stopping(5)])
print('Starting predicting...')
print("Starting predicting...")
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
rmse_test = mean_squared_error(y_test, y_pred) ** 0.5
print(f'The RMSE of prediction is: {rmse_test}')
print(f"The RMSE of prediction is: {rmse_test}")
# feature importances
print(f'Feature importances: {list(gbm.feature_importances_)}')
print(f"Feature importances: {list(gbm.feature_importances_)}")
# self-defined eval metric
# f(y_true: array, y_pred: array) -> name: str, eval_result: float, is_higher_better: bool
# Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False
return "RMSLE", np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False
print('Starting training with custom eval function...')
print("Starting training with custom eval function...")
# train
gbm.fit(X_train, y_train,
eval_set=[(X_test, y_test)],
eval_metric=rmsle,
callbacks=[lgb.early_stopping(5)])
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric=rmsle, callbacks=[lgb.early_stopping(5)])
# another self-defined eval metric
# f(y_true: array, y_pred: array) -> name: str, eval_result: float, is_higher_better: bool
# Relative Absolute Error (RAE)
def rae(y_true, y_pred):
return 'RAE', np.sum(np.abs(y_pred - y_true)) / np.sum(np.abs(np.mean(y_true) - y_true)), False
return "RAE", np.sum(np.abs(y_pred - y_true)) / np.sum(np.abs(np.mean(y_true) - y_true)), False
print('Starting training with multiple custom eval functions...')
print("Starting training with multiple custom eval functions...")
# train
gbm.fit(X_train, y_train,
eval_set=[(X_test, y_test)],
eval_metric=[rmsle, rae],
callbacks=[lgb.early_stopping(5)])
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric=[rmsle, rae], callbacks=[lgb.early_stopping(5)])
print('Starting predicting...')
print("Starting predicting...")
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
rmsle_test = rmsle(y_test, y_pred)[1]
rae_test = rae(y_test, y_pred)[1]
print(f'The RMSLE of prediction is: {rmsle_test}')
print(f'The RAE of prediction is: {rae_test}')
print(f"The RMSLE of prediction is: {rmsle_test}")
print(f"The RAE of prediction is: {rae_test}")
# other scikit-learn modules
estimator = lgb.LGBMRegressor(num_leaves=31)
param_grid = {
'learning_rate': [0.01, 0.1, 1],
'n_estimators': [20, 40]
}
param_grid = {"learning_rate": [0.01, 0.1, 1], "n_estimators": [20, 40]}
gbm = GridSearchCV(estimator, param_grid, cv=3)
gbm.fit(X_train, y_train)
print(f'Best parameters found by grid search are: {gbm.best_params_}')
print(f"Best parameters found by grid search are: {gbm.best_params_}")

View File

@ -18,9 +18,23 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional,
import numpy as np
import scipy.sparse
from .compat import (PANDAS_INSTALLED, PYARROW_INSTALLED, arrow_cffi, arrow_is_floating, arrow_is_integer, concat,
dt_DataTable, pa_Array, pa_chunked_array, pa_ChunkedArray, pa_compute, pa_Table,
pd_CategoricalDtype, pd_DataFrame, pd_Series)
from .compat import (
PANDAS_INSTALLED,
PYARROW_INSTALLED,
arrow_cffi,
arrow_is_floating,
arrow_is_integer,
concat,
dt_DataTable,
pa_Array,
pa_chunked_array,
pa_ChunkedArray,
pa_compute,
pa_Table,
pd_CategoricalDtype,
pd_DataFrame,
pd_Series,
)
from .libpath import find_lib_path
if TYPE_CHECKING:
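The reflowed import above shows the "vertical hanging indent" layout that ruff-format produces, and which the isort settings added to pyproject.toml later in this diff (multi_line_output = 3) are configured to match. A small illustration with hypothetical stdlib imports, old layout first:

```python
# "hanging indent": continuation lines aligned under the opening parenthesis
from typing import (Any, Callable, Dict, Iterable,
                    List, Optional)

# "vertical hanging indent" (isort multi_line_output = 3, matching ruff-format):
# one name per line, indented one level, with a trailing comma
from collections import (
    OrderedDict,
    defaultdict,
    deque,
    namedtuple,
)
```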

View File

@ -5,8 +5,14 @@ from dataclasses import dataclass
from functools import partial
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
from .basic import (Booster, _ConfigAliases, _LGBM_BoosterEvalMethodResultType,
_LGBM_BoosterEvalMethodResultWithStandardDeviationType, _log_info, _log_warning)
from .basic import (
Booster,
_ConfigAliases,
_LGBM_BoosterEvalMethodResultType,
_LGBM_BoosterEvalMethodResultWithStandardDeviationType,
_log_info,
_log_warning,
)
if TYPE_CHECKING:
from .engine import CVBooster

View File

@ -19,12 +19,36 @@ import numpy as np
import scipy.sparse as ss
from .basic import LightGBMError, _choose_param_value, _ConfigAliases, _log_info, _log_warning
from .compat import (DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED, Client, Future, LGBMNotFittedError, concat,
dask_Array, dask_array_from_delayed, dask_bag_from_delayed, dask_DataFrame, dask_Series,
default_client, delayed, pd_DataFrame, pd_Series, wait)
from .sklearn import (LGBMClassifier, LGBMModel, LGBMRanker, LGBMRegressor, _LGBM_ScikitCustomObjectiveFunction,
_LGBM_ScikitEvalMetricType, _lgbmmodel_doc_custom_eval_note, _lgbmmodel_doc_fit,
_lgbmmodel_doc_predict)
from .compat import (
DASK_INSTALLED,
PANDAS_INSTALLED,
SKLEARN_INSTALLED,
Client,
Future,
LGBMNotFittedError,
concat,
dask_Array,
dask_array_from_delayed,
dask_bag_from_delayed,
dask_DataFrame,
dask_Series,
default_client,
delayed,
pd_DataFrame,
pd_Series,
wait,
)
from .sklearn import (
LGBMClassifier,
LGBMModel,
LGBMRanker,
LGBMRegressor,
_LGBM_ScikitCustomObjectiveFunction,
_LGBM_ScikitEvalMetricType,
_lgbmmodel_doc_custom_eval_note,
_lgbmmodel_doc_fit,
_lgbmmodel_doc_predict,
)
__all__ = [
'DaskLGBMClassifier',

View File

@ -10,10 +10,21 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
import numpy as np
from . import callback
from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _InnerPredictor,
_LGBM_BoosterEvalMethodResultType, _LGBM_BoosterEvalMethodResultWithStandardDeviationType,
_LGBM_CategoricalFeatureConfiguration, _LGBM_CustomObjectiveFunction, _LGBM_EvalFunctionResultType,
_LGBM_FeatureNameConfiguration, _log_warning)
from .basic import (
Booster,
Dataset,
LightGBMError,
_choose_param_value,
_ConfigAliases,
_InnerPredictor,
_LGBM_BoosterEvalMethodResultType,
_LGBM_BoosterEvalMethodResultWithStandardDeviationType,
_LGBM_CategoricalFeatureConfiguration,
_LGBM_CustomObjectiveFunction,
_LGBM_EvalFunctionResultType,
_LGBM_FeatureNameConfiguration,
_log_warning,
)
from .compat import SKLEARN_INSTALLED, _LGBMBaseCrossValidator, _LGBMGroupKFold, _LGBMStratifiedKFold
__all__ = [

View File

@ -8,14 +8,41 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import numpy as np
import scipy.sparse
from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _LGBM_BoosterBestScoreType,
_LGBM_CategoricalFeatureConfiguration, _LGBM_EvalFunctionResultType, _LGBM_FeatureNameConfiguration,
_LGBM_GroupType, _LGBM_InitScoreType, _LGBM_LabelType, _LGBM_WeightType, _log_warning)
from .basic import (
Booster,
Dataset,
LightGBMError,
_choose_param_value,
_ConfigAliases,
_LGBM_BoosterBestScoreType,
_LGBM_CategoricalFeatureConfiguration,
_LGBM_EvalFunctionResultType,
_LGBM_FeatureNameConfiguration,
_LGBM_GroupType,
_LGBM_InitScoreType,
_LGBM_LabelType,
_LGBM_WeightType,
_log_warning,
)
from .callback import _EvalResultDict, record_evaluation
from .compat import (SKLEARN_INSTALLED, LGBMNotFittedError, _LGBMAssertAllFinite, _LGBMCheckArray,
_LGBMCheckClassificationTargets, _LGBMCheckSampleWeight, _LGBMCheckXY, _LGBMClassifierBase,
_LGBMComputeSampleWeight, _LGBMCpuCount, _LGBMLabelEncoder, _LGBMModelBase, _LGBMRegressorBase,
dt_DataTable, np_random_Generator, pd_DataFrame)
from .compat import (
SKLEARN_INSTALLED,
LGBMNotFittedError,
_LGBMAssertAllFinite,
_LGBMCheckArray,
_LGBMCheckClassificationTargets,
_LGBMCheckSampleWeight,
_LGBMCheckXY,
_LGBMClassifierBase,
_LGBMComputeSampleWeight,
_LGBMCpuCount,
_LGBMLabelEncoder,
_LGBMModelBase,
_LGBMRegressorBase,
dt_DataTable,
np_random_Generator,
pd_DataFrame,
)
from .engine import train
__all__ = [

View File

@ -81,10 +81,14 @@ minimum-version = "0.4.4"
# end:build-system
[tool.isort]
include_trailing_comma = true
line_length = 120
# "vertical hanging indent", to match what ruff-format does
# ref: https://pycqa.github.io/isort/docs/configuration/multi_line_output_modes.html#3-vertical-hanging-indent
multi_line_output = 3
skip_glob = [
"*/external_libs/*",
"*/lightgbm-python/*"
"*/lightgbm-python/*",
]
[tool.mypy]
@ -108,14 +112,13 @@ docstring-code-format = false
exclude = [
"build/*.py",
"compile/*.py",
"examples/*.py",
"external_libs/*.py",
"lightgbm-python/*.py",
"python-package/*.py",
"tests/*.py"
]
indent-style = "space"
quote-style = "double"
skip-magic-trailing-comma = false
[tool.ruff.lint]
ignore = [

View File

@ -10,7 +10,7 @@ try:
from lightgbm.basic import _LIB as LIB
except ModuleNotFoundError:
print("Could not import lightgbm Python package, looking for lib_lightgbm at the repo root")
if system() in ('Windows', 'Microsoft'):
if system() in ("Windows", "Microsoft"):
lib_file = Path(__file__).absolute().parents[2] / "Release" / "lib_lightgbm.dll"
else:
lib_file = Path(__file__).absolute().parents[2] / "lib_lightgbm.so"
@ -25,7 +25,7 @@ dtype_int64 = 3
def c_str(string):
return ctypes.c_char_p(string.encode('utf-8'))
return ctypes.c_char_p(string.encode("utf-8"))
def load_from_file(filename, reference):
@ -33,17 +33,13 @@ def load_from_file(filename, reference):
if reference is not None:
ref = reference
handle = ctypes.c_void_p()
LIB.LGBM_DatasetCreateFromFile(
c_str(str(filename)),
c_str('max_bin=15'),
ref,
ctypes.byref(handle))
LIB.LGBM_DatasetCreateFromFile(c_str(str(filename)), c_str("max_bin=15"), ref, ctypes.byref(handle))
print(LIB.LGBM_GetLastError())
num_data = ctypes.c_int(0)
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
num_feature = ctypes.c_int(0)
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
print(f'#data: {num_data.value} #feature: {num_feature.value}')
print(f"#data: {num_data.value} #feature: {num_feature.value}")
return handle
@ -69,20 +65,22 @@ def load_from_csr(filename, reference):
ctypes.c_int64(len(csr.indptr)),
ctypes.c_int64(len(csr.data)),
ctypes.c_int64(csr.shape[1]),
c_str('max_bin=15'),
c_str("max_bin=15"),
ref,
ctypes.byref(handle))
ctypes.byref(handle),
)
num_data = ctypes.c_int(0)
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
num_feature = ctypes.c_int(0)
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
LIB.LGBM_DatasetSetField(
handle,
c_str('label'),
c_str("label"),
label.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
ctypes.c_int(len(label)),
ctypes.c_int(dtype_float32))
print(f'#data: {num_data.value} #feature: {num_feature.value}')
ctypes.c_int(dtype_float32),
)
print(f"#data: {num_data.value} #feature: {num_feature.value}")
return handle
@ -104,20 +102,22 @@ def load_from_csc(filename, reference):
ctypes.c_int64(len(csc.indptr)),
ctypes.c_int64(len(csc.data)),
ctypes.c_int64(csc.shape[0]),
c_str('max_bin=15'),
c_str("max_bin=15"),
ref,
ctypes.byref(handle))
ctypes.byref(handle),
)
num_data = ctypes.c_int(0)
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
num_feature = ctypes.c_int(0)
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
LIB.LGBM_DatasetSetField(
handle,
c_str('label'),
c_str("label"),
label.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
ctypes.c_int(len(label)),
ctypes.c_int(dtype_float32))
print(f'#data: {num_data.value} #feature: {num_feature.value}')
ctypes.c_int(dtype_float32),
)
print(f"#data: {num_data.value} #feature: {num_feature.value}")
return handle
@ -137,20 +137,22 @@ def load_from_mat(filename, reference):
ctypes.c_int32(mat.shape[0]),
ctypes.c_int32(mat.shape[1]),
ctypes.c_int(1),
c_str('max_bin=15'),
c_str("max_bin=15"),
ref,
ctypes.byref(handle))
ctypes.byref(handle),
)
num_data = ctypes.c_int(0)
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
num_feature = ctypes.c_int(0)
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
LIB.LGBM_DatasetSetField(
handle,
c_str('label'),
c_str("label"),
label.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
ctypes.c_int(len(label)),
ctypes.c_int(dtype_float32))
print(f'#data: {num_data.value} #feature: {num_feature.value}')
ctypes.c_int(dtype_float32),
)
print(f"#data: {num_data.value} #feature: {num_feature.value}")
return handle
@ -159,29 +161,26 @@ def free_dataset(handle):
def test_dataset():
binary_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'binary_classification'
train = load_from_file(binary_example_dir / 'binary.train', None)
test = load_from_mat(binary_example_dir / 'binary.test', train)
binary_example_dir = Path(__file__).absolute().parents[2] / "examples" / "binary_classification"
train = load_from_file(binary_example_dir / "binary.train", None)
test = load_from_mat(binary_example_dir / "binary.test", train)
free_dataset(test)
test = load_from_csr(binary_example_dir / 'binary.test', train)
test = load_from_csr(binary_example_dir / "binary.test", train)
free_dataset(test)
test = load_from_csc(binary_example_dir / 'binary.test', train)
test = load_from_csc(binary_example_dir / "binary.test", train)
free_dataset(test)
save_to_binary(train, 'train.binary.bin')
save_to_binary(train, "train.binary.bin")
free_dataset(train)
train = load_from_file('train.binary.bin', None)
train = load_from_file("train.binary.bin", None)
free_dataset(train)
def test_booster():
binary_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'binary_classification'
train = load_from_mat(binary_example_dir / 'binary.train', None)
test = load_from_mat(binary_example_dir / 'binary.test', train)
binary_example_dir = Path(__file__).absolute().parents[2] / "examples" / "binary_classification"
train = load_from_mat(binary_example_dir / "binary.train", None)
test = load_from_mat(binary_example_dir / "binary.test", train)
booster = ctypes.c_void_p()
LIB.LGBM_BoosterCreate(
train,
c_str("app=binary metric=auc num_leaves=31 verbose=0"),
ctypes.byref(booster))
LIB.LGBM_BoosterCreate(train, c_str("app=binary metric=auc num_leaves=31 verbose=0"), ctypes.byref(booster))
LIB.LGBM_BoosterAddValidData(booster, test)
is_finished = ctypes.c_int(0)
for i in range(1, 51):
@ -189,28 +188,18 @@ def test_booster():
result = np.array([0.0], dtype=np.float64)
out_len = ctypes.c_int(0)
LIB.LGBM_BoosterGetEval(
booster,
ctypes.c_int(0),
ctypes.byref(out_len),
result.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
booster, ctypes.c_int(0), ctypes.byref(out_len), result.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
)
if i % 10 == 0:
print(f'{i} iteration test AUC {result[0]:.6f}')
LIB.LGBM_BoosterSaveModel(
booster,
ctypes.c_int(0),
ctypes.c_int(-1),
ctypes.c_int(0),
c_str('model.txt'))
print(f"{i} iteration test AUC {result[0]:.6f}")
LIB.LGBM_BoosterSaveModel(booster, ctypes.c_int(0), ctypes.c_int(-1), ctypes.c_int(0), c_str("model.txt"))
LIB.LGBM_BoosterFree(booster)
free_dataset(train)
free_dataset(test)
booster2 = ctypes.c_void_p()
num_total_model = ctypes.c_int(0)
LIB.LGBM_BoosterCreateFromModelfile(
c_str('model.txt'),
ctypes.byref(num_total_model),
ctypes.byref(booster2))
data = np.loadtxt(str(binary_example_dir / 'binary.test'), dtype=np.float64)
LIB.LGBM_BoosterCreateFromModelfile(c_str("model.txt"), ctypes.byref(num_total_model), ctypes.byref(booster2))
data = np.loadtxt(str(binary_example_dir / "binary.test"), dtype=np.float64)
mat = data[:, 1:]
preb = np.empty(mat.shape[0], dtype=np.float64)
num_preb = ctypes.c_int64(0)
@ -225,58 +214,51 @@ def test_booster():
ctypes.c_int(1),
ctypes.c_int(0),
ctypes.c_int(25),
c_str(''),
c_str(""),
ctypes.byref(num_preb),
preb.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
preb.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
)
LIB.LGBM_BoosterPredictForFile(
booster2,
c_str(str(binary_example_dir / 'binary.test')),
c_str(str(binary_example_dir / "binary.test")),
ctypes.c_int(0),
ctypes.c_int(0),
ctypes.c_int(0),
ctypes.c_int(25),
c_str(''),
c_str('preb.txt'))
c_str(""),
c_str("preb.txt"),
)
LIB.LGBM_BoosterPredictForFile(
booster2,
c_str(str(binary_example_dir / 'binary.test')),
c_str(str(binary_example_dir / "binary.test")),
ctypes.c_int(0),
ctypes.c_int(0),
ctypes.c_int(10),
ctypes.c_int(25),
c_str(''),
c_str('preb.txt'))
c_str(""),
c_str("preb.txt"),
)
LIB.LGBM_BoosterFree(booster2)
def test_max_thread_control():
# at initialization, should be -1
num_threads = ctypes.c_int(0)
ret = LIB.LGBM_GetMaxThreads(
ctypes.byref(num_threads)
)
ret = LIB.LGBM_GetMaxThreads(ctypes.byref(num_threads))
assert ret == 0
assert num_threads.value == -1
# updating that value through the C API should work
ret = LIB.LGBM_SetMaxThreads(
ctypes.c_int(6)
)
ret = LIB.LGBM_SetMaxThreads(ctypes.c_int(6))
assert ret == 0
ret = LIB.LGBM_GetMaxThreads(
ctypes.byref(num_threads)
)
ret = LIB.LGBM_GetMaxThreads(ctypes.byref(num_threads))
assert ret == 0
assert num_threads.value == 6
# resetting to any negative number should set it to -1
ret = LIB.LGBM_SetMaxThreads(
ctypes.c_int(-123)
)
ret = LIB.LGBM_SetMaxThreads(ctypes.c_int(-123))
assert ret == 0
ret = LIB.LGBM_GetMaxThreads(
ctypes.byref(num_threads)
)
ret = LIB.LGBM_GetMaxThreads(ctypes.byref(num_threads))
assert ret == 0
assert num_threads.value == -1

View File

@ -3,5 +3,5 @@ from pathlib import Path
import numpy as np
preds = [np.loadtxt(str(name)) for name in Path(__file__).absolute().parent.glob('*.pred')]
preds = [np.loadtxt(str(name)) for name in Path(__file__).absolute().parent.glob("*.pred")]
np.testing.assert_allclose(preds[0], preds[1])

View File

@ -14,16 +14,16 @@ from sklearn.metrics import accuracy_score
TESTS_DIR = Path(__file__).absolute().parent
@pytest.fixture(scope='module')
@pytest.fixture(scope="module")
def executable(pytestconfig) -> str:
"""Returns the path to the lightgbm executable."""
return pytestconfig.getoption('execfile')
return pytestconfig.getoption("execfile")
def _find_random_open_port() -> int:
"""Find a random open port on localhost."""
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(('', 0))
s.bind(("", 0))
port = s.getsockname()[1]
return port # noqa: RET504
@ -34,7 +34,7 @@ def _generate_n_ports(n: int) -> Generator[int, None, None]:
def _write_dict(d: Dict, file: io.TextIOWrapper) -> None:
for k, v in d.items():
file.write(f'{k} = {v}\n')
file.write(f"{k} = {v}\n")
def create_data(task: str, n_samples: int = 1_000) -> np.ndarray:
@ -42,10 +42,10 @@ def create_data(task: str, n_samples: int = 1_000) -> np.ndarray:
The data is returned as a numpy array with the label as the first column.
"""
if task == 'binary-classification':
if task == "binary-classification":
centers = [[-4, -4], [4, 4]]
X, y = make_blobs(n_samples, centers=centers, random_state=42)
elif task == 'regression':
elif task == "regression":
X, y = make_regression(n_samples, n_features=4, n_informative=2, random_state=42)
return np.hstack([y.reshape(-1, 1), X])
@ -54,22 +54,22 @@ class DistributedMockup:
"""Simulate distributed training."""
default_train_config = {
'task': 'train',
'pre_partition': True,
'machine_list_file': TESTS_DIR / 'mlist.txt',
'tree_learner': 'data',
'force_row_wise': True,
'verbose': 0,
'num_boost_round': 20,
'num_leaves': 15,
'num_threads': 2,
"task": "train",
"pre_partition": True,
"machine_list_file": TESTS_DIR / "mlist.txt",
"tree_learner": "data",
"force_row_wise": True,
"verbose": 0,
"num_boost_round": 20,
"num_leaves": 15,
"num_threads": 2,
}
default_predict_config = {
'task': 'predict',
'data': TESTS_DIR / 'train.txt',
'input_model': TESTS_DIR / 'model0.txt',
'output_result': TESTS_DIR / 'predictions.txt',
"task": "predict",
"data": TESTS_DIR / "train.txt",
"input_model": TESTS_DIR / "model0.txt",
"output_result": TESTS_DIR / "predictions.txt",
}
def __init__(self, executable: str):
@ -77,8 +77,8 @@ class DistributedMockup:
def worker_train(self, i: int) -> subprocess.CompletedProcess:
"""Start the training process on the `i`-th worker."""
config_path = TESTS_DIR / f'train{i}.conf'
cmd = [self.executable, f'config={config_path}']
config_path = TESTS_DIR / f"train{i}.conf"
cmd = [self.executable, f"config={config_path}"]
return subprocess.run(cmd)
def _set_ports(self) -> None:
@ -92,18 +92,18 @@ class DistributedMockup:
ports.update(candidates)
i += 1
if i == max_tries:
raise RuntimeError('Unable to find non-colliding ports.')
raise RuntimeError("Unable to find non-colliding ports.")
self.listen_ports = list(ports)
with open(TESTS_DIR / 'mlist.txt', 'wt') as f:
with open(TESTS_DIR / "mlist.txt", "wt") as f:
for port in self.listen_ports:
f.write(f'127.0.0.1 {port}\n')
f.write(f"127.0.0.1 {port}\n")
def _write_data(self, partitions: List[np.ndarray]) -> None:
"""Write all training data as train.txt and each training partition as train{i}.txt."""
all_data = np.vstack(partitions)
np.savetxt(str(TESTS_DIR / 'train.txt'), all_data, delimiter=',')
np.savetxt(str(TESTS_DIR / "train.txt"), all_data, delimiter=",")
for i, partition in enumerate(partitions):
np.savetxt(str(TESTS_DIR / f'train{i}.txt'), partition, delimiter=',')
np.savetxt(str(TESTS_DIR / f"train{i}.txt"), partition, delimiter=",")
def fit(self, partitions: List[np.ndarray], train_config: Dict) -> None:
"""Run the distributed training process on a single machine.
@ -118,7 +118,7 @@ class DistributedMockup:
"""
self.train_config = copy.deepcopy(self.default_train_config)
self.train_config.update(train_config)
self.n_workers = self.train_config['num_machines']
self.n_workers = self.train_config["num_machines"]
self._set_ports()
self._write_data(partitions)
self.label_ = np.hstack([partition[:, 0] for partition in partitions])
@ -131,7 +131,7 @@ class DistributedMockup:
results = [f.result() for f in futures]
for result in results:
if result.returncode != 0:
raise RuntimeError('Error in training')
raise RuntimeError("Error in training")
def predict(self, predict_config: Dict[str, Any]) -> np.ndarray:
"""Compute the predictions using the model created in the fit step.
@ -141,14 +141,14 @@ class DistributedMockup:
"""
self.predict_config = copy.deepcopy(self.default_predict_config)
self.predict_config.update(predict_config)
config_path = TESTS_DIR / 'predict.conf'
with open(config_path, 'wt') as file:
config_path = TESTS_DIR / "predict.conf"
with open(config_path, "wt") as file:
_write_dict(self.predict_config, file)
cmd = [self.executable, f'config={config_path}']
cmd = [self.executable, f"config={config_path}"]
result = subprocess.run(cmd)
if result.returncode != 0:
raise RuntimeError('Error in prediction')
return np.loadtxt(str(TESTS_DIR / 'predictions.txt'))
raise RuntimeError("Error in prediction")
return np.loadtxt(str(TESTS_DIR / "predictions.txt"))
def write_train_config(self, i: int) -> None:
"""Create a file train{i}.conf with the required configuration to train.
@ -156,41 +156,41 @@ class DistributedMockup:
Each worker gets a different port and piece of the data, the rest are the
model parameters contained in `self.config`.
"""
with open(TESTS_DIR / f'train{i}.conf', 'wt') as file:
output_model = TESTS_DIR / f'model{i}.txt'
data = TESTS_DIR / f'train{i}.txt'
file.write(f'output_model = {output_model}\n')
file.write(f'local_listen_port = {self.listen_ports[i]}\n')
file.write(f'data = {data}\n')
with open(TESTS_DIR / f"train{i}.conf", "wt") as file:
output_model = TESTS_DIR / f"model{i}.txt"
data = TESTS_DIR / f"train{i}.txt"
file.write(f"output_model = {output_model}\n")
file.write(f"local_listen_port = {self.listen_ports[i]}\n")
file.write(f"data = {data}\n")
_write_dict(self.train_config, file)
def test_classifier(executable):
"""Test the classification task."""
num_machines = 2
data = create_data(task='binary-classification')
data = create_data(task="binary-classification")
partitions = np.array_split(data, num_machines)
train_params = {
'objective': 'binary',
'num_machines': num_machines,
"objective": "binary",
"num_machines": num_machines,
}
clf = DistributedMockup(executable)
clf.fit(partitions, train_params)
y_probas = clf.predict(predict_config={})
y_pred = y_probas > 0.5
assert accuracy_score(clf.label_, y_pred) == 1.
assert accuracy_score(clf.label_, y_pred) == 1.0
def test_regressor(executable):
"""Test the regression task."""
num_machines = 2
data = create_data(task='regression')
data = create_data(task="regression")
partitions = np.array_split(data, num_machines)
train_params = {
'objective': 'regression',
'num_machines': num_machines,
"objective": "regression",
"num_machines": num_machines,
}
reg = DistributedMockup(executable)
reg.fit(partitions, train_params)
y_pred = reg.predict(predict_config={})
np.testing.assert_allclose(y_pred, reg.label_, rtol=0.2, atol=50.)
np.testing.assert_allclose(y_pred, reg.label_, rtol=0.2, atol=50.0)

View File

@ -1,7 +1,7 @@
from pathlib import Path
default_exec_file = Path(__file__).absolute().parents[2] / 'lightgbm'
default_exec_file = Path(__file__).absolute().parents[2] / "lightgbm"
def pytest_addoption(parser):
parser.addoption('--execfile', action='store', default=str(default_exec_file))
parser.addoption("--execfile", action="store", default=str(default_exec_file))

View File

@ -71,9 +71,7 @@ def generate_random_arrow_table(
values: Optional[np.ndarray] = None,
) -> pa.Table:
columns = [
generate_random_arrow_array(
num_datapoints, seed + i, generate_nulls=generate_nulls, values=values
)
generate_random_arrow_array(num_datapoints, seed + i, generate_nulls=generate_nulls, values=values)
for i in range(num_columns)
]
names = [f"col_{i}" for i in range(num_columns)]
@ -156,9 +154,7 @@ def test_dataset_construct_fields_fuzzy():
arrow_weights = generate_random_arrow_array(1000, 42, generate_nulls=False)
arrow_groups = pa.chunked_array([[300, 400, 50], [250]], type=pa.int32())
arrow_dataset = lgb.Dataset(
arrow_table, label=arrow_labels, weight=arrow_weights, group=arrow_groups
)
arrow_dataset = lgb.Dataset(arrow_table, label=arrow_labels, weight=arrow_weights, group=arrow_groups)
arrow_dataset.construct()
pandas_dataset = lgb.Dataset(
@ -171,9 +167,7 @@ def test_dataset_construct_fields_fuzzy():
# Check for equality
for field in ("label", "weight", "group"):
np_assert_array_equal(
arrow_dataset.get_field(field), pandas_dataset.get_field(field), strict=True
)
np_assert_array_equal(arrow_dataset.get_field(field), pandas_dataset.get_field(field), strict=True)
np_assert_array_equal(arrow_dataset.get_label(), pandas_dataset.get_label(), strict=True)
np_assert_array_equal(arrow_dataset.get_weight(), pandas_dataset.get_weight(), strict=True)
@ -269,9 +263,7 @@ def test_dataset_construct_groups(array_type, group_data, arrow_type):
],
)
@pytest.mark.parametrize("arrow_type", _INTEGER_TYPES + _FLOAT_TYPES)
def test_dataset_construct_init_scores_array(
array_type: Any, init_score_data: Any, arrow_type: Any
):
def test_dataset_construct_init_scores_array(array_type: Any, init_score_data: Any, arrow_type: Any):
data = generate_dummy_arrow_table()
init_scores = array_type(init_score_data, type=arrow_type)
dataset = lgb.Dataset(data, init_score=init_scores, params=dummy_dataset_params())
@ -320,9 +312,7 @@ def assert_equal_predict_arrow_pandas(booster: lgb.Booster, data: pa.Table):
np_assert_array_equal(p_pred_contrib_arrow, p_pred_contrib_pandas, strict=True)
p_first_iter_arrow = booster.predict(data, start_iteration=0, num_iteration=1, raw_score=True)
p_first_iter_pandas = booster.predict(
data.to_pandas(), start_iteration=0, num_iteration=1, raw_score=True
)
p_first_iter_pandas = booster.predict(data.to_pandas(), start_iteration=0, num_iteration=1, raw_score=True)
np_assert_array_equal(p_first_iter_arrow, p_first_iter_pandas, strict=True)

View File

@ -19,8 +19,9 @@ from .utils import dummy_obj, load_breast_cancer, mse_obj, np_assert_array_equal
def test_basic(tmp_path):
X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True),
test_size=0.1, random_state=2)
X_train, X_test, y_train, y_test = train_test_split(
*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2
)
feature_names = [f"Column_{i}" for i in range(X_train.shape[1])]
feature_names[1] = "a" * 1000 # set one name to a value longer than default buffer size
train_data = lgb.Dataset(X_train, label=y_train, feature_name=feature_names)
@ -34,7 +35,7 @@ def test_basic(tmp_path):
"verbose": -1,
"num_threads": 1,
"max_bin": 255,
"gpu_use_dp": True
"gpu_use_dp": True,
}
bst = lgb.Booster(params, train_data)
bst.add_valid(valid_data, "valid_1")
@ -49,7 +50,7 @@ def test_basic(tmp_path):
assert bst.current_iteration() == 20
assert bst.num_trees() == 20
assert bst.num_model_per_iteration() == 1
if getenv('TASK', '') != 'cuda':
if getenv("TASK", "") != "cuda":
assert bst.lower_bound() == pytest.approx(-2.9040190126976606)
assert bst.upper_bound() == pytest.approx(3.3182142872462883)
@ -79,20 +80,19 @@ def test_basic(tmp_path):
# test that shape is checked during prediction
bad_X_test = X_test[:, 1:]
bad_shape_error_msg = "The number of features in data*"
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
bst.predict, bad_X_test)
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
bst.predict, sparse.csr_matrix(bad_X_test))
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
bst.predict, sparse.csc_matrix(bad_X_test))
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, bad_X_test)
np.testing.assert_raises_regex(
lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, sparse.csr_matrix(bad_X_test)
)
np.testing.assert_raises_regex(
lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, sparse.csc_matrix(bad_X_test)
)
with open(tname, "w+b") as f:
dump_svmlight_file(bad_X_test, y_test, f)
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
bst.predict, tname)
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, tname)
with open(tname, "w+b") as f:
dump_svmlight_file(X_test, y_test, f, zero_based=False)
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
bst.predict, tname)
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, tname)
class NumpySequence(lgb.Sequence):
@ -108,7 +108,7 @@ class NumpySequence(lgb.Sequence):
elif isinstance(idx, slice):
if not (idx.step is None or idx.step == 1):
raise NotImplementedError("No need to implement, caller will not set step by now")
return self.ndarray[idx.start:idx.stop]
return self.ndarray[idx.start : idx.stop]
elif isinstance(idx, list):
return self.ndarray[idx]
else:
@ -132,12 +132,12 @@ def _create_sequence_from_ndarray(data, num_seq, batch_size):
return seqs
@pytest.mark.parametrize('sample_count', [11, 100, None])
@pytest.mark.parametrize('batch_size', [3, None])
@pytest.mark.parametrize('include_0_and_nan', [False, True])
@pytest.mark.parametrize('num_seq', [1, 3])
@pytest.mark.parametrize("sample_count", [11, 100, None])
@pytest.mark.parametrize("batch_size", [3, None])
@pytest.mark.parametrize("include_0_and_nan", [False, True])
@pytest.mark.parametrize("num_seq", [1, 3])
def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq):
params = {'bin_construct_sample_cnt': sample_count}
params = {"bin_construct_sample_cnt": sample_count}
nrow = 50
half_nrow = nrow // 2
@ -159,8 +159,8 @@ def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq):
X = data[:, :-1]
Y = data[:, -1]
npy_bin_fname = tmpdir / 'data_from_npy.bin'
seq_bin_fname = tmpdir / 'data_from_seq.bin'
npy_bin_fname = tmpdir / "data_from_npy.bin"
seq_bin_fname = tmpdir / "data_from_seq.bin"
# Create dataset from numpy array directly.
ds = lgb.Dataset(X, label=Y, params=params)
@ -181,9 +181,9 @@ def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq):
valid_X = valid_data[:, :-1]
valid_Y = valid_data[:, -1]
valid_npy_bin_fname = tmpdir / 'valid_data_from_npy.bin'
valid_seq_bin_fname = tmpdir / 'valid_data_from_seq.bin'
valid_seq2_bin_fname = tmpdir / 'valid_data_from_seq2.bin'
valid_npy_bin_fname = tmpdir / "valid_data_from_npy.bin"
valid_seq_bin_fname = tmpdir / "valid_data_from_seq.bin"
valid_seq2_bin_fname = tmpdir / "valid_data_from_seq2.bin"
valid_ds = lgb.Dataset(valid_X, label=valid_Y, params=params, reference=ds)
valid_ds.save_binary(valid_npy_bin_fname)
@ -200,7 +200,7 @@ def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq):
assert filecmp.cmp(valid_npy_bin_fname, valid_seq2_bin_fname)
@pytest.mark.parametrize('num_seq', [1, 2])
@pytest.mark.parametrize("num_seq", [1, 2])
def test_sequence_get_data(num_seq):
nrow = 20
ncol = 11
@ -218,12 +218,13 @@ def test_sequence_get_data(num_seq):
def test_chunked_dataset():
X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1,
random_state=2)
X_train, X_test, y_train, y_test = train_test_split(
*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2
)
chunk_size = X_train.shape[0] // 10 + 1
X_train = [X_train[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
X_test = [X_test[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]
X_train = [X_train[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
X_test = [X_test[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]
train_data = lgb.Dataset(X_train, label=y_train, params={"bin_construct_sample_cnt": 100})
valid_data = train_data.create_valid(X_test, label=y_test, params={"bin_construct_sample_cnt": 100})
@@ -232,12 +233,13 @@ def test_chunked_dataset():
def test_chunked_dataset_linear():
X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1,
random_state=2)
X_train, X_test, y_train, y_test = train_test_split(
*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2
)
chunk_size = X_train.shape[0] // 10 + 1
X_train = [X_train[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
X_test = [X_test[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]
params = {"bin_construct_sample_cnt": 100, 'linear_tree': True}
X_train = [X_train[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
X_test = [X_test[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]
params = {"bin_construct_sample_cnt": 100, "linear_tree": True}
train_data = lgb.Dataset(X_train, label=y_train, params=params)
valid_data = train_data.create_valid(X_test, label=y_test, params=params)
train_data.construct()
@@ -246,16 +248,16 @@ def test_chunked_dataset_linear():
def test_save_dataset_subset_and_load_from_file(tmp_path):
data = np.random.rand(100, 2)
params = {'max_bin': 50, 'min_data_in_bin': 10}
params = {"max_bin": 50, "min_data_in_bin": 10}
ds = lgb.Dataset(data, params=params)
ds.subset([1, 2, 3, 5, 8]).save_binary(tmp_path / 'subset.bin')
lgb.Dataset(tmp_path / 'subset.bin', params=params).construct()
ds.subset([1, 2, 3, 5, 8]).save_binary(tmp_path / "subset.bin")
lgb.Dataset(tmp_path / "subset.bin", params=params).construct()
def test_subset_group():
rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank'
X_train, y_train = load_svmlight_file(str(rank_example_dir / 'rank.train'))
q_train = np.loadtxt(str(rank_example_dir / 'rank.train.query'))
rank_example_dir = Path(__file__).absolute().parents[2] / "examples" / "lambdarank"
X_train, y_train = load_svmlight_file(str(rank_example_dir / "rank.train"))
q_train = np.loadtxt(str(rank_example_dir / "rank.train.query"))
lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
assert len(lgb_train.get_group()) == 201
subset = lgb_train.subset(list(range(10))).construct()
@@ -294,7 +296,7 @@ def test_add_features_throws_if_datasets_unconstructed():
def test_add_features_equal_data_on_alternating_used_unused(tmp_path):
X = np.random.random((100, 5))
X[:, [1, 3]] = 0
names = [f'col_{i}' for i in range(5)]
names = [f"col_{i}" for i in range(5)]
for j in range(1, 5):
d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct()
d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()
@@ -304,9 +306,9 @@ def test_add_features_equal_data_on_alternating_used_unused(tmp_path):
d = lgb.Dataset(X, feature_name=names).construct()
dname = tmp_path / "d.txt"
d._dump_text(dname)
with open(d1name, 'rt') as d1f:
with open(d1name, "rt") as d1f:
d1txt = d1f.read()
with open(dname, 'rt') as df:
with open(dname, "rt") as df:
dtxt = df.read()
assert dtxt == d1txt
@@ -314,7 +316,7 @@ def test_add_features_equal_data_on_alternating_used_unused(tmp_path):
def test_add_features_same_booster_behaviour(tmp_path):
X = np.random.random((100, 5))
X[:, [1, 3]] = 0
names = [f'col_{i}' for i in range(5)]
names = [f"col_{i}" for i in range(5)]
for j in range(1, 5):
d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct()
d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()
@@ -332,9 +334,9 @@ def test_add_features_same_booster_behaviour(tmp_path):
d1name = tmp_path / "d1.txt"
b1.save_model(d1name)
b.save_model(dname)
with open(dname, 'rt') as df:
with open(dname, "rt") as df:
dtxt = df.read()
with open(d1name, 'rt') as d1f:
with open(d1name, "rt") as d1f:
d1txt = d1f.read()
assert dtxt == d1txt
@@ -345,11 +347,12 @@ def test_add_features_from_different_sources():
n_col = 5
X = np.random.random((n_row, n_col))
xxs = [X, sparse.csr_matrix(X), pd.DataFrame(X)]
names = [f'col_{i}' for i in range(n_col)]
names = [f"col_{i}" for i in range(n_col)]
seq = _create_sequence_from_ndarray(X, 1, 30)
seq_ds = lgb.Dataset(seq, feature_name=names, free_raw_data=False).construct()
npy_list_ds = lgb.Dataset([X[:n_row // 2, :], X[n_row // 2:, :]],
feature_name=names, free_raw_data=False).construct()
npy_list_ds = lgb.Dataset(
[X[: n_row // 2, :], X[n_row // 2 :, :]], feature_name=names, free_raw_data=False
).construct()
immergeable_dds = [seq_ds, npy_list_ds]
for x_1 in xxs:
# test that method works even with free_raw_data=True
@@ -373,20 +376,19 @@ def test_add_features_from_different_sources():
d1.add_features_from(d2)
assert isinstance(d1.get_data(), original_type)
assert d1.get_data().shape == (n_row, n_col * idx)
res_feature_names += [f'D{idx}_{name}' for name in names]
res_feature_names += [f"D{idx}_{name}" for name in names]
assert d1.feature_name == res_feature_names
def test_add_features_does_not_fail_if_initial_dataset_has_zero_informative_features(capsys):
arr_a = np.zeros((100, 1), dtype=np.float32)
arr_b = np.random.normal(size=(100, 5))
dataset_a = lgb.Dataset(arr_a).construct()
expected_msg = (
'[LightGBM] [Warning] There are no meaningful features which satisfy '
'the provided configuration. Decreasing Dataset parameters min_data_in_bin '
'or min_data_in_leaf and re-constructing Dataset might resolve this warning.\n'
"[LightGBM] [Warning] There are no meaningful features which satisfy "
"the provided configuration. Decreasing Dataset parameters min_data_in_bin "
"or min_data_in_leaf and re-constructing Dataset might resolve this warning.\n"
)
log_lines = capsys.readouterr().out
assert expected_msg in log_lines
@@ -404,7 +406,7 @@ def test_cegb_affects_behavior(tmp_path):
X = np.random.random((100, 5))
X[:, [1, 3]] = 0
y = np.random.random(100)
names = [f'col_{i}' for i in range(5)]
names = [f"col_{i}" for i in range(5)]
ds = lgb.Dataset(X, feature_name=names).construct()
ds.set_label(y)
base = lgb.Booster(train_set=ds)
@@ -412,19 +414,21 @@ def test_cegb_affects_behavior(tmp_path):
base.update()
basename = tmp_path / "basename.txt"
base.save_model(basename)
with open(basename, 'rt') as f:
with open(basename, "rt") as f:
basetxt = f.read()
# Set extremely harsh penalties, so CEGB will block most splits.
cases = [{'cegb_penalty_feature_coupled': [50, 100, 10, 25, 30]},
{'cegb_penalty_feature_lazy': [1, 2, 3, 4, 5]},
{'cegb_penalty_split': 1}]
cases = [
{"cegb_penalty_feature_coupled": [50, 100, 10, 25, 30]},
{"cegb_penalty_feature_lazy": [1, 2, 3, 4, 5]},
{"cegb_penalty_split": 1},
]
for case in cases:
booster = lgb.Booster(train_set=ds, params=case)
for _ in range(10):
booster.update()
casename = tmp_path / "casename.txt"
booster.save_model(casename)
with open(casename, 'rt') as f:
with open(casename, "rt") as f:
casetxt = f.read()
assert basetxt != casetxt
@@ -433,17 +437,22 @@ def test_cegb_scaling_equalities(tmp_path):
X = np.random.random((100, 5))
X[:, [1, 3]] = 0
y = np.random.random(100)
names = [f'col_{i}' for i in range(5)]
names = [f"col_{i}" for i in range(5)]
ds = lgb.Dataset(X, feature_name=names).construct()
ds.set_label(y)
# Compare pairs of penalties, to ensure scaling works as intended
pairs = [({'cegb_penalty_feature_coupled': [1, 2, 1, 2, 1]},
{'cegb_penalty_feature_coupled': [0.5, 1, 0.5, 1, 0.5], 'cegb_tradeoff': 2}),
({'cegb_penalty_feature_lazy': [0.01, 0.02, 0.03, 0.04, 0.05]},
{'cegb_penalty_feature_lazy': [0.005, 0.01, 0.015, 0.02, 0.025], 'cegb_tradeoff': 2}),
({'cegb_penalty_split': 1},
{'cegb_penalty_split': 2, 'cegb_tradeoff': 0.5})]
for (p1, p2) in pairs:
pairs = [
(
{"cegb_penalty_feature_coupled": [1, 2, 1, 2, 1]},
{"cegb_penalty_feature_coupled": [0.5, 1, 0.5, 1, 0.5], "cegb_tradeoff": 2},
),
(
{"cegb_penalty_feature_lazy": [0.01, 0.02, 0.03, 0.04, 0.05]},
{"cegb_penalty_feature_lazy": [0.005, 0.01, 0.015, 0.02, 0.025], "cegb_tradeoff": 2},
),
({"cegb_penalty_split": 1}, {"cegb_penalty_split": 2, "cegb_tradeoff": 0.5}),
]
for p1, p2 in pairs:
booster1 = lgb.Booster(train_set=ds, params=p1)
booster2 = lgb.Booster(train_set=ds, params=p2)
for _ in range(10):
@@ -453,32 +462,30 @@ def test_cegb_scaling_equalities(tmp_path):
# Reset booster1's parameters to p2, so the parameter section of the file matches.
booster1.reset_parameter(p2)
booster1.save_model(p1name)
with open(p1name, 'rt') as f:
with open(p1name, "rt") as f:
p1txt = f.read()
p2name = tmp_path / "p2.txt"
booster2.save_model(p2name)
with open(p2name, 'rt') as f:
with open(p2name, "rt") as f:
p2txt = f.read()
assert p1txt == p2txt
def test_consistent_state_for_dataset_fields():
def check_asserts(data):
np.testing.assert_allclose(data.label, data.get_label())
np.testing.assert_allclose(data.label, data.get_field('label'))
np.testing.assert_allclose(data.label, data.get_field("label"))
assert not np.isnan(data.label[0])
assert not np.isinf(data.label[1])
np.testing.assert_allclose(data.weight, data.get_weight())
np.testing.assert_allclose(data.weight, data.get_field('weight'))
np.testing.assert_allclose(data.weight, data.get_field("weight"))
assert not np.isnan(data.weight[0])
assert not np.isinf(data.weight[1])
np.testing.assert_allclose(data.init_score, data.get_init_score())
np.testing.assert_allclose(data.init_score, data.get_field('init_score'))
np.testing.assert_allclose(data.init_score, data.get_field("init_score"))
assert not np.isnan(data.init_score[0])
assert not np.isinf(data.init_score[1])
assert np.all(np.isclose([data.label[0], data.weight[0], data.init_score[0]],
data.label[0]))
assert np.all(np.isclose([data.label[0], data.weight[0], data.init_score[0]], data.label[0]))
assert data.label[1] == pytest.approx(data.weight[1])
assert data.feature_name == data.get_feature_name()
@@ -486,10 +493,8 @@ def test_consistent_state_for_dataset_fields():
sequence = np.ones(y.shape[0])
sequence[0] = np.nan
sequence[1] = np.inf
feature_names = [f'f{i}'for i in range(X.shape[1])]
lgb_data = lgb.Dataset(X, sequence,
weight=sequence, init_score=sequence,
feature_name=feature_names).construct()
feature_names = [f"f{i}" for i in range(X.shape[1])]
lgb_data = lgb.Dataset(X, sequence, weight=sequence, init_score=sequence, feature_name=feature_names).construct()
check_asserts(lgb_data)
lgb_data = lgb.Dataset(X, y).construct()
lgb_data.set_label(sequence)
@@ -500,20 +505,15 @@ def test_consistent_state_for_dataset_fields():
def test_dataset_construction_overwrites_user_provided_metadata_fields():
X = np.array([[1.0, 2.0], [3.0, 4.0]])
position = np.array([0.0, 1.0], dtype=np.float32)
if getenv('TASK', '') == 'cuda':
if getenv("TASK", "") == "cuda":
position = None
dtrain = lgb.Dataset(
X,
params={
"min_data_in_bin": 1,
"min_data_in_leaf": 1,
"verbosity": -1
},
params={"min_data_in_bin": 1, "min_data_in_leaf": 1, "verbosity": -1},
group=[1, 1],
init_score=[0.312, 0.708],
label=[1, 2],
@@ -528,17 +528,9 @@ def test_dataset_construction_overwrites_user_provided_metadata_fields():
assert dtrain.get_init_score() == [0.312, 0.708]
assert dtrain.label == [1, 2]
assert dtrain.get_label() == [1, 2]
if getenv('TASK', '') != 'cuda':
np_assert_array_equal(
dtrain.position,
np.array([0.0, 1.0], dtype=np.float32),
strict=True
)
np_assert_array_equal(
dtrain.get_position(),
np.array([0.0, 1.0], dtype=np.float32),
strict=True
)
if getenv("TASK", "") != "cuda":
np_assert_array_equal(dtrain.position, np.array([0.0, 1.0], dtype=np.float32), strict=True)
np_assert_array_equal(dtrain.get_position(), np.array([0.0, 1.0], dtype=np.float32), strict=True)
assert dtrain.weight == [0.5, 1.5]
assert dtrain.get_weight() == [0.5, 1.5]
@@ -554,13 +546,11 @@ def test_dataset_construction_overwrites_user_provided_metadata_fields():
np_assert_array_equal(dtrain.group, expected_group, strict=True)
np_assert_array_equal(dtrain.get_group(), expected_group, strict=True)
# get_field("group") returns a numpy array with boundaries, instead of size
np_assert_array_equal(
dtrain.get_field("group"),
np.array([0, 1, 2], dtype=np.int32),
strict=True
)
np_assert_array_equal(dtrain.get_field("group"), np.array([0, 1, 2], dtype=np.int32), strict=True)
expected_init_score = np.array([0.312, 0.708],)
expected_init_score = np.array(
[0.312, 0.708],
)
np_assert_array_equal(dtrain.init_score, expected_init_score, strict=True)
np_assert_array_equal(dtrain.get_init_score(), expected_init_score, strict=True)
np_assert_array_equal(dtrain.get_field("init_score"), expected_init_score, strict=True)
@@ -570,16 +560,12 @@ def test_dataset_construction_overwrites_user_provided_metadata_fields():
np_assert_array_equal(dtrain.get_label(), expected_label, strict=True)
np_assert_array_equal(dtrain.get_field("label"), expected_label, strict=True)
if getenv('TASK', '') != 'cuda':
if getenv("TASK", "") != "cuda":
expected_position = np.array([0.0, 1.0], dtype=np.float32)
np_assert_array_equal(dtrain.position, expected_position, strict=True)
np_assert_array_equal(dtrain.get_position(), expected_position, strict=True)
# NOTE: "position" is converted to int32 on the C++ side
np_assert_array_equal(
dtrain.get_field("position"),
np.array([0.0, 1.0], dtype=np.int32),
strict=True
)
np_assert_array_equal(dtrain.get_field("position"), np.array([0.0, 1.0], dtype=np.int32), strict=True)
expected_weight = np.array([0.5, 1.5], dtype=np.float32)
np_assert_array_equal(dtrain.weight, expected_weight, strict=True)
@@ -588,7 +574,6 @@ def test_dataset_construction_overwrites_user_provided_metadata_fields():
def test_choose_param_value():
original_params = {
"local_listen_port": 1234,
"port": 2222,
@@ -599,30 +584,20 @@ def test_choose_param_value():
# should resolve duplicate aliases, and prefer the main parameter
params = lgb.basic._choose_param_value(
main_param_name="local_listen_port",
params=original_params,
default_value=5555
main_param_name="local_listen_port", params=original_params, default_value=5555
)
assert params["local_listen_port"] == 1234
assert "port" not in params
# should choose the highest priority alias and set that value on main param
# if only aliases are used
params = lgb.basic._choose_param_value(
main_param_name="num_iterations",
params=params,
default_value=17
)
params = lgb.basic._choose_param_value(main_param_name="num_iterations", params=params, default_value=17)
assert params["num_iterations"] == 13
assert "num_trees" not in params
assert "n_iter" not in params
# should use the default if main param and aliases are missing
params = lgb.basic._choose_param_value(
main_param_name="learning_rate",
params=params,
default_value=0.789
)
params = lgb.basic._choose_param_value(main_param_name="learning_rate", params=params, default_value=0.789)
assert params["learning_rate"] == 0.789
# all changes should be made on copies and not modify the original
@@ -637,37 +612,23 @@ def test_choose_param_value():
def test_choose_param_value_preserves_nones():
# preserves None found for main param and still removes aliases
params = lgb.basic._choose_param_value(
main_param_name="num_threads",
params={
"num_threads": None,
"n_jobs": 4,
"objective": "regression"
},
default_value=2
params={"num_threads": None, "n_jobs": 4, "objective": "regression"},
default_value=2,
)
assert params == {"num_threads": None, "objective": "regression"}
# correctly chooses value when only an alias is provided
params = lgb.basic._choose_param_value(
main_param_name="num_threads",
params={
"n_jobs": None,
"objective": "regression"
},
default_value=2
main_param_name="num_threads", params={"n_jobs": None, "objective": "regression"}, default_value=2
)
assert params == {"num_threads": None, "objective": "regression"}
# adds None if that's given as the default and param not found
params = lgb.basic._choose_param_value(
main_param_name="min_data_in_leaf",
params={
"objective": "regression"
},
default_value=None
main_param_name="min_data_in_leaf", params={"objective": "regression"}, default_value=None
)
assert params == {"objective": "regression", "min_data_in_leaf": None}
@@ -676,51 +637,39 @@ def test_choose_param_value_preserves_nones():
def test_choose_param_value_objective(objective_alias):
# If callable is found in objective
params = {objective_alias: dummy_obj}
params = lgb.basic._choose_param_value(
main_param_name="objective",
params=params,
default_value=None
)
assert params['objective'] == dummy_obj
params = lgb.basic._choose_param_value(main_param_name="objective", params=params, default_value=None)
assert params["objective"] == dummy_obj
# Value in params should be preferred to the default_value passed from keyword arguments
params = {objective_alias: dummy_obj}
params = lgb.basic._choose_param_value(
main_param_name="objective",
params=params,
default_value=mse_obj
)
assert params['objective'] == dummy_obj
params = lgb.basic._choose_param_value(main_param_name="objective", params=params, default_value=mse_obj)
assert params["objective"] == dummy_obj
# None of objective or its aliases in params, but default_value is callable.
params = {}
params = lgb.basic._choose_param_value(
main_param_name="objective",
params=params,
default_value=mse_obj
)
assert params['objective'] == mse_obj
params = lgb.basic._choose_param_value(main_param_name="objective", params=params, default_value=mse_obj)
assert params["objective"] == mse_obj
@pytest.mark.parametrize('collection', ['1d_np', '2d_np', 'pd_float', 'pd_str', '1d_list', '2d_list'])
@pytest.mark.parametrize('dtype', [np.float32, np.float64])
@pytest.mark.parametrize("collection", ["1d_np", "2d_np", "pd_float", "pd_str", "1d_list", "2d_list"])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_list_to_1d_numpy(collection, dtype):
collection2y = {
'1d_np': np.random.rand(10),
'2d_np': np.random.rand(10, 1),
'pd_float': np.random.rand(10),
'pd_str': ['a', 'b'],
'1d_list': [1] * 10,
'2d_list': [[1], [2]],
"1d_np": np.random.rand(10),
"2d_np": np.random.rand(10, 1),
"pd_float": np.random.rand(10),
"pd_str": ["a", "b"],
"1d_list": [1] * 10,
"2d_list": [[1], [2]],
}
y = collection2y[collection]
if collection.startswith('pd'):
if collection.startswith("pd"):
if not PANDAS_INSTALLED:
pytest.skip('pandas is not installed')
pytest.skip("pandas is not installed")
else:
y = pd_Series(y)
if isinstance(y, np.ndarray) and len(y.shape) == 2:
with pytest.warns(UserWarning, match='column-vector'):
with pytest.warns(UserWarning, match="column-vector"):
lgb.basic._list_to_1d_numpy(y, dtype=np.float32, name="list")
return
elif isinstance(y, list) and isinstance(y[0], list):
@@ -736,30 +685,31 @@ def test_list_to_1d_numpy(collection, dtype):
assert result.dtype == dtype
@pytest.mark.parametrize('init_score_type', ['array', 'dataframe', 'list'])
@pytest.mark.parametrize("init_score_type", ["array", "dataframe", "list"])
def test_init_score_for_multiclass_classification(init_score_type):
init_score = [[i * 10 + j for j in range(3)] for i in range(10)]
if init_score_type == 'array':
if init_score_type == "array":
init_score = np.array(init_score)
elif init_score_type == 'dataframe':
elif init_score_type == "dataframe":
if not PANDAS_INSTALLED:
pytest.skip('Pandas is not installed.')
pytest.skip("Pandas is not installed.")
init_score = pd_DataFrame(init_score)
data = np.random.rand(10, 2)
ds = lgb.Dataset(data, init_score=init_score).construct()
np.testing.assert_equal(ds.get_field('init_score'), init_score)
np.testing.assert_equal(ds.get_field("init_score"), init_score)
np.testing.assert_equal(ds.init_score, init_score)
def test_smoke_custom_parser(tmp_path):
data_path = Path(__file__).absolute().parents[2] / 'examples' / 'binary_classification' / 'binary.train'
parser_config_file = tmp_path / 'parser.ini'
with open(parser_config_file, 'w') as fout:
data_path = Path(__file__).absolute().parents[2] / "examples" / "binary_classification" / "binary.train"
parser_config_file = tmp_path / "parser.ini"
with open(parser_config_file, "w") as fout:
fout.write('{"className": "dummy", "id": "1"}')
data = lgb.Dataset(data_path, params={"parser_config_file": parser_config_file})
with pytest.raises(lgb.basic.LightGBMError,
match="Cannot find parser class 'dummy', please register first or check config format"):
with pytest.raises(
lgb.basic.LightGBMError, match="Cannot find parser class 'dummy', please register first or check config format"
):
data.construct()
@@ -770,9 +720,13 @@ def test_param_aliases():
assert all(isinstance(i, list) for i in aliases.values())
assert all(len(i) >= 1 for i in aliases.values())
assert all(k in v for k, v in aliases.items())
assert lgb.basic._ConfigAliases.get('config', 'task') == {'config', 'config_file', 'task', 'task_type'}
assert lgb.basic._ConfigAliases.get_sorted('min_data_in_leaf') == [
'min_data_in_leaf', 'min_data', 'min_samples_leaf', 'min_child_samples', 'min_data_per_leaf'
assert lgb.basic._ConfigAliases.get("config", "task") == {"config", "config_file", "task", "task_type"}
assert lgb.basic._ConfigAliases.get_sorted("min_data_in_leaf") == [
"min_data_in_leaf",
"min_data",
"min_samples_leaf",
"min_child_samples",
"min_data_per_leaf",
]
@@ -793,10 +747,10 @@ def test_custom_objective_safety():
y_multiclass = np.arange(nrows) % nclass
ds_binary = lgb.Dataset(X, y_binary).construct()
ds_multiclass = lgb.Dataset(X, y_multiclass).construct()
bad_bst_binary = lgb.Booster({'objective': "none"}, ds_binary)
good_bst_binary = lgb.Booster({'objective': "none"}, ds_binary)
bad_bst_multi = lgb.Booster({'objective': "none", "num_class": nclass}, ds_multiclass)
good_bst_multi = lgb.Booster({'objective': "none", "num_class": nclass}, ds_multiclass)
bad_bst_binary = lgb.Booster({"objective": "none"}, ds_binary)
good_bst_binary = lgb.Booster({"objective": "none"}, ds_binary)
bad_bst_multi = lgb.Booster({"objective": "none", "num_class": nclass}, ds_multiclass)
good_bst_multi = lgb.Booster({"objective": "none", "num_class": nclass}, ds_multiclass)
good_bst_binary.update(fobj=_good_gradients)
with pytest.raises(ValueError, match=re.escape("number of models per one iteration (1)")):
bad_bst_binary.update(fobj=_bad_gradients)
@@ -805,33 +759,30 @@ def test_custom_objective_safety():
bad_bst_multi.update(fobj=_bad_gradients)
@pytest.mark.parametrize('dtype', [np.float32, np.float64])
@pytest.mark.parametrize('feature_name', [['x1', 'x2'], 'auto'])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("feature_name", [["x1", "x2"], "auto"])
def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name):
pd = pytest.importorskip('pandas')
pd = pytest.importorskip("pandas")
X = np.random.rand(10, 2).astype(dtype)
df = pd.DataFrame(X)
built_data = lgb.basic._data_from_pandas(
data=df,
feature_name=feature_name,
categorical_feature="auto",
pandas_categorical=None
data=df, feature_name=feature_name, categorical_feature="auto", pandas_categorical=None
)[0]
assert built_data.dtype == dtype
assert np.shares_memory(X, built_data)
@pytest.mark.parametrize('feature_name', [['x1'], [42], 'auto'])
@pytest.mark.parametrize('categories', ['seen', 'unseen'])
@pytest.mark.parametrize("feature_name", [["x1"], [42], "auto"])
@pytest.mark.parametrize("categories", ["seen", "unseen"])
def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, categories):
pd = pytest.importorskip('pandas')
X = np.random.choice(['a', 'b'], 100).reshape(-1, 1)
column_name = 'a' if feature_name == 'auto' else feature_name[0]
df = pd.DataFrame(X.copy(), columns=[column_name], dtype='category')
if categories == 'seen':
pandas_categorical = [['a', 'b']]
pd = pytest.importorskip("pandas")
X = np.random.choice(["a", "b"], 100).reshape(-1, 1)
column_name = "a" if feature_name == "auto" else feature_name[0]
df = pd.DataFrame(X.copy(), columns=[column_name], dtype="category")
if categories == "seen":
pandas_categorical = [["a", "b"]]
else:
pandas_categorical = [['a']]
pandas_categorical = [["a"]]
data = lgb.basic._data_from_pandas(
data=df,
feature_name=feature_name,
@@ -841,31 +792,33 @@ def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, c
# check that the original data wasn't modified
np.testing.assert_equal(df[column_name], X[:, 0])
# check that the built data has the codes
if categories == 'seen':
if categories == "seen":
# if all categories were seen during training we just take the codes
codes = df[column_name].cat.codes
else:
# if we only saw 'a' during training we just replace its code
# and leave the rest as nan
a_code = df[column_name].cat.categories.get_loc('a')
codes = np.where(df[column_name] == 'a', a_code, np.nan)
a_code = df[column_name].cat.categories.get_loc("a")
codes = np.where(df[column_name] == "a", a_code, np.nan)
np.testing.assert_equal(codes, data[:, 0])
@pytest.mark.parametrize('min_data_in_bin', [2, 10])
@pytest.mark.parametrize("min_data_in_bin", [2, 10])
def test_feature_num_bin(min_data_in_bin):
X = np.vstack([
np.random.rand(100),
np.array([1, 2] * 50),
np.array([0, 1, 2] * 33 + [0]),
np.array([1, 2] * 49 + 2 * [np.nan]),
np.zeros(100),
np.random.choice([0, 1], 100),
]).T
X = np.vstack(
[
np.random.rand(100),
np.array([1, 2] * 50),
np.array([0, 1, 2] * 33 + [0]),
np.array([1, 2] * 49 + 2 * [np.nan]),
np.zeros(100),
np.random.choice([0, 1], 100),
]
).T
n_continuous = X.shape[1] - 1
feature_name = [f'x{i}' for i in range(n_continuous)] + ['cat1']
feature_name = [f"x{i}" for i in range(n_continuous)] + ["cat1"]
ds_kwargs = {
"params": {'min_data_in_bin': min_data_in_bin},
"params": {"min_data_in_bin": min_data_in_bin},
"categorical_feature": [n_continuous], # last feature
}
ds = lgb.Dataset(X, feature_name=feature_name, **ds_kwargs).construct()
@@ -884,7 +837,7 @@ def test_feature_num_bin(min_data_in_bin):
assert bins_by_name == expected_num_bins
# test using default feature names
ds_no_names = lgb.Dataset(X, **ds_kwargs).construct()
default_names = [f'Column_{i}' for i in range(X.shape[1])]
default_names = [f"Column_{i}" for i in range(X.shape[1])]
bins_by_default_name = [ds_no_names.feature_num_bin(name) for name in default_names]
assert bins_by_default_name == expected_num_bins
# check for feature indices outside of range
@@ -892,9 +845,9 @@ def test_feature_num_bin(min_data_in_bin):
with pytest.raises(
lgb.basic.LightGBMError,
match=(
f'Tried to retrieve number of bins for feature index {num_features}, '
f'but the valid feature indices are \\[0, {num_features - 1}\\].'
)
f"Tried to retrieve number of bins for feature index {num_features}, "
f"but the valid feature indices are \\[0, {num_features - 1}\\]."
),
):
ds.feature_num_bin(num_features)
@@ -902,7 +855,7 @@ def test_feature_num_bin_with_max_bin_by_feature():
def test_feature_num_bin_with_max_bin_by_feature():
X = np.random.rand(100, 3)
max_bin_by_feature = np.random.randint(3, 30, size=X.shape[1])
ds = lgb.Dataset(X, params={'max_bin_by_feature': max_bin_by_feature}).construct()
ds = lgb.Dataset(X, params={"max_bin_by_feature": max_bin_by_feature}).construct()
actual_num_bins = [ds.feature_num_bin(i) for i in range(X.shape[1])]
np.testing.assert_equal(actual_num_bins, max_bin_by_feature)
@@ -910,7 +863,7 @@ def test_set_leaf_output():
def test_set_leaf_output():
X, y = load_breast_cancer(return_X_y=True)
ds = lgb.Dataset(X, y)
bst = lgb.Booster({'num_leaves': 2}, ds)
bst = lgb.Booster({"num_leaves": 2}, ds)
bst.update()
y_pred = bst.predict(X)
for leaf_id in range(2):

View file

@@ -10,7 +10,7 @@ def reset_feature_fraction(boosting_round):
return 0.6 if boosting_round < 15 else 0.8
@pytest.mark.parametrize('serializer', SERIALIZERS)
@pytest.mark.parametrize("serializer", SERIALIZERS)
def test_early_stopping_callback_is_picklable(serializer):
rounds = 5
callback = lgb.early_stopping(stopping_rounds=rounds)
@@ -32,7 +32,7 @@ def test_early_stopping_callback_rejects_invalid_stopping_rounds_with_informativ
lgb.early_stopping(stopping_rounds="neverrrr")
@pytest.mark.parametrize('serializer', SERIALIZERS)
@pytest.mark.parametrize("serializer", SERIALIZERS)
def test_log_evaluation_callback_is_picklable(serializer):
periods = 42
callback = lgb.log_evaluation(period=periods)
@@ -43,7 +43,7 @@ def test_log_evaluation_callback_is_picklable(serializer):
assert callback.period == periods
@pytest.mark.parametrize('serializer', SERIALIZERS)
@pytest.mark.parametrize("serializer", SERIALIZERS)
def test_record_evaluation_callback_is_picklable(serializer):
results = {}
callback = lgb.record_evaluation(eval_result=results)
@@ -54,12 +54,9 @@ def test_record_evaluation_callback_is_picklable(serializer):
assert callback.eval_result is results
@pytest.mark.parametrize('serializer', SERIALIZERS)
@pytest.mark.parametrize("serializer", SERIALIZERS)
def test_reset_parameter_callback_is_picklable(serializer):
params = {
'bagging_fraction': [0.7] * 5 + [0.6] * 5,
'feature_fraction': reset_feature_fraction
}
params = {"bagging_fraction": [0.7] * 5 + [0.6] * 5, "feature_fraction": reset_feature_fraction}
callback = lgb.reset_parameter(**params)
callback_from_disk = pickle_and_unpickle_object(obj=callback, serializer=serializer)
assert callback_from_disk.order == 10

View file

@@ -6,22 +6,21 @@ from sklearn.datasets import load_svmlight_file
import lightgbm as lgb
EXAMPLES_DIR = Path(__file__).absolute().parents[2] / 'examples'
EXAMPLES_DIR = Path(__file__).absolute().parents[2] / "examples"
class FileLoader:
def __init__(self, directory, prefix, config_file='train.conf'):
def __init__(self, directory, prefix, config_file="train.conf"):
self.directory = directory
self.prefix = prefix
self.params = {'gpu_use_dp': True}
with open(self.directory / config_file, 'r') as f:
self.params = {"gpu_use_dp": True}
with open(self.directory / config_file, "r") as f:
for line in f.readlines():
line = line.strip()
if line and not line.startswith('#'):
key, value = [token.strip() for token in line.split('=')]
if 'early_stopping' not in key: # disable early_stopping
self.params[key] = value if key not in {'num_trees', 'num_threads'} else int(value)
if line and not line.startswith("#"):
key, value = [token.strip() for token in line.split("=")]
if "early_stopping" not in key: # disable early_stopping
self.params[key] = value if key not in {"num_trees", "num_threads"} else int(value)
def load_dataset(self, suffix, is_sparse=False):
filename = str(self.path(suffix))
@@ -33,14 +32,14 @@ class FileLoader:
return mat[:, 1:], mat[:, 0], filename
def load_field(self, suffix):
return np.loadtxt(str(self.directory / f'{self.prefix}{suffix}'))
return np.loadtxt(str(self.directory / f"{self.prefix}{suffix}"))
def load_cpp_result(self, result_file='LightGBM_predict_result.txt'):
def load_cpp_result(self, result_file="LightGBM_predict_result.txt"):
return np.loadtxt(str(self.directory / result_file))
def train_predict_check(self, lgb_train, X_test, X_test_fn, sk_pred):
params = dict(self.params)
params['force_row_wise'] = True
params["force_row_wise"] = True
gbm = lgb.train(params, lgb_train)
y_pred = gbm.predict(X_test)
cpp_pred = gbm.predict(X_test_fn)
@@ -49,7 +48,7 @@ class FileLoader:
def file_load_check(self, lgb_train, name):
lgb_train_f = lgb.Dataset(self.path(name), params=self.params).construct()
for f in ('num_data', 'num_feature', 'get_label', 'get_weight', 'get_init_score', 'get_group'):
for f in ("num_data", "num_feature", "get_label", "get_weight", "get_init_score", "get_group"):
a = getattr(lgb_train, f)()
b = getattr(lgb_train_f, f)()
if a is None and b is None:
@@ -62,83 +61,83 @@ class FileLoader:
assert a == b, f
def path(self, suffix):
return self.directory / f'{self.prefix}{suffix}'
return self.directory / f"{self.prefix}{suffix}"
def test_binary():
fd = FileLoader(EXAMPLES_DIR / 'binary_classification', 'binary')
X_train, y_train, _ = fd.load_dataset('.train')
X_test, _, X_test_fn = fd.load_dataset('.test')
weight_train = fd.load_field('.train.weight')
fd = FileLoader(EXAMPLES_DIR / "binary_classification", "binary")
X_train, y_train, _ = fd.load_dataset(".train")
X_test, _, X_test_fn = fd.load_dataset(".test")
weight_train = fd.load_field(".train.weight")
lgb_train = lgb.Dataset(X_train, y_train, params=fd.params, weight=weight_train)
gbm = lgb.LGBMClassifier(**fd.params)
gbm.fit(X_train, y_train, sample_weight=weight_train)
sk_pred = gbm.predict_proba(X_test)[:, 1]
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
fd.file_load_check(lgb_train, '.train')
fd.file_load_check(lgb_train, ".train")
def test_binary_linear():
fd = FileLoader(EXAMPLES_DIR / 'binary_classification', 'binary', 'train_linear.conf')
X_train, y_train, _ = fd.load_dataset('.train')
X_test, _, X_test_fn = fd.load_dataset('.test')
weight_train = fd.load_field('.train.weight')
fd = FileLoader(EXAMPLES_DIR / "binary_classification", "binary", "train_linear.conf")
X_train, y_train, _ = fd.load_dataset(".train")
X_test, _, X_test_fn = fd.load_dataset(".test")
weight_train = fd.load_field(".train.weight")
lgb_train = lgb.Dataset(X_train, y_train, params=fd.params, weight=weight_train)
gbm = lgb.LGBMClassifier(**fd.params)
gbm.fit(X_train, y_train, sample_weight=weight_train)
sk_pred = gbm.predict_proba(X_test)[:, 1]
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
fd.file_load_check(lgb_train, '.train')
fd.file_load_check(lgb_train, ".train")
def test_multiclass():
fd = FileLoader(EXAMPLES_DIR / 'multiclass_classification', 'multiclass')
X_train, y_train, _ = fd.load_dataset('.train')
X_test, _, X_test_fn = fd.load_dataset('.test')
fd = FileLoader(EXAMPLES_DIR / "multiclass_classification", "multiclass")
X_train, y_train, _ = fd.load_dataset(".train")
X_test, _, X_test_fn = fd.load_dataset(".test")
lgb_train = lgb.Dataset(X_train, y_train)
gbm = lgb.LGBMClassifier(**fd.params)
gbm.fit(X_train, y_train)
sk_pred = gbm.predict_proba(X_test)
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
fd.file_load_check(lgb_train, '.train')
fd.file_load_check(lgb_train, ".train")
def test_regression():
fd = FileLoader(EXAMPLES_DIR / 'regression', 'regression')
X_train, y_train, _ = fd.load_dataset('.train')
X_test, _, X_test_fn = fd.load_dataset('.test')
init_score_train = fd.load_field('.train.init')
fd = FileLoader(EXAMPLES_DIR / "regression", "regression")
X_train, y_train, _ = fd.load_dataset(".train")
X_test, _, X_test_fn = fd.load_dataset(".test")
init_score_train = fd.load_field(".train.init")
lgb_train = lgb.Dataset(X_train, y_train, init_score=init_score_train)
gbm = lgb.LGBMRegressor(**fd.params)
gbm.fit(X_train, y_train, init_score=init_score_train)
sk_pred = gbm.predict(X_test)
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
fd.file_load_check(lgb_train, '.train')
fd.file_load_check(lgb_train, ".train")
def test_lambdarank():
fd = FileLoader(EXAMPLES_DIR / 'lambdarank', 'rank')
X_train, y_train, _ = fd.load_dataset('.train', is_sparse=True)
X_test, _, X_test_fn = fd.load_dataset('.test', is_sparse=True)
group_train = fd.load_field('.train.query')
fd = FileLoader(EXAMPLES_DIR / "lambdarank", "rank")
X_train, y_train, _ = fd.load_dataset(".train", is_sparse=True)
X_test, _, X_test_fn = fd.load_dataset(".test", is_sparse=True)
group_train = fd.load_field(".train.query")
lgb_train = lgb.Dataset(X_train, y_train, group=group_train)
params = dict(fd.params)
params['force_col_wise'] = True
params["force_col_wise"] = True
gbm = lgb.LGBMRanker(**params)
gbm.fit(X_train, y_train, group=group_train)
sk_pred = gbm.predict(X_test)
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
fd.file_load_check(lgb_train, '.train')
fd.file_load_check(lgb_train, ".train")
def test_xendcg():
fd = FileLoader(EXAMPLES_DIR / 'xendcg', 'rank')
X_train, y_train, _ = fd.load_dataset('.train', is_sparse=True)
X_test, _, X_test_fn = fd.load_dataset('.test', is_sparse=True)
group_train = fd.load_field('.train.query')
fd = FileLoader(EXAMPLES_DIR / "xendcg", "rank")
X_train, y_train, _ = fd.load_dataset(".train", is_sparse=True)
X_test, _, X_test_fn = fd.load_dataset(".test", is_sparse=True)
group_train = fd.load_field(".train.query")
lgb_train = lgb.Dataset(X_train, y_train, group=group_train)
gbm = lgb.LGBMRanker(**fd.params)
gbm.fit(X_train, y_train, group=group_train)
sk_pred = gbm.predict(X_test)
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
fd.file_load_check(lgb_train, '.train')
fd.file_load_check(lgb_train, ".train")

The diff for this file is not shown because of its large size.

View file

@@ -28,7 +28,7 @@ def test_cpu_and_gpu_work():
params_gpu = params_cpu.copy()
params_gpu["device"] = "gpu"
# Double-precision floats are only supported on x86_64 with PoCL
params_gpu["gpu_use_dp"] = (platform.machine() == "x86_64")
params_gpu["gpu_use_dp"] = platform.machine() == "x86_64"
gpu_bst = lgb.train(params_gpu, data, num_boost_round=10)
gpu_score = log_loss(y, gpu_bst.predict(X))

The diff for this file is not shown because of its large size.

View file

@@ -9,7 +9,8 @@ from lightgbm.compat import GRAPHVIZ_INSTALLED, MATPLOTLIB_INSTALLED, PANDAS_INS
if MATPLOTLIB_INSTALLED:
import matplotlib
matplotlib.use('Agg')
matplotlib.use("Agg")
if GRAPHVIZ_INSTALLED:
import graphviz
@@ -18,8 +19,7 @@ from .utils import load_breast_cancer, make_synthetic_regression
@pytest.fixture(scope="module")
def breast_cancer_split():
return train_test_split(*load_breast_cancer(return_X_y=True),
test_size=0.1, random_state=1)
return train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=1)
def _categorical_data(category_values_lower_bound, category_values_upper_bound):
@@ -41,51 +41,51 @@ def train_data(breast_cancer_split):
@pytest.fixture
def params():
return {"objective": "binary",
"verbose": -1,
"num_leaves": 3}
return {"objective": "binary", "verbose": -1, "num_leaves": 3}
@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason='matplotlib is not installed')
@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason="matplotlib is not installed")
def test_plot_importance(params, breast_cancer_split, train_data):
X_train, _, y_train, _ = breast_cancer_split
gbm0 = lgb.train(params, train_data, num_boost_round=10)
ax0 = lgb.plot_importance(gbm0)
assert isinstance(ax0, matplotlib.axes.Axes)
assert ax0.get_title() == 'Feature importance'
assert ax0.get_xlabel() == 'Feature importance'
assert ax0.get_ylabel() == 'Features'
assert ax0.get_title() == "Feature importance"
assert ax0.get_xlabel() == "Feature importance"
assert ax0.get_ylabel() == "Features"
assert len(ax0.patches) <= 30
gbm1 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1)
gbm1.fit(X_train, y_train)
ax1 = lgb.plot_importance(gbm1, color='r', title='t', xlabel='x', ylabel='y')
ax1 = lgb.plot_importance(gbm1, color="r", title="t", xlabel="x", ylabel="y")
assert isinstance(ax1, matplotlib.axes.Axes)
assert ax1.get_title() == 't'
assert ax1.get_xlabel() == 'x'
assert ax1.get_ylabel() == 'y'
assert ax1.get_title() == "t"
assert ax1.get_xlabel() == "x"
assert ax1.get_ylabel() == "y"
assert len(ax1.patches) <= 30
for patch in ax1.patches:
assert patch.get_facecolor() == (1., 0, 0, 1.) # red
assert patch.get_facecolor() == (1.0, 0, 0, 1.0) # red
ax2 = lgb.plot_importance(gbm0, color=['r', 'y', 'g', 'b'], title=None, xlabel=None, ylabel=None)
ax2 = lgb.plot_importance(gbm0, color=["r", "y", "g", "b"], title=None, xlabel=None, ylabel=None)
assert isinstance(ax2, matplotlib.axes.Axes)
assert ax2.get_title() == ''
assert ax2.get_xlabel() == ''
assert ax2.get_ylabel() == ''
assert ax2.get_title() == ""
assert ax2.get_xlabel() == ""
assert ax2.get_ylabel() == ""
assert len(ax2.patches) <= 30
assert ax2.patches[0].get_facecolor() == (1., 0, 0, 1.) # r
assert ax2.patches[1].get_facecolor() == (.75, .75, 0, 1.) # y
assert ax2.patches[2].get_facecolor() == (0, .5, 0, 1.) # g
assert ax2.patches[3].get_facecolor() == (0, 0, 1., 1.) # b
assert ax2.patches[0].get_facecolor() == (1.0, 0, 0, 1.0) # r
assert ax2.patches[1].get_facecolor() == (0.75, 0.75, 0, 1.0) # y
assert ax2.patches[2].get_facecolor() == (0, 0.5, 0, 1.0) # g
assert ax2.patches[3].get_facecolor() == (0, 0, 1.0, 1.0) # b
ax3 = lgb.plot_importance(gbm0, title='t @importance_type@', xlabel='x @importance_type@', ylabel='y @importance_type@')
ax3 = lgb.plot_importance(
gbm0, title="t @importance_type@", xlabel="x @importance_type@", ylabel="y @importance_type@"
)
assert isinstance(ax3, matplotlib.axes.Axes)
assert ax3.get_title() == 't @importance_type@'
assert ax3.get_xlabel() == 'x split'
assert ax3.get_ylabel() == 'y @importance_type@'
assert ax3.get_title() == "t @importance_type@"
assert ax3.get_xlabel() == "x split"
assert ax3.get_ylabel() == "y @importance_type@"
assert len(ax3.patches) <= 30
gbm2 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1, importance_type="gain")
@@ -108,51 +108,59 @@ def test_plot_importance(params, breast_cancer_split, train_data):
assert first_bar1 != first_bar3
@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason='matplotlib is not installed')
@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason="matplotlib is not installed")
def test_plot_split_value_histogram(params, breast_cancer_split, train_data):
X_train, _, y_train, _ = breast_cancer_split
gbm0 = lgb.train(params, train_data, num_boost_round=10)
ax0 = lgb.plot_split_value_histogram(gbm0, 27)
assert isinstance(ax0, matplotlib.axes.Axes)
assert ax0.get_title() == 'Split value histogram for feature with index 27'
assert ax0.get_xlabel() == 'Feature split value'
assert ax0.get_ylabel() == 'Count'
assert ax0.get_title() == "Split value histogram for feature with index 27"
assert ax0.get_xlabel() == "Feature split value"
assert ax0.get_ylabel() == "Count"
assert len(ax0.patches) <= 2
gbm1 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1)
gbm1.fit(X_train, y_train)
ax1 = lgb.plot_split_value_histogram(gbm1, gbm1.booster_.feature_name()[27], figsize=(10, 5),
title='Histogram for feature @index/name@ @feature@',
xlabel='x', ylabel='y', color='r')
ax1 = lgb.plot_split_value_histogram(
gbm1,
gbm1.booster_.feature_name()[27],
figsize=(10, 5),
title="Histogram for feature @index/name@ @feature@",
xlabel="x",
ylabel="y",
color="r",
)
assert isinstance(ax1, matplotlib.axes.Axes)
title = f'Histogram for feature name {gbm1.booster_.feature_name()[27]}'
title = f"Histogram for feature name {gbm1.booster_.feature_name()[27]}"
assert ax1.get_title() == title
assert ax1.get_xlabel() == 'x'
assert ax1.get_ylabel() == 'y'
assert ax1.get_xlabel() == "x"
assert ax1.get_ylabel() == "y"
assert len(ax1.patches) <= 2
for patch in ax1.patches:
assert patch.get_facecolor() == (1., 0, 0, 1.) # red
assert patch.get_facecolor() == (1.0, 0, 0, 1.0) # red
ax2 = lgb.plot_split_value_histogram(gbm0, 27, bins=10, color=['r', 'y', 'g', 'b'],
title=None, xlabel=None, ylabel=None)
ax2 = lgb.plot_split_value_histogram(
gbm0, 27, bins=10, color=["r", "y", "g", "b"], title=None, xlabel=None, ylabel=None
)
assert isinstance(ax2, matplotlib.axes.Axes)
assert ax2.get_title() == ''
assert ax2.get_xlabel() == ''
assert ax2.get_ylabel() == ''
assert ax2.get_title() == ""
assert ax2.get_xlabel() == ""
assert ax2.get_ylabel() == ""
assert len(ax2.patches) == 10
assert ax2.patches[0].get_facecolor() == (1., 0, 0, 1.) # r
assert ax2.patches[1].get_facecolor() == (.75, .75, 0, 1.) # y
assert ax2.patches[2].get_facecolor() == (0, .5, 0, 1.) # g
assert ax2.patches[3].get_facecolor() == (0, 0, 1., 1.) # b
assert ax2.patches[0].get_facecolor() == (1.0, 0, 0, 1.0) # r
assert ax2.patches[1].get_facecolor() == (0.75, 0.75, 0, 1.0) # y
assert ax2.patches[2].get_facecolor() == (0, 0.5, 0, 1.0) # g
assert ax2.patches[3].get_facecolor() == (0, 0, 1.0, 1.0) # b
with pytest.raises(ValueError):
lgb.plot_split_value_histogram(gbm0, 0) # was not used in splitting
@pytest.mark.skipif(not MATPLOTLIB_INSTALLED or not GRAPHVIZ_INSTALLED,
reason='matplotlib or graphviz is not installed')
@pytest.mark.skipif(
not MATPLOTLIB_INSTALLED or not GRAPHVIZ_INSTALLED, reason="matplotlib or graphviz is not installed"
)
def test_plot_tree(breast_cancer_split):
X_train, _, y_train, _ = breast_cancer_split
gbm = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1)
@@ -161,14 +169,14 @@ def test_plot_tree(breast_cancer_split):
with pytest.raises(IndexError):
lgb.plot_tree(gbm, tree_index=83)
ax = lgb.plot_tree(gbm, tree_index=3, figsize=(15, 8), show_info=['split_gain'])
ax = lgb.plot_tree(gbm, tree_index=3, figsize=(15, 8), show_info=["split_gain"])
assert isinstance(ax, matplotlib.axes.Axes)
w, h = ax.axes.get_figure().get_size_inches()
assert int(w) == 15
assert int(h) == 8
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason='graphviz is not installed')
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
def test_create_tree_digraph(breast_cancer_split):
X_train, _, y_train, _ = breast_cancer_split
@@ -179,28 +187,32 @@ def test_create_tree_digraph(breast_cancer_split):
with pytest.raises(IndexError):
lgb.create_tree_digraph(gbm, tree_index=83)
graph = lgb.create_tree_digraph(gbm, tree_index=3,
show_info=['split_gain', 'internal_value', 'internal_weight'],
name='Tree4', node_attr={'color': 'red'})
graph = lgb.create_tree_digraph(
gbm,
tree_index=3,
show_info=["split_gain", "internal_value", "internal_weight"],
name="Tree4",
node_attr={"color": "red"},
)
graph.render(view=False)
assert isinstance(graph, graphviz.Digraph)
assert graph.name == 'Tree4'
assert graph.name == "Tree4"
assert len(graph.node_attr) == 1
assert graph.node_attr['color'] == 'red'
assert graph.node_attr["color"] == "red"
assert len(graph.graph_attr) == 0
assert len(graph.edge_attr) == 0
graph_body = ''.join(graph.body)
assert 'leaf' in graph_body
assert 'gain' in graph_body
assert 'value' in graph_body
assert 'weight' in graph_body
assert '#ffdddd' in graph_body
assert '#ddffdd' in graph_body
assert 'data' not in graph_body
assert 'count' not in graph_body
graph_body = "".join(graph.body)
assert "leaf" in graph_body
assert "gain" in graph_body
assert "value" in graph_body
assert "weight" in graph_body
assert "#ffdddd" in graph_body
assert "#ddffdd" in graph_body
assert "data" not in graph_body
assert "count" not in graph_body
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason='graphviz is not installed')
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
def test_tree_with_categories_below_max_category_values():
X_train, y_train = _categorical_data(2, 10)
params = {
@@ -211,7 +223,7 @@ def test_tree_with_categories_below_max_category_values():
"deterministic": True,
"num_threads": 1,
"seed": 708,
"verbose": -1
"verbose": -1,
}
gbm = lgb.LGBMClassifier(**params)
gbm.fit(X_train, y_train)
@@ -219,28 +231,32 @@ def test_tree_with_categories_below_max_category_values():
with pytest.raises(IndexError):
lgb.create_tree_digraph(gbm, tree_index=83)
graph = lgb.create_tree_digraph(gbm, tree_index=3,
show_info=['split_gain', 'internal_value', 'internal_weight'],
name='Tree4', node_attr={'color': 'red'},
max_category_values=10)
graph = lgb.create_tree_digraph(
gbm,
tree_index=3,
show_info=["split_gain", "internal_value", "internal_weight"],
name="Tree4",
node_attr={"color": "red"},
max_category_values=10,
)
graph.render(view=False)
assert isinstance(graph, graphviz.Digraph)
assert graph.name == 'Tree4'
assert graph.name == "Tree4"
assert len(graph.node_attr) == 1
assert graph.node_attr['color'] == 'red'
assert graph.node_attr["color"] == "red"
assert len(graph.graph_attr) == 0
assert len(graph.edge_attr) == 0
graph_body = ''.join(graph.body)
assert 'leaf' in graph_body
assert 'gain' in graph_body
assert 'value' in graph_body
assert 'weight' in graph_body
assert 'data' not in graph_body
assert 'count' not in graph_body
assert '||...||' not in graph_body
graph_body = "".join(graph.body)
assert "leaf" in graph_body
assert "gain" in graph_body
assert "value" in graph_body
assert "weight" in graph_body
assert "data" not in graph_body
assert "count" not in graph_body
assert "||...||" not in graph_body
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason='graphviz is not installed')
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
def test_tree_with_categories_above_max_category_values():
X_train, y_train = _categorical_data(20, 30)
params = {
@@ -251,7 +267,7 @@ def test_tree_with_categories_above_max_category_values():
"deterministic": True,
"num_threads": 1,
"seed": 708,
"verbose": -1
"verbose": -1,
}
gbm = lgb.LGBMClassifier(**params)
gbm.fit(X_train, y_train)
@@ -259,32 +275,36 @@ def test_tree_with_categories_above_max_category_values():
with pytest.raises(IndexError):
lgb.create_tree_digraph(gbm, tree_index=83)
graph = lgb.create_tree_digraph(gbm, tree_index=9,
show_info=['split_gain', 'internal_value', 'internal_weight'],
name='Tree4', node_attr={'color': 'red'},
max_category_values=4)
graph = lgb.create_tree_digraph(
gbm,
tree_index=9,
show_info=["split_gain", "internal_value", "internal_weight"],
name="Tree4",
node_attr={"color": "red"},
max_category_values=4,
)
graph.render(view=False)
assert isinstance(graph, graphviz.Digraph)
assert graph.name == 'Tree4'
assert graph.name == "Tree4"
assert len(graph.node_attr) == 1
assert graph.node_attr['color'] == 'red'
assert graph.node_attr["color"] == "red"
assert len(graph.graph_attr) == 0
assert len(graph.edge_attr) == 0
graph_body = ''.join(graph.body)
assert 'leaf' in graph_body
assert 'gain' in graph_body
assert 'value' in graph_body
assert 'weight' in graph_body
assert 'data' not in graph_body
assert 'count' not in graph_body
assert '||...||' in graph_body
graph_body = "".join(graph.body)
assert "leaf" in graph_body
assert "gain" in graph_body
assert "value" in graph_body
assert "weight" in graph_body
assert "data" not in graph_body
assert "count" not in graph_body
assert "||...||" in graph_body
@pytest.mark.parametrize('use_missing', [True, False])
@pytest.mark.parametrize('zero_as_missing', [True, False])
@pytest.mark.parametrize("use_missing", [True, False])
@pytest.mark.parametrize("zero_as_missing", [True, False])
def test_numeric_split_direction(use_missing, zero_as_missing):
if use_missing and zero_as_missing:
pytest.skip('use_missing and zero_as_missing both set to True')
pytest.skip("use_missing and zero_as_missing both set to True")
X, y = make_synthetic_regression()
rng = np.random.RandomState(0)
zero_mask = rng.rand(X.shape[0]) < 0.05
@@ -294,48 +314,48 @@ def test_numeric_split_direction(use_missing, zero_as_missing):
X[nan_mask, :] = np.nan
ds = lgb.Dataset(X, y)
params = {
'num_leaves': 127,
'min_child_samples': 1,
'use_missing': use_missing,
'zero_as_missing': zero_as_missing,
"num_leaves": 127,
"min_child_samples": 1,
"use_missing": use_missing,
"zero_as_missing": zero_as_missing,
}
bst = lgb.train(params, ds, num_boost_round=1)
case_with_zero = X[zero_mask][[0]]
expected_leaf_zero = bst.predict(case_with_zero, pred_leaf=True)[0]
node = bst.dump_model()['tree_info'][0]['tree_structure']
while 'decision_type' in node:
node = bst.dump_model()["tree_info"][0]["tree_structure"]
while "decision_type" in node:
direction = lgb.plotting._determine_direction_for_numeric_split(
case_with_zero[0][node['split_feature']], node['threshold'], node['missing_type'], node['default_left']
case_with_zero[0][node["split_feature"]], node["threshold"], node["missing_type"], node["default_left"]
)
node = node['left_child'] if direction == 'left' else node['right_child']
assert node['leaf_index'] == expected_leaf_zero
node = node["left_child"] if direction == "left" else node["right_child"]
assert node["leaf_index"] == expected_leaf_zero
if use_missing:
case_with_nan = X[nan_mask][[0]]
expected_leaf_nan = bst.predict(case_with_nan, pred_leaf=True)[0]
node = bst.dump_model()['tree_info'][0]['tree_structure']
while 'decision_type' in node:
node = bst.dump_model()["tree_info"][0]["tree_structure"]
while "decision_type" in node:
direction = lgb.plotting._determine_direction_for_numeric_split(
case_with_nan[0][node['split_feature']], node['threshold'], node['missing_type'], node['default_left']
case_with_nan[0][node["split_feature"]], node["threshold"], node["missing_type"], node["default_left"]
)
node = node['left_child'] if direction == 'left' else node['right_child']
assert node['leaf_index'] == expected_leaf_nan
node = node["left_child"] if direction == "left" else node["right_child"]
assert node["leaf_index"] == expected_leaf_nan
assert expected_leaf_zero != expected_leaf_nan
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason='graphviz is not installed')
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
def test_example_case_in_tree_digraph():
rng = np.random.RandomState(0)
x1 = rng.rand(100)
cat = rng.randint(1, 3, size=x1.size)
X = np.vstack([x1, cat]).T
y = x1 + 2 * cat
feature_name = ['x1', 'cat']
ds = lgb.Dataset(X, y, feature_name=feature_name, categorical_feature=['cat'])
feature_name = ["x1", "cat"]
ds = lgb.Dataset(X, y, feature_name=feature_name, categorical_feature=["cat"])
num_round = 3
bst = lgb.train({'num_leaves': 7}, ds, num_boost_round=num_round)
bst = lgb.train({"num_leaves": 7}, ds, num_boost_round=num_round)
mod = bst.dump_model()
example_case = X[[0]]
makes_categorical_splits = False
@@ -343,42 +363,46 @@ def test_example_case_in_tree_digraph():
for i in range(num_round):
graph = lgb.create_tree_digraph(bst, example_case=example_case, tree_index=i)
gbody = graph.body
node = mod['tree_info'][i]['tree_structure']
while 'decision_type' in node: # iterate through the splits
split_index = node['split_index']
node = mod["tree_info"][i]["tree_structure"]
while "decision_type" in node: # iterate through the splits
split_index = node["split_index"]
node_in_graph = [n for n in gbody if f'split{split_index}' in n and '->' not in n]
node_in_graph = [n for n in gbody if f"split{split_index}" in n and "->" not in n]
assert len(node_in_graph) == 1
seen_indices.add(gbody.index(node_in_graph[0]))
edge_to_node = [e for e in gbody if f'-> split{split_index}' in e]
if node['decision_type'] == '<=':
edge_to_node = [e for e in gbody if f"-> split{split_index}" in e]
if node["decision_type"] == "<=":
direction = lgb.plotting._determine_direction_for_numeric_split(
example_case[0][node['split_feature']], node['threshold'], node['missing_type'], node['default_left'])
example_case[0][node["split_feature"]],
node["threshold"],
node["missing_type"],
node["default_left"],
)
else:
makes_categorical_splits = True
direction = lgb.plotting._determine_direction_for_categorical_split(
example_case[0][node['split_feature']], node['threshold']
example_case[0][node["split_feature"]], node["threshold"]
)
node = node['left_child'] if direction == 'left' else node['right_child']
assert 'color=blue' in node_in_graph[0]
node = node["left_child"] if direction == "left" else node["right_child"]
assert "color=blue" in node_in_graph[0]
if edge_to_node:
assert len(edge_to_node) == 1
assert 'color=blue' in edge_to_node[0]
assert "color=blue" in edge_to_node[0]
seen_indices.add(gbody.index(edge_to_node[0]))
# we're in a leaf now
leaf_index = node['leaf_index']
leaf_in_graph = [n for n in gbody if f'leaf{leaf_index}' in n and '->' not in n]
edge_to_leaf = [e for e in gbody if f'-> leaf{leaf_index}' in e]
leaf_index = node["leaf_index"]
leaf_in_graph = [n for n in gbody if f"leaf{leaf_index}" in n and "->" not in n]
edge_to_leaf = [e for e in gbody if f"-> leaf{leaf_index}" in e]
assert len(leaf_in_graph) == 1
assert 'color=blue' in leaf_in_graph[0]
assert "color=blue" in leaf_in_graph[0]
assert len(edge_to_leaf) == 1
assert 'color=blue' in edge_to_leaf[0]
assert "color=blue" in edge_to_leaf[0]
seen_indices.update([gbody.index(leaf_in_graph[0]), gbody.index(edge_to_leaf[0])])
# check that the rest of the elements have black color
remaining_elements = [e for i, e in enumerate(graph.body) if i not in seen_indices and 'graph' not in e]
assert all('color=black' in e for e in remaining_elements)
remaining_elements = [e for i, e in enumerate(graph.body) if i not in seen_indices and "graph" not in e]
assert all("color=black" in e for e in remaining_elements)
# check that we got to the expected leaf
expected_leaf = bst.predict(example_case, start_iteration=i, num_iteration=1, pred_leaf=True)[0]
@@ -386,83 +410,86 @@ def test_example_case_in_tree_digraph():
assert makes_categorical_splits
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason='graphviz is not installed')
@pytest.mark.parametrize('input_type', ['array', 'dataframe'])
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
@pytest.mark.parametrize("input_type", ["array", "dataframe"])
def test_empty_example_case_on_tree_digraph_raises_error(input_type):
X, y = make_synthetic_regression()
if input_type == 'dataframe':
if input_type == "dataframe":
if not PANDAS_INSTALLED:
pytest.skip(reason='pandas is not installed')
pytest.skip(reason="pandas is not installed")
X = pd_DataFrame(X)
ds = lgb.Dataset(X, y)
bst = lgb.train({'num_leaves': 3}, ds, num_boost_round=1)
bst = lgb.train({"num_leaves": 3}, ds, num_boost_round=1)
example_case = X[:0]
if input_type == 'dataframe':
if input_type == "dataframe":
example_case = pd_DataFrame(example_case)
with pytest.raises(ValueError, match='example_case must have a single row.'):
with pytest.raises(ValueError, match="example_case must have a single row."):
lgb.create_tree_digraph(bst, tree_index=0, example_case=example_case)
@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason='matplotlib is not installed')
@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason="matplotlib is not installed")
def test_plot_metrics(params, breast_cancer_split, train_data):
X_train, X_test, y_train, y_test = breast_cancer_split
test_data = lgb.Dataset(X_test, y_test, reference=train_data)
params.update({"metric": {"binary_logloss", "binary_error"}})
evals_result0 = {}
lgb.train(params, train_data,
valid_sets=[train_data, test_data],
valid_names=['v1', 'v2'],
num_boost_round=10,
callbacks=[lgb.record_evaluation(evals_result0)])
lgb.train(
params,
train_data,
valid_sets=[train_data, test_data],
valid_names=["v1", "v2"],
num_boost_round=10,
callbacks=[lgb.record_evaluation(evals_result0)],
)
with pytest.warns(UserWarning, match="More than one metric available, picking one to plot."):
ax0 = lgb.plot_metric(evals_result0)
assert isinstance(ax0, matplotlib.axes.Axes)
assert ax0.get_title() == 'Metric during training'
assert ax0.get_xlabel() == 'Iterations'
assert ax0.get_ylabel() in {'binary_logloss', 'binary_error'}
assert ax0.get_title() == "Metric during training"
assert ax0.get_xlabel() == "Iterations"
assert ax0.get_ylabel() in {"binary_logloss", "binary_error"}
legend_items = ax0.get_legend().get_texts()
assert len(legend_items) == 2
assert legend_items[0].get_text() == 'v1'
assert legend_items[1].get_text() == 'v2'
assert legend_items[0].get_text() == "v1"
assert legend_items[1].get_text() == "v2"
ax1 = lgb.plot_metric(evals_result0, metric='binary_error')
ax1 = lgb.plot_metric(evals_result0, metric="binary_error")
assert isinstance(ax1, matplotlib.axes.Axes)
assert ax1.get_title() == 'Metric during training'
assert ax1.get_xlabel() == 'Iterations'
assert ax1.get_ylabel() == 'binary_error'
assert ax1.get_title() == "Metric during training"
assert ax1.get_xlabel() == "Iterations"
assert ax1.get_ylabel() == "binary_error"
legend_items = ax1.get_legend().get_texts()
assert len(legend_items) == 2
assert legend_items[0].get_text() == 'v1'
assert legend_items[1].get_text() == 'v2'
assert legend_items[0].get_text() == "v1"
assert legend_items[1].get_text() == "v2"
ax2 = lgb.plot_metric(evals_result0, metric='binary_logloss', dataset_names=['v2'])
ax2 = lgb.plot_metric(evals_result0, metric="binary_logloss", dataset_names=["v2"])
assert isinstance(ax2, matplotlib.axes.Axes)
assert ax2.get_title() == 'Metric during training'
assert ax2.get_xlabel() == 'Iterations'
assert ax2.get_ylabel() == 'binary_logloss'
assert ax2.get_title() == "Metric during training"
assert ax2.get_xlabel() == "Iterations"
assert ax2.get_ylabel() == "binary_logloss"
legend_items = ax2.get_legend().get_texts()
assert len(legend_items) == 1
assert legend_items[0].get_text() == 'v2'
assert legend_items[0].get_text() == "v2"
ax3 = lgb.plot_metric(
evals_result0,
metric='binary_logloss',
dataset_names=['v1'],
title='Metric @metric@',
xlabel='Iterations @metric@',
metric="binary_logloss",
dataset_names=["v1"],
title="Metric @metric@",
xlabel="Iterations @metric@",
ylabel='Value of "@metric@"',
figsize=(5, 5),
dpi=600,
grid=False
grid=False,
)
assert isinstance(ax3, matplotlib.axes.Axes)
assert ax3.get_title() == 'Metric @metric@'
assert ax3.get_xlabel() == 'Iterations @metric@'
assert ax3.get_title() == "Metric @metric@"
assert ax3.get_xlabel() == "Iterations @metric@"
assert ax3.get_ylabel() == 'Value of "binary_logloss"'
legend_items = ax3.get_legend().get_texts()
assert len(legend_items) == 1
assert legend_items[0].get_text() == 'v1'
assert legend_items[0].get_text() == "v1"
assert ax3.get_figure().get_figheight() == 5
assert ax3.get_figure().get_figwidth() == 5
assert ax3.get_figure().get_dpi() == 600
@ -472,9 +499,7 @@ def test_plot_metrics(params, breast_cancer_split, train_data):
assert not grid_line.get_visible()
evals_result1 = {}
lgb.train(params, train_data,
num_boost_round=10,
callbacks=[lgb.record_evaluation(evals_result1)])
lgb.train(params, train_data, num_boost_round=10, callbacks=[lgb.record_evaluation(evals_result1)])
with pytest.raises(ValueError, match="eval results cannot be empty."):
lgb.plot_metric(evals_result1)
@ -482,9 +507,9 @@ def test_plot_metrics(params, breast_cancer_split, train_data):
gbm2.fit(X_train, y_train, eval_set=[(X_test, y_test)])
ax4 = lgb.plot_metric(gbm2, title=None, xlabel=None, ylabel=None)
assert isinstance(ax4, matplotlib.axes.Axes)
assert ax4.get_title() == ''
assert ax4.get_xlabel() == ''
assert ax4.get_ylabel() == ''
assert ax4.get_title() == ""
assert ax4.get_xlabel() == ""
assert ax4.get_ylabel() == ""
legend_items = ax4.get_legend().get_texts()
assert len(legend_items) == 1
assert legend_items[0].get_text() == 'valid_0'
assert legend_items[0].get_text() == "valid_0"
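
For readers unfamiliar with the plotting helpers asserted on above, a minimal sketch of recording and plotting evaluation history (illustrative only, not part of the diff; requires matplotlib, and the dataset choice is arbitrary):

import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
dtrain = lgb.Dataset(X_train, y_train)
dvalid = lgb.Dataset(X_test, y_test, reference=dtrain)

history = {}
lgb.train(
    {"objective": "binary", "metric": "binary_logloss", "verbose": -1},
    dtrain,
    num_boost_round=10,
    valid_sets=[dtrain, dvalid],
    valid_names=["train", "valid"],
    callbacks=[lgb.record_evaluation(history)],  # fills `history` during training
)
ax = lgb.plot_metric(history, metric="binary_logloss")  # one curve per evaluated dataset
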

Diff for this file not shown because it is too large

View file

@ -10,7 +10,7 @@ import lightgbm as lgb
def test_register_logger(tmp_path):
logger = logging.getLogger("LightGBM")
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(levelname)s | %(message)s')
formatter = logging.Formatter("%(levelname)s | %(message)s")
log_filename = tmp_path / "LightGBM_test_logger.log"
file_handler = logging.FileHandler(log_filename, mode="w", encoding="utf-8")
file_handler.setLevel(logging.DEBUG)
@ -18,29 +18,27 @@ def test_register_logger(tmp_path):
logger.addHandler(file_handler)
def dummy_metric(_, __):
logger.debug('In dummy_metric')
return 'dummy_metric', 1, True
logger.debug("In dummy_metric")
return "dummy_metric", 1, True
lgb.register_logger(logger)
X = np.array([[1, 2, 3],
[1, 2, 4],
[1, 2, 4],
[1, 2, 3]],
dtype=np.float32)
X = np.array([[1, 2, 3], [1, 2, 4], [1, 2, 4], [1, 2, 3]], dtype=np.float32)
y = np.array([0, 1, 1, 0])
lgb_train = lgb.Dataset(X, y)
lgb_valid = lgb.Dataset(X, y) # different object for early-stopping
eval_records = {}
callbacks = [
lgb.record_evaluation(eval_records),
lgb.log_evaluation(2),
lgb.early_stopping(10)
]
lgb.train({'objective': 'binary', 'metric': ['auc', 'binary_error']},
lgb_train, num_boost_round=10, feval=dummy_metric,
valid_sets=[lgb_valid], categorical_feature=[1], callbacks=callbacks)
callbacks = [lgb.record_evaluation(eval_records), lgb.log_evaluation(2), lgb.early_stopping(10)]
lgb.train(
{"objective": "binary", "metric": ["auc", "binary_error"]},
lgb_train,
num_boost_round=10,
feval=dummy_metric,
valid_sets=[lgb_valid],
categorical_feature=[1],
callbacks=callbacks,
)
lgb.plot_metric(eval_records)
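
A compact sketch of the logger-registration flow exercised here, using only the standard-library logging module (illustrative, not part of the change set):

import logging

import numpy as np

import lightgbm as lgb

logging.basicConfig(level=logging.DEBUG, format="%(levelname)s | %(message)s")
lgb.register_logger(logging.getLogger("LightGBM"))

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 3))
y = (X[:, 0] > 0).astype(int)

# LightGBM's informational and warning output is now routed through the
# registered logger instead of being printed directly to stdout
lgb.train({"objective": "binary", "verbose": 1}, lgb.Dataset(X, y), num_boost_round=2)
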
@ -89,7 +87,7 @@ WARNING | More than one metric available, picking one to plot.
"INFO | [LightGBM] [Warning] GPU acceleration is disabled because no non-trivial dense features can be found",
"INFO | [LightGBM] [Warning] Using sparse features with CUDA is currently not supported.",
"INFO | [LightGBM] [Warning] CUDA currently requires double precision calculations.",
"INFO | [LightGBM] [Info] LightGBM using CUDA trainer with DP float!!"
"INFO | [LightGBM] [Info] LightGBM using CUDA trainer with DP float!!",
]
cuda_lines = [
"INFO | [LightGBM] [Warning] Metric auc is not implemented in cuda version. Fall back to evaluation on CPU.",
@ -142,11 +140,7 @@ def test_register_custom_logger():
logged_messages.append(msg)
custom_logger = CustomLogger()
lgb.register_logger(
custom_logger,
info_method_name="custom_info",
warning_method_name="custom_warning"
)
lgb.register_logger(custom_logger, info_method_name="custom_info", warning_method_name="custom_warning")
lgb.basic._log_info("info message")
lgb.basic._log_warning("warning message")
@ -155,18 +149,14 @@ def test_register_custom_logger():
assert logged_messages == expected_log
logged_messages = []
X = np.array([[1, 2, 3],
[1, 2, 4],
[1, 2, 4],
[1, 2, 3]],
dtype=np.float32)
X = np.array([[1, 2, 3], [1, 2, 4], [1, 2, 4], [1, 2, 3]], dtype=np.float32)
y = np.array([0, 1, 1, 0])
lgb_data = lgb.Dataset(X, y)
lgb.train(
{'objective': 'binary', 'metric': 'auc'},
{"objective": "binary", "metric": "auc"},
lgb_data,
num_boost_round=10,
valid_sets=[lgb_data],
categorical_feature=[1]
categorical_feature=[1],
)
assert logged_messages, "custom logger was not called"
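
The same mechanism with a non-standard logger object, condensed from the test above (illustrative; PrintLogger and its method names are made up for this sketch):

import lightgbm as lgb

class PrintLogger:
    # any object works, as long as it exposes the two methods named in register_logger()
    def custom_info(self, msg: str) -> None:
        print(f"[info] {msg}")

    def custom_warning(self, msg: str) -> None:
        print(f"[warn] {msg}")

lgb.register_logger(PrintLogger(), info_method_name="custom_info", warning_method_name="custom_warning")
lgb.basic._log_info("routed through PrintLogger")  # prints "[info] routed through PrintLogger"
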

View file

@ -34,8 +34,9 @@ def load_linnerud(**kwargs):
return sklearn.datasets.load_linnerud(**kwargs)
def make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2,
group=None, random_gs=False, avg_gs=10, random_state=0):
def make_ranking(
n_samples=100, n_features=20, n_informative=5, gmax=2, group=None, random_gs=False, avg_gs=10, random_state=0
):
"""Generate a learning-to-rank dataset - feature vectors grouped together with
integer-valued graded relevance scores. Replace this with a sklearn.datasets function
if a ranking objective becomes supported in the sklearn.datasets module.
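
A usage sketch for this helper (illustrative only; it assumes make_ranking returns a feature matrix, graded labels, and a per-row group-id vector, and that group ids are assigned in order so their sorted order matches group order):

import numpy as np

import lightgbm as lgb

X, y, group_ids = make_ranking(n_samples=1000, n_features=20, gmax=2, avg_gs=10, random_state=0)

# LightGBM's ranking objectives take group *sizes*, not per-row group ids
_, group_sizes = np.unique(group_ids, return_counts=True)

ds = lgb.Dataset(X, y, group=group_sizes)
bst = lgb.train({"objective": "lambdarank", "verbose": -1}, ds, num_boost_round=5)
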
@ -81,7 +82,7 @@ def make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2,
relvalues = range(gmax + 1)
# build y/target and group-id vectors with user-specified group sizes.
if group is not None and hasattr(group, '__len__'):
if group is not None and hasattr(group, "__len__"):
n_samples = np.sum(group)
for i, gsize in enumerate(group):
@ -116,8 +117,9 @@ def make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2,
@lru_cache(maxsize=None)
def make_synthetic_regression(n_samples=100, n_features=4, n_informative=2, random_state=42):
return sklearn.datasets.make_regression(n_samples=n_samples, n_features=n_features,
n_informative=n_informative, random_state=random_state)
return sklearn.datasets.make_regression(
n_samples=n_samples, n_features=n_features, n_informative=n_informative, random_state=random_state
)
def dummy_obj(preds, train_data):
@ -126,7 +128,7 @@ def dummy_obj(preds, train_data):
def mse_obj(y_pred, dtrain):
y_true = dtrain.get_label()
grad = (y_pred - y_true)
grad = y_pred - y_true
hess = np.ones(len(grad))
return grad, hess
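
Dropping the redundant parentheses around (y_pred - y_true) does not change the math: for the squared-error loss L = 0.5 * (y_pred - y_true)**2, the first derivative with respect to y_pred is y_pred - y_true and the second derivative is 1, which is exactly what mse_obj returns. A quick numeric check (illustrative only, with made-up values):

import numpy as np

y_true = np.array([0.0, 1.0, 2.0])
y_pred = np.array([0.5, 0.5, 2.5])
eps = 1e-6

def loss(p):
    return 0.5 * (p - y_true) ** 2

# central finite difference of the loss should match the analytic gradient
numeric_grad = (loss(y_pred + eps) - loss(y_pred - eps)) / (2 * eps)
analytic_grad = y_pred - y_true  # what mse_obj computes
np.testing.assert_allclose(numeric_grad, analytic_grad, atol=1e-5)
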
@ -157,50 +159,41 @@ def sklearn_multiclass_custom_objective(y_true, y_pred, weight=None):
def pickle_obj(obj, filepath, serializer):
if serializer == 'pickle':
with open(filepath, 'wb') as f:
if serializer == "pickle":
with open(filepath, "wb") as f:
pickle.dump(obj, f)
elif serializer == 'joblib':
elif serializer == "joblib":
joblib.dump(obj, filepath)
elif serializer == 'cloudpickle':
with open(filepath, 'wb') as f:
elif serializer == "cloudpickle":
with open(filepath, "wb") as f:
cloudpickle.dump(obj, f)
else:
raise ValueError(f'Unrecognized serializer type: {serializer}')
raise ValueError(f"Unrecognized serializer type: {serializer}")
def unpickle_obj(filepath, serializer):
if serializer == 'pickle':
with open(filepath, 'rb') as f:
if serializer == "pickle":
with open(filepath, "rb") as f:
return pickle.load(f)
elif serializer == 'joblib':
elif serializer == "joblib":
return joblib.load(filepath)
elif serializer == 'cloudpickle':
with open(filepath, 'rb') as f:
elif serializer == "cloudpickle":
with open(filepath, "rb") as f:
return cloudpickle.load(f)
else:
raise ValueError(f'Unrecognized serializer type: {serializer}')
raise ValueError(f"Unrecognized serializer type: {serializer}")
def pickle_and_unpickle_object(obj, serializer):
with lgb.basic._TempFile() as tmp_file:
pickle_obj(
obj=obj,
filepath=tmp_file.name,
serializer=serializer
)
obj_from_disk = unpickle_obj(
filepath=tmp_file.name,
serializer=serializer
)
pickle_obj(obj=obj, filepath=tmp_file.name, serializer=serializer)
obj_from_disk = unpickle_obj(filepath=tmp_file.name, serializer=serializer)
return obj_from_disk # noqa: RET504
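
A round-trip sketch using the helper above (illustrative, not part of the diff; assumes joblib and cloudpickle are installed, as they are in the test environment):

import numpy as np

import lightgbm as lgb

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 4))
y = X[:, 0] + X[:, 1]

bst = lgb.train({"objective": "regression", "num_leaves": 7, "verbose": -1}, lgb.Dataset(X, y), num_boost_round=5)

for serializer in ("pickle", "joblib", "cloudpickle"):
    bst_from_disk = pickle_and_unpickle_object(bst, serializer)
    # a faithful round trip should reproduce the original predictions
    np.testing.assert_allclose(bst.predict(X), bst_from_disk.predict(X))
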
# doing this here, at import time, to ensure it only runs once per import
# instead of once per assertion
_numpy_testing_supports_strict_kwarg = (
"strict" in getfullargspec(np.testing.assert_array_equal).kwonlyargs
)
_numpy_testing_supports_strict_kwarg = "strict" in getfullargspec(np.testing.assert_array_equal).kwonlyargs
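
The assignment above is a feature-detection guard: older numpy releases do not accept strict=. A generic sketch of the pattern (the function name below is made up for illustration; the repo's actual wrapper, np_assert_array_equal, follows):

from inspect import getfullargspec

import numpy as np

def assert_array_equal_compat(a, b, strict=True):
    # forward `strict` only when the installed numpy supports it (numpy >= 1.24)
    if "strict" in getfullargspec(np.testing.assert_array_equal).kwonlyargs:
        np.testing.assert_array_equal(a, b, strict=strict)
    else:
        np.testing.assert_array_equal(a, b)
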
def np_assert_array_equal(*args, **kwargs):