Mirror of https://github.com/microsoft/LightGBM.git

[ci] [python-package] enable ruff-format on tests and examples (#6317)

Parent: b60068c810
Commit: 1b792e7166
@@ -7,6 +7,12 @@ exclude: |
  )$

repos:
  - repo: https://github.com/pycqa/isort
    rev: 5.13.2
    hooks:
      - id: isort
        name: isort (python)
        args: ["--settings-path", "python-package/pyproject.toml"]
  - repo: https://github.com/astral-sh/ruff-pre-commit
    # Ruff version.
    rev: v0.2.1
@@ -14,12 +20,8 @@ repos:
      # Run the linter.
      - id: ruff
        args: ["--config", "python-package/pyproject.toml"]
        types_or: [python, jupyter]
      # Run the formatter.
      - id: ruff-format
        args: ["--config", "python-package/pyproject.toml"]
  - repo: https://github.com/pycqa/isort
    rev: 5.13.2
    hooks:
      - id: isort
        name: isort (python)
        args: ["--settings-path", "python-package/pyproject.toml"]
        types_or: [python, jupyter]
@@ -10,13 +10,13 @@ from sklearn.metrics import roc_auc_score

import lightgbm as lgb

print('Loading data...')
print("Loading data...")
# load or create your dataset
binary_example_dir = Path(__file__).absolute().parents[1] / 'binary_classification'
df_train = pd.read_csv(str(binary_example_dir / 'binary.train'), header=None, sep='\t')
df_test = pd.read_csv(str(binary_example_dir / 'binary.test'), header=None, sep='\t')
W_train = pd.read_csv(str(binary_example_dir / 'binary.train.weight'), header=None)[0]
W_test = pd.read_csv(str(binary_example_dir / 'binary.test.weight'), header=None)[0]
binary_example_dir = Path(__file__).absolute().parents[1] / "binary_classification"
df_train = pd.read_csv(str(binary_example_dir / "binary.train"), header=None, sep="\t")
df_test = pd.read_csv(str(binary_example_dir / "binary.test"), header=None, sep="\t")
W_train = pd.read_csv(str(binary_example_dir / "binary.train.weight"), header=None)[0]
W_test = pd.read_csv(str(binary_example_dir / "binary.test.weight"), header=None)[0]

y_train = df_train[0]
y_test = df_test[0]
@@ -27,72 +27,72 @@ num_train, num_feature = X_train.shape

# create dataset for lightgbm
# if you want to re-use data, remember to set free_raw_data=False
lgb_train = lgb.Dataset(X_train, y_train,
    weight=W_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
    weight=W_test, free_raw_data=False)
lgb_train = lgb.Dataset(X_train, y_train, weight=W_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, weight=W_test, free_raw_data=False)

# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "binary_logloss",
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbose": 0,
}

# generate feature names
feature_name = [f'feature_{col}' for col in range(num_feature)]
feature_name = [f"feature_{col}" for col in range(num_feature)]

print('Starting training...')
print("Starting training...")
# feature_name and categorical_feature
gbm = lgb.train(params,
    lgb_train,
    num_boost_round=10,
    valid_sets=lgb_train,  # eval training data
    feature_name=feature_name,
    categorical_feature=[21])
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=10,
    valid_sets=lgb_train,  # eval training data
    feature_name=feature_name,
    categorical_feature=[21],
)

print('Finished first 10 rounds...')
print("Finished first 10 rounds...")
# check feature name
print(f'7th feature name is: {lgb_train.feature_name[6]}')
print(f"7th feature name is: {lgb_train.feature_name[6]}")

print('Saving model...')
print("Saving model...")
# save model to file
gbm.save_model('model.txt')
gbm.save_model("model.txt")

print('Dumping model to JSON...')
print("Dumping model to JSON...")
# dump model to JSON (and save to file)
model_json = gbm.dump_model()

with open('model.json', 'w+') as f:
with open("model.json", "w+") as f:
    json.dump(model_json, f, indent=4)

# feature names
print(f'Feature names: {gbm.feature_name()}')
print(f"Feature names: {gbm.feature_name()}")

# feature importances
print(f'Feature importances: {list(gbm.feature_importance())}')
print(f"Feature importances: {list(gbm.feature_importance())}")

print('Loading model to predict...')
print("Loading model to predict...")
# load model to predict
bst = lgb.Booster(model_file='model.txt')
bst = lgb.Booster(model_file="model.txt")
# can only predict with the best iteration (or the saving iteration)
y_pred = bst.predict(X_test)
# eval with loaded model
auc_loaded_model = roc_auc_score(y_test, y_pred)
print(f"The ROC AUC of loaded model's prediction is: {auc_loaded_model}")

print('Dumping and loading model with pickle...')
print("Dumping and loading model with pickle...")
# dump model with pickle
with open('model.pkl', 'wb') as fout:
with open("model.pkl", "wb") as fout:
    pickle.dump(gbm, fout)
# load model with pickle to predict
with open('model.pkl', 'rb') as fin:
with open("model.pkl", "rb") as fin:
    pkl_bst = pickle.load(fin)
# can predict with any iteration when loaded in pickle way
y_pred = pkl_bst.predict(X_test, num_iteration=7)
@@ -104,36 +104,36 @@ print(f"The ROC AUC of pickled model's prediction is: {auc_pickled_model}")
# init_model accepts:
# 1. model file name
# 2. Booster()
gbm = lgb.train(params,
    lgb_train,
    num_boost_round=10,
    init_model='model.txt',
    valid_sets=lgb_eval)
gbm = lgb.train(params, lgb_train, num_boost_round=10, init_model="model.txt", valid_sets=lgb_eval)

print('Finished 10 - 20 rounds with model file...')
print("Finished 10 - 20 rounds with model file...")

# decay learning rates
# reset_parameter callback accepts:
# 1. list with length = num_boost_round
# 2. function(curr_iter)
gbm = lgb.train(params,
    lgb_train,
    num_boost_round=10,
    init_model=gbm,
    valid_sets=lgb_eval,
    callbacks=[lgb.reset_parameter(learning_rate=lambda iter: 0.05 * (0.99 ** iter))])
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=10,
    init_model=gbm,
    valid_sets=lgb_eval,
    callbacks=[lgb.reset_parameter(learning_rate=lambda iter: 0.05 * (0.99**iter))],
)

print('Finished 20 - 30 rounds with decay learning rates...')
print("Finished 20 - 30 rounds with decay learning rates...")

# change other parameters during training
gbm = lgb.train(params,
    lgb_train,
    num_boost_round=10,
    init_model=gbm,
    valid_sets=lgb_eval,
    callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=10,
    init_model=gbm,
    valid_sets=lgb_eval,
    callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)],
)

print('Finished 30 - 40 rounds with changing bagging_fraction...')
print("Finished 30 - 40 rounds with changing bagging_fraction...")

# self-defined objective function
@@ -141,9 +141,9 @@ print('Finished 30 - 40 rounds with changing bagging_fraction...')
# log likelihood loss
def loglikelihood(preds, train_data):
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))
    preds = 1.0 / (1.0 + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1. - preds)
    hess = preds * (1.0 - preds)
    return grad, hess
@@ -156,22 +156,19 @@ def loglikelihood(preds, train_data):
# Keep this in mind when you use the customization
def binary_error(preds, train_data):
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))
    return 'error', np.mean(labels != (preds > 0.5)), False
    preds = 1.0 / (1.0 + np.exp(-preds))
    return "error", np.mean(labels != (preds > 0.5)), False

# Pass custom objective function through params
params_custom_obj = copy.deepcopy(params)
params_custom_obj['objective'] = loglikelihood
params_custom_obj["objective"] = loglikelihood

gbm = lgb.train(params_custom_obj,
    lgb_train,
    num_boost_round=10,
    init_model=gbm,
    feval=binary_error,
    valid_sets=lgb_eval)
gbm = lgb.train(
    params_custom_obj, lgb_train, num_boost_round=10, init_model=gbm, feval=binary_error, valid_sets=lgb_eval
)

print('Finished 40 - 50 rounds with self-defined objective function and eval metric...')
print("Finished 40 - 50 rounds with self-defined objective function and eval metric...")

# another self-defined eval metric
@@ -183,24 +180,26 @@ print('Finished 40 - 50 rounds with self-defined objective function and eval met
# Keep this in mind when you use the customization
def accuracy(preds, train_data):
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))
    return 'accuracy', np.mean(labels == (preds > 0.5)), True
    preds = 1.0 / (1.0 + np.exp(-preds))
    return "accuracy", np.mean(labels == (preds > 0.5)), True

# Pass custom objective function through params
params_custom_obj = copy.deepcopy(params)
params_custom_obj['objective'] = loglikelihood
params_custom_obj["objective"] = loglikelihood

gbm = lgb.train(params_custom_obj,
    lgb_train,
    num_boost_round=10,
    init_model=gbm,
    feval=[binary_error, accuracy],
    valid_sets=lgb_eval)
gbm = lgb.train(
    params_custom_obj,
    lgb_train,
    num_boost_round=10,
    init_model=gbm,
    feval=[binary_error, accuracy],
    valid_sets=lgb_eval,
)

print('Finished 50 - 60 rounds with self-defined objective function and multiple self-defined eval metrics...')
print("Finished 50 - 60 rounds with self-defined objective function and multiple self-defined eval metrics...")

print('Starting a new training job...')
print("Starting a new training job...")

# callback
@@ -208,17 +207,14 @@ def reset_metrics():
    def callback(env):
        lgb_eval_new = lgb.Dataset(X_test, y_test, reference=lgb_train)
        if env.iteration - env.begin_iteration == 5:
            print('Add a new valid dataset at iteration 5...')
            env.model.add_valid(lgb_eval_new, 'new_valid')
            print("Add a new valid dataset at iteration 5...")
            env.model.add_valid(lgb_eval_new, "new_valid")

    callback.before_iteration = True
    callback.order = 0
    return callback

gbm = lgb.train(params,
    lgb_train,
    num_boost_round=10,
    valid_sets=lgb_train,
    callbacks=[reset_metrics()])
gbm = lgb.train(params, lgb_train, num_boost_round=10, valid_sets=lgb_train, callbacks=[reset_metrics()])

print('Finished first 10 rounds with callback function...')
print("Finished first 10 rounds with callback function...")
@@ -10,9 +10,9 @@ import lightgbm as lgb
if __name__ == "__main__":
    print("loading data")

    rank_example_dir = Path(__file__).absolute().parents[2] / 'lambdarank'
    X, y = load_svmlight_file(str(rank_example_dir / 'rank.train'))
    group = np.loadtxt(str(rank_example_dir / 'rank.train.query'))
    rank_example_dir = Path(__file__).absolute().parents[2] / "lambdarank"
    X, y = load_svmlight_file(str(rank_example_dir / "rank.train"))
    group = np.loadtxt(str(rank_example_dir / "rank.train.query"))

    print("initializing a Dask cluster")
@@ -32,25 +32,14 @@ if __name__ == "__main__":
    # a sparse boundary to partition the data
    X = X.toarray()

    dX = da.from_array(
        x=X,
        chunks=[
            (rows_in_part1, rows_in_part2),
            (num_features,)
        ]
    )
    dX = da.from_array(x=X, chunks=[(rows_in_part1, rows_in_part2), (num_features,)])
    dy = da.from_array(
        x=y,
        chunks=[
            (rows_in_part1, rows_in_part2),
        ]
    )
    dg = da.from_array(
        x=group,
        chunks=[
            (100, group.size - 100)
        ]
        ],
    )
    dg = da.from_array(x=group, chunks=[(100, group.size - 100)])

    print("beginning training")
@@ -34,13 +34,13 @@ def create_dataset_from_multiple_hdf(input_flist, batch_size):
    data = []
    ylist = []
    for f in input_flist:
        f = h5py.File(f, 'r')
        data.append(HDFSequence(f['X'], batch_size))
        ylist.append(f['Y'][:])
        f = h5py.File(f, "r")
        data.append(HDFSequence(f["X"], batch_size))
        ylist.append(f["Y"][:])

    params = {
        'bin_construct_sample_cnt': 200000,
        'max_bin': 255,
        "bin_construct_sample_cnt": 200000,
        "max_bin": 255,
    }
    y = np.concatenate(ylist)
    dataset = lgb.Dataset(data, label=y, params=params)
@@ -51,7 +51,7 @@ def create_dataset_from_multiple_hdf(input_flist, batch_size):
    # The reason is that DataFrame column names will be used in Dataset. For a DataFrame with Int64Index
    # as columns, Dataset will use column names like ["0", "1", "2", ...]. While for numpy array, column names
    # are using the default one assigned in C++ code (dataset_loader.cpp), like ["Column_0", "Column_1", ...].
    dataset.save_binary('regression.train.from_hdf.bin')
    dataset.save_binary("regression.train.from_hdf.bin")

def save2hdf(input_data, fname, batch_size):
@@ -59,7 +59,7 @@ def save2hdf(input_data, fname, batch_size):

    Please note chunk size settings in the implementation for I/O performance optimization.
    """
    with h5py.File(fname, 'w') as f:
    with h5py.File(fname, "w") as f:
        for name, data in input_data.items():
            nrow, ncol = data.shape
            if ncol == 1:
@@ -75,12 +75,12 @@ def save2hdf(input_data, fname, batch_size):
            # Also note that the data is stored in row major order to avoid extra copy when passing to
            # lightgbm Dataset.
            chunk = (batch_size, ncol)
            f.create_dataset(name, data=data, chunks=chunk, compression='lzf')
            f.create_dataset(name, data=data, chunks=chunk, compression="lzf")

def generate_hdf(input_fname, output_basename, batch_size):
    # Save to 2 HDF5 files for demonstration.
    df = pd.read_csv(input_fname, header=None, sep='\t')
    df = pd.read_csv(input_fname, header=None, sep="\t")

    mid = len(df) // 2
    df1 = df.iloc[:mid]
@@ -88,25 +88,23 @@ def generate_hdf(input_fname, output_basename, batch_size):

    # We can store multiple datasets inside a single HDF5 file.
    # Separating X and Y for choosing best chunk size for data loading.
    fname1 = f'{output_basename}1.h5'
    fname2 = f'{output_basename}2.h5'
    save2hdf({'Y': df1.iloc[:, :1], 'X': df1.iloc[:, 1:]}, fname1, batch_size)
    save2hdf({'Y': df2.iloc[:, :1], 'X': df2.iloc[:, 1:]}, fname2, batch_size)
    fname1 = f"{output_basename}1.h5"
    fname2 = f"{output_basename}2.h5"
    save2hdf({"Y": df1.iloc[:, :1], "X": df1.iloc[:, 1:]}, fname1, batch_size)
    save2hdf({"Y": df2.iloc[:, :1], "X": df2.iloc[:, 1:]}, fname2, batch_size)

    return [fname1, fname2]

def main():
    batch_size = 64
    output_basename = 'regression'
    output_basename = "regression"
    hdf_files = generate_hdf(
        str(Path(__file__).absolute().parents[1] / 'regression' / 'regression.train'),
        output_basename,
        batch_size
        str(Path(__file__).absolute().parents[1] / "regression" / "regression.train"), output_basename, batch_size
    )

    create_dataset_from_multiple_hdf(hdf_files, batch_size=batch_size)

if __name__ == '__main__':
if __name__ == "__main__":
    main()
@@ -24,23 +24,19 @@ import lightgbm as lgb
# single continuous predictor
np.random.seed(0)
N = 1000
X = pd.DataFrame({
    'continuous': range(N),
    'categorical': np.repeat([0, 1, 2, 3, 4], N / 5)
})
X = pd.DataFrame({"continuous": range(N), "categorical": np.repeat([0, 1, 2, 3, 4], N / 5)})
CATEGORICAL_EFFECTS = [-1, -1, -2, -2, 2]
LINEAR_TERM = np.array([
    -0.5 + 0.01 * X['continuous'][k]
    + CATEGORICAL_EFFECTS[X['categorical'][k]] for k in range(X.shape[0])
]) + np.random.normal(0, 1, X.shape[0])
LINEAR_TERM = np.array(
    [-0.5 + 0.01 * X["continuous"][k] + CATEGORICAL_EFFECTS[X["categorical"][k]] for k in range(X.shape[0])]
) + np.random.normal(0, 1, X.shape[0])
TRUE_PROB = expit(LINEAR_TERM)
Y = np.random.binomial(1, TRUE_PROB, size=N)
DATA = {
    'X': X,
    'probability_labels': TRUE_PROB,
    'binary_labels': Y,
    'lgb_with_binary_labels': lgb.Dataset(X, Y),
    'lgb_with_probability_labels': lgb.Dataset(X, TRUE_PROB),
    "X": X,
    "probability_labels": TRUE_PROB,
    "binary_labels": Y,
    "lgb_with_binary_labels": lgb.Dataset(X, Y),
    "lgb_with_probability_labels": lgb.Dataset(X, TRUE_PROB),
}
@@ -72,34 +68,25 @@ def experiment(objective, label_type, data):
    np.random.seed(0)
    nrounds = 5
    lgb_data = data[f"lgb_with_{label_type}_labels"]
    params = {
        'objective': objective,
        'feature_fraction': 1,
        'bagging_fraction': 1,
        'verbose': -1
    }
    params = {"objective": objective, "feature_fraction": 1, "bagging_fraction": 1, "verbose": -1}
    time_zero = time.time()
    gbm = lgb.train(params, lgb_data, num_boost_round=nrounds)
    y_fitted = gbm.predict(data['X'])
    y_fitted = gbm.predict(data["X"])
    y_true = data[f"{label_type}_labels"]
    duration = time.time() - time_zero
    return {
        'time': duration,
        'correlation': np.corrcoef(y_fitted, y_true)[0, 1],
        'logloss': log_loss(y_fitted, y_true)
    }
    return {"time": duration, "correlation": np.corrcoef(y_fitted, y_true)[0, 1], "logloss": log_loss(y_fitted, y_true)}

#################
# Observe the behavior of `binary` and `xentropy` objectives
print('Performance of `binary` objective with binary labels:')
print(experiment('binary', label_type='binary', data=DATA))
print("Performance of `binary` objective with binary labels:")
print(experiment("binary", label_type="binary", data=DATA))

print('Performance of `xentropy` objective with binary labels:')
print(experiment('xentropy', label_type='binary', data=DATA))
print("Performance of `xentropy` objective with binary labels:")
print(experiment("xentropy", label_type="binary", data=DATA))

print('Performance of `xentropy` objective with probability labels:')
print(experiment('xentropy', label_type='probability', data=DATA))
print("Performance of `xentropy` objective with probability labels:")
print(experiment("xentropy", label_type="probability", data=DATA))

# Trying this throws an error on non-binary values of y:
# experiment('binary', label_type='probability', DATA)
@@ -109,9 +96,7 @@ print(experiment('xentropy', label_type='probability', data=DATA))
# there are reasons to suspect that `binary` should run faster when the
# label is an integer instead of a float
K = 10
A = [experiment('binary', label_type='binary', data=DATA)['time']
     for k in range(K)]
B = [experiment('xentropy', label_type='binary', data=DATA)['time']
     for k in range(K)]
A = [experiment("binary", label_type="binary", data=DATA)["time"] for k in range(K)]
B = [experiment("xentropy", label_type="binary", data=DATA)["time"] for k in range(K)]
print(f"Best `binary` time: {min(A)}")
print(f"Best `xentropy` time: {min(B)}")
File differences are hidden because one or more lines are too long.
@@ -8,13 +8,13 @@ import lightgbm as lgb
if lgb.compat.MATPLOTLIB_INSTALLED:
    import matplotlib.pyplot as plt
else:
    raise ImportError('You need to install matplotlib and restart your session for plot_example.py.')
    raise ImportError("You need to install matplotlib and restart your session for plot_example.py.")

print('Loading data...')
print("Loading data...")
# load or create your dataset
regression_example_dir = Path(__file__).absolute().parents[1] / 'regression'
df_train = pd.read_csv(str(regression_example_dir / 'regression.train'), header=None, sep='\t')
df_test = pd.read_csv(str(regression_example_dir / 'regression.test'), header=None, sep='\t')
regression_example_dir = Path(__file__).absolute().parents[1] / "regression"
df_train = pd.read_csv(str(regression_example_dir / "regression.train"), header=None, sep="\t")
df_test = pd.read_csv(str(regression_example_dir / "regression.test"), header=None, sep="\t")

y_train = df_train[0]
y_test = df_test[0]
@@ -26,45 +26,38 @@ lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)

# specify your configurations as a dict
params = {
    'num_leaves': 5,
    'metric': ('l1', 'l2'),
    'verbose': 0
}
params = {"num_leaves": 5, "metric": ("l1", "l2"), "verbose": 0}

evals_result = {}  # to record eval results for plotting

print('Starting training...')
print("Starting training...")
# train
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_train, lgb_test],
    feature_name=[f'f{i + 1}' for i in range(X_train.shape[-1])],
    feature_name=[f"f{i + 1}" for i in range(X_train.shape[-1])],
    categorical_feature=[21],
    callbacks=[
        lgb.log_evaluation(10),
        lgb.record_evaluation(evals_result)
    ]
    callbacks=[lgb.log_evaluation(10), lgb.record_evaluation(evals_result)],
)

print('Plotting metrics recorded during training...')
ax = lgb.plot_metric(evals_result, metric='l1')
print("Plotting metrics recorded during training...")
ax = lgb.plot_metric(evals_result, metric="l1")
plt.show()

print('Plotting feature importances...')
print("Plotting feature importances...")
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()

print('Plotting split value histogram...')
ax = lgb.plot_split_value_histogram(gbm, feature='f26', bins='auto')
print("Plotting split value histogram...")
ax = lgb.plot_split_value_histogram(gbm, feature="f26", bins="auto")
plt.show()

print('Plotting 54th tree...')  # one tree use categorical feature to split
ax = lgb.plot_tree(gbm, tree_index=53, figsize=(15, 15), show_info=['split_gain'])
print("Plotting 54th tree...")  # one tree use categorical feature to split
ax = lgb.plot_tree(gbm, tree_index=53, figsize=(15, 15), show_info=["split_gain"])
plt.show()

print('Plotting 54th tree with graphviz...')
graph = lgb.create_tree_digraph(gbm, tree_index=53, name='Tree54')
print("Plotting 54th tree with graphviz...")
graph = lgb.create_tree_digraph(gbm, tree_index=53, name="Tree54")
graph.render(view=True)
@@ -6,11 +6,11 @@ from sklearn.metrics import mean_squared_error

import lightgbm as lgb

print('Loading data...')
print("Loading data...")
# load or create your dataset
regression_example_dir = Path(__file__).absolute().parents[1] / 'regression'
df_train = pd.read_csv(str(regression_example_dir / 'regression.train'), header=None, sep='\t')
df_test = pd.read_csv(str(regression_example_dir / 'regression.test'), header=None, sep='\t')
regression_example_dir = Path(__file__).absolute().parents[1] / "regression"
df_train = pd.read_csv(str(regression_example_dir / "regression.train"), header=None, sep="\t")
df_test = pd.read_csv(str(regression_example_dir / "regression.test"), header=None, sep="\t")

y_train = df_train[0]
y_test = df_test[0]
@@ -23,32 +23,30 @@ lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
    "boosting_type": "gbdt",
    "objective": "regression",
    "metric": {"l2", "l1"},
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbose": 0,
}

print('Starting training...')
print("Starting training...")
# train
gbm = lgb.train(params,
    lgb_train,
    num_boost_round=20,
    valid_sets=lgb_eval,
    callbacks=[lgb.early_stopping(stopping_rounds=5)])
gbm = lgb.train(
    params, lgb_train, num_boost_round=20, valid_sets=lgb_eval, callbacks=[lgb.early_stopping(stopping_rounds=5)]
)

print('Saving model...')
print("Saving model...")
# save model to file
gbm.save_model('model.txt')
gbm.save_model("model.txt")

print('Starting predicting...')
print("Starting predicting...")
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
rmse_test = mean_squared_error(y_test, y_pred) ** 0.5
print(f'The RMSE of prediction is: {rmse_test}')
print(f"The RMSE of prediction is: {rmse_test}")
@@ -8,85 +8,71 @@ from sklearn.model_selection import GridSearchCV

import lightgbm as lgb

print('Loading data...')
print("Loading data...")
# load or create your dataset
regression_example_dir = Path(__file__).absolute().parents[1] / 'regression'
df_train = pd.read_csv(str(regression_example_dir / 'regression.train'), header=None, sep='\t')
df_test = pd.read_csv(str(regression_example_dir / 'regression.test'), header=None, sep='\t')
regression_example_dir = Path(__file__).absolute().parents[1] / "regression"
df_train = pd.read_csv(str(regression_example_dir / "regression.train"), header=None, sep="\t")
df_test = pd.read_csv(str(regression_example_dir / "regression.test"), header=None, sep="\t")

y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

print('Starting training...')
print("Starting training...")
# train
gbm = lgb.LGBMRegressor(num_leaves=31,
    learning_rate=0.05,
    n_estimators=20)
gbm.fit(X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='l1',
    callbacks=[lgb.early_stopping(5)])
gbm = lgb.LGBMRegressor(num_leaves=31, learning_rate=0.05, n_estimators=20)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric="l1", callbacks=[lgb.early_stopping(5)])

print('Starting predicting...')
print("Starting predicting...")
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
rmse_test = mean_squared_error(y_test, y_pred) ** 0.5
print(f'The RMSE of prediction is: {rmse_test}')
print(f"The RMSE of prediction is: {rmse_test}")

# feature importances
print(f'Feature importances: {list(gbm.feature_importances_)}')
print(f"Feature importances: {list(gbm.feature_importances_)}")

# self-defined eval metric
# f(y_true: array, y_pred: array) -> name: str, eval_result: float, is_higher_better: bool
# Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False
    return "RMSLE", np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False

print('Starting training with custom eval function...')
print("Starting training with custom eval function...")
# train
gbm.fit(X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric=rmsle,
    callbacks=[lgb.early_stopping(5)])
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric=rmsle, callbacks=[lgb.early_stopping(5)])

# another self-defined eval metric
# f(y_true: array, y_pred: array) -> name: str, eval_result: float, is_higher_better: bool
# Relative Absolute Error (RAE)
def rae(y_true, y_pred):
    return 'RAE', np.sum(np.abs(y_pred - y_true)) / np.sum(np.abs(np.mean(y_true) - y_true)), False
    return "RAE", np.sum(np.abs(y_pred - y_true)) / np.sum(np.abs(np.mean(y_true) - y_true)), False

print('Starting training with multiple custom eval functions...')
print("Starting training with multiple custom eval functions...")
# train
gbm.fit(X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric=[rmsle, rae],
    callbacks=[lgb.early_stopping(5)])
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric=[rmsle, rae], callbacks=[lgb.early_stopping(5)])

print('Starting predicting...')
print("Starting predicting...")
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
rmsle_test = rmsle(y_test, y_pred)[1]
rae_test = rae(y_test, y_pred)[1]
print(f'The RMSLE of prediction is: {rmsle_test}')
print(f'The RAE of prediction is: {rae_test}')
print(f"The RMSLE of prediction is: {rmsle_test}")
print(f"The RAE of prediction is: {rae_test}")

# other scikit-learn modules
estimator = lgb.LGBMRegressor(num_leaves=31)

param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40]
}
param_grid = {"learning_rate": [0.01, 0.1, 1], "n_estimators": [20, 40]}

gbm = GridSearchCV(estimator, param_grid, cv=3)
gbm.fit(X_train, y_train)

print(f'Best parameters found by grid search are: {gbm.best_params_}')
print(f"Best parameters found by grid search are: {gbm.best_params_}")
@@ -18,9 +18,23 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional,
import numpy as np
import scipy.sparse

from .compat import (PANDAS_INSTALLED, PYARROW_INSTALLED, arrow_cffi, arrow_is_floating, arrow_is_integer, concat,
    dt_DataTable, pa_Array, pa_chunked_array, pa_ChunkedArray, pa_compute, pa_Table,
    pd_CategoricalDtype, pd_DataFrame, pd_Series)
from .compat import (
    PANDAS_INSTALLED,
    PYARROW_INSTALLED,
    arrow_cffi,
    arrow_is_floating,
    arrow_is_integer,
    concat,
    dt_DataTable,
    pa_Array,
    pa_chunked_array,
    pa_ChunkedArray,
    pa_compute,
    pa_Table,
    pd_CategoricalDtype,
    pd_DataFrame,
    pd_Series,
)
from .libpath import find_lib_path

if TYPE_CHECKING:
@@ -5,8 +5,14 @@ from dataclasses import dataclass
from functools import partial
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union

from .basic import (Booster, _ConfigAliases, _LGBM_BoosterEvalMethodResultType,
    _LGBM_BoosterEvalMethodResultWithStandardDeviationType, _log_info, _log_warning)
from .basic import (
    Booster,
    _ConfigAliases,
    _LGBM_BoosterEvalMethodResultType,
    _LGBM_BoosterEvalMethodResultWithStandardDeviationType,
    _log_info,
    _log_warning,
)

if TYPE_CHECKING:
    from .engine import CVBooster
@@ -19,12 +19,36 @@ import numpy as np
import scipy.sparse as ss

from .basic import LightGBMError, _choose_param_value, _ConfigAliases, _log_info, _log_warning
from .compat import (DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED, Client, Future, LGBMNotFittedError, concat,
    dask_Array, dask_array_from_delayed, dask_bag_from_delayed, dask_DataFrame, dask_Series,
    default_client, delayed, pd_DataFrame, pd_Series, wait)
from .sklearn import (LGBMClassifier, LGBMModel, LGBMRanker, LGBMRegressor, _LGBM_ScikitCustomObjectiveFunction,
    _LGBM_ScikitEvalMetricType, _lgbmmodel_doc_custom_eval_note, _lgbmmodel_doc_fit,
    _lgbmmodel_doc_predict)
from .compat import (
    DASK_INSTALLED,
    PANDAS_INSTALLED,
    SKLEARN_INSTALLED,
    Client,
    Future,
    LGBMNotFittedError,
    concat,
    dask_Array,
    dask_array_from_delayed,
    dask_bag_from_delayed,
    dask_DataFrame,
    dask_Series,
    default_client,
    delayed,
    pd_DataFrame,
    pd_Series,
    wait,
)
from .sklearn import (
    LGBMClassifier,
    LGBMModel,
    LGBMRanker,
    LGBMRegressor,
    _LGBM_ScikitCustomObjectiveFunction,
    _LGBM_ScikitEvalMetricType,
    _lgbmmodel_doc_custom_eval_note,
    _lgbmmodel_doc_fit,
    _lgbmmodel_doc_predict,
)

__all__ = [
    'DaskLGBMClassifier',
@@ -10,10 +10,21 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
import numpy as np

from . import callback
from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _InnerPredictor,
    _LGBM_BoosterEvalMethodResultType, _LGBM_BoosterEvalMethodResultWithStandardDeviationType,
    _LGBM_CategoricalFeatureConfiguration, _LGBM_CustomObjectiveFunction, _LGBM_EvalFunctionResultType,
    _LGBM_FeatureNameConfiguration, _log_warning)
from .basic import (
    Booster,
    Dataset,
    LightGBMError,
    _choose_param_value,
    _ConfigAliases,
    _InnerPredictor,
    _LGBM_BoosterEvalMethodResultType,
    _LGBM_BoosterEvalMethodResultWithStandardDeviationType,
    _LGBM_CategoricalFeatureConfiguration,
    _LGBM_CustomObjectiveFunction,
    _LGBM_EvalFunctionResultType,
    _LGBM_FeatureNameConfiguration,
    _log_warning,
)
from .compat import SKLEARN_INSTALLED, _LGBMBaseCrossValidator, _LGBMGroupKFold, _LGBMStratifiedKFold

__all__ = [
@@ -8,14 +8,41 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import numpy as np
import scipy.sparse

from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _LGBM_BoosterBestScoreType,
    _LGBM_CategoricalFeatureConfiguration, _LGBM_EvalFunctionResultType, _LGBM_FeatureNameConfiguration,
    _LGBM_GroupType, _LGBM_InitScoreType, _LGBM_LabelType, _LGBM_WeightType, _log_warning)
from .basic import (
    Booster,
    Dataset,
    LightGBMError,
    _choose_param_value,
    _ConfigAliases,
    _LGBM_BoosterBestScoreType,
    _LGBM_CategoricalFeatureConfiguration,
    _LGBM_EvalFunctionResultType,
    _LGBM_FeatureNameConfiguration,
    _LGBM_GroupType,
    _LGBM_InitScoreType,
    _LGBM_LabelType,
    _LGBM_WeightType,
    _log_warning,
)
from .callback import _EvalResultDict, record_evaluation
from .compat import (SKLEARN_INSTALLED, LGBMNotFittedError, _LGBMAssertAllFinite, _LGBMCheckArray,
    _LGBMCheckClassificationTargets, _LGBMCheckSampleWeight, _LGBMCheckXY, _LGBMClassifierBase,
    _LGBMComputeSampleWeight, _LGBMCpuCount, _LGBMLabelEncoder, _LGBMModelBase, _LGBMRegressorBase,
    dt_DataTable, np_random_Generator, pd_DataFrame)
from .compat import (
    SKLEARN_INSTALLED,
    LGBMNotFittedError,
    _LGBMAssertAllFinite,
    _LGBMCheckArray,
    _LGBMCheckClassificationTargets,
    _LGBMCheckSampleWeight,
    _LGBMCheckXY,
    _LGBMClassifierBase,
    _LGBMComputeSampleWeight,
    _LGBMCpuCount,
    _LGBMLabelEncoder,
    _LGBMModelBase,
    _LGBMRegressorBase,
    dt_DataTable,
    np_random_Generator,
    pd_DataFrame,
)
from .engine import train

__all__ = [
@@ -81,10 +81,14 @@ minimum-version = "0.4.4"
# end:build-system

[tool.isort]
include_trailing_comma = true
line_length = 120
# "vertical hanging indent", to match what ruff-format does
# ref: https://pycqa.github.io/isort/docs/configuration/multi_line_output_modes.html#3-vertical-hanging-indent
multi_line_output = 3
skip_glob = [
    "*/external_libs/*",
    "*/lightgbm-python/*"
    "*/lightgbm-python/*",
]

[tool.mypy]
@@ -108,14 +112,13 @@ docstring-code-format = false
exclude = [
    "build/*.py",
    "compile/*.py",
    "examples/*.py",
    "external_libs/*.py",
    "lightgbm-python/*.py",
    "python-package/*.py",
    "tests/*.py"
]
indent-style = "space"
quote-style = "double"
skip-magic-trailing-comma = false

[tool.ruff.lint]
ignore = [
@@ -10,7 +10,7 @@ try:
    from lightgbm.basic import _LIB as LIB
except ModuleNotFoundError:
    print("Could not import lightgbm Python package, looking for lib_lightgbm at the repo root")
    if system() in ('Windows', 'Microsoft'):
    if system() in ("Windows", "Microsoft"):
        lib_file = Path(__file__).absolute().parents[2] / "Release" / "lib_lightgbm.dll"
    else:
        lib_file = Path(__file__).absolute().parents[2] / "lib_lightgbm.so"
@@ -25,7 +25,7 @@ dtype_int64 = 3

def c_str(string):
    return ctypes.c_char_p(string.encode('utf-8'))
    return ctypes.c_char_p(string.encode("utf-8"))

def load_from_file(filename, reference):
@@ -33,17 +33,13 @@ def load_from_file(filename, reference):
    if reference is not None:
        ref = reference
    handle = ctypes.c_void_p()
    LIB.LGBM_DatasetCreateFromFile(
        c_str(str(filename)),
        c_str('max_bin=15'),
        ref,
        ctypes.byref(handle))
    LIB.LGBM_DatasetCreateFromFile(c_str(str(filename)), c_str("max_bin=15"), ref, ctypes.byref(handle))
    print(LIB.LGBM_GetLastError())
    num_data = ctypes.c_int(0)
    LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
    num_feature = ctypes.c_int(0)
    LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
    print(f'#data: {num_data.value} #feature: {num_feature.value}')
    print(f"#data: {num_data.value} #feature: {num_feature.value}")
    return handle
@@ -69,20 +65,22 @@ def load_from_csr(filename, reference):
        ctypes.c_int64(len(csr.indptr)),
        ctypes.c_int64(len(csr.data)),
        ctypes.c_int64(csr.shape[1]),
        c_str('max_bin=15'),
        c_str("max_bin=15"),
        ref,
        ctypes.byref(handle))
        ctypes.byref(handle),
    )
    num_data = ctypes.c_int(0)
    LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
    num_feature = ctypes.c_int(0)
    LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
    LIB.LGBM_DatasetSetField(
        handle,
        c_str('label'),
        c_str("label"),
        label.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        ctypes.c_int(len(label)),
        ctypes.c_int(dtype_float32))
    print(f'#data: {num_data.value} #feature: {num_feature.value}')
        ctypes.c_int(dtype_float32),
    )
    print(f"#data: {num_data.value} #feature: {num_feature.value}")
    return handle
@@ -104,20 +102,22 @@ def load_from_csc(filename, reference):
        ctypes.c_int64(len(csc.indptr)),
        ctypes.c_int64(len(csc.data)),
        ctypes.c_int64(csc.shape[0]),
        c_str('max_bin=15'),
        c_str("max_bin=15"),
        ref,
        ctypes.byref(handle))
        ctypes.byref(handle),
    )
    num_data = ctypes.c_int(0)
    LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
    num_feature = ctypes.c_int(0)
    LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
    LIB.LGBM_DatasetSetField(
        handle,
        c_str('label'),
        c_str("label"),
        label.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        ctypes.c_int(len(label)),
        ctypes.c_int(dtype_float32))
    print(f'#data: {num_data.value} #feature: {num_feature.value}')
        ctypes.c_int(dtype_float32),
    )
    print(f"#data: {num_data.value} #feature: {num_feature.value}")
    return handle
@@ -137,20 +137,22 @@ def load_from_mat(filename, reference):
        ctypes.c_int32(mat.shape[0]),
        ctypes.c_int32(mat.shape[1]),
        ctypes.c_int(1),
        c_str('max_bin=15'),
        c_str("max_bin=15"),
        ref,
        ctypes.byref(handle))
        ctypes.byref(handle),
    )
    num_data = ctypes.c_int(0)
    LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
    num_feature = ctypes.c_int(0)
    LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
    LIB.LGBM_DatasetSetField(
        handle,
        c_str('label'),
        c_str("label"),
        label.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        ctypes.c_int(len(label)),
        ctypes.c_int(dtype_float32))
    print(f'#data: {num_data.value} #feature: {num_feature.value}')
        ctypes.c_int(dtype_float32),
    )
    print(f"#data: {num_data.value} #feature: {num_feature.value}")
    return handle
@@ -159,29 +161,26 @@ def free_dataset(handle):

def test_dataset():
    binary_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'binary_classification'
    train = load_from_file(binary_example_dir / 'binary.train', None)
    test = load_from_mat(binary_example_dir / 'binary.test', train)
    binary_example_dir = Path(__file__).absolute().parents[2] / "examples" / "binary_classification"
    train = load_from_file(binary_example_dir / "binary.train", None)
    test = load_from_mat(binary_example_dir / "binary.test", train)
    free_dataset(test)
    test = load_from_csr(binary_example_dir / 'binary.test', train)
    test = load_from_csr(binary_example_dir / "binary.test", train)
    free_dataset(test)
    test = load_from_csc(binary_example_dir / 'binary.test', train)
    test = load_from_csc(binary_example_dir / "binary.test", train)
    free_dataset(test)
    save_to_binary(train, 'train.binary.bin')
    save_to_binary(train, "train.binary.bin")
    free_dataset(train)
    train = load_from_file('train.binary.bin', None)
    train = load_from_file("train.binary.bin", None)
    free_dataset(train)

def test_booster():
    binary_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'binary_classification'
    train = load_from_mat(binary_example_dir / 'binary.train', None)
    test = load_from_mat(binary_example_dir / 'binary.test', train)
    binary_example_dir = Path(__file__).absolute().parents[2] / "examples" / "binary_classification"
    train = load_from_mat(binary_example_dir / "binary.train", None)
    test = load_from_mat(binary_example_dir / "binary.test", train)
    booster = ctypes.c_void_p()
    LIB.LGBM_BoosterCreate(
        train,
        c_str("app=binary metric=auc num_leaves=31 verbose=0"),
        ctypes.byref(booster))
    LIB.LGBM_BoosterCreate(train, c_str("app=binary metric=auc num_leaves=31 verbose=0"), ctypes.byref(booster))
    LIB.LGBM_BoosterAddValidData(booster, test)
    is_finished = ctypes.c_int(0)
    for i in range(1, 51):
@@ -189,28 +188,18 @@ def test_booster():
        result = np.array([0.0], dtype=np.float64)
        out_len = ctypes.c_int(0)
        LIB.LGBM_BoosterGetEval(
            booster,
            ctypes.c_int(0),
            ctypes.byref(out_len),
            result.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
            booster, ctypes.c_int(0), ctypes.byref(out_len), result.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
        )
        if i % 10 == 0:
            print(f'{i} iteration test AUC {result[0]:.6f}')
    LIB.LGBM_BoosterSaveModel(
        booster,
        ctypes.c_int(0),
        ctypes.c_int(-1),
        ctypes.c_int(0),
        c_str('model.txt'))
            print(f"{i} iteration test AUC {result[0]:.6f}")
    LIB.LGBM_BoosterSaveModel(booster, ctypes.c_int(0), ctypes.c_int(-1), ctypes.c_int(0), c_str("model.txt"))
    LIB.LGBM_BoosterFree(booster)
    free_dataset(train)
    free_dataset(test)
    booster2 = ctypes.c_void_p()
    num_total_model = ctypes.c_int(0)
    LIB.LGBM_BoosterCreateFromModelfile(
        c_str('model.txt'),
        ctypes.byref(num_total_model),
        ctypes.byref(booster2))
    data = np.loadtxt(str(binary_example_dir / 'binary.test'), dtype=np.float64)
    LIB.LGBM_BoosterCreateFromModelfile(c_str("model.txt"), ctypes.byref(num_total_model), ctypes.byref(booster2))
    data = np.loadtxt(str(binary_example_dir / "binary.test"), dtype=np.float64)
    mat = data[:, 1:]
    preb = np.empty(mat.shape[0], dtype=np.float64)
    num_preb = ctypes.c_int64(0)
@@ -225,58 +214,51 @@ def test_booster():
        ctypes.c_int(1),
        ctypes.c_int(0),
        ctypes.c_int(25),
        c_str(''),
        c_str(""),
        ctypes.byref(num_preb),
        preb.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
        preb.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
    )
    LIB.LGBM_BoosterPredictForFile(
        booster2,
        c_str(str(binary_example_dir / 'binary.test')),
        c_str(str(binary_example_dir / "binary.test")),
        ctypes.c_int(0),
        ctypes.c_int(0),
        ctypes.c_int(0),
        ctypes.c_int(25),
        c_str(''),
        c_str('preb.txt'))
        c_str(""),
        c_str("preb.txt"),
    )
    LIB.LGBM_BoosterPredictForFile(
        booster2,
        c_str(str(binary_example_dir / 'binary.test')),
        c_str(str(binary_example_dir / "binary.test")),
        ctypes.c_int(0),
        ctypes.c_int(0),
        ctypes.c_int(10),
        ctypes.c_int(25),
        c_str(''),
        c_str('preb.txt'))
        c_str(""),
        c_str("preb.txt"),
    )
    LIB.LGBM_BoosterFree(booster2)

def test_max_thread_control():
    # at initialization, should be -1
    num_threads = ctypes.c_int(0)
    ret = LIB.LGBM_GetMaxThreads(
        ctypes.byref(num_threads)
    )
    ret = LIB.LGBM_GetMaxThreads(ctypes.byref(num_threads))
    assert ret == 0
    assert num_threads.value == -1

    # updating that value through the C API should work
    ret = LIB.LGBM_SetMaxThreads(
        ctypes.c_int(6)
    )
    ret = LIB.LGBM_SetMaxThreads(ctypes.c_int(6))
    assert ret == 0

    ret = LIB.LGBM_GetMaxThreads(
        ctypes.byref(num_threads)
    )
    ret = LIB.LGBM_GetMaxThreads(ctypes.byref(num_threads))
    assert ret == 0
    assert num_threads.value == 6

    # resetting to any negative number should set it to -1
    ret = LIB.LGBM_SetMaxThreads(
        ctypes.c_int(-123)
    )
    ret = LIB.LGBM_SetMaxThreads(ctypes.c_int(-123))
    assert ret == 0
    ret = LIB.LGBM_GetMaxThreads(
        ctypes.byref(num_threads)
    )
    ret = LIB.LGBM_GetMaxThreads(ctypes.byref(num_threads))
    assert ret == 0
    assert num_threads.value == -1
@@ -3,5 +3,5 @@ from pathlib import Path

import numpy as np

preds = [np.loadtxt(str(name)) for name in Path(__file__).absolute().parent.glob('*.pred')]
preds = [np.loadtxt(str(name)) for name in Path(__file__).absolute().parent.glob("*.pred")]
np.testing.assert_allclose(preds[0], preds[1])
@@ -14,16 +14,16 @@ from sklearn.metrics import accuracy_score
TESTS_DIR = Path(__file__).absolute().parent

@pytest.fixture(scope='module')
@pytest.fixture(scope="module")
def executable(pytestconfig) -> str:
    """Returns the path to the lightgbm executable."""
    return pytestconfig.getoption('execfile')
    return pytestconfig.getoption("execfile")

def _find_random_open_port() -> int:
    """Find a random open port on localhost."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(('', 0))
        s.bind(("", 0))
        port = s.getsockname()[1]
    return port  # noqa: RET504
@@ -34,7 +34,7 @@ def _generate_n_ports(n: int) -> Generator[int, None, None]:

def _write_dict(d: Dict, file: io.TextIOWrapper) -> None:
    for k, v in d.items():
        file.write(f'{k} = {v}\n')
        file.write(f"{k} = {v}\n")

def create_data(task: str, n_samples: int = 1_000) -> np.ndarray:
@@ -42,10 +42,10 @@ def create_data(task: str, n_samples: int = 1_000) -> np.ndarray:

    The data is returned as a numpy array with the label as the first column.
    """
    if task == 'binary-classification':
    if task == "binary-classification":
        centers = [[-4, -4], [4, 4]]
        X, y = make_blobs(n_samples, centers=centers, random_state=42)
    elif task == 'regression':
    elif task == "regression":
        X, y = make_regression(n_samples, n_features=4, n_informative=2, random_state=42)
    return np.hstack([y.reshape(-1, 1), X])
@@ -54,22 +54,22 @@ class DistributedMockup:
    """Simulate distributed training."""

    default_train_config = {
        'task': 'train',
        'pre_partition': True,
        'machine_list_file': TESTS_DIR / 'mlist.txt',
        'tree_learner': 'data',
        'force_row_wise': True,
        'verbose': 0,
        'num_boost_round': 20,
        'num_leaves': 15,
        'num_threads': 2,
        "task": "train",
        "pre_partition": True,
        "machine_list_file": TESTS_DIR / "mlist.txt",
        "tree_learner": "data",
        "force_row_wise": True,
        "verbose": 0,
        "num_boost_round": 20,
        "num_leaves": 15,
        "num_threads": 2,
    }

    default_predict_config = {
        'task': 'predict',
        'data': TESTS_DIR / 'train.txt',
        'input_model': TESTS_DIR / 'model0.txt',
        'output_result': TESTS_DIR / 'predictions.txt',
        "task": "predict",
        "data": TESTS_DIR / "train.txt",
        "input_model": TESTS_DIR / "model0.txt",
        "output_result": TESTS_DIR / "predictions.txt",
    }

    def __init__(self, executable: str):
@@ -77,8 +77,8 @@ class DistributedMockup:

    def worker_train(self, i: int) -> subprocess.CompletedProcess:
        """Start the training process on the `i`-th worker."""
        config_path = TESTS_DIR / f'train{i}.conf'
        cmd = [self.executable, f'config={config_path}']
        config_path = TESTS_DIR / f"train{i}.conf"
        cmd = [self.executable, f"config={config_path}"]
        return subprocess.run(cmd)

    def _set_ports(self) -> None:
@@ -92,18 +92,18 @@ class DistributedMockup:
            ports.update(candidates)
            i += 1
            if i == max_tries:
                raise RuntimeError('Unable to find non-colliding ports.')
                raise RuntimeError("Unable to find non-colliding ports.")
        self.listen_ports = list(ports)
        with open(TESTS_DIR / 'mlist.txt', 'wt') as f:
        with open(TESTS_DIR / "mlist.txt", "wt") as f:
            for port in self.listen_ports:
                f.write(f'127.0.0.1 {port}\n')
                f.write(f"127.0.0.1 {port}\n")

    def _write_data(self, partitions: List[np.ndarray]) -> None:
        """Write all training data as train.txt and each training partition as train{i}.txt."""
        all_data = np.vstack(partitions)
        np.savetxt(str(TESTS_DIR / 'train.txt'), all_data, delimiter=',')
        np.savetxt(str(TESTS_DIR / "train.txt"), all_data, delimiter=",")
        for i, partition in enumerate(partitions):
            np.savetxt(str(TESTS_DIR / f'train{i}.txt'), partition, delimiter=',')
            np.savetxt(str(TESTS_DIR / f"train{i}.txt"), partition, delimiter=",")

    def fit(self, partitions: List[np.ndarray], train_config: Dict) -> None:
        """Run the distributed training process on a single machine.
@@ -118,7 +118,7 @@ class DistributedMockup:
        """
        self.train_config = copy.deepcopy(self.default_train_config)
        self.train_config.update(train_config)
        self.n_workers = self.train_config['num_machines']
        self.n_workers = self.train_config["num_machines"]
        self._set_ports()
        self._write_data(partitions)
        self.label_ = np.hstack([partition[:, 0] for partition in partitions])
@@ -131,7 +131,7 @@ class DistributedMockup:
        results = [f.result() for f in futures]
        for result in results:
            if result.returncode != 0:
                raise RuntimeError('Error in training')
                raise RuntimeError("Error in training")

    def predict(self, predict_config: Dict[str, Any]) -> np.ndarray:
        """Compute the predictions using the model created in the fit step.
@@ -141,14 +141,14 @@ class DistributedMockup:
        """
        self.predict_config = copy.deepcopy(self.default_predict_config)
        self.predict_config.update(predict_config)
        config_path = TESTS_DIR / 'predict.conf'
        with open(config_path, 'wt') as file:
        config_path = TESTS_DIR / "predict.conf"
        with open(config_path, "wt") as file:
            _write_dict(self.predict_config, file)
        cmd = [self.executable, f'config={config_path}']
        cmd = [self.executable, f"config={config_path}"]
        result = subprocess.run(cmd)
        if result.returncode != 0:
            raise RuntimeError('Error in prediction')
        return np.loadtxt(str(TESTS_DIR / 'predictions.txt'))
            raise RuntimeError("Error in prediction")
        return np.loadtxt(str(TESTS_DIR / "predictions.txt"))

    def write_train_config(self, i: int) -> None:
        """Create a file train{i}.conf with the required configuration to train.
@@ -156,41 +156,41 @@ class DistributedMockup:
        Each worker gets a different port and piece of the data, the rest are the
        model parameters contained in `self.config`.
        """
        with open(TESTS_DIR / f'train{i}.conf', 'wt') as file:
            output_model = TESTS_DIR / f'model{i}.txt'
            data = TESTS_DIR / f'train{i}.txt'
            file.write(f'output_model = {output_model}\n')
            file.write(f'local_listen_port = {self.listen_ports[i]}\n')
            file.write(f'data = {data}\n')
        with open(TESTS_DIR / f"train{i}.conf", "wt") as file:
            output_model = TESTS_DIR / f"model{i}.txt"
            data = TESTS_DIR / f"train{i}.txt"
            file.write(f"output_model = {output_model}\n")
            file.write(f"local_listen_port = {self.listen_ports[i]}\n")
            file.write(f"data = {data}\n")
            _write_dict(self.train_config, file)

def test_classifier(executable):
    """Test the classification task."""
    num_machines = 2
    data = create_data(task='binary-classification')
    data = create_data(task="binary-classification")
    partitions = np.array_split(data, num_machines)
    train_params = {
        'objective': 'binary',
        'num_machines': num_machines,
        "objective": "binary",
        "num_machines": num_machines,
    }
    clf = DistributedMockup(executable)
    clf.fit(partitions, train_params)
    y_probas = clf.predict(predict_config={})
    y_pred = y_probas > 0.5
    assert accuracy_score(clf.label_, y_pred) == 1.
    assert accuracy_score(clf.label_, y_pred) == 1.0

def test_regressor(executable):
    """Test the regression task."""
    num_machines = 2
    data = create_data(task='regression')
    data = create_data(task="regression")
    partitions = np.array_split(data, num_machines)
    train_params = {
        'objective': 'regression',
        'num_machines': num_machines,
        "objective": "regression",
        "num_machines": num_machines,
    }
    reg = DistributedMockup(executable)
    reg.fit(partitions, train_params)
    y_pred = reg.predict(predict_config={})
    np.testing.assert_allclose(y_pred, reg.label_, rtol=0.2, atol=50.)
    np.testing.assert_allclose(y_pred, reg.label_, rtol=0.2, atol=50.0)
@@ -1,7 +1,7 @@
|
|||
from pathlib import Path
|
||||
|
||||
default_exec_file = Path(__file__).absolute().parents[2] / 'lightgbm'
|
||||
default_exec_file = Path(__file__).absolute().parents[2] / "lightgbm"
|
||||
|
||||
|
||||
def pytest_addoption(parser):
|
||||
parser.addoption('--execfile', action='store', default=str(default_exec_file))
|
||||
parser.addoption("--execfile", action="store", default=str(default_exec_file))
|
||||
|
|
|
@@ -71,9 +71,7 @@ def generate_random_arrow_table(
|
|||
values: Optional[np.ndarray] = None,
|
||||
) -> pa.Table:
|
||||
columns = [
|
||||
generate_random_arrow_array(
|
||||
num_datapoints, seed + i, generate_nulls=generate_nulls, values=values
|
||||
)
|
||||
generate_random_arrow_array(num_datapoints, seed + i, generate_nulls=generate_nulls, values=values)
|
||||
for i in range(num_columns)
|
||||
]
|
||||
names = [f"col_{i}" for i in range(num_columns)]
|
||||
|
@@ -156,9 +154,7 @@ def test_dataset_construct_fields_fuzzy():
|
|||
arrow_weights = generate_random_arrow_array(1000, 42, generate_nulls=False)
|
||||
arrow_groups = pa.chunked_array([[300, 400, 50], [250]], type=pa.int32())
|
||||
|
||||
arrow_dataset = lgb.Dataset(
|
||||
arrow_table, label=arrow_labels, weight=arrow_weights, group=arrow_groups
|
||||
)
|
||||
arrow_dataset = lgb.Dataset(arrow_table, label=arrow_labels, weight=arrow_weights, group=arrow_groups)
|
||||
arrow_dataset.construct()
|
||||
|
||||
pandas_dataset = lgb.Dataset(
|
||||
|
@@ -171,9 +167,7 @@ def test_dataset_construct_fields_fuzzy():
|
|||
|
||||
# Check for equality
|
||||
for field in ("label", "weight", "group"):
|
||||
np_assert_array_equal(
|
||||
arrow_dataset.get_field(field), pandas_dataset.get_field(field), strict=True
|
||||
)
|
||||
np_assert_array_equal(arrow_dataset.get_field(field), pandas_dataset.get_field(field), strict=True)
|
||||
np_assert_array_equal(arrow_dataset.get_label(), pandas_dataset.get_label(), strict=True)
|
||||
np_assert_array_equal(arrow_dataset.get_weight(), pandas_dataset.get_weight(), strict=True)
|
||||
|
||||
|
@@ -269,9 +263,7 @@ def test_dataset_construct_groups(array_type, group_data, arrow_type):
|
|||
],
|
||||
)
|
||||
@pytest.mark.parametrize("arrow_type", _INTEGER_TYPES + _FLOAT_TYPES)
|
||||
def test_dataset_construct_init_scores_array(
|
||||
array_type: Any, init_score_data: Any, arrow_type: Any
|
||||
):
|
||||
def test_dataset_construct_init_scores_array(array_type: Any, init_score_data: Any, arrow_type: Any):
|
||||
data = generate_dummy_arrow_table()
|
||||
init_scores = array_type(init_score_data, type=arrow_type)
|
||||
dataset = lgb.Dataset(data, init_score=init_scores, params=dummy_dataset_params())
|
||||
|
@@ -320,9 +312,7 @@ def assert_equal_predict_arrow_pandas(booster: lgb.Booster, data: pa.Table):
|
|||
np_assert_array_equal(p_pred_contrib_arrow, p_pred_contrib_pandas, strict=True)
|
||||
|
||||
p_first_iter_arrow = booster.predict(data, start_iteration=0, num_iteration=1, raw_score=True)
|
||||
p_first_iter_pandas = booster.predict(
|
||||
data.to_pandas(), start_iteration=0, num_iteration=1, raw_score=True
|
||||
)
|
||||
p_first_iter_pandas = booster.predict(data.to_pandas(), start_iteration=0, num_iteration=1, raw_score=True)
|
||||
np_assert_array_equal(p_first_iter_arrow, p_first_iter_pandas, strict=True)
|
||||
|
||||
|
||||
|
|
|
@@ -19,8 +19,9 @@ from .utils import dummy_obj, load_breast_cancer, mse_obj, np_assert_array_equal
|
|||
|
||||
|
||||
def test_basic(tmp_path):
|
||||
X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True),
|
||||
test_size=0.1, random_state=2)
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2
|
||||
)
|
||||
feature_names = [f"Column_{i}" for i in range(X_train.shape[1])]
|
||||
feature_names[1] = "a" * 1000 # set one name to a value longer than default buffer size
|
||||
train_data = lgb.Dataset(X_train, label=y_train, feature_name=feature_names)
|
||||
|
@@ -34,7 +35,7 @@ def test_basic(tmp_path):
|
|||
"verbose": -1,
|
||||
"num_threads": 1,
|
||||
"max_bin": 255,
|
||||
"gpu_use_dp": True
|
||||
"gpu_use_dp": True,
|
||||
}
|
||||
bst = lgb.Booster(params, train_data)
|
||||
bst.add_valid(valid_data, "valid_1")
|
||||
|
@@ -49,7 +50,7 @@ def test_basic(tmp_path):
|
|||
assert bst.current_iteration() == 20
|
||||
assert bst.num_trees() == 20
|
||||
assert bst.num_model_per_iteration() == 1
|
||||
if getenv('TASK', '') != 'cuda':
|
||||
if getenv("TASK", "") != "cuda":
|
||||
assert bst.lower_bound() == pytest.approx(-2.9040190126976606)
|
||||
assert bst.upper_bound() == pytest.approx(3.3182142872462883)
|
||||
|
||||
|
@@ -79,20 +80,19 @@ def test_basic(tmp_path):
|
|||
# test that shape is checked during prediction
|
||||
bad_X_test = X_test[:, 1:]
|
||||
bad_shape_error_msg = "The number of features in data*"
|
||||
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
|
||||
bst.predict, bad_X_test)
|
||||
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
|
||||
bst.predict, sparse.csr_matrix(bad_X_test))
|
||||
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
|
||||
bst.predict, sparse.csc_matrix(bad_X_test))
|
||||
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, bad_X_test)
|
||||
np.testing.assert_raises_regex(
|
||||
lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, sparse.csr_matrix(bad_X_test)
|
||||
)
|
||||
np.testing.assert_raises_regex(
|
||||
lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, sparse.csc_matrix(bad_X_test)
|
||||
)
|
||||
with open(tname, "w+b") as f:
|
||||
dump_svmlight_file(bad_X_test, y_test, f)
|
||||
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
|
||||
bst.predict, tname)
|
||||
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, tname)
|
||||
with open(tname, "w+b") as f:
|
||||
dump_svmlight_file(X_test, y_test, f, zero_based=False)
|
||||
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
|
||||
bst.predict, tname)
|
||||
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, tname)
|
||||
|
||||
|
||||
class NumpySequence(lgb.Sequence):
|
||||
|
@@ -108,7 +108,7 @@ class NumpySequence(lgb.Sequence):
|
|||
elif isinstance(idx, slice):
|
||||
if not (idx.step is None or idx.step == 1):
|
||||
raise NotImplementedError("No need to implement, caller will not set step by now")
|
||||
return self.ndarray[idx.start:idx.stop]
|
||||
return self.ndarray[idx.start : idx.stop]
|
||||
elif isinstance(idx, list):
|
||||
return self.ndarray[idx]
|
||||
else:
|
||||
|
@@ -132,12 +132,12 @@ def _create_sequence_from_ndarray(data, num_seq, batch_size):
|
|||
return seqs
|
||||
|
||||
|
||||
@pytest.mark.parametrize('sample_count', [11, 100, None])
|
||||
@pytest.mark.parametrize('batch_size', [3, None])
|
||||
@pytest.mark.parametrize('include_0_and_nan', [False, True])
|
||||
@pytest.mark.parametrize('num_seq', [1, 3])
|
||||
@pytest.mark.parametrize("sample_count", [11, 100, None])
|
||||
@pytest.mark.parametrize("batch_size", [3, None])
|
||||
@pytest.mark.parametrize("include_0_and_nan", [False, True])
|
||||
@pytest.mark.parametrize("num_seq", [1, 3])
|
||||
def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq):
|
||||
params = {'bin_construct_sample_cnt': sample_count}
|
||||
params = {"bin_construct_sample_cnt": sample_count}
|
||||
|
||||
nrow = 50
|
||||
half_nrow = nrow // 2
|
||||
|
@@ -159,8 +159,8 @@ def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq):
|
|||
X = data[:, :-1]
|
||||
Y = data[:, -1]
|
||||
|
||||
npy_bin_fname = tmpdir / 'data_from_npy.bin'
|
||||
seq_bin_fname = tmpdir / 'data_from_seq.bin'
|
||||
npy_bin_fname = tmpdir / "data_from_npy.bin"
|
||||
seq_bin_fname = tmpdir / "data_from_seq.bin"
|
||||
|
||||
# Create dataset from numpy array directly.
|
||||
ds = lgb.Dataset(X, label=Y, params=params)
|
||||
|
@@ -181,9 +181,9 @@ def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq):
|
|||
valid_X = valid_data[:, :-1]
|
||||
valid_Y = valid_data[:, -1]
|
||||
|
||||
valid_npy_bin_fname = tmpdir / 'valid_data_from_npy.bin'
|
||||
valid_seq_bin_fname = tmpdir / 'valid_data_from_seq.bin'
|
||||
valid_seq2_bin_fname = tmpdir / 'valid_data_from_seq2.bin'
|
||||
valid_npy_bin_fname = tmpdir / "valid_data_from_npy.bin"
|
||||
valid_seq_bin_fname = tmpdir / "valid_data_from_seq.bin"
|
||||
valid_seq2_bin_fname = tmpdir / "valid_data_from_seq2.bin"
|
||||
|
||||
valid_ds = lgb.Dataset(valid_X, label=valid_Y, params=params, reference=ds)
|
||||
valid_ds.save_binary(valid_npy_bin_fname)
|
||||
|
@@ -200,7 +200,7 @@ def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq):
|
|||
assert filecmp.cmp(valid_npy_bin_fname, valid_seq2_bin_fname)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('num_seq', [1, 2])
|
||||
@pytest.mark.parametrize("num_seq", [1, 2])
|
||||
def test_sequence_get_data(num_seq):
|
||||
nrow = 20
|
||||
ncol = 11
|
||||
|
@@ -218,12 +218,13 @@ def test_sequence_get_data(num_seq):
|
|||
|
||||
|
||||
def test_chunked_dataset():
|
||||
X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1,
|
||||
random_state=2)
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2
|
||||
)
|
||||
|
||||
chunk_size = X_train.shape[0] // 10 + 1
|
||||
X_train = [X_train[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
|
||||
X_test = [X_test[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]
|
||||
X_train = [X_train[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
|
||||
X_test = [X_test[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]
|
||||
|
||||
train_data = lgb.Dataset(X_train, label=y_train, params={"bin_construct_sample_cnt": 100})
|
||||
valid_data = train_data.create_valid(X_test, label=y_test, params={"bin_construct_sample_cnt": 100})
|
||||
|
@@ -232,12 +233,13 @@ def test_chunked_dataset():
|
|||
|
||||
|
||||
def test_chunked_dataset_linear():
|
||||
X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1,
|
||||
random_state=2)
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2
|
||||
)
|
||||
chunk_size = X_train.shape[0] // 10 + 1
|
||||
X_train = [X_train[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
|
||||
X_test = [X_test[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]
|
||||
params = {"bin_construct_sample_cnt": 100, 'linear_tree': True}
|
||||
X_train = [X_train[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
|
||||
X_test = [X_test[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]
|
||||
params = {"bin_construct_sample_cnt": 100, "linear_tree": True}
|
||||
train_data = lgb.Dataset(X_train, label=y_train, params=params)
|
||||
valid_data = train_data.create_valid(X_test, label=y_test, params=params)
|
||||
train_data.construct()
|
||||
|
@@ -246,16 +248,16 @@ def test_chunked_dataset_linear():
|
|||
|
||||
def test_save_dataset_subset_and_load_from_file(tmp_path):
|
||||
data = np.random.rand(100, 2)
|
||||
params = {'max_bin': 50, 'min_data_in_bin': 10}
|
||||
params = {"max_bin": 50, "min_data_in_bin": 10}
|
||||
ds = lgb.Dataset(data, params=params)
|
||||
ds.subset([1, 2, 3, 5, 8]).save_binary(tmp_path / 'subset.bin')
|
||||
lgb.Dataset(tmp_path / 'subset.bin', params=params).construct()
|
||||
ds.subset([1, 2, 3, 5, 8]).save_binary(tmp_path / "subset.bin")
|
||||
lgb.Dataset(tmp_path / "subset.bin", params=params).construct()
|
||||
|
||||
|
||||
def test_subset_group():
|
||||
rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank'
|
||||
X_train, y_train = load_svmlight_file(str(rank_example_dir / 'rank.train'))
|
||||
q_train = np.loadtxt(str(rank_example_dir / 'rank.train.query'))
|
||||
rank_example_dir = Path(__file__).absolute().parents[2] / "examples" / "lambdarank"
|
||||
X_train, y_train = load_svmlight_file(str(rank_example_dir / "rank.train"))
|
||||
q_train = np.loadtxt(str(rank_example_dir / "rank.train.query"))
|
||||
lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
|
||||
assert len(lgb_train.get_group()) == 201
|
||||
subset = lgb_train.subset(list(range(10))).construct()
|
||||
|
@@ -294,7 +296,7 @@ def test_add_features_throws_if_datasets_unconstructed():
|
|||
def test_add_features_equal_data_on_alternating_used_unused(tmp_path):
|
||||
X = np.random.random((100, 5))
|
||||
X[:, [1, 3]] = 0
|
||||
names = [f'col_{i}' for i in range(5)]
|
||||
names = [f"col_{i}" for i in range(5)]
|
||||
for j in range(1, 5):
|
||||
d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct()
|
||||
d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()
|
||||
|
@@ -304,9 +306,9 @@ def test_add_features_equal_data_on_alternating_used_unused(tmp_path):
|
|||
d = lgb.Dataset(X, feature_name=names).construct()
|
||||
dname = tmp_path / "d.txt"
|
||||
d._dump_text(dname)
|
||||
with open(d1name, 'rt') as d1f:
|
||||
with open(d1name, "rt") as d1f:
|
||||
d1txt = d1f.read()
|
||||
with open(dname, 'rt') as df:
|
||||
with open(dname, "rt") as df:
|
||||
dtxt = df.read()
|
||||
assert dtxt == d1txt
|
||||
|
||||
|
@@ -314,7 +316,7 @@ def test_add_features_equal_data_on_alternating_used_unused(tmp_path):
|
|||
def test_add_features_same_booster_behaviour(tmp_path):
|
||||
X = np.random.random((100, 5))
|
||||
X[:, [1, 3]] = 0
|
||||
names = [f'col_{i}' for i in range(5)]
|
||||
names = [f"col_{i}" for i in range(5)]
|
||||
for j in range(1, 5):
|
||||
d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct()
|
||||
d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()
|
||||
|
@@ -332,9 +334,9 @@ def test_add_features_same_booster_behaviour(tmp_path):
|
|||
d1name = tmp_path / "d1.txt"
|
||||
b1.save_model(d1name)
|
||||
b.save_model(dname)
|
||||
with open(dname, 'rt') as df:
|
||||
with open(dname, "rt") as df:
|
||||
dtxt = df.read()
|
||||
with open(d1name, 'rt') as d1f:
|
||||
with open(d1name, "rt") as d1f:
|
||||
d1txt = d1f.read()
|
||||
assert dtxt == d1txt
|
||||
|
||||
|
@@ -345,11 +347,12 @@ def test_add_features_from_different_sources():
|
|||
n_col = 5
|
||||
X = np.random.random((n_row, n_col))
|
||||
xxs = [X, sparse.csr_matrix(X), pd.DataFrame(X)]
|
||||
names = [f'col_{i}' for i in range(n_col)]
|
||||
names = [f"col_{i}" for i in range(n_col)]
|
||||
seq = _create_sequence_from_ndarray(X, 1, 30)
|
||||
seq_ds = lgb.Dataset(seq, feature_name=names, free_raw_data=False).construct()
|
||||
npy_list_ds = lgb.Dataset([X[:n_row // 2, :], X[n_row // 2:, :]],
|
||||
feature_name=names, free_raw_data=False).construct()
|
||||
npy_list_ds = lgb.Dataset(
|
||||
[X[: n_row // 2, :], X[n_row // 2 :, :]], feature_name=names, free_raw_data=False
|
||||
).construct()
|
||||
immergeable_dds = [seq_ds, npy_list_ds]
|
||||
for x_1 in xxs:
|
||||
# test that method works even with free_raw_data=True
|
||||
|
@@ -373,20 +376,19 @@ def test_add_features_from_different_sources():
|
|||
d1.add_features_from(d2)
|
||||
assert isinstance(d1.get_data(), original_type)
|
||||
assert d1.get_data().shape == (n_row, n_col * idx)
|
||||
res_feature_names += [f'D{idx}_{name}' for name in names]
|
||||
res_feature_names += [f"D{idx}_{name}" for name in names]
|
||||
assert d1.feature_name == res_feature_names
|
||||
|
||||
|
||||
def test_add_features_does_not_fail_if_initial_dataset_has_zero_informative_features(capsys):
|
||||
|
||||
arr_a = np.zeros((100, 1), dtype=np.float32)
|
||||
arr_b = np.random.normal(size=(100, 5))
|
||||
|
||||
dataset_a = lgb.Dataset(arr_a).construct()
|
||||
expected_msg = (
|
||||
'[LightGBM] [Warning] There are no meaningful features which satisfy '
|
||||
'the provided configuration. Decreasing Dataset parameters min_data_in_bin '
|
||||
'or min_data_in_leaf and re-constructing Dataset might resolve this warning.\n'
|
||||
"[LightGBM] [Warning] There are no meaningful features which satisfy "
|
||||
"the provided configuration. Decreasing Dataset parameters min_data_in_bin "
|
||||
"or min_data_in_leaf and re-constructing Dataset might resolve this warning.\n"
|
||||
)
|
||||
log_lines = capsys.readouterr().out
|
||||
assert expected_msg in log_lines
|
||||
|
@@ -404,7 +406,7 @@ def test_cegb_affects_behavior(tmp_path):
|
|||
X = np.random.random((100, 5))
|
||||
X[:, [1, 3]] = 0
|
||||
y = np.random.random(100)
|
||||
names = [f'col_{i}' for i in range(5)]
|
||||
names = [f"col_{i}" for i in range(5)]
|
||||
ds = lgb.Dataset(X, feature_name=names).construct()
|
||||
ds.set_label(y)
|
||||
base = lgb.Booster(train_set=ds)
|
||||
|
@@ -412,19 +414,21 @@ def test_cegb_affects_behavior(tmp_path):
|
|||
base.update()
|
||||
basename = tmp_path / "basename.txt"
|
||||
base.save_model(basename)
|
||||
with open(basename, 'rt') as f:
|
||||
with open(basename, "rt") as f:
|
||||
basetxt = f.read()
|
||||
# Set extremely harsh penalties, so CEGB will block most splits.
|
||||
cases = [{'cegb_penalty_feature_coupled': [50, 100, 10, 25, 30]},
|
||||
{'cegb_penalty_feature_lazy': [1, 2, 3, 4, 5]},
|
||||
{'cegb_penalty_split': 1}]
|
||||
cases = [
|
||||
{"cegb_penalty_feature_coupled": [50, 100, 10, 25, 30]},
|
||||
{"cegb_penalty_feature_lazy": [1, 2, 3, 4, 5]},
|
||||
{"cegb_penalty_split": 1},
|
||||
]
|
||||
for case in cases:
|
||||
booster = lgb.Booster(train_set=ds, params=case)
|
||||
for _ in range(10):
|
||||
booster.update()
|
||||
casename = tmp_path / "casename.txt"
|
||||
booster.save_model(casename)
|
||||
with open(casename, 'rt') as f:
|
||||
with open(casename, "rt") as f:
|
||||
casetxt = f.read()
|
||||
assert basetxt != casetxt
|
||||
|
||||
|
@@ -433,17 +437,22 @@ def test_cegb_scaling_equalities(tmp_path):
|
|||
X = np.random.random((100, 5))
|
||||
X[:, [1, 3]] = 0
|
||||
y = np.random.random(100)
|
||||
names = [f'col_{i}' for i in range(5)]
|
||||
names = [f"col_{i}" for i in range(5)]
|
||||
ds = lgb.Dataset(X, feature_name=names).construct()
|
||||
ds.set_label(y)
|
||||
# Compare pairs of penalties, to ensure scaling works as intended
|
||||
pairs = [({'cegb_penalty_feature_coupled': [1, 2, 1, 2, 1]},
|
||||
{'cegb_penalty_feature_coupled': [0.5, 1, 0.5, 1, 0.5], 'cegb_tradeoff': 2}),
|
||||
({'cegb_penalty_feature_lazy': [0.01, 0.02, 0.03, 0.04, 0.05]},
|
||||
{'cegb_penalty_feature_lazy': [0.005, 0.01, 0.015, 0.02, 0.025], 'cegb_tradeoff': 2}),
|
||||
({'cegb_penalty_split': 1},
|
||||
{'cegb_penalty_split': 2, 'cegb_tradeoff': 0.5})]
|
||||
for (p1, p2) in pairs:
|
||||
pairs = [
|
||||
(
|
||||
{"cegb_penalty_feature_coupled": [1, 2, 1, 2, 1]},
|
||||
{"cegb_penalty_feature_coupled": [0.5, 1, 0.5, 1, 0.5], "cegb_tradeoff": 2},
|
||||
),
|
||||
(
|
||||
{"cegb_penalty_feature_lazy": [0.01, 0.02, 0.03, 0.04, 0.05]},
|
||||
{"cegb_penalty_feature_lazy": [0.005, 0.01, 0.015, 0.02, 0.025], "cegb_tradeoff": 2},
|
||||
),
|
||||
({"cegb_penalty_split": 1}, {"cegb_penalty_split": 2, "cegb_tradeoff": 0.5}),
|
||||
]
|
||||
for p1, p2 in pairs:
|
||||
booster1 = lgb.Booster(train_set=ds, params=p1)
|
||||
booster2 = lgb.Booster(train_set=ds, params=p2)
|
||||
for _ in range(10):
|
||||
|
@@ -453,32 +462,30 @@ def test_cegb_scaling_equalities(tmp_path):
|
|||
# Reset booster1's parameters to p2, so the parameter section of the file matches.
|
||||
booster1.reset_parameter(p2)
|
||||
booster1.save_model(p1name)
|
||||
with open(p1name, 'rt') as f:
|
||||
with open(p1name, "rt") as f:
|
||||
p1txt = f.read()
|
||||
p2name = tmp_path / "p2.txt"
|
||||
booster2.save_model(p2name)
|
||||
with open(p2name, 'rt') as f:
|
||||
with open(p2name, "rt") as f:
|
||||
p2txt = f.read()
|
||||
assert p1txt == p2txt
|
||||
|
||||
|
||||
def test_consistent_state_for_dataset_fields():
|
||||
|
||||
def check_asserts(data):
|
||||
np.testing.assert_allclose(data.label, data.get_label())
|
||||
np.testing.assert_allclose(data.label, data.get_field('label'))
|
||||
np.testing.assert_allclose(data.label, data.get_field("label"))
|
||||
assert not np.isnan(data.label[0])
|
||||
assert not np.isinf(data.label[1])
|
||||
np.testing.assert_allclose(data.weight, data.get_weight())
|
||||
np.testing.assert_allclose(data.weight, data.get_field('weight'))
|
||||
np.testing.assert_allclose(data.weight, data.get_field("weight"))
|
||||
assert not np.isnan(data.weight[0])
|
||||
assert not np.isinf(data.weight[1])
|
||||
np.testing.assert_allclose(data.init_score, data.get_init_score())
|
||||
np.testing.assert_allclose(data.init_score, data.get_field('init_score'))
|
||||
np.testing.assert_allclose(data.init_score, data.get_field("init_score"))
|
||||
assert not np.isnan(data.init_score[0])
|
||||
assert not np.isinf(data.init_score[1])
|
||||
assert np.all(np.isclose([data.label[0], data.weight[0], data.init_score[0]],
|
||||
data.label[0]))
|
||||
assert np.all(np.isclose([data.label[0], data.weight[0], data.init_score[0]], data.label[0]))
|
||||
assert data.label[1] == pytest.approx(data.weight[1])
|
||||
assert data.feature_name == data.get_feature_name()
|
||||
|
||||
|
@@ -486,10 +493,8 @@ def test_consistent_state_for_dataset_fields():
|
|||
sequence = np.ones(y.shape[0])
|
||||
sequence[0] = np.nan
|
||||
sequence[1] = np.inf
|
||||
feature_names = [f'f{i}'for i in range(X.shape[1])]
|
||||
lgb_data = lgb.Dataset(X, sequence,
|
||||
weight=sequence, init_score=sequence,
|
||||
feature_name=feature_names).construct()
|
||||
feature_names = [f"f{i}" for i in range(X.shape[1])]
|
||||
lgb_data = lgb.Dataset(X, sequence, weight=sequence, init_score=sequence, feature_name=feature_names).construct()
|
||||
check_asserts(lgb_data)
|
||||
lgb_data = lgb.Dataset(X, y).construct()
|
||||
lgb_data.set_label(sequence)
|
||||
|
@@ -500,20 +505,15 @@ def test_consistent_state_for_dataset_fields():
|
|||
|
||||
|
||||
def test_dataset_construction_overwrites_user_provided_metadata_fields():
|
||||
|
||||
X = np.array([[1.0, 2.0], [3.0, 4.0]])
|
||||
|
||||
position = np.array([0.0, 1.0], dtype=np.float32)
|
||||
if getenv('TASK', '') == 'cuda':
|
||||
if getenv("TASK", "") == "cuda":
|
||||
position = None
|
||||
|
||||
dtrain = lgb.Dataset(
|
||||
X,
|
||||
params={
|
||||
"min_data_in_bin": 1,
|
||||
"min_data_in_leaf": 1,
|
||||
"verbosity": -1
|
||||
},
|
||||
params={"min_data_in_bin": 1, "min_data_in_leaf": 1, "verbosity": -1},
|
||||
group=[1, 1],
|
||||
init_score=[0.312, 0.708],
|
||||
label=[1, 2],
|
||||
|
@@ -528,17 +528,9 @@ def test_dataset_construction_overwrites_user_provided_metadata_fields():
|
|||
assert dtrain.get_init_score() == [0.312, 0.708]
|
||||
assert dtrain.label == [1, 2]
|
||||
assert dtrain.get_label() == [1, 2]
|
||||
if getenv('TASK', '') != 'cuda':
|
||||
np_assert_array_equal(
|
||||
dtrain.position,
|
||||
np.array([0.0, 1.0], dtype=np.float32),
|
||||
strict=True
|
||||
)
|
||||
np_assert_array_equal(
|
||||
dtrain.get_position(),
|
||||
np.array([0.0, 1.0], dtype=np.float32),
|
||||
strict=True
|
||||
)
|
||||
if getenv("TASK", "") != "cuda":
|
||||
np_assert_array_equal(dtrain.position, np.array([0.0, 1.0], dtype=np.float32), strict=True)
|
||||
np_assert_array_equal(dtrain.get_position(), np.array([0.0, 1.0], dtype=np.float32), strict=True)
|
||||
assert dtrain.weight == [0.5, 1.5]
|
||||
assert dtrain.get_weight() == [0.5, 1.5]
|
||||
|
||||
|
@@ -554,13 +546,11 @@ def test_dataset_construction_overwrites_user_provided_metadata_fields():
|
|||
np_assert_array_equal(dtrain.group, expected_group, strict=True)
|
||||
np_assert_array_equal(dtrain.get_group(), expected_group, strict=True)
|
||||
# get_field("group") returns a numpy array with boundaries, instead of size
|
||||
np_assert_array_equal(
|
||||
dtrain.get_field("group"),
|
||||
np.array([0, 1, 2], dtype=np.int32),
|
||||
strict=True
|
||||
)
|
||||
np_assert_array_equal(dtrain.get_field("group"), np.array([0, 1, 2], dtype=np.int32), strict=True)
|
||||
|
||||
expected_init_score = np.array([0.312, 0.708],)
|
||||
expected_init_score = np.array(
|
||||
[0.312, 0.708],
|
||||
)
|
||||
np_assert_array_equal(dtrain.init_score, expected_init_score, strict=True)
|
||||
np_assert_array_equal(dtrain.get_init_score(), expected_init_score, strict=True)
|
||||
np_assert_array_equal(dtrain.get_field("init_score"), expected_init_score, strict=True)
|
||||
|
@@ -570,16 +560,12 @@ def test_dataset_construction_overwrites_user_provided_metadata_fields():
|
|||
np_assert_array_equal(dtrain.get_label(), expected_label, strict=True)
|
||||
np_assert_array_equal(dtrain.get_field("label"), expected_label, strict=True)
|
||||
|
||||
if getenv('TASK', '') != 'cuda':
|
||||
if getenv("TASK", "") != "cuda":
|
||||
expected_position = np.array([0.0, 1.0], dtype=np.float32)
|
||||
np_assert_array_equal(dtrain.position, expected_position, strict=True)
|
||||
np_assert_array_equal(dtrain.get_position(), expected_position, strict=True)
|
||||
# NOTE: "position" is converted to int32 on the C++ side
|
||||
np_assert_array_equal(
|
||||
dtrain.get_field("position"),
|
||||
np.array([0.0, 1.0], dtype=np.int32),
|
||||
strict=True
|
||||
)
|
||||
np_assert_array_equal(dtrain.get_field("position"), np.array([0.0, 1.0], dtype=np.int32), strict=True)
|
||||
|
||||
expected_weight = np.array([0.5, 1.5], dtype=np.float32)
|
||||
np_assert_array_equal(dtrain.weight, expected_weight, strict=True)
|
||||
|
@@ -588,7 +574,6 @@ def test_dataset_construction_overwrites_user_provided_metadata_fields():
|
|||
|
||||
|
||||
def test_choose_param_value():
|
||||
|
||||
original_params = {
|
||||
"local_listen_port": 1234,
|
||||
"port": 2222,
|
||||
|
@@ -599,30 +584,20 @@ def test_choose_param_value():
|
|||
|
||||
# should resolve duplicate aliases, and prefer the main parameter
|
||||
params = lgb.basic._choose_param_value(
|
||||
main_param_name="local_listen_port",
|
||||
params=original_params,
|
||||
default_value=5555
|
||||
main_param_name="local_listen_port", params=original_params, default_value=5555
|
||||
)
|
||||
assert params["local_listen_port"] == 1234
|
||||
assert "port" not in params
|
||||
|
||||
# should choose the highest priority alias and set that value on main param
|
||||
# if only aliases are used
|
||||
params = lgb.basic._choose_param_value(
|
||||
main_param_name="num_iterations",
|
||||
params=params,
|
||||
default_value=17
|
||||
)
|
||||
params = lgb.basic._choose_param_value(main_param_name="num_iterations", params=params, default_value=17)
|
||||
assert params["num_iterations"] == 13
|
||||
assert "num_trees" not in params
|
||||
assert "n_iter" not in params
|
||||
|
||||
# should use the default if main param and aliases are missing
|
||||
params = lgb.basic._choose_param_value(
|
||||
main_param_name="learning_rate",
|
||||
params=params,
|
||||
default_value=0.789
|
||||
)
|
||||
params = lgb.basic._choose_param_value(main_param_name="learning_rate", params=params, default_value=0.789)
|
||||
assert params["learning_rate"] == 0.789
|
||||
|
||||
# all changes should be made on copies and not modify the original
|
||||
|
@@ -637,37 +612,23 @@ def test_choose_param_value():
|
|||
|
||||
|
||||
def test_choose_param_value_preserves_nones():
|
||||
|
||||
# preserves None found for main param and still removes aliases
|
||||
params = lgb.basic._choose_param_value(
|
||||
main_param_name="num_threads",
|
||||
params={
|
||||
"num_threads": None,
|
||||
"n_jobs": 4,
|
||||
"objective": "regression"
|
||||
},
|
||||
default_value=2
|
||||
params={"num_threads": None, "n_jobs": 4, "objective": "regression"},
|
||||
default_value=2,
|
||||
)
|
||||
assert params == {"num_threads": None, "objective": "regression"}
|
||||
|
||||
# correctly chooses value when only an alias is provided
|
||||
params = lgb.basic._choose_param_value(
|
||||
main_param_name="num_threads",
|
||||
params={
|
||||
"n_jobs": None,
|
||||
"objective": "regression"
|
||||
},
|
||||
default_value=2
|
||||
main_param_name="num_threads", params={"n_jobs": None, "objective": "regression"}, default_value=2
|
||||
)
|
||||
assert params == {"num_threads": None, "objective": "regression"}
|
||||
|
||||
# adds None if that's given as the default and param not found
|
||||
params = lgb.basic._choose_param_value(
|
||||
main_param_name="min_data_in_leaf",
|
||||
params={
|
||||
"objective": "regression"
|
||||
},
|
||||
default_value=None
|
||||
main_param_name="min_data_in_leaf", params={"objective": "regression"}, default_value=None
|
||||
)
|
||||
assert params == {"objective": "regression", "min_data_in_leaf": None}
|
||||
|
||||
|
@@ -676,51 +637,39 @@ def test_choose_param_value_preserves_nones():
|
|||
def test_choose_param_value_objective(objective_alias):
|
||||
# If callable is found in objective
|
||||
params = {objective_alias: dummy_obj}
|
||||
params = lgb.basic._choose_param_value(
|
||||
main_param_name="objective",
|
||||
params=params,
|
||||
default_value=None
|
||||
)
|
||||
assert params['objective'] == dummy_obj
|
||||
params = lgb.basic._choose_param_value(main_param_name="objective", params=params, default_value=None)
|
||||
assert params["objective"] == dummy_obj
|
||||
|
||||
# Value in params should be preferred to the default_value passed from keyword arguments
|
||||
params = {objective_alias: dummy_obj}
|
||||
params = lgb.basic._choose_param_value(
|
||||
main_param_name="objective",
|
||||
params=params,
|
||||
default_value=mse_obj
|
||||
)
|
||||
assert params['objective'] == dummy_obj
|
||||
params = lgb.basic._choose_param_value(main_param_name="objective", params=params, default_value=mse_obj)
|
||||
assert params["objective"] == dummy_obj
|
||||
|
||||
# None of objective or its aliases in params, but default_value is callable.
|
||||
params = {}
|
||||
params = lgb.basic._choose_param_value(
|
||||
main_param_name="objective",
|
||||
params=params,
|
||||
default_value=mse_obj
|
||||
)
|
||||
assert params['objective'] == mse_obj
|
||||
params = lgb.basic._choose_param_value(main_param_name="objective", params=params, default_value=mse_obj)
|
||||
assert params["objective"] == mse_obj
|
||||
|
||||
|
||||
@pytest.mark.parametrize('collection', ['1d_np', '2d_np', 'pd_float', 'pd_str', '1d_list', '2d_list'])
|
||||
@pytest.mark.parametrize('dtype', [np.float32, np.float64])
|
||||
@pytest.mark.parametrize("collection", ["1d_np", "2d_np", "pd_float", "pd_str", "1d_list", "2d_list"])
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
|
||||
def test_list_to_1d_numpy(collection, dtype):
|
||||
collection2y = {
|
||||
'1d_np': np.random.rand(10),
|
||||
'2d_np': np.random.rand(10, 1),
|
||||
'pd_float': np.random.rand(10),
|
||||
'pd_str': ['a', 'b'],
|
||||
'1d_list': [1] * 10,
|
||||
'2d_list': [[1], [2]],
|
||||
"1d_np": np.random.rand(10),
|
||||
"2d_np": np.random.rand(10, 1),
|
||||
"pd_float": np.random.rand(10),
|
||||
"pd_str": ["a", "b"],
|
||||
"1d_list": [1] * 10,
|
||||
"2d_list": [[1], [2]],
|
||||
}
|
||||
y = collection2y[collection]
|
||||
if collection.startswith('pd'):
|
||||
if collection.startswith("pd"):
|
||||
if not PANDAS_INSTALLED:
|
||||
pytest.skip('pandas is not installed')
|
||||
pytest.skip("pandas is not installed")
|
||||
else:
|
||||
y = pd_Series(y)
|
||||
if isinstance(y, np.ndarray) and len(y.shape) == 2:
|
||||
with pytest.warns(UserWarning, match='column-vector'):
|
||||
with pytest.warns(UserWarning, match="column-vector"):
|
||||
lgb.basic._list_to_1d_numpy(y, dtype=np.float32, name="list")
|
||||
return
|
||||
elif isinstance(y, list) and isinstance(y[0], list):
|
||||
|
@@ -736,30 +685,31 @@ def test_list_to_1d_numpy(collection, dtype):
|
|||
assert result.dtype == dtype
|
||||
|
||||
|
||||
@pytest.mark.parametrize('init_score_type', ['array', 'dataframe', 'list'])
|
||||
@pytest.mark.parametrize("init_score_type", ["array", "dataframe", "list"])
|
||||
def test_init_score_for_multiclass_classification(init_score_type):
|
||||
init_score = [[i * 10 + j for j in range(3)] for i in range(10)]
|
||||
if init_score_type == 'array':
|
||||
if init_score_type == "array":
|
||||
init_score = np.array(init_score)
|
||||
elif init_score_type == 'dataframe':
|
||||
elif init_score_type == "dataframe":
|
||||
if not PANDAS_INSTALLED:
|
||||
pytest.skip('Pandas is not installed.')
|
||||
pytest.skip("Pandas is not installed.")
|
||||
init_score = pd_DataFrame(init_score)
|
||||
data = np.random.rand(10, 2)
|
||||
ds = lgb.Dataset(data, init_score=init_score).construct()
|
||||
np.testing.assert_equal(ds.get_field('init_score'), init_score)
|
||||
np.testing.assert_equal(ds.get_field("init_score"), init_score)
|
||||
np.testing.assert_equal(ds.init_score, init_score)
|
||||
|
||||
|
||||
def test_smoke_custom_parser(tmp_path):
|
||||
data_path = Path(__file__).absolute().parents[2] / 'examples' / 'binary_classification' / 'binary.train'
|
||||
parser_config_file = tmp_path / 'parser.ini'
|
||||
with open(parser_config_file, 'w') as fout:
|
||||
data_path = Path(__file__).absolute().parents[2] / "examples" / "binary_classification" / "binary.train"
|
||||
parser_config_file = tmp_path / "parser.ini"
|
||||
with open(parser_config_file, "w") as fout:
|
||||
fout.write('{"className": "dummy", "id": "1"}')
|
||||
|
||||
data = lgb.Dataset(data_path, params={"parser_config_file": parser_config_file})
|
||||
with pytest.raises(lgb.basic.LightGBMError,
|
||||
match="Cannot find parser class 'dummy', please register first or check config format"):
|
||||
with pytest.raises(
|
||||
lgb.basic.LightGBMError, match="Cannot find parser class 'dummy', please register first or check config format"
|
||||
):
|
||||
data.construct()
|
||||
|
||||
|
||||
|
@@ -770,9 +720,13 @@ def test_param_aliases():
|
|||
assert all(isinstance(i, list) for i in aliases.values())
|
||||
assert all(len(i) >= 1 for i in aliases.values())
|
||||
assert all(k in v for k, v in aliases.items())
|
||||
assert lgb.basic._ConfigAliases.get('config', 'task') == {'config', 'config_file', 'task', 'task_type'}
|
||||
assert lgb.basic._ConfigAliases.get_sorted('min_data_in_leaf') == [
|
||||
'min_data_in_leaf', 'min_data', 'min_samples_leaf', 'min_child_samples', 'min_data_per_leaf'
|
||||
assert lgb.basic._ConfigAliases.get("config", "task") == {"config", "config_file", "task", "task_type"}
|
||||
assert lgb.basic._ConfigAliases.get_sorted("min_data_in_leaf") == [
|
||||
"min_data_in_leaf",
|
||||
"min_data",
|
||||
"min_samples_leaf",
|
||||
"min_child_samples",
|
||||
"min_data_per_leaf",
|
||||
]
|
||||
|
||||
|
||||
|
@ -793,10 +747,10 @@ def test_custom_objective_safety():
|
|||
y_multiclass = np.arange(nrows) % nclass
|
||||
ds_binary = lgb.Dataset(X, y_binary).construct()
|
||||
ds_multiclass = lgb.Dataset(X, y_multiclass).construct()
|
||||
bad_bst_binary = lgb.Booster({'objective': "none"}, ds_binary)
|
||||
good_bst_binary = lgb.Booster({'objective': "none"}, ds_binary)
|
||||
bad_bst_multi = lgb.Booster({'objective': "none", "num_class": nclass}, ds_multiclass)
|
||||
good_bst_multi = lgb.Booster({'objective': "none", "num_class": nclass}, ds_multiclass)
|
||||
bad_bst_binary = lgb.Booster({"objective": "none"}, ds_binary)
|
||||
good_bst_binary = lgb.Booster({"objective": "none"}, ds_binary)
|
||||
bad_bst_multi = lgb.Booster({"objective": "none", "num_class": nclass}, ds_multiclass)
|
||||
good_bst_multi = lgb.Booster({"objective": "none", "num_class": nclass}, ds_multiclass)
|
||||
good_bst_binary.update(fobj=_good_gradients)
|
||||
with pytest.raises(ValueError, match=re.escape("number of models per one iteration (1)")):
|
||||
bad_bst_binary.update(fobj=_bad_gradients)
|
||||
|
@@ -805,33 +759,30 @@ def test_custom_objective_safety():
|
|||
bad_bst_multi.update(fobj=_bad_gradients)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('dtype', [np.float32, np.float64])
|
||||
@pytest.mark.parametrize('feature_name', [['x1', 'x2'], 'auto'])
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
|
||||
@pytest.mark.parametrize("feature_name", [["x1", "x2"], "auto"])
|
||||
def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name):
|
||||
pd = pytest.importorskip('pandas')
|
||||
pd = pytest.importorskip("pandas")
|
||||
X = np.random.rand(10, 2).astype(dtype)
|
||||
df = pd.DataFrame(X)
|
||||
built_data = lgb.basic._data_from_pandas(
|
||||
data=df,
|
||||
feature_name=feature_name,
|
||||
categorical_feature="auto",
|
||||
pandas_categorical=None
|
||||
data=df, feature_name=feature_name, categorical_feature="auto", pandas_categorical=None
|
||||
)[0]
|
||||
assert built_data.dtype == dtype
|
||||
assert np.shares_memory(X, built_data)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('feature_name', [['x1'], [42], 'auto'])
|
||||
@pytest.mark.parametrize('categories', ['seen', 'unseen'])
|
||||
@pytest.mark.parametrize("feature_name", [["x1"], [42], "auto"])
|
||||
@pytest.mark.parametrize("categories", ["seen", "unseen"])
|
||||
def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, categories):
|
||||
pd = pytest.importorskip('pandas')
|
||||
X = np.random.choice(['a', 'b'], 100).reshape(-1, 1)
|
||||
column_name = 'a' if feature_name == 'auto' else feature_name[0]
|
||||
df = pd.DataFrame(X.copy(), columns=[column_name], dtype='category')
|
||||
if categories == 'seen':
|
||||
pandas_categorical = [['a', 'b']]
|
||||
pd = pytest.importorskip("pandas")
|
||||
X = np.random.choice(["a", "b"], 100).reshape(-1, 1)
|
||||
column_name = "a" if feature_name == "auto" else feature_name[0]
|
||||
df = pd.DataFrame(X.copy(), columns=[column_name], dtype="category")
|
||||
if categories == "seen":
|
||||
pandas_categorical = [["a", "b"]]
|
||||
else:
|
||||
pandas_categorical = [['a']]
|
||||
pandas_categorical = [["a"]]
|
||||
data = lgb.basic._data_from_pandas(
|
||||
data=df,
|
||||
feature_name=feature_name,
|
||||
|
@@ -841,31 +792,33 @@ def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, c
|
|||
# check that the original data wasn't modified
|
||||
np.testing.assert_equal(df[column_name], X[:, 0])
|
||||
# check that the built data has the codes
|
||||
if categories == 'seen':
|
||||
if categories == "seen":
|
||||
# if all categories were seen during training we just take the codes
|
||||
codes = df[column_name].cat.codes
|
||||
else:
|
||||
# if we only saw 'a' during training we just replace its code
|
||||
# and leave the rest as nan
|
||||
a_code = df[column_name].cat.categories.get_loc('a')
|
||||
codes = np.where(df[column_name] == 'a', a_code, np.nan)
|
||||
a_code = df[column_name].cat.categories.get_loc("a")
|
||||
codes = np.where(df[column_name] == "a", a_code, np.nan)
|
||||
np.testing.assert_equal(codes, data[:, 0])
|
||||
|
||||
|
||||
@pytest.mark.parametrize('min_data_in_bin', [2, 10])
|
||||
@pytest.mark.parametrize("min_data_in_bin", [2, 10])
|
||||
def test_feature_num_bin(min_data_in_bin):
|
||||
X = np.vstack([
|
||||
np.random.rand(100),
|
||||
np.array([1, 2] * 50),
|
||||
np.array([0, 1, 2] * 33 + [0]),
|
||||
np.array([1, 2] * 49 + 2 * [np.nan]),
|
||||
np.zeros(100),
|
||||
np.random.choice([0, 1], 100),
|
||||
]).T
|
||||
X = np.vstack(
|
||||
[
|
||||
np.random.rand(100),
|
||||
np.array([1, 2] * 50),
|
||||
np.array([0, 1, 2] * 33 + [0]),
|
||||
np.array([1, 2] * 49 + 2 * [np.nan]),
|
||||
np.zeros(100),
|
||||
np.random.choice([0, 1], 100),
|
||||
]
|
||||
).T
|
||||
n_continuous = X.shape[1] - 1
|
||||
feature_name = [f'x{i}' for i in range(n_continuous)] + ['cat1']
|
||||
feature_name = [f"x{i}" for i in range(n_continuous)] + ["cat1"]
|
||||
ds_kwargs = {
|
||||
"params": {'min_data_in_bin': min_data_in_bin},
|
||||
"params": {"min_data_in_bin": min_data_in_bin},
|
||||
"categorical_feature": [n_continuous], # last feature
|
||||
}
|
||||
ds = lgb.Dataset(X, feature_name=feature_name, **ds_kwargs).construct()
|
||||
|
@@ -884,7 +837,7 @@ def test_feature_num_bin(min_data_in_bin):
|
|||
assert bins_by_name == expected_num_bins
|
||||
# test using default feature names
|
||||
ds_no_names = lgb.Dataset(X, **ds_kwargs).construct()
|
||||
default_names = [f'Column_{i}' for i in range(X.shape[1])]
|
||||
default_names = [f"Column_{i}" for i in range(X.shape[1])]
|
||||
bins_by_default_name = [ds_no_names.feature_num_bin(name) for name in default_names]
|
||||
assert bins_by_default_name == expected_num_bins
|
||||
# check for feature indices outside of range
|
||||
|
@@ -892,9 +845,9 @@ def test_feature_num_bin(min_data_in_bin):
|
|||
with pytest.raises(
|
||||
lgb.basic.LightGBMError,
|
||||
match=(
|
||||
f'Tried to retrieve number of bins for feature index {num_features}, '
|
||||
f'but the valid feature indices are \\[0, {num_features - 1}\\].'
|
||||
)
|
||||
f"Tried to retrieve number of bins for feature index {num_features}, "
|
||||
f"but the valid feature indices are \\[0, {num_features - 1}\\]."
|
||||
),
|
||||
):
|
||||
ds.feature_num_bin(num_features)
|
||||
|
||||
|
@@ -902,7 +855,7 @@ def test_feature_num_bin(min_data_in_bin):
|
|||
def test_feature_num_bin_with_max_bin_by_feature():
|
||||
X = np.random.rand(100, 3)
|
||||
max_bin_by_feature = np.random.randint(3, 30, size=X.shape[1])
|
||||
ds = lgb.Dataset(X, params={'max_bin_by_feature': max_bin_by_feature}).construct()
|
||||
ds = lgb.Dataset(X, params={"max_bin_by_feature": max_bin_by_feature}).construct()
|
||||
actual_num_bins = [ds.feature_num_bin(i) for i in range(X.shape[1])]
|
||||
np.testing.assert_equal(actual_num_bins, max_bin_by_feature)
|
||||
|
||||
|
@@ -910,7 +863,7 @@ def test_feature_num_bin_with_max_bin_by_feature():
|
|||
def test_set_leaf_output():
|
||||
X, y = load_breast_cancer(return_X_y=True)
|
||||
ds = lgb.Dataset(X, y)
|
||||
bst = lgb.Booster({'num_leaves': 2}, ds)
|
||||
bst = lgb.Booster({"num_leaves": 2}, ds)
|
||||
bst.update()
|
||||
y_pred = bst.predict(X)
|
||||
for leaf_id in range(2):
|
||||
|
|
|
@@ -10,7 +10,7 @@ def reset_feature_fraction(boosting_round):
|
|||
return 0.6 if boosting_round < 15 else 0.8
|
||||
|
||||
|
||||
@pytest.mark.parametrize('serializer', SERIALIZERS)
|
||||
@pytest.mark.parametrize("serializer", SERIALIZERS)
|
||||
def test_early_stopping_callback_is_picklable(serializer):
|
||||
rounds = 5
|
||||
callback = lgb.early_stopping(stopping_rounds=rounds)
|
||||
|
@@ -32,7 +32,7 @@ def test_early_stopping_callback_rejects_invalid_stopping_rounds_with_informativ
|
|||
lgb.early_stopping(stopping_rounds="neverrrr")
|
||||
|
||||
|
||||
@pytest.mark.parametrize('serializer', SERIALIZERS)
|
||||
@pytest.mark.parametrize("serializer", SERIALIZERS)
|
||||
def test_log_evaluation_callback_is_picklable(serializer):
|
||||
periods = 42
|
||||
callback = lgb.log_evaluation(period=periods)
|
||||
|
@@ -43,7 +43,7 @@ def test_log_evaluation_callback_is_picklable(serializer):
|
|||
assert callback.period == periods
|
||||
|
||||
|
||||
@pytest.mark.parametrize('serializer', SERIALIZERS)
|
||||
@pytest.mark.parametrize("serializer", SERIALIZERS)
|
||||
def test_record_evaluation_callback_is_picklable(serializer):
|
||||
results = {}
|
||||
callback = lgb.record_evaluation(eval_result=results)
|
||||
|
@@ -54,12 +54,9 @@ def test_record_evaluation_callback_is_picklable(serializer):
|
|||
assert callback.eval_result is results
|
||||
|
||||
|
||||
@pytest.mark.parametrize('serializer', SERIALIZERS)
|
||||
@pytest.mark.parametrize("serializer", SERIALIZERS)
|
||||
def test_reset_parameter_callback_is_picklable(serializer):
|
||||
params = {
|
||||
'bagging_fraction': [0.7] * 5 + [0.6] * 5,
|
||||
'feature_fraction': reset_feature_fraction
|
||||
}
|
||||
params = {"bagging_fraction": [0.7] * 5 + [0.6] * 5, "feature_fraction": reset_feature_fraction}
|
||||
callback = lgb.reset_parameter(**params)
|
||||
callback_from_disk = pickle_and_unpickle_object(obj=callback, serializer=serializer)
|
||||
assert callback_from_disk.order == 10
|
||||
|
|
|
@@ -6,22 +6,21 @@ from sklearn.datasets import load_svmlight_file
|
|||
|
||||
import lightgbm as lgb
|
||||
|
||||
EXAMPLES_DIR = Path(__file__).absolute().parents[2] / 'examples'
|
||||
EXAMPLES_DIR = Path(__file__).absolute().parents[2] / "examples"
|
||||
|
||||
|
||||
class FileLoader:
|
||||
|
||||
def __init__(self, directory, prefix, config_file='train.conf'):
|
||||
def __init__(self, directory, prefix, config_file="train.conf"):
|
||||
self.directory = directory
|
||||
self.prefix = prefix
|
||||
self.params = {'gpu_use_dp': True}
|
||||
with open(self.directory / config_file, 'r') as f:
|
||||
self.params = {"gpu_use_dp": True}
|
||||
with open(self.directory / config_file, "r") as f:
|
||||
for line in f.readlines():
|
||||
line = line.strip()
|
||||
if line and not line.startswith('#'):
|
||||
key, value = [token.strip() for token in line.split('=')]
|
||||
if 'early_stopping' not in key: # disable early_stopping
|
||||
self.params[key] = value if key not in {'num_trees', 'num_threads'} else int(value)
|
||||
if line and not line.startswith("#"):
|
||||
key, value = [token.strip() for token in line.split("=")]
|
||||
if "early_stopping" not in key: # disable early_stopping
|
||||
self.params[key] = value if key not in {"num_trees", "num_threads"} else int(value)
|
||||
|
||||
def load_dataset(self, suffix, is_sparse=False):
|
||||
filename = str(self.path(suffix))
|
||||
|
@@ -33,14 +32,14 @@ class FileLoader:
|
|||
return mat[:, 1:], mat[:, 0], filename
|
||||
|
||||
def load_field(self, suffix):
|
||||
return np.loadtxt(str(self.directory / f'{self.prefix}{suffix}'))
|
||||
return np.loadtxt(str(self.directory / f"{self.prefix}{suffix}"))
|
||||
|
||||
def load_cpp_result(self, result_file='LightGBM_predict_result.txt'):
|
||||
def load_cpp_result(self, result_file="LightGBM_predict_result.txt"):
|
||||
return np.loadtxt(str(self.directory / result_file))
|
||||
|
||||
def train_predict_check(self, lgb_train, X_test, X_test_fn, sk_pred):
|
||||
params = dict(self.params)
|
||||
params['force_row_wise'] = True
|
||||
params["force_row_wise"] = True
|
||||
gbm = lgb.train(params, lgb_train)
|
||||
y_pred = gbm.predict(X_test)
|
||||
cpp_pred = gbm.predict(X_test_fn)
|
||||
|
@@ -49,7 +48,7 @@ class FileLoader:
|
|||
|
||||
def file_load_check(self, lgb_train, name):
|
||||
lgb_train_f = lgb.Dataset(self.path(name), params=self.params).construct()
|
||||
for f in ('num_data', 'num_feature', 'get_label', 'get_weight', 'get_init_score', 'get_group'):
|
||||
for f in ("num_data", "num_feature", "get_label", "get_weight", "get_init_score", "get_group"):
|
||||
a = getattr(lgb_train, f)()
|
||||
b = getattr(lgb_train_f, f)()
|
||||
if a is None and b is None:
|
||||
|
@@ -62,83 +61,83 @@ class FileLoader:
|
|||
assert a == b, f
|
||||
|
||||
def path(self, suffix):
|
||||
return self.directory / f'{self.prefix}{suffix}'
|
||||
return self.directory / f"{self.prefix}{suffix}"
|
||||
|
||||
|
||||
def test_binary():
|
||||
fd = FileLoader(EXAMPLES_DIR / 'binary_classification', 'binary')
|
||||
X_train, y_train, _ = fd.load_dataset('.train')
|
||||
X_test, _, X_test_fn = fd.load_dataset('.test')
|
||||
weight_train = fd.load_field('.train.weight')
|
||||
fd = FileLoader(EXAMPLES_DIR / "binary_classification", "binary")
|
||||
X_train, y_train, _ = fd.load_dataset(".train")
|
||||
X_test, _, X_test_fn = fd.load_dataset(".test")
|
||||
weight_train = fd.load_field(".train.weight")
|
||||
lgb_train = lgb.Dataset(X_train, y_train, params=fd.params, weight=weight_train)
|
||||
gbm = lgb.LGBMClassifier(**fd.params)
|
||||
gbm.fit(X_train, y_train, sample_weight=weight_train)
|
||||
sk_pred = gbm.predict_proba(X_test)[:, 1]
|
||||
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
|
||||
fd.file_load_check(lgb_train, '.train')
|
||||
fd.file_load_check(lgb_train, ".train")
|
||||
|
||||
|
||||
def test_binary_linear():
|
||||
fd = FileLoader(EXAMPLES_DIR / 'binary_classification', 'binary', 'train_linear.conf')
|
||||
X_train, y_train, _ = fd.load_dataset('.train')
|
||||
X_test, _, X_test_fn = fd.load_dataset('.test')
|
||||
weight_train = fd.load_field('.train.weight')
|
||||
fd = FileLoader(EXAMPLES_DIR / "binary_classification", "binary", "train_linear.conf")
|
||||
X_train, y_train, _ = fd.load_dataset(".train")
|
||||
X_test, _, X_test_fn = fd.load_dataset(".test")
|
||||
weight_train = fd.load_field(".train.weight")
|
||||
lgb_train = lgb.Dataset(X_train, y_train, params=fd.params, weight=weight_train)
|
||||
gbm = lgb.LGBMClassifier(**fd.params)
|
||||
gbm.fit(X_train, y_train, sample_weight=weight_train)
|
||||
sk_pred = gbm.predict_proba(X_test)[:, 1]
|
||||
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
|
||||
fd.file_load_check(lgb_train, '.train')
|
||||
fd.file_load_check(lgb_train, ".train")
|
||||
|
||||
|
||||
def test_multiclass():
|
||||
fd = FileLoader(EXAMPLES_DIR / 'multiclass_classification', 'multiclass')
|
||||
X_train, y_train, _ = fd.load_dataset('.train')
|
||||
X_test, _, X_test_fn = fd.load_dataset('.test')
|
||||
fd = FileLoader(EXAMPLES_DIR / "multiclass_classification", "multiclass")
|
||||
X_train, y_train, _ = fd.load_dataset(".train")
|
||||
X_test, _, X_test_fn = fd.load_dataset(".test")
|
||||
lgb_train = lgb.Dataset(X_train, y_train)
|
||||
gbm = lgb.LGBMClassifier(**fd.params)
|
||||
gbm.fit(X_train, y_train)
|
||||
sk_pred = gbm.predict_proba(X_test)
|
||||
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
|
||||
fd.file_load_check(lgb_train, '.train')
|
||||
fd.file_load_check(lgb_train, ".train")
|
||||
|
||||
|
||||
def test_regression():
|
||||
fd = FileLoader(EXAMPLES_DIR / 'regression', 'regression')
|
||||
X_train, y_train, _ = fd.load_dataset('.train')
|
||||
X_test, _, X_test_fn = fd.load_dataset('.test')
|
||||
init_score_train = fd.load_field('.train.init')
|
||||
fd = FileLoader(EXAMPLES_DIR / "regression", "regression")
|
||||
X_train, y_train, _ = fd.load_dataset(".train")
|
||||
X_test, _, X_test_fn = fd.load_dataset(".test")
|
||||
init_score_train = fd.load_field(".train.init")
|
||||
lgb_train = lgb.Dataset(X_train, y_train, init_score=init_score_train)
|
||||
gbm = lgb.LGBMRegressor(**fd.params)
|
||||
gbm.fit(X_train, y_train, init_score=init_score_train)
|
||||
sk_pred = gbm.predict(X_test)
|
||||
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
|
||||
fd.file_load_check(lgb_train, '.train')
|
||||
fd.file_load_check(lgb_train, ".train")
|
||||
|
||||
|
||||
def test_lambdarank():
|
||||
fd = FileLoader(EXAMPLES_DIR / 'lambdarank', 'rank')
|
||||
X_train, y_train, _ = fd.load_dataset('.train', is_sparse=True)
|
||||
X_test, _, X_test_fn = fd.load_dataset('.test', is_sparse=True)
|
||||
group_train = fd.load_field('.train.query')
|
||||
fd = FileLoader(EXAMPLES_DIR / "lambdarank", "rank")
|
||||
X_train, y_train, _ = fd.load_dataset(".train", is_sparse=True)
|
||||
X_test, _, X_test_fn = fd.load_dataset(".test", is_sparse=True)
|
||||
group_train = fd.load_field(".train.query")
|
||||
lgb_train = lgb.Dataset(X_train, y_train, group=group_train)
|
||||
params = dict(fd.params)
|
||||
params['force_col_wise'] = True
|
||||
params["force_col_wise"] = True
|
||||
gbm = lgb.LGBMRanker(**params)
|
||||
gbm.fit(X_train, y_train, group=group_train)
|
||||
sk_pred = gbm.predict(X_test)
|
||||
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
|
||||
fd.file_load_check(lgb_train, '.train')
|
||||
fd.file_load_check(lgb_train, ".train")
|
||||
|
||||
|
||||
def test_xendcg():
|
||||
fd = FileLoader(EXAMPLES_DIR / 'xendcg', 'rank')
|
||||
X_train, y_train, _ = fd.load_dataset('.train', is_sparse=True)
|
||||
X_test, _, X_test_fn = fd.load_dataset('.test', is_sparse=True)
|
||||
group_train = fd.load_field('.train.query')
|
||||
fd = FileLoader(EXAMPLES_DIR / "xendcg", "rank")
|
||||
X_train, y_train, _ = fd.load_dataset(".train", is_sparse=True)
|
||||
X_test, _, X_test_fn = fd.load_dataset(".test", is_sparse=True)
|
||||
group_train = fd.load_field(".train.query")
|
||||
lgb_train = lgb.Dataset(X_train, y_train, group=group_train)
|
||||
gbm = lgb.LGBMRanker(**fd.params)
|
||||
gbm.fit(X_train, y_train, group=group_train)
|
||||
sk_pred = gbm.predict(X_test)
|
||||
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
|
||||
fd.file_load_check(lgb_train, '.train')
|
||||
fd.file_load_check(lgb_train, ".train")
|
||||
|
|
The diff for this file is not shown because it is too large.
|
@@ -28,7 +28,7 @@ def test_cpu_and_gpu_work():
|
|||
params_gpu = params_cpu.copy()
|
||||
params_gpu["device"] = "gpu"
|
||||
# Double-precision floats are only supported on x86_64 with PoCL
|
||||
params_gpu["gpu_use_dp"] = (platform.machine() == "x86_64")
|
||||
params_gpu["gpu_use_dp"] = platform.machine() == "x86_64"
|
||||
gpu_bst = lgb.train(params_gpu, data, num_boost_round=10)
|
||||
gpu_score = log_loss(y, gpu_bst.predict(X))
|
||||
|
||||
|
|
The diff for this file is not shown because it is too large.
|
@@ -9,7 +9,8 @@ from lightgbm.compat import GRAPHVIZ_INSTALLED, MATPLOTLIB_INSTALLED, PANDAS_INS
|
|||
|
||||
if MATPLOTLIB_INSTALLED:
|
||||
import matplotlib
|
||||
matplotlib.use('Agg')
|
||||
|
||||
matplotlib.use("Agg")
|
||||
if GRAPHVIZ_INSTALLED:
|
||||
import graphviz
|
||||
|
||||
|
@@ -18,8 +19,7 @@ from .utils import load_breast_cancer, make_synthetic_regression
|
|||
|
||||
@pytest.fixture(scope="module")
|
||||
def breast_cancer_split():
|
||||
return train_test_split(*load_breast_cancer(return_X_y=True),
|
||||
test_size=0.1, random_state=1)
|
||||
return train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=1)
|
||||
|
||||
|
||||
def _categorical_data(category_values_lower_bound, category_values_upper_bound):
|
||||
|
@@ -41,51 +41,51 @@ def train_data(breast_cancer_split):
|
|||
|
||||
@pytest.fixture
|
||||
def params():
|
||||
return {"objective": "binary",
|
||||
"verbose": -1,
|
||||
"num_leaves": 3}
|
||||
return {"objective": "binary", "verbose": -1, "num_leaves": 3}
|
||||
|
||||
|
||||
@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason='matplotlib is not installed')
|
||||
@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason="matplotlib is not installed")
|
||||
def test_plot_importance(params, breast_cancer_split, train_data):
|
||||
X_train, _, y_train, _ = breast_cancer_split
|
||||
|
||||
gbm0 = lgb.train(params, train_data, num_boost_round=10)
|
||||
ax0 = lgb.plot_importance(gbm0)
|
||||
assert isinstance(ax0, matplotlib.axes.Axes)
|
||||
assert ax0.get_title() == 'Feature importance'
|
||||
assert ax0.get_xlabel() == 'Feature importance'
|
||||
assert ax0.get_ylabel() == 'Features'
|
||||
assert ax0.get_title() == "Feature importance"
|
||||
assert ax0.get_xlabel() == "Feature importance"
|
||||
assert ax0.get_ylabel() == "Features"
|
||||
assert len(ax0.patches) <= 30
|
||||
|
||||
gbm1 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1)
|
||||
gbm1.fit(X_train, y_train)
|
||||
|
||||
ax1 = lgb.plot_importance(gbm1, color='r', title='t', xlabel='x', ylabel='y')
|
||||
ax1 = lgb.plot_importance(gbm1, color="r", title="t", xlabel="x", ylabel="y")
|
||||
assert isinstance(ax1, matplotlib.axes.Axes)
|
||||
assert ax1.get_title() == 't'
|
||||
assert ax1.get_xlabel() == 'x'
|
||||
assert ax1.get_ylabel() == 'y'
|
||||
assert ax1.get_title() == "t"
|
||||
assert ax1.get_xlabel() == "x"
|
||||
assert ax1.get_ylabel() == "y"
|
||||
assert len(ax1.patches) <= 30
|
||||
for patch in ax1.patches:
|
||||
assert patch.get_facecolor() == (1., 0, 0, 1.) # red
|
||||
assert patch.get_facecolor() == (1.0, 0, 0, 1.0) # red
|
||||
|
||||
ax2 = lgb.plot_importance(gbm0, color=['r', 'y', 'g', 'b'], title=None, xlabel=None, ylabel=None)
|
||||
ax2 = lgb.plot_importance(gbm0, color=["r", "y", "g", "b"], title=None, xlabel=None, ylabel=None)
|
||||
assert isinstance(ax2, matplotlib.axes.Axes)
|
||||
assert ax2.get_title() == ''
|
||||
assert ax2.get_xlabel() == ''
|
||||
assert ax2.get_ylabel() == ''
|
||||
assert ax2.get_title() == ""
|
||||
assert ax2.get_xlabel() == ""
|
||||
assert ax2.get_ylabel() == ""
|
||||
assert len(ax2.patches) <= 30
|
||||
assert ax2.patches[0].get_facecolor() == (1., 0, 0, 1.) # r
|
||||
assert ax2.patches[1].get_facecolor() == (.75, .75, 0, 1.) # y
|
||||
assert ax2.patches[2].get_facecolor() == (0, .5, 0, 1.) # g
|
||||
assert ax2.patches[3].get_facecolor() == (0, 0, 1., 1.) # b
|
||||
assert ax2.patches[0].get_facecolor() == (1.0, 0, 0, 1.0) # r
|
||||
assert ax2.patches[1].get_facecolor() == (0.75, 0.75, 0, 1.0) # y
|
||||
assert ax2.patches[2].get_facecolor() == (0, 0.5, 0, 1.0) # g
|
||||
assert ax2.patches[3].get_facecolor() == (0, 0, 1.0, 1.0) # b
|
||||
|
||||
ax3 = lgb.plot_importance(gbm0, title='t @importance_type@', xlabel='x @importance_type@', ylabel='y @importance_type@')
|
||||
ax3 = lgb.plot_importance(
|
||||
gbm0, title="t @importance_type@", xlabel="x @importance_type@", ylabel="y @importance_type@"
|
||||
)
|
||||
assert isinstance(ax3, matplotlib.axes.Axes)
|
||||
assert ax3.get_title() == 't @importance_type@'
|
||||
assert ax3.get_xlabel() == 'x split'
|
||||
assert ax3.get_ylabel() == 'y @importance_type@'
|
||||
assert ax3.get_title() == "t @importance_type@"
|
||||
assert ax3.get_xlabel() == "x split"
|
||||
assert ax3.get_ylabel() == "y @importance_type@"
|
||||
assert len(ax3.patches) <= 30
|
||||
|
||||
gbm2 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1, importance_type="gain")
|
||||
|
@@ -108,51 +108,59 @@ def test_plot_importance(params, breast_cancer_split, train_data):
assert first_bar1 != first_bar3


-@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason='matplotlib is not installed')
+@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason="matplotlib is not installed")
def test_plot_split_value_histogram(params, breast_cancer_split, train_data):
X_train, _, y_train, _ = breast_cancer_split

gbm0 = lgb.train(params, train_data, num_boost_round=10)
ax0 = lgb.plot_split_value_histogram(gbm0, 27)
assert isinstance(ax0, matplotlib.axes.Axes)
-assert ax0.get_title() == 'Split value histogram for feature with index 27'
-assert ax0.get_xlabel() == 'Feature split value'
-assert ax0.get_ylabel() == 'Count'
+assert ax0.get_title() == "Split value histogram for feature with index 27"
+assert ax0.get_xlabel() == "Feature split value"
+assert ax0.get_ylabel() == "Count"
assert len(ax0.patches) <= 2

gbm1 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1)
gbm1.fit(X_train, y_train)

-ax1 = lgb.plot_split_value_histogram(gbm1, gbm1.booster_.feature_name()[27], figsize=(10, 5),
-title='Histogram for feature @index/name@ @feature@',
-xlabel='x', ylabel='y', color='r')
+ax1 = lgb.plot_split_value_histogram(
+gbm1,
+gbm1.booster_.feature_name()[27],
+figsize=(10, 5),
+title="Histogram for feature @index/name@ @feature@",
+xlabel="x",
+ylabel="y",
+color="r",
+)
assert isinstance(ax1, matplotlib.axes.Axes)
-title = f'Histogram for feature name {gbm1.booster_.feature_name()[27]}'
+title = f"Histogram for feature name {gbm1.booster_.feature_name()[27]}"
assert ax1.get_title() == title
-assert ax1.get_xlabel() == 'x'
-assert ax1.get_ylabel() == 'y'
+assert ax1.get_xlabel() == "x"
+assert ax1.get_ylabel() == "y"
assert len(ax1.patches) <= 2
for patch in ax1.patches:
-assert patch.get_facecolor() == (1., 0, 0, 1.) # red
+assert patch.get_facecolor() == (1.0, 0, 0, 1.0) # red

-ax2 = lgb.plot_split_value_histogram(gbm0, 27, bins=10, color=['r', 'y', 'g', 'b'],
-title=None, xlabel=None, ylabel=None)
+ax2 = lgb.plot_split_value_histogram(
+gbm0, 27, bins=10, color=["r", "y", "g", "b"], title=None, xlabel=None, ylabel=None
+)
assert isinstance(ax2, matplotlib.axes.Axes)
-assert ax2.get_title() == ''
-assert ax2.get_xlabel() == ''
-assert ax2.get_ylabel() == ''
+assert ax2.get_title() == ""
+assert ax2.get_xlabel() == ""
+assert ax2.get_ylabel() == ""
assert len(ax2.patches) == 10
-assert ax2.patches[0].get_facecolor() == (1., 0, 0, 1.) # r
-assert ax2.patches[1].get_facecolor() == (.75, .75, 0, 1.) # y
-assert ax2.patches[2].get_facecolor() == (0, .5, 0, 1.) # g
-assert ax2.patches[3].get_facecolor() == (0, 0, 1., 1.) # b
+assert ax2.patches[0].get_facecolor() == (1.0, 0, 0, 1.0) # r
+assert ax2.patches[1].get_facecolor() == (0.75, 0.75, 0, 1.0) # y
+assert ax2.patches[2].get_facecolor() == (0, 0.5, 0, 1.0) # g
+assert ax2.patches[3].get_facecolor() == (0, 0, 1.0, 1.0) # b

with pytest.raises(ValueError):
lgb.plot_split_value_histogram(gbm0, 0) # was not used in splitting


-@pytest.mark.skipif(not MATPLOTLIB_INSTALLED or not GRAPHVIZ_INSTALLED,
-reason='matplotlib or graphviz is not installed')
+@pytest.mark.skipif(
+not MATPLOTLIB_INSTALLED or not GRAPHVIZ_INSTALLED, reason="matplotlib or graphviz is not installed"
+)
def test_plot_tree(breast_cancer_split):
X_train, _, y_train, _ = breast_cancer_split
gbm = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1)
@@ -161,14 +169,14 @@ def test_plot_tree(breast_cancer_split):
with pytest.raises(IndexError):
lgb.plot_tree(gbm, tree_index=83)

-ax = lgb.plot_tree(gbm, tree_index=3, figsize=(15, 8), show_info=['split_gain'])
+ax = lgb.plot_tree(gbm, tree_index=3, figsize=(15, 8), show_info=["split_gain"])
assert isinstance(ax, matplotlib.axes.Axes)
w, h = ax.axes.get_figure().get_size_inches()
assert int(w) == 15
assert int(h) == 8


-@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason='graphviz is not installed')
+@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
def test_create_tree_digraph(breast_cancer_split):
X_train, _, y_train, _ = breast_cancer_split
@@ -179,28 +187,32 @@ def test_create_tree_digraph(breast_cancer_split):
with pytest.raises(IndexError):
lgb.create_tree_digraph(gbm, tree_index=83)

-graph = lgb.create_tree_digraph(gbm, tree_index=3,
-show_info=['split_gain', 'internal_value', 'internal_weight'],
-name='Tree4', node_attr={'color': 'red'})
+graph = lgb.create_tree_digraph(
+gbm,
+tree_index=3,
+show_info=["split_gain", "internal_value", "internal_weight"],
+name="Tree4",
+node_attr={"color": "red"},
+)
graph.render(view=False)
assert isinstance(graph, graphviz.Digraph)
-assert graph.name == 'Tree4'
+assert graph.name == "Tree4"
assert len(graph.node_attr) == 1
-assert graph.node_attr['color'] == 'red'
+assert graph.node_attr["color"] == "red"
assert len(graph.graph_attr) == 0
assert len(graph.edge_attr) == 0
-graph_body = ''.join(graph.body)
-assert 'leaf' in graph_body
-assert 'gain' in graph_body
-assert 'value' in graph_body
-assert 'weight' in graph_body
-assert '#ffdddd' in graph_body
-assert '#ddffdd' in graph_body
-assert 'data' not in graph_body
-assert 'count' not in graph_body
+graph_body = "".join(graph.body)
+assert "leaf" in graph_body
+assert "gain" in graph_body
+assert "value" in graph_body
+assert "weight" in graph_body
+assert "#ffdddd" in graph_body
+assert "#ddffdd" in graph_body
+assert "data" not in graph_body
+assert "count" not in graph_body


-@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason='graphviz is not installed')
+@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
def test_tree_with_categories_below_max_category_values():
X_train, y_train = _categorical_data(2, 10)
params = {
@@ -211,7 +223,7 @@ def test_tree_with_categories_below_max_category_values():
"deterministic": True,
"num_threads": 1,
"seed": 708,
-"verbose": -1
+"verbose": -1,
}
gbm = lgb.LGBMClassifier(**params)
gbm.fit(X_train, y_train)
@@ -219,28 +231,32 @@ def test_tree_with_categories_below_max_category_values():
with pytest.raises(IndexError):
lgb.create_tree_digraph(gbm, tree_index=83)

-graph = lgb.create_tree_digraph(gbm, tree_index=3,
-show_info=['split_gain', 'internal_value', 'internal_weight'],
-name='Tree4', node_attr={'color': 'red'},
-max_category_values=10)
+graph = lgb.create_tree_digraph(
+gbm,
+tree_index=3,
+show_info=["split_gain", "internal_value", "internal_weight"],
+name="Tree4",
+node_attr={"color": "red"},
+max_category_values=10,
+)
graph.render(view=False)
assert isinstance(graph, graphviz.Digraph)
-assert graph.name == 'Tree4'
+assert graph.name == "Tree4"
assert len(graph.node_attr) == 1
-assert graph.node_attr['color'] == 'red'
+assert graph.node_attr["color"] == "red"
assert len(graph.graph_attr) == 0
assert len(graph.edge_attr) == 0
-graph_body = ''.join(graph.body)
-assert 'leaf' in graph_body
-assert 'gain' in graph_body
-assert 'value' in graph_body
-assert 'weight' in graph_body
-assert 'data' not in graph_body
-assert 'count' not in graph_body
-assert '||...||' not in graph_body
+graph_body = "".join(graph.body)
+assert "leaf" in graph_body
+assert "gain" in graph_body
+assert "value" in graph_body
+assert "weight" in graph_body
+assert "data" not in graph_body
+assert "count" not in graph_body
+assert "||...||" not in graph_body


-@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason='graphviz is not installed')
+@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
def test_tree_with_categories_above_max_category_values():
X_train, y_train = _categorical_data(20, 30)
params = {
@@ -251,7 +267,7 @@ def test_tree_with_categories_above_max_category_values():
"deterministic": True,
"num_threads": 1,
"seed": 708,
-"verbose": -1
+"verbose": -1,
}
gbm = lgb.LGBMClassifier(**params)
gbm.fit(X_train, y_train)
@@ -259,32 +275,36 @@ def test_tree_with_categories_above_max_category_values():
with pytest.raises(IndexError):
lgb.create_tree_digraph(gbm, tree_index=83)

-graph = lgb.create_tree_digraph(gbm, tree_index=9,
-show_info=['split_gain', 'internal_value', 'internal_weight'],
-name='Tree4', node_attr={'color': 'red'},
-max_category_values=4)
+graph = lgb.create_tree_digraph(
+gbm,
+tree_index=9,
+show_info=["split_gain", "internal_value", "internal_weight"],
+name="Tree4",
+node_attr={"color": "red"},
+max_category_values=4,
+)
graph.render(view=False)
assert isinstance(graph, graphviz.Digraph)
-assert graph.name == 'Tree4'
+assert graph.name == "Tree4"
assert len(graph.node_attr) == 1
-assert graph.node_attr['color'] == 'red'
+assert graph.node_attr["color"] == "red"
assert len(graph.graph_attr) == 0
assert len(graph.edge_attr) == 0
-graph_body = ''.join(graph.body)
-assert 'leaf' in graph_body
-assert 'gain' in graph_body
-assert 'value' in graph_body
-assert 'weight' in graph_body
-assert 'data' not in graph_body
-assert 'count' not in graph_body
-assert '||...||' in graph_body
+graph_body = "".join(graph.body)
+assert "leaf" in graph_body
+assert "gain" in graph_body
+assert "value" in graph_body
+assert "weight" in graph_body
+assert "data" not in graph_body
+assert "count" not in graph_body
+assert "||...||" in graph_body


-@pytest.mark.parametrize('use_missing', [True, False])
-@pytest.mark.parametrize('zero_as_missing', [True, False])
+@pytest.mark.parametrize("use_missing", [True, False])
+@pytest.mark.parametrize("zero_as_missing", [True, False])
def test_numeric_split_direction(use_missing, zero_as_missing):
if use_missing and zero_as_missing:
-pytest.skip('use_missing and zero_as_missing both set to True')
+pytest.skip("use_missing and zero_as_missing both set to True")
X, y = make_synthetic_regression()
rng = np.random.RandomState(0)
zero_mask = rng.rand(X.shape[0]) < 0.05
@@ -294,48 +314,48 @@ def test_numeric_split_direction(use_missing, zero_as_missing):
X[nan_mask, :] = np.nan
ds = lgb.Dataset(X, y)
params = {
-'num_leaves': 127,
-'min_child_samples': 1,
-'use_missing': use_missing,
-'zero_as_missing': zero_as_missing,
+"num_leaves": 127,
+"min_child_samples": 1,
+"use_missing": use_missing,
+"zero_as_missing": zero_as_missing,
}
bst = lgb.train(params, ds, num_boost_round=1)

case_with_zero = X[zero_mask][[0]]
expected_leaf_zero = bst.predict(case_with_zero, pred_leaf=True)[0]
-node = bst.dump_model()['tree_info'][0]['tree_structure']
-while 'decision_type' in node:
+node = bst.dump_model()["tree_info"][0]["tree_structure"]
+while "decision_type" in node:
direction = lgb.plotting._determine_direction_for_numeric_split(
-case_with_zero[0][node['split_feature']], node['threshold'], node['missing_type'], node['default_left']
+case_with_zero[0][node["split_feature"]], node["threshold"], node["missing_type"], node["default_left"]
)
-node = node['left_child'] if direction == 'left' else node['right_child']
-assert node['leaf_index'] == expected_leaf_zero
+node = node["left_child"] if direction == "left" else node["right_child"]
+assert node["leaf_index"] == expected_leaf_zero

if use_missing:
case_with_nan = X[nan_mask][[0]]
expected_leaf_nan = bst.predict(case_with_nan, pred_leaf=True)[0]
-node = bst.dump_model()['tree_info'][0]['tree_structure']
-while 'decision_type' in node:
+node = bst.dump_model()["tree_info"][0]["tree_structure"]
+while "decision_type" in node:
direction = lgb.plotting._determine_direction_for_numeric_split(
-case_with_nan[0][node['split_feature']], node['threshold'], node['missing_type'], node['default_left']
+case_with_nan[0][node["split_feature"]], node["threshold"], node["missing_type"], node["default_left"]
)
-node = node['left_child'] if direction == 'left' else node['right_child']
-assert node['leaf_index'] == expected_leaf_nan
+node = node["left_child"] if direction == "left" else node["right_child"]
+assert node["leaf_index"] == expected_leaf_nan
assert expected_leaf_zero != expected_leaf_nan


-@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason='graphviz is not installed')
+@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
def test_example_case_in_tree_digraph():
rng = np.random.RandomState(0)
x1 = rng.rand(100)
cat = rng.randint(1, 3, size=x1.size)
X = np.vstack([x1, cat]).T
y = x1 + 2 * cat
-feature_name = ['x1', 'cat']
-ds = lgb.Dataset(X, y, feature_name=feature_name, categorical_feature=['cat'])
+feature_name = ["x1", "cat"]
+ds = lgb.Dataset(X, y, feature_name=feature_name, categorical_feature=["cat"])

num_round = 3
-bst = lgb.train({'num_leaves': 7}, ds, num_boost_round=num_round)
+bst = lgb.train({"num_leaves": 7}, ds, num_boost_round=num_round)
mod = bst.dump_model()
example_case = X[[0]]
makes_categorical_splits = False
@@ -343,42 +363,46 @@ def test_example_case_in_tree_digraph():
for i in range(num_round):
graph = lgb.create_tree_digraph(bst, example_case=example_case, tree_index=i)
gbody = graph.body
-node = mod['tree_info'][i]['tree_structure']
-while 'decision_type' in node: # iterate through the splits
-split_index = node['split_index']
+node = mod["tree_info"][i]["tree_structure"]
+while "decision_type" in node: # iterate through the splits
+split_index = node["split_index"]

-node_in_graph = [n for n in gbody if f'split{split_index}' in n and '->' not in n]
+node_in_graph = [n for n in gbody if f"split{split_index}" in n and "->" not in n]
assert len(node_in_graph) == 1
seen_indices.add(gbody.index(node_in_graph[0]))

-edge_to_node = [e for e in gbody if f'-> split{split_index}' in e]
-if node['decision_type'] == '<=':
+edge_to_node = [e for e in gbody if f"-> split{split_index}" in e]
+if node["decision_type"] == "<=":
direction = lgb.plotting._determine_direction_for_numeric_split(
-example_case[0][node['split_feature']], node['threshold'], node['missing_type'], node['default_left'])
+example_case[0][node["split_feature"]],
+node["threshold"],
+node["missing_type"],
+node["default_left"],
+)
else:
makes_categorical_splits = True
direction = lgb.plotting._determine_direction_for_categorical_split(
-example_case[0][node['split_feature']], node['threshold']
+example_case[0][node["split_feature"]], node["threshold"]
)
-node = node['left_child'] if direction == 'left' else node['right_child']
-assert 'color=blue' in node_in_graph[0]
+node = node["left_child"] if direction == "left" else node["right_child"]
+assert "color=blue" in node_in_graph[0]
if edge_to_node:
assert len(edge_to_node) == 1
-assert 'color=blue' in edge_to_node[0]
+assert "color=blue" in edge_to_node[0]
seen_indices.add(gbody.index(edge_to_node[0]))
# we're in a leaf now
-leaf_index = node['leaf_index']
-leaf_in_graph = [n for n in gbody if f'leaf{leaf_index}' in n and '->' not in n]
-edge_to_leaf = [e for e in gbody if f'-> leaf{leaf_index}' in e]
+leaf_index = node["leaf_index"]
+leaf_in_graph = [n for n in gbody if f"leaf{leaf_index}" in n and "->" not in n]
+edge_to_leaf = [e for e in gbody if f"-> leaf{leaf_index}" in e]
assert len(leaf_in_graph) == 1
-assert 'color=blue' in leaf_in_graph[0]
+assert "color=blue" in leaf_in_graph[0]
assert len(edge_to_leaf) == 1
-assert 'color=blue' in edge_to_leaf[0]
+assert "color=blue" in edge_to_leaf[0]
seen_indices.update([gbody.index(leaf_in_graph[0]), gbody.index(edge_to_leaf[0])])

# check that the rest of the elements have black color
-remaining_elements = [e for i, e in enumerate(graph.body) if i not in seen_indices and 'graph' not in e]
-assert all('color=black' in e for e in remaining_elements)
+remaining_elements = [e for i, e in enumerate(graph.body) if i not in seen_indices and "graph" not in e]
+assert all("color=black" in e for e in remaining_elements)

# check that we got to the expected leaf
expected_leaf = bst.predict(example_case, start_iteration=i, num_iteration=1, pred_leaf=True)[0]
@@ -386,83 +410,86 @@ def test_example_case_in_tree_digraph():
assert makes_categorical_splits


-@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason='graphviz is not installed')
-@pytest.mark.parametrize('input_type', ['array', 'dataframe'])
+@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
+@pytest.mark.parametrize("input_type", ["array", "dataframe"])
def test_empty_example_case_on_tree_digraph_raises_error(input_type):
X, y = make_synthetic_regression()
-if input_type == 'dataframe':
+if input_type == "dataframe":
if not PANDAS_INSTALLED:
-pytest.skip(reason='pandas is not installed')
+pytest.skip(reason="pandas is not installed")
X = pd_DataFrame(X)
ds = lgb.Dataset(X, y)
-bst = lgb.train({'num_leaves': 3}, ds, num_boost_round=1)
+bst = lgb.train({"num_leaves": 3}, ds, num_boost_round=1)
example_case = X[:0]
-if input_type == 'dataframe':
+if input_type == "dataframe":
example_case = pd_DataFrame(example_case)
-with pytest.raises(ValueError, match='example_case must have a single row.'):
+with pytest.raises(ValueError, match="example_case must have a single row."):
lgb.create_tree_digraph(bst, tree_index=0, example_case=example_case)


-@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason='matplotlib is not installed')
+@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason="matplotlib is not installed")
def test_plot_metrics(params, breast_cancer_split, train_data):
X_train, X_test, y_train, y_test = breast_cancer_split
test_data = lgb.Dataset(X_test, y_test, reference=train_data)
params.update({"metric": {"binary_logloss", "binary_error"}})

evals_result0 = {}
-lgb.train(params, train_data,
-valid_sets=[train_data, test_data],
-valid_names=['v1', 'v2'],
-num_boost_round=10,
-callbacks=[lgb.record_evaluation(evals_result0)])
+lgb.train(
+params,
+train_data,
+valid_sets=[train_data, test_data],
+valid_names=["v1", "v2"],
+num_boost_round=10,
+callbacks=[lgb.record_evaluation(evals_result0)],
+)
with pytest.warns(UserWarning, match="More than one metric available, picking one to plot."):
ax0 = lgb.plot_metric(evals_result0)
assert isinstance(ax0, matplotlib.axes.Axes)
-assert ax0.get_title() == 'Metric during training'
-assert ax0.get_xlabel() == 'Iterations'
-assert ax0.get_ylabel() in {'binary_logloss', 'binary_error'}
+assert ax0.get_title() == "Metric during training"
+assert ax0.get_xlabel() == "Iterations"
+assert ax0.get_ylabel() in {"binary_logloss", "binary_error"}
legend_items = ax0.get_legend().get_texts()
assert len(legend_items) == 2
-assert legend_items[0].get_text() == 'v1'
-assert legend_items[1].get_text() == 'v2'
+assert legend_items[0].get_text() == "v1"
+assert legend_items[1].get_text() == "v2"

-ax1 = lgb.plot_metric(evals_result0, metric='binary_error')
+ax1 = lgb.plot_metric(evals_result0, metric="binary_error")
assert isinstance(ax1, matplotlib.axes.Axes)
-assert ax1.get_title() == 'Metric during training'
-assert ax1.get_xlabel() == 'Iterations'
-assert ax1.get_ylabel() == 'binary_error'
+assert ax1.get_title() == "Metric during training"
+assert ax1.get_xlabel() == "Iterations"
+assert ax1.get_ylabel() == "binary_error"
legend_items = ax1.get_legend().get_texts()
assert len(legend_items) == 2
-assert legend_items[0].get_text() == 'v1'
-assert legend_items[1].get_text() == 'v2'
+assert legend_items[0].get_text() == "v1"
+assert legend_items[1].get_text() == "v2"

-ax2 = lgb.plot_metric(evals_result0, metric='binary_logloss', dataset_names=['v2'])
+ax2 = lgb.plot_metric(evals_result0, metric="binary_logloss", dataset_names=["v2"])
assert isinstance(ax2, matplotlib.axes.Axes)
-assert ax2.get_title() == 'Metric during training'
-assert ax2.get_xlabel() == 'Iterations'
-assert ax2.get_ylabel() == 'binary_logloss'
+assert ax2.get_title() == "Metric during training"
+assert ax2.get_xlabel() == "Iterations"
+assert ax2.get_ylabel() == "binary_logloss"
legend_items = ax2.get_legend().get_texts()
assert len(legend_items) == 1
-assert legend_items[0].get_text() == 'v2'
+assert legend_items[0].get_text() == "v2"

ax3 = lgb.plot_metric(
evals_result0,
-metric='binary_logloss',
-dataset_names=['v1'],
-title='Metric @metric@',
-xlabel='Iterations @metric@',
+metric="binary_logloss",
+dataset_names=["v1"],
+title="Metric @metric@",
+xlabel="Iterations @metric@",
ylabel='Value of "@metric@"',
figsize=(5, 5),
dpi=600,
-grid=False
+grid=False,
)
assert isinstance(ax3, matplotlib.axes.Axes)
-assert ax3.get_title() == 'Metric @metric@'
-assert ax3.get_xlabel() == 'Iterations @metric@'
+assert ax3.get_title() == "Metric @metric@"
+assert ax3.get_xlabel() == "Iterations @metric@"
assert ax3.get_ylabel() == 'Value of "binary_logloss"'
legend_items = ax3.get_legend().get_texts()
assert len(legend_items) == 1
-assert legend_items[0].get_text() == 'v1'
+assert legend_items[0].get_text() == "v1"
assert ax3.get_figure().get_figheight() == 5
assert ax3.get_figure().get_figwidth() == 5
assert ax3.get_figure().get_dpi() == 600
@@ -472,9 +499,7 @@ def test_plot_metrics(params, breast_cancer_split, train_data):
assert not grid_line.get_visible()

evals_result1 = {}
-lgb.train(params, train_data,
-num_boost_round=10,
-callbacks=[lgb.record_evaluation(evals_result1)])
+lgb.train(params, train_data, num_boost_round=10, callbacks=[lgb.record_evaluation(evals_result1)])
with pytest.raises(ValueError, match="eval results cannot be empty."):
lgb.plot_metric(evals_result1)
@@ -482,9 +507,9 @@ def test_plot_metrics(params, breast_cancer_split, train_data):
gbm2.fit(X_train, y_train, eval_set=[(X_test, y_test)])
ax4 = lgb.plot_metric(gbm2, title=None, xlabel=None, ylabel=None)
assert isinstance(ax4, matplotlib.axes.Axes)
-assert ax4.get_title() == ''
-assert ax4.get_xlabel() == ''
-assert ax4.get_ylabel() == ''
+assert ax4.get_title() == ""
+assert ax4.get_xlabel() == ""
+assert ax4.get_ylabel() == ""
legend_items = ax4.get_legend().get_texts()
assert len(legend_items) == 1
-assert legend_items[0].get_text() == 'valid_0'
+assert legend_items[0].get_text() == "valid_0"
Diff for this file is not shown because it is too large.
@@ -10,7 +10,7 @@ import lightgbm as lgb
def test_register_logger(tmp_path):
logger = logging.getLogger("LightGBM")
logger.setLevel(logging.DEBUG)
-formatter = logging.Formatter('%(levelname)s | %(message)s')
+formatter = logging.Formatter("%(levelname)s | %(message)s")
log_filename = tmp_path / "LightGBM_test_logger.log"
file_handler = logging.FileHandler(log_filename, mode="w", encoding="utf-8")
file_handler.setLevel(logging.DEBUG)
@@ -18,29 +18,27 @@ def test_register_logger(tmp_path):
logger.addHandler(file_handler)

def dummy_metric(_, __):
-logger.debug('In dummy_metric')
-return 'dummy_metric', 1, True
+logger.debug("In dummy_metric")
+return "dummy_metric", 1, True

lgb.register_logger(logger)

-X = np.array([[1, 2, 3],
-[1, 2, 4],
-[1, 2, 4],
-[1, 2, 3]],
-dtype=np.float32)
+X = np.array([[1, 2, 3], [1, 2, 4], [1, 2, 4], [1, 2, 3]], dtype=np.float32)
y = np.array([0, 1, 1, 0])
lgb_train = lgb.Dataset(X, y)
lgb_valid = lgb.Dataset(X, y) # different object for early-stopping

eval_records = {}
-callbacks = [
-lgb.record_evaluation(eval_records),
-lgb.log_evaluation(2),
-lgb.early_stopping(10)
-]
-lgb.train({'objective': 'binary', 'metric': ['auc', 'binary_error']},
-lgb_train, num_boost_round=10, feval=dummy_metric,
-valid_sets=[lgb_valid], categorical_feature=[1], callbacks=callbacks)
+callbacks = [lgb.record_evaluation(eval_records), lgb.log_evaluation(2), lgb.early_stopping(10)]
+lgb.train(
+{"objective": "binary", "metric": ["auc", "binary_error"]},
+lgb_train,
+num_boost_round=10,
+feval=dummy_metric,
+valid_sets=[lgb_valid],
+categorical_feature=[1],
+callbacks=callbacks,
+)

lgb.plot_metric(eval_records)
@@ -89,7 +87,7 @@ WARNING | More than one metric available, picking one to plot.
"INFO | [LightGBM] [Warning] GPU acceleration is disabled because no non-trivial dense features can be found",
"INFO | [LightGBM] [Warning] Using sparse features with CUDA is currently not supported.",
"INFO | [LightGBM] [Warning] CUDA currently requires double precision calculations.",
-"INFO | [LightGBM] [Info] LightGBM using CUDA trainer with DP float!!"
+"INFO | [LightGBM] [Info] LightGBM using CUDA trainer with DP float!!",
]
cuda_lines = [
"INFO | [LightGBM] [Warning] Metric auc is not implemented in cuda version. Fall back to evaluation on CPU.",
@@ -142,11 +140,7 @@ def test_register_custom_logger():
logged_messages.append(msg)

custom_logger = CustomLogger()
-lgb.register_logger(
-custom_logger,
-info_method_name="custom_info",
-warning_method_name="custom_warning"
-)
+lgb.register_logger(custom_logger, info_method_name="custom_info", warning_method_name="custom_warning")

lgb.basic._log_info("info message")
lgb.basic._log_warning("warning message")
@@ -155,18 +149,14 @@
assert logged_messages == expected_log

logged_messages = []
-X = np.array([[1, 2, 3],
-[1, 2, 4],
-[1, 2, 4],
-[1, 2, 3]],
-dtype=np.float32)
+X = np.array([[1, 2, 3], [1, 2, 4], [1, 2, 4], [1, 2, 3]], dtype=np.float32)
y = np.array([0, 1, 1, 0])
lgb_data = lgb.Dataset(X, y)
lgb.train(
-{'objective': 'binary', 'metric': 'auc'},
+{"objective": "binary", "metric": "auc"},
lgb_data,
num_boost_round=10,
valid_sets=[lgb_data],
-categorical_feature=[1]
+categorical_feature=[1],
)
assert logged_messages, "custom logger was not called"
@@ -34,8 +34,9 @@ def load_linnerud(**kwargs):
return sklearn.datasets.load_linnerud(**kwargs)


-def make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2,
-group=None, random_gs=False, avg_gs=10, random_state=0):
+def make_ranking(
+n_samples=100, n_features=20, n_informative=5, gmax=2, group=None, random_gs=False, avg_gs=10, random_state=0
+):
"""Generate a learning-to-rank dataset - feature vectors grouped together with
integer-valued graded relevance scores. Replace this with a sklearn.datasets function
if ranking objective becomes supported in sklearn.datasets module.
@@ -81,7 +82,7 @@ def make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2,
relvalues = range(gmax + 1)

# build y/target and group-id vectors with user-specified group sizes.
-if group is not None and hasattr(group, '__len__'):
+if group is not None and hasattr(group, "__len__"):
n_samples = np.sum(group)

for i, gsize in enumerate(group):
@@ -116,8 +117,9 @@

@lru_cache(maxsize=None)
def make_synthetic_regression(n_samples=100, n_features=4, n_informative=2, random_state=42):
-return sklearn.datasets.make_regression(n_samples=n_samples, n_features=n_features,
-n_informative=n_informative, random_state=random_state)
+return sklearn.datasets.make_regression(
+n_samples=n_samples, n_features=n_features, n_informative=n_informative, random_state=random_state
+)


def dummy_obj(preds, train_data):
@@ -126,7 +128,7 @@ def dummy_obj(preds, train_data):

def mse_obj(y_pred, dtrain):
y_true = dtrain.get_label()
-grad = (y_pred - y_true)
+grad = y_pred - y_true
hess = np.ones(len(grad))
return grad, hess
@@ -157,50 +159,41 @@ def sklearn_multiclass_custom_objective(y_true, y_pred, weight=None):


def pickle_obj(obj, filepath, serializer):
-if serializer == 'pickle':
-with open(filepath, 'wb') as f:
+if serializer == "pickle":
+with open(filepath, "wb") as f:
pickle.dump(obj, f)
-elif serializer == 'joblib':
+elif serializer == "joblib":
joblib.dump(obj, filepath)
-elif serializer == 'cloudpickle':
-with open(filepath, 'wb') as f:
+elif serializer == "cloudpickle":
+with open(filepath, "wb") as f:
cloudpickle.dump(obj, f)
else:
-raise ValueError(f'Unrecognized serializer type: {serializer}')
+raise ValueError(f"Unrecognized serializer type: {serializer}")


def unpickle_obj(filepath, serializer):
-if serializer == 'pickle':
-with open(filepath, 'rb') as f:
+if serializer == "pickle":
+with open(filepath, "rb") as f:
return pickle.load(f)
-elif serializer == 'joblib':
+elif serializer == "joblib":
return joblib.load(filepath)
-elif serializer == 'cloudpickle':
-with open(filepath, 'rb') as f:
+elif serializer == "cloudpickle":
+with open(filepath, "rb") as f:
return cloudpickle.load(f)
else:
-raise ValueError(f'Unrecognized serializer type: {serializer}')
+raise ValueError(f"Unrecognized serializer type: {serializer}")


def pickle_and_unpickle_object(obj, serializer):
with lgb.basic._TempFile() as tmp_file:
-pickle_obj(
-obj=obj,
-filepath=tmp_file.name,
-serializer=serializer
-)
-obj_from_disk = unpickle_obj(
-filepath=tmp_file.name,
-serializer=serializer
-)
+pickle_obj(obj=obj, filepath=tmp_file.name, serializer=serializer)
+obj_from_disk = unpickle_obj(filepath=tmp_file.name, serializer=serializer)
return obj_from_disk # noqa: RET504


# doing this here, at import time, to ensure it only runs once_per import
# instead of once per assertion
-_numpy_testing_supports_strict_kwarg = (
-"strict" in getfullargspec(np.testing.assert_array_equal).kwonlyargs
-)
+_numpy_testing_supports_strict_kwarg = "strict" in getfullargspec(np.testing.assert_array_equal).kwonlyargs


def np_assert_array_equal(*args, **kwargs):