2017-01-04 10:19:12 +03:00
|
|
|
# coding: utf-8
|
2016-11-08 16:39:19 +03:00
|
|
|
import ctypes
|
2021-07-04 07:31:41 +03:00
|
|
|
from pathlib import Path
|
2018-06-03 12:46:59 +03:00
|
|
|
from platform import system
|
|
|
|
|
2016-11-08 16:39:19 +03:00
|
|
|
import numpy as np
|
|
|
|
from scipy import sparse
|
|
|
|
|
2023-03-07 21:46:56 +03:00
|
|
|
# Prefer the shared library already loaded by the lightgbm Python-package; when the
# package is not installed, fall back to a lib_lightgbm built at the repo root.
try:
    from lightgbm.basic import _LIB as LIB
except ModuleNotFoundError:
    print("Could not import lightgbm Python-package, looking for lib_lightgbm at the repo root")
    if system() in ("Windows", "Microsoft"):
        lib_file = Path(__file__).absolute().parents[2] / "Release" / "lib_lightgbm.dll"
    else:
        lib_file = Path(__file__).absolute().parents[2] / "lib_lightgbm.so"
    # ctypes accepts path-like library names only from Python 3.12 onwards,
    # so hand it a plain string to stay portable across interpreter versions.
    LIB = ctypes.cdll.LoadLibrary(str(lib_file))

# ctypes assumes a C int return type by default; the error message is a char pointer.
LIB.LGBM_GetLastError.restype = ctypes.c_char_p
|
|
|
|
|
2016-11-09 11:06:09 +03:00
|
|
|
# Integer type codes passed to the C API next to raw data pointers, so the
# library knows how to interpret the buffer it receives.
(dtype_float32, dtype_float64, dtype_int32, dtype_int64) = (0, 1, 2, 3)
|
|
|
|
|
|
|
|
|
2016-11-08 16:39:19 +03:00
|
|
|
def c_str(string):
    """Wrap a Python object as a C string.

    The argument is converted with ``str()`` and encoded as UTF-8, so
    path-like objects and numbers are accepted as well as strings.

    Returns
    -------
    ctypes.c_char_p
        Pointer object holding the encoded bytes.
    """
    utf8_bytes = str(string).encode("utf-8")
    return ctypes.c_char_p(utf8_bytes)
|
2016-11-08 16:39:19 +03:00
|
|
|
|
2017-01-04 10:19:12 +03:00
|
|
|
|
2018-09-10 06:27:44 +03:00
|
|
|
def load_from_file(filename, reference):
    """Create a Dataset from a data file through the C API.

    Parameters
    ----------
    filename : str or pathlib.Path
        File handed to ``LGBM_DatasetCreateFromFile``.
    reference : ctypes.c_void_p or None
        Handle of an existing Dataset passed as the reference argument, or ``None``.

    Returns
    -------
    ctypes.c_void_p
        Handle of the new Dataset; the caller releases it with ``free_dataset``.

    Raises
    ------
    AssertionError
        If a C API call returns a non-zero code; the message is the library's last error.
    """
    handle = ctypes.c_void_p()
    ret = LIB.LGBM_DatasetCreateFromFile(c_str(str(filename)), c_str("max_bin=15"), reference, ctypes.byref(handle))
    # fail right here instead of continuing with a handle that was never filled in
    assert ret == 0, LIB.LGBM_GetLastError()
    num_data = ctypes.c_int(0)
    ret = LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
    assert ret == 0, LIB.LGBM_GetLastError()
    num_feature = ctypes.c_int(0)
    ret = LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
    assert ret == 0, LIB.LGBM_GetLastError()
    print(f"#data: {num_data.value} #feature: {num_feature.value}")
    return handle
|
|
|
|
|
2017-01-04 10:19:12 +03:00
|
|
|
|
2018-09-10 06:27:44 +03:00
|
|
|
def save_to_binary(handle, filename):
    """Save the Dataset behind ``handle`` to ``filename`` via ``LGBM_DatasetSaveBinary``.

    Parameters
    ----------
    handle : ctypes.c_void_p
        Dataset handle.
    filename : str or pathlib.Path
        Destination file; converted to a C string with ``c_str``.

    Raises
    ------
    AssertionError
        If the C API call returns a non-zero code.
    """
    ret = LIB.LGBM_DatasetSaveBinary(handle, c_str(filename))
    assert ret == 0, LIB.LGBM_GetLastError()
|
|
|
|
|
|
|
|
|
2018-09-10 06:27:44 +03:00
|
|
|
def load_from_csr(filename, reference):
    """Create a labelled Dataset from a text file by way of a CSR matrix.

    The file is read with ``np.loadtxt``; column 0 becomes the label and the
    remaining columns are converted to ``scipy.sparse.csr_matrix``.

    Parameters
    ----------
    filename : str or pathlib.Path
        Text file readable by ``np.loadtxt``.
    reference : ctypes.c_void_p or None
        Handle of an existing Dataset passed as the reference argument, or ``None``.

    Returns
    -------
    ctypes.c_void_p
        Handle of the new Dataset; the caller releases it with ``free_dataset``.

    Raises
    ------
    AssertionError
        If a C API call returns a non-zero code; the message is the library's last error.
    """
    data = np.loadtxt(str(filename), dtype=np.float64)
    csr = sparse.csr_matrix(data[:, 1:])
    label = data[:, 0].astype(np.float32)
    # The type codes passed below declare int32 indices and float64 values, so coerce
    # the buffers explicitly; this is a no-op when the arrays already have those dtypes.
    indptr = np.asarray(csr.indptr, dtype=np.int32)
    indices = np.asarray(csr.indices, dtype=np.int32)
    values = np.asarray(csr.data, dtype=np.float64)
    handle = ctypes.c_void_p()

    ret = LIB.LGBM_DatasetCreateFromCSR(
        indptr.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
        ctypes.c_int(dtype_int32),
        indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
        values.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
        ctypes.c_int(dtype_float64),
        ctypes.c_int64(len(indptr)),
        ctypes.c_int64(len(values)),
        ctypes.c_int64(csr.shape[1]),  # number of columns
        c_str("max_bin=15"),
        reference,
        ctypes.byref(handle),
    )
    assert ret == 0, LIB.LGBM_GetLastError()
    num_data = ctypes.c_int(0)
    ret = LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
    assert ret == 0, LIB.LGBM_GetLastError()
    num_feature = ctypes.c_int(0)
    ret = LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
    assert ret == 0, LIB.LGBM_GetLastError()
    ret = LIB.LGBM_DatasetSetField(
        handle,
        c_str("label"),
        label.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        ctypes.c_int(len(label)),
        ctypes.c_int(dtype_float32),
    )
    assert ret == 0, LIB.LGBM_GetLastError()
    print(f"#data: {num_data.value} #feature: {num_feature.value}")
    return handle
|
|
|
|
|
2017-01-04 10:19:12 +03:00
|
|
|
|
2018-09-10 06:27:44 +03:00
|
|
|
def load_from_csc(filename, reference):
    """Create a labelled Dataset from a text file by way of a CSC matrix.

    The file is read with ``np.loadtxt``; column 0 becomes the label and the
    remaining columns are converted to ``scipy.sparse.csc_matrix``.

    Parameters
    ----------
    filename : str or pathlib.Path
        Text file readable by ``np.loadtxt``.
    reference : ctypes.c_void_p or None
        Handle of an existing Dataset passed as the reference argument, or ``None``.

    Returns
    -------
    ctypes.c_void_p
        Handle of the new Dataset; the caller releases it with ``free_dataset``.

    Raises
    ------
    AssertionError
        If a C API call returns a non-zero code; the message is the library's last error.
    """
    data = np.loadtxt(str(filename), dtype=np.float64)
    csc = sparse.csc_matrix(data[:, 1:])
    label = data[:, 0].astype(np.float32)
    # The type codes passed below declare int32 indices and float64 values, so coerce
    # the buffers explicitly; this is a no-op when the arrays already have those dtypes.
    indptr = np.asarray(csc.indptr, dtype=np.int32)
    indices = np.asarray(csc.indices, dtype=np.int32)
    values = np.asarray(csc.data, dtype=np.float64)
    handle = ctypes.c_void_p()

    ret = LIB.LGBM_DatasetCreateFromCSC(
        indptr.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
        ctypes.c_int(dtype_int32),
        indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
        values.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
        ctypes.c_int(dtype_float64),
        ctypes.c_int64(len(indptr)),
        ctypes.c_int64(len(values)),
        ctypes.c_int64(csc.shape[0]),  # number of rows
        c_str("max_bin=15"),
        reference,
        ctypes.byref(handle),
    )
    assert ret == 0, LIB.LGBM_GetLastError()
    num_data = ctypes.c_int(0)
    ret = LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
    assert ret == 0, LIB.LGBM_GetLastError()
    num_feature = ctypes.c_int(0)
    ret = LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
    assert ret == 0, LIB.LGBM_GetLastError()
    ret = LIB.LGBM_DatasetSetField(
        handle,
        c_str("label"),
        label.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        ctypes.c_int(len(label)),
        ctypes.c_int(dtype_float32),
    )
    assert ret == 0, LIB.LGBM_GetLastError()
    print(f"#data: {num_data.value} #feature: {num_feature.value}")
    return handle
|
|
|
|
|
2017-01-04 10:19:12 +03:00
|
|
|
|
2018-09-10 06:27:44 +03:00
|
|
|
def load_from_mat(filename, reference):
    """Create a labelled Dataset from a text file by way of a dense matrix.

    The file is read with ``np.loadtxt``; column 0 becomes the label and the
    remaining columns are flattened into a 1-D float64 buffer for the C API.

    Parameters
    ----------
    filename : str or pathlib.Path
        Text file readable by ``np.loadtxt``.
    reference : ctypes.c_void_p or None
        Handle of an existing Dataset passed as the reference argument, or ``None``.

    Returns
    -------
    ctypes.c_void_p
        Handle of the new Dataset; the caller releases it with ``free_dataset``.

    Raises
    ------
    AssertionError
        If a C API call returns a non-zero code; the message is the library's last error.
    """
    mat = np.loadtxt(str(filename), dtype=np.float64)
    label = mat[:, 0].astype(np.float32)
    mat = mat[:, 1:]
    # flatten the feature columns into one 1-D float64 array the C API can read
    data = np.asarray(mat.reshape(mat.size), dtype=np.float64)
    handle = ctypes.c_void_p()

    ret = LIB.LGBM_DatasetCreateFromMat(
        data.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
        ctypes.c_int(dtype_float64),
        ctypes.c_int32(mat.shape[0]),
        ctypes.c_int32(mat.shape[1]),
        ctypes.c_int(1),  # NOTE(review): presumably the is_row_major flag - confirm against c_api.h
        c_str("max_bin=15"),
        reference,
        ctypes.byref(handle),
    )
    assert ret == 0, LIB.LGBM_GetLastError()
    num_data = ctypes.c_int(0)
    ret = LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
    assert ret == 0, LIB.LGBM_GetLastError()
    num_feature = ctypes.c_int(0)
    ret = LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
    assert ret == 0, LIB.LGBM_GetLastError()
    ret = LIB.LGBM_DatasetSetField(
        handle,
        c_str("label"),
        label.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        ctypes.c_int(len(label)),
        ctypes.c_int(dtype_float32),
    )
    assert ret == 0, LIB.LGBM_GetLastError()
    print(f"#data: {num_data.value} #feature: {num_feature.value}")
    return handle
|
2017-01-04 10:19:12 +03:00
|
|
|
|
|
|
|
|
2018-09-10 06:27:44 +03:00
|
|
|
def free_dataset(handle):
    """Release the Dataset behind ``handle`` via ``LGBM_DatasetFree``.

    Raises
    ------
    AssertionError
        If the C API call returns a non-zero code.
    """
    ret = LIB.LGBM_DatasetFree(handle)
    assert ret == 0, LIB.LGBM_GetLastError()
|
|
|
|
|
2017-01-04 10:19:12 +03:00
|
|
|
|
2024-08-15 07:56:35 +03:00
|
|
|
def test_dataset(tmp_path):
    """Build Datasets with every loader, then round-trip the training set through a binary file."""
    example_dir = Path(__file__).absolute().parents[2] / "examples" / "binary_classification"
    train = load_from_file(example_dir / "binary.train", None)

    # load the test file as dense, CSR and CSC in turn, each time with train as
    # the reference, and release every Dataset before creating the next one
    for loader in (load_from_mat, load_from_csr, load_from_csc):
        test = loader(example_dir / "binary.test", train)
        free_dataset(test)

    # write train to disk in binary form, drop it, and load it back from that file
    train_binary = str(tmp_path / "train.binary.bin")
    save_to_binary(train, train_binary)
    free_dataset(train)
    reloaded = load_from_file(train_binary, None)
    free_dataset(reloaded)
|
2017-01-04 10:19:12 +03:00
|
|
|
|
|
|
|
|
2024-08-06 04:21:04 +03:00
|
|
|
def test_booster(tmp_path):
    """Train a Booster, save it, reload it from the model file and predict with it.

    Every C API call is checked for a zero return code so that a failing call
    stops the test at the point of failure instead of passing silently.
    """
    binary_example_dir = Path(__file__).absolute().parents[2] / "examples" / "binary_classification"
    train = load_from_mat(binary_example_dir / "binary.train", None)
    test = load_from_mat(binary_example_dir / "binary.test", train)
    booster = ctypes.c_void_p()
    model_path = tmp_path / "model.txt"
    ret = LIB.LGBM_BoosterCreate(train, c_str("app=binary metric=auc num_leaves=31 verbose=0"), ctypes.byref(booster))
    assert ret == 0, LIB.LGBM_GetLastError()
    ret = LIB.LGBM_BoosterAddValidData(booster, test)
    assert ret == 0, LIB.LGBM_GetLastError()
    is_finished = ctypes.c_int(0)
    for i in range(1, 51):
        ret = LIB.LGBM_BoosterUpdateOneIter(booster, ctypes.byref(is_finished))
        assert ret == 0, LIB.LGBM_GetLastError()
        result = np.array([0.0], dtype=np.float64)
        out_len = ctypes.c_int(0)
        # NOTE(review): data index 0 is passed here while the message below says "test";
        # confirm against c_api.h which dataset index 0 refers to.
        ret = LIB.LGBM_BoosterGetEval(
            booster, ctypes.c_int(0), ctypes.byref(out_len), result.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
        )
        assert ret == 0, LIB.LGBM_GetLastError()
        if i % 10 == 0:
            print(f"{i} iteration test AUC {result[0]:.6f}")
    ret = LIB.LGBM_BoosterSaveModel(
        booster, ctypes.c_int(0), ctypes.c_int(-1), ctypes.c_int(0), c_str(str(model_path))
    )
    assert ret == 0, LIB.LGBM_GetLastError()
    ret = LIB.LGBM_BoosterFree(booster)
    assert ret == 0, LIB.LGBM_GetLastError()
    free_dataset(train)
    free_dataset(test)

    # reload the saved model into a fresh Booster and predict with it
    booster2 = ctypes.c_void_p()
    num_total_model = ctypes.c_int(0)
    ret = LIB.LGBM_BoosterCreateFromModelfile(
        c_str(str(model_path)), ctypes.byref(num_total_model), ctypes.byref(booster2)
    )
    assert ret == 0, LIB.LGBM_GetLastError()
    data = np.loadtxt(str(binary_example_dir / "binary.test"), dtype=np.float64)
    mat = data[:, 1:]
    preds = np.empty(mat.shape[0], dtype=np.float64)
    num_preds = ctypes.c_int64(0)
    data = np.asarray(mat.reshape(mat.size), dtype=np.float64)
    ret = LIB.LGBM_BoosterPredictForMat(
        booster2,
        data.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
        ctypes.c_int(dtype_float64),
        ctypes.c_int32(mat.shape[0]),
        ctypes.c_int32(mat.shape[1]),
        ctypes.c_int(1),
        ctypes.c_int(1),
        ctypes.c_int(0),
        ctypes.c_int(25),
        c_str(""),
        ctypes.byref(num_preds),
        preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
    )
    assert ret == 0, LIB.LGBM_GetLastError()
    # the output buffer holds one slot per row, so the reported count must fill it exactly
    assert num_preds.value == mat.shape[0]
    ret = LIB.LGBM_BoosterPredictForFile(
        booster2,
        c_str(str(binary_example_dir / "binary.test")),
        ctypes.c_int(0),
        ctypes.c_int(0),
        ctypes.c_int(0),
        ctypes.c_int(25),
        c_str(""),
        c_str(tmp_path / "preds.txt"),
    )
    assert ret == 0, LIB.LGBM_GetLastError()
    ret = LIB.LGBM_BoosterPredictForFile(
        booster2,
        c_str(str(binary_example_dir / "binary.test")),
        ctypes.c_int(0),
        ctypes.c_int(0),
        ctypes.c_int(10),
        ctypes.c_int(25),
        c_str(""),
        c_str(tmp_path / "preds.txt"),
    )
    assert ret == 0, LIB.LGBM_GetLastError()
    ret = LIB.LGBM_BoosterFree(booster2)
    assert ret == 0, LIB.LGBM_GetLastError()
|
2023-12-08 02:03:16 +03:00
|
|
|
|
|
|
|
|
|
|
|
def test_max_thread_control():
    """Check that the max-threads setting can be read, updated and reset through the C API."""
    observed = ctypes.c_int(0)

    def query():
        """Refresh ``observed`` from the library and return the call's status code."""
        return LIB.LGBM_GetMaxThreads(ctypes.byref(observed))

    # at initialization, should be -1
    status = query()
    assert status == 0
    assert observed.value == -1

    # updating that value through the C API should work
    status = LIB.LGBM_SetMaxThreads(ctypes.c_int(6))
    assert status == 0

    status = query()
    assert status == 0
    assert observed.value == 6

    # resetting to any negative number should set it to -1
    status = LIB.LGBM_SetMaxThreads(ctypes.c_int(-123))
    assert status == 0
    status = query()
    assert status == 0
    assert observed.value == -1
|