Mirror of https://github.com/microsoft/CBM.git

Added categorical, date/time and continuous feature support; added feature importance plotting.

* try to fix c++ build
* add MANIFEST.in to fix missing header files
* making sure it really gets included
* added categorical, date/time and continuous feature support; added feature importance plotting
* make sure the model is serializable
This commit is contained in:
Parent: cd5f46fb86
Commit: a5a992afe1
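
For orientation, here is a minimal usage sketch of what this commit enables, assuming CBM is importable from the cbm package (as defined in cbm/CBM.py below); the column names and data are illustrative, not taken from the repository:

import numpy as np
import pandas as pd
from cbm import CBM

rng = np.random.default_rng(0)
X = pd.DataFrame({
    "order_date": pd.date_range("2021-01-01", periods=200, freq="D"),  # datetime -> day/month expansion
    "store": rng.choice(["A", "B", "C"], size=200),                    # object -> categorical codes
    "price": rng.gamma(2.0, 10.0, size=200),                           # float -> quantile-binned
    "units": rng.integers(0, 5, size=200),                             # int -> passed through as-is
})
y = rng.poisson(3, size=200)

model = CBM(date_features=["day", "month"], binning=10)
model.fit(X, y)
y_hat = model.predict(X)

# Feature importance plot; requires matplotlib (the new 'interactive' extra).
model.plot_importance(figsize=(12, 8))

Datetime columns are expanded into day/month features, object columns become categorical codes, float columns are quantile-binned, and integer columns pass through unchanged, matching the fit() branches in the cbm/CBM.py diff below.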
@@ -41,6 +41,9 @@ jobs:
     - name: Build a source tarball
       run: python -m build --sdist --outdir dist/ .

+    - name: Install from source tarball
+      run: python -m pip install dist/*.tar.gz --user
+
     - name: Store the binary wheel
       uses: actions/upload-artifact@v2
       with:

MANIFEST.in (new file)

@@ -0,0 +1,4 @@
+include README.md
+include setup.py
+recursive-include cbm *.py
+recursive-include src *.c *.h

cbm/CBM.py (172 lines changed)
@@ -3,11 +3,14 @@

 import cbm_cpp
 import numpy as np
+import pandas as pd

 from sklearn.base import BaseEstimator
 from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
+from typing import List, Union
+from pandas.api.types import CategoricalDtype


 class CBM(BaseEstimator):
@@ -19,7 +22,21 @@ class CBM(BaseEstimator):
                  min_iterations_early_stopping:int = 20,
                  epsilon_early_stopping:float = 1e-3,
                  single_update_per_iteration:bool = True,
+                 date_features: Union[str, List[str]] = 'day,month',
+                 binning: Union[int, lambda x: int] = 10,
                  ) -> None:
+        """Initialize the CBM model.
+
+        Args:
+            learning_rate_step_size (float, optional): [description]. Defaults to 1/100.
+            max_iterations (int, optional): [description]. Defaults to 100.
+            min_iterations_early_stopping (int, optional): [description]. Defaults to 20.
+            epsilon_early_stopping (float, optional): [description]. Defaults to 1e-3.
+            single_update_per_iteration (bool, optional): [description]. Defaults to True.
+            date_features (List[str], optional): [description]. Defaults to ['day', 'month'].
+            binning (Union[int, Callable], optional): The number of bins to create for continuous
+                features. Supply a lambda for flexible binning. Defaults to 10.
+        """

         self.learning_rate_step_size = learning_rate_step_size
         self.max_iterations = max_iterations
@@ -27,13 +44,90 @@ class CBM(BaseEstimator):
         self.epsilon_early_stopping = epsilon_early_stopping
         self.single_update_per_iteration = single_update_per_iteration

+        # let's make sure it's serializable
+        if isinstance(date_features, list):
+            date_features = ",".join(date_features)
+        self.date_features = date_features
+        self.binning = binning
+
+    def get_date_features(self) -> List[str]:
+        return self.date_features.split(",")
+
     def fit(self,
-            X: np.ndarray,
+            X: Union[np.ndarray, pd.DataFrame],
             y: np.ndarray
             ) -> "CBM":
+
+        # keep feature names around
+        if isinstance(X, pd.DataFrame):
+            self._feature_names = []
+            self._feature_categories = []
+            self._feature_bins = []
+
+            X_numeric = []
+
+            for col in X.columns:
+                col_dtype = X[col].dtype
+
+                if pd.api.types.is_datetime64_any_dtype(col_dtype):
+                    for expansion in self.get_date_features():
+                        import calendar
+
+                        if expansion == 'day':
+                            self._feature_names.append(f'{col}_day')
+                            self._feature_categories.append(calendar.day_abbr)
+                            self._feature_bins.append(None)
+
+                            X_numeric.append(X[col].dt.dayofweek.values)
+
+                        elif expansion == 'month':
+                            self._feature_names.append(f'{col}_month')
+                            self._feature_categories.append(calendar.month_abbr)
+                            self._feature_bins.append(None)
+
+                            X_numeric.append(X[col].dt.month.values)
+
+                elif pd.api.types.is_float_dtype(col_dtype):
+                    # deal with continuous features
+                    bin_num = self.binning if isinstance(self.binning, int) else self.binning(X[col])
+
+                    X_binned, bins = pd.qcut(X[col].fillna(0), bin_num, retbins=True)
+
+                    self._feature_names.append(col)
+                    self._feature_categories.append(X_binned.cat.categories.astype(str).tolist())
+                    self._feature_bins.append(bins)
+
+                    X_numeric.append(pd.cut(X[col].fillna(0), bins, include_lowest=True).cat.codes)
+
+                elif not pd.api.types.is_integer_dtype(col_dtype):
+                    self._feature_names.append(col)
+
+                    # convert to categorical
+                    X_cat = (X[col]
+                        .fillna('CBM_UnknownCategory')
+                        .astype('category'))
+
+                    # keep track of categories
+                    self._feature_categories.append(X_cat.cat.categories.tolist())
+                    self._feature_bins.append(None)
+
+                    # convert to 0-based index
+                    X_numeric.append(X_cat.cat.codes)
+                else:
+                    self._feature_names.append(col)
+                    self._feature_categories.append(None)
+                    self._feature_bins.append(None)
+
+                    X_numeric.append(X[col])
+
+            X = np.column_stack(X_numeric)
+        else:
+            self._feature_names = None
+
         X, y = check_X_y(X, y, y_numeric=True)

         # pre-processing
         y_mean = np.average(y)

         # determine max bin per categorical
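
A note on the continuous-feature branch above: pd.qcut learns quantile bin edges during fit (retbins=True) and pd.cut re-applies the stored edges later, so training and prediction share the same 0-based bin codes. A standalone sketch of that pattern with made-up data (not part of the commit):

import numpy as np
import pandas as pd

values = pd.Series(np.random.default_rng(1).gamma(2.0, 10.0, size=1000))

# Training time: quantile-based bins, keeping the edges for later re-use.
binned, bins = pd.qcut(values.fillna(0), 10, retbins=True)
print(binned.cat.categories)   # human-readable interval labels, stored for plotting
print(binned.cat.codes[:5])    # 0-based bin indices fed to the booster

# Prediction time: re-apply the stored edges so codes line up with training.
new_values = pd.Series([1.5, 20.0, 250.0])
codes = pd.cut(new_values.fillna(0), bins, include_lowest=True).cat.codes
print(codes)  # values outside the training range map to -1
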
@@ -60,6 +154,43 @@ class CBM(BaseEstimator):
         return self

     def predict(self, X: np.ndarray, explain: bool = False):
+        if isinstance(X, pd.DataFrame):
+            X_numeric = []
+
+            offset = 0  # correct for date expansion
+            for i, col in enumerate(X.columns):
+                col_dtype = X[col].dtype
+
+                if pd.api.types.is_datetime64_any_dtype(col_dtype):
+                    for expansion in self.get_date_features():
+                        if expansion == 'day':
+                            X_numeric.append(X[col].dt.dayofweek.values)
+                            offset += 1
+
+                        elif expansion == 'month':
+                            X_numeric.append(X[col].dt.month.values)
+                            offset += 1
+
+                    offset -= 1
+
+                elif pd.api.types.is_float_dtype(col_dtype):
+                    # re-use binning from training
+                    X_numeric.append(pd.cut(X[col].fillna(0), self._feature_bins[i + offset], include_lowest=True).cat.codes)
+
+                elif not pd.api.types.is_integer_dtype(col_dtype):
+                    # convert to categorical
+                    X_cat = (X[col]
+                        .fillna('CBM_UnknownCategory')
+                        # re-use categories from training
+                        .astype(CategoricalDtype(categories=self._feature_categories[i + offset], ordered=True)))
+
+                    # convert to 0-based index
+                    X_numeric.append(X_cat.cat.codes)
+                else:
+                    X_numeric.append(X[col])
+
+            X = np.column_stack(X_numeric)
+
         X = check_array(X)
         check_is_fitted(self, "is_fitted_")
@@ -73,6 +204,41 @@ class CBM(BaseEstimator):

         self.is_fitted_ = True

+    def plot_importance(self, feature_names: list = None, **kwargs):
+        check_is_fitted(self, "is_fitted_")
+
+        if feature_names is None:
+            feature_names = self._feature_names
+
+        import matplotlib.pyplot as plt
+
+        n_features = len(self.weights)
+
+        n_cols = int(np.ceil(np.sqrt(n_features)))
+        n_rows = int(np.floor(np.sqrt(n_features)))
+
+        if n_cols * n_rows < n_features:
+            n_rows += 1
+
+        fig, ax = plt.subplots(n_rows, n_cols, sharex=True, **kwargs)
+
+        fig.suptitle(f'Response mean: {self.y_mean:0.4f} | Iterations {self.iterations}')
+
+        for f in range(n_features):
+            w = np.array(self.weights[f])
+
+            color = np.where(w < 1, 'xkcd:tomato', 'xkcd:green')
+
+            ax_sub = ax[f // n_cols, f % n_cols]
+            ax_sub.barh(range(len(w)), w - 1, color=color)
+
+            ax_sub.set_title(feature_names[f] if feature_names is not None else f'Feature {f}')
+
+            if self._feature_categories[f] is not None:
+                ax_sub.set_yticks(range(len(self._feature_categories[f])))
+                ax_sub.set_yticklabels(self._feature_categories[f])
+
     @property
     def weights(self):
         return self._cpp.weights
@@ -80,3 +246,7 @@ class CBM(BaseEstimator):
     @property
     def y_mean(self):
         return self._cpp.y_mean
+
+    @property
+    def iterations(self):
+        return self._cpp.iterations

setup.py (14 lines changed)
@@ -22,7 +22,7 @@ def get_extra_compile_args():
     cflags = ""

     return cflags.split() \
-        + ["-std=c++11", "-Wall", "-Wextra", "-march=native", "-msse2", "-ffast-math", "-mfpmath=sse"]
+        + ["-std=c++11", "-Wall", "-Wextra", "-march=native", "-msse2", "-ffast-math", "-mfpmath=sse"] #, "-fopenmp"]

 def get_libraries():
     if platform.system() == "Windows":
@@ -36,7 +36,7 @@ long_description = (this_directory / "README.md").read_text()

 setup(
     name="cyclicbm",
-    version="0.0.6",
+    version="0.0.7",
     description="Cyclic Boosting Machines",
     long_description=long_description,
     long_description_content_type='text/markdown',
@@ -55,19 +55,23 @@ setup(
         "Topic :: Scientific/Engineering :: Mathematics",
     ],
     setup_requires=["pytest-runner"],
-    install_requires=["pybind11>=2.2", "numpy", "scikit-learn"],
+    install_requires=["pybind11>=2.2", "numpy", "scikit-learn", "pandas"],
     tests_require=["pytest", "lightgbm"], #, "interpret"],
+    extras_require={
+        'interactive': ['matplotlib>=2.2.0'],
+    },
     packages=["cbm"],
-    package_data={ "cbm": ["src/pycbm.h", "src/cbm.h"] },
+    # package_data={ "cbm": ["src/pycbm.h", "src/cbm.h"] },
     ext_modules=[
         Extension(
             "cbm_cpp",
             ["src/pycbm.cpp", "src/cbm.cpp" ],
-            include_dirs=[get_pybind_include(), get_pybind_include(user=True)],
+            include_dirs=[get_pybind_include(), get_pybind_include(user=True), "src"],
             extra_compile_args=get_extra_compile_args(),
             libraries=get_libraries(),
             language="c++11",
         )
     ],
+    headers=["src/pycbm.h", "src/cbm.h"],
     zip_safe=False,
 )

src/cbm.cpp (18 lines changed)
@@ -9,14 +9,13 @@

 namespace cbm
 {
-    CBM::CBM()
+    CBM::CBM() : _iterations(0)
     {
     }

-    CBM::CBM(const std::vector<std::vector<double>> &f, double y_mean)
+    CBM::CBM(const std::vector<std::vector<double>> &f, double y_mean) :
+        _f(f), _y_mean(y_mean), _iterations(0)
     {
-        _f = f;
-        _y_mean = y_mean;
     }

     void CBM::update_y_hat_sum(
@@ -64,6 +63,11 @@ namespace cbm
         _y_mean = y_mean;
     }

+    size_t CBM::get_iterations() const
+    {
+        return _iterations;
+    }
+
     void CBM::fit(
         const uint32_t *y,
         const char *x_data,
@@ -119,7 +123,7 @@ namespace cbm
         double learning_rate = learning_rate_step_size;
         double rmse0 = std::numeric_limits<double>::infinity();

-        for (size_t t = 0; t < max_iterations; t++, learning_rate += learning_rate_step_size)
+        for (_iterations = 0; _iterations < max_iterations; _iterations++, learning_rate += learning_rate_step_size)
         {
             // cap at 1
             if (learning_rate > 1)
@@ -128,7 +132,7 @@ namespace cbm
             update_y_hat_sum(y_hat_sum, x, n_examples, n_features);

             // compute g
-            // TODO: parallelize
+            // #pragma omp for // didn't observe improvement
             for (size_t j = 0; j < n_features; j++)
             {
                 for (size_t k = 0; k <= x_max[j]; k++)
@@ -171,7 +175,7 @@ namespace cbm

             // check for early stopping
             // TODO: expose minimum number of rounds
-            if (t > min_iterations_early_stopping &&
+            if (_iterations > min_iterations_early_stopping &&
                 (rmse > rmse0 || (rmse0 - rmse) < epsilon_early_stopping))
             {
                 // TODO: record diagnostics?

src/cbm.h

@@ -15,6 +15,8 @@ namespace cbm
         std::vector<std::vector<double>> _f;
         double _y_mean;

+        size_t _iterations;
+
         void update_y_hat_sum(
             std::vector<std::vector<uint64_t>> &y_hat_sum,
             std::vector<std::vector<uint8_t>> &x,
@@ -81,5 +83,7 @@ namespace cbm

         float get_y_mean() const;
         void set_y_mean(float mean);
+
+        size_t get_iterations() const;
     };
 }

src/pycbm.cpp

@@ -145,6 +145,11 @@ namespace cbm
     {
         _cbm.set_y_mean(y_mean);
     }
+
+    size_t PyCBM::get_iterations() const
+    {
+        return _cbm.get_iterations();
+    }
 };

 PYBIND11_MODULE(cbm_cpp, m)
@@ -157,8 +162,10 @@ PYBIND11_MODULE(cbm_cpp, m)
         .def("predict", &cbm::PyCBM::predict)
         .def_property("y_mean", &cbm::PyCBM::get_y_mean, &cbm::PyCBM::set_y_mean)
         .def_property("weights", &cbm::PyCBM::get_weights, &cbm::PyCBM::set_weights)
+        .def_property_readonly("iterations", &cbm::PyCBM::get_iterations)
         .def(py::pickle(
             [](const cbm::PyCBM &p) { // __getstate__
+                /* TODO: this does not include the feature pre-processing */
                 /* Return a tuple that fully encodes the state of the object */
                 return py::make_tuple(p.get_weights(), p.get_y_mean());
             },
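
The py::pickle binding above, together with storing date_features as a plain string, is what the commit message means by making the model serializable; note the TODO that the C++ __getstate__ only captures weights and y_mean, not the pandas pre-processing. A hedged round-trip sketch, reusing the hypothetical model and X from the usage sketch near the top of this page:

import pickle
import numpy as np

# `model` and `X` are the hypothetical fitted estimator and DataFrame from the earlier sketch.
restored = pickle.loads(pickle.dumps(model))

# The restored estimator is expected to score like the original.
print(np.allclose(model.predict(X), restored.predict(X)))
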
src/pycbm.h

@@ -43,5 +43,7 @@ namespace cbm
         float get_y_mean() const;

         void set_y_mean(float mean);
+
+        size_t get_iterations() const;
     };
 }