added categorical, date/time and continuous feature support; added feature importance plotting

* try to fix c++ build

* add MANIFEST.in to fix missing header files

* making sure it really gets included

* added categorical, date/time and continuous feature support; added feature importance plotting

* make sure the model is serializable
This commit is contained in:
Markus Cozowicz 2021-11-24 11:28:26 +01:00 committed by GitHub
Parent: cd5f46fb86
Commit: a5a992afe1
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 211 additions and 13 deletions

.github/workflows/publish-to-test-pypi.yml vendored

@@ -41,6 +41,9 @@ jobs:
- name: Build a source tarball
run: python -m build --sdist --outdir dist/ .
- name: Install from source tarball
run: python -m pip install dist/*.tar.gz --user
- name: Store the binary wheel
uses: actions/upload-artifact@v2
with:

MANIFEST.in Normal file

@@ -0,0 +1,4 @@
include README.md
include setup.py
recursive-include cbm *.py
recursive-include src *.c *.h
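
The new MANIFEST.in is what actually pulls the C++ sources and headers under src/ into the source tarball, which is why the workflow's new "Install from source tarball" step stops failing on missing header files. A quick local check could look like the sketch below (the dist/ path assumes the sdist was built with python -m build --sdist --outdir dist/ . as in the workflow):

import glob
import tarfile

# list the C/C++ sources and headers shipped inside the built sdist
sdist_path = glob.glob("dist/cyclicbm-*.tar.gz")[0]
with tarfile.open(sdist_path) as sdist:
    shipped = [n for n in sdist.getnames() if n.endswith((".h", ".c", ".cpp"))]
print(shipped)  # should include src/cbm.h and src/pycbm.h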


@@ -3,11 +3,14 @@
import cbm_cpp
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from typing import List, Union
from pandas.api.types import CategoricalDtype
class CBM(BaseEstimator):
@@ -19,7 +22,21 @@ class CBM(BaseEstimator):
min_iterations_early_stopping:int = 20,
epsilon_early_stopping:float = 1e-3,
single_update_per_iteration:bool = True,
date_features: Union[str, List[str]] = 'day,month',
binning: Union[int, lambda x: int] = 10,
) -> None:
"""Initialize the CBM model.
Args:
learning_rate_step_size (float, optional): Increment added to the learning rate each boosting iteration (capped at 1). Defaults to 1/100.
max_iterations (int, optional): Maximum number of boosting iterations. Defaults to 100.
min_iterations_early_stopping (int, optional): Minimum number of iterations before early stopping may trigger. Defaults to 20.
epsilon_early_stopping (float, optional): Minimum RMSE improvement required to keep iterating. Defaults to 1e-3.
single_update_per_iteration (bool, optional): Whether to apply only a single update per iteration. Defaults to True.
date_features (Union[str, List[str]], optional): Date components to expand datetime columns into ('day', 'month'). Defaults to ['day', 'month'].
binning (Union[int, Callable], optional): The number of bins to create for continuous features, or a callable computing the bin count from the column. Defaults to 10.
"""
self.learning_rate_step_size = learning_rate_step_size
self.max_iterations = max_iterations
@@ -27,13 +44,90 @@ class CBM(BaseEstimator):
self.epsilon_early_stopping = epsilon_early_stopping
self.single_update_per_iteration = single_update_per_iteration
# let's make sure it's serializable: store date_features as a comma-separated string
if isinstance(date_features, list):
date_features = ",".join(date_features)
self.date_features = date_features
self.binning = binning
def get_date_features(self) -> List[str]:
return self.date_features.split(",")
def fit(self,
X: np.ndarray,
X: Union[np.ndarray, pd.DataFrame],
y: np.ndarray
) -> "CBM":
# keep feature names around
if isinstance(X, pd.DataFrame):
self._feature_names = []
self._feature_categories = []
self._feature_bins = []
X_numeric = []
for col in X.columns:
col_dtype = X[col].dtype
if pd.api.types.is_datetime64_any_dtype(col_dtype):
for expansion in self.get_date_features():
import calendar
if expansion == 'day':
self._feature_names.append(f'{col}_day')
self._feature_categories.append(calendar.day_abbr)
self._feature_bins.append(None)
X_numeric.append(X[col].dt.dayofweek.values)
elif expansion == 'month':
self._feature_names.append(f'{col}_month')
self._feature_categories.append(calendar.month_abbr)
self._feature_bins.append(None)
X_numeric.append(X[col].dt.month.values)
elif pd.api.types.is_float_dtype(col_dtype):
# deal with continuous features
bin_num = self.binning if isinstance(self.binning, int) else self.binning(X[col])
X_binned, bins = pd.qcut(X[col].fillna(0), bin_num, retbins=True)
self._feature_names.append(col)
self._feature_categories.append(X_binned.cat.categories.astype(str).tolist())
self._feature_bins.append(bins)
X_numeric.append(pd.cut(X[col].fillna(0), bins, include_lowest=True).cat.codes)
elif not pd.api.types.is_integer_dtype(col_dtype):
self._feature_names.append(col)
# convert to categorical
X_cat = (X[col]
.fillna('CBM_UnknownCategory')
.astype('category'))
# keep track of categories
self._feature_categories.append(X_cat.cat.categories.tolist())
self._feature_bins.append(None)
# convert to 0-based index
X_numeric.append(X_cat.cat.codes)
else:
self._feature_names.append(col)
self._feature_categories.append(None)
self._feature_bins.append(None)
X_numeric.append(X[col])
X = np.column_stack(X_numeric)
else:
self._feature_names = None
X, y = check_X_y(X, y, y_numeric=True)
# pre-processing
y_mean = np.average(y)
# determine max bin per categorical
@@ -60,6 +154,43 @@ class CBM(BaseEstimator):
return self
def predict(self, X: np.ndarray, explain: bool = False):
if isinstance(X, pd.DataFrame):
X_numeric = []
offset = 0 # correct for date expansion
for i, col in enumerate(X.columns):
col_dtype = X[col].dtype
if pd.api.types.is_datetime64_any_dtype(col_dtype):
for expansion in self.get_date_features():
if expansion == 'day':
X_numeric.append(X[col].dt.dayofweek.values)
offset += 1
elif expansion == 'month':
X_numeric.append(X[col].dt.month.values)
offset += 1
offset -= 1
elif pd.api.types.is_float_dtype(col_dtype):
# re-use binning from training
X_numeric.append(pd.cut(X[col].fillna(0), self._feature_bins[i + offset], include_lowest=True).cat.codes)
elif not pd.api.types.is_integer_dtype(col_dtype):
# convert to categorical
X_cat = (X[col]
.fillna('CBM_UnknownCategory')
# re-use categories from training
.astype(CategoricalDtype(categories=self._feature_categories[i + offset], ordered=True)))
# convert to 0-based index
X_numeric.append(X_cat.cat.codes)
else:
X_numeric.append(X[col])
X = np.column_stack(X_numeric)
X = check_array(X)
check_is_fitted(self, "is_fitted_")
@@ -73,6 +204,41 @@ class CBM(BaseEstimator):
self.is_fitted_ = True
def plot_importance(self, feature_names: list = None, **kwargs):
check_is_fitted(self, "is_fitted_")
if feature_names is None:
feature_names = self._feature_names
import matplotlib.pyplot as plt
n_features = len(self.weights)
n_cols = int(np.ceil( np.sqrt(n_features)))
n_rows = int(np.floor(np.sqrt(n_features)))
if n_cols * n_rows < n_features:
n_rows += 1
fig, ax = plt.subplots(n_rows, n_cols, sharex=True, **kwargs)
fig.suptitle(f'Response mean: {self.y_mean:0.4f} | Iterations {self.iterations}')
for f in range(n_features):
w = np.array(self.weights[f])
color = np.where(w < 1, 'xkcd:tomato', 'xkcd:green')
ax_sub = ax[f // n_cols, f % n_cols]
ax_sub.barh(range(len(w)), w - 1, color=color)
ax_sub.set_title(feature_names[f] if feature_names is not None else f'Feature {f}')
if self._feature_categories[f] is not None:
ax_sub.set_yticks(range(len(self._feature_categories[f])))
ax_sub.set_yticklabels(self._feature_categories[f])
@property
def weights(self):
return self._cpp.weights
@@ -80,3 +246,7 @@ class CBM(BaseEstimator):
@property
def y_mean(self):
return self._cpp.y_mean
@property
def iterations(self):
return self._cpp.iterations
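
Taken together, the Python-side changes let fit() and predict() consume a pandas DataFrame directly: datetime columns are expanded according to date_features, float columns are quantile-binned (binning may be an int or a callable returning a bin count), remaining non-integer columns are mapped to 0-based category codes, and the categories and bin edges learned in fit() are reused in predict(). A minimal usage sketch follows; the column names and data are made up, and it assumes CBM is importable from the cbm package and that the target is count-like (the C++ core takes an unsigned integer y):

import numpy as np
import pandas as pd
from cbm import CBM  # assumes the package re-exports the estimator

rng = np.random.default_rng(0)
X = pd.DataFrame({
    "date": pd.date_range("2021-01-01", periods=200, freq="D"),  # expanded into date_day / date_month
    "price": rng.uniform(1.0, 10.0, 200),                        # float -> quantile-binned
    "store": rng.choice(["a", "b", "c"], 200),                   # object -> categorical codes
})
y = rng.poisson(3.0, 200)  # count-like target

model = CBM(date_features=["day", "month"],
            binning=10)  # a callable such as lambda col: col.nunique() is also accepted
model.fit(X, y)
y_hat = model.predict(X)
print(model.iterations, model.y_mean)
model.plot_importance(figsize=(12, 8))  # requires matplotlib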


@@ -22,7 +22,7 @@ def get_extra_compile_args():
cflags = ""
return cflags.split() \
+ ["-std=c++11", "-Wall", "-Wextra", "-march=native", "-msse2", "-ffast-math", "-mfpmath=sse"]
+ ["-std=c++11", "-Wall", "-Wextra", "-march=native", "-msse2", "-ffast-math", "-mfpmath=sse"] #, "-fopenmp"]
def get_libraries():
if platform.system() == "Windows":
@@ -36,7 +36,7 @@ long_description = (this_directory / "README.md").read_text()
setup(
name="cyclicbm",
version="0.0.6",
version="0.0.7",
description="Cyclic Boosting Machines",
long_description=long_description,
long_description_content_type='text/markdown',
@@ -55,19 +55,23 @@ setup(
"Topic :: Scientific/Engineering :: Mathematics",
],
setup_requires=["pytest-runner"],
install_requires=["pybind11>=2.2", "numpy", "scikit-learn"],
install_requires=["pybind11>=2.2", "numpy", "scikit-learn", "pandas"],
tests_require=["pytest", "lightgbm"], #, "interpret"],
extras_require={
'interactive': ['matplotlib>=2.2.0'],
},
packages=["cbm"],
package_data={ "cbm": ["src/pycbm.h", "src/cbm.h"] },
# package_data={ "cbm": ["src/pycbm.h", "src/cbm.h"] },
ext_modules=[
Extension(
"cbm_cpp",
["src/pycbm.cpp", "src/cbm.cpp" ],
include_dirs=[get_pybind_include(), get_pybind_include(user=True)],
include_dirs=[get_pybind_include(), get_pybind_include(user=True), "src"],
extra_compile_args=get_extra_compile_args(),
libraries=get_libraries(),
language="c++11",
)
],
headers=["src/pycbm.h", "src/cbm.h"],
zip_safe=False,
)


@@ -9,14 +9,13 @@
namespace cbm
{
CBM::CBM()
CBM::CBM() : _iterations(0)
{
}
CBM::CBM(const std::vector<std::vector<double>> &f, double y_mean)
CBM::CBM(const std::vector<std::vector<double>> &f, double y_mean) :
_f(f), _y_mean(y_mean), _iterations(0)
{
_f = f;
_y_mean = y_mean;
}
void CBM::update_y_hat_sum(
@@ -64,6 +63,11 @@ namespace cbm
_y_mean = y_mean;
}
size_t CBM::get_iterations() const
{
return _iterations;
}
void CBM::fit(
const uint32_t *y,
const char *x_data,
@@ -119,7 +123,7 @@ namespace cbm
double learning_rate = learning_rate_step_size;
double rmse0 = std::numeric_limits<double>::infinity();
for (size_t t = 0; t < max_iterations; t++, learning_rate += learning_rate_step_size)
for (_iterations = 0; _iterations < max_iterations; _iterations++, learning_rate += learning_rate_step_size)
{
// cap at 1
if (learning_rate > 1)
@@ -128,7 +132,7 @@ namespace cbm
update_y_hat_sum(y_hat_sum, x, n_examples, n_features);
// compute g
// TODO: parallelize
// #pragma omp for // didn't observe improvement
for (size_t j = 0; j < n_features; j++)
{
for (size_t k = 0; k <= x_max[j]; k++)
@@ -171,7 +175,7 @@ namespace cbm
// check for early stopping
// TODO: expose minimum number of rounds
if (t > min_iterations_early_stopping &&
if (_iterations > min_iterations_early_stopping &&
(rmse > rmse0 || (rmse0 - rmse) < epsilon_early_stopping))
{
// TODO: record diagnostics?
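
Replacing the local loop counter t with the _iterations member means the iteration count at which training stopped (whether at max_iterations or via the early-stopping break) survives fitting and is exposed through get_iterations() and the new Python iterations property. A small illustrative check, assuming a fitted model as in the earlier sketch:

# hypothetical diagnostic; "model" is a fitted CBM instance
if model.iterations < model.max_iterations:
    print(f"stopped early after {model.iterations} iterations "
          f"(epsilon_early_stopping={model.epsilon_early_stopping})")
else:
    print("used the full iteration budget")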


@@ -15,6 +15,8 @@ namespace cbm
std::vector<std::vector<double>> _f;
double _y_mean;
size_t _iterations;
void update_y_hat_sum(
std::vector<std::vector<uint64_t>> &y_hat_sum,
std::vector<std::vector<uint8_t>> &x,
@@ -81,5 +83,7 @@ namespace cbm
float get_y_mean() const;
void set_y_mean(float mean);
size_t get_iterations() const;
};
}


@@ -145,6 +145,11 @@ namespace cbm
{
_cbm.set_y_mean(y_mean);
}
size_t PyCBM::get_iterations() const
{
return _cbm.get_iterations();
}
};
PYBIND11_MODULE(cbm_cpp, m)
@@ -157,8 +162,10 @@ PYBIND11_MODULE(cbm_cpp, m)
.def("predict", &cbm::PyCBM::predict)
.def_property("y_mean", &cbm::PyCBM::get_y_mean, &cbm::PyCBM::set_y_mean)
.def_property("weights", &cbm::PyCBM::get_weights, &cbm::PyCBM::set_weights)
.def_property_readonly("iterations", &cbm::PyCBM::get_iterations)
.def(py::pickle(
[](const cbm::PyCBM &p) { // __getstate__
/* TODO: this does not include the feature pre-processing */
/* Return a tuple that fully encodes the state of the object */
return py::make_tuple(p.get_weights(), p.get_y_mean());
},
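
With the py::pickle hooks in place, the C++ core round-trips its weights and y_mean through pickling; as the TODO above notes, the pandas feature pre-processing state is not part of this tuple and lives in the Python wrapper's own attributes. A rough round-trip sketch, assuming a fitted model as above (note that a lambda passed as binning would itself not be picklable, so use an int or a named function when serializing):

import pickle

# "model" is a fitted CBM instance from the earlier sketch
blob = pickle.dumps(model)
restored = pickle.loads(blob)
assert restored.y_mean == model.y_mean  # weights and y_mean are restored via __setstate__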


@@ -43,5 +43,7 @@ namespace cbm
float get_y_mean() const;
void set_y_mean(float mean);
size_t get_iterations() const;
};
}