Mirror of https://github.com/microsoft/CBM.git
switched to qcut for discretizing; expose explanations, expose weights
Parent: d474ce15ad
Commit: cd0f164d61
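Taken together, the changes below give the Python wrapper a richer surface: typed fit hyper-parameters (including the newly exposed min_iterations_early_stopping and single_update_per_iteration), a predict(x, explain=...) that can return per-feature explanations, and a read-only weights property. A minimal usage sketch of the post-commit API; the toy arrays are illustrative and not from the repo:

import numpy as np
import cbm

# toy data: two categorical features, already encoded as uint8 bin indices
x = np.array([[0, 1], [1, 0], [2, 1], [0, 0]], dtype='uint8')
y = np.array([10, 12, 15, 9], dtype='uint32')

model = cbm.CBM()
model.fit(y, x,
          learning_rate_step_size=1/100,
          max_iterations=100,
          min_iterations_early_stopping=20,    # exposed by this commit
          epsilon_early_stopping=1e-3,
          single_update_per_iteration=True)    # exposed by this commit

y_pred = model.predict(x)                        # shape (n, 1): predictions only
y_pred_explain = model.predict(x, explain=True)  # shape (n, 1 + n_features)
print(model.weights)                             # per-feature, per-bin factors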
cbm/CBM.py (24 changed lines)

@@ -4,12 +4,18 @@ import numpy as np
 class CBM:
     cpp: cbm_cpp.PyCBM

     # TODO: scikit-learn estimator?
     # def fit_pandas(self, df):
     # TODO include binning of continuous features

     def fit(self,
             y: np.ndarray,
             x: np.ndarray,
-            learning_rate_step_size = 1/100,
-            max_iterations = 100,
-            epsilon_early_stopping = 1e-3):
+            learning_rate_step_size: float = 1/100,
+            max_iterations: int = 100,
+            min_iterations_early_stopping: int = 20,
+            epsilon_early_stopping: float = 1e-3,
+            single_update_per_iteration: bool = True):

         y_mean = np.average(y)

         # determine max bin per categorical
@@ -26,7 +32,13 @@ class CBM:
             x_max.astype('uint8'),
             learning_rate_step_size,
             max_iterations,
-            epsilon_early_stopping)
+            min_iterations_early_stopping,
+            epsilon_early_stopping,
+            single_update_per_iteration)

-    def predict(self, x: np.ndarray):
-        return self.cpp.predict(x)
+    def predict(self, x: np.ndarray, explain: bool = False):
+        return self.cpp.predict(x, explain)
+
+    @property
+    def weights(self):
+        return self.cpp.weights
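Because the model is purely multiplicative, the exposed weights suffice to reproduce a prediction by hand. A sketch under the assumptions that model.weights[j][b] is feature j's factor for bin b and that you keep the training-target mean yourself (the wrapper computes it internally via np.average(y)):

import numpy as np

def predict_from_weights(weights, y_mean, x):
    # weights[j][b]: multiplicative factor of feature j, bin b (from model.weights)
    # y_mean: global mean of the training targets (np.average(y), as in fit)
    # x: (n_examples, n_features) uint8 bin indices
    y_hat = np.full(x.shape[0], y_mean, dtype=float)
    for j in range(x.shape[1]):
        y_hat *= np.asarray(weights[j])[x[:, j]]
    return y_hat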
src/cbm.cpp (83 changed lines)

@@ -5,6 +5,33 @@
 #include <cmath>

 namespace cbm {
+    void CBM::update_y_hat_sum(
+        std::vector<std::vector<uint64_t>>& y_hat_sum,
+        std::vector<std::vector<uint8_t>>& x,
+        size_t n_examples,
+        size_t n_features) {
+
+        // reset y_hat_sum
+        for (size_t j = 0; j < n_features; j++)
+            std::fill(y_hat_sum[j].begin(), y_hat_sum[j].end(), 0);
+
+        // compute y_hat and y_hat_sum
+        for (size_t i = 0; i < n_examples; i++) {
+            // TODO: parallelize & vectorize
+            // TODO: use log to stabilize?
+            auto y_hat_i = _y_mean;
+            for (size_t j = 0; j < n_features; j++)
+                y_hat_i *= _f[j][x[j][i]];
+
+            for (size_t j = 0; j < n_features; j++)
+                y_hat_sum[j][x[j][i]] += y_hat_i;
+        }
+    }
+
+    std::vector<std::vector<double>>& CBM::get_weights() {
+        return _f;
+    }
+
     void CBM::fit(
         const uint32_t* y,
         const char* x_data,
@@ -16,7 +43,9 @@ namespace cbm {
         const uint8_t* x_max,
         double learning_rate_step_size,
         size_t max_iterations,
-        double epsilon_early_stopping) {
+        size_t min_iterations_early_stopping,
+        double epsilon_early_stopping,
+        bool single_update_per_iteration) {

         _y_mean = y_mean;

@@ -54,28 +83,13 @@ namespace cbm {
         // iterations
         double learning_rate = learning_rate_step_size;
         double rmse0 = std::numeric_limits<double>::infinity();

         for (size_t t = 0; t < max_iterations; t++, learning_rate += learning_rate_step_size) {
             // cap at 1
             if (learning_rate > 1)
                 learning_rate = 1;

-            // reset y_hat_sum
-            for (size_t j = 0; j < n_features; j++)
-                std::fill(y_hat_sum[j].begin(), y_hat_sum[j].end(), 0);
-
-            // compute y_hat and y_hat_sum
-            // Note: deviation from the paper as f is
-            for (size_t i = 0; i < n_examples; i++) {
-                // TODO: parallelize & vectorize
-                // TODO: use log to stabilize?
-                auto y_hat_i = _y_mean;
-                for (size_t j = 0; j < n_features; j++)
-                    y_hat_i *= _f[j][x[j][i]];
-
-                for (size_t j = 0; j < n_features; j++)
-                    y_hat_sum[j][x[j][i]] += y_hat_i;
-            }
+            update_y_hat_sum(y_hat_sum, x, n_examples, n_features);

             // compute g
             // TODO: parallelize
@@ -94,6 +108,9 @@ namespace cbm {
                         _f[j][k] *= g;
                     else
                         _f[j][k] *= std::exp(learning_rate * std::log(g)); // eqn 2 (b) + eqn 4
+
+                    if (!single_update_per_iteration)
+                        update_y_hat_sum(y_hat_sum, x, n_examples, n_features);
                 }
             }
@@ -113,37 +130,13 @@ namespace cbm {

             // check for early stopping
-            // TODO: expose minimum number of rounds
-            if (t > 20 && (rmse > rmse0 || (rmse0 - rmse) < epsilon_early_stopping)) {
+            if (t > min_iterations_early_stopping &&
+                (rmse > rmse0 || (rmse0 - rmse) < epsilon_early_stopping)) {
                 // TODO: record diagnostics?
                 // printf("early stopping %1.4f vs %1.4f after t=%d\n", rmse, rmse0, (int)t);
                 break;
             }
-            rmse0 = rmse;
+            rmse0 = rmse;
         }
     }

-    std::vector<double> CBM::predict(
-        const char* x_data,
-        size_t x_stride0,
-        size_t x_stride1,
-        size_t n_examples,
-        size_t n_features) {
-
-        if (n_features != _f.size())
-            throw std::runtime_error("Features need to match!");
-
-        std::vector<double> y_hat(n_examples, _y_mean);
-
-        // TODO: batch parallelization
-        for (size_t i = 0; i < n_examples; i++) {
-            double& y_hat_i = y_hat[i];
-            for (size_t j = 0; j < n_features; j++) {
-                // TODO: simd gather?
-                uint8_t x_ij = *reinterpret_cast<const uint8_t*>(x_data + i * x_stride0 + j * x_stride1);
-                y_hat_i *= _f[j][x_ij];
-            }
-        }
-
-        return y_hat;
-    }
 }
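The refactor extracts the per-bin aggregation into update_y_hat_sum so it can also be re-run after each feature update when single_update_per_iteration is false, so later features see the effect of earlier updates within the same iteration. A NumPy mirror of that aggregation, as a sketch assuming feature-major uint8 bin indices as in the C++ code (the C++ accumulates into uint64_t buffers; floats are kept here for clarity):

import numpy as np

def update_y_hat_sum(f, x, y_mean):
    # f: list of 1-D arrays, f[j][b] = factor for feature j, bin b
    # x: (n_features, n_examples) uint8 bin indices, feature-major as in the C++
    n_features, n_examples = x.shape
    # current prediction per example: y_mean * prod_j f[j][x[j, i]]
    y_hat = np.full(n_examples, y_mean, dtype=float)
    for j in range(n_features):
        y_hat *= f[j][x[j]]
    # per (feature, bin) sum of current predictions
    y_hat_sum = [np.zeros(len(f[j])) for j in range(n_features)]
    for j in range(n_features):
        np.add.at(y_hat_sum[j], x[j], y_hat)  # unbuffered scatter-add per bin
    return y_hat_sum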
src/cbm.h (43 changed lines)

@@ -2,13 +2,19 @@

 #include <vector>
 #include <stdint.h>
+#include <stdexcept>

 namespace cbm {
     class CBM {
         // n_features x max_bin[j] (jagged)
-        std::vector<std::vector<double>> _f;
+        std::vector<std::vector<double>> _f;
         double _y_mean;

+        void update_y_hat_sum(
+            std::vector<std::vector<uint64_t>>& y_hat_sum,
+            std::vector<std::vector<uint8_t>>& x,
+            size_t n_examples,
+            size_t n_features);
+
     public:
         void fit(
             const uint32_t* y,
@@ -21,13 +27,42 @@ namespace cbm {
             const uint8_t* x_max,
             double learning_rate_step_size,
             size_t max_iterations,
-            double epsilon_early_stopping);
+            size_t min_iterations_early_stopping,
+            double epsilon_early_stopping,
+            bool single_update_per_iteration);

-        std::vector<double> predict(
+        template<bool explain>
+        void predict(
             const char* x_data,
             size_t x_stride0,
             size_t x_stride1,
             size_t n_examples,
-            size_t n_features);
+            size_t n_features,
+            double* out_data) {
+
+            if (n_features != _f.size())
+                throw std::runtime_error("Features need to match!");
+
+            // column-wise oriented output data
+            double* out_y_hat = out_data;
+            std::fill(out_y_hat, out_y_hat + n_examples, _y_mean);
+
+            // TODO: batch parallelization
+            for (size_t i = 0; i < n_examples; i++) {
+                double& y_hat_i = *(out_y_hat + i);
+
+                for (size_t j = 0; j < n_features; j++) {
+                    // TODO: simd gather?
+                    uint8_t x_ij = *reinterpret_cast<const uint8_t*>(x_data + i * x_stride0 + j * x_stride1);
+                    y_hat_i *= _f[j][x_ij];
+
+                    if (explain) {
+                        *(out_data + (j + 1) * n_examples + i) = _f[j][x_ij];
+                    }
+                }
+            }
+        }
+
+        std::vector<std::vector<double>>& get_weights();
    };
 }
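The templated predict writes into a column-major output buffer: column 0 holds the predictions and column j+1 holds feature j's factor for each example. A sketch of how the columns recombine, assuming a fitted model plus the y_train/x_test arrays from the test further below:

import numpy as np

out = model.predict(x_test, explain=True)  # (n_examples, 1 + n_features), Fortran order
y_hat = out[:, 0]                          # predictions
factors = out[:, 1:]                       # factors[i, j] == f[j][x_test[i, j]]

# a prediction is the global training mean scaled by the per-feature factors
y_mean = np.average(y_train)               # the wrapper's definition of the mean
assert np.allclose(y_hat, y_mean * factors.prod(axis=1))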
src/pycbm.cpp

@@ -11,7 +11,9 @@ namespace cbm {
         py::buffer x_max_b,
         double learning_rate_step_size,
         size_t max_iterations,
-        double epsilon_early_stopping) {
+        size_t min_iterations_early_stopping,
+        double epsilon_early_stopping,
+        bool single_update_per_iteration) {

         // TODO: fix error messages
         py::buffer_info y_info = y_b.request();
@@ -58,10 +60,12 @@ namespace cbm {
             x_max,
             learning_rate_step_size,
             max_iterations,
-            epsilon_early_stopping);
+            min_iterations_early_stopping,
+            epsilon_early_stopping,
+            single_update_per_iteration);
     }

-    std::vector<double> PyCBM::predict(py::buffer x_b) {
+    py::array_t<double> PyCBM::predict(py::buffer x_b, bool explain) {
         // TODO: fix error messages
         py::buffer_info x_info = x_b.request();
         if (x_info.format != py::format_descriptor<uint8_t>::format())
@@ -76,7 +80,21 @@ namespace cbm {
         ssize_t n_examples = x_info.shape[0];
         ssize_t n_features = x_info.shape[1];

-        return _cbm.predict(x_data, x_info.strides[0], x_info.strides[1], n_examples, n_features);
+        py::array_t<double, py::array::f_style> out_data(
+            {(int)n_examples, explain ? (int)(1 + n_features) : 1}
+        );
+
+        if (explain)
+            _cbm.predict<true>(x_data, x_info.strides[0], x_info.strides[1], n_examples, n_features, out_data.mutable_data());
+        else
+            _cbm.predict<false>(x_data, x_info.strides[0], x_info.strides[1], n_examples, n_features, out_data.mutable_data());
+
+        return out_data;
     }

+    std::vector<std::vector<double>>& PyCBM::get_weights() {
+        return _cbm.get_weights();
+    }
 };

@@ -85,5 +103,6 @@ PYBIND11_MODULE(cbm_cpp, m) {

     estimator.def(py::init([]() { return new cbm::PyCBM(); }))
         .def("fit", &cbm::PyCBM::fit)
-        .def("predict", &cbm::PyCBM::predict);
+        .def("predict", &cbm::PyCBM::predict)
+        .def_property_readonly("weights", &cbm::PyCBM::get_weights);
 }
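On the binding side, def_property_readonly makes the weights visible as an attribute rather than a method, and because pybind11/stl.h is included, the returned vector<vector<double>> is converted into a fresh Python list of lists on every access (a copy, despite the C++ reference return). A thin sketch against the raw extension module, assuming it builds as cbm_cpp:

import cbm_cpp   # pybind11 extension defined in PYBIND11_MODULE above

est = cbm_cpp.PyCBM()
w = est.weights   # read-only property; [] until fit() has populated the factors
print(type(w))    # a plain Python list of lists, copied out by pybind11/stl.h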
src/pycbm.h

@@ -2,6 +2,7 @@

 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
+#include <pybind11/numpy.h>

 #include "cbm.h"

@@ -21,8 +22,12 @@ namespace cbm {
         py::buffer x_max_b,
         double learning_rate_step_size,
         size_t max_iterations,
-        double epsilon_early_stopping);
+        size_t min_iterations_early_stopping,
+        double epsilon_early_stopping,
+        bool single_update_per_iteration);

-        std::vector<double> predict(py::buffer x_b);
+        py::array_t<double> predict(py::buffer x_b, bool explain);
+
+        std::vector<std::vector<double>>& get_weights();
     };
 }
@@ -48,18 +48,24 @@ def test_nyc_bicycle():

     # TODO: move to CBM.py and support pandas interface?
     # CBM can only handle categorical information
-    def histedges_equalN(x, nbin):
-        npt = len(x)
-        return np.interp(np.linspace(0, npt, nbin + 1),
-                         np.arange(npt),
-                         np.sort(x))
+    # def histedges_equalN(x, nbin):
+    #     npt = len(x)
+    #     return np.interp(np.linspace(0, npt, nbin + 1),
+    #                      np.arange(npt),
+    #                      np.sort(x))
+
+    # def histedges_equalN(x, nbin):
+    #     return pd.qcut(x, nbin)

     # some hyper-parameter che.. ehm tuning
     for bins in [2, 3, 4, 5, 6, 10]:
         x = np.stack([
             bic['Weekday'].values,
-            np.digitize(bic['HIGH_T'], histedges_equalN(bic['HIGH_T'], bins)),
-            np.digitize(bic['LOW_T'], histedges_equalN(bic['LOW_T'], bins))], axis=1)\
+            pd.qcut(bic['HIGH_T'], bins).cat.codes,
+            pd.qcut(bic['LOW_T'], bins).cat.codes],
+            # np.digitize(bic['HIGH_T'], histedges_equalN(bic['HIGH_T'], bins)),
+            # np.digitize(bic['LOW_T'], histedges_equalN(bic['LOW_T'], bins))],
+            axis=1)\
             .astype('uint8')

         x_train = x[train_idx, ]
@@ -69,9 +75,21 @@ def test_nyc_bicycle():

         # fit CBM model
         model = cbm.CBM()
-        model.fit(y_train, x_train)
+        model.fit(y_train, x_train, single_update_per_iteration=False)

         y_pred = model.predict(x_test)

+        # y_pred_explain[:, 0]  --> predictions
+        # y_pred_explain[:, 1:] --> explanations in terms of multiplicative deviation from the global mean
+        y_pred_explain = model.predict(x_test, explain=True)
+
+        # print("x", x_test[:3])
+        # print("y", y_pred_explain[:3])
+        # print("f", model.weights)
+
+        # validate that plain and explained predictions line up
+        # print(np.all(y_pred[:, 0] == y_pred_explain[:, 0]))
+
         print(f"CBM: {mean_squared_error(y_test, y_pred, squared=False):1.4f} {timeit.timeit() - start}sec bins={bins}")
         # print(np.stack((y, y_pred))[:5,].transpose())