switched to qcut for discretizing

expose explanations
expose weights
Markus Cozowicz 2021-05-11 18:26:15 +02:00
Parent d474ce15ad
Commit cd0f164d61
6 changed files with 152 additions and 70 deletions
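For context, the first change swaps a hand-rolled equal-frequency histogram for pandas' qcut when discretizing continuous features. A minimal sketch of the before/after binning, assuming pandas and NumPy; the data here is illustrative (the real test bins columns such as bic['HIGH_T']):

import numpy as np
import pandas as pd

temps = pd.Series(np.random.default_rng(0).normal(60.0, 15.0, 1000))
bins = 3

# before: manual equal-frequency edges fed to np.digitize
edges = np.interp(np.linspace(0, len(temps), bins + 1),
                  np.arange(len(temps)), np.sort(temps))
codes_old = np.digitize(temps, edges)

# after: pd.qcut builds equal-frequency categories directly;
# .cat.codes yields the 0..bins-1 integer codes that CBM consumes
codes_new = pd.qcut(temps, bins).cat.codes.astype('uint8')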

View file

@@ -4,12 +4,18 @@ import numpy as np
class CBM:
cpp: cbm_cpp.PyCBM
# TODO: scikit-learn estimator?
# def fit_pandas(self, df):
# TODO include binning of continuous features
def fit(self,
y: np.ndarray,
x: np.ndarray,
learning_rate_step_size = 1/100,
max_iterations = 100,
epsilon_early_stopping = 1e-3):
learning_rate_step_size:float = 1/100,
max_iterations:int = 100,
min_iterations_early_stopping:int = 20,
epsilon_early_stopping:float = 1e-3,
single_update_per_iteration:bool = True):
y_mean = np.average(y)
# determine max bin per categorical
@@ -26,7 +32,13 @@ class CBM:
x_max.astype('uint8'),
learning_rate_step_size,
max_iterations,
epsilon_early_stopping)
min_iterations_early_stopping,
epsilon_early_stopping,
single_update_per_iteration)
def predict(self, x: np.ndarray):
return self.cpp.predict(x)
def predict(self, x: np.ndarray, explain: bool = False):
return self.cpp.predict(x, explain)
@property
def weights(self):
return self.cpp.weights
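Taken together, the extended Python surface might be used as below; a sketch, assuming y_train is a uint32 target vector and x_train/x_test are uint8 categorical code matrices as fit requires:

import cbm

model = cbm.CBM()
model.fit(y_train, x_train, single_update_per_iteration=False)

y_pred = model.predict(x_test)                        # shape (n, 1)
y_pred_explain = model.predict(x_test, explain=True)  # shape (n, 1 + n_features)

# jagged list of per-feature factors: weights[j][k] is the
# multiplicative factor for bin k of feature j
weights = model.weights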

View file

@@ -5,6 +5,33 @@
#include <cmath>
namespace cbm {
void CBM::update_y_hat_sum(
std::vector<std::vector<uint64_t>>& y_hat_sum,
std::vector<std::vector<uint8_t>>& x,
size_t n_examples,
size_t n_features) {
// reset y_hat_sum
for (size_t j=0;j<n_features;j++)
std::fill(y_hat_sum[j].begin(), y_hat_sum[j].end(), 0);
// compute y_hat and y_hat_sum
for (size_t i=0;i<n_examples;i++) {
// TODO: parallelize & vectorize
// TODO: use log to stabilize?
auto y_hat_i = _y_mean;
for (size_t j=0;j<n_features;j++)
y_hat_i *= _f[j][x[j][i]];
for (size_t j=0;j<n_features;j++)
y_hat_sum[j][x[j][i]] += y_hat_i;
}
}
std::vector<std::vector<double>>& CBM::get_weights() {
return _f;
}
void CBM::fit(
const uint32_t* y,
const char* x_data,
@@ -16,7 +43,9 @@ namespace cbm {
const uint8_t* x_max,
double learning_rate_step_size,
size_t max_iterations,
double epsilon_early_stopping) {
size_t min_iterations_early_stopping,
double epsilon_early_stopping,
bool single_update_per_iteration) {
_y_mean = y_mean;
@@ -54,28 +83,13 @@ namespace cbm {
// iterations
double learning_rate = learning_rate_step_size;
double rmse0 = std::numeric_limits<double>::infinity();
for (size_t t=0;t<max_iterations;t++,learning_rate+=learning_rate_step_size) {
// cap at 1
if (learning_rate > 1)
learning_rate = 1;
// reset y_hat_sum
for (size_t j=0;j<n_features;j++)
std::fill(y_hat_sum[j].begin(), y_hat_sum[j].end(), 0);
// compute y_hat and y_hat_sum
// Note: deviation from the paper as f is
for (size_t i=0;i<n_examples;i++) {
// TODO: parallelize & vectorize
// TODO: use log to stabilize?
auto y_hat_i = _y_mean;
for (size_t j=0;j<n_features;j++)
y_hat_i *= _f[j][x[j][i]];
for (size_t j=0;j<n_features;j++)
y_hat_sum[j][x[j][i]] += y_hat_i;
}
update_y_hat_sum(y_hat_sum, x, n_examples, n_features);
// compute g
// TODO: parallelize
@@ -94,6 +108,9 @@ namespace cbm {
_f[j][k] *= g;
else
_f[j][k] *= std::exp(learning_rate * std::log(g)); // eqn 2 (b) + eqn 4
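// unless updates are batched once per iteration, refresh the cached
// sums so the next feature's update sees this feature's new factors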
if (!single_update_per_iteration)
update_y_hat_sum(y_hat_sum, x, n_examples, n_features);
}
}
}
@@ -113,37 +130,13 @@ namespace cbm {
// check for early stopping
// TODO: expose minimum number of rounds
if (t > 20 && (rmse > rmse0 || (rmse0 - rmse) < epsilon_early_stopping)) {
if (t > min_iterations_early_stopping &&
(rmse > rmse0 || (rmse0 - rmse) < epsilon_early_stopping)) {
// TODO: record diagnostics?
// printf("early stopping %1.4f vs %1.4f after t=%d\n", rmse, rmse0, (int)t);
break;
}
rmse0 = rmse;
}
}
std::vector<double> CBM::predict(
const char* x_data,
size_t x_stride0,
size_t x_stride1,
size_t n_examples,
size_t n_features) {
if (n_features != _f.size())
throw std::runtime_error("Features need to match!");
std::vector<double> y_hat(n_examples, _y_mean);
// TODO: batch parallelization
for (size_t i=0;i<n_examples;i++) {
double& y_hat_i = y_hat[i];
for (size_t j=0;j<n_features;j++) {
// TODO: simd gather?
uint8_t x_ij = *reinterpret_cast<const uint8_t*>(x_data + i * x_stride0 + j * x_stride1);
y_hat_i *= _f[j][x_ij];
}
}
return y_hat;
}
}

View file

@@ -2,13 +2,19 @@
#include <vector>
#include <stdint.h>
#include <stdexcept>
namespace cbm {
class CBM {
// n_features x max_bin[j] (jagged)
std::vector<std::vector<double>> _f;
double _y_mean;
void update_y_hat_sum(
std::vector<std::vector<uint64_t>>& y_hat_sum,
std::vector<std::vector<uint8_t>>& x,
size_t n_examples,
size_t n_features);
public:
void fit(
const uint32_t* y,
@@ -21,13 +27,42 @@ namespace cbm {
const uint8_t* x_max,
double learning_rate_step_size,
size_t max_iterations,
double epsilon_early_stopping);
size_t min_iterations_early_stopping,
double epsilon_early_stopping,
bool single_update_per_iteration);
std::vector<double> predict(
template<bool explain>
void predict(
const char* x_data,
size_t x_stride0,
size_t x_stride1,
size_t n_examples,
size_t n_features);
size_t n_features,
double* out_data) {
if (n_features != _f.size())
throw std::runtime_error("Features need to match!");
// column-wise oriented output data
double* out_y_hat = out_data;
std::fill(out_y_hat, out_y_hat + n_examples, _y_mean);
// TODO: batch parallelization
for (size_t i=0;i<n_examples;i++) {
double& y_hat_i = *(out_y_hat + i);
for (size_t j=0;j<n_features;j++) {
// TODO: simd gather?
uint8_t x_ij = *reinterpret_cast<const uint8_t*>(x_data + i * x_stride0 + j * x_stride1);
y_hat_i *= _f[j][x_ij];
if (explain) {
*(out_data + (j + 1) * n_examples + i) = _f[j][x_ij];
}
}
}
}
std::vector<std::vector<double>>& get_weights();
};
}
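A note on the new output layout: the buffer is allocated f_style (column-major) on the Python side, so the write at out_data + (j + 1) * n_examples + i targets element (i, j + 1) of an (n_examples, 1 + n_features) array, while column 0 accumulates the prediction itself. A small NumPy sketch of that addressing, with illustrative sizes:

import numpy as np

n_examples, n_features = 4, 2
out = np.zeros((n_examples, 1 + n_features), order='F')  # f_style

i, j = 2, 1
out[i, j + 1] = 1.25
flat = out.ravel(order='F')  # same memory, walked column by column
assert flat[(j + 1) * n_examples + i] == 1.25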

View file

@@ -11,7 +11,9 @@ namespace cbm {
py::buffer x_max_b,
double learning_rate_step_size,
size_t max_iterations,
double epsilon_early_stopping) {
size_t min_iterations_early_stopping,
double epsilon_early_stopping,
bool single_update_per_iteration) {
// TODO: fix error messages
py::buffer_info y_info = y_b.request();
@@ -58,10 +60,12 @@ namespace cbm {
x_max,
learning_rate_step_size,
max_iterations,
epsilon_early_stopping);
min_iterations_early_stopping,
epsilon_early_stopping,
single_update_per_iteration);
}
std::vector<double> PyCBM::predict(py::buffer x_b) {
py::array_t<double> PyCBM::predict(py::buffer x_b, bool explain) {
// TODO: fix error messages
py::buffer_info x_info = x_b.request();
if (x_info.format != py::format_descriptor<uint8_t>::format())
@@ -76,7 +80,21 @@ namespace cbm {
ssize_t n_examples = x_info.shape[0];
ssize_t n_features = x_info.shape[1];
return _cbm.predict(x_data, x_info.strides[0], x_info.strides[1], n_examples, n_features);
py::array_t<double, py::array::f_style> out_data(
{(int)n_examples, explain ? (int)(1 + n_features) : 1}
);
if (explain)
_cbm.predict<true>(x_data, x_info.strides[0], x_info.strides[1], n_examples, n_features, out_data.mutable_data());
else
_cbm.predict<false>(x_data, x_info.strides[0], x_info.strides[1], n_examples, n_features, out_data.mutable_data());
return out_data;
}
std::vector<std::vector<double>>& PyCBM::get_weights() {
return _cbm.get_weights();
}
};
@@ -85,5 +103,6 @@ PYBIND11_MODULE(cbm_cpp, m) {
estimator.def(py::init([]() { return new cbm::PyCBM(); }))
.def("fit", &cbm::PyCBM::fit)
.def("predict", &cbm::PyCBM::predict);
.def("predict", &cbm::PyCBM::predict)
.def_property_readonly("weights", &cbm::PyCBM::get_weights);
}

View file

@@ -2,6 +2,7 @@
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/numpy.h>
#include "cbm.h"
@@ -21,8 +22,12 @@ namespace cbm {
py::buffer x_max_b,
double learning_rate_step_size,
size_t max_iterations,
double epsilon_early_stopping);
size_t min_iterations_early_stopping,
double epsilon_early_stopping,
bool single_update_per_iteration);
std::vector<double> predict(py::buffer x_b);
py::array_t<double> predict(py::buffer x_b, bool explain);
std::vector<std::vector<double>>& get_weights();
};
}

View file

@@ -48,18 +48,24 @@ def test_nyc_bicycle():
# TODO: move to CBM.py and support pandas interface?
# CBM can only handle categorical information
def histedges_equalN(x, nbin):
npt = len(x)
return np.interp(np.linspace(0, npt, nbin + 1),
np.arange(npt),
np.sort(x))
# def histedges_equalN(x, nbin):
# npt = len(x)
# return np.interp(np.linspace(0, npt, nbin + 1),
# np.arange(npt),
# np.sort(x))
# def histedges_equalN(x, nbin):
# return pd.qcut(x, nbin)
# some hyper-parameter che.. ehm tuning
for bins in [2, 3, 4, 5, 6, 10]:
x = np.stack([
bic['Weekday'].values,
np.digitize(bic['HIGH_T'], histedges_equalN(bic['HIGH_T'], bins)),
np.digitize(bic['LOW_T'], histedges_equalN(bic['LOW_T'], bins))], axis=1)\
pd.qcut(bic['HIGH_T'], bins).cat.codes,
pd.qcut(bic['LOW_T'], bins).cat.codes],
# np.digitize(bic['HIGH_T'], histedges_equalN(bic['HIGH_T'], bins)),
# np.digitize(bic['LOW_T'], histedges_equalN(bic['LOW_T'], bins))],
axis=1)\
.astype('uint8')
x_train = x[train_idx, ]
@@ -69,9 +75,21 @@ def test_nyc_bicycle():
# fit CBM model
model = cbm.CBM()
model.fit(y_train, x_train)
model.fit(y_train, x_train, single_update_per_iteration=False)
y_pred = model.predict(x_test)
# y_pred_explain[:, 0] --> predictions
# y_pred_explain[:, 1:] --> explanations in terms of multiplicative deviation from the global mean
y_pred_explain = model.predict(x_test, explain=True)
# print("x", x_test[:3])
# print("y", y_pred_explain[:3])
# print("f", model.weights)
# validate data predictions line up
# print(np.all(y_pred[:, 0] == y_pred_explain[:,0]))
print(f"CMB: {mean_squared_error(y_test, y_pred, squared=False):1.4f} {timeit.timeit() - start}sec bins={bins}")
# print(np.stack((y, y_pred))[:5,].transpose())
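Since the model is purely multiplicative, each prediction is the global mean times one factor per feature, so the explanation columns can be multiplied back to recover the predictions. A hedged sanity-check sketch; y_mean is recomputed from y_train here because this commit does not expose the fitted mean:

import numpy as np

y_pred_explain = model.predict(x_test, explain=True)
y_mean = np.average(y_train)  # mirrors y_mean = np.average(y) inside fit

reconstructed = y_mean * np.prod(y_pred_explain[:, 1:], axis=1)
assert np.allclose(reconstructed, y_pred_explain[:, 0])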