Mirror of https://github.com/microsoft/CBM.git
switched to qcut for discretizing; expose explanations, expose weights
Parent: d474ce15ad
Commit: cd0f164d61
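Taken together, the changes below give the Python wrapper a richer surface: typed fit hyper-parameters (including the newly exposed min_iterations_early_stopping and single_update_per_iteration), a predict(x, explain=...) that can return per-feature explanations, and a read-only weights property. A minimal usage sketch of the post-commit API; the toy arrays are illustrative and not from the repo:

import numpy as np
import cbm

# toy data: two categorical features, already encoded as uint8 bin indices
x = np.array([[0, 1], [1, 0], [2, 1], [0, 0]], dtype='uint8')
y = np.array([10, 12, 15, 9], dtype='uint32')

model = cbm.CBM()
model.fit(y, x,
          learning_rate_step_size=1/100,
          max_iterations=100,
          min_iterations_early_stopping=20,    # exposed by this commit
          epsilon_early_stopping=1e-3,
          single_update_per_iteration=True)    # exposed by this commit

y_pred = model.predict(x)                        # shape (n, 1): predictions only
y_pred_explain = model.predict(x, explain=True)  # shape (n, 1 + n_features)
print(model.weights)                             # per-feature, per-bin factors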
cbm/CBM.py (24 changed lines)

@@ -4,12 +4,18 @@ import numpy as np
 class CBM:
     cpp: cbm_cpp.PyCBM

     # TODO: scikit-learn estimator?
     # def fit_pandas(self, df):
     # TODO include binning of continuous features

     def fit(self,
             y: np.ndarray,
             x: np.ndarray,
-            learning_rate_step_size = 1/100,
-            max_iterations = 100,
-            epsilon_early_stopping = 1e-3):
+            learning_rate_step_size: float = 1/100,
+            max_iterations: int = 100,
+            min_iterations_early_stopping: int = 20,
+            epsilon_early_stopping: float = 1e-3,
+            single_update_per_iteration: bool = True):

         y_mean = np.average(y)

         # determine max bin per categorical
@@ -26,7 +32,13 @@ class CBM:
             x_max.astype('uint8'),
             learning_rate_step_size,
             max_iterations,
-            epsilon_early_stopping)
+            min_iterations_early_stopping,
+            epsilon_early_stopping,
+            single_update_per_iteration)

-    def predict(self, x: np.ndarray):
-        return self.cpp.predict(x)
+    def predict(self, x: np.ndarray, explain: bool = False):
+        return self.cpp.predict(x, explain)
+
+    @property
+    def weights(self):
+        return self.cpp.weights
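Because the model is purely multiplicative, the exposed weights suffice to reproduce a prediction by hand. A sketch under the assumptions that model.weights[j][b] is feature j's factor for bin b and that you keep the training-target mean yourself (the wrapper computes it internally via np.average(y)):

import numpy as np

def predict_from_weights(weights, y_mean, x):
    # weights[j][b]: multiplicative factor of feature j, bin b (from model.weights)
    # y_mean: global mean of the training targets (np.average(y), as in fit)
    # x: (n_examples, n_features) uint8 bin indices
    y_hat = np.full(x.shape[0], y_mean, dtype=float)
    for j in range(x.shape[1]):
        y_hat *= np.asarray(weights[j])[x[:, j]]
    return y_hat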
src/cbm.cpp (83 changed lines)

@@ -5,6 +5,33 @@
 #include <cmath>

 namespace cbm {
+    void CBM::update_y_hat_sum(
+        std::vector<std::vector<uint64_t>>& y_hat_sum,
+        std::vector<std::vector<uint8_t>>& x,
+        size_t n_examples,
+        size_t n_features) {
+
+        // reset y_hat_sum
+        for (size_t j = 0; j < n_features; j++)
+            std::fill(y_hat_sum[j].begin(), y_hat_sum[j].end(), 0);
+
+        // compute y_hat and y_hat_sum
+        for (size_t i = 0; i < n_examples; i++) {
+            // TODO: parallelize & vectorize
+            // TODO: use log to stabilize?
+            auto y_hat_i = _y_mean;
+            for (size_t j = 0; j < n_features; j++)
+                y_hat_i *= _f[j][x[j][i]];
+
+            for (size_t j = 0; j < n_features; j++)
+                y_hat_sum[j][x[j][i]] += y_hat_i;
+        }
+    }
+
+    std::vector<std::vector<double>>& CBM::get_weights() {
+        return _f;
+    }
+
     void CBM::fit(
         const uint32_t* y,
         const char* x_data,
@@ -16,7 +43,9 @@ namespace cbm {
         const uint8_t* x_max,
         double learning_rate_step_size,
         size_t max_iterations,
-        double epsilon_early_stopping) {
+        size_t min_iterations_early_stopping,
+        double epsilon_early_stopping,
+        bool single_update_per_iteration) {

         _y_mean = y_mean;

@@ -54,28 +83,13 @@ namespace cbm {
         // iterations
         double learning_rate = learning_rate_step_size;
         double rmse0 = std::numeric_limits<double>::infinity();

         for (size_t t = 0; t < max_iterations; t++, learning_rate += learning_rate_step_size) {
             // cap at 1
             if (learning_rate > 1)
                 learning_rate = 1;

-            // reset y_hat_sum
-            for (size_t j = 0; j < n_features; j++)
-                std::fill(y_hat_sum[j].begin(), y_hat_sum[j].end(), 0);
-
-            // compute y_hat and y_hat_sum
-            // Note: deviation from the paper as f is
-            for (size_t i = 0; i < n_examples; i++) {
-                // TODO: parallelize & vectorize
-                // TODO: use log to stabilize?
-                auto y_hat_i = _y_mean;
-                for (size_t j = 0; j < n_features; j++)
-                    y_hat_i *= _f[j][x[j][i]];
-
-                for (size_t j = 0; j < n_features; j++)
-                    y_hat_sum[j][x[j][i]] += y_hat_i;
-            }
+            update_y_hat_sum(y_hat_sum, x, n_examples, n_features);

             // compute g
             // TODO: parallelize
@@ -94,6 +108,9 @@ namespace cbm {
                         _f[j][k] *= g;
                     else
                         _f[j][k] *= std::exp(learning_rate * std::log(g)); // eqn 2 (b) + eqn 4
+
+                    if (!single_update_per_iteration)
+                        update_y_hat_sum(y_hat_sum, x, n_examples, n_features);
                 }
             }
@@ -113,37 +130,13 @@ namespace cbm {

             // check for early stopping
-            // TODO: expose minimum number of rounds
-            if (t > 20 && (rmse > rmse0 || (rmse0 - rmse) < epsilon_early_stopping)) {
+            if (t > min_iterations_early_stopping &&
+                (rmse > rmse0 || (rmse0 - rmse) < epsilon_early_stopping)) {
                 // TODO: record diagnostics?
                 // printf("early stopping %1.4f vs %1.4f after t=%d\n", rmse, rmse0, (int)t);
                 break;
             }
-            rmse0 = rmse;
+            rmse0 = rmse;
         }
     }

-    std::vector<double> CBM::predict(
-        const char* x_data,
-        size_t x_stride0,
-        size_t x_stride1,
-        size_t n_examples,
-        size_t n_features) {
-
-        if (n_features != _f.size())
-            throw std::runtime_error("Features need to match!");
-
-        std::vector<double> y_hat(n_examples, _y_mean);
-
-        // TODO: batch parallelization
-        for (size_t i = 0; i < n_examples; i++) {
-            double& y_hat_i = y_hat[i];
-            for (size_t j = 0; j < n_features; j++) {
-                // TODO: simd gather?
-                uint8_t x_ij = *reinterpret_cast<const uint8_t*>(x_data + i * x_stride0 + j * x_stride1);
-                y_hat_i *= _f[j][x_ij];
-            }
-        }
-
-        return y_hat;
-    }
 }
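The refactor extracts the per-bin aggregation into update_y_hat_sum so it can also be re-run after each feature update when single_update_per_iteration is false, so later features see the effect of earlier updates within the same iteration. A NumPy mirror of that aggregation, as a sketch assuming feature-major uint8 bin indices as in the C++ code (the C++ accumulates into uint64_t buffers; floats are kept here for clarity):

import numpy as np

def update_y_hat_sum(f, x, y_mean):
    # f: list of 1-D arrays, f[j][b] = factor for feature j, bin b
    # x: (n_features, n_examples) uint8 bin indices, feature-major as in the C++
    n_features, n_examples = x.shape
    # current prediction per example: y_mean * prod_j f[j][x[j, i]]
    y_hat = np.full(n_examples, y_mean, dtype=float)
    for j in range(n_features):
        y_hat *= f[j][x[j]]
    # per (feature, bin) sum of current predictions
    y_hat_sum = [np.zeros(len(f[j])) for j in range(n_features)]
    for j in range(n_features):
        np.add.at(y_hat_sum[j], x[j], y_hat)  # unbuffered scatter-add per bin
    return y_hat_sum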
src/cbm.h (43 changed lines)

@@ -2,13 +2,19 @@

 #include <vector>
 #include <stdint.h>
+#include <stdexcept>

 namespace cbm {
     class CBM {
         // n_features x max_bin[j] (jagged)
-        std::vector<std::vector<double>> _f;
+        std::vector<std::vector<double>> _f;
         double _y_mean;

+        void update_y_hat_sum(
+            std::vector<std::vector<uint64_t>>& y_hat_sum,
+            std::vector<std::vector<uint8_t>>& x,
+            size_t n_examples,
+            size_t n_features);
+
     public:
         void fit(
             const uint32_t* y,
@@ -21,13 +27,42 @@ namespace cbm {
             const uint8_t* x_max,
             double learning_rate_step_size,
             size_t max_iterations,
-            double epsilon_early_stopping);
+            size_t min_iterations_early_stopping,
+            double epsilon_early_stopping,
+            bool single_update_per_iteration);

-        std::vector<double> predict(
+        template<bool explain>
+        void predict(
             const char* x_data,
             size_t x_stride0,
             size_t x_stride1,
             size_t n_examples,
-            size_t n_features);
+            size_t n_features,
+            double* out_data) {
+
+            if (n_features != _f.size())
+                throw std::runtime_error("Features need to match!");
+
+            // column-wise oriented output data
+            double* out_y_hat = out_data;
+            std::fill(out_y_hat, out_y_hat + n_examples, _y_mean);
+
+            // TODO: batch parallelization
+            for (size_t i = 0; i < n_examples; i++) {
+                double& y_hat_i = *(out_y_hat + i);
+
+                for (size_t j = 0; j < n_features; j++) {
+                    // TODO: simd gather?
+                    uint8_t x_ij = *reinterpret_cast<const uint8_t*>(x_data + i * x_stride0 + j * x_stride1);
+                    y_hat_i *= _f[j][x_ij];
+
+                    if (explain) {
+                        *(out_data + (j + 1) * n_examples + i) = _f[j][x_ij];
+                    }
+                }
+            }
+        }
+
+        std::vector<std::vector<double>>& get_weights();
    };
 }
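The templated predict writes into a column-major output buffer: column 0 holds the predictions and column j+1 holds feature j's factor for each example. A sketch of how the columns recombine, assuming a fitted model plus the y_train/x_test arrays from the test further below:

import numpy as np

out = model.predict(x_test, explain=True)  # (n_examples, 1 + n_features), Fortran order
y_hat = out[:, 0]                          # predictions
factors = out[:, 1:]                       # factors[i, j] == f[j][x_test[i, j]]

# a prediction is the global training mean scaled by the per-feature factors
y_mean = np.average(y_train)               # the wrapper's definition of the mean
assert np.allclose(y_hat, y_mean * factors.prod(axis=1))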
src/pycbm.cpp

@@ -11,7 +11,9 @@ namespace cbm {
         py::buffer x_max_b,
         double learning_rate_step_size,
         size_t max_iterations,
-        double epsilon_early_stopping) {
+        size_t min_iterations_early_stopping,
+        double epsilon_early_stopping,
+        bool single_update_per_iteration) {

         // TODO: fix error messages
         py::buffer_info y_info = y_b.request();
@@ -58,10 +60,12 @@ namespace cbm {
             x_max,
             learning_rate_step_size,
             max_iterations,
-            epsilon_early_stopping);
+            min_iterations_early_stopping,
+            epsilon_early_stopping,
+            single_update_per_iteration);
     }

-    std::vector<double> PyCBM::predict(py::buffer x_b) {
+    py::array_t<double> PyCBM::predict(py::buffer x_b, bool explain) {
         // TODO: fix error messages
         py::buffer_info x_info = x_b.request();
         if (x_info.format != py::format_descriptor<uint8_t>::format())
@@ -76,7 +80,21 @@ namespace cbm {
         ssize_t n_examples = x_info.shape[0];
         ssize_t n_features = x_info.shape[1];

-        return _cbm.predict(x_data, x_info.strides[0], x_info.strides[1], n_examples, n_features);
+        py::array_t<double, py::array::f_style> out_data(
+            {(int)n_examples, explain ? (int)(1 + n_features) : 1}
+        );
+
+        if (explain)
+            _cbm.predict<true>(x_data, x_info.strides[0], x_info.strides[1], n_examples, n_features, out_data.mutable_data());
+        else
+            _cbm.predict<false>(x_data, x_info.strides[0], x_info.strides[1], n_examples, n_features, out_data.mutable_data());
+
+        return out_data;
     }

+    std::vector<std::vector<double>>& PyCBM::get_weights() {
+        return _cbm.get_weights();
+    }
 };

@@ -85,5 +103,6 @@ PYBIND11_MODULE(cbm_cpp, m) {

     estimator.def(py::init([]() { return new cbm::PyCBM(); }))
         .def("fit", &cbm::PyCBM::fit)
-        .def("predict", &cbm::PyCBM::predict);
+        .def("predict", &cbm::PyCBM::predict)
+        .def_property_readonly("weights", &cbm::PyCBM::get_weights);
 }
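On the binding side, def_property_readonly makes the weights visible as an attribute rather than a method, and because pybind11/stl.h is included, the returned vector<vector<double>> is converted into a fresh Python list of lists on every access (a copy, despite the C++ reference return). A thin sketch against the raw extension module, assuming it builds as cbm_cpp:

import cbm_cpp   # pybind11 extension defined in PYBIND11_MODULE above

est = cbm_cpp.PyCBM()
w = est.weights   # read-only property; [] until fit() has populated the factors
print(type(w))    # a plain Python list of lists, copied out by pybind11/stl.h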
src/pycbm.h

@@ -2,6 +2,7 @@

 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
+#include <pybind11/numpy.h>

 #include "cbm.h"

@@ -21,8 +22,12 @@ namespace cbm {
         py::buffer x_max_b,
         double learning_rate_step_size,
         size_t max_iterations,
-        double epsilon_early_stopping);
+        size_t min_iterations_early_stopping,
+        double epsilon_early_stopping,
+        bool single_update_per_iteration);

-        std::vector<double> predict(py::buffer x_b);
+        py::array_t<double> predict(py::buffer x_b, bool explain);
+
+        std::vector<std::vector<double>>& get_weights();
     };
 }
@@ -48,18 +48,24 @@ def test_nyc_bicycle():

     # TODO: move to CBM.py and support pandas interface?
     # CBM can only handle categorical information
-    def histedges_equalN(x, nbin):
-        npt = len(x)
-        return np.interp(np.linspace(0, npt, nbin + 1),
-                         np.arange(npt),
-                         np.sort(x))
+    # def histedges_equalN(x, nbin):
+    #     npt = len(x)
+    #     return np.interp(np.linspace(0, npt, nbin + 1),
+    #                      np.arange(npt),
+    #                      np.sort(x))
+
+    # def histedges_equalN(x, nbin):
+    #     return pd.qcut(x, nbin)

     # some hyper-parameter che.. ehm tuning
     for bins in [2, 3, 4, 5, 6, 10]:
         x = np.stack([
             bic['Weekday'].values,
-            np.digitize(bic['HIGH_T'], histedges_equalN(bic['HIGH_T'], bins)),
-            np.digitize(bic['LOW_T'], histedges_equalN(bic['LOW_T'], bins))], axis=1)\
+            pd.qcut(bic['HIGH_T'], bins).cat.codes,
+            pd.qcut(bic['LOW_T'], bins).cat.codes],
+            # np.digitize(bic['HIGH_T'], histedges_equalN(bic['HIGH_T'], bins)),
+            # np.digitize(bic['LOW_T'], histedges_equalN(bic['LOW_T'], bins))],
+            axis=1)\
             .astype('uint8')

         x_train = x[train_idx, ]
@@ -69,9 +75,21 @@ def test_nyc_bicycle():

         # fit CBM model
         model = cbm.CBM()
-        model.fit(y_train, x_train)
+        model.fit(y_train, x_train, single_update_per_iteration=False)

         y_pred = model.predict(x_test)

+        # y_pred_explain[:, 0]  --> predictions
+        # y_pred_explain[:, 1:] --> explanations in terms of multiplicative deviation from the global mean
+        y_pred_explain = model.predict(x_test, explain=True)
+
+        # print("x", x_test[:3])
+        # print("y", y_pred_explain[:3])
+        # print("f", model.weights)
+
+        # validate that plain and explained predictions line up
+        # print(np.all(y_pred[:, 0] == y_pred_explain[:, 0]))
+
         print(f"CBM: {mean_squared_error(y_test, y_pred, squared=False):1.4f} {timeit.timeit() - start}sec bins={bins}")
         # print(np.stack((y, y_pred))[:5,].transpose())