added categorical, date/time and continuous feature support; added feature importance plotting

* try to fix c++ build

* add MANIFEST.in to fix missing header files

* making sure it really gets included

* added categorical, date/time and continuous feature support; added feature importance plotting

* make sure the model is serializable
This commit is contained in:
Markus Cozowicz 2021-11-24 11:28:26 +01:00 committed by GitHub
Parent: cd5f46fb86
Commit: a5a992afe1
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 211 additions and 13 deletions

.github/workflows/publish-to-test-pypi.yml vendored

@@ -41,6 +41,9 @@ jobs:
- name: Build a source tarball
run: python -m build --sdist --outdir dist/ .
- name: Install from source tarball
run: python -m pip install dist/*.tar.gz --user
- name: Store the binary wheel
uses: actions/upload-artifact@v2
with:

MANIFEST.in Normal file

@@ -0,0 +1,4 @@
include README.md
include setup.py
recursive-include cbm *.py
recursive-include src *.c *.h
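
The new MANIFEST.in is what actually pulls the C++ sources and headers under src/ into the source tarball, which is why the workflow's new "Install from source tarball" step stops failing on missing header files. A quick local check could look like the sketch below (the dist/ path assumes the sdist was built with python -m build --sdist --outdir dist/ . as in the workflow):

import glob
import tarfile

# list the C/C++ sources and headers shipped inside the built sdist
sdist_path = glob.glob("dist/cyclicbm-*.tar.gz")[0]
with tarfile.open(sdist_path) as sdist:
    shipped = [n for n in sdist.getnames() if n.endswith((".h", ".c", ".cpp"))]
print(shipped)  # should include src/cbm.h and src/pycbm.h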


@@ -3,11 +3,14 @@
import cbm_cpp
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from typing import List, Union
from pandas.api.types import CategoricalDtype
class CBM(BaseEstimator):
@@ -19,7 +22,21 @@ class CBM(BaseEstimator):
min_iterations_early_stopping:int = 20,
epsilon_early_stopping:float = 1e-3,
single_update_per_iteration:bool = True,
date_features: Union[str, List[str]] = 'day,month',
binning: Union[int, lambda x: int] = 10,
) -> None:
"""Initialize the CBM model.
Args:
learning_rate_step_size (float, optional): Increment added to the learning rate each boosting iteration (capped at 1). Defaults to 1/100.
max_iterations (int, optional): Maximum number of boosting iterations. Defaults to 100.
min_iterations_early_stopping (int, optional): Minimum number of iterations before early stopping may trigger. Defaults to 20.
epsilon_early_stopping (float, optional): Minimum RMSE improvement required to keep iterating. Defaults to 1e-3.
single_update_per_iteration (bool, optional): Whether to apply only a single update per iteration. Defaults to True.
date_features (Union[str, List[str]], optional): Date components to expand datetime columns into ('day', 'month'). Defaults to ['day', 'month'].
binning (Union[int, Callable], optional): The number of bins to create for continuous features, or a callable computing the bin count from the column. Defaults to 10.
"""
self.learning_rate_step_size = learning_rate_step_size
self.max_iterations = max_iterations
@@ -27,13 +44,90 @@ class CBM(BaseEstimator):
self.epsilon_early_stopping = epsilon_early_stopping
self.single_update_per_iteration = single_update_per_iteration
# let's make sure it's serializable: store date_features as a comma-separated string
if isinstance(date_features, list):
date_features = ",".join(date_features)
self.date_features = date_features
self.binning = binning
def get_date_features(self) -> List[str]:
return self.date_features.split(",")
def fit(self,
X: np.ndarray,
X: Union[np.ndarray, pd.DataFrame],
y: np.ndarray
) -> "CBM":
# keep feature names around
if isinstance(X, pd.DataFrame):
self._feature_names = []
self._feature_categories = []
self._feature_bins = []
X_numeric = []
for col in X.columns:
col_dtype = X[col].dtype
if pd.api.types.is_datetime64_any_dtype(col_dtype):
for expansion in self.get_date_features():
import calendar
if expansion == 'day':
self._feature_names.append(f'{col}_day')
self._feature_categories.append(calendar.day_abbr)
self._feature_bins.append(None)
X_numeric.append(X[col].dt.dayofweek.values)
elif expansion == 'month':
self._feature_names.append(f'{col}_month')
self._feature_categories.append(calendar.month_abbr)
self._feature_bins.append(None)
X_numeric.append(X[col].dt.month.values)
elif pd.api.types.is_float_dtype(col_dtype):
# deal with continuous features
bin_num = self.binning if isinstance(self.binning, int) else self.binning(X[col])
X_binned, bins = pd.qcut(X[col].fillna(0), bin_num, retbins=True)
self._feature_names.append(col)
self._feature_categories.append(X_binned.cat.categories.astype(str).tolist())
self._feature_bins.append(bins)
X_numeric.append(pd.cut(X[col].fillna(0), bins, include_lowest=True).cat.codes)
elif not pd.api.types.is_integer_dtype(col_dtype):
self._feature_names.append(col)
# convert to categorical
X_cat = (X[col]
.fillna('CBM_UnknownCategory')
.astype('category'))
# keep track of categories
self._feature_categories.append(X_cat.cat.categories.tolist())
self._feature_bins.append(None)
# convert to 0-based index
X_numeric.append(X_cat.cat.codes)
else:
self._feature_names.append(col)
self._feature_categories.append(None)
self._feature_bins.append(None)
X_numeric.append(X[col])
X = np.column_stack(X_numeric)
else:
self._feature_names = None
X, y = check_X_y(X, y, y_numeric=True)
# pre-processing
y_mean = np.average(y)
# determine max bin per categorical
@@ -60,6 +154,43 @@ class CBM(BaseEstimator):
return self
def predict(self, X: np.ndarray, explain: bool = False):
if isinstance(X, pd.DataFrame):
X_numeric = []
offset = 0 # correct for date expansion
for i, col in enumerate(X.columns):
col_dtype = X[col].dtype
if pd.api.types.is_datetime64_any_dtype(col_dtype):
for expansion in self.get_date_features():
if expansion == 'day':
X_numeric.append(X[col].dt.dayofweek.values)
offset += 1
elif expansion == 'month':
X_numeric.append(X[col].dt.month.values)
offset += 1
offset -= 1
elif pd.api.types.is_float_dtype(col_dtype):
# re-use binning from training
X_numeric.append(pd.cut(X[col].fillna(0), self._feature_bins[i + offset], include_lowest=True).cat.codes)
elif not pd.api.types.is_integer_dtype(col_dtype):
# convert to categorical
X_cat = (X[col]
.fillna('CBM_UnknownCategory')
# re-use categories from training
.astype(CategoricalDtype(categories=self._feature_categories[i + offset], ordered=True)))
# convert to 0-based index
X_numeric.append(X_cat.cat.codes)
else:
X_numeric.append(X[col])
X = np.column_stack(X_numeric)
X = check_array(X)
check_is_fitted(self, "is_fitted_")
@@ -73,6 +204,41 @@ class CBM(BaseEstimator):
self.is_fitted_ = True
def plot_importance(self, feature_names: list = None, **kwargs):
check_is_fitted(self, "is_fitted_")
if feature_names is None:
feature_names = self._feature_names
import matplotlib.pyplot as plt
n_features = len(self.weights)
n_cols = int(np.ceil( np.sqrt(n_features)))
n_rows = int(np.floor(np.sqrt(n_features)))
if n_cols * n_rows < n_features:
n_rows += 1
fig, ax = plt.subplots(n_rows, n_cols, sharex=True, **kwargs)
fig.suptitle(f'Response mean: {self.y_mean:0.4f} | Iterations {self.iterations}')
for f in range(n_features):
w = np.array(self.weights[f])
color = np.where(w < 1, 'xkcd:tomato', 'xkcd:green')
ax_sub = ax[f // n_cols, f % n_cols]
ax_sub.barh(range(len(w)), w - 1, color=color)
ax_sub.set_title(feature_names[f] if feature_names is not None else f'Feature {f}')
if self._feature_categories[f] is not None:
ax_sub.set_yticks(range(len(self._feature_categories[f])))
ax_sub.set_yticklabels(self._feature_categories[f])
@property
def weights(self):
return self._cpp.weights
@@ -80,3 +246,7 @@ class CBM(BaseEstimator):
@property
def y_mean(self):
return self._cpp.y_mean
@property
def iterations(self):
return self._cpp.iterations
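
Taken together, the Python-side changes let fit() and predict() consume a pandas DataFrame directly: datetime columns are expanded according to date_features, float columns are quantile-binned (binning may be an int or a callable returning a bin count), remaining non-integer columns are mapped to 0-based category codes, and the categories and bin edges learned in fit() are reused in predict(). A minimal usage sketch follows; the column names and data are made up, and it assumes CBM is importable from the cbm package and that the target is count-like (the C++ core takes an unsigned integer y):

import numpy as np
import pandas as pd
from cbm import CBM  # assumes the package re-exports the estimator

rng = np.random.default_rng(0)
X = pd.DataFrame({
    "date": pd.date_range("2021-01-01", periods=200, freq="D"),  # expanded into date_day / date_month
    "price": rng.uniform(1.0, 10.0, 200),                        # float -> quantile-binned
    "store": rng.choice(["a", "b", "c"], 200),                   # object -> categorical codes
})
y = rng.poisson(3.0, 200)  # count-like target

model = CBM(date_features=["day", "month"],
            binning=10)  # a callable such as lambda col: col.nunique() is also accepted
model.fit(X, y)
y_hat = model.predict(X)
print(model.iterations, model.y_mean)
model.plot_importance(figsize=(12, 8))  # requires matplotlib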


@@ -22,7 +22,7 @@ def get_extra_compile_args():
cflags = ""
return cflags.split() \
+ ["-std=c++11", "-Wall", "-Wextra", "-march=native", "-msse2", "-ffast-math", "-mfpmath=sse"]
+ ["-std=c++11", "-Wall", "-Wextra", "-march=native", "-msse2", "-ffast-math", "-mfpmath=sse"] #, "-fopenmp"]
def get_libraries():
if platform.system() == "Windows":
@@ -36,7 +36,7 @@ long_description = (this_directory / "README.md").read_text()
setup(
name="cyclicbm",
version="0.0.6",
version="0.0.7",
description="Cyclic Boosting Machines",
long_description=long_description,
long_description_content_type='text/markdown',
@@ -55,19 +55,23 @@ setup(
"Topic :: Scientific/Engineering :: Mathematics",
],
setup_requires=["pytest-runner"],
install_requires=["pybind11>=2.2", "numpy", "scikit-learn"],
install_requires=["pybind11>=2.2", "numpy", "scikit-learn", "pandas"],
tests_require=["pytest", "lightgbm"], #, "interpret"],
extras_require={
'interactive': ['matplotlib>=2.2.0'],
},
packages=["cbm"],
package_data={ "cbm": ["src/pycbm.h", "src/cbm.h"] },
# package_data={ "cbm": ["src/pycbm.h", "src/cbm.h"] },
ext_modules=[
Extension(
"cbm_cpp",
["src/pycbm.cpp", "src/cbm.cpp" ],
include_dirs=[get_pybind_include(), get_pybind_include(user=True)],
include_dirs=[get_pybind_include(), get_pybind_include(user=True), "src"],
extra_compile_args=get_extra_compile_args(),
libraries=get_libraries(),
language="c++11",
)
],
headers=["src/pycbm.h", "src/cbm.h"],
zip_safe=False,
)


@@ -9,14 +9,13 @@
namespace cbm
{
CBM::CBM()
CBM::CBM() : _iterations(0)
{
}
CBM::CBM(const std::vector<std::vector<double>> &f, double y_mean)
CBM::CBM(const std::vector<std::vector<double>> &f, double y_mean) :
_f(f), _y_mean(y_mean), _iterations(0)
{
_f = f;
_y_mean = y_mean;
}
void CBM::update_y_hat_sum(
@@ -64,6 +63,11 @@ namespace cbm
_y_mean = y_mean;
}
size_t CBM::get_iterations() const
{
return _iterations;
}
void CBM::fit(
const uint32_t *y,
const char *x_data,
@@ -119,7 +123,7 @@ namespace cbm
double learning_rate = learning_rate_step_size;
double rmse0 = std::numeric_limits<double>::infinity();
for (size_t t = 0; t < max_iterations; t++, learning_rate += learning_rate_step_size)
for (_iterations = 0; _iterations < max_iterations; _iterations++, learning_rate += learning_rate_step_size)
{
// cap at 1
if (learning_rate > 1)
@@ -128,7 +132,7 @@ namespace cbm
update_y_hat_sum(y_hat_sum, x, n_examples, n_features);
// compute g
// TODO: parallelize
// #pragma omp for // didn't observe improvement
for (size_t j = 0; j < n_features; j++)
{
for (size_t k = 0; k <= x_max[j]; k++)
@@ -171,7 +175,7 @@ namespace cbm
// check for early stopping
// TODO: expose minimum number of rounds
if (t > min_iterations_early_stopping &&
if (_iterations > min_iterations_early_stopping &&
(rmse > rmse0 || (rmse0 - rmse) < epsilon_early_stopping))
{
// TODO: record diagnostics?
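
Replacing the local loop counter t with the _iterations member means the iteration count at which training stopped (whether at max_iterations or via the early-stopping break) survives fitting and is exposed through get_iterations() and the new Python iterations property. A small illustrative check, assuming a fitted model as in the earlier sketch:

# hypothetical diagnostic; "model" is a fitted CBM instance
if model.iterations < model.max_iterations:
    print(f"stopped early after {model.iterations} iterations "
          f"(epsilon_early_stopping={model.epsilon_early_stopping})")
else:
    print("used the full iteration budget")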


@@ -15,6 +15,8 @@ namespace cbm
std::vector<std::vector<double>> _f;
double _y_mean;
size_t _iterations;
void update_y_hat_sum(
std::vector<std::vector<uint64_t>> &y_hat_sum,
std::vector<std::vector<uint8_t>> &x,
@@ -81,5 +83,7 @@ namespace cbm
float get_y_mean() const;
void set_y_mean(float mean);
size_t get_iterations() const;
};
}


@@ -145,6 +145,11 @@ namespace cbm
{
_cbm.set_y_mean(y_mean);
}
size_t PyCBM::get_iterations() const
{
return _cbm.get_iterations();
}
};
PYBIND11_MODULE(cbm_cpp, m)
@@ -157,8 +162,10 @@ PYBIND11_MODULE(cbm_cpp, m)
.def("predict", &cbm::PyCBM::predict)
.def_property("y_mean", &cbm::PyCBM::get_y_mean, &cbm::PyCBM::set_y_mean)
.def_property("weights", &cbm::PyCBM::get_weights, &cbm::PyCBM::set_weights)
.def_property_readonly("iterations", &cbm::PyCBM::get_iterations)
.def(py::pickle(
[](const cbm::PyCBM &p) { // __getstate__
/* TODO: this does not include the feature pre-processing */
/* Return a tuple that fully encodes the state of the object */
return py::make_tuple(p.get_weights(), p.get_y_mean());
},
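
With the py::pickle hooks in place, the C++ core round-trips its weights and y_mean through pickling; as the TODO above notes, the pandas feature pre-processing state is not part of this tuple and lives in the Python wrapper's own attributes. A rough round-trip sketch, assuming a fitted model as above (note that a lambda passed as binning would itself not be picklable, so use an int or a named function when serializing):

import pickle

# "model" is a fitted CBM instance from the earlier sketch
blob = pickle.dumps(model)
restored = pickle.loads(blob)
assert restored.y_mean == model.y_mean  # weights and y_mean are restored via __setstate__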


@@ -43,5 +43,7 @@ namespace cbm
float get_y_mean() const;
void set_y_mean(float mean);
size_t get_iterations() const;
};
}