# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import pytest
import numpy as np
import pandas as pd

from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder

# from interpret.glassbox import ExplainableBoostingRegressor

import lightgbm as lgb
import timeit

import cbm

def test_poisson_random():
    np.random.seed(42)

    # n = 1000  # test w/ 100, 1000, 10000, 100000
    # features = 2
    # bins = 2

    # y_base = np.random.poisson([[1, 3], [7, 20]], (n, features, bins))
    # x = np.random.randint(0, bins, (n, features), dtype='uint8')
    # y = np.zeros(n, dtype='uint32')

    # # TODO: figure out proper take, take_along_axis, ...
    # for i, idx in enumerate(x):
    #     y[i] = y_base[i, idx[0], idx[1]]
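
    # A hedged sketch addressing the TODO above: the per-row lookup in the commented
    # loop can be vectorized with plain integer-array indexing; no take/take_along_axis
    # is required. The sizes below are illustrative only, not from the original test.
    n, features, bins = 100, 2, 2
    y_base = np.random.poisson([[1, 3], [7, 20]], (n, features, bins))
    x = np.random.randint(0, bins, (n, features), dtype='uint8')
    y = y_base[np.arange(n), x[:, 0], x[:, 1]]  # row i -> y_base[i, x[i, 0], x[i, 1]]
    assert y.shape == (n,)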


def test_nyc_bicycle_validate():
    np.random.seed(42)

    # read data
    bic = pd.read_csv('data/nyc_bb_bicyclist_counts.csv')
    bic['Date'] = pd.to_datetime(bic['Date'])
    bic['Weekday'] = bic['Date'].dt.weekday

    y = bic['BB_COUNT'].values.astype('uint32')

    # train/test split
    split = int(len(y) * 0.8)
    train_idx = np.arange(0, split)
    test_idx = np.arange(split + 1, len(y))

    y_train = y[train_idx]
    y_test = y[test_idx]

    test_err_expected = {2: 449.848, 3: 533.465, 4: 503.399, 5: 534.738, 6: 527.854, 7: 529.942, 8: 597.041, 9: 615.646, 10: 560.182}
    train_err_expected = {2: 632.521, 3: 578.816, 4: 588.342, 5: 563.843, 6: 552.219, 7: 547.073, 8: 518.893, 9: 525.629, 10: 523.194}

    for bins in [2, 3, 4, 5, 6, 7, 8, 9, 10]:
        x = np.stack([
            bic['Weekday'].values,
            pd.qcut(bic['HIGH_T'], bins).cat.codes,
            pd.qcut(bic['LOW_T'], bins).cat.codes,
            pd.qcut(bic['PRECIP'], 5, duplicates='drop').cat.codes
        ], axis=1).astype('uint8')

        x_train = x[train_idx]
        x_test = x[test_idx]

        # fit CBM model
        model = cbm.CBM(single_update_per_iteration=False)
        model.fit(x_train, y_train)

        y_pred = model.predict(x_test)
        y_pred_train = model.predict(x_train)

        test_err = mean_squared_error(y_test, y_pred, squared=False)
        train_err = mean_squared_error(y_train, y_pred_train, squared=False)

        assert test_err_expected[bins] == pytest.approx(test_err, abs=1e-2)
        assert train_err_expected[bins] == pytest.approx(train_err, abs=1e-2)


def test_nyc_bicycle():
    np.random.seed(42)

    # read data
    bic = pd.read_csv('data/nyc_bb_bicyclist_counts.csv')
    bic['Date'] = pd.to_datetime(bic['Date'])
    bic['Weekday'] = bic['Date'].dt.weekday

    y = bic['BB_COUNT'].values.astype('uint32')

    # train/test split
    split = int(len(y) * 0.8)
    train_idx = np.arange(0, split)
    test_idx = np.arange(split + 1, len(y))

    y_train = y[train_idx]
    y_test = y[test_idx]

    #### CBM

    # TODO: move to CBM.py and support pandas interface?
    # CBM can only handle categorical features, so continuous columns are
    # quantile-binned (equal-frequency bins) before fitting.

    # def histedges_equalN(x, nbin):
    #     npt = len(x)
    #     return np.interp(np.linspace(0, npt, nbin + 1),
    #                      np.arange(npt),
    #                      np.sort(x))

    # def histedges_equalN(x, nbin):
    #     return pd.qcut(x, nbin)
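
    # Minimal illustration of the binning used below (not part of the original test):
    # pd.qcut produces roughly equal-count bins and .cat.codes yields the integer
    # category indices that CBM consumes.
    print("qcut bin counts (roughly equal):",
          pd.qcut(bic['HIGH_T'], 4).cat.codes.value_counts().sort_index().tolist())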

    print()

    # some hyper-parameter che.. ehm tuning
    for bins in [2, 3, 4, 5, 6, 7, 8, 9, 10]:
        x = np.stack([
            bic['Weekday'].values,
            pd.qcut(bic['HIGH_T'], bins).cat.codes,
            pd.qcut(bic['LOW_T'], bins).cat.codes,
            pd.qcut(bic['PRECIP'], 5, duplicates='drop').cat.codes
        ], axis=1).astype('uint8')

        x_train = x[train_idx]
        x_test = x[test_idx]

        # note: timeit.default_timer() returns a wall-clock reading;
        # timeit.timeit() would only time an empty statement
        start = timeit.default_timer()

        # fit CBM model
        model = cbm.CBM(single_update_per_iteration=False)
        model.fit(x_train, y_train)

        y_pred = model.predict(x_test)
        y_pred_train = model.predict(x_train)

        # y_pred_explain[:, 0]  --> predictions
        # y_pred_explain[:, 1:] --> explanations in terms of multiplicative deviation from the global mean
        y_pred_explain = model.predict(x_test, explain=True)
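
        # Hedged consistency check (not in the original test): if the explanation
        # columns really are multiplicative deviations from the global mean, the
        # prediction should be recoverable as y_mean times the product of the
        # per-feature factors. Printed rather than asserted, since this relies on
        # the explanation layout described in the comment above.
        reconstructed = model.y_mean * np.prod(y_pred_explain[:, 1:], axis=1)
        print("explanations multiply back to predictions:",
              np.allclose(reconstructed, y_pred_explain[:, 0]))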

        # print("x", x_test[:3])
        # print("y", y_pred_explain[:3])
        # print("f", model.weights)

        # validate that plain predictions and explained predictions line up
        # print(np.all(y_pred[:, 0] == y_pred_explain[:,0]))

        print(f"CBM: {mean_squared_error(y_test, y_pred, squared=False):1.4f} (train {mean_squared_error(y_train, y_pred_train, squared=False):1.4f}) bins={bins} {timeit.default_timer() - start}sec")
        print("weights", model.weights)
        print(f"y_mean: {model.y_mean}")

        # print(np.stack((y, y_pred))[:5,].transpose())

        # rebuild a second model from the exported weights and global mean;
        # its predictions must match the fitted model
        model2 = cbm.CBM()
        model2.update(model.weights, model.y_mean)

        y_pred_train2 = model2.predict(x_train)

        # print(y_pred_train[:10])
        # print(y_pred_train2[:10])
        print('Must match: ', np.allclose(y_pred_train, y_pred_train2))
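
        # Hedged persistence sketch (not in the original test): the update() round
        # trip above suggests a fitted CBM is fully described by (weights, y_mean),
        # so it could be persisted with pickle, assuming both are plain picklable
        # Python/NumPy objects.
        import pickle
        blob = pickle.dumps((model.weights, model.y_mean))
        weights_loaded, y_mean_loaded = pickle.loads(blob)
        model3 = cbm.CBM()
        model3.update(weights_loaded, y_mean_loaded)
        print('Pickle round-trip must match: ', np.allclose(y_pred_train, model3.predict(x_train)))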

    #### Poisson Regression

    # one-hot encode the categorical weekday feature; the continuous columns
    # are passed through as-is
    start = timeit.default_timer()

    x = bic['Weekday'].values.reshape((-1, 1)).astype('uint8')

    enc = OneHotEncoder()
    enc.fit(x)
    x = enc.transform(x)

    x = np.hstack([x.toarray(), bic[['HIGH_T', 'LOW_T', 'PRECIP']].values])

    clf = linear_model.PoissonRegressor()
    clf.fit(x[train_idx], y_train)

    y_pred = clf.predict(x[test_idx])
    print(f"Poisson Reg: {mean_squared_error(y_test, y_pred, squared=False):1.4f} {timeit.default_timer() - start}sec")
    # print(np.stack((y, y_pred))[:5,].transpose())

    #### LightGBM

    start = timeit.default_timer()

    # train_data = lgb.Dataset(x, label=y, categorical_feature=[0, 1])
    x = bic[['Weekday', 'HIGH_T', 'LOW_T', 'PRECIP']].values

    train_data = lgb.Dataset(x[train_idx], label=y_train, categorical_feature=[0])
    model = lgb.train({
        'objective': 'poisson',
        'metric': ['poisson', 'rmse'],
        'verbose': -1,
    }, train_data)

    y_pred = model.predict(x[test_idx])
    print(f"LightGBM Reg: {mean_squared_error(y_test, y_pred, squared=False):1.4f} {timeit.default_timer() - start}sec")
    # print(np.stack((y, y_pred))[:5,].transpose())

    #### EBM

    # start = timeit.default_timer()

    # ebm = ExplainableBoostingRegressor(random_state=23, max_bins=8)  # , outer_bags=25, inner_bags=25)
    # ebm.fit(x[train_idx], y_train)

    # y_pred = ebm.predict(x[test_idx,])
    # print(f"EBM: {mean_squared_error(y_test, y_pred, squared=False):1.4f} {timeit.default_timer() - start}sec")