# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import pytest
import numpy as np
import pandas as pd

from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder

# from interpret.glassbox import ExplainableBoostingRegressor

import lightgbm as lgb
import timeit

import cbm

def test_poisson_random():
    np.random.seed(42)

    # n = 1000  # test w/ 100, 1000, 10000, 100000
    # features = 2
    # bins = 2

    # y_base = np.random.poisson([[1, 3], [7, 20]], (n, features, bins))
    # x = np.random.randint(0, bins, (n, features), dtype='uint8')
    # y = np.zeros(n, dtype='uint32')

    # # TODO: figure out proper take, take_along_axis, ...
    # for i, idx in enumerate(x):
    #     y[i] = y_base[i, idx[0], idx[1]]
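
    # A hedged sketch addressing the TODO above: the per-row lookup in the commented
    # loop can be vectorized with plain integer-array indexing; no take/take_along_axis
    # is required. The sizes below are illustrative only, not from the original test.
    n, features, bins = 100, 2, 2
    y_base = np.random.poisson([[1, 3], [7, 20]], (n, features, bins))
    x = np.random.randint(0, bins, (n, features), dtype='uint8')
    y = y_base[np.arange(n), x[:, 0], x[:, 1]]  # row i -> y_base[i, x[i, 0], x[i, 1]]
    assert y.shape == (n,)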


def test_nyc_bicycle_validate():
    np.random.seed(42)

    # read data
    bic = pd.read_csv('data/nyc_bb_bicyclist_counts.csv')
    bic['Date'] = pd.to_datetime(bic['Date'])
    bic['Weekday'] = bic['Date'].dt.weekday

    y = bic['BB_COUNT'].values.astype('uint32')

    # train/test split
    split = int(len(y) * 0.8)
    train_idx = np.arange(0, split)
    test_idx = np.arange(split + 1, len(y))

    y_train = y[train_idx]
    y_test = y[test_idx]

    test_err_expected = {2: 449.848, 3: 533.465, 4: 503.399, 5: 534.738, 6: 527.854, 7: 529.942, 8: 597.041, 9: 615.646, 10: 560.182}
    train_err_expected = {2: 632.521, 3: 578.816, 4: 588.342, 5: 563.843, 6: 552.219, 7: 547.073, 8: 518.893, 9: 525.629, 10: 523.194}

    for bins in [2, 3, 4, 5, 6, 7, 8, 9, 10]:
        x = np.stack([
            bic['Weekday'].values,
            pd.qcut(bic['HIGH_T'], bins).cat.codes,
            pd.qcut(bic['LOW_T'], bins).cat.codes,
            pd.qcut(bic['PRECIP'], 5, duplicates='drop').cat.codes
        ], axis=1).astype('uint8')

        x_train = x[train_idx]
        x_test = x[test_idx]

        # fit CBM model
        model = cbm.CBM(single_update_per_iteration=False)
        model.fit(x_train, y_train)

        y_pred = model.predict(x_test)
        y_pred_train = model.predict(x_train)

        test_err = mean_squared_error(y_test, y_pred, squared=False)
        train_err = mean_squared_error(y_train, y_pred_train, squared=False)

        assert test_err_expected[bins] == pytest.approx(test_err, abs=1e-2)
        assert train_err_expected[bins] == pytest.approx(train_err, abs=1e-2)


def test_nyc_bicycle():
    np.random.seed(42)

    # read data
    bic = pd.read_csv('data/nyc_bb_bicyclist_counts.csv')
    bic['Date'] = pd.to_datetime(bic['Date'])
    bic['Weekday'] = bic['Date'].dt.weekday

    y = bic['BB_COUNT'].values.astype('uint32')

    # train/test split
    split = int(len(y) * 0.8)
    train_idx = np.arange(0, split)
    test_idx = np.arange(split + 1, len(y))

    y_train = y[train_idx]
    y_test = y[test_idx]

    #### CBM

    # TODO: move to CBM.py and support pandas interface?
    # CBM can only handle categorical features, so continuous columns are
    # quantile-binned (equal-frequency bins) before fitting.

    # def histedges_equalN(x, nbin):
    #     npt = len(x)
    #     return np.interp(np.linspace(0, npt, nbin + 1),
    #                      np.arange(npt),
    #                      np.sort(x))

    # def histedges_equalN(x, nbin):
    #     return pd.qcut(x, nbin)
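
    # Minimal illustration of the binning used below (not part of the original test):
    # pd.qcut produces roughly equal-count bins and .cat.codes yields the integer
    # category indices that CBM consumes.
    print("qcut bin counts (roughly equal):",
          pd.qcut(bic['HIGH_T'], 4).cat.codes.value_counts().sort_index().tolist())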

    print()

    # some hyper-parameter che.. ehm tuning
    for bins in [2, 3, 4, 5, 6, 7, 8, 9, 10]:
        x = np.stack([
            bic['Weekday'].values,
            pd.qcut(bic['HIGH_T'], bins).cat.codes,
            pd.qcut(bic['LOW_T'], bins).cat.codes,
            pd.qcut(bic['PRECIP'], 5, duplicates='drop').cat.codes
        ], axis=1).astype('uint8')

        x_train = x[train_idx]
        x_test = x[test_idx]

        # note: timeit.default_timer() returns a wall-clock reading;
        # timeit.timeit() would only time an empty statement
        start = timeit.default_timer()

        # fit CBM model
        model = cbm.CBM(single_update_per_iteration=False)
        model.fit(x_train, y_train)

        y_pred = model.predict(x_test)
        y_pred_train = model.predict(x_train)

        # y_pred_explain[:, 0]  --> predictions
        # y_pred_explain[:, 1:] --> explanations in terms of multiplicative deviation from the global mean
        y_pred_explain = model.predict(x_test, explain=True)
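
        # Hedged consistency check (not in the original test): if the explanation
        # columns really are multiplicative deviations from the global mean, the
        # prediction should be recoverable as y_mean times the product of the
        # per-feature factors. Printed rather than asserted, since this relies on
        # the explanation layout described in the comment above.
        reconstructed = model.y_mean * np.prod(y_pred_explain[:, 1:], axis=1)
        print("explanations multiply back to predictions:",
              np.allclose(reconstructed, y_pred_explain[:, 0]))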

        # print("x", x_test[:3])
        # print("y", y_pred_explain[:3])
        # print("f", model.weights)

        # validate that plain predictions and explained predictions line up
        # print(np.all(y_pred[:, 0] == y_pred_explain[:,0]))

        print(f"CBM: {mean_squared_error(y_test, y_pred, squared=False):1.4f} (train {mean_squared_error(y_train, y_pred_train, squared=False):1.4f}) bins={bins} {timeit.default_timer() - start}sec")
        print("weights", model.weights)
        print(f"y_mean: {model.y_mean}")

        # print(np.stack((y, y_pred))[:5,].transpose())

        # rebuild a second model from the exported weights and global mean;
        # its predictions must match the fitted model
        model2 = cbm.CBM()
        model2.update(model.weights, model.y_mean)

        y_pred_train2 = model2.predict(x_train)

        # print(y_pred_train[:10])
        # print(y_pred_train2[:10])
        print('Must match: ', np.allclose(y_pred_train, y_pred_train2))
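
        # Hedged persistence sketch (not in the original test): the update() round
        # trip above suggests a fitted CBM is fully described by (weights, y_mean),
        # so it could be persisted with pickle, assuming both are plain picklable
        # Python/NumPy objects.
        import pickle
        blob = pickle.dumps((model.weights, model.y_mean))
        weights_loaded, y_mean_loaded = pickle.loads(blob)
        model3 = cbm.CBM()
        model3.update(weights_loaded, y_mean_loaded)
        print('Pickle round-trip must match: ', np.allclose(y_pred_train, model3.predict(x_train)))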

    #### Poisson Regression

    # one-hot encode the categorical weekday feature; the continuous columns
    # are passed through as-is
    start = timeit.default_timer()

    x = bic['Weekday'].values.reshape((-1, 1)).astype('uint8')

    enc = OneHotEncoder()
    enc.fit(x)
    x = enc.transform(x)

    x = np.hstack([x.toarray(), bic[['HIGH_T', 'LOW_T', 'PRECIP']].values])

    clf = linear_model.PoissonRegressor()
    clf.fit(x[train_idx], y_train)

    y_pred = clf.predict(x[test_idx])
    print(f"Poisson Reg: {mean_squared_error(y_test, y_pred, squared=False):1.4f} {timeit.default_timer() - start}sec")
    # print(np.stack((y, y_pred))[:5,].transpose())

    #### LightGBM

    start = timeit.default_timer()

    # train_data = lgb.Dataset(x, label=y, categorical_feature=[0, 1])
    x = bic[['Weekday', 'HIGH_T', 'LOW_T', 'PRECIP']].values

    train_data = lgb.Dataset(x[train_idx], label=y_train, categorical_feature=[0])
    model = lgb.train({
        'objective': 'poisson',
        'metric': ['poisson', 'rmse'],
        'verbose': -1,
    }, train_data)

    y_pred = model.predict(x[test_idx])
    print(f"LightGBM Reg: {mean_squared_error(y_test, y_pred, squared=False):1.4f} {timeit.default_timer() - start}sec")
    # print(np.stack((y, y_pred))[:5,].transpose())

    #### EBM

    # start = timeit.default_timer()

    # ebm = ExplainableBoostingRegressor(random_state=23, max_bins=8)  # , outer_bags=25, inner_bags=25)
    # ebm.fit(x[train_idx], y_train)

    # y_pred = ebm.predict(x[test_idx,])
    # print(f"EBM: {mean_squared_error(y_test, y_pred, squared=False):1.4f} {timeit.default_timer() - start}sec")