UPDATE: sweeping complete, and notebooks moved to NB directory

Ali Zaidi 2021-04-22 17:08:11 -07:00
Parent e269e41832
Commit d0da05d80c
22 changed files with 6274 additions and 1947 deletions

1
.gitignore vendored
View file

@@ -74,6 +74,7 @@ tmp/
# hydra outputs
outputs/
multirun/
mlruns/
# pytest reports
.coverage

136
base.py
View file

@@ -1,11 +1,11 @@
import abc
import copy
import logging
import os
import pathlib
import pickle
import sys
from typing import List, Tuple, Union
import copy
from collections import OrderedDict
from typing import Dict, List, Tuple, Union
import matplotlib
import matplotlib.pyplot as plt
@@ -13,21 +13,22 @@ import numpy as np
import pandas as pd
from natsort import natsorted
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import (
GridSearchCV,
GroupShuffleSplit,
RandomizedSearchCV,
TimeSeriesSplit,
PredefinedSplit,
)
from sklearn.preprocessing import StandardScaler
from collections import OrderedDict
from tune_sklearn import TuneSearchCV
from loaders import CsvReader
import mlflow
logger = logging.getLogger(__name__)
matplotlib.rcParams["figure.figsize"] = [12, 10]
# Add stdout handler, with level INFO
console = logging.StreamHandler(sys.stdout)
console.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(name)-13s: %(levelname)-8s %(message)s")
console.setFormatter(formatter)
logging.getLogger("datamodeler").addHandler(console)
class BaseModel(abc.ABC):
def __init__(self, log_dirs: str = "logs", model=None):
@@ -557,6 +558,117 @@ class BaseModel(abc.ABC):
return X_grouped, y_grouped
def sweep(
self,
params: Dict,
X,
y,
search_algorithm: str = "bayesian",
num_trials: int = 3,
scoring_func: str = "r2",
early_stopping: bool = False,
results_csv_path: str = "outputs/results.csv",
splitting_criteria: str = "CV",
test_indices: Union[None, List[int]] = None,
num_splits: int = 5,
) -> pd.DataFrame:
if self.scale_data:
X, y = self.scalar(X, y)
if splitting_criteria.lower() == "cv":
cv = None
elif splitting_criteria.lower() == "timeseries":
cv = TimeSeriesSplit(n_splits=num_splits)
elif splitting_criteria.lower() == "grouped":
cv = GroupShuffleSplit(n_splits=num_splits)
elif splitting_criteria.lower() == "fixed":
if not isinstance(test_indices, list):
raise ValueError("fixed split used but no test_indices provided...")
cv = PredefinedSplit(test_fold=test_indices)
else:
raise ValueError(
f"Unknown splitting criteria provided: {splitting_criteria}, should be one of [cv, timeseries, grouped, fixed]"
)
# early stopping only supported for learners that have a
# `partial_fit` method
# start mlflow auto-logging
mlflow.sklearn.autolog()
if search_algorithm.lower() == "bohb":
early_stopping = True
if search_algorithm.lower() in ["bohb", "bayesian", "hyperopt", "optuna"]:
search = TuneSearchCV(
self.model,
params,
search_optimization=search_algorithm,
cv=cv,
n_trials=num_trials,
early_stopping=early_stopping,
scoring=scoring_func,
loggers=["csv", "tensorboard"],
)
elif search_algorithm == "grid":
search = GridSearchCV(
self.model, param_grid=params, refit=True, cv=cv, scoring=scoring_func,
)
elif search_algorithm == "random":
search = RandomizedSearchCV(
self.model,
param_distributions=params,
refit=True,
cv=cv,
scoring=scoring_func,
)
else:
raise NotImplementedError(
"Search algorithm should be one of grid, hyperopt, bohb, optuna, bayesian, or random"
)
with mlflow.start_run() as run:
search.fit(X, y)
self.model = search.best_estimator_
results_df = pd.DataFrame(search.cv_results_)
if not pathlib.Path(results_csv_path).parent.exists():
pathlib.Path(results_csv_path).parent.mkdir(exist_ok=True, parents=True)
logger.info(f"Saving sweeping results to {results_csv_path}")
results_df.to_csv(results_csv_path)
logger.info(f"Best hyperparams: {search.best_params_}")
return results_df
def plot_parallel_coords(results_df: pd.DataFrame):
import plotly.express as px
cols_keep = [col for col in results_df if "param_" in col]
cols_keep += ["mean_test_score"]
results_df = results_df[cols_keep]
# want to convert object columns to type float
results_df = results_df.apply(pd.to_numeric, errors="ignore", downcast="float")
fig = px.parallel_coordinates(
results_df,
color="mean_test_score",
labels=dict(
zip(
list(results_df.columns),
list(["_".join(i.split("_")[1:]) for i in results_df.columns]),
)
),
color_continuous_scale=px.colors.diverging.Earth,
# color_continuous_midpoint=27,
)
fig.show()
if __name__ == "__main__":
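For orientation, a minimal usage sketch of the consolidated sweep path (not part of the commit): the SKModel subclass, its build_model arguments, the synthetic data, and the estimator__-prefixed grid are assumptions inferred from the other changes in this diff.

import numpy as np
from base import plot_parallel_coords
from skmodels import SKModel

# synthetic stand-in for the CSV-loaded dataset
rng = np.random.default_rng(0)
X = rng.normal(size=(300, 4))
y = X @ rng.normal(size=(4, 2)) + 0.05 * rng.normal(size=(300, 2))

model = SKModel()
model.build_model(model_type="SVR", scale_data=False, fit_separate=False)

# params address the estimator wrapped inside MultiOutputRegressor,
# hence the estimator__ prefix used throughout the sweep configs
params = {"estimator__C": [0.5, 1.0, 1.5], "estimator__kernel": ["linear", "rbf"]}

results_df = model.sweep(
    params=params,
    X=X,
    y=y,
    search_algorithm="grid",          # grid avoids the tune-sklearn backends
    scoring_func="r2",
    splitting_criteria="timeseries",
    results_csv_path="outputs/results.csv",
)
plot_parallel_coords(results_df)      # interactive plotly view of the results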

View file

@@ -10,3 +10,4 @@ data:
max_rows: 1000
scale_data: True
diff_state: False
test_perc: 0.25

View file

@@ -10,3 +10,4 @@ data:
max_rows: -1
scale_data: True
diff_state: False
test_perc: 0.25

View file

@@ -36,3 +36,4 @@ data:
max_rows: -1
scale_data: True
diff_state: False
test_perc: 0.25

View file

@@ -10,3 +10,4 @@ data:
max_rows: 230000 # 251000 used in LNN to generate 232432 samples of which 200K is trained
scale_data: True
diff_state: True
test_perc: 0.25

View file

@@ -20,3 +20,4 @@ data:
max_rows: -1
scale_data: True
diff_state: False
test_perc: 0.25

View file

@@ -3,11 +3,16 @@ model:
build_params:
model_type: SVR
scale_data: True
fit_separate: True
fit_separate: False
saver:
filename: models/moab/SVR_model
sweep:
run: False
split_strategy: timeseries
search_algorithm: bayesian
num_trials: 3
num_trials: 30
scoring_func: r2
results_csv_path: sklearn-SVR/search_results.csv
params:
estimator__C: [1, 0.5, 1.5]
estimator__kernel: ["linear", "poly", "rbf"]

31
conf/model/lightgbm.yaml Normal file
View file

@@ -0,0 +1,31 @@
model:
name: lightgbm
build_params:
model_type: lightgbm
fit_separate: False
halt_model: False
objective: reg:squarederror
scale_data: True
saver:
filename: models/boost/lightgbm
sweep:
run: True
# can be any of bayesian, bohb, hyperopt, optuna, grid or random
search_algorithm: bayesian
split_strategy: timeseries
num_trials: 50
scoring_func: r2
early_stopping: False
results_csv_path: lightgbm/search_results.csv
params:
# all parameters should be prefixed with estimator__
# see the list of available parameters:
# https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMModel.html
estimator__max_depth: [-1, 5, 10, 25]
estimator__learning_rate: [0.01, 0.05, 0.1, 0.25]
estimator__num_leaves: [10, 31, 50]
estimator__n_estimators: [10, 100, 250]
estimator__boosting_type: ["gbdt", "dart", "goss"]
estimator__reg_alpha: [0, 0.01, 0.05, 0.1]
estimator__reg_lambda: [0, 0.01, 0.05, 0.1]
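As a quick sanity check of how this sweep block is consumed, a sketch (assuming only that the file lives at the path shown above and following the OmegaConf access pattern used in the trainer changes later in this commit):

from omegaconf import OmegaConf

# load the sweep block and turn params into a plain dict of lists,
# which is what the sklearn / tune-sklearn search objects expect
cfg = OmegaConf.load("conf/model/lightgbm.yaml")
sweep_cfg = cfg["model"]["sweep"]
params = OmegaConf.to_container(sweep_cfg["params"])

print(sweep_cfg["search_algorithm"], sweep_cfg["num_trials"])  # bayesian 50
print(params["estimator__max_depth"])                          # [-1, 5, 10, 25]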

View file

@@ -20,9 +20,12 @@ model:
filename: models/torch_model
sweep:
run: True
search_algorithm: grid
num_trials: 3
search_algorithm: bayesian
split_strategy: timeseries
num_trials: 50
scoring_func: r2
results_csv_path: torch/search_results.csv
early_stopping: False
params:
lr: [0.01, 0.02]
module__num_units: [10, 50]

View file

@@ -4,22 +4,24 @@ model:
model_type: xgboost
fit_separate: False
halt_model: False
num_trees: 50
n_estimators: 100
objective: reg:squarederror
step_size: 0.3
device: cpu
batch_size: 128
gamma: 0
learning_rate: 0.3
scale_data: True
max_bin: 256
max_depth: 6
saver:
filename: models/boost/moab/xgboost_model
sweep:
run: True
search_algorithm: random
num_trials: 3
run: False
split_strategy: timeseries
search_algorithm: bayesian
num_trials: 25
scoring_func: r2
early_stopping: False
results_csv_path: xgboost_gridsearch/search_results.csv
params:
# see list of parameters: https://xgboost.readthedocs.io/en/latest/parameter.html
estimator__max_depth: [1, 3, 5, 10]
estimator__gamma: [0, 0.5, 1, 5]
estimator__subsample: [0.1, 0.5, 1]
estimator__eta: [0.3, 0.1, 0.5]

File diff suppressed because one or more lines are too long

View file

@@ -1,15 +1,14 @@
import logging
import os
logging.basicConfig()
logging.root.setLevel(logging.INFO)
logger = logging.getLogger("datamodeler")
import pathlib
import hydra
import numpy as np
from math import floor
from omegaconf import DictConfig, ListConfig, OmegaConf
from model_loader import available_models
logger = logging.getLogger("datamodeler")
dir_path = os.path.dirname(os.path.realpath(__file__))
@@ -19,6 +18,7 @@ def main(cfg: DictConfig) -> None:
logger.info("Configuration: ")
logger.info(f"\n{OmegaConf.to_yaml(cfg)}")
# for readability, read common data args into variables
input_cols = cfg["data"]["inputs"]
output_cols = cfg["data"]["outputs"]
augmented_cols = cfg["data"]["augmented_cols"]
@@ -27,10 +27,15 @@ def main(cfg: DictConfig) -> None:
iteration_col = cfg["data"]["iteration_col"]
dataset_path = cfg["data"]["path"]
max_rows = cfg["data"]["max_rows"]
test_perc = cfg["data"]["test_perc"]
# common model args
save_path = cfg["model"]["saver"]["filename"]
model_name = cfg["model"]["name"]
delta_state = cfg["data"]["diff_state"]
run_sweep = cfg["model"]["sweep"]["run"]
split_strategy = cfg["model"]["sweep"]["split_strategy"]
results_csv_path = cfg["model"]["sweep"]["results_csv_path"]
Model = available_models[model_name]
@@ -58,6 +63,24 @@ def main(cfg: DictConfig) -> None:
max_rows=max_rows,
diff_state=delta_state,
)
logger.info(
f"Saving last {test_perc * 100}% for test, using first {(1 - test_perc) * 100}% for training/sweeping"
)
train_id_end = floor(X.shape[0] * (1 - test_perc))
X_train, y_train = X[:train_id_end,], y[:train_id_end,]
X_test, y_test = X[train_id_end:,], y[train_id_end:,]
# save training and test sets
save_data_path = os.path.join(os.getcwd(), "data")
if not os.path.exists(save_data_path):
pathlib.Path(save_data_path).mkdir(parents=True, exist_ok=True)
logger.info(f"Saving data to {os.path.abspath(save_data_path)}")
np.save(os.path.join(save_data_path, "x_train.npy"), X_train)
np.save(os.path.join(save_data_path, "y_train.npy"), y_train)
np.save(os.path.join(save_data_path, "x_test.npy"), X_test)
np.save(os.path.join(save_data_path, "y_test.npy"), y_test)
logger.info("Building model...")
model.build_model(**cfg["model"]["build_params"])
@@ -65,17 +88,20 @@ def main(cfg: DictConfig) -> None:
params = OmegaConf.to_container(cfg["model"]["sweep"]["params"])
logger.info(f"Sweeping with parameters: {params}")
model.sweep(
sweep_df = model.sweep(
params=params,
X=X,
y=y,
X=X_train,
y=y_train,
search_algorithm=cfg["model"]["sweep"]["search_algorithm"],
num_trials=cfg["model"]["sweep"]["num_trials"],
scoring_func=cfg["model"]["sweep"]["scoring_func"],
results_csv_path=results_csv_path,
splitting_criteria=split_strategy,
)
logger.info(f"Sweep results: {sweep_df}")
else:
logger.info("Fitting model...")
model.fit(X, y)
model.fit(X_train, y_train)
logger.info(f"Saving model to {save_path}")
model.save_model(filename=save_path)
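The new chronological hold-out is plain index arithmetic; a self-contained sketch with toy arrays (shapes and values are illustrative only):

from math import floor
import numpy as np

# the last test_perc of rows is reserved for testing, the first
# (1 - test_perc) for fitting/sweeping, preserving time order
test_perc = 0.25
X = np.arange(20).reshape(10, 2)
y = np.arange(10)

train_id_end = floor(X.shape[0] * (1 - test_perc))
X_train, y_train = X[:train_id_end], y[:train_id_end]
X_test, y_test = X[train_id_end:], y[train_id_end:]

print(train_id_end)                # 7 -> first 7 rows train, last 3 test
print(X_test.shape, y_test.shape)  # (3, 2) (3,)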

File diff suppressed because one or more lines are too long

View file

@@ -7,7 +7,6 @@ import numpy as np
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.exceptions import NotFittedError
from tune_sklearn import TuneSearchCV
from xgboost import XGBRegressor, XGBClassifier
from base import BaseModel
@@ -26,17 +25,19 @@ class GBoostModel(BaseModel):
halt_model: bool = False,
objective: str = "reg:squarederror",
fit_separate: bool = False,
num_trees: int = 50,
step_size: float = 0.3,
device: str = "cpu",
batch_size: int = 128,
gamma: int = 0,
max_bin: int = 256,
n_estimators: int = 100,
learning_rate: float = 0.3,
max_depth: int = 6,
):
self.scale_data = scale_data
if model_type == "xgboost":
self.single_model = XGBRegressor(objective=objective)
self.single_model = XGBRegressor(
objective=objective,
n_estimators=n_estimators,
max_depth=max_depth,
learning_rate=learning_rate,
)
elif model_type == "lightgbm":
self.single_model = LGBMRegressor()
else:
@@ -182,37 +183,6 @@ class GBoostModel(BaseModel):
# open(os.path.join(path_name, "yscalar.pkl"), "rb")
# )
def sweep(
self,
params: Dict,
X,
y,
search_algorithm: str = "bayesian",
num_trials: int = 3,
scoring_func: str = "r2",
):
if self.scale_data:
X, y = self.scalar(X, y)
# early stopping only supported for learners that have a
# `partial_fit` method
tune_search = TuneSearchCV(
self.model,
param_distributions=params,
n_trials=num_trials,
search_optimization=search_algorithm,
early_stopping=False,
scoring=scoring_func,
)
tune_search.fit(X, y)
self.model = tune_search.best_estimator_
logger.info(f"Best hyperparams: {tune_search.best_params}")
return tune_search
if __name__ == "__main__":
@@ -240,5 +210,5 @@ if __name__ == "__main__":
# gsxgbm = GridSearchCV(mgbm, param_grid=params, scoring="r2")
# gsxgbm.fit(X, y)
tunexgbm = TuneSearchCV(xgm.model, param_distributions=params, scoring="r2")
tunexgbm.fit(X, y)
# tunexgbm = TuneSearchCV(xgm.model, param_distributions=params, scoring="r2")
# tunexgbm.fit(X, y)
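In plain sklearn/xgboost terms, the updated build_params correspond roughly to the objects below; a sketch using the default values from the new xgboost config and synthetic data, not the repo's exact wiring.

import numpy as np
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

# n_estimators / learning_rate / max_depth replace the old num_trees /
# step_size build_params; fit_separate: False means one regressor per output
single_model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=100,
    max_depth=6,
    learning_rate=0.3,
)
model = MultiOutputRegressor(single_model)

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
y = rng.normal(size=(200, 2))
model.fit(X, y)
print(model.predict(X[:3]).shape)  # (3, 2)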

View file

@@ -1,6 +1,6 @@
import os
import pandas as pd
from typing import List, Tuple, Union
from typing import List
import logging
logger = logging.getLogger("data_loaders")

4183
notebooks/01-training.ipynb Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

613
notebooks/eda.ipynb Normal file

File diff suppressed because one or more lines are too long

View file

@@ -1,39 +1,21 @@
import copy
import logging
import os
import pathlib
import pickle
from typing import Dict, Tuple
import numpy as np
import copy
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.multioutput import MultiOutputRegressor
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from natsort import natsorted
from tune_sklearn import TuneSearchCV
from tune_sklearn import TuneGridSearchCV
from sklearn.datasets import load_digits
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from ray.tune.sklearn import TuneGridSearchCV, TuneSearchCV
from base import BaseModel
import logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# param_grid = {"learning_rate": (0.01, 0.1), "n_estimators": (25, 250), "subsample": [False, True]}
class SKModel(BaseModel):
def build_model(
@@ -44,38 +26,26 @@ class SKModel(BaseModel):
):
self.scale_data = scale_data
self.model_type = model_type
self.fit_separate = fit_separate
self.separate_models = fit_separate
if model_type == "linear_model":
self.model = linear_model.LinearRegression()
elif model_type == "SVR":
self.model = SVR(C=1.0, epsilon=0.2)
elif model_type == "GradientBoostingRegressor":
self.model = GradientBoostingRegressor()
elif model_type.lower() == "sgdregressor":
self.model = SGDRegressor()
else:
raise NotImplementedError("unknown model selected")
if not self.separate_models:
self.single_model = self.model
self.model = MultiOutputRegressor(self.single_model)
def fit(self, X, y):
if self.scale_data:
X, y = self.scalar(X, y)
if (
self.model_type == "GradientBoostingRegressor"
and self.fit_separate == False
):
fit_separate = True
logger.warn(
"Note: fit_separate must be set toTrue for GradientBoostingRegressor, but False was provided. Changing to True"
)
if self.model_type == "SVR" and self.fit_separate == False:
fit_separate = True
logger.warn(
"Note: fit_separate must be set to True for SVR, but False was provided. Changing to True"
)
self.separate_models = self.fit_separate
if self.separate_models:
self.models = []
for i in range(y.shape[1]):
@@ -87,9 +57,7 @@ class SKModel(BaseModel):
try:
self.model.fit(X, y)
except ValueError:
logger.info(
f"fit separate should be True for model type of {self.model_type}"
)
logger.info(f"Unable to fit model of type {type(self.model)}")
def predict(self, X):
@@ -168,22 +136,6 @@ class SKModel(BaseModel):
# self.scale_data = scale_data
def sweep(self, X, y, params: Dict = None):
if not params:
raise NotImplementedError
tune_search = TuneSearchCV(
self.model,
param_distributions=params,
n_trials=3,
# early_stopping=True,
# use_gpu=True
)
tune_search.fit(X, y)
return tune_search
if __name__ == "__main__":
@@ -202,7 +154,7 @@ if __name__ == "__main__":
)
skm.build_model(model_type="linear_model")
skm.fit(X, y, fit_separate=False)
skm.fit(X, y)
logger.info(X)
yhat = skm.predict(X)
@@ -216,14 +168,14 @@ if __name__ == "__main__":
)
skm.build_model(model_type="SVR")
skm.fit(X, y, fit_separate=False)
skm.fit(X, y)
logger.info(X)
yhat = skm.predict(X)
skm.save_model(dir_path="models/lsvc_pole_multi.pkl")
skm.build_model(model_type="GradientBoostingRegressor")
skm.fit(X, y, fit_separate=False)
skm.fit(X, y)
logger.info(X)
yhat = skm.predict(X)
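For context, the effect of the new fit_separate: False path in build_model is sklearn's standard multi-output wrapping; a minimal stand-alone sketch with SVR and synthetic data:

import numpy as np
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR

# one SVR per output column, managed by MultiOutputRegressor, replacing the
# removed manual fit-one-model-per-column branch
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
y = rng.normal(size=(100, 2))

model = MultiOutputRegressor(SVR(C=1.0, epsilon=0.2))
model.fit(X, y)
print(model.predict(X[:2]))  # two rows, one prediction per output column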

View file

@@ -33,7 +33,6 @@ def test_fit():
torch_model.fit(X, y)
@pytest.mark.skip(reason="Long test, skipping for CI speed")
def test_sweep():
torch_model.build_model()

View file

@@ -1,15 +1,26 @@
import os
import pathlib
from typing import Dict
import pickle
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
from skorch import NeuralNetRegressor
from skorch.callbacks import LRScheduler
from torch.optim.lr_scheduler import CyclicLR
from tune_sklearn import TuneSearchCV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from base import BaseModel
import logging
from rich.logging import RichHandler
FORMAT = "%(message)s"
logging.basicConfig(
level="INFO", format=FORMAT, datefmt="[%X]", handlers=[RichHandler(markup=True)],
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
@@ -139,9 +150,14 @@ class PyTorchModel(BaseModel):
search_algorithm: str = "bayesian",
num_trials: int = 3,
scoring_func: str = "r2",
early_stopping: bool = False,
results_csv_path: str = "outputs/results.csv",
):
from tune_sklearn import TuneSearchCV
start_dir = str(pathlib.Path(os.getcwd()).parent)
module_dir = str(pathlib.Path(__file__).parent)
# temporarily change directory to file directory and then reset
os.chdir(module_dir)
if self.scale_data:
X, y = self.scalar(X, y)
@@ -151,13 +167,18 @@ class PyTorchModel(BaseModel):
torch.tensor(y).float().to(device=self.device),
)
if search_algorithm == "bayesian" or search_algorithm == "hyperopt":
if search_algorithm.lower() == "bohb":
early_stopping = True
if search_algorithm.lower() in ["bohb", "bayesian", "hyperopt", "optuna"]:
search = TuneSearchCV(
self.model,
params,
search_optimization=search_algorithm,
n_trials=num_trials,
early_stopping=True,
early_stopping=early_stopping,
scoring=scoring_func,
)
elif search_algorithm == "grid":
@@ -178,13 +199,27 @@ class PyTorchModel(BaseModel):
)
else:
raise NotImplementedError(
"Search algorithm should be one of gridsearch, hyperopt, bayesian, or randomsearch"
"Search algorithm should be one of grid, hyperopt, bohb, optuna, bayesian, or random"
)
search.fit(X, y)
self.model = search.best_estimator_
# set path back to initial
os.chdir(start_dir)
results_df = pd.DataFrame(search.cv_results_)
logger.info(f"Best hyperparams: {search.best_params_}")
return search
if not pathlib.Path(results_csv_path).parent.exists():
pathlib.Path(results_csv_path).parent.mkdir(exist_ok=True, parents=True)
logger.info(f"Saving sweeping results to {results_csv_path}")
results_df.to_csv(results_csv_path)
cols_keep = [col for col in results_df if "param_" in col]
cols_keep += ["mean_test_score"]
results_df = results_df[cols_keep]
return results_df
if __name__ == "__main__":
@@ -198,7 +233,9 @@ if __name__ == "__main__":
pytorch_model.build_model()
pytorch_model.fit(X, y)
# tune tests
# params = {"lr": [0.01, 0.02], "module__num_units": [10, 50]}
# from tune_sklearn import TuneSearchCV, TuneGridSearchCV
# params = {"lr": [0.01, 0.02], "modu
# gs = TuneGridSearchCV(pytorch_model.model, params, scoring="neg_mean_squared_error")
# gs.fit(torch.tensor(X).float(), torch.tensor(y).float())
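The commented-out tune test maps onto a small skorch + sklearn grid search; a runnable sketch where the tiny module, data, and epoch count are placeholders rather than the repo's network:

import numpy as np
from torch import nn
from skorch import NeuralNetRegressor
from sklearn.model_selection import GridSearchCV

class TinyRegressor(nn.Module):
    # stand-in module exposing num_units so module__num_units is tunable
    def __init__(self, num_units=10):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(4, num_units), nn.ReLU(), nn.Linear(num_units, 2)
        )

    def forward(self, X):
        return self.net(X)

net = NeuralNetRegressor(TinyRegressor, max_epochs=5, lr=0.01, verbose=0)
params = {"lr": [0.01, 0.02], "module__num_units": [10, 50]}

X = np.random.randn(128, 4).astype(np.float32)
y = np.random.randn(128, 2).astype(np.float32)

gs = GridSearchCV(net, params, scoring="neg_mean_squared_error", cv=3)
gs.fit(X, y)
print(gs.best_params_)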