UPDATE: sweeping complete, and notebooks moved to NB directory
Parent: e269e41832
Commit: d0da05d80c
@@ -74,6 +74,7 @@ tmp/
# hydra outputs
outputs/
multirun/
mlruns/

# pytest reports
.coverage
base.py (136 changed lines)
@@ -1,11 +1,11 @@
import abc
import copy
import logging
import os
import pathlib
import pickle
import sys
from typing import List, Tuple, Union
import copy
from collections import OrderedDict
from typing import Dict, List, Tuple, Union

import matplotlib
import matplotlib.pyplot as plt
@@ -13,21 +13,22 @@ import numpy as np
import pandas as pd
from natsort import natsorted
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import (
    GridSearchCV,
    GroupShuffleSplit,
    RandomizedSearchCV,
    TimeSeriesSplit,
    PredefinedSplit,
)
from sklearn.preprocessing import StandardScaler

from collections import OrderedDict
from tune_sklearn import TuneSearchCV

from loaders import CsvReader
import mlflow

logger = logging.getLogger(__name__)
matplotlib.rcParams["figure.figsize"] = [12, 10]

# Add stdout handler
console = logging.StreamHandler(sys.stdout)
console.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(name)-13s: %(levelname)-8s %(message)s")
console.setFormatter(formatter)
logging.getLogger("datamodeler").addHandler(console)


class BaseModel(abc.ABC):
    def __init__(self, log_dirs: str = "logs", model=None):
@@ -557,6 +558,117 @@ class BaseModel(abc.ABC):

        return X_grouped, y_grouped

    def sweep(
        self,
        params: Dict,
        X,
        y,
        search_algorithm: str = "bayesian",
        num_trials: int = 3,
        scoring_func: str = "r2",
        early_stopping: bool = False,
        results_csv_path: str = "outputs/results.csv",
        splitting_criteria: str = "CV",
        test_indices: Union[None, List[int]] = None,
        num_splits: int = 5,
    ) -> pd.DataFrame:

        if self.scale_data:
            X, y = self.scalar(X, y)

        if splitting_criteria.lower() == "cv":
            cv = None
        elif splitting_criteria.lower() == "timeseries":
            cv = TimeSeriesSplit(n_splits=num_splits)
        elif splitting_criteria.lower() == "grouped":
            cv = GroupShuffleSplit(n_splits=num_splits)
        elif splitting_criteria.lower() == "fixed":
            if not isinstance(test_indices, list):
                raise ValueError("fixed split used but no test_indices provided...")
            cv = PredefinedSplit(test_fold=test_indices)
        else:
            raise ValueError(
                f"Unknown splitting criteria provided: {splitting_criteria}, should be one of [cv, timeseries, grouped, fixed]"
            )

        # early stopping only supported for learners that have a
        # `partial_fit` method

        # start mlflow auto-logging
        mlflow.sklearn.autolog()

        if search_algorithm.lower() == "bohb":
            early_stopping = True

        if search_algorithm.lower() in ["bohb", "bayesian", "hyperopt", "optuna"]:
            search = TuneSearchCV(
                self.model,
                params,
                search_optimization=search_algorithm,
                cv=cv,
                n_trials=num_trials,
                early_stopping=early_stopping,
                scoring=scoring_func,
                loggers=["csv", "tensorboard"],
            )
        elif search_algorithm == "grid":
            search = GridSearchCV(
                self.model, param_grid=params, refit=True, cv=cv, scoring=scoring_func,
            )
        elif search_algorithm == "random":
            search = RandomizedSearchCV(
                self.model,
                param_distributions=params,
                refit=True,
                cv=cv,
                scoring=scoring_func,
            )
        else:
            raise NotImplementedError(
                "Search algorithm should be one of grid, hyperopt, bohb, optuna, bayesian, or random"
            )

        with mlflow.start_run() as run:
            search.fit(X, y)
            self.model = search.best_estimator_
            results_df = pd.DataFrame(search.cv_results_)
            if not pathlib.Path(results_csv_path).parent.exists():
                pathlib.Path(results_csv_path).parent.mkdir(exist_ok=True, parents=True)
            logger.info(f"Saving sweeping results to {results_csv_path}")
            results_df.to_csv(results_csv_path)
            logger.info(f"Best hyperparams: {search.best_params_}")

        return results_df


def plot_parallel_coords(results_df: pd.DataFrame):

    import plotly.express as px

    cols_keep = [col for col in results_df if "param_" in col]
    cols_keep += ["mean_test_score"]

    results_df = results_df[cols_keep]
    # want to convert object columns to type float
    results_df = results_df.apply(pd.to_numeric, errors="ignore", downcast="float")

    fig = px.parallel_coordinates(
        results_df,
        color="mean_test_score",
        labels=dict(
            zip(
                list(results_df.columns),
                list(["_".join(i.split("_")[1:]) for i in results_df.columns]),
            )
        ),
        color_continuous_scale=px.colors.diverging.Earth,
        # color_continuous_midpoint=27,
    )

    fig.show()


if __name__ == "__main__":
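Editor's note: a minimal standalone sketch (not part of this commit) of what the new sweep method delegates to, i.e. tune_sklearn's TuneSearchCV over a multi-output SVR with a time-series split. The synthetic arrays and parameter values below are illustrative assumptions in place of the CsvReader-loaded data.

import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from tune_sklearn import TuneSearchCV

X = np.random.rand(200, 4)          # placeholder features
y = np.random.rand(200, 2)          # placeholder multi-output targets

params = {
    "estimator__C": [0.5, 1, 1.5],
    "estimator__kernel": ["linear", "rbf"],
}
search = TuneSearchCV(
    MultiOutputRegressor(SVR()),
    params,
    search_optimization="bayesian",  # same flag sweep() passes through
    cv=TimeSeriesSplit(n_splits=5),  # splitting_criteria="timeseries"
    n_trials=3,
    scoring="r2",
)
search.fit(X, y)
print(search.best_params_)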
@@ -10,3 +10,4 @@ data:
  max_rows: 1000
  scale_data: True
  diff_state: False
  test_perc: 0.25

@@ -10,3 +10,4 @@ data:
  max_rows: -1
  scale_data: True
  diff_state: False
  test_perc: 0.25

@@ -36,3 +36,4 @@ data:
  max_rows: -1
  scale_data: True
  diff_state: False
  test_perc: 0.25

@@ -10,3 +10,4 @@ data:
  max_rows: 230000 # 251000 used in LNN to generate 232432 samples of which 200K is trained
  scale_data: True
  diff_state: True
  test_perc: 0.25

@@ -20,3 +20,4 @@ data:
  max_rows: -1
  scale_data: True
  diff_state: False
  test_perc: 0.25
@@ -3,11 +3,16 @@ model:
  build_params:
    model_type: SVR
    scale_data: True
    fit_separate: True
    fit_separate: False
  saver:
    filename: models/moab/SVR_model
  sweep:
    run: False
    split_strategy: timeseries
    search_algorithm: bayesian
    num_trials: 3
    num_trials: 30
    scoring_func: r2
    results_csv_path: sklearn-SVR/search_results.csv
    params:
      estimator__C: [1, 0.5, 1.5]
      estimator__kernel: ["linear", "poly", "rbf"]
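Editor's note: the estimator__ prefix follows scikit-learn's nested-parameter convention. The SVR is wrapped in a MultiOutputRegressor, so its constructor arguments are addressed through the wrapper. A small illustrative sketch (not from the repo) of where these names come from:

from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR

wrapped = MultiOutputRegressor(SVR())
# Parameter names exposed to grid/bayesian search include the wrapper prefix,
# e.g. estimator__C, estimator__kernel, estimator__epsilon, ...
print([p for p in wrapped.get_params() if p.startswith("estimator__")][:5])
wrapped.set_params(estimator__C=1.5, estimator__kernel="rbf")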
@@ -0,0 +1,31 @@
model:
  name: lightgbm
  build_params:
    model_type: lightgbm
    fit_separate: False
    halt_model: False
    objective: reg:squarederror
    scale_data: True
  saver:
    filename: models/boost/lightgbm
  sweep:
    run: True
    # can be any of bayesian, bohb, hyperopt, optuna, grid or random
    search_algorithm: bayesian
    split_strategy: timeseries
    num_trials: 50
    scoring_func: r2
    early_stopping: False
    results_csv_path: lightgbm/search_results.csv
    params:
      # all parameters should be prefixed with estimator__
      # see the list of available parameters:
      # https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMModel.html
      estimator__max_depth: [-1, 5, 10, 25]
      estimator__learning_rate: [0.01, 0.05, 0.1, 0.25]
      estimator__num_leaves: [10, 31, 50]
      estimator__n_estimators: [10, 100, 250]
      estimator__boosting_type: ["gbdt", "dart", "goss"]
      estimator__reg_alpha: [0, 0.01, 0.05, 0.1]
      estimator__reg_lambda: [0, 0.01, 0.05, 0.1]
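Editor's note: the training entry point reads this sweep block through Hydra/OmegaConf and hands the params node to sweep() as a plain dict. A minimal sketch of that conversion, assuming a config shaped like the block above (values here are illustrative):

from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "model": {
            "sweep": {
                "search_algorithm": "bayesian",
                "num_trials": 50,
                "params": {"estimator__max_depth": [-1, 5, 10, 25]},
            }
        }
    }
)
# to_container turns the OmegaConf node into plain Python lists/dicts,
# which is what TuneSearchCV/GridSearchCV expect for parameter spaces.
params = OmegaConf.to_container(cfg["model"]["sweep"]["params"])
print(type(params), params)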
@@ -20,9 +20,12 @@ model:
    filename: models/torch_model
  sweep:
    run: True
    search_algorithm: grid
    num_trials: 3
    search_algorithm: bayesian
    split_strategy: timeseries
    num_trials: 50
    scoring_func: r2
    results_csv_path: torch/search_results.csv
    early_stopping: False
    params:
      lr: [0.01, 0.02]
      module__num_units: [10, 50]
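Editor's note: lr and module__num_units follow skorch naming conventions; top-level keys are arguments to NeuralNetRegressor itself, while module__-prefixed keys are routed to the wrapped nn.Module's constructor. A rough self-contained sketch of that routing (the tiny module and sizes are made up, not the repo's network):

import torch.nn as nn
from skorch import NeuralNetRegressor

class TinyNet(nn.Module):
    def __init__(self, num_units=10):
        super().__init__()
        self.body = nn.Sequential(
            nn.Linear(4, num_units), nn.ReLU(), nn.Linear(num_units, 1)
        )

    def forward(self, x):
        return self.body(x)

net = NeuralNetRegressor(TinyNet, max_epochs=5, lr=0.01)
# During a sweep, set_params uses the same double-underscore routing:
net.set_params(lr=0.02, module__num_units=50)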
@@ -4,22 +4,24 @@ model:
    model_type: xgboost
    fit_separate: False
    halt_model: False
    num_trees: 50
    n_estimators: 100
    objective: reg:squarederror
    step_size: 0.3
    device: cpu
    batch_size: 128
    gamma: 0
    learning_rate: 0.3
    scale_data: True
    max_bin: 256
    max_depth: 6
  saver:
    filename: models/boost/moab/xgboost_model
  sweep:
    run: True
    search_algorithm: random
    num_trials: 3
    run: False
    split_strategy: timeseries
    search_algorithm: bayesian
    num_trials: 25
    scoring_func: r2
    early_stopping: False
    results_csv_path: xgboost_gridsearch/search_results.csv
    params:
      # see list of parameters: https://xgboost.readthedocs.io/en/latest/parameter.html
      estimator__max_depth: [1, 3, 5, 10]
      estimator__gamma: [0, 0.5, 1, 5]
      estimator__subsample: [0.1, 0.5, 1]
      estimator__eta: [0.3, 0.1, 0.5]
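Editor's note: split_strategy maps to the cv object chosen in BaseModel.sweep. "timeseries" uses scikit-learn's TimeSeriesSplit, which keeps each validation fold strictly after its training fold, and "fixed" uses PredefinedSplit. A small illustrative sketch of the fold structure on a toy array (not repo data):

import numpy as np
from sklearn.model_selection import PredefinedSplit, TimeSeriesSplit

X = np.arange(10).reshape(-1, 1)

# Each successive fold trains on everything before its validation block.
for train_idx, val_idx in TimeSeriesSplit(n_splits=3).split(X):
    print("train", train_idx, "validate", val_idx)

# PredefinedSplit: entries with -1 are never used for validation;
# entries with 0 form the single held-out fold.
test_fold = [-1] * 7 + [0] * 3
for train_idx, val_idx in PredefinedSplit(test_fold=test_fold).split(X):
    print("train", train_idx, "validate", val_idx)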
ddm_evaluation.ipynb (1188 changed lines)
File diff suppressed because one or more lines are too long
@@ -1,15 +1,14 @@
import logging
import os

logging.basicConfig()
logging.root.setLevel(logging.INFO)
logger = logging.getLogger("datamodeler")

import pathlib
import hydra
import numpy as np
from math import floor
from omegaconf import DictConfig, ListConfig, OmegaConf

from model_loader import available_models

logger = logging.getLogger("datamodeler")
dir_path = os.path.dirname(os.path.realpath(__file__))

@@ -19,6 +18,7 @@ def main(cfg: DictConfig) -> None:
    logger.info("Configuration: ")
    logger.info(f"\n{OmegaConf.to_yaml(cfg)}")

    # for readability, read common data args into variables
    input_cols = cfg["data"]["inputs"]
    output_cols = cfg["data"]["outputs"]
    augmented_cols = cfg["data"]["augmented_cols"]

@@ -27,10 +27,15 @@ def main(cfg: DictConfig) -> None:
    iteration_col = cfg["data"]["iteration_col"]
    dataset_path = cfg["data"]["path"]
    max_rows = cfg["data"]["max_rows"]
    test_perc = cfg["data"]["test_perc"]

    # common model args
    save_path = cfg["model"]["saver"]["filename"]
    model_name = cfg["model"]["name"]
    delta_state = cfg["data"]["diff_state"]
    run_sweep = cfg["model"]["sweep"]["run"]
    split_strategy = cfg["model"]["sweep"]["split_strategy"]
    results_csv_path = cfg["model"]["sweep"]["results_csv_path"]

    Model = available_models[model_name]
@@ -58,6 +63,24 @@ def main(cfg: DictConfig) -> None:
        max_rows=max_rows,
        diff_state=delta_state,
    )

    logger.info(
        f"Saving last {test_perc * 100}% for test, using first {(1 - test_perc) * 100}% for training/sweeping"
    )
    train_id_end = floor(X.shape[0] * (1 - test_perc))
    X_train, y_train = X[:train_id_end,], y[:train_id_end,]
    X_test, y_test = X[train_id_end:,], y[train_id_end:,]

    # save training and test sets
    save_data_path = os.path.join(os.getcwd(), "data")
    if not os.path.exists(save_data_path):
        pathlib.Path(save_data_path).mkdir(parents=True, exist_ok=True)
    logger.info(f"Saving data to {os.path.abspath(save_data_path)}")
    np.save(os.path.join(save_data_path, "x_train.npy"), X_train)
    np.save(os.path.join(save_data_path, "y_train.npy"), y_train)
    np.save(os.path.join(save_data_path, "x_test.npy"), X_test)
    np.save(os.path.join(save_data_path, "y_test.npy"), y_test)

    logger.info("Building model...")
    model.build_model(**cfg["model"]["build_params"])
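Editor's note: the chronological split above reserves the last test_perc fraction of rows for the held-out test set, so the sweep only ever sees the earlier part of the series. A toy sketch of the same arithmetic with synthetic shapes (not the real dataset):

from math import floor

import numpy as np

X = np.arange(20).reshape(10, 2)
y = np.arange(10).reshape(10, 1)
test_perc = 0.25

train_id_end = floor(X.shape[0] * (1 - test_perc))  # 7 of 10 rows for training
X_train, y_train = X[:train_id_end], y[:train_id_end]
X_test, y_test = X[train_id_end:], y[train_id_end:]
print(X_train.shape, X_test.shape)  # (7, 2) (3, 2)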
@@ -65,17 +88,20 @@ def main(cfg: DictConfig) -> None:
        params = OmegaConf.to_container(cfg["model"]["sweep"]["params"])
        logger.info(f"Sweeping with parameters: {params}")

        model.sweep(
        sweep_df = model.sweep(
            params=params,
            X=X,
            y=y,
            X=X_train,
            y=y_train,
            search_algorithm=cfg["model"]["sweep"]["search_algorithm"],
            num_trials=cfg["model"]["sweep"]["num_trials"],
            scoring_func=cfg["model"]["sweep"]["scoring_func"],
            results_csv_path=results_csv_path,
            splitting_criteria=split_strategy,
        )
        logger.info(f"Sweep results: {sweep_df}")
    else:
        logger.info("Fitting model...")
        model.fit(X, y)
        model.fit(X_train, y_train)

    logger.info(f"Saving model to {save_path}")
    model.save_model(filename=save_path)
eda/eda.ipynb (613 changed lines)
File diff suppressed because one or more lines are too long
@@ -7,7 +7,6 @@ import numpy as np
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.exceptions import NotFittedError
from tune_sklearn import TuneSearchCV
from xgboost import XGBRegressor, XGBClassifier

from base import BaseModel

@@ -26,17 +25,19 @@ class GBoostModel(BaseModel):
        halt_model: bool = False,
        objective: str = "reg:squarederror",
        fit_separate: bool = False,
        num_trees: int = 50,
        step_size: float = 0.3,
        device: str = "cpu",
        batch_size: int = 128,
        gamma: int = 0,
        max_bin: int = 256,
        n_estimators: int = 100,
        learning_rate: float = 0.3,
        max_depth: int = 6,
    ):

        self.scale_data = scale_data
        if model_type == "xgboost":
            self.single_model = XGBRegressor(objective=objective)
            self.single_model = XGBRegressor(
                objective=objective,
                n_estimators=n_estimators,
                max_depth=max_depth,
                learning_rate=learning_rate,
            )
        elif model_type == "lightgbm":
            self.single_model = LGBMRegressor()
        else:

@@ -182,37 +183,6 @@ class GBoostModel(BaseModel):
        # open(os.path.join(path_name, "yscalar.pkl"), "rb")
        # )

    def sweep(
        self,
        params: Dict,
        X,
        y,
        search_algorithm: str = "bayesian",
        num_trials: int = 3,
        scoring_func: str = "r2",
    ):

        if self.scale_data:
            X, y = self.scalar(X, y)

        # early stopping only supported for learners that have a
        # `partial_fit` method

        tune_search = TuneSearchCV(
            self.model,
            param_distributions=params,
            n_trials=num_trials,
            search_optimization=search_algorithm,
            early_stopping=False,
            scoring=scoring_func,
        )

        tune_search.fit(X, y)
        self.model = tune_search.best_estimator_
        logger.info(f"Best hyperparams: {tune_search.best_params}")

        return tune_search


if __name__ == "__main__":

@@ -240,5 +210,5 @@ if __name__ == "__main__":
    # gsxgbm = GridSearchCV(mgbm, param_grid=params, scoring="r2")
    # gsxgbm.fit(X, y)

    tunexgbm = TuneSearchCV(xgm.model, param_distributions=params, scoring="r2")
    tunexgbm.fit(X, y)
    # tunexgbm = TuneSearchCV(xgm.model, param_distributions=params, scoring="r2")
    # tunexgbm.fit(X, y)
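Editor's note: GBoostModel wraps the single-output booster in scikit-learn's MultiOutputRegressor so one estimator per output column sits behind a single fit/predict interface. An illustrative self-contained sketch with synthetic data (not the repo's loader):

import numpy as np
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

X = np.random.rand(100, 4)
y = np.random.rand(100, 3)  # three output columns

model = MultiOutputRegressor(
    XGBRegressor(
        objective="reg:squarederror", n_estimators=100, max_depth=6, learning_rate=0.3
    )
)
model.fit(X, y)           # fits one XGBRegressor per column of y
preds = model.predict(X)  # shape (100, 3)
print(preds.shape)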
@@ -1,6 +1,6 @@
import os
import pandas as pd
from typing import List, Tuple, Union
from typing import List
import logging

logger = logging.getLogger("data_loaders")
File diffs suppressed for three more files because one or more lines are too long
skmodels.py (78 changed lines)
@@ -1,39 +1,21 @@
import copy
import logging
import os
import pathlib
import pickle
from typing import Dict, Tuple

import numpy as np
import copy

from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.multioutput import MultiOutputRegressor
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from natsort import natsorted

from tune_sklearn import TuneSearchCV
from tune_sklearn import TuneGridSearchCV
from sklearn.datasets import load_digits
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline

from ray.tune.sklearn import TuneGridSearchCV, TuneSearchCV

from base import BaseModel
import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# param_grid = {"learning_rate": (0.01, 0.1), "n_estimators": (25, 250), "subsample": [False, True]}


class SKModel(BaseModel):
    def build_model(

@@ -44,38 +26,26 @@ class SKModel(BaseModel):
    ):
        self.scale_data = scale_data
        self.model_type = model_type
        self.fit_separate = fit_separate
        self.separate_models = fit_separate
        if model_type == "linear_model":
            self.model = linear_model.LinearRegression()
        elif model_type == "SVR":
            self.model = SVR(C=1.0, epsilon=0.2)
        elif model_type == "GradientBoostingRegressor":
            self.model = GradientBoostingRegressor()
        elif model_type.lower() == "sgdregressor":
            self.model = SGDRegressor()
        else:
            raise NotImplementedError("unknown model selected")
        if not self.separate_models:
            self.single_model = self.model
            self.model = MultiOutputRegressor(self.single_model)

    def fit(self, X, y):

        if self.scale_data:
            X, y = self.scalar(X, y)

        if (
            self.model_type == "GradientBoostingRegressor"
            and not self.fit_separate
        ):
            self.fit_separate = True
            logger.warning(
                "Note: fit_separate must be set to True for GradientBoostingRegressor, but False was provided. Changing to True"
            )

        if self.model_type == "SVR" and not self.fit_separate:
            self.fit_separate = True
            logger.warning(
                "Note: fit_separate must be set to True for SVR, but False was provided. Changing to True"
            )

        self.separate_models = self.fit_separate

        if self.separate_models:
            self.models = []
            for i in range(y.shape[1]):

@@ -87,9 +57,7 @@ class SKModel(BaseModel):
            try:
                self.model.fit(X, y)
            except ValueError:
                logger.info(
                    f"fit separate should be True for model type of {self.model_type}"
                )
                logger.info(f"Unable to fit model of type {type(self.model)}")

    def predict(self, X):

@@ -168,22 +136,6 @@ class SKModel(BaseModel):

        # self.scale_data = scale_data

    def sweep(self, X, y, params: Dict = None):
        if not params:
            raise NotImplementedError

        tune_search = TuneSearchCV(
            self.model,
            param_distributions=params,
            n_trials=3,
            # early_stopping=True,
            # use_gpu=True
        )

        tune_search.fit(X, y)

        return tune_search


if __name__ == "__main__":

@@ -202,7 +154,7 @@ if __name__ == "__main__":
    )

    skm.build_model(model_type="linear_model")
    skm.fit(X, y, fit_separate=False)
    skm.fit(X, y)
    logger.info(X)
    yhat = skm.predict(X)

@@ -216,14 +168,14 @@ if __name__ == "__main__":
    )

    skm.build_model(model_type="SVR")
    skm.fit(X, y, fit_separate=False)
    skm.fit(X, y)
    logger.info(X)
    yhat = skm.predict(X)

    skm.save_model(dir_path="models/lsvc_pole_multi.pkl")

    skm.build_model(model_type="GradientBoostingRegressor")
    skm.fit(X, y, fit_separate=False)
    skm.fit(X, y)
    logger.info(X)
    yhat = skm.predict(X)
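Editor's note: when fit_separate/separate_models is enabled, SKModel trains one single-output regressor per column of y instead of a single multi-output wrapper. A rough standalone sketch of that loop with synthetic data, simplified from the class:

import copy

import numpy as np
from sklearn.svm import SVR

X = np.random.rand(100, 4)
y = np.random.rand(100, 2)

base = SVR(C=1.0, epsilon=0.2)
models = []
for i in range(y.shape[1]):
    m = copy.deepcopy(base)  # independent estimator per output column
    m.fit(X, y[:, i])
    models.append(m)

yhat = np.stack([m.predict(X) for m in models], axis=1)
print(yhat.shape)  # (100, 2)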
@@ -33,7 +33,6 @@ def test_fit():
    torch_model.fit(X, y)


@pytest.mark.skip(reason="Long test, skipping for CI speed")
def test_sweep():

    torch_model.build_model()
@@ -1,15 +1,26 @@
import os
import pathlib
from typing import Dict
import pickle
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
from skorch import NeuralNetRegressor
from skorch.callbacks import LRScheduler
from torch.optim.lr_scheduler import CyclicLR
from tune_sklearn import TuneSearchCV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from base import BaseModel
import logging
from rich.logging import RichHandler

FORMAT = "%(message)s"

logging.basicConfig(
    level="INFO", format=FORMAT, datefmt="[%X]", handlers=[RichHandler(markup=True)],
)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

@@ -139,9 +150,14 @@ class PyTorchModel(BaseModel):
        search_algorithm: str = "bayesian",
        num_trials: int = 3,
        scoring_func: str = "r2",
        early_stopping: bool = False,
        results_csv_path: str = "outputs/results.csv",
    ):

        from tune_sklearn import TuneSearchCV
        start_dir = str(pathlib.Path(os.getcwd()).parent)
        module_dir = str(pathlib.Path(__file__).parent)
        # temporarily change directory to file directory and then reset
        os.chdir(module_dir)

        if self.scale_data:
            X, y = self.scalar(X, y)

@@ -151,13 +167,18 @@ class PyTorchModel(BaseModel):
            torch.tensor(y).float().to(device=self.device),
        )

        if search_algorithm == "bayesian" or search_algorithm == "hyperopt":
        if search_algorithm.lower() == "bohb":
            early_stopping = True

        if search_algorithm.lower() in ["bohb", "bayesian", "hyperopt", "optuna"]:
            search = TuneSearchCV(
                self.model,
                params,
                search_optimization=search_algorithm,
                n_trials=num_trials,
                early_stopping=True,
                early_stopping=early_stopping,
                scoring=scoring_func,
            )
        elif search_algorithm == "grid":

@@ -178,13 +199,27 @@ class PyTorchModel(BaseModel):
            )
        else:
            raise NotImplementedError(
                "Search algorithm should be one of gridsearch, hyperopt, bayesian, or randomsearch"
                "Search algorithm should be one of grid, hyperopt, bohb, optuna, bayesian, or random"
            )
        search.fit(X, y)
        self.model = search.best_estimator_

        # set path back to initial
        os.chdir(start_dir)

        results_df = pd.DataFrame(search.cv_results_)
        logger.info(f"Best hyperparams: {search.best_params_}")

        return search
        if not pathlib.Path(results_csv_path).parent.exists():
            pathlib.Path(results_csv_path).parent.mkdir(exist_ok=True, parents=True)
        logger.info(f"Saving sweeping results to {results_csv_path}")
        results_df.to_csv(results_csv_path)
        cols_keep = [col for col in results_df if "param_" in col]
        cols_keep += ["mean_test_score"]

        results_df = results_df[cols_keep]

        return results_df


if __name__ == "__main__":

@@ -198,7 +233,9 @@ if __name__ == "__main__":
    pytorch_model.build_model()
    pytorch_model.fit(X, y)
    # tune tests
    # params = {"lr": [0.01, 0.02], "module__num_units": [10, 50]}
    # from tune_sklearn import TuneSearchCV, TuneGridSearchCV

    # params = {"lr": [0.01, 0.02], "modu
    # gs = TuneGridSearchCV(pytorch_model.model, params, scoring="neg_mean_squared_error")
    # gs.fit(torch.tensor(X).float(), torch.tensor(y).float())