# datadrivenmodel/ddm_trainer.py

import logging
import os
import pathlib
import hydra
import numpy as np
from math import floor
from omegaconf import DictConfig, ListConfig, OmegaConf

# Module-wide logger, registered under the fixed name "datamodeler".
logger = logging.getLogger("datamodeler")
# Symlink-resolved directory containing this file; main() joins the relative
# dataset path and the model save path onto it.
dir_path = os.path.dirname(os.path.realpath(__file__))


def _to_plain_list(cols):
    """Return ``cols`` as a built-in ``list`` if it is an OmegaConf ``ListConfig``.

    Any other value (a plain string, ``None``, an existing list, ...) is
    returned unchanged.
    """
    return list(cols) if isinstance(cols, ListConfig) else cols


def _split_train_test(X, y, test_perc):
    """Split ``X`` and ``y`` by row order, reserving the trailing share for test.

    Args:
        X: Feature container exposing ``.shape[0]`` and slice indexing.
        y: Label container indexed with the same row positions as ``X``.
        test_perc: Fraction of rows, in ``[0, 1]``, kept at the end for test.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``.

    Raises:
        ValueError: If ``test_perc`` lies outside ``[0, 1]``; such a value
            would otherwise yield a negative or oversized split index and a
            silently wrong split.
    """
    if not 0 <= test_perc <= 1:
        raise ValueError(
            f"data.test_perc must be between 0 and 1, got {test_perc!r}"
        )
    train_id_end = floor(X.shape[0] * (1 - test_perc))
    # No shuffling: the first rows train, the last rows test.
    return (
        X[:train_id_end,],
        y[:train_id_end,],
        X[train_id_end:,],
        y[train_id_end:,],
    )


@hydra.main(config_path="conf", config_name="config")
def main(cfg: DictConfig) -> None:
    """Train a model, or sweep its hyper-parameters, from a Hydra config and save it.

    The function:

    1. Logs the full configuration.
    2. Looks the model class up in ``available_models`` by ``model.name``.
    3. Loads the dataset through the model's own ``load_csv``.
    4. Splits it by row order into train and test parts and writes the four
       arrays as ``.npy`` files into ``<current working directory>/data``.
    5. Builds the model with ``model.build_params``.
    6. Runs ``model.sweep`` when ``model.sweep.run`` is truthy, otherwise
       ``model.fit``, on the training part only.
    7. Saves the model to ``<this file's directory>/<model.saver.filename>.pkl``.

    Args:
        cfg: Hydra configuration. Reads ``data.{inputs, outputs,
            augmented_cols, iteration_order, episode_col, iteration_col, path,
            max_rows, test_perc, diff_state, full_or_relative}`` and
            ``model.{name, build_params, saver.filename, sweep.*}``.

    Raises:
        KeyError: If ``model.name`` is not a key of ``available_models``.
        ValueError: If ``data.test_perc`` is outside ``[0, 1]``.
    """
    logger.info("Configuration: ")
    logger.info(f"\n{OmegaConf.to_yaml(cfg)}")

    data_cfg = cfg["data"]
    model_cfg = cfg["model"]
    sweep_cfg = model_cfg["sweep"]

    # for readability, read common data args into variables
    # (ListConfig values are converted to plain lists before use)
    input_cols = _to_plain_list(data_cfg["inputs"])
    output_cols = _to_plain_list(data_cfg["outputs"])
    augmented_cols = _to_plain_list(data_cfg["augmented_cols"])

    iteration_order = data_cfg["iteration_order"]
    episode_col = data_cfg["episode_col"]
    iteration_col = data_cfg["iteration_col"]
    dataset_path = data_cfg["path"]
    max_rows = data_cfg["max_rows"]
    test_perc = data_cfg["test_perc"]
    delta_state = data_cfg["diff_state"]

    # common model args
    save_path = model_cfg["saver"]["filename"]
    model_name = model_cfg["name"]
    run_sweep = sweep_cfg["run"]
    split_strategy = sweep_cfg["split_strategy"]
    results_csv_path = sweep_cfg["results_csv_path"]

    # The registry is imported lazily and from a different module for "torch".
    # NOTE(review): presumably this keeps torch an optional dependency - confirm.
    if model_name.lower() == "torch":
        from all_models import available_models
    else:
        from model_loader import available_models

    Model = available_models[model_name]

    # A "relative" dataset path is resolved against this file's directory.
    if data_cfg["full_or_relative"] == "relative":
        dataset_path = os.path.join(dir_path, dataset_path)

    save_path = os.path.join(dir_path, save_path + ".pkl")

    model = Model()
    X, y = model.load_csv(
        input_cols=input_cols,
        output_cols=output_cols,
        augm_cols=augmented_cols,
        dataset_path=dataset_path,
        iteration_order=iteration_order,
        episode_col=episode_col,
        iteration_col=iteration_col,
        max_rows=max_rows,
        diff_state=delta_state,
    )

    logger.info(
        f"Saving last {test_perc * 100}% for test, using first {(1 - test_perc) * 100}% for training/sweeping"
    )
    X_train, y_train, X_test, y_test = _split_train_test(X, y, test_perc)

    # save training and test sets
    # NOTE(review): Hydra may change the working directory per run, in which
    # case "data" lands inside the run's output directory - confirm.
    save_data_path = os.path.join(os.getcwd(), "data")
    # exist_ok=True already tolerates an existing directory, so no separate
    # os.path.exists() pre-check (which would also be race-prone) is needed.
    pathlib.Path(save_data_path).mkdir(parents=True, exist_ok=True)
    logger.info(f"Saving data to {os.path.abspath(save_data_path)}")
    np.save(os.path.join(save_data_path, "x_train.npy"), X_train)
    np.save(os.path.join(save_data_path, "y_train.npy"), y_train)
    np.save(os.path.join(save_data_path, "x_test.npy"), X_test)
    np.save(os.path.join(save_data_path, "y_test.npy"), y_test)

    logger.info("Building model...")
    model.build_model(**model_cfg["build_params"])

    if run_sweep:
        params = OmegaConf.to_container(sweep_cfg["params"])
        logger.info(f"Sweeping with parameters: {params}")

        sweep_df = model.sweep(
            params=params,
            X=X_train,
            y=y_train,
            search_algorithm=sweep_cfg["search_algorithm"],
            num_trials=sweep_cfg["num_trials"],
            scoring_func=sweep_cfg["scoring_func"],
            results_csv_path=results_csv_path,
            splitting_criteria=split_strategy,
        )
        logger.info(f"Sweep results: {sweep_df}")
    else:
        logger.info("Fitting model...")
        model.fit(X_train, y_train)

    logger.info(f"Saving model to {save_path}")
    model.save_model(filename=save_path)


# Run the Hydra-decorated entry point only when this file is executed as a
# script; main() is called without arguments here.
if __name__ == "__main__":

    main()
ADD: datamodeler2 with initial update for scriptable model training and saving 2021-01-12 03:19:44 +03:00			`import logging`
UPDATE: initial work on sweepers 2021-04-17 02:59:55 +03:00			`import os`
UPDATE: sweeping complete, and notebooks moved to NB directory 2021-04-23 03:08:11 +03:00			`import pathlib`
add: initial hydra integration for multiple configs 2021-01-15 11:16:44 +03:00			`import hydra`
UPDATE: sweeping complete, and notebooks moved to NB directory 2021-04-23 03:08:11 +03:00			`import numpy as np`
			`from math import floor`
UPDATE: initial work on sweepers 2021-04-17 02:59:55 +03:00			`from omegaconf import DictConfig, ListConfig, OmegaConf`

UPDATE: sweeping complete, and notebooks moved to NB directory 2021-04-23 03:08:11 +03:00			`logger = logging.getLogger("datamodeler")`
add: initial hydra integration for multiple configs 2021-01-15 11:16:44 +03:00			`dir_path = os.path.dirname(os.path.realpath(__file__))`
added model_loader.py as a sole place to add newly developed models for use in datamodeler.py and predicter.py 2021-01-13 02:24:38 +03:00
ADD: datamodeler2 with initial update for scriptable model training and saving 2021-01-12 03:19:44 +03:00
add: initial hydra integration for multiple configs 2021-01-15 11:16:44 +03:00			`@hydra.main(config_path="conf", config_name="config")`
			`def main(cfg: DictConfig) -> None:`
ADD: datamodeler2 with initial update for scriptable model training and saving 2021-01-12 03:19:44 +03:00
add: initial hydra integration for multiple configs 2021-01-15 11:16:44 +03:00			`logger.info("Configuration: ")`
			`logger.info(f"\n{OmegaConf.to_yaml(cfg)}")`
ADD: datamodeler2 with initial update for scriptable model training and saving 2021-01-12 03:19:44 +03:00
UPDATE: sweeping complete, and notebooks moved to NB directory 2021-04-23 03:08:11 +03:00			`# for readability, read common data args into variables`
add: initial hydra integration for multiple configs 2021-01-15 11:16:44 +03:00			`input_cols = cfg["data"]["inputs"]`
			`output_cols = cfg["data"]["outputs"]`
			`augmented_cols = cfg["data"]["augmented_cols"]`
BUGFIXES: don't rebuild model after load 2021-06-03 02:28:05 +03:00
add: initial hydra integration for multiple configs 2021-01-15 11:16:44 +03:00			`iteration_order = cfg["data"]["iteration_order"]`
			`episode_col = cfg["data"]["episode_col"]`
			`iteration_col = cfg["data"]["iteration_col"]`
			`dataset_path = cfg["data"]["path"]`
UPDATE: :books: initial readme plus improved tests for sklearn and add max_rows arg 2021-01-15 22:48:54 +03:00			`max_rows = cfg["data"]["max_rows"]`
UPDATE: sweeping complete, and notebooks moved to NB directory 2021-04-23 03:08:11 +03:00			`test_perc = cfg["data"]["test_perc"]`

			`# common model args`
UPDATE: using **kwargs for build params 2021-03-26 06:39:35 +03:00			`save_path = cfg["model"]["saver"]["filename"]`
add: initial hydra integration for multiple configs 2021-01-15 11:16:44 +03:00			`model_name = cfg["model"]["name"]`
UPDATE: add diffstate to trainer + some docs; todos 2021-04-01 22:06:06 +03:00			`delta_state = cfg["data"]["diff_state"]`
UPDATE: initial work on sweepers 2021-04-17 02:59:55 +03:00			`run_sweep = cfg["model"]["sweep"]["run"]`
UPDATE: sweeping complete, and notebooks moved to NB directory 2021-04-23 03:08:11 +03:00			`split_strategy = cfg["model"]["sweep"]["split_strategy"]`
			`results_csv_path = cfg["model"]["sweep"]["results_csv_path"]`
UPDATE: initial work on sweepers 2021-04-17 02:59:55 +03:00
UPDATE: make torch installation optional for simulator packages, upgrade mba to 0.1.3, and update dockerfile 2021-06-09 22:38:33 +03:00			`if model_name.lower() == "torch":`
			`from all_models import available_models`
			`else:`
			`from model_loader import available_models`

add: initial hydra integration for multiple configs 2021-01-15 11:16:44 +03:00			`Model = available_models[model_name]`
ADD: datamodeler2 with initial update for scriptable model training and saving 2021-01-12 03:19:44 +03:00
add: initial hydra integration for multiple configs 2021-01-15 11:16:44 +03:00			`if cfg["data"]["full_or_relative"] == "relative":`
			`dataset_path = os.path.join(dir_path, dataset_path)`

			`save_path = os.path.join(dir_path, save_path + ".pkl")`

			`if type(input_cols) == ListConfig:`
			`input_cols = list(input_cols)`
			`if type(output_cols) == ListConfig:`
			`output_cols = list(output_cols)`
			`if type(augmented_cols) == ListConfig:`
			`augmented_cols = list(augmented_cols)`

			`model = Model()`
			`X, y = model.load_csv(`
			`input_cols=input_cols,`
			`output_cols=output_cols,`
			`augm_cols=augmented_cols,`
ADD: datamodeler2 with initial update for scriptable model training and saving 2021-01-12 03:19:44 +03:00			`dataset_path=dataset_path,`
add: initial hydra integration for multiple configs 2021-01-15 11:16:44 +03:00			`iteration_order=iteration_order,`
			`episode_col=episode_col,`
			`iteration_col=iteration_col,`
UPDATE: :books: initial readme plus improved tests for sklearn and add max_rows arg 2021-01-15 22:48:54 +03:00			`max_rows=max_rows,`
UPDATE: add diffstate to trainer + some docs; todos 2021-04-01 22:06:06 +03:00			`diff_state=delta_state,`
ADD: datamodeler2 with initial update for scriptable model training and saving 2021-01-12 03:19:44 +03:00			`)`
UPDATE: sweeping complete, and notebooks moved to NB directory 2021-04-23 03:08:11 +03:00
			`logger.info(`
			`f"Saving last {test_perc * 100}% for test, using first {(1 - test_perc) * 100}% for training/sweeping"`
			`)`
			`train_id_end = floor(X.shape[0] * (1 - test_perc))`
fixup: Format Python code with Black 2021-04-23 03:46:17 +03:00			`X_train, y_train = (`
BUGFIXES: don't rebuild model after load 2021-06-03 02:28:05 +03:00			`X[:train_id_end,],`
			`y[:train_id_end,],`
fixup: Format Python code with Black 2021-04-23 03:46:17 +03:00			`)`
			`X_test, y_test = (`
BUGFIXES: don't rebuild model after load 2021-06-03 02:28:05 +03:00			`X[train_id_end:,],`
			`y[train_id_end:,],`
fixup: Format Python code with Black 2021-04-23 03:46:17 +03:00			`)`
UPDATE: sweeping complete, and notebooks moved to NB directory 2021-04-23 03:08:11 +03:00
			`# save training and test sets`
			`save_data_path = os.path.join(os.getcwd(), "data")`
			`if not os.path.exists(save_data_path):`
			`pathlib.Path(save_data_path).mkdir(parents=True, exist_ok=True)`
			`logger.info(f"Saving data to {os.path.abspath(save_data_path)}")`
			`np.save(os.path.join(save_data_path, "x_train.npy"), X_train)`
			`np.save(os.path.join(save_data_path, "y_train.npy"), y_train)`
			`np.save(os.path.join(save_data_path, "x_test.npy"), X_test)`
			`np.save(os.path.join(save_data_path, "y_test.npy"), y_test)`

add: initial hydra integration for multiple configs 2021-01-15 11:16:44 +03:00			`logger.info("Building model...")`
:BUG:-fix: fit separate when using sklearn should only expect dir names 2021-03-26 08:10:13 +03:00			`model.build_model(**cfg["model"]["build_params"])`
UPDATE: initial work on sweepers 2021-04-17 02:59:55 +03:00
			`if run_sweep:`
			`params = OmegaConf.to_container(cfg["model"]["sweep"]["params"])`
			`logger.info(f"Sweeping with parameters: {params}")`
WIP: torch models sweeper 2021-04-17 22:11:36 +03:00
UPDATE: sweeping complete, and notebooks moved to NB directory 2021-04-23 03:08:11 +03:00			`sweep_df = model.sweep(`
UPDATE: initial work on sweepers 2021-04-17 02:59:55 +03:00			`params=params,`
UPDATE: sweeping complete, and notebooks moved to NB directory 2021-04-23 03:08:11 +03:00			`X=X_train,`
			`y=y_train,`
UPDATE: initial work on sweepers 2021-04-17 02:59:55 +03:00			`search_algorithm=cfg["model"]["sweep"]["search_algorithm"],`
			`num_trials=cfg["model"]["sweep"]["num_trials"],`
			`scoring_func=cfg["model"]["sweep"]["scoring_func"],`
UPDATE: sweeping complete, and notebooks moved to NB directory 2021-04-23 03:08:11 +03:00			`results_csv_path=results_csv_path,`
			`splitting_criteria=split_strategy,`
UPDATE: initial work on sweepers 2021-04-17 02:59:55 +03:00			`)`
UPDATE: sweeping complete, and notebooks moved to NB directory 2021-04-23 03:08:11 +03:00			`logger.info(f"Sweep results: {sweep_df}")`
UPDATE: initial work on sweepers 2021-04-17 02:59:55 +03:00			`else:`
			`logger.info("Fitting model...")`
UPDATE: sweeping complete, and notebooks moved to NB directory 2021-04-23 03:08:11 +03:00			`model.fit(X_train, y_train)`
add: initial hydra integration for multiple configs 2021-01-15 11:16:44 +03:00
			`logger.info(f"Saving model to {save_path}")`
			`model.save_model(filename=save_path)`
ADD: datamodeler2 with initial update for scriptable model training and saving 2021-01-12 03:19:44 +03:00

			`if __name__ == "__main__":`

add: initial hydra integration for multiple configs 2021-01-15 11:16:44 +03:00			`main()`