datadrivenmodel/base.py


import abc
import logging
import os
import pathlib
import pickle
import sys
import numpy as np
import pandas as pd
from typing import List, Optional, Tuple, Union
from sklearn.preprocessing import StandardScaler
from loaders import CsvReader
# Add stdout handler, with level DEBUG
console = logging.StreamHandler(sys.stdout)
console.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(name)-13s: %(levelname)-8s %(message)s")
console.setFormatter(formatter)
logging.getLogger(__name__).addHandler(console)
# TODO: add weighting to the model
# TODO: this should go into a metrics function?
# TODO: implement saving of scalar transformers

class BaseModel(abc.ABC):
    def __init__(self, log_dirs: str = "logs"):
        self.logs_dir = log_dirs
        self.model = None

    def load_csv(
        self,
        dataset_path: str,
        input_cols: Union[str, List[str]] = "state",
        augm_cols: Union[str, List[str]] = ["action_command"],
        output_cols: Union[str, List[str]] = "state",
        iteration_order: int = -1,
        max_rows: Union[int, None] = None,
    ) -> Tuple[np.ndarray, np.ndarray]:
"""Read CSV data into two datasets for modeling
Parameters
----------
dataset_path : str
path to csv dataset
input_cols : Union[str, List[str]], optional
list of columns represent the inputs to the dynamical system in the raw dataset. Can either be a string which is then matched for all columns in the dataset, or a list of strings with exact matches, by default "state"
augm_cols : Union[str, List[str]], optional
Exact match of additional columns to use for modeling, such as the actions of the current iteration and any scenario/config parameters, by default ["action_command"]
output_col : Union[str, List[str]], optional
output columns of the dynamical system. Can either be a string which is then matched for any columns or a list of exact matches, by default "state"
iteration_order : int, optional
in the order of the raw dataset, what is the lag between iteration t and iteration t+1, by default -1
max_rows : Union[int, None], optional
max rows to read for a large dataset, by default None
Returns
-------
Tuple[np.array, np.array]
Features and labels for modeling
Raises
------
ValueError
Data not found
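
        Examples
        --------
        A minimal usage sketch; the path and column names are illustrative,
        borrowed from the demo at the bottom of this module:

        >>> model = BaseModel()
        >>> X, y = model.load_csv(
        ...     dataset_path="csv_data/cartpole-log.csv",
        ...     max_rows=1000,
        ...     augm_cols=["action_command", "config_length", "config_masspole"],
        ... )
        >>> X.shape[0] == y.shape[0]
        True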
"""
csv_reader = CsvReader()
if not os.path.exists(dataset_path):
raise ValueError(f"No data found at {dataset_path}")
else:
df = pd.read_csv(dataset_path, nrows=max_rows)
if type(input_cols) == str:
base_features = [col for col in df if col.startswith(input_cols)]
elif type(input_cols) == list:
base_features = input_cols
else:
raise TypeError(
f"input_cols expected type List[str] or str but received type {type(input_cols)}"
)
if type(augm_cols) == str:
augm_features = [col for col in df if col.startswith(augm_cols)]
elif type(augm_cols) == list:
augm_features = augm_cols
else:
raise TypeError(
f"augm_cols expected type List[str] or str but received type {type(augm_cols)}"
)
features = base_features + augm_features
if type(output_cols) == str:
labels = [col for col in df if col.startswith(output_cols)]
elif type(output_cols) == list:
labels = output_cols
else:
raise TypeError(
f"output_cols expected type List[str] but received type {type(output_cols)}"
)
df = csv_reader.read(
df,
iteration_order=iteration_order,
feature_cols=features,
label_cols=labels,
)
X = df[csv_reader.feature_cols].values
y = df[csv_reader.label_cols].values
self.input_dim = X.shape[1]
self.output_dim = y.shape[1]
return X, y

    def load_numpy(self, dataset_path: str) -> Tuple[np.ndarray, np.ndarray]:
        """Load x_set.npy and y_set.npy datasets from dataset_path."""
        X = np.load(os.path.join(dataset_path, "x_set.npy"))
        y = np.load(os.path.join(dataset_path, "y_set.npy"))
        self.input_dim = X.shape[1]
        self.output_dim = y.shape[1]
        return X, y

    def load_pickle_data(self, x_path: str, y_path: str):
        """Load pickled feature and label arrays from x_path and y_path."""
        with open(x_path, "rb") as f:
            X = pickle.load(f)
        with open(y_path, "rb") as f:
            y = pickle.load(f)
        return X, y

    def scalar(self, X, y):
        """Fit StandardScalers for features and labels and return scaled copies."""
        self.xscalar = StandardScaler()
        self.yscalar = StandardScaler()

        X_scaled = self.xscalar.fit_transform(X)
        y_scaled = self.yscalar.fit_transform(y)

        return X_scaled, y_scaled

    def build_model(self, scale_data: bool = False):
        # Subclasses are expected to extend this and assign an estimator
        # with fit/predict methods to self.model
        self.scale_data = scale_data

    def fit(self, X, y):
        if not self.model:
            raise ValueError("Please build or load the model first")
        if self.scale_data:
            X, y = self.scalar(X, y)
        self.model.fit(X, y)

    def predict(self, X, label_col_names: Optional[List[str]] = None):
        if not self.model:
            raise ValueError("Please build or load the model first")
        if self.scale_data:
            X = self.xscalar.transform(X)
        preds = self.model.predict(X)
        if self.scale_data:
            preds = self.yscalar.inverse_transform(preds)

        preds_df = pd.DataFrame(preds)
        if label_col_names is not None:
            preds_df.columns = label_col_names

        return preds_df

    def save_model(self, filename):
        parent_dir = pathlib.Path(filename).parent
        if not parent_dir.exists():
            parent_dir.mkdir(parents=True)
        if self.scale_data:
            logging.warning("Saving scalars not yet implemented.")
        with open(filename, "wb") as f:
            pickle.dump(self.model, f)

    def load_model(self, filename: str, scale_data: bool = False):
        self.scale_data = scale_data
        with open(filename, "rb") as f:
            self.model = pickle.load(f)

    def evaluate(self, test_data: np.ndarray):
        # TODO: evaluation metrics are not implemented yet (see the TODOs above)
        if not self.model:
            raise Exception("No model found, please run fit first")
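

# A minimal sketch of a concrete subclass, included as a hypothetical
# illustration (it is not part of the original module). build_model above only
# records the scaling flag, so a subclass assigns the actual estimator:
class SklearnLinearModel(BaseModel):
    def build_model(self, scale_data: bool = False):
        # Local import keeps the sketch self-contained
        from sklearn.linear_model import LinearRegression

        self.scale_data = scale_data
        self.model = LinearRegression()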


if __name__ == "__main__":
    base_model = BaseModel()
    x, y = base_model.load_csv(
        dataset_path="csv_data/cartpole-log.csv",
        max_rows=1000,
        augm_cols=["action_command", "config_length", "config_masspole"],
    )
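
    # Illustrative continuation using the hypothetical subclass sketched above:
    # scale the data, fit a linear model, and predict back on the features.
    lin_model = SklearnLinearModel()
    lin_model.build_model(scale_data=True)
    lin_model.fit(x, y)
    print(lin_model.predict(x).head())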