UPDATE: new loaders and tests for refactor

Parent: 0d0076d556
Commit: 6cfd67a20d

base.py | 65

@@ -7,8 +7,9 @@ import sys
 import numpy as np
 import pandas as pd
 
-from typing import Tuple, List
+from typing import Tuple, List, Union
 from sklearn.preprocessing import StandardScaler
+from loaders import CsvReader
 
 
 # Add stdout handler, with level INFO
@@ -26,15 +27,57 @@ class BaseModel(abc.ABC):
         self.model = None
 
     def load_csv(
-        self, dataset_path: str, feature_columns: List[str], label_columns: List[str]
-    ) -> Tuple:
+        self,
+        dataset_path: str,
+        input_cols_read: Union[str, List[str]] = "state",
+        augm_cols: Union[str, List[str]] = ["action_command"],
+        output_col: Union[str, List[str]] = "state",
+        timelag: int = -1,
+        max_rows: Union[int, None] = None,
+    ) -> Tuple[np.array, np.array]:
+        """Read CSV data into two datasets for modeling
+
+        Parameters
+        ----------
+        dataset_path : str
+            path to csv dataset
+        input_cols_read : Union[str, List[str]], optional
+            columns that represent the inputs to the dynamical system in the raw dataset.
+            Either a string that is prefix-matched against all columns in the dataset,
+            or a list of exact column names, by default "state"
+        augm_cols : Union[str, List[str]], optional
+            exact names of additional columns to use for modeling, such as the actions of
+            the current iteration and any scenario/config parameters, by default ["action_command"]
+        output_col : Union[str, List[str]], optional
+            output columns of the dynamical system. Either a string that is prefix-matched
+            against all columns, or a list of exact column names, by default "state"
+        timelag : int, optional
+            lag between iteration t and iteration t+1 in the ordering of the raw dataset,
+            by default -1
+        max_rows : Union[int, None], optional
+            maximum number of rows to read from a large dataset, by default None
+
+        Returns
+        -------
+        Tuple[np.array, np.array]
+            features and labels for modeling
+
+        Raises
+        ------
+        ValueError
+            data not found at dataset_path
+        """
+
+        csv_reader = CsvReader()
         if not os.path.exists(dataset_path):
             raise ValueError(f"No data found at {dataset_path}")
         else:
-            df = pd.read_csv(dataset_path)
-            X = df[feature_columns].values
-            y = df[label_columns].values
+            df = csv_reader.read(
+                dataset_path,
+                timelag=timelag,
+                feature_cols=input_cols_read,
+                max_rows=max_rows,
+            )
+            features = csv_reader.feature_cols + augm_cols
+            output_cols = [col for col in df if col.startswith(output_col)]
+            X = df[features].values
+            y = df[output_cols].values
 
         self.input_dim = X.shape[1]
         self.output_dim = y.shape[1]
@@ -105,3 +148,13 @@ class BaseModel(abc.ABC):
 
         if not self.model:
             raise Exception("No model found, please run fit first")
+
+
+if __name__ == "__main__":
+
+    base_model = BaseModel()
+    base_model.load_csv(
+        dataset_path="csv_data/cartpole-log.csv",
+        max_rows=1000,
+        augm_cols=["action_command", "config_length", "config_masspole"],
+    )
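For context, a minimal usage sketch of the refactored load_csv (not part of the commit; column names and shapes follow the cartpole CSV used in the tests):

import os
from base import BaseModel

# Read the cartpole log, pairing each row's action with the previous state
# (timelag=-1) and appending the config columns as extra features.
base_model = BaseModel()
X, y = base_model.load_csv(
    dataset_path=os.path.join("csv_data", "cartpole-log.csv"),
    input_cols_read="state",  # prefix-matched against the state_* columns
    augm_cols=["action_command", "config_length", "config_masspole"],
    timelag=-1,
    max_rows=1000,
)
# Per test_base_reader below, this yields X of shape (980, 7) and y of shape (980, 4).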
File diffs not shown for 4 files because they are too large.
@@ -1,41 +0,0 @@
-import os
-import pandas as pd
-from typing import List, Tuple, Union
-import logging
-
-FORMAT = "%(message)s"
-logging.basicConfig(level="INFO", format=FORMAT, datefmt="[%X]")
-logger = logging.getLogger("data_loader")
-
-
-def csv_reader(
-    filename: str,
-    timelag: int = 1,
-    episode_col: Union[str, None] = "episode",
-    iteration_col: Union[str, None] = "iteration",
-):
-
-    df = pd.read_csv(filename)
-    # if timelag != 0 then drop the last_episode - timelag iteration from each episode
-    # and append next state columns into each row: {row_t, row_{t+timelag}} -> {st, at, st+1}
-    if all([episode_col, iteration_col, timelag != 0]):
-        df = df.sort_values(by=[episode_col, iteration_col])
-        neg_lag = timelag * -1
-        lagged_df = df.groupby(by=episode_col, as_index=False).shift(neg_lag)
-        lagged_df = lagged_df.drop([iteration_col], axis=1)
-        joined_df = df.join(lagged_df.rename(columns=lambda x: "lag_" + x))
-        # truncate before the end of timelag for complete observations only
-        joined_df = (
-            joined_df.groupby(by=episode_col, as_index=False)
-            .apply(lambda x: x.iloc[:neg_lag])
-            .reset_index()
-        )
-        return joined_df.drop(["level_0", "level_1"], axis=1)
-    else:
-        return df
-
-
-if __name__ == "__main__":
-
-    data_dir = "csv_data"
-    df = csv_reader(os.path.join(data_dir, "cartpole-log.csv"))
@@ -152,7 +152,6 @@ def read_env_data():
 if __name__ == "__main__":
 
     args = parser.parse_args()
-    args.pickle = "/home/alizaidi/bonsai/repsol/data/scenario1/"
 
     with open(args.config_path) as cmfile:
         config = yaml.full_load(cmfile)
@@ -1,20 +1,29 @@
-name: datadriven
+name: ddm
 channels:
   - pytorch
   - defaults
 dependencies:
-  - python=3.8.5
-  - pip=19.1.1
-  - scipy=1.5.4
-  - pandas=1.1.4
-  - pip:
-    - ray==1.0.0
-    - joblib==0.17.0
-    - keras==2.4.3
-    - scikit-learn==0.23.2
-    - scikit-optimize==0.8.1
-    - tensorboard==2.3.0
-    - tensorflow==2.3.1
-    - tensorflow-estimator==2.3.0
-    - microsoft-bonsai-api==0.1.2
-    - bonsai-cli==1.0.4
-    - pyyaml==5.3.1
-    - h5py==2.10.0
-    - nbgrader==0.6.1
+  - python=3.7.7
+  - pip=19.1.1
+  - pytorch=1.7.0
+  - torchvision=0.8
+  - pip:
+    - ray==1.0.0
+    - black==19.10b0
+    - joblib==0.17.0
+    - keras==2.4.3
+    - scikit-learn==0.23.2
+    - scikit-optimize==0.8.1
+    - skorch==0.9.0
+    - tensorboard==2.3.0
+    - tensorflow==2.3.1
+    - tensorflow-estimator==2.3.0
+    - microsoft-bonsai-api==0.1.2
+    - bonsai-cli==1.0.4
+    - pandas==1.1.4
+    - pyyaml==5.3.1
+    - pytest==6.2.1
+    - scipy==1.5.4
+    - h5py==2.10.0
+    - nbgrader==0.6.1
+    - tune-sklearn==0.1.0
@@ -0,0 +1,82 @@
+import os
+import pandas as pd
+from typing import List, Tuple, Union
+import logging
+
+FORMAT = "%(message)s"
+logging.basicConfig(level="INFO", format=FORMAT, datefmt="[%X]")
+logger = logging.getLogger("data_loader")
+data_dir = "csv_data"
+
+
+class CsvReader(object):
+    def read(
+        self,
+        filename: str,
+        timelag: int = -1,
+        episode_col: Union[str, None] = "episode",
+        iteration_col: Union[str, None] = "iteration",
+        feature_cols: Union[List, str] = "state_",
+        max_rows: Union[int, None] = None,
+    ):
+
+        df = pd.read_csv(filename, nrows=max_rows)
+
+        # CASE 1: rows are of the form {st+1, at}
+        # Append st into next row
+        # if timelag < 0 then drop the iteration - timelag iteration from each episode
+        # and append previous state columns into each row: {st+1, at} -> {st, at, st+1}
+        if all([episode_col, iteration_col, timelag < 0]):
+            df = df.sort_values(by=[episode_col, iteration_col])
+            lagged_df = df.groupby(by=episode_col, as_index=False).shift(timelag * -1)
+            lagged_df = lagged_df.drop([iteration_col], axis=1)
+            if type(feature_cols) == list:
+                lagged_df = lagged_df[feature_cols]
+            else:
+                self.feature_cols = [
+                    col for col in lagged_df if col.startswith(feature_cols)
+                ]
+                lagged_df = lagged_df[self.feature_cols]
+            lagged_df = lagged_df.rename(columns=lambda x: "prev_" + x)
+            self.feature_cols = list(lagged_df.columns.values)
+            joined_df = df.join(lagged_df)
+            # skip the first row of each episode since we do not have its st
+            joined_df = (
+                joined_df.groupby(by=episode_col, as_index=False)
+                .apply(lambda x: x.iloc[timelag * -1 :])
+                .reset_index()
+            )
+            return joined_df.drop(["level_0", "level_1"], axis=1)
+        # CASE 2: rows of the form {st, at}
+        # Append st+1 from next row into current row {st, at, st+1}
+        elif all([episode_col, iteration_col, timelag > 0]):
+            df = df.sort_values(by=[episode_col, iteration_col])
+            lagged_df = df.groupby(by=episode_col, as_index=False).shift(timelag * -1)
+            lagged_df = lagged_df.drop([iteration_col], axis=1)
+            if type(feature_cols) == list:
+                lagged_df = lagged_df[feature_cols]
+            else:
+                self.feature_cols = [
+                    col for col in lagged_df if col.startswith(feature_cols)
+                ]
+                lagged_df = lagged_df[self.feature_cols]
+            lagged_df = lagged_df.rename(columns=lambda x: "next_" + x)
+            self.feature_cols = list(lagged_df.columns.values)
+            joined_df = df.join(lagged_df)
+            # truncate before the end of timelag for complete observations only
+            joined_df = (
+                joined_df.groupby(by=episode_col, as_index=False)
+                .apply(lambda x: x.iloc[: timelag * -1])
+                .reset_index()
+            )
+            return joined_df.drop(["level_0", "level_1"], axis=1)
+        else:
+            return df
+
+
+if __name__ == "__main__":
+
+    csv_reader = CsvReader()
+    df = csv_reader.read(
+        os.path.join(data_dir, "cartpole-log.csv"), timelag=-1, max_rows=1000
+    )
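To make the timelag handling concrete, here is a toy pandas sketch (not part of the commit; made-up column values) of what CASE 1 (timelag = -1) effectively does to each episode:

import pandas as pd

# Toy log: two episodes, rows of the form {st+1, at}.
df = pd.DataFrame({
    "episode":        [1, 1, 1, 2, 2, 2],
    "iteration":      [1, 2, 3, 1, 2, 3],
    "state_x":        [0.0, 0.1, 0.2, 1.0, 1.1, 1.2],
    "action_command": [1, 0, 1, 0, 1, 0],
})

# Shift the state columns down within each episode and prefix them with "prev_",
# so every row now carries {prev_state (st), action (at), state (st+1)} ...
prev_state = (
    df.groupby("episode").shift(1)[["state_x"]].rename(columns=lambda c: "prev_" + c)
)
joined = df.join(prev_state)

# ... then drop the first row of each episode, which has no previous state.
joined = joined.groupby("episode", as_index=False).apply(lambda g: g.iloc[1:])
print(joined)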
@@ -1,10 +1,51 @@
 import os
-from data_loader import csv_reader, data_dir
+import pytest
+from loaders import CsvReader, data_dir
+from base import BaseModel
 
 
-def test_cartpole_at_st1():
-
-    cp_df = csv_reader(csv_reader(os.path.join(data_dir, "cartpole-log.csv")))
-    assert cp_df.shape[0] == 490000
-    assert cp_df.shape[1] == 16
+@pytest.fixture
+def csv_reader():
+    csv_reader = CsvReader()
+    return csv_reader
+
+
+def test_cartpole_at_st1(csv_reader):
+
+    cp_df = csv_reader.read(
+        os.path.join(data_dir, "cartpole-log.csv"), max_rows=1000, timelag=-1
+    )
+    assert cp_df.shape[0] == 980
+    assert cp_df.shape[1] == 13
+    assert (
+        cp_df["state_x_position"].values[0] == cp_df["prev_state_x_position"].values[1]
+    )
+
+
+def test_cartpole_at_st(csv_reader):
+
+    cp2_df = csv_reader.read(
+        os.path.join(data_dir, "cartpole_at_st.csv"), timelag=1, max_rows=1000
+    )
+
+    assert cp2_df.shape[0] == 980
+    assert cp2_df.shape[1] == 13
+    assert (
+        cp2_df["state_x_position"].values[1]
+        == cp2_df["next_state_x_position"].values[0]
+    )
+
+
+def test_base_reader():
+
+    base_model = BaseModel()
+    X, y = base_model.load_csv(
+        dataset_path=os.path.join(data_dir, "cartpole-log.csv"),
+        max_rows=1000,
+        augm_cols=["action_command", "config_length", "config_masspole"],
+    )
+
+    assert X.shape[0] == 980 == y.shape[0]
+    assert X.shape[1] == 7
+    assert y.shape[1] == 4
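Assuming pytest (pinned in the environment above) and the csv_data fixtures are available, these loader tests can be run directly; the module path below is a guess, since the diff view does not show file names:

import pytest

# Hypothetical file name for the test module shown above.
exit_code = pytest.main(["-q", "tests/test_loaders.py"])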
@@ -0,0 +1,31 @@
+import os
+import pytest
+from torch_models import PyTorchModel
+
+torch_model = PyTorchModel()
+X, y = torch_model.load_csv(
+    dataset_path="csv_data/cartpole-log.csv",
+    max_rows=1000,
+    augm_cols=["action_command", "config_length", "config_masspole"],
+)
+
+
+def test_shape():
+
+    assert X.shape[0] == 980 == y.shape[0]
+    assert X.shape[1] == torch_model.input_dim
+    assert y.shape[1] == torch_model.output_dim
+
+
+def test_build():
+
+    torch_model.build_model()
+    assert torch_model.scale_data == False
+    assert torch_model.model is not None
+
+
+def test_fit():
+
+    torch_model.build_model()
+    torch_model.fit(X, y)
@@ -8,7 +8,6 @@ from skorch.callbacks import LRScheduler
 from torch.optim.lr_scheduler import CyclicLR
 
 from base import BaseModel
-from tune_sklearn import TuneGridSearchCV, TuneSearchCV
 
 
 class MVRegressor(nn.Module):
@@ -127,6 +126,8 @@ class PyTorchModel(BaseModel):
         scoring_func: str = "r2",
     ):
 
+        from tune_sklearn import TuneGridSearchCV, TuneSearchCV
+
         X, y = (
             torch.tensor(X).float().to(device=self.device),
             torch.tensor(y).float().to(device=self.device),
@@ -147,10 +148,15 @@ class PyTorchModel(BaseModel):
 if __name__ == "__main__":
 
     pytorch_model = PyTorchModel()
-    X, y = pytorch_model.load_numpy("/home/alizaidi/bonsai/repsol/data/scenario1")
+    X, y = pytorch_model.load_csv(
+        dataset_path="csv_data/cartpole-log.csv",
+        max_rows=1000,
+        augm_cols=["action_command", "config_length", "config_masspole"],
+    )
+    # X, y = pytorch_model.load_numpy("/home/alizaidi/bonsai/repsol/data/scenario1")
 
     pytorch_model.build_model()
-    # pytorch_model.fit(X, y)
+    pytorch_model.fit(X, y)
     # predict_one = pytorch_model.predict(X[0])
 
     # tune tests
@@ -158,5 +164,5 @@ if __name__ == "__main__":
     # gs = TuneGridSearchCV(pytorch_model.model, params, scoring="neg_mean_squared_error")
     # gs.fit(torch.tensor(X).float(), torch.tensor(y).float())
 
-    params = {"lr": [0.01, 0.02], "module__num_units": [10, 50]}
-    pytorch_model.sweep(params=params, X=X, y=y, search_algorithm="hyperopt")
+    # params = {"lr": [0.01, 0.02], "module__num_units": [10, 50]}
+    # pytorch_model.sweep(params=params, X=X, y=y, search_algorithm="hyperopt")
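The hyperopt sweep is left commented out in this commit. For reference, a hedged sketch of an equivalent search driven through tune-sklearn directly, reusing the names from the __main__ block above (the scoring choice mirrors the scoring_func default shown earlier and is otherwise an assumption):

import torch
from tune_sklearn import TuneSearchCV

# Same grid as the commented-out sweep call.
params = {"lr": [0.01, 0.02], "module__num_units": [10, 50]}
search = TuneSearchCV(
    pytorch_model.model,  # skorch estimator created by build_model()
    params,
    search_optimization="hyperopt",
    scoring="r2",
)
# search.fit(torch.tensor(X).float(), torch.tensor(y).float())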