UPDATE: new loaders and tests for refactor

Ali Zaidi 2020-12-17 13:02:27 -08:00
Parent 0d0076d556
Commit 6cfd67a20d
13 changed files: 605261 additions and 77 deletions

base.py (65 lines changed)

@@ -7,8 +7,9 @@ import sys
 import numpy as np
 import pandas as pd
-from typing import Tuple, List
+from typing import Tuple, List, Union
 from sklearn.preprocessing import StandardScaler
+from loaders import CsvReader

 # Add stdout handler, with level INFO
@@ -26,15 +27,57 @@ class BaseModel(abc.ABC):
         self.model = None

     def load_csv(
-        self, dataset_path: str, feature_columns: List[str], label_columns: List[str]
-    ) -> Tuple:
+        self,
+        dataset_path: str,
+        input_cols_read: Union[str, List[str]] = "state",
+        augm_cols: Union[str, List[str]] = ["action_command"],
+        output_col: Union[str, List[str]] = "state",
+        timelag: int = -1,
+        max_rows: Union[int, None] = None,
+    ) -> Tuple[np.array, np.array]:
+        """Read CSV data into two datasets for modeling
+
+        Parameters
+        ----------
+        dataset_path : str
+            path to csv dataset
+        input_cols_read : Union[str, List[str]], optional
+            columns that represent the inputs to the dynamical system in the raw
+            dataset; either a string prefix matched against all columns, or a list
+            of exact column names, by default "state"
+        augm_cols : Union[str, List[str]], optional
+            exact names of additional columns to use for modeling, such as the
+            actions of the current iteration and any scenario/config parameters,
+            by default ["action_command"]
+        output_col : Union[str, List[str]], optional
+            output columns of the dynamical system; either a string prefix matched
+            against all columns, or a list of exact column names, by default "state"
+        timelag : int, optional
+            lag between iteration t and iteration t+1 in the raw dataset,
+            by default -1
+        max_rows : Union[int, None], optional
+            maximum number of rows to read from a large dataset, by default None
+
+        Returns
+        -------
+        Tuple[np.array, np.array]
+            features and labels for modeling
+
+        Raises
+        ------
+        ValueError
+            if no data is found at dataset_path
+        """
+
+        csv_reader = CsvReader()
         if not os.path.exists(dataset_path):
             raise ValueError(f"No data found at {dataset_path}")
         else:
-            df = pd.read_csv(dataset_path)
-            X = df[feature_columns].values
-            y = df[label_columns].values
+            df = csv_reader.read(
+                dataset_path,
+                timelag=timelag,
+                feature_cols=input_cols_read,
+                max_rows=max_rows,
+            )
+            features = csv_reader.feature_cols + augm_cols
+            output_cols = [col for col in df if col.startswith(output_col)]
+            X = df[features].values
+            y = df[output_cols].values

         self.input_dim = X.shape[1]
         self.output_dim = y.shape[1]
@@ -105,3 +148,13 @@ class BaseModel(abc.ABC):

         if not self.model:
             raise Exception("No model found, please run fit first")
+
+
+if __name__ == "__main__":
+
+    base_model = BaseModel()
+    base_model.load_csv(
+        dataset_path="csv_data/cartpole-log.csv",
+        max_rows=1000,
+        augm_cols=["action_command", "config_length", "config_masspole"],
+    )
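As a reading aid (not part of this commit), the refactored load_csv assembles its outputs roughly as follows for the cartpole call above. The exact state column names are assumptions; only state_x_position and the 7/4 widths are confirmed by the tests further below.

# hypothetical column layout, assuming the cartpole log has four state_* columns
X_cols = [
    "prev_state_x_position",   # lagged states produced by CsvReader (s_t)
    "prev_state_x_velocity",
    "prev_state_angle_position",
    "prev_state_angle_velocity",
    "action_command",          # augmentation columns passed by the caller (a_t)
    "config_length",
    "config_masspole",
]
y_cols = [
    "state_x_position",        # current states matched by output_col="state" (s_t+1)
    "state_x_velocity",
    "state_angle_position",
    "state_angle_velocity",
]
# X has shape (n_rows, 7) and y has shape (n_rows, 4)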

File diff not shown because of its large size.

csv_data/cartpole-log.csv (new file, 500001 lines)

File diff not shown because of its large size.

csv_data/cartpole_at_st.csv (new file, 5001 lines)

File diff not shown because of its large size.

csv_data/quanser-log.csv (new file, 50001 lines)

File diff not shown because of its large size.

data_loader.py (deleted)

@@ -1,41 +0,0 @@
-import os
-import pandas as pd
-from typing import List, Tuple, Union
-import logging
-
-FORMAT = "%(message)s"
-logging.basicConfig(level="INFO", format=FORMAT, datefmt="[%X]")
-logger = logging.getLogger("data_loader")
-
-
-def csv_reader(
-    filename: str,
-    timelag: int = 1,
-    episode_col: Union[str, None] = "episode",
-    iteration_col: Union[str, None] = "iteration",
-):
-
-    df = pd.read_csv(filename)
-    # if timelag != 0 then drop the last_episode - timelag iteration from each episode
-    # and append next state columns into each row: {row_t, row_{t+timelag}} -> {st, at, st+1}
-    if all([episode_col, iteration_col, timelag != 0]):
-        df = df.sort_values(by=[episode_col, iteration_col])
-        neg_lag = timelag * -1
-        lagged_df = df.groupby(by=episode_col, as_index=False).shift(neg_lag)
-        lagged_df = lagged_df.drop([iteration_col], axis=1)
-        joined_df = df.join(lagged_df.rename(columns=lambda x: "lag_" + x))
-        # truncate before the end of timelag for complete observations only
-        joined_df = (
-            joined_df.groupby(by=episode_col, as_index=False)
-            .apply(lambda x: x.iloc[:neg_lag])
-            .reset_index()
-        )
-        return joined_df.drop(["level_0", "level_1"], axis=1)
-    else:
-        return df
-
-
-if __name__ == "__main__":
-
-    data_dir = "csv_data"
-    df = csv_reader(os.path.join(data_dir, "cartpole-log.csv"))


@@ -152,7 +152,6 @@ def read_env_data():
 if __name__ == "__main__":

     args = parser.parse_args()
-    args.pickle = "/home/alizaidi/bonsai/repsol/data/scenario1/"

     with open(args.config_path) as cmfile:
         config = yaml.full_load(cmfile)


@@ -1,20 +1,29 @@
-name: datadriven
+name: ddm
 channels:
   - pytorch
   - defaults
 dependencies:
-  - python=3.8.5
-  - pip=19.1.1
-  - scipy=1.5.4
-  - pandas=1.1.4
-  - pip:
-    - ray==1.0.0
-    - joblib==0.17.0
-    - keras==2.4.3
-    - scikit-learn==0.23.2
-    - scikit-optimize==0.8.1
-    - tensorboard==2.3.0
-    - tensorflow==2.3.1
-    - tensorflow-estimator==2.3.0
-    - microsoft-bonsai-api==0.1.2
-    - bonsai-cli==1.0.4
-    - pyyaml==5.3.1
-    - h5py==2.10.0
-    - nbgrader==0.6.1
+  - python=3.7.7
+  - pip=19.1.1
+  - pytorch=1.7.0
+  - torchvision=0.8
+  - pip:
+    - ray==1.0.0
+    - black==19.10b0
+    - joblib==0.17.0
+    - keras==2.4.3
+    - scikit-learn==0.23.2
+    - scikit-optimize==0.8.1
+    - skorch==0.9.0
+    - tensorboard==2.3.0
+    - tensorflow==2.3.1
+    - tensorflow-estimator==2.3.0
+    - microsoft-bonsai-api==0.1.2
+    - bonsai-cli==1.0.4
+    - pandas==1.1.4
+    - pyyaml==5.3.1
+    - pytest==6.2.1
+    - scipy==1.5.4
+    - h5py==2.10.0
+    - nbgrader==0.6.1
+    - tune-sklearn==0.1.0
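Assuming this file keeps a conventional name such as environment.yml (its header was not captured in this view), the refactored environment can be recreated with conda env create -f environment.yml and activated with conda activate ddm, matching the new name above.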

loaders.py (new file, 82 lines)

@@ -0,0 +1,82 @@
+import os
+import pandas as pd
+from typing import List, Tuple, Union
+import logging
+
+FORMAT = "%(message)s"
+logging.basicConfig(level="INFO", format=FORMAT, datefmt="[%X]")
+logger = logging.getLogger("data_loader")
+
+data_dir = "csv_data"
+
+
+class CsvReader(object):
+    def read(
+        self,
+        filename: str,
+        timelag: int = -1,
+        episode_col: Union[str, None] = "episode",
+        iteration_col: Union[str, None] = "iteration",
+        feature_cols: Union[List, str] = "state_",
+        max_rows: Union[int, None] = None,
+    ):
+
+        df = pd.read_csv(filename, nrows=max_rows)
+        # CASE 1: rows are of the form {st+1, at}
+        # Append st into next row
+        # if timelag < 0 then drop the iteration - timelag iteration from each episode
+        # and append previous state columns into each row: {st+1, at} -> {st, at, st+1}
+        if all([episode_col, iteration_col, timelag < 0]):
+            df = df.sort_values(by=[episode_col, iteration_col])
+            lagged_df = df.groupby(by=episode_col, as_index=False).shift(timelag * -1)
+            lagged_df = lagged_df.drop([iteration_col], axis=1)
+            if type(feature_cols) == list:
+                lagged_df = lagged_df[feature_cols]
+            else:
+                self.feature_cols = [
+                    col for col in lagged_df if col.startswith(feature_cols)
+                ]
+                lagged_df = lagged_df[self.feature_cols]
+            lagged_df = lagged_df.rename(columns=lambda x: "prev_" + x)
+            self.feature_cols = list(lagged_df.columns.values)
+            joined_df = df.join(lagged_df)
+            # skip the first row of each episode since we do not have its st
+            joined_df = (
+                joined_df.groupby(by=episode_col, as_index=False)
+                .apply(lambda x: x.iloc[timelag * -1 :])
+                .reset_index()
+            )
+            return joined_df.drop(["level_0", "level_1"], axis=1)
+        # CASE 2: rows of the form {st, at}
+        # Append st+1 from next row into current row {st, at, st+1}
+        elif all([episode_col, iteration_col, timelag > 0]):
+            df = df.sort_values(by=[episode_col, iteration_col])
+            lagged_df = df.groupby(by=episode_col, as_index=False).shift(timelag * -1)
+            lagged_df = lagged_df.drop([iteration_col], axis=1)
+            if type(feature_cols) == list:
+                lagged_df = lagged_df[feature_cols]
+            else:
+                self.feature_cols = [
+                    col for col in lagged_df if col.startswith(feature_cols)
+                ]
+                lagged_df = lagged_df[self.feature_cols]
+            lagged_df = lagged_df.rename(columns=lambda x: "next_" + x)
+            self.feature_cols = list(lagged_df.columns.values)
+            joined_df = df.join(lagged_df)
+            # truncate before the end of timelag for complete observations only
+            joined_df = (
+                joined_df.groupby(by=episode_col, as_index=False)
+                .apply(lambda x: x.iloc[: timelag * -1])
+                .reset_index()
+            )
+            return joined_df.drop(["level_0", "level_1"], axis=1)
+        else:
+            return df
+
+
+if __name__ == "__main__":
+
+    csv_reader = CsvReader()
+    df = csv_reader.read(
+        os.path.join(data_dir, "cartpole-log.csv"), timelag=-1, max_rows=1000
+    )
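For intuition about the CASE 1 path, here is a minimal sketch (not from the commit) of what the groupby/shift does on a toy two-episode log; the toy column names are invented for illustration:

import pandas as pd

# toy log: rows of the form {state observed at t+1, action taken at t}
df = pd.DataFrame(
    {
        "episode": [1, 1, 1, 2, 2, 2],
        "iteration": [1, 2, 3, 1, 2, 3],
        "state_x": [0.0, 0.1, 0.3, 1.0, 1.2, 1.5],
        "action_u": [1, -1, 1, -1, 1, -1],
    }
)

# shift(1) within each episode pulls the previous row's state forward,
# which is what CsvReader.read does when timelag=-1
lagged = df.groupby("episode").shift(1)[["state_x"]]
lagged = lagged.rename(columns=lambda x: "prev_" + x)
# drop the first row of each episode, whose prev_state_x is NaN
joined = df.join(lagged).groupby("episode", as_index=False).apply(lambda x: x.iloc[1:])
# each remaining row now holds {prev_state_x (st), action_u (at), state_x (st+1)}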

tests/__init__.py (new file, empty)


@@ -1,10 +1,51 @@
 import os
-from data_loader import csv_reader, data_dir
+
+import pytest
+
+from loaders import CsvReader, data_dir
+from base import BaseModel
+
+
+@pytest.fixture
+def csv_reader():
+    csv_reader = CsvReader()
+    return csv_reader
+
 
-def test_cartpole_at_st1():
-    cp_df = csv_reader(os.path.join(data_dir, "cartpole-log.csv"))
-    assert cp_df.shape[0] == 490000
-    assert cp_df.shape[1] == 16
+def test_cartpole_at_st1(csv_reader):
+
+    cp_df = csv_reader.read(
+        os.path.join(data_dir, "cartpole-log.csv"), max_rows=1000, timelag=-1
+    )
+    assert cp_df.shape[0] == 980
+    assert cp_df.shape[1] == 13
+    assert (
+        cp_df["state_x_position"].values[0] == cp_df["prev_state_x_position"].values[1]
+    )
+
+
+def test_cartpole_at_st(csv_reader):
+
+    cp2_df = csv_reader.read(
+        os.path.join(data_dir, "cartpole_at_st.csv"), timelag=1, max_rows=1000
+    )
+    assert cp2_df.shape[0] == 980
+    assert cp2_df.shape[1] == 13
+    assert (
+        cp2_df["state_x_position"].values[1]
+        == cp2_df["next_state_x_position"].values[0]
+    )
+
+
+def test_base_reader():
+
+    base_model = BaseModel()
+    X, y = base_model.load_csv(
+        dataset_path=os.path.join(data_dir, "cartpole-log.csv"),
+        max_rows=1000,
+        augm_cols=["action_command", "config_length", "config_masspole"],
+    )
+    assert X.shape[0] == 980 == y.shape[0]
+    assert X.shape[1] == 7
+    assert y.shape[1] == 4
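The expected shapes follow from the lag handling: reading max_rows=1000 apparently covers 20 episodes, and CsvReader drops one incomplete row per episode (the first when timelag=-1, the last when timelag=1), leaving 980 rows. The 13 columns are presumably the original 9 (episode, iteration, four state_* columns, action_command, and two config_* columns) plus the four prev_/next_ lag columns, though only state_x_position is confirmed by the assertions.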


@@ -0,0 +1,31 @@
+import os
+
+import pytest
+
+from torch_models import PyTorchModel
+
+torch_model = PyTorchModel()
+X, y = torch_model.load_csv(
+    dataset_path="csv_data/cartpole-log.csv",
+    max_rows=1000,
+    augm_cols=["action_command", "config_length", "config_masspole"],
+)
+
+
+def test_shape():
+    assert X.shape[0] == 980 == y.shape[0]
+    assert X.shape[1] == torch_model.input_dim
+    assert y.shape[1] == torch_model.output_dim
+
+
+def test_build():
+    torch_model.build_model()
+    assert torch_model.scale_data == False
+    assert torch_model.model is not None
+
+
+def test_fit():
+    torch_model.build_model()
+    torch_model.fit(X, y)

torch_models.py

@@ -8,7 +8,6 @@ from skorch.callbacks import LRScheduler
 from torch.optim.lr_scheduler import CyclicLR

 from base import BaseModel
-from tune_sklearn import TuneGridSearchCV, TuneSearchCV


 class MVRegressor(nn.Module):
@@ -127,6 +126,8 @@ class PyTorchModel(BaseModel):
         scoring_func: str = "r2",
     ):

+        from tune_sklearn import TuneGridSearchCV, TuneSearchCV
+
         X, y = (
             torch.tensor(X).float().to(device=self.device),
             torch.tensor(y).float().to(device=self.device),
@@ -147,10 +148,15 @@ class PyTorchModel(BaseModel):
 if __name__ == "__main__":

     pytorch_model = PyTorchModel()
-    X, y = pytorch_model.load_numpy("/home/alizaidi/bonsai/repsol/data/scenario1")
+    X, y = pytorch_model.load_csv(
+        dataset_path="csv_data/cartpole-log.csv",
+        max_rows=1000,
+        augm_cols=["action_command", "config_length", "config_masspole"],
+    )
+    # X, y = pytorch_model.load_numpy("/home/alizaidi/bonsai/repsol/data/scenario1")

     pytorch_model.build_model()
-    # pytorch_model.fit(X, y)
+    pytorch_model.fit(X, y)
     # predict_one = pytorch_model.predict(X[0])

     # tune tests
@@ -158,5 +164,5 @@ if __name__ == "__main__":
     # gs = TuneGridSearchCV(pytorch_model.model, params, scoring="neg_mean_squared_error")
     # gs.fit(torch.tensor(X).float(), torch.tensor(y).float())

-    params = {"lr": [0.01, 0.02], "module__num_units": [10, 50]}
-    pytorch_model.sweep(params=params, X=X, y=y, search_algorithm="hyperopt")
+    # params = {"lr": [0.01, 0.02], "module__num_units": [10, 50]}
+    # pytorch_model.sweep(params=params, X=X, y=y, search_algorithm="hyperopt")