UPDATE: new loaders and tests for refactor

Ali Zaidi 2020-12-17 13:02:27 -08:00
Parent 0d0076d556
Commit 6cfd67a20d
13 changed files: 605261 additions and 77 deletions

base.py (65 lines changed)

@@ -7,8 +7,9 @@ import sys
 import numpy as np
 import pandas as pd
-from typing import Tuple, List
+from typing import Tuple, List, Union
 from sklearn.preprocessing import StandardScaler
+from loaders import CsvReader

 # Add stdout handler, with level INFO
@@ -26,15 +27,57 @@ class BaseModel(abc.ABC):
         self.model = None

     def load_csv(
-        self, dataset_path: str, feature_columns: List[str], label_columns: List[str]
-    ) -> Tuple:
+        self,
+        dataset_path: str,
+        input_cols_read: Union[str, List[str]] = "state",
+        augm_cols: Union[str, List[str]] = ["action_command"],
+        output_col: Union[str, List[str]] = "state",
+        timelag: int = -1,
+        max_rows: Union[int, None] = None,
+    ) -> Tuple[np.array, np.array]:
+        """Read CSV data into two datasets for modeling
+
+        Parameters
+        ----------
+        dataset_path : str
+            path to csv dataset
+        input_cols_read : Union[str, List[str]], optional
+            columns that represent the inputs to the dynamical system in the raw
+            dataset; either a string prefix matched against all columns, or a list
+            of exact column names, by default "state"
+        augm_cols : Union[str, List[str]], optional
+            exact names of additional columns to use for modeling, such as the
+            actions of the current iteration and any scenario/config parameters,
+            by default ["action_command"]
+        output_col : Union[str, List[str]], optional
+            output columns of the dynamical system; either a string prefix matched
+            against all columns, or a list of exact column names, by default "state"
+        timelag : int, optional
+            lag between iteration t and iteration t+1 in the raw dataset,
+            by default -1
+        max_rows : Union[int, None], optional
+            maximum number of rows to read from a large dataset, by default None
+
+        Returns
+        -------
+        Tuple[np.array, np.array]
+            features and labels for modeling
+
+        Raises
+        ------
+        ValueError
+            if no data is found at dataset_path
+        """
+
+        csv_reader = CsvReader()
         if not os.path.exists(dataset_path):
             raise ValueError(f"No data found at {dataset_path}")
         else:
-            df = pd.read_csv(dataset_path)
-            X = df[feature_columns].values
-            y = df[label_columns].values
+            df = csv_reader.read(
+                dataset_path,
+                timelag=timelag,
+                feature_cols=input_cols_read,
+                max_rows=max_rows,
+            )
+            features = csv_reader.feature_cols + augm_cols
+            output_cols = [col for col in df if col.startswith(output_col)]
+            X = df[features].values
+            y = df[output_cols].values

         self.input_dim = X.shape[1]
         self.output_dim = y.shape[1]
@@ -105,3 +148,13 @@ class BaseModel(abc.ABC):

         if not self.model:
             raise Exception("No model found, please run fit first")
+
+
+if __name__ == "__main__":
+
+    base_model = BaseModel()
+    base_model.load_csv(
+        dataset_path="csv_data/cartpole-log.csv",
+        max_rows=1000,
+        augm_cols=["action_command", "config_length", "config_masspole"],
+    )
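As a reading aid (not part of this commit), the refactored load_csv assembles its outputs roughly as follows for the cartpole call above. The exact state column names are assumptions; only state_x_position and the 7/4 widths are confirmed by the tests further below.

# hypothetical column layout, assuming the cartpole log has four state_* columns
X_cols = [
    "prev_state_x_position",   # lagged states produced by CsvReader (s_t)
    "prev_state_x_velocity",
    "prev_state_angle_position",
    "prev_state_angle_velocity",
    "action_command",          # augmentation columns passed by the caller (a_t)
    "config_length",
    "config_masspole",
]
y_cols = [
    "state_x_position",        # current states matched by output_col="state" (s_t+1)
    "state_x_velocity",
    "state_angle_position",
    "state_angle_velocity",
]
# X has shape (n_rows, 7) and y has shape (n_rows, 4)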

File diff not shown because of its large size.

csv_data/cartpole-log.csv (new file, 500001 lines)

File diff not shown because of its large size.

csv_data/cartpole_at_st.csv (new file, 5001 lines)

File diff not shown because of its large size.

csv_data/quanser-log.csv (new file, 50001 lines)

File diff not shown because of its large size.

data_loader.py (deleted)

@@ -1,41 +0,0 @@
-import os
-import pandas as pd
-from typing import List, Tuple, Union
-import logging
-
-FORMAT = "%(message)s"
-logging.basicConfig(level="INFO", format=FORMAT, datefmt="[%X]")
-logger = logging.getLogger("data_loader")
-
-
-def csv_reader(
-    filename: str,
-    timelag: int = 1,
-    episode_col: Union[str, None] = "episode",
-    iteration_col: Union[str, None] = "iteration",
-):
-
-    df = pd.read_csv(filename)
-    # if timelag != 0 then drop the last_episode - timelag iteration from each episode
-    # and append next state columns into each row: {row_t, row_{t+timelag}} -> {st, at, st+1}
-    if all([episode_col, iteration_col, timelag != 0]):
-        df = df.sort_values(by=[episode_col, iteration_col])
-        neg_lag = timelag * -1
-        lagged_df = df.groupby(by=episode_col, as_index=False).shift(neg_lag)
-        lagged_df = lagged_df.drop([iteration_col], axis=1)
-        joined_df = df.join(lagged_df.rename(columns=lambda x: "lag_" + x))
-        # truncate before the end of timelag for complete observations only
-        joined_df = (
-            joined_df.groupby(by=episode_col, as_index=False)
-            .apply(lambda x: x.iloc[:neg_lag])
-            .reset_index()
-        )
-        return joined_df.drop(["level_0", "level_1"], axis=1)
-    else:
-        return df
-
-
-if __name__ == "__main__":
-
-    data_dir = "csv_data"
-    df = csv_reader(os.path.join(data_dir, "cartpole-log.csv"))


@@ -152,7 +152,6 @@ def read_env_data():
 if __name__ == "__main__":

     args = parser.parse_args()
-    args.pickle = "/home/alizaidi/bonsai/repsol/data/scenario1/"

     with open(args.config_path) as cmfile:
         config = yaml.full_load(cmfile)


@@ -1,20 +1,29 @@
-name: datadriven
+name: ddm
 channels:
   - pytorch
   - defaults
 dependencies:
-  - python=3.8.5
-  - pip=19.1.1
-  - scipy=1.5.4
-  - pandas=1.1.4
-  - pip:
-    - ray==1.0.0
-    - joblib==0.17.0
-    - keras==2.4.3
-    - scikit-learn==0.23.2
-    - scikit-optimize==0.8.1
-    - tensorboard==2.3.0
-    - tensorflow==2.3.1
-    - tensorflow-estimator==2.3.0
-    - microsoft-bonsai-api==0.1.2
-    - bonsai-cli==1.0.4
-    - pyyaml==5.3.1
-    - h5py==2.10.0
-    - nbgrader==0.6.1
+  - python=3.7.7
+  - pip=19.1.1
+  - pytorch=1.7.0
+  - torchvision=0.8
+  - pip:
+    - ray==1.0.0
+    - black==19.10b0
+    - joblib==0.17.0
+    - keras==2.4.3
+    - scikit-learn==0.23.2
+    - scikit-optimize==0.8.1
+    - skorch==0.9.0
+    - tensorboard==2.3.0
+    - tensorflow==2.3.1
+    - tensorflow-estimator==2.3.0
+    - microsoft-bonsai-api==0.1.2
+    - bonsai-cli==1.0.4
+    - pandas==1.1.4
+    - pyyaml==5.3.1
+    - pytest==6.2.1
+    - scipy==1.5.4
+    - h5py==2.10.0
+    - nbgrader==0.6.1
+    - tune-sklearn==0.1.0
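Assuming this file keeps a conventional name such as environment.yml (its header was not captured in this view), the refactored environment can be recreated with conda env create -f environment.yml and activated with conda activate ddm, matching the new name above.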

loaders.py (new file, 82 lines)

@@ -0,0 +1,82 @@
+import os
+import pandas as pd
+from typing import List, Tuple, Union
+import logging
+
+FORMAT = "%(message)s"
+logging.basicConfig(level="INFO", format=FORMAT, datefmt="[%X]")
+logger = logging.getLogger("data_loader")
+
+data_dir = "csv_data"
+
+
+class CsvReader(object):
+    def read(
+        self,
+        filename: str,
+        timelag: int = -1,
+        episode_col: Union[str, None] = "episode",
+        iteration_col: Union[str, None] = "iteration",
+        feature_cols: Union[List, str] = "state_",
+        max_rows: Union[int, None] = None,
+    ):
+
+        df = pd.read_csv(filename, nrows=max_rows)
+        # CASE 1: rows are of the form {st+1, at}
+        # Append st into next row
+        # if timelag < 0 then drop the iteration - timelag iteration from each episode
+        # and append previous state columns into each row: {st+1, at} -> {st, at, st+1}
+        if all([episode_col, iteration_col, timelag < 0]):
+            df = df.sort_values(by=[episode_col, iteration_col])
+            lagged_df = df.groupby(by=episode_col, as_index=False).shift(timelag * -1)
+            lagged_df = lagged_df.drop([iteration_col], axis=1)
+            if type(feature_cols) == list:
+                lagged_df = lagged_df[feature_cols]
+            else:
+                self.feature_cols = [
+                    col for col in lagged_df if col.startswith(feature_cols)
+                ]
+                lagged_df = lagged_df[self.feature_cols]
+            lagged_df = lagged_df.rename(columns=lambda x: "prev_" + x)
+            self.feature_cols = list(lagged_df.columns.values)
+            joined_df = df.join(lagged_df)
+            # skip the first row of each episode since we do not have its st
+            joined_df = (
+                joined_df.groupby(by=episode_col, as_index=False)
+                .apply(lambda x: x.iloc[timelag * -1 :])
+                .reset_index()
+            )
+            return joined_df.drop(["level_0", "level_1"], axis=1)
+        # CASE 2: rows of the form {st, at}
+        # Append st+1 from next row into current row {st, at, st+1}
+        elif all([episode_col, iteration_col, timelag > 0]):
+            df = df.sort_values(by=[episode_col, iteration_col])
+            lagged_df = df.groupby(by=episode_col, as_index=False).shift(timelag * -1)
+            lagged_df = lagged_df.drop([iteration_col], axis=1)
+            if type(feature_cols) == list:
+                lagged_df = lagged_df[feature_cols]
+            else:
+                self.feature_cols = [
+                    col for col in lagged_df if col.startswith(feature_cols)
+                ]
+                lagged_df = lagged_df[self.feature_cols]
+            lagged_df = lagged_df.rename(columns=lambda x: "next_" + x)
+            self.feature_cols = list(lagged_df.columns.values)
+            joined_df = df.join(lagged_df)
+            # truncate before the end of timelag for complete observations only
+            joined_df = (
+                joined_df.groupby(by=episode_col, as_index=False)
+                .apply(lambda x: x.iloc[: timelag * -1])
+                .reset_index()
+            )
+            return joined_df.drop(["level_0", "level_1"], axis=1)
+        else:
+            return df
+
+
+if __name__ == "__main__":
+
+    csv_reader = CsvReader()
+    df = csv_reader.read(
+        os.path.join(data_dir, "cartpole-log.csv"), timelag=-1, max_rows=1000
+    )
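For intuition about the CASE 1 path, here is a minimal sketch (not from the commit) of what the groupby/shift does on a toy two-episode log; the toy column names are invented for illustration:

import pandas as pd

# toy log: rows of the form {state observed at t+1, action taken at t}
df = pd.DataFrame(
    {
        "episode": [1, 1, 1, 2, 2, 2],
        "iteration": [1, 2, 3, 1, 2, 3],
        "state_x": [0.0, 0.1, 0.3, 1.0, 1.2, 1.5],
        "action_u": [1, -1, 1, -1, 1, -1],
    }
)

# shift(1) within each episode pulls the previous row's state forward,
# which is what CsvReader.read does when timelag=-1
lagged = df.groupby("episode").shift(1)[["state_x"]]
lagged = lagged.rename(columns=lambda x: "prev_" + x)
# drop the first row of each episode, whose prev_state_x is NaN
joined = df.join(lagged).groupby("episode", as_index=False).apply(lambda x: x.iloc[1:])
# each remaining row now holds {prev_state_x (st), action_u (at), state_x (st+1)}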

tests/__init__.py (new file, empty)


@@ -1,10 +1,51 @@
 import os
-from data_loader import csv_reader, data_dir
+
+import pytest
+
+from loaders import CsvReader, data_dir
+from base import BaseModel
+
+
+@pytest.fixture
+def csv_reader():
+    csv_reader = CsvReader()
+    return csv_reader
+
 
-def test_cartpole_at_st1():
-    cp_df = csv_reader(os.path.join(data_dir, "cartpole-log.csv"))
-    assert cp_df.shape[0] == 490000
-    assert cp_df.shape[1] == 16
+def test_cartpole_at_st1(csv_reader):
+
+    cp_df = csv_reader.read(
+        os.path.join(data_dir, "cartpole-log.csv"), max_rows=1000, timelag=-1
+    )
+    assert cp_df.shape[0] == 980
+    assert cp_df.shape[1] == 13
+    assert (
+        cp_df["state_x_position"].values[0] == cp_df["prev_state_x_position"].values[1]
+    )
+
+
+def test_cartpole_at_st(csv_reader):
+
+    cp2_df = csv_reader.read(
+        os.path.join(data_dir, "cartpole_at_st.csv"), timelag=1, max_rows=1000
+    )
+    assert cp2_df.shape[0] == 980
+    assert cp2_df.shape[1] == 13
+    assert (
+        cp2_df["state_x_position"].values[1]
+        == cp2_df["next_state_x_position"].values[0]
+    )
+
+
+def test_base_reader():
+
+    base_model = BaseModel()
+    X, y = base_model.load_csv(
+        dataset_path=os.path.join(data_dir, "cartpole-log.csv"),
+        max_rows=1000,
+        augm_cols=["action_command", "config_length", "config_masspole"],
+    )
+    assert X.shape[0] == 980 == y.shape[0]
+    assert X.shape[1] == 7
+    assert y.shape[1] == 4
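The expected shapes follow from the lag handling: reading max_rows=1000 apparently covers 20 episodes, and CsvReader drops one incomplete row per episode (the first when timelag=-1, the last when timelag=1), leaving 980 rows. The 13 columns are presumably the original 9 (episode, iteration, four state_* columns, action_command, and two config_* columns) plus the four prev_/next_ lag columns, though only state_x_position is confirmed by the assertions.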


@@ -0,0 +1,31 @@
+import os
+
+import pytest
+
+from torch_models import PyTorchModel
+
+torch_model = PyTorchModel()
+X, y = torch_model.load_csv(
+    dataset_path="csv_data/cartpole-log.csv",
+    max_rows=1000,
+    augm_cols=["action_command", "config_length", "config_masspole"],
+)
+
+
+def test_shape():
+    assert X.shape[0] == 980 == y.shape[0]
+    assert X.shape[1] == torch_model.input_dim
+    assert y.shape[1] == torch_model.output_dim
+
+
+def test_build():
+    torch_model.build_model()
+    assert torch_model.scale_data == False
+    assert torch_model.model is not None
+
+
+def test_fit():
+    torch_model.build_model()
+    torch_model.fit(X, y)

torch_models.py

@@ -8,7 +8,6 @@ from skorch.callbacks import LRScheduler
 from torch.optim.lr_scheduler import CyclicLR

 from base import BaseModel
-from tune_sklearn import TuneGridSearchCV, TuneSearchCV


 class MVRegressor(nn.Module):
@@ -127,6 +126,8 @@ class PyTorchModel(BaseModel):
         scoring_func: str = "r2",
     ):

+        from tune_sklearn import TuneGridSearchCV, TuneSearchCV
+
         X, y = (
             torch.tensor(X).float().to(device=self.device),
             torch.tensor(y).float().to(device=self.device),
@@ -147,10 +148,15 @@ class PyTorchModel(BaseModel):
 if __name__ == "__main__":

     pytorch_model = PyTorchModel()
-    X, y = pytorch_model.load_numpy("/home/alizaidi/bonsai/repsol/data/scenario1")
+    X, y = pytorch_model.load_csv(
+        dataset_path="csv_data/cartpole-log.csv",
+        max_rows=1000,
+        augm_cols=["action_command", "config_length", "config_masspole"],
+    )
+    # X, y = pytorch_model.load_numpy("/home/alizaidi/bonsai/repsol/data/scenario1")

     pytorch_model.build_model()
-    # pytorch_model.fit(X, y)
+    pytorch_model.fit(X, y)
     # predict_one = pytorch_model.predict(X[0])

     # tune tests
@@ -158,5 +164,5 @@ if __name__ == "__main__":
     # gs = TuneGridSearchCV(pytorch_model.model, params, scoring="neg_mean_squared_error")
     # gs.fit(torch.tensor(X).float(), torch.tensor(y).float())

-    params = {"lr": [0.01, 0.02], "module__num_units": [10, 50]}
-    pytorch_model.sweep(params=params, X=X, y=y, search_algorithm="hyperopt")
+    # params = {"lr": [0.01, 0.02], "module__num_units": [10, 50]}
+    # pytorch_model.sweep(params=params, X=X, y=y, search_algorithm="hyperopt")