* fixed a yield bug

* removed two blank files

* modified split data function to auto-calculate the splits based on the parameters

* removed forecast_settings module

* removed unused parameter

* modified splitting function to use non-overlapping testing

* tested the split function after the update

* minor fix

* defaults changed in split function

* modified lightgbm example with new split function

* modified automl example (needs verification)

* modified data explore notebook

* quick fix

* updated data preparation notebook

* changed defaults in split function

* addressed changes in lightgbm

* addressed issues in automl notebook

* fixed typo in lightgbm plot
vapaunic 2020-02-20 15:27:40 +00:00 committed by GitHub
Parent 6b2ef3f100
Commit 4cc59ac53e
8 changed files with 815 additions and 6140 deletions
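Taken together, the bullets above describe one API change: the `forecast_settings` module is removed, and `split_train_test` now takes the experiment parameters directly and returns lists of data frames instead of yielding them. A minimal sketch of the new call, based on the diff below (the `ojdata` path is illustrative):

    from fclib.dataset.ojdata import download_ojdata, split_train_test

    data_dir = "ojdata"  # illustrative download location
    download_ojdata(data_dir)

    # Two non-overlapping two-week test windows, with a gap of two weeks
    # between the last training week and the first test week.
    train_list, test_list, aux_list = split_train_test(
        data_dir=data_dir, n_splits=2, horizon=2, gap=2, write_csv=False
    )
    for train_df, test_df in zip(train_list, test_list):
        print(max(train_df.week), min(test_df.week), max(test_df.week))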

View File

@@ -67,10 +67,9 @@
"import pandas as pd\n",
"\n",
"from matplotlib import pyplot as plt\n",
"import fclib.common.forecast_settings as fs\n",
"from fclib.common.utils import git_repo_path\n",
"from fclib.evaluation.evaluation_utils import MAPE\n",
"from fclib.dataset.ojdata import download_ojdata\n",
"from fclib.dataset.ojdata import download_ojdata, FIRST_WEEK_START\n",
"from fclib.common.utils import align_outputs\n",
"from fclib.models.multiple_linear_regression import fit, predict\n",
"\n",
@@ -100,8 +99,9 @@
"# Data directory\n",
"DATA_DIR = os.path.join(git_repo_path(), \"ojdata\")\n",
"\n",
"# Maximum week of the data\n",
"MAX_WEEK = 138\n",
"# Forecasting settings\n",
"HORIZON = 2\n",
"LAST_WEEK = 138\n",
"\n",
"# Number of test periods\n",
"NUM_TEST_PERIODS = 3\n",
@@ -350,7 +350,7 @@
"if DOWNLOAD_SPLIT_DATA:\n",
" download_ojdata(DATA_DIR)\n",
" df = pd.read_csv(os.path.join(DATA_DIR, \"yx.csv\"))\n",
" df = df.loc[df.week <= MAX_WEEK]"
" df = df.loc[df.week <= LAST_WEEK]"
]
},
{
@@ -362,7 +362,7 @@
"# Convert logarithm of the unit sales to unit sales\n",
"df[\"move\"] = df[\"logmove\"].apply(lambda x: round(math.exp(x)))\n",
"# Add timestamp column\n",
"df[\"week_start\"] = df[\"week\"].apply(lambda x: fs.FIRST_WEEK_START + datetime.timedelta(days=(x - 1) * 7))\n",
"df[\"week_start\"] = df[\"week\"].apply(lambda x: FIRST_WEEK_START + datetime.timedelta(days=(x - 1) * 7))\n",
"# Select a subset of stores for demo purpose\n",
"df_sub = df[df.store.isin(USE_STORES)]"
]
@@ -2356,7 +2356,7 @@
}
],
"source": [
"pred_automl_sub = pred_automl.loc[pred_automl.week > max(test_df.week) - fs.PRED_STEPS]\n",
"pred_automl_sub = pred_automl.loc[pred_automl.week > max(test_df.week) - HORIZON]\n",
"mape_automl_sub = MAPE(pred_automl_sub[\"predicted\"], pred_automl_sub[\"move\"]) * 100\n",
"print(\"MAPE of forecasts obtained by AutoML in the last two weeks: \" + str(mape_automl_sub))"
]
@@ -2680,7 +2680,7 @@
}
],
"source": [
"pred_lr_sub = pred_lr.loc[pred_lr.week > max(test_df.week) - fs.PRED_STEPS]\n",
"pred_lr_sub = pred_lr.loc[pred_lr.week > max(test_df.week) - HORIZON]\n",
"mape_lr_sub = MAPE(pred_lr_sub[\"prediction\"], pred_lr_sub[\"move\"]) * 100\n",
"print(\"MAPE of forecasts obtained by multiple linear regression in the last two weeks: \" + str(mape_lr_sub))"
]
@@ -2741,7 +2741,7 @@
}
],
"source": [
"pred_final_sub = pred_final.loc[pred_final.week > max(test_df.week) - fs.PRED_STEPS]\n",
"pred_final_sub = pred_final.loc[pred_final.week > max(test_df.week) - HORIZON]\n",
"mape_final_sub = MAPE(pred_final_sub[\"combined_prediction\"], pred_final_sub[\"move\"]) * 100\n",
"print(\"MAPE of forecasts obtained by the combined model in the last two weeks: \" + str(mape_final_sub))"
]

File diffs are hidden because one or more lines are too long

View File

@@ -8,15 +8,15 @@
"\n",
"In this notebook, we use Python to explore the Orange Juice dataset in R package `bayesm`. This dataset is used in the retail forecasting benchmark [OrangeJuice_Pt_3Weeks_Weekly](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_git/TSPerf?path=%2Fretail_sales%2FOrangeJuice_Pt_3Weeks_Weekly&version=GBmaster) of TSPerf. \n",
"\n",
"To run this notebook, please first create and activate `forecast` conda environment by following the setup instructions.\n",
"To run this notebook, please first create and activate `forecasting_env` conda environment by following the setup instructions.\n",
"\n",
"Then, inside the `forecast` environment, please run the following commands to create the jupyter notebook kernel:\n",
"Then, inside the `forecasting_env` environment, please run the following commands to create the jupyter notebook kernel:\n",
"\n",
"``\n",
"python -m ipykernel install --name forecast\n",
"python -m ipykernel install --name forecasting_env\n",
"``\n",
"\n",
"Finally, you can launch the Jupyter notebook by running `jupyter notebook` and select the kernel named `forecast` in the list of kernels under Kernel tab."
"Finally, you can launch the Jupyter notebook by running `jupyter notebook` and select the kernel named `forecasting_env` in the list of kernels under Kernel tab."
]
},
{
@@ -147,10 +147,10 @@
],
"source": [
"# Check number of time series and lengths\n",
"print('number of stores is {}.'.format(len(sales.groupby(['store']).groups.keys())))\n",
"print('number of brands is {}.'.format(len(sales.groupby(['brand']).groups.keys())))\n",
"print('number of time series is {}.'.format(len(sales.groupby(['store', 'brand']).groups.keys())))\n",
"print('lenth distribution of the time series:')\n",
"print('Number of stores is {}.'.format(len(sales.groupby(['store']).groups.keys())))\n",
"print('Number of brands is {}.'.format(len(sales.groupby(['brand']).groups.keys())))\n",
"print('Number of time series is {}.'.format(len(sales.groupby(['store', 'brand']).groups.keys())))\n",
"print('Lenth distribution of the time series:')\n",
"print(sales.groupby(['store', 'brand']).size().describe())"
]
},
@@ -1160,9 +1160,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "tsperf",
"display_name": "forecasting_env",
"language": "python",
"name": "tsperf"
"name": "forecasting_env"
},
"language_info": {
"codemirror_mode": {
@@ -1174,7 +1174,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
"version": "3.6.10"
}
},
"nbformat": 4,

File diff not shown because it is too large

View File

@@ -1,2 +0,0 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

View File

@@ -1,15 +0,0 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# This module contains experiment related parameters.
import pandas as pd
NUM_ROUNDS = 1
PRED_HORIZON = 3
PRED_STEPS = 2
TRAIN_START_WEEK = 40
TRAIN_END_WEEK_LIST = list(range(135, 159, 2))
TEST_START_WEEK_LIST = list(range(137, 161, 2))
TEST_END_WEEK_LIST = list(range(138, 162, 2))
FIRST_WEEK_START = pd.to_datetime("1989-09-14 00:00:00") # The start datetime of the first week in the record

View File

View File

@@ -16,6 +16,9 @@ DEFAULT_TARGET_COL = "move"
DEFAULT_STATIC_FEA = None
DEFAULT_DYNAMIC_FEA = ["deal", "feat"]
# The start datetime of the first week in the record
FIRST_WEEK_START = pd.to_datetime("1989-09-14 00:00:00")
def download_ojdata(dest_dir):
"""Downloads Orange Juice dataset.
@@ -53,7 +56,29 @@ def maybe_download(dest_dir):
print("Data already exists at the specified location.")
def split_train_test(data_dir, forecast_settings, write_csv=False):
def _gen_split_indices(n_splits=12, horizon=2, gap=2, first_week=40, last_week=156):
"""Generate week splits for given parameters"""
test_start_index = last_week - (horizon * n_splits) + 1
train_end_index_first = test_start_index - gap
train_end_index_last = train_end_index_first + (n_splits - 1) * horizon
assert (
test_start_index >= first_week
), f"Please adjust your parameters, so that testing data (currently week {test_start_index}), \
starts after the first available week (week {first_week})."
assert (
train_end_index_first >= first_week
), f"Please adjust your parameters, so that last training data point (currently week {train_end_index_first}) \
comes after the first available week (week {first_week})."
test_start_week_list = list(range(test_start_index, (last_week - horizon + 1) + 1, horizon))
test_end_week_list = list(range(test_start_index + horizon - 1, last_week + 1, horizon))
train_end_week_list = list(range(train_end_index_first, train_end_index_last + 1, horizon))
return test_start_week_list, test_end_week_list, train_end_week_list
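# For example, with the defaults above (n_splits=12, horizon=2, gap=2,
# first_week=40, last_week=156): test_start_index = 156 - 2 * 12 + 1 = 133, so the
# twelve test windows are weeks 133-134, 135-136, ..., 155-156, and the matching
# training ranges end at weeks 131, 133, ..., 153. Adjacent test windows never
# overlap, and each training range ends `gap` weeks before its test window starts.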
def split_train_test(data_dir, n_splits=1, horizon=2, gap=2, first_week=40, last_week=156, write_csv=False):
"""Generate training, testing, and auxiliary datasets. Training data includes the historical
sales and external features; testing data contains the future sales and external features;
auxiliary data includes the future price, deal, and advertisement information which can be
@@ -64,30 +89,32 @@ def split_train_test(data_dir, forecast_settings, write_csv=False):
Note that train_*.csv files in /train folder contain all the features in the training period
and aux_*.csv files in /train folder contain all the features except 'logmove', 'constant',
'profit' up until the forecast period end week. Both train_*.csv and aux_*.csv can be used for
generating forecasts in each round. However, test_*.csv files in /test folder can only be used
generating forecasts in each split. However, test_*.csv files in /test folder can only be used
for model performance evaluation.
Example:
from fclib.common.utils import forecast_settings
data_dir = "/home/vapaunic/forecasting/ojdata"
data_dir = "/home/forecasting/ojdata"
train, test, aux = split_train_test(data_dir=data_dir, n_splits=5, horizon=3, write_csv=True)
for train, test, aux in split_train_test(data_dir=data_dir, forecast_settings=forecast_settings):
print("Training data size: {}".format(train.shape))
print("Testing data size: {}".format(test.shape))
print("Auxiliary data size: {}".format(aux.shape))
print("Minimum training week number: {}".format(min(train["week"])))
print("Maximum training week number: {}".format(max(train["week"])))
print("Minimum testing week number: {}".format(min(test["week"])))
print("Maximum testing week number: {}".format(max(test["week"])))
print("Minimum auxiliary week number: {}".format(min(aux["week"])))
print("Maximum auxiliary week number: {}".format(max(aux["week"])))
print("")
print(len(train))
print(len(test))
print(len(aux))
Args:
data_dir (str): location of the download directory
forecast_settings (dict): dictionary containing forecast experiment parameters
write_csv (Boolean): Whether to write out the data files or not
n_splits (int, optional): number of splits (folds) to generate (default: 1)
horizon (int, optional): forecasting horizon, number of weeks to forecast (default: 2)
gap (int, optional): gap between training and testing, number of weeks between last training
week and first test week (default: 2)
first_week (int, optional): first available week (default: 40)
last_week (int, optional): last available week (default: 156)
write_csv (Boolean, optional): Whether to write out the data files or not (default: False)
Returns:
list[pandas.DataFrame]: a list containing train data frames for each split
list[pandas.DataFrame]: a list containing test data frames for each split
list[pandas.DataFrame]: a list containing aux data frames for each split
"""
# Read sales data into dataframe
@@ -101,27 +128,34 @@ def split_train_test(data_dir, forecast_settings, write_csv=False):
if not os.path.isdir(TEST_DATA_DIR):
os.mkdir(TEST_DATA_DIR)
for i in range(forecast_settings.NUM_ROUNDS):
data_mask = (sales.week >= forecast_settings.TRAIN_START_WEEK) & (
sales.week <= forecast_settings.TRAIN_END_WEEK_LIST[i]
)
train = sales[data_mask].copy()
data_mask = (sales.week >= forecast_settings.TEST_START_WEEK_LIST[i]) & (
sales.week <= forecast_settings.TEST_END_WEEK_LIST[i]
)
test = sales[data_mask].copy()
data_mask = (sales.week >= forecast_settings.TRAIN_START_WEEK) & (
sales.week <= forecast_settings.TEST_END_WEEK_LIST[i]
)
aux = sales[data_mask].copy()
aux.drop(["logmove", "constant", "profit"], axis=1, inplace=True)
train_df_list = list()
test_df_list = list()
aux_df_list = list()
test_start_week_list, test_end_week_list, train_end_week_list = _gen_split_indices(
n_splits, horizon, gap, first_week, last_week
)
for i in range(n_splits):
data_mask = (sales.week >= first_week) & (sales.week <= train_end_week_list[i])
train_df = sales[data_mask].copy()
data_mask = (sales.week >= test_start_week_list[i]) & (sales.week <= test_end_week_list[i])
test_df = sales[data_mask].copy()
data_mask = (sales.week >= first_week) & (sales.week <= test_end_week_list[i])
aux_df = sales[data_mask].copy()
aux_df.drop(["logmove", "constant", "profit"], axis=1, inplace=True)
if write_csv:
roundstr = "_" + str(i + 1) if forecast_settings.NUM_ROUNDS > 1 else ""
train.to_csv(os.path.join(TRAIN_DATA_DIR, "train" + roundstr + ".csv"))
test.to_csv(os.path.join(TEST_DATA_DIR, "test" + roundstr + ".csv"))
aux.to_csv(os.path.join(TRAIN_DATA_DIR, "aux" + roundstr + ".csv"))
yield train, test, aux
roundstr = "_" + str(i + 1) if n_splits > 1 else ""
train_df.to_csv(os.path.join(TRAIN_DATA_DIR, "train" + roundstr + ".csv"))
test_df.to_csv(os.path.join(TEST_DATA_DIR, "test" + roundstr + ".csv"))
aux_df.to_csv(os.path.join(TRAIN_DATA_DIR, "aux" + roundstr + ".csv"))
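# For example (illustrative): with n_splits=3 and write_csv=True, the first split
# is written as train_1.csv and aux_1.csv under /train and test_1.csv under /test;
# with n_splits=1 the suffix is dropped (train.csv, aux.csv, test.csv).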
train_df_list.append(train_df)
test_df_list.append(test_df)
aux_df_list.append(aux_df)
return train_df_list, test_df_list, aux_df_list
def specify_data_schema(
@@ -294,7 +328,6 @@ def _check_static_feat(df, ts_id_col_names, static_feat_names):
def specify_retail_data_schema(
data_dir,
forecast_settings,
sales=None,
target_col_name=DEFAULT_TARGET_COL,
static_feat_names=DEFAULT_STATIC_FEA,
@@ -309,7 +342,7 @@
print(df_config)
Args:
sales (Pandas DataFrame): sales data in the current forecast round
sales (Pandas DataFrame): sales data in the current forecast split
target_col_name (str): name of the target column that need to be forecasted
static_feat_names (list): names of the feature columns that do not change over time
dynamic_feat_names (list): names of the feature columns that can change over time
@@ -319,9 +352,9 @@
df_config (dict): configuration of the time series data
df (Pandas DataFrame): sales data combined with store demographic features
"""
# Read the 1st round training data if "sales" is not specified
# Read the 1st split of training data if "sales" is not specified
if sales is None:
print("Sales dataframe is not given! The 1st round training data will be used.")
print("Sales dataframe is not given! The 1st split of training data will be used.")
sales = pd.read_csv(os.path.join(data_dir, "train", "train_round_1.csv"), index_col=False)
aux = pd.read_csv(os.path.join(data_dir, "train", "aux_round_1.csv"), index_col=False)
# Merge with future price, deal, and advertisement info
@@ -361,9 +394,7 @@
df.drop("STORE", axis=1, inplace=True)
# Create timestamp
df["timestamp"] = df["week"].apply(
lambda x: forecast_settings.FIRST_WEEK_START + datetime.timedelta(days=(x - 1) * 7)
)
df["timestamp"] = df["week"].apply(lambda x: FIRST_WEEK_START + datetime.timedelta(days=(x - 1) * 7))
df_config = specify_data_schema(
df,
@@ -380,19 +411,13 @@
if __name__ == "__main__":
from fclib.common import forecast_settings
forecast_settings.NUM_ROUNDS = 3
data_dir = "/home/vapaunic/forecasting/ojdata"
for train, test, aux in split_train_test(data_dir=data_dir, forecast_settings=forecast_settings, write_csv=True):
print("Training data size: {}".format(train.shape))
print("Testing data size: {}".format(test.shape))
print("Auxiliary data size: {}".format(aux.shape))
print("Minimum training week number: {}".format(min(train["week"])))
print("Maximum training week number: {}".format(max(train["week"])))
print("Minimum testing week number: {}".format(min(test["week"])))
print("Maximum testing week number: {}".format(max(test["week"])))
print("Minimum auxiliary week number: {}".format(min(aux["week"])))
print("Maximum auxiliary week number: {}".format(max(aux["week"])))
print("")
download_ojdata(data_dir)
# train, test, aux = split_train_test(data_dir=data_dir, n_splits=1, horizon=2, write_csv=True)
# print((test[0].week))
# print((test[1].week))
# print((test[2].week))
# print((test[3].week))
# print((test[4].week))