* fixed a yield bug

* removed two blank files

* modified split data function to auto-calculate the splits based on the parameters

* removed forecast_settings module

* removed unused parameter

* modified splitting function to use non-overlapping testing

* tested the split function after the update

* minor fix

* defaults changed in split function

* modified lightgbm example with new split function

* modified automl example (needs verification)

* modified data explore notebook

* quick fix

* updated data preparation notebook

* changed defaults in split function

* addressed changes in lightgbm

* addressed issues in automl notebook

* fixed typo in lightgbm plot
vapaunic 2020-02-20 15:27:40 +00:00 committed by GitHub
Parent 6b2ef3f100
Commit 4cc59ac53e
8 changed files with 815 additions and 6140 deletions
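Taken together, the bullets above describe one API change: the `forecast_settings` module is removed, and `split_train_test` now takes the experiment parameters directly and returns lists of data frames instead of yielding them. A minimal sketch of the new call, based on the diff below (the `ojdata` path is illustrative):

    from fclib.dataset.ojdata import download_ojdata, split_train_test

    data_dir = "ojdata"  # illustrative download location
    download_ojdata(data_dir)

    # Two non-overlapping two-week test windows, with a gap of two weeks
    # between the last training week and the first test week.
    train_list, test_list, aux_list = split_train_test(
        data_dir=data_dir, n_splits=2, horizon=2, gap=2, write_csv=False
    )
    for train_df, test_df in zip(train_list, test_list):
        print(max(train_df.week), min(test_df.week), max(test_df.week))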

View File

@@ -67,10 +67,9 @@
"import pandas as pd\n",
"\n",
"from matplotlib import pyplot as plt\n",
"import fclib.common.forecast_settings as fs\n",
"from fclib.common.utils import git_repo_path\n",
"from fclib.evaluation.evaluation_utils import MAPE\n",
"from fclib.dataset.ojdata import download_ojdata\n",
"from fclib.dataset.ojdata import download_ojdata, FIRST_WEEK_START\n",
"from fclib.common.utils import align_outputs\n",
"from fclib.models.multiple_linear_regression import fit, predict\n",
"\n",
@@ -100,8 +99,9 @@
"# Data directory\n",
"DATA_DIR = os.path.join(git_repo_path(), \"ojdata\")\n",
"\n",
"# Maximum week of the data\n",
"MAX_WEEK = 138\n",
"# Forecasting settings\n",
"HORIZON = 2\n",
"LAST_WEEK = 138\n",
"\n",
"# Number of test periods\n",
"NUM_TEST_PERIODS = 3\n",
@@ -350,7 +350,7 @@
"if DOWNLOAD_SPLIT_DATA:\n",
" download_ojdata(DATA_DIR)\n",
" df = pd.read_csv(os.path.join(DATA_DIR, \"yx.csv\"))\n",
" df = df.loc[df.week <= MAX_WEEK]"
" df = df.loc[df.week <= LAST_WEEK]"
]
},
{
@@ -362,7 +362,7 @@
"# Convert logarithm of the unit sales to unit sales\n",
"df[\"move\"] = df[\"logmove\"].apply(lambda x: round(math.exp(x)))\n",
"# Add timestamp column\n",
"df[\"week_start\"] = df[\"week\"].apply(lambda x: fs.FIRST_WEEK_START + datetime.timedelta(days=(x - 1) * 7))\n",
"df[\"week_start\"] = df[\"week\"].apply(lambda x: FIRST_WEEK_START + datetime.timedelta(days=(x - 1) * 7))\n",
"# Select a subset of stores for demo purpose\n",
"df_sub = df[df.store.isin(USE_STORES)]"
]
@@ -2356,7 +2356,7 @@
}
],
"source": [
"pred_automl_sub = pred_automl.loc[pred_automl.week > max(test_df.week) - fs.PRED_STEPS]\n",
"pred_automl_sub = pred_automl.loc[pred_automl.week > max(test_df.week) - HORIZON]\n",
"mape_automl_sub = MAPE(pred_automl_sub[\"predicted\"], pred_automl_sub[\"move\"]) * 100\n",
"print(\"MAPE of forecasts obtained by AutoML in the last two weeks: \" + str(mape_automl_sub))"
]
@@ -2680,7 +2680,7 @@
}
],
"source": [
"pred_lr_sub = pred_lr.loc[pred_lr.week > max(test_df.week) - fs.PRED_STEPS]\n",
"pred_lr_sub = pred_lr.loc[pred_lr.week > max(test_df.week) - HORIZON]\n",
"mape_lr_sub = MAPE(pred_lr_sub[\"prediction\"], pred_lr_sub[\"move\"]) * 100\n",
"print(\"MAPE of forecasts obtained by multiple linear regression in the last two weeks: \" + str(mape_lr_sub))"
]
@@ -2741,7 +2741,7 @@
}
],
"source": [
"pred_final_sub = pred_final.loc[pred_final.week > max(test_df.week) - fs.PRED_STEPS]\n",
"pred_final_sub = pred_final.loc[pred_final.week > max(test_df.week) - HORIZON]\n",
"mape_final_sub = MAPE(pred_final_sub[\"combined_prediction\"], pred_final_sub[\"move\"]) * 100\n",
"print(\"MAPE of forecasts obtained by the combined model in the last two weeks: \" + str(mape_final_sub))"
]

File diffs are hidden because one or more lines are too long

View File

@@ -8,15 +8,15 @@
"\n",
"In this notebook, we use Python to explore the Orange Juice dataset in R package `bayesm`. This dataset is used in the retail forecasting benchmark [OrangeJuice_Pt_3Weeks_Weekly](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_git/TSPerf?path=%2Fretail_sales%2FOrangeJuice_Pt_3Weeks_Weekly&version=GBmaster) of TSPerf. \n",
"\n",
"To run this notebook, please first create and activate `forecast` conda environment by following the setup instructions.\n",
"To run this notebook, please first create and activate `forecasting_env` conda environment by following the setup instructions.\n",
"\n",
"Then, inside the `forecast` environment, please run the following commands to create the jupyter notebook kernel:\n",
"Then, inside the `forecasting_env` environment, please run the following commands to create the jupyter notebook kernel:\n",
"\n",
"``\n",
"python -m ipykernel install --name forecast\n",
"python -m ipykernel install --name forecasting_env\n",
"``\n",
"\n",
"Finally, you can launch the Jupyter notebook by running `jupyter notebook` and select the kernel named `forecast` in the list of kernels under Kernel tab."
"Finally, you can launch the Jupyter notebook by running `jupyter notebook` and select the kernel named `forecasting_env` in the list of kernels under Kernel tab."
]
},
{
@@ -147,10 +147,10 @@
],
"source": [
"# Check number of time series and lengths\n",
"print('number of stores is {}.'.format(len(sales.groupby(['store']).groups.keys())))\n",
"print('number of brands is {}.'.format(len(sales.groupby(['brand']).groups.keys())))\n",
"print('number of time series is {}.'.format(len(sales.groupby(['store', 'brand']).groups.keys())))\n",
"print('lenth distribution of the time series:')\n",
"print('Number of stores is {}.'.format(len(sales.groupby(['store']).groups.keys())))\n",
"print('Number of brands is {}.'.format(len(sales.groupby(['brand']).groups.keys())))\n",
"print('Number of time series is {}.'.format(len(sales.groupby(['store', 'brand']).groups.keys())))\n",
"print('Lenth distribution of the time series:')\n",
"print(sales.groupby(['store', 'brand']).size().describe())"
]
},
@@ -1160,9 +1160,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "tsperf",
"display_name": "forecasting_env",
"language": "python",
"name": "tsperf"
"name": "forecasting_env"
},
"language_info": {
"codemirror_mode": {
@@ -1174,7 +1174,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
"version": "3.6.10"
}
},
"nbformat": 4,

File diff not shown because it is too large

View File

@@ -1,2 +0,0 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

View File

@@ -1,15 +0,0 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# This module contains experiment related parameters.
import pandas as pd
NUM_ROUNDS = 1
PRED_HORIZON = 3
PRED_STEPS = 2
TRAIN_START_WEEK = 40
TRAIN_END_WEEK_LIST = list(range(135, 159, 2))
TEST_START_WEEK_LIST = list(range(137, 161, 2))
TEST_END_WEEK_LIST = list(range(138, 162, 2))
FIRST_WEEK_START = pd.to_datetime("1989-09-14 00:00:00") # The start datetime of the first week in the record

View File

View File

@@ -16,6 +16,9 @@ DEFAULT_TARGET_COL = "move"
DEFAULT_STATIC_FEA = None
DEFAULT_DYNAMIC_FEA = ["deal", "feat"]
# The start datetime of the first week in the record
FIRST_WEEK_START = pd.to_datetime("1989-09-14 00:00:00")
def download_ojdata(dest_dir):
"""Downloads Orange Juice dataset.
@@ -53,7 +56,29 @@ def maybe_download(dest_dir):
print("Data already exists at the specified location.")
def split_train_test(data_dir, forecast_settings, write_csv=False):
def _gen_split_indices(n_splits=12, horizon=2, gap=2, first_week=40, last_week=156):
"""Generate week splits for given parameters"""
test_start_index = last_week - (horizon * n_splits) + 1
train_end_index_first = test_start_index - gap
train_end_index_last = train_end_index_first + (n_splits - 1) * horizon
assert (
test_start_index >= first_week
), f"Please adjust your parameters, so that testing data (currently week {test_start_index}), \
starts after the first available week (week {first_week})."
assert (
train_end_index_first >= first_week
), f"Please adjust your parameters, so that last training data point (currently week {train_end_index_first}) \
comes after the first available week (week {first_week})."
test_start_week_list = list(range(test_start_index, (last_week - horizon + 1) + 1, horizon))
test_end_week_list = list(range(test_start_index + horizon - 1, last_week + 1, horizon))
train_end_week_list = list(range(train_end_index_first, train_end_index_last + 1, horizon))
return test_start_week_list, test_end_week_list, train_end_week_list
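# For example, with the defaults above (n_splits=12, horizon=2, gap=2,
# first_week=40, last_week=156): test_start_index = 156 - 2 * 12 + 1 = 133, so the
# twelve test windows are weeks 133-134, 135-136, ..., 155-156, and the matching
# training ranges end at weeks 131, 133, ..., 153. Adjacent test windows never
# overlap, and each training range ends `gap` weeks before its test window starts.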
def split_train_test(data_dir, n_splits=1, horizon=2, gap=2, first_week=40, last_week=156, write_csv=False):
"""Generate training, testing, and auxiliary datasets. Training data includes the historical
sales and external features; testing data contains the future sales and external features;
auxiliary data includes the future price, deal, and advertisement information which can be
@@ -64,30 +89,32 @@ def split_train_test(data_dir, forecast_settings, write_csv=False):
Note that train_*.csv files in /train folder contain all the features in the training period
and aux_*.csv files in /train folder contain all the features except 'logmove', 'constant',
'profit' up until the forecast period end week. Both train_*.csv and aux_*.csv can be used for
generating forecasts in each round. However, test_*.csv files in /test folder can only be used
generating forecasts in each split. However, test_*.csv files in /test folder can only be used
for model performance evaluation.
Example:
from fclib.common.utils import forecast_settings
data_dir = "/home/vapaunic/forecasting/ojdata"
data_dir = "/home/forecasting/ojdata"
train, test, aux = split_train_test(data_dir=data_dir, n_splits=5, horizon=3, write_csv=True)
for train, test, aux in split_train_test(data_dir=data_dir, forecast_settings=forecast_settings):
print("Training data size: {}".format(train.shape))
print("Testing data size: {}".format(test.shape))
print("Auxiliary data size: {}".format(aux.shape))
print("Minimum training week number: {}".format(min(train["week"])))
print("Maximum training week number: {}".format(max(train["week"])))
print("Minimum testing week number: {}".format(min(test["week"])))
print("Maximum testing week number: {}".format(max(test["week"])))
print("Minimum auxiliary week number: {}".format(min(aux["week"])))
print("Maximum auxiliary week number: {}".format(max(aux["week"])))
print("")
print(len(train))
print(len(test))
print(len(aux))
Args:
data_dir (str): location of the download directory
forecast_settings (dict): dictionary containing forecast experiment parameters
write_csv (Boolean): Whether to write out the data files or not
n_splits (int, optional): number of splits (folds) to generate (default: 1)
horizon (int, optional): forecasting horizon, number of weeks to forecast (default: 2)
gap (int, optional): gap between training and testing, number of weeks between last training
week and first test week (default: 2)
first_week (int, optional): first available week (default: 40)
last_week (int, optional): last available week (default: 156)
write_csv (Boolean, optional): Whether to write out the data files or not (default: False)
Returns:
list[pandas.DataFrame]: a list containing train data frames for each split
list[pandas.DataFrame]: a list containing test data frames for each split
list[pandas.DataFrame]: a list containing aux data frames for each split
"""
# Read sales data into dataframe
@@ -101,27 +128,34 @@ def split_train_test(data_dir, forecast_settings, write_csv=False):
if not os.path.isdir(TEST_DATA_DIR):
os.mkdir(TEST_DATA_DIR)
for i in range(forecast_settings.NUM_ROUNDS):
data_mask = (sales.week >= forecast_settings.TRAIN_START_WEEK) & (
sales.week <= forecast_settings.TRAIN_END_WEEK_LIST[i]
)
train = sales[data_mask].copy()
data_mask = (sales.week >= forecast_settings.TEST_START_WEEK_LIST[i]) & (
sales.week <= forecast_settings.TEST_END_WEEK_LIST[i]
)
test = sales[data_mask].copy()
data_mask = (sales.week >= forecast_settings.TRAIN_START_WEEK) & (
sales.week <= forecast_settings.TEST_END_WEEK_LIST[i]
)
aux = sales[data_mask].copy()
aux.drop(["logmove", "constant", "profit"], axis=1, inplace=True)
train_df_list = list()
test_df_list = list()
aux_df_list = list()
test_start_week_list, test_end_week_list, train_end_week_list = _gen_split_indices(
n_splits, horizon, gap, first_week, last_week
)
for i in range(n_splits):
data_mask = (sales.week >= first_week) & (sales.week <= train_end_week_list[i])
train_df = sales[data_mask].copy()
data_mask = (sales.week >= test_start_week_list[i]) & (sales.week <= test_end_week_list[i])
test_df = sales[data_mask].copy()
data_mask = (sales.week >= first_week) & (sales.week <= test_end_week_list[i])
aux_df = sales[data_mask].copy()
aux_df.drop(["logmove", "constant", "profit"], axis=1, inplace=True)
if write_csv:
roundstr = "_" + str(i + 1) if forecast_settings.NUM_ROUNDS > 1 else ""
train.to_csv(os.path.join(TRAIN_DATA_DIR, "train" + roundstr + ".csv"))
test.to_csv(os.path.join(TEST_DATA_DIR, "test" + roundstr + ".csv"))
aux.to_csv(os.path.join(TRAIN_DATA_DIR, "aux" + roundstr + ".csv"))
yield train, test, aux
roundstr = "_" + str(i + 1) if n_splits > 1 else ""
train_df.to_csv(os.path.join(TRAIN_DATA_DIR, "train" + roundstr + ".csv"))
test_df.to_csv(os.path.join(TEST_DATA_DIR, "test" + roundstr + ".csv"))
aux_df.to_csv(os.path.join(TRAIN_DATA_DIR, "aux" + roundstr + ".csv"))
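# For example (illustrative): with n_splits=3 and write_csv=True, the first split
# is written as train_1.csv and aux_1.csv under /train and test_1.csv under /test;
# with n_splits=1 the suffix is dropped (train.csv, aux.csv, test.csv).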
train_df_list.append(train_df)
test_df_list.append(test_df)
aux_df_list.append(aux_df)
return train_df_list, test_df_list, aux_df_list
def specify_data_schema(
@@ -294,7 +328,6 @@ def _check_static_feat(df, ts_id_col_names, static_feat_names):
def specify_retail_data_schema(
data_dir,
forecast_settings,
sales=None,
target_col_name=DEFAULT_TARGET_COL,
static_feat_names=DEFAULT_STATIC_FEA,
@@ -309,7 +342,7 @@
print(df_config)
Args:
sales (Pandas DataFrame): sales data in the current forecast round
sales (Pandas DataFrame): sales data in the current forecast split
target_col_name (str): name of the target column that need to be forecasted
static_feat_names (list): names of the feature columns that do not change over time
dynamic_feat_names (list): names of the feature columns that can change over time
@@ -319,9 +352,9 @@
df_config (dict): configuration of the time series data
df (Pandas DataFrame): sales data combined with store demographic features
"""
# Read the 1st round training data if "sales" is not specified
# Read the 1st split of training data if "sales" is not specified
if sales is None:
print("Sales dataframe is not given! The 1st round training data will be used.")
print("Sales dataframe is not given! The 1st split of training data will be used.")
sales = pd.read_csv(os.path.join(data_dir, "train", "train_round_1.csv"), index_col=False)
aux = pd.read_csv(os.path.join(data_dir, "train", "aux_round_1.csv"), index_col=False)
# Merge with future price, deal, and advertisement info
@@ -361,9 +394,7 @@
df.drop("STORE", axis=1, inplace=True)
# Create timestamp
df["timestamp"] = df["week"].apply(
lambda x: forecast_settings.FIRST_WEEK_START + datetime.timedelta(days=(x - 1) * 7)
)
df["timestamp"] = df["week"].apply(lambda x: FIRST_WEEK_START + datetime.timedelta(days=(x - 1) * 7))
df_config = specify_data_schema(
df,
@@ -380,19 +411,13 @@
if __name__ == "__main__":
from fclib.common import forecast_settings
forecast_settings.NUM_ROUNDS = 3
data_dir = "/home/vapaunic/forecasting/ojdata"
for train, test, aux in split_train_test(data_dir=data_dir, forecast_settings=forecast_settings, write_csv=True):
print("Training data size: {}".format(train.shape))
print("Testing data size: {}".format(test.shape))
print("Auxiliary data size: {}".format(aux.shape))
print("Minimum training week number: {}".format(min(train["week"])))
print("Maximum training week number: {}".format(max(train["week"])))
print("Minimum testing week number: {}".format(min(test["week"])))
print("Maximum testing week number: {}".format(max(test["week"])))
print("Minimum auxiliary week number: {}".format(min(aux["week"])))
print("Maximum auxiliary week number: {}".format(max(aux["week"])))
print("")
download_ojdata(data_dir)
# train, test, aux = split_train_test(data_dir=data_dir, n_splits=1, horizon=2, write_csv=True)
# print((test[0].week))
# print((test[1].week))
# print((test[2].week))
# print((test[3].week))
# print((test[4].week))