Vapaunic/split bug (#65)
* Fixed a yield bug
* Removed two blank files
* Modified the split data function to auto-calculate the splits based on the parameters
* Removed the forecast_settings module
* Removed an unused parameter
* Modified the splitting function to use non-overlapping testing
* Tested the split function after the update
* Minor fix
* Changed defaults in the split function
* Modified the lightgbm example to use the new split function
* Modified the automl example (needs verification)
* Modified the data exploration notebook
* Quick fix
* Updated the data preparation notebook
* Changed defaults in the split function
* Addressed review changes in the lightgbm notebook
* Addressed issues in the automl notebook
* Fixed a typo in the lightgbm plot
This commit is contained in:
Parent: 6b2ef3f100
Commit: 4cc59ac53e
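The central change is that `split_train_test` no longer reads overlapping test rounds from `forecast_settings`; it derives non-overlapping splits from explicit parameters. As a quick orientation before the diffs, here is a minimal sketch of the week arithmetic from the new `_gen_split_indices` (reproduced from the ojdata.py hunk further down), using its defaults:

```python
# Minimal sketch of the split arithmetic introduced in _gen_split_indices below;
# the defaults mirror the new function signature.
n_splits, horizon, gap, first_week, last_week = 12, 2, 2, 40, 156

test_start = last_week - horizon * n_splits + 1              # 156 - 24 + 1 = 133
train_end_first = test_start - gap                           # 133 - 2 = 131
train_end_last = train_end_first + (n_splits - 1) * horizon  # 131 + 22 = 153

test_start_weeks = list(range(test_start, (last_week - horizon + 1) + 1, horizon))
test_end_weeks = list(range(test_start + horizon - 1, last_week + 1, horizon))
train_end_weeks = list(range(train_end_first, train_end_last + 1, horizon))

print(test_start_weeks)  # [133, 135, ..., 155]: test windows no longer overlap
print(test_end_weeks)    # [134, 136, ..., 156]
print(train_end_weeks)   # [131, 133, ..., 153]: training ends `gap` weeks before each test window
```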
@@ -67,10 +67,9 @@
 "import pandas as pd\n",
 "\n",
 "from matplotlib import pyplot as plt\n",
-"import fclib.common.forecast_settings as fs\n",
 "from fclib.common.utils import git_repo_path\n",
 "from fclib.evaluation.evaluation_utils import MAPE\n",
-"from fclib.dataset.ojdata import download_ojdata\n",
+"from fclib.dataset.ojdata import download_ojdata, FIRST_WEEK_START\n",
 "from fclib.common.utils import align_outputs\n",
 "from fclib.models.multiple_linear_regression import fit, predict\n",
 "\n",
@@ -100,8 +99,9 @@
 "# Data directory\n",
 "DATA_DIR = os.path.join(git_repo_path(), \"ojdata\")\n",
 "\n",
-"# Maximum week of the data\n",
-"MAX_WEEK = 138\n",
+"# Forecasting settings\n",
+"HORIZON = 2\n",
+"LAST_WEEK = 138\n",
 "\n",
 "# Number of test periods\n",
 "NUM_TEST_PERIODS = 3\n",
@@ -350,7 +350,7 @@
 "if DOWNLOAD_SPLIT_DATA:\n",
 "    download_ojdata(DATA_DIR)\n",
 "    df = pd.read_csv(os.path.join(DATA_DIR, \"yx.csv\"))\n",
-"    df = df.loc[df.week <= MAX_WEEK]"
+"    df = df.loc[df.week <= LAST_WEEK]"
 ]
 },
 {
@@ -362,7 +362,7 @@
 "# Convert logarithm of the unit sales to unit sales\n",
 "df[\"move\"] = df[\"logmove\"].apply(lambda x: round(math.exp(x)))\n",
 "# Add timestamp column\n",
-"df[\"week_start\"] = df[\"week\"].apply(lambda x: fs.FIRST_WEEK_START + datetime.timedelta(days=(x - 1) * 7))\n",
+"df[\"week_start\"] = df[\"week\"].apply(lambda x: FIRST_WEEK_START + datetime.timedelta(days=(x - 1) * 7))\n",
 "# Select a subset of stores for demo purpose\n",
 "df_sub = df[df.store.isin(USE_STORES)]"
 ]
@@ -2356,7 +2356,7 @@
 }
 ],
 "source": [
-"pred_automl_sub = pred_automl.loc[pred_automl.week > max(test_df.week) - fs.PRED_STEPS]\n",
+"pred_automl_sub = pred_automl.loc[pred_automl.week > max(test_df.week) - HORIZON]\n",
 "mape_automl_sub = MAPE(pred_automl_sub[\"predicted\"], pred_automl_sub[\"move\"]) * 100\n",
 "print(\"MAPE of forecasts obtained by AutoML in the last two weeks: \" + str(mape_automl_sub))"
 ]
@@ -2680,7 +2680,7 @@
 }
 ],
 "source": [
-"pred_lr_sub = pred_lr.loc[pred_lr.week > max(test_df.week) - fs.PRED_STEPS]\n",
+"pred_lr_sub = pred_lr.loc[pred_lr.week > max(test_df.week) - HORIZON]\n",
 "mape_lr_sub = MAPE(pred_lr_sub[\"prediction\"], pred_lr_sub[\"move\"]) * 100\n",
 "print(\"MAPE of forecasts obtained by multiple linear regression in the last two weeks: \" + str(mape_lr_sub))"
 ]
@@ -2741,7 +2741,7 @@
 }
 ],
 "source": [
-"pred_final_sub = pred_final.loc[pred_final.week > max(test_df.week) - fs.PRED_STEPS]\n",
+"pred_final_sub = pred_final.loc[pred_final.week > max(test_df.week) - HORIZON]\n",
 "mape_final_sub = MAPE(pred_final_sub[\"combined_prediction\"], pred_final_sub[\"move\"]) * 100\n",
 "print(\"MAPE of forecasts obtained by the combined model in the last two weeks: \" + str(mape_final_sub))"
 ]
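All three evaluation hunks above make the same substitution: the scoring window is now expressed with `HORIZON` instead of the deleted `fs.PRED_STEPS`, and the filter keeps exactly the last `HORIZON` weeks of the test period. A small self-contained illustration with made-up week numbers (the frame and values are hypothetical, not taken from the notebooks):

```python
import pandas as pd

# Toy stand-in for pred_automl / pred_lr / pred_final.
pred = pd.DataFrame({"week": [135, 136, 137, 138], "predicted": [1.0, 2.0, 3.0, 4.0]})
HORIZON = 2
max_test_week = 138  # stands in for max(test_df.week)

# week > 138 - 2 keeps weeks 137 and 138, i.e. exactly the last HORIZON weeks.
last_weeks = pred.loc[pred.week > max_test_week - HORIZON]
print(last_weeks.week.tolist())  # [137, 138]
```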
Some file diffs are hidden because one or more lines are too long.
@@ -8,15 +8,15 @@
 "\n",
 "In this notebook, we use Python to explore the Orange Juice dataset in R package `bayesm`. This dataset is used in the retail forecasting benchmark [OrangeJuice_Pt_3Weeks_Weekly](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_git/TSPerf?path=%2Fretail_sales%2FOrangeJuice_Pt_3Weeks_Weekly&version=GBmaster) of TSPerf.\n",
 "\n",
-"To run this notebook, please first create and activate the `forecast` conda environment by following the setup instructions.\n",
+"To run this notebook, please first create and activate the `forecasting_env` conda environment by following the setup instructions.\n",
 "\n",
-"Then, inside the `forecast` environment, please run the following commands to create the jupyter notebook kernel:\n",
+"Then, inside the `forecasting_env` environment, please run the following commands to create the jupyter notebook kernel:\n",
 "\n",
 "``\n",
-"python -m ipykernel install --name forecast\n",
+"python -m ipykernel install --name forecasting_env\n",
 "``\n",
 "\n",
-"Finally, you can launch the Jupyter notebook by running `jupyter notebook` and selecting the kernel named `forecast` in the list of kernels under the Kernel tab."
+"Finally, you can launch the Jupyter notebook by running `jupyter notebook` and selecting the kernel named `forecasting_env` in the list of kernels under the Kernel tab."
 ]
 },
 {
@@ -147,10 +147,10 @@
 ],
 "source": [
 "# Check number of time series and lengths\n",
-"print('number of stores is {}.'.format(len(sales.groupby(['store']).groups.keys())))\n",
-"print('number of brands is {}.'.format(len(sales.groupby(['brand']).groups.keys())))\n",
-"print('number of time series is {}.'.format(len(sales.groupby(['store', 'brand']).groups.keys())))\n",
-"print('lenth distribution of the time series:')\n",
+"print('Number of stores is {}.'.format(len(sales.groupby(['store']).groups.keys())))\n",
+"print('Number of brands is {}.'.format(len(sales.groupby(['brand']).groups.keys())))\n",
+"print('Number of time series is {}.'.format(len(sales.groupby(['store', 'brand']).groups.keys())))\n",
+"print('Length distribution of the time series:')\n",
 "print(sales.groupby(['store', 'brand']).size().describe())"
 ]
 },
@@ -1160,9 +1160,9 @@
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "tsperf",
+"display_name": "forecasting_env",
 "language": "python",
-"name": "tsperf"
+"name": "forecasting_env"
 },
 "language_info": {
 "codemirror_mode": {
@@ -1174,7 +1174,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.6.8"
+"version": "3.6.10"
 }
 },
 "nbformat": 4,
One file diff is not shown because of its large size.
@@ -1,2 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
@@ -1,15 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
-
-# This module contains experiment related parameters.
-
-import pandas as pd
-
-NUM_ROUNDS = 1
-PRED_HORIZON = 3
-PRED_STEPS = 2
-TRAIN_START_WEEK = 40
-TRAIN_END_WEEK_LIST = list(range(135, 159, 2))
-TEST_START_WEEK_LIST = list(range(137, 161, 2))
-TEST_END_WEEK_LIST = list(range(138, 162, 2))
-FIRST_WEEK_START = pd.to_datetime("1989-09-14 00:00:00")  # The start datetime of the first week in the record
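With `forecast_settings` deleted above, its knobs become explicit keyword arguments of `split_train_test` (see the ojdata.py hunks below). A hedged before/after sketch of the call site, with `fs` standing for the removed module and `data_dir` a hypothetical path; the parameter mapping is inferred from this commit's diffs, not an official table:

```python
# Before this commit: settings lived in the (now deleted) forecast_settings module.
# from fclib.common import forecast_settings as fs
# for train, test, aux in split_train_test(data_dir, forecast_settings=fs, write_csv=True):
#     ...

# After this commit: explicit parameters. Roughly, fs.NUM_ROUNDS -> n_splits,
# fs.PRED_STEPS -> horizon (compare the fs.PRED_STEPS -> HORIZON edits in the
# notebook hunks above), fs.TRAIN_START_WEEK -> first_week, and the
# TRAIN/TEST_*_WEEK_LIST constants are auto-calculated by _gen_split_indices.
data_dir = "ojdata"  # hypothetical download location
train_list, test_list, aux_list = split_train_test(
    data_dir, n_splits=1, horizon=2, gap=2, first_week=40, last_week=156, write_csv=True
)
```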
@@ -16,6 +16,9 @@ DEFAULT_TARGET_COL = "move"
 DEFAULT_STATIC_FEA = None
 DEFAULT_DYNAMIC_FEA = ["deal", "feat"]
 
+# The start datetime of the first week in the record
+FIRST_WEEK_START = pd.to_datetime("1989-09-14 00:00:00")
+
 
 def download_ojdata(dest_dir):
     """Downloads Orange Juice dataset.
@@ -53,7 +56,29 @@ def maybe_download(dest_dir):
         print("Data already exists at the specified location.")
 
 
-def split_train_test(data_dir, forecast_settings, write_csv=False):
+def _gen_split_indices(n_splits=12, horizon=2, gap=2, first_week=40, last_week=156):
+    """Generate week splits for given parameters."""
+    test_start_index = last_week - (horizon * n_splits) + 1
+    train_end_index_first = test_start_index - gap
+    train_end_index_last = train_end_index_first + (n_splits - 1) * horizon
+
+    assert (
+        test_start_index >= first_week
+    ), f"Please adjust your parameters so that the testing data (currently week {test_start_index}) \
+        starts after the first available week (week {first_week})."
+
+    assert (
+        train_end_index_first >= first_week
+    ), f"Please adjust your parameters so that the last training data point (currently week {train_end_index_first}) \
+        comes after the first available week (week {first_week})."
+
+    test_start_week_list = list(range(test_start_index, (last_week - horizon + 1) + 1, horizon))
+    test_end_week_list = list(range(test_start_index + horizon - 1, last_week + 1, horizon))
+    train_end_week_list = list(range(train_end_index_first, train_end_index_last + 1, horizon))
+    return test_start_week_list, test_end_week_list, train_end_week_list
+
+
+def split_train_test(data_dir, n_splits=1, horizon=2, gap=2, first_week=40, last_week=156, write_csv=False):
     """Generate training, testing, and auxiliary datasets. Training data includes the historical
     sales and external features; testing data contains the future sales and external features;
     auxiliary data includes the future price, deal, and advertisement information which can be
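Worth noting in the hunk above: `_gen_split_indices` now guards against configurations that cannot fit in the available weeks. A hedged illustration of when the assertions fire, assuming the function is importable from `fclib.dataset.ojdata` as shown in this diff:

```python
from fclib.dataset.ojdata import _gen_split_indices  # assumed import path

# Fits: 12 non-overlapping 2-week test windows end exactly at week 156.
starts, ends, train_ends = _gen_split_indices(n_splits=12, horizon=2, gap=2,
                                              first_week=40, last_week=156)
assert starts[0] == 133 and ends[-1] == 156

# Does not fit: 60 splits of 2 weeks would need testing to begin at week 37,
# before the first available week (40), so the first assert fires.
try:
    _gen_split_indices(n_splits=60, horizon=2, gap=2, first_week=40, last_week=156)
except AssertionError as e:
    print(e)
```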
@@ -64,30 +89,32 @@ def split_train_test(data_dir, forecast_settings, write_csv=False):
     Note that train_*.csv files in /train folder contain all the features in the training period
     and aux_*.csv files in /train folder contain all the features except 'logmove', 'constant',
     'profit' up until the forecast period end week. Both train_*.csv and aux_*.csv can be used for
-    generating forecasts in each round. However, test_*.csv files in /test folder can only be used
+    generating forecasts in each split. However, test_*.csv files in /test folder can only be used
     for model performance evaluation.
 
     Example:
-        from fclib.common.utils import forecast_settings
-        data_dir = "/home/vapaunic/forecasting/ojdata"
-
-        for train, test, aux in split_train_test(data_dir=data_dir, forecast_settings=forecast_settings):
-            print("Training data size: {}".format(train.shape))
-            print("Testing data size: {}".format(test.shape))
-            print("Auxiliary data size: {}".format(aux.shape))
-            print("Minimum training week number: {}".format(min(train["week"])))
-            print("Maximum training week number: {}".format(max(train["week"])))
-            print("Minimum testing week number: {}".format(min(test["week"])))
-            print("Maximum testing week number: {}".format(max(test["week"])))
-            print("Minimum auxiliary week number: {}".format(min(aux["week"])))
-            print("Maximum auxiliary week number: {}".format(max(aux["week"])))
-            print("")
+        data_dir = "/home/forecasting/ojdata"
+        train, test, aux = split_train_test(data_dir=data_dir, n_splits=5, horizon=3, write_csv=True)
+
+        print(len(train))
+        print(len(test))
+        print(len(aux))
 
     Args:
         data_dir (str): location of the download directory
-        forecast_settings (dict): dictionary containing forecast experiment parameters
-        write_csv (Boolean): Whether to write out the data files or not
+        n_splits (int, optional): number of splits (folds) to generate (default: 1)
+        horizon (int, optional): forecasting horizon, i.e. number of weeks to forecast (default: 2)
+        gap (int, optional): gap between training and testing, i.e. number of weeks between the last
+            training week and the first test week (default: 2)
+        first_week (int, optional): first available week (default: 40)
+        last_week (int, optional): last available week (default: 156)
+        write_csv (Boolean, optional): whether to write out the data files or not (default: False)
 
     Returns:
+        list[pandas.DataFrame]: a list containing train data frames for each split
+        list[pandas.DataFrame]: a list containing test data frames for each split
+        list[pandas.DataFrame]: a list containing aux data frames for each split
 
     """
     # Read sales data into dataframe
@@ -101,27 +128,34 @@ def split_train_test(data_dir, forecast_settings, write_csv=False):
     if not os.path.isdir(TEST_DATA_DIR):
         os.mkdir(TEST_DATA_DIR)
 
-    for i in range(forecast_settings.NUM_ROUNDS):
-        data_mask = (sales.week >= forecast_settings.TRAIN_START_WEEK) & (
-            sales.week <= forecast_settings.TRAIN_END_WEEK_LIST[i]
-        )
-        train = sales[data_mask].copy()
-        data_mask = (sales.week >= forecast_settings.TEST_START_WEEK_LIST[i]) & (
-            sales.week <= forecast_settings.TEST_END_WEEK_LIST[i]
-        )
-        test = sales[data_mask].copy()
-        data_mask = (sales.week >= forecast_settings.TRAIN_START_WEEK) & (
-            sales.week <= forecast_settings.TEST_END_WEEK_LIST[i]
-        )
-        aux = sales[data_mask].copy()
-        aux.drop(["logmove", "constant", "profit"], axis=1, inplace=True)
+    train_df_list = list()
+    test_df_list = list()
+    aux_df_list = list()
+
+    test_start_week_list, test_end_week_list, train_end_week_list = _gen_split_indices(
+        n_splits, horizon, gap, first_week, last_week
+    )
+
+    for i in range(n_splits):
+        data_mask = (sales.week >= first_week) & (sales.week <= train_end_week_list[i])
+        train_df = sales[data_mask].copy()
+        data_mask = (sales.week >= test_start_week_list[i]) & (sales.week <= test_end_week_list[i])
+        test_df = sales[data_mask].copy()
+        data_mask = (sales.week >= first_week) & (sales.week <= test_end_week_list[i])
+        aux_df = sales[data_mask].copy()
+        aux_df.drop(["logmove", "constant", "profit"], axis=1, inplace=True)
 
         if write_csv:
-            roundstr = "_" + str(i + 1) if forecast_settings.NUM_ROUNDS > 1 else ""
-            train.to_csv(os.path.join(TRAIN_DATA_DIR, "train" + roundstr + ".csv"))
-            test.to_csv(os.path.join(TEST_DATA_DIR, "test" + roundstr + ".csv"))
-            aux.to_csv(os.path.join(TRAIN_DATA_DIR, "aux" + roundstr + ".csv"))
-        yield train, test, aux
+            roundstr = "_" + str(i + 1) if n_splits > 1 else ""
+            train_df.to_csv(os.path.join(TRAIN_DATA_DIR, "train" + roundstr + ".csv"))
+            test_df.to_csv(os.path.join(TEST_DATA_DIR, "test" + roundstr + ".csv"))
+            aux_df.to_csv(os.path.join(TRAIN_DATA_DIR, "aux" + roundstr + ".csv"))
+
+        train_df_list.append(train_df)
+        test_df_list.append(test_df)
+        aux_df_list.append(aux_df)
+
+    return train_df_list, test_df_list, aux_df_list
 
 
 def specify_data_schema(
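This hunk is where the "fixed a yield bug" item from the commit message lands: the old body used `yield`, making `split_train_test` a generator whose work (including the `write_csv` side effect) only happened if the caller actually iterated it. The rewritten body runs eagerly and returns three lists. A small sketch of the behavioral difference, with a hypothetical `data_dir`:

```python
# With the old generator version, this call wrote NO csv files: generators do
# nothing until consumed, so the loop body (and write_csv logic) never ran.
# splits = split_train_test(data_dir, forecast_settings=fs, write_csv=True)

# With the new list-returning version, the same call eagerly computes every
# split and writes the csv files as it goes.
data_dir = "ojdata"  # hypothetical path to the downloaded data
train_list, test_list, aux_list = split_train_test(data_dir, n_splits=3, write_csv=True)
print(len(train_list), len(test_list), len(aux_list))  # 3 3 3
```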
@@ -294,7 +328,6 @@ def _check_static_feat(df, ts_id_col_names, static_feat_names):
 
 def specify_retail_data_schema(
     data_dir,
-    forecast_settings,
     sales=None,
     target_col_name=DEFAULT_TARGET_COL,
     static_feat_names=DEFAULT_STATIC_FEA,
@@ -309,7 +342,7 @@ def specify_retail_data_schema(
         print(df_config)
 
     Args:
-        sales (Pandas DataFrame): sales data in the current forecast round
+        sales (Pandas DataFrame): sales data in the current forecast split
         target_col_name (str): name of the target column that needs to be forecasted
         static_feat_names (list): names of the feature columns that do not change over time
         dynamic_feat_names (list): names of the feature columns that can change over time
@@ -319,9 +352,9 @@ def specify_retail_data_schema(
         df_config (dict): configuration of the time series data
         df (Pandas DataFrame): sales data combined with store demographic features
     """
-    # Read the 1st round training data if "sales" is not specified
+    # Read the 1st split of training data if "sales" is not specified
     if sales is None:
-        print("Sales dataframe is not given! The 1st round training data will be used.")
+        print("Sales dataframe is not given! The 1st split of training data will be used.")
         sales = pd.read_csv(os.path.join(data_dir, "train", "train_round_1.csv"), index_col=False)
         aux = pd.read_csv(os.path.join(data_dir, "train", "aux_round_1.csv"), index_col=False)
     # Merge with future price, deal, and advertisement info
@@ -361,9 +394,7 @@ def specify_retail_data_schema(
     df.drop("STORE", axis=1, inplace=True)
 
     # Create timestamp
-    df["timestamp"] = df["week"].apply(
-        lambda x: forecast_settings.FIRST_WEEK_START + datetime.timedelta(days=(x - 1) * 7)
-    )
+    df["timestamp"] = df["week"].apply(lambda x: FIRST_WEEK_START + datetime.timedelta(days=(x - 1) * 7))
 
     df_config = specify_data_schema(
         df,
@@ -380,19 +411,13 @@ def specify_retail_data_schema(
 
 
 if __name__ == "__main__":
-    from fclib.common import forecast_settings
-
-    forecast_settings.NUM_ROUNDS = 3
     data_dir = "/home/vapaunic/forecasting/ojdata"
 
-    for train, test, aux in split_train_test(data_dir=data_dir, forecast_settings=forecast_settings, write_csv=True):
-        print("Training data size: {}".format(train.shape))
-        print("Testing data size: {}".format(test.shape))
-        print("Auxiliary data size: {}".format(aux.shape))
-        print("Minimum training week number: {}".format(min(train["week"])))
-        print("Maximum training week number: {}".format(max(train["week"])))
-        print("Minimum testing week number: {}".format(min(test["week"])))
-        print("Maximum testing week number: {}".format(max(test["week"])))
-        print("Minimum auxiliary week number: {}".format(min(aux["week"])))
-        print("Maximum auxiliary week number: {}".format(max(aux["week"])))
-        print("")
+    download_ojdata(data_dir)
+    # train, test, aux = split_train_test(data_dir=data_dir, n_splits=1, horizon=2, write_csv=True)
+
+    # print((test[0].week))
+    # print((test[1].week))
+    # print((test[2].week))
+    # print((test[3].week))
+    # print((test[4].week))