Chenhui/light gbm quick start (#40)
* initial example notebook for lightgbm * reduced to one round forecast * added text * added text * added text * moved week_of_month to feature engineering utils * moved df_from_cartesian_product to feature utils * moved functions to feature utils * moved functions to feature utils * added lightgbm model utils * updated plots * added text and renamed predict function * reduced print out frequency in model training * moved data visualization code to utils * added text * updated plot function and added docstring * renamed the notebook * updated text
This commit is contained in:
Родитель
9d81bf4a16
Коммит
487d6c35b9
Различия файлов скрыты, потому что одна или несколько строк слишком длинны
|
@ -12,4 +12,4 @@ TRAIN_START_WEEK = 40
|
|||
# Week indices delimiting each forecast round: the i-th entries of the three
# lists below belong together (end of training, start of test, end of test).
TRAIN_END_WEEK_LIST = list(range(135, 159, 2))
TEST_START_WEEK_LIST = list(range(137, 161, 2))
TEST_END_WEEK_LIST = list(range(138, 162, 2))
# Must be a single Timestamp; a stray trailing comma would silently turn it into a 1-tuple.
FIRST_WEEK_START = pd.to_datetime("1989-09-14 00:00:00")  # The start datetime of the first week in the record
|
||||
|
|
|
@ -0,0 +1,104 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
|
||||
import math
|
||||
import random
|
||||
import itertools
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.ticker import MaxNLocator
|
||||
|
||||
|
||||
def plot_predictions_with_history(
    predictions,
    history,
    grain1_unique_vals,
    grain2_unique_vals,
    time_col_name,
    target_col_name,
    grain1_name="grain1",
    grain2_name="grain2",
    min_timestep=1,
    num_samples=4,
    predict_at_timestep=1,
    line_at_predict_time=False,
    title="Prediction results for a few sample time series",
    x_label="time step",
    y_label="target value",
    random_seed=2,
):
    """Plot prediction results with historical values.

    A random subset of the time series (one per grain1/grain2 combination) is
    drawn on a two-column grid of subplots. Each subplot shows the historical
    values as a solid line and the predictions as a dashed red line.

    Args:
        predictions (Dataframe): Prediction results with a time step column (e.g., week_index), a
            forecasted value column (e.g., forecasted sales of each store-brand), and two columns that
            identify each individual time series (e.g., store_id and brand_id)
        history (Dataframe): A dataframe containing historical values of the prediction target, a
            time step column, and two columns that specify each time series
        grain1_unique_vals (List): Unique values of the 1st column indicating the granularity of
            the time series data (e.g., store_list)
        grain2_unique_vals (List): Unique values of the 2nd column indicating the granularity of
            the time series data (e.g., brand_list)
        time_col_name (String): Name of the time step column (e.g., week_index)
        target_col_name (String): Name of the forecast target column (e.g., unit_sales)
        grain1_name (String): Name of the 1st column indicating the time series granularity
        grain2_name (String): Name of the 2nd column indicating the time series granularity
        min_timestep (Integer): Minimum time step shown in the plots
        num_samples (Integer): Number of samples from all the time series (each combination of
            grain1 column and grain2 column gives an individual time series). If it exceeds the
            number of available combinations, every combination is plotted.
        predict_at_timestep (Integer): Time step at which the forecasts are made
        line_at_predict_time (Boolean): Whether to add a vertical line indicating the time step
            when the forecasts are made
        title (String): Overall title of the plots
        x_label (String): Label of the x-axis of the plots
        y_label (String): Label of the y-axis of the plots
        random_seed (Integer): Random seed used for sampling the time series

    Raises:
        ValueError: If num_samples is smaller than 1 or there is no grain combination to sample
    """
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")

    grain_combinations = list(itertools.product(grain1_unique_vals, grain2_unique_vals))
    if not grain_combinations:
        raise ValueError("grain1_unique_vals and grain2_unique_vals must both be non-empty")

    # Never request more series than exist; random.sample would raise otherwise.
    num_samples = min(num_samples, len(grain_combinations))

    # A private generator yields the same sample as seeding the module-level
    # one, without resetting the global random state of the calling program.
    rng = random.Random(random_seed)
    sample_grain_combinations = rng.sample(grain_combinations, num_samples)
    max_timestep = max(predictions[time_col_name].unique())

    num_rows = math.ceil(num_samples / 2)
    # squeeze=False keeps `axes` two-dimensional even when there is a single row.
    fig, axes = plt.subplots(nrows=num_rows, ncols=2, figsize=(15, 5 * num_rows), squeeze=False)
    fig.suptitle(title, y=1.02, fontsize=20)

    # axes.flat walks the grid row by row, left to right.
    for sample_id, ax in enumerate(axes.flat):
        if sample_id >= len(sample_grain_combinations):
            # Odd number of samples: blank out the unused last cell.
            ax.axis("off")
            continue

        grain1_id, grain2_id = sample_grain_combinations[sample_id]
        history_sub = history.loc[
            (history[grain1_name] == grain1_id)
            & (history[grain2_name] == grain2_id)
            & (history[time_col_name] <= max_timestep)
            & (history[time_col_name] >= min_timestep)
        ]
        predictions_sub = predictions.loc[
            (predictions[grain1_name] == grain1_id)
            & (predictions[grain2_name] == grain2_id)
            & (predictions[time_col_name] >= min_timestep)
        ]
        ax.plot(history_sub[time_col_name], history_sub[target_col_name], marker="o")
        ax.plot(
            predictions_sub[time_col_name],
            predictions_sub[target_col_name],
            linestyle="--",
            marker="^",
            color="red",
        )
        if line_at_predict_time:
            ax.axvline(x=predict_at_timestep, linestyle="--")
        ax.set_title("{} {} {} {}".format(grain1_name, grain1_id, grain2_name, grain2_id))
        # Time steps are integers, so keep the tick labels integral as well.
        ax.xaxis.set_major_locator(MaxNLocator(integer=True))
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.legend(labels=["actual", "predicted"])
    plt.tight_layout()
|
|
@ -2,9 +2,9 @@
|
|||
# Licensed under the MIT License.
|
||||
|
||||
"""
|
||||
This file contains helper functions for creating features for TSPerf
|
||||
reference implementations and submissions. All functions defined assume
|
||||
that there is no missing data.
|
||||
This file contains utility functions for creating features for time
|
||||
series forecasting applications. All functions defined assume that
|
||||
there is no missing data.
|
||||
"""
|
||||
|
||||
from datetime import timedelta
|
||||
|
@ -106,6 +106,31 @@ def week_of_year(datetime_col):
|
|||
return datetime_col.dt.week
|
||||
|
||||
|
||||
def week_of_month(date_time):
    """Compute the week of the month for a date or for every date in a Series.

    The day of the month is offset by the weekday of the first day of that
    month (Monday counts as 0), then divided by 7 and rounded up.

    Args:
        date_time (Datetime or Series): A single date, or a pandas Series of dates

    Returns:
        wom (Integer or Series): Week of the month of the input date; a Series of
            such values when the input is a Series
    """
    from math import ceil

    def _single_week_of_month(moment):
        # Weekday of the 1st tells how many days of the first week fall in the previous month.
        leading_offset = moment.replace(day=1).weekday()
        return int(ceil((moment.day + leading_offset) / 7.0))

    if not isinstance(date_time, pd.Series):
        return _single_week_of_month(date_time)
    return date_time.apply(_single_week_of_month)
|
||||
|
||||
|
||||
def month_of_year(date_time_col):
    """Extract the month from every entry of a datetime column.

    Args:
        date_time_col (Series): Column offering the pandas `.dt` accessor

    Returns:
        The `.dt.month` values of the input column
    """
    datetime_accessor = date_time_col.dt
    return datetime_accessor.month
|
||||
|
@ -804,3 +829,77 @@ def same_day_hour_moving_agg(
|
|||
df.drop(["fct_diff", "value"], inplace=True, axis=1)
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def df_from_cartesian_product(dict_in):
    """Build a Pandas dataframe holding the Cartesian product of several lists.

    Args:
        dict_in (Dictionary): Dictionary mapping column names to lists, e.g. {"fea1": list1, "fea2": list2}

    Returns:
        df (Dataframe): Dataframe with one column per dictionary key and one row per
            combination of the list values
    """
    from itertools import product

    column_names = dict_in.keys()
    # One tuple per combination, taking one value from each list in key order.
    all_rows = list(product(*dict_in.values()))
    return pd.DataFrame(all_rows, columns=column_names)
|
||||
|
||||
|
||||
def lagged_features(df, lags):
    """Build lagged copies of every column of a time series dataframe.

    Args:
        df (Dataframe): Input time series data sorted by time
        lags (List): Lag lengths

    Returns:
        fea (Dataframe): Lagged features, one group of columns per lag, each column
            named "<original name>_lag<lag>"
    """

    def _shift_and_rename(lag):
        # Shift all rows by `lag` steps and tag the column names with that lag.
        shifted = df.shift(lag)
        shifted.columns = [name + "_lag" + str(lag) for name in shifted.columns]
        return shifted

    return pd.concat([_shift_and_rename(lag) for lag in lags], axis=1)
|
||||
|
||||
|
||||
def moving_averages(df, start_step, window_size=None):
    """Average every feature over a moving time window.

    Args:
        df (Dataframe): Input features as a dataframe
        start_step (Integer): Number of steps the data is shifted before the rolling mean is taken
        window_size (Integer): Window size of rolling mean; when None, the number of rows
            of df is used so that the average covers all historical data

    Returns:
        fea (Dataframe): Dataframe consisting of the moving averages, with "_mean"
            appended to every column name
    """
    # Falling back to the row count makes the window span the whole history.
    effective_window = df.shape[0] if window_size is None else window_size
    shifted = df.shift(start_step)
    averaged = shifted.rolling(window=effective_window, min_periods=1, center=False).mean()
    averaged.columns = averaged.columns + "_mean"
    return averaged
|
||||
|
||||
|
||||
def combine_features(df, lag_fea, lags, window_size, used_columns):
    """Assemble lag features, moving average features, and original features.

    Args:
        df (Dataframe): Time series data including the target series and external features
        lag_fea (List): A list of column names for creating lagged features
        lags (Numpy Array): Numpy array including all the lags
        window_size (Integer): Window size of rolling mean
        used_columns (List): A list containing the names of columns that are needed in the
            input dataframe (including the target column)

    Returns:
        fea_all (Dataframe): Dataframe including all the features, in the order: original
            columns, lagged features, moving averages
    """
    lag_source = df[lag_fea]
    lag_part = lagged_features(lag_source, lags)
    # The moving averages always use a start step of 2.
    average_part = moving_averages(lag_source, 2, window_size)
    original_part = df[used_columns]
    return pd.concat([original_part, lag_part, average_part], axis=1)
|
||||
|
|
|
@ -0,0 +1,30 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""
|
||||
This file contains utility functions for building LightGBM models to
|
||||
solve time series forecasting problems.
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def predict(df, model, target_col, idx_cols, integer_output=True):
    """Generate forecasts of the target variable with a trained LightGBM model.

    Args:
        df (Dataframe): Dataframe including all needed features
        model (Model): Trained LightGBM model
        target_col (String): Name of the target column
        idx_cols (List): List of the names of the index columns, e.g. ["store", "brand", "week"]
        integer_output (Boolean): If it is True, the forecast will be rounded to an integer

    Returns:
        Dataframe including the index columns followed by the predictions of the target variable
    """
    # The model must not see the target, so strip it when the caller left it in.
    features = df.drop(target_col, axis=1) if target_col in df.columns else df
    forecast = pd.DataFrame({target_col: model.predict(features)})
    if integer_output:
        forecast[target_col] = forecast[target_col].apply(round)
    # Reset the index so the rows line up positionally with the fresh forecast frame.
    index_part = features[idx_cols].reset_index(drop=True)
    return pd.concat([index_part, forecast], axis=1)
|
|
@ -27,10 +27,12 @@ dependencies:
|
|||
- scikit-learn=0.21.3
|
||||
- pytest
|
||||
- papermill>=1.0.1
|
||||
- matplotlib=3.1.2
|
||||
- r-base
|
||||
- r-bayesm
|
||||
- pip:
|
||||
- black
|
||||
- flake8
|
||||
- jupytext==1.3.0
|
||||
- lightgbm==2.3.1
|
||||
# - fire==0.2.1 # for CLI capabilities
|
Загрузка…
Ссылка в новой задаче