Chenhui/light gbm quick start (#40)

* initial example notebook for lightgbm

* reduced to one round forecast

* added text

* added text

* added text

* moved week_of_month to feature engineering utils

* moved df_from_cartesian_product to feature utils

* moved functions to feature utils

* moved functions to feature utils

* added lightgbm model utils

* updated plots

* added text and renamed predict function

* reduced print out frequency in model training

* moved data visualization code to utils

* added text

* updated plot function and added docstring

* renamed the notebook

* updated text
Chenhui Hu 2020-01-28 14:02:27 -05:00 committed by GitHub
Parent 9d81bf4a16
Commit 487d6c35b9
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
6 changed files: 6121 additions and 4 deletions

File diffs are hidden because one or more lines are too long

View file

@@ -12,4 +12,4 @@ TRAIN_START_WEEK = 40
TRAIN_END_WEEK_LIST = list(range(135, 159, 2))
TEST_START_WEEK_LIST = list(range(137, 161, 2))
TEST_END_WEEK_LIST = list(range(138, 162, 2))
-FIRST_WEEK_START = (pd.to_datetime("1989-09-14 00:00:00"),) # The start datetime of the first week in the record
+FIRST_WEEK_START = pd.to_datetime("1989-09-14 00:00:00") # The start datetime of the first week in the record

View file

@@ -0,0 +1,104 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import math
import random
import itertools
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
def plot_predictions_with_history(
predictions,
history,
grain1_unique_vals,
grain2_unique_vals,
time_col_name,
target_col_name,
grain1_name="grain1",
grain2_name="grain2",
min_timestep=1,
num_samples=4,
predict_at_timestep=1,
line_at_predict_time=False,
title="Prediction results for a few sample time series",
x_label="time step",
y_label="target value",
random_seed=2,
):
"""Plot prediction results with historical values
Args:
predictions (Dataframe): Prediction results with a time step column (e.g., week_index), a
forecasted value column (e.g., forecasted sales of each store-brand), and two columns that
identify each individual time series (e.g., store_id and brand_id)
history (Dataframe): A dataframe containing historical values of the prediction target, a
time step column, and two columns that specify each time series
grain1_unique_vals (List): Unique values of the 1st column indicating the granularity of
the time series data (e.g., store_list)
grain2_unique_vals (List): Unique values of the 2nd column indicating the granularity of
the time series data (e.g., brand_list)
time_col_name (String): Name of the time step column (e.g., week_index)
target_col_name (String): Name of the forecast target column (e.g., unit_sales)
grain1_name (String): Name of the 1st column indicating the time series granularity
grain2_name (String): Name of the 2nd column indicating the time series granularity
min_timestep (Integer): Minimum time step shown in the plots
num_samples (Integer): Number of samples from all the time series (each combination of
grain1 column and grain2 column gives an individual time series)
predict_at_timestep (Integer): Time step at which the forecasts are made
line_at_predict_time (Boolean): Whether to add a vertical line indicating the time step
when the forecasts are made
title (String): Overall title of the plots
x_label (String): Label of the x-axis of the plots
y_label (String): Label of the y-axis of the plots
random_seed (Integer): Random seed used for sampling the time series
"""
random.seed(random_seed)
grain_combinations = list(itertools.product(grain1_unique_vals, grain2_unique_vals))
sample_grain_combinations = random.sample(grain_combinations, num_samples)
max_timestep = max(predictions[time_col_name].unique())
fig, axes = plt.subplots(nrows=math.ceil(num_samples / 2), ncols=2, figsize=(15, 5 * math.ceil(num_samples / 2)))
if axes.ndim == 1:
axes = np.reshape(axes, (1, axes.shape[0]))
fig.suptitle(title, y=1.02, fontsize=20)
sample_id = 0
for row in axes:
for col in row:
if sample_id < len(sample_grain_combinations):
[grain1_id, grain2_id] = sample_grain_combinations[sample_id]
history_sub = history.loc[
(history[grain1_name] == grain1_id)
& (history[grain2_name] == grain2_id)
& (history[time_col_name] <= max_timestep)
& (history[time_col_name] >= min_timestep)
]
predictions_sub = predictions.loc[
(predictions[grain1_name] == grain1_id)
& (predictions[grain2_name] == grain2_id)
& (predictions[time_col_name] >= min_timestep)
]
col.plot(history_sub[time_col_name], history_sub[target_col_name], marker="o")
col.plot(
predictions_sub[time_col_name],
predictions_sub[target_col_name],
linestyle="--",
marker="^",
color="red",
)
if line_at_predict_time:
col.axvline(x=predict_at_timestep, linestyle="--")
col.set_title("{} {} {} {}".format(grain1_name, grain1_id, grain2_name, grain2_id))
col.xaxis.set_major_locator(MaxNLocator(integer=True))
col.set_xlabel(x_label)
col.set_ylabel(y_label)
col.legend(labels=["actual", "predicted"])
sample_id += 1
else:
col.axis("off")
plt.tight_layout()
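As a rough usage sketch (not part of the commit), the function above could be called on a small synthetic dataset like the one below; all dataframe, list, and column names here are assumptions chosen to match the docstring examples, not the notebook's actual variables.

import itertools
import numpy as np
import pandas as pd

# Synthetic history: 2 stores x 2 brands over weeks 1-20, with random sales.
store_list, brand_list, weeks = [1, 2], [1, 2], list(range(1, 21))
history_df = pd.DataFrame(
    list(itertools.product(store_list, brand_list, weeks)),
    columns=["store_id", "brand_id", "week_index"],
)
history_df["unit_sales"] = np.random.rand(len(history_df)) * 100

# Pretend the last five weeks were forecasted (here we just reuse perturbed actuals).
pred_df = history_df[history_df["week_index"] >= 16].copy()
pred_df["unit_sales"] = pred_df["unit_sales"] * 1.1

# Assumes plot_predictions_with_history (defined above) is in scope.
plot_predictions_with_history(
    pred_df,
    history_df,
    grain1_unique_vals=store_list,
    grain2_unique_vals=brand_list,
    time_col_name="week_index",
    target_col_name="unit_sales",
    grain1_name="store_id",
    grain2_name="brand_id",
    num_samples=4,
    predict_at_timestep=16,
    line_at_predict_time=True,
)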

View file

@@ -2,9 +2,9 @@
# Licensed under the MIT License.
"""
-This file contains helper functions for creating features for TSPerf
-reference implementations and submissions. All functions defined assume
-that there is no missing data.
+This file contains utility functions for creating features for time
+series forecasting applications. All functions defined assume that
+there is no missing data.
"""
from datetime import timedelta
@@ -106,6 +106,31 @@ def week_of_year(datetime_col):
return datetime_col.dt.week
def week_of_month(date_time):
"""Returns the week of the month for a specified date.
Args:
date_time (Datetime): Input date
Returns:
wom (Integer): Week of the month of the input date
"""
def _week_of_month(date_time):
from math import ceil
first_day = date_time.replace(day=1)
dom = date_time.day
adjusted_dom = dom + first_day.weekday()
wom = int(ceil(adjusted_dom / 7.0))
return wom
if isinstance(date_time, pd.Series):
return date_time.apply(lambda x: _week_of_month(x))
else:
return _week_of_month(date_time)
def month_of_year(date_time_col):
"""Returns the month from a datetime column."""
return date_time_col.dt.month
@@ -804,3 +829,77 @@ def same_day_hour_moving_agg(
df.drop(["fct_diff", "value"], inplace=True, axis=1)
return df
def df_from_cartesian_product(dict_in):
"""Generate a Pandas dataframe from Cartesian product of lists.
Args:
dict_in (Dictionary): Dictionary containing multiple lists, e.g. {"fea1": list1, "fea2": list2}
Returns:
df (Dataframe): Dataframe corresponding to the Cartesian product of the lists
"""
from itertools import product
cart = list(product(*dict_in.values()))
df = pd.DataFrame(cart, columns=dict_in.keys())
return df
def lagged_features(df, lags):
"""Create lagged features based on time series data.
Args:
df (Dataframe): Input time series data sorted by time
lags (List): Lag lengths
Returns:
fea (Dataframe): Lagged features
"""
df_list = []
for lag in lags:
df_shifted = df.shift(lag)
df_shifted.columns = [x + "_lag" + str(lag) for x in df_shifted.columns]
df_list.append(df_shifted)
fea = pd.concat(df_list, axis=1)
return fea
def moving_averages(df, start_step, window_size=None):
"""Compute averages of every feature over moving time windows.
Args:
df (Dataframe): Input features as a dataframe
start_step (Integer): Starting time step of rolling mean
window_size (Integer): Window size of rolling mean
Returns:
fea (Dataframe): Dataframe consisting of the moving averages
"""
if window_size is None:
# Use a large window to compute average over all historical data
window_size = df.shape[0]
fea = df.shift(start_step).rolling(min_periods=1, center=False, window=window_size).mean()
fea.columns = fea.columns + "_mean"
return fea
def combine_features(df, lag_fea, lags, window_size, used_columns):
"""Combine lag features, moving average features, and orignal features in the data.
Args:
df (Dataframe): Time series data including the target series and external features
lag_fea (List): A list of column names for creating lagged features
lags (Numpy Array): Numpy array including all the lags
window_size (Integer): Window size of rolling mean
used_columns (List): A list containing the names of columns that are needed in the
input dataframe (including the target column)
Returns:
fea_all (Dataframe): Dataframe including all the features
"""
lagged_fea = lagged_features(df[lag_fea], lags)
moving_avg = moving_averages(df[lag_fea], 2, window_size)
fea_all = pd.concat([df[used_columns], lagged_fea, moving_avg], axis=1)
return fea_all
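A minimal sketch of how these helpers might be combined (synthetic data; the id lists, week range, and the "sales" column name are assumptions, not taken from the notebook):

import numpy as np
import pandas as pd

# A full store x brand x week grid, so every series gets a row for every week.
grid = df_from_cartesian_product({"store": [1, 2], "brand": [1, 2], "week": list(range(40, 160))})

# Lag and moving-average features for a single series sorted by week; the columns
# listed in used_columns are carried through unchanged.
one_series = pd.DataFrame({"week": list(range(40, 160)), "sales": np.random.rand(120) * 100})
fea_all = combine_features(
    one_series,
    lag_fea=["sales"],
    lags=np.arange(2, 10),
    window_size=4,
    used_columns=["week", "sales"],
)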

View file

@@ -0,0 +1,30 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
This file contains utility functions for building LightGBM models to
solve time series forecasting problems.
"""
import pandas as pd
def predict(df, model, target_col, idx_cols, integer_output=True):
"""Predict target variable with a trained LightGBM model.
Args:
df (Dataframe): Dataframe including all needed features
model (Model): Trained LightGBM model
target_col (String): Name of the target column
idx_cols (List): List of the names of the index columns, e.g., ["store", "brand", "week"]
integer_output (Boolean): If it is True, the forecasts will be rounded to integers
Returns:
Dataframe including the predictions of the target variable
"""
if target_col in df.columns:
df = df.drop(target_col, axis=1)
predictions = pd.DataFrame({target_col: model.predict(df)})
if integer_output:
predictions[target_col] = predictions[target_col].apply(lambda x: round(x))
return pd.concat([df[idx_cols].reset_index(drop=True), predictions], axis=1)
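For context, a hedged end-to-end sketch of how this helper might sit next to LightGBM training on synthetic data; the feature names, the "move" target column, and the parameter values are illustrative assumptions, not the notebook's actual settings.

import lightgbm as lgb
import numpy as np
import pandas as pd

# Synthetic features: index columns plus two assumed lag features and the target "move".
rng = np.random.RandomState(1)
data = pd.DataFrame(
    {"store": rng.randint(1, 5, 500), "brand": rng.randint(1, 12, 500), "week": rng.randint(40, 160, 500)}
)
data["move_lag2"] = rng.rand(500) * 100
data["move_lag3"] = rng.rand(500) * 100
data["move"] = 0.6 * data["move_lag2"] + 0.4 * data["move_lag3"] + rng.rand(500)

train_fea, test_fea = data.iloc[:400], data.iloc[400:]
dtrain = lgb.Dataset(train_fea.drop("move", axis=1), label=train_fea["move"])
model = lgb.train({"objective": "mape", "num_leaves": 31, "learning_rate": 0.1}, dtrain, num_boost_round=50)

# predict() (defined above) drops the target column if present, rounds the raw
# forecasts, and returns them alongside the index columns.
forecasts = predict(test_fea, model, target_col="move", idx_cols=["store", "brand", "week"], integer_output=True)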

View file

@@ -27,10 +27,12 @@ dependencies:
- scikit-learn=0.21.3
- pytest
- papermill>=1.0.1
- matplotlib=3.1.2
- r-base
- r-bayesm
- pip:
- black
- flake8
- jupytext==1.3.0
- lightgbm==2.3.1
# - fire==0.2.1 # for CLI capabilities