Chenhui/light gbm quick start (#40)

* initial example notebook for lightgbm

* reduced to one round forecast

* added text

* added text

* added text

* moved week_of_month to feature engineering utils

* moved df_from_cartesian_product to feature utils

* moved functions to feature utils

* moved functions to feature utils

* added lightgbm model utils

* updated plots

* added text and renamed predict function

* reduced print out frequency in model training

* moved data visualization code to utils

* added text

* updated plot function and added docstring

* renamed the notebook

* updated text
Chenhui Hu 2020-01-28 14:02:27 -05:00 committed by GitHub
Parent 9d81bf4a16
Commit 487d6c35b9
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
6 changed files: 6121 additions and 4 deletions

File diffs are hidden because one or more lines are too long

View file

@@ -12,4 +12,4 @@ TRAIN_START_WEEK = 40
TRAIN_END_WEEK_LIST = list(range(135, 159, 2))
TEST_START_WEEK_LIST = list(range(137, 161, 2))
TEST_END_WEEK_LIST = list(range(138, 162, 2))
-FIRST_WEEK_START = (pd.to_datetime("1989-09-14 00:00:00"),) # The start datetime of the first week in the record
+FIRST_WEEK_START = pd.to_datetime("1989-09-14 00:00:00") # The start datetime of the first week in the record

View file

@@ -0,0 +1,104 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import math
import random
import itertools
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
def plot_predictions_with_history(
predictions,
history,
grain1_unique_vals,
grain2_unique_vals,
time_col_name,
target_col_name,
grain1_name="grain1",
grain2_name="grain2",
min_timestep=1,
num_samples=4,
predict_at_timestep=1,
line_at_predict_time=False,
title="Prediction results for a few sample time series",
x_label="time step",
y_label="target value",
random_seed=2,
):
"""Plot prediction results with historical values
Args:
predictions (Dataframe): Prediction results with a time step column (e.g., week_index), a
forecasted value column (e.g., forecasted sales of each store-brand), and two columns that
identify each individual time series (e.g., store_id and brand_id)
history (Dataframe): A dataframe containing historical values of the prediction target, a
time step column, and two columns that specify each time series
grain1_unique_vals (List): Unique values of the 1st column indicating the granularity of
the time series data (e.g., store_list)
grain2_unique_vals (List): Unique values of the 2nd column indicating the granularity of
the time series data (e.g., brand_list)
time_col_name (String): Name of the time step column (e.g., week_index)
target_col_name (String): Name of the forecast target column (e.g., unit_sales)
grain1_name (String): Name of the 1st column indicating the time series granularity
grain2_name (String): Name of the 2nd column indicating the time series granularity
min_timestep (Integer): Minimum time step shown in the plots
num_samples (Integer): Number of samples from all the time series (each combination of
grain1 column and grain2 column gives an individual time series)
predict_at_timestep (Integer): Time step at which the forecasts are made
line_at_predict_time (Boolean): Whether to add a vertical line indicating the time step
when the forecasts are made
title (String): Overall title of the plots
x_label (String): Label of the x-axis of the plots
y_label (String): Label of the y-axis of the plots
random_seed (Integer): Random seed used for sampling the time series
"""
random.seed(random_seed)
grain_combinations = list(itertools.product(grain1_unique_vals, grain2_unique_vals))
sample_grain_combinations = random.sample(grain_combinations, num_samples)
max_timestep = max(predictions[time_col_name].unique())
fig, axes = plt.subplots(nrows=math.ceil(num_samples / 2), ncols=2, figsize=(15, 5 * math.ceil(num_samples / 2)))
if axes.ndim == 1:
axes = np.reshape(axes, (1, axes.shape[0]))
fig.suptitle(title, y=1.02, fontsize=20)
sample_id = 0
for row in axes:
for col in row:
if sample_id < len(sample_grain_combinations):
[grain1_id, grain2_id] = sample_grain_combinations[sample_id]
history_sub = history.loc[
(history[grain1_name] == grain1_id)
& (history[grain2_name] == grain2_id)
& (history[time_col_name] <= max_timestep)
& (history[time_col_name] >= min_timestep)
]
predictions_sub = predictions.loc[
(predictions[grain1_name] == grain1_id)
& (predictions[grain2_name] == grain2_id)
& (predictions[time_col_name] >= min_timestep)
]
col.plot(history_sub[time_col_name], history_sub[target_col_name], marker="o")
col.plot(
predictions_sub[time_col_name],
predictions_sub[target_col_name],
linestyle="--",
marker="^",
color="red",
)
if line_at_predict_time:
col.axvline(x=predict_at_timestep, linestyle="--")
col.set_title("{} {} {} {}".format(grain1_name, grain1_id, grain2_name, grain2_id))
col.xaxis.set_major_locator(MaxNLocator(integer=True))
col.set_xlabel(x_label)
col.set_ylabel(y_label)
col.legend(labels=["actual", "predicted"])
sample_id += 1
else:
col.axis("off")
plt.tight_layout()
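As a rough usage sketch (not part of the commit), the function above could be called on a small synthetic dataset like the one below; all dataframe, list, and column names here are assumptions chosen to match the docstring examples, not the notebook's actual variables.

import itertools
import numpy as np
import pandas as pd

# Synthetic history: 2 stores x 2 brands over weeks 1-20, with random sales.
store_list, brand_list, weeks = [1, 2], [1, 2], list(range(1, 21))
history_df = pd.DataFrame(
    list(itertools.product(store_list, brand_list, weeks)),
    columns=["store_id", "brand_id", "week_index"],
)
history_df["unit_sales"] = np.random.rand(len(history_df)) * 100

# Pretend the last five weeks were forecasted (here we just reuse perturbed actuals).
pred_df = history_df[history_df["week_index"] >= 16].copy()
pred_df["unit_sales"] = pred_df["unit_sales"] * 1.1

# Assumes plot_predictions_with_history (defined above) is in scope.
plot_predictions_with_history(
    pred_df,
    history_df,
    grain1_unique_vals=store_list,
    grain2_unique_vals=brand_list,
    time_col_name="week_index",
    target_col_name="unit_sales",
    grain1_name="store_id",
    grain2_name="brand_id",
    num_samples=4,
    predict_at_timestep=16,
    line_at_predict_time=True,
)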

View file

@@ -2,9 +2,9 @@
# Licensed under the MIT License.
"""
-This file contains helper functions for creating features for TSPerf
-reference implementations and submissions. All functions defined assume
-that there is no missing data.
+This file contains utility functions for creating features for time
+series forecasting applications. All functions defined assume that
+there is no missing data.
"""
from datetime import timedelta
@@ -106,6 +106,31 @@ def week_of_year(datetime_col):
return datetime_col.dt.week
def week_of_month(date_time):
"""Returns the week of the month for a specified date.
Args:
date_time (Datetime): Input date
Returns:
wom (Integer): Week of the month of the input date
"""
def _week_of_month(date_time):
from math import ceil
first_day = date_time.replace(day=1)
dom = date_time.day
adjusted_dom = dom + first_day.weekday()
wom = int(ceil(adjusted_dom / 7.0))
return wom
if isinstance(date_time, pd.Series):
return date_time.apply(lambda x: _week_of_month(x))
else:
return _week_of_month(date_time)
def month_of_year(date_time_col):
"""Returns the month from a datetime column."""
return date_time_col.dt.month
@@ -804,3 +829,77 @@ def same_day_hour_moving_agg(
df.drop(["fct_diff", "value"], inplace=True, axis=1)
return df
def df_from_cartesian_product(dict_in):
"""Generate a Pandas dataframe from Cartesian product of lists.
Args:
dict_in (Dictionary): Dictionary containing multiple lists, e.g. {"fea1": list1, "fea2": list2}
Returns:
df (Dataframe): Dataframe corresponding to the Cartesian product of the lists
"""
from itertools import product
cart = list(product(*dict_in.values()))
df = pd.DataFrame(cart, columns=dict_in.keys())
return df
def lagged_features(df, lags):
"""Create lagged features based on time series data.
Args:
df (Dataframe): Input time series data sorted by time
lags (List): Lag lengths
Returns:
fea (Dataframe): Lagged features
"""
df_list = []
for lag in lags:
df_shifted = df.shift(lag)
df_shifted.columns = [x + "_lag" + str(lag) for x in df_shifted.columns]
df_list.append(df_shifted)
fea = pd.concat(df_list, axis=1)
return fea
def moving_averages(df, start_step, window_size=None):
"""Compute averages of every feature over moving time windows.
Args:
df (Dataframe): Input features as a dataframe
start_step (Integer): Starting time step of rolling mean
window_size (Integer): Window size of rolling mean
Returns:
fea (Dataframe): Dataframe consisting of the moving averages
"""
if window_size is None:
# Use a large window to compute average over all historical data
window_size = df.shape[0]
fea = df.shift(start_step).rolling(min_periods=1, center=False, window=window_size).mean()
fea.columns = fea.columns + "_mean"
return fea
def combine_features(df, lag_fea, lags, window_size, used_columns):
"""Combine lag features, moving average features, and orignal features in the data.
Args:
df (Dataframe): Time series data including the target series and external features
lag_fea (List): A list of column names for creating lagged features
lags (Numpy Array): Numpy array including all the lags
window_size (Integer): Window size of rolling mean
used_columns (List): A list containing the names of columns that are needed in the
input dataframe (including the target column)
Returns:
fea_all (Dataframe): Dataframe including all the features
"""
lagged_fea = lagged_features(df[lag_fea], lags)
moving_avg = moving_averages(df[lag_fea], 2, window_size)
fea_all = pd.concat([df[used_columns], lagged_fea, moving_avg], axis=1)
return fea_all
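A minimal sketch of how these helpers might be combined (synthetic data; the id lists, week range, and the "sales" column name are assumptions, not taken from the notebook):

import numpy as np
import pandas as pd

# A full store x brand x week grid, so every series gets a row for every week.
grid = df_from_cartesian_product({"store": [1, 2], "brand": [1, 2], "week": list(range(40, 160))})

# Lag and moving-average features for a single series sorted by week; the columns
# listed in used_columns are carried through unchanged.
one_series = pd.DataFrame({"week": list(range(40, 160)), "sales": np.random.rand(120) * 100})
fea_all = combine_features(
    one_series,
    lag_fea=["sales"],
    lags=np.arange(2, 10),
    window_size=4,
    used_columns=["week", "sales"],
)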

View file

@@ -0,0 +1,30 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
This file contains utility functions for building LightGBM models to
solve time series forecasting problems.
"""
import pandas as pd
def predict(df, model, target_col, idx_cols, integer_output=True):
"""Predict target variable with a trained LightGBM model.
Args:
df (Dataframe): Dataframe including all needed features
model (Model): Trained LightGBM model
target_col (String): Name of the target column
idx_cols (List): List of the names of the index columns, e.g., ["store", "brand", "week"]
integer_output (Boolean): If it is True, the forecasts will be rounded to integers
Returns:
Dataframe including the predictions of the target variable
"""
if target_col in df.columns:
df = df.drop(target_col, axis=1)
predictions = pd.DataFrame({target_col: model.predict(df)})
if integer_output:
predictions[target_col] = predictions[target_col].apply(lambda x: round(x))
return pd.concat([df[idx_cols].reset_index(drop=True), predictions], axis=1)
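For context, a hedged end-to-end sketch of how this helper might sit next to LightGBM training on synthetic data; the feature names, the "move" target column, and the parameter values are illustrative assumptions, not the notebook's actual settings.

import lightgbm as lgb
import numpy as np
import pandas as pd

# Synthetic features: index columns plus two assumed lag features and the target "move".
rng = np.random.RandomState(1)
data = pd.DataFrame(
    {"store": rng.randint(1, 5, 500), "brand": rng.randint(1, 12, 500), "week": rng.randint(40, 160, 500)}
)
data["move_lag2"] = rng.rand(500) * 100
data["move_lag3"] = rng.rand(500) * 100
data["move"] = 0.6 * data["move_lag2"] + 0.4 * data["move_lag3"] + rng.rand(500)

train_fea, test_fea = data.iloc[:400], data.iloc[400:]
dtrain = lgb.Dataset(train_fea.drop("move", axis=1), label=train_fea["move"])
model = lgb.train({"objective": "mape", "num_leaves": 31, "learning_rate": 0.1}, dtrain, num_boost_round=50)

# predict() (defined above) drops the target column if present, rounds the raw
# forecasts, and returns them alongside the index columns.
forecasts = predict(test_fea, model, target_col="move", idx_cols=["store", "brand", "week"], integer_output=True)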

View file

@@ -27,10 +27,12 @@ dependencies:
- scikit-learn=0.21.3
- pytest
- papermill>=1.0.1
- matplotlib=3.1.2
- r-base
- r-bayesm
- pip:
- black
- flake8
- jupytext==1.3.0
- lightgbm==2.3.1
# - fire==0.2.1 # for CLI capabilities