diff --git a/.gitignore b/.gitignore index d6dc00f7..708a92cb 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,27 @@ **/.ipynb_checkpoints *.egg-info/ .vscode/ +*.pkl +*.h5 +# Data ojdata/* *.Rdata + +# AML Config +aml_config/ +.azureml/ +.config/ + +# Pytests +.pytest_cache/ + +# File for model deployment +score.py + +# Environments +myenv.yml + +# Logs +logs/ +*.log \ No newline at end of file diff --git a/.lintr b/.lintr new file mode 100644 index 00000000..269fffb5 --- /dev/null +++ b/.lintr @@ -0,0 +1,18 @@ +linters: with_defaults( + infix_spaces_linter = NULL, + spaces_left_parentheses_linter = NULL, + open_curly_linter = NULL, + line_length_linter = NULL, + camel_case_linter = NULL, + object_name_linter = NULL, + object_usage_linter = NULL, + object_length_linter = NULL, + trailing_blank_lines_linter = NULL, + absolute_paths_linter = NULL, + commented_code_linter = NULL, + implicit_integer_linter = NULL, + extraction_operator_linter = NULL, + single_quotes_linter = NULL, + pipe_continuation_linter = NULL, + cyclocomp_linter = NULL + ) diff --git a/R/orange_juice/01_dataprep.Rmd b/R/orange_juice/01_dataprep.Rmd deleted file mode 100644 index cd1194a9..00000000 --- a/R/orange_juice/01_dataprep.Rmd +++ /dev/null @@ -1,83 +0,0 @@ ---- -title: Data preparation -output: html_notebook ---- - -```{r, echo=FALSE, results="hide", message=FALSE} -library(tidyr) -library(dplyr) -library(tsibble) -library(feasts) -library(fable) -``` - -In this notebook, we generate the datasets that will be used for model training and validating. The experiment parameters are obtained from the file `ojdata_forecast_settings.json`; you can modify that file to vary the experimental setup, or just edit the values in this notebook. - -The orange juice dataset comes from the bayesm package, and gives pricing and sales figures over time for a variety of orange juice brands in several stores in Florida. 
- -A complicating factor is that the data is in a hybrid of long and wide format: while the sales figures are long (one column of sales data for every store and brand), the prices are wide (one price column for each brand). Therefore we need to reshape the data if we want to use prices for modelling. As part of this, we also compute a new column `maxpricediff`: this represents the log-ratio of the price of this brand compared to the best competing price. A positive `maxpricediff` means this brand is cheaper than all the other brands, and a negative `maxpricediff` means it is more expensive. - -```{r} -settings <- jsonlite::fromJSON("ojdata_forecast_settings.json") - -train_periods <- seq(settings$TRAIN_WINDOW, 160 - settings$STEP - 1, settings$STEP) -start_date <- as.Date(settings$START_DATE) - -data(orangeJuice, package="bayesm") - -oj_data <- orangeJuice$yx %>% - complete(store, brand, week) %>% - group_by(store, brand) %>% - group_modify(~ { - pricevars <- grep("price", names(.x), value=TRUE) - thispricevar <- paste0("price", .y$brand) - best_other_price <- do.call(pmin, .x[setdiff(pricevars, thispricevar)]) - .x$price <- .x[[thispricevar]] - .x$maxpricediff <- log(best_other_price/.x$price) - select(.x, week, logmove, deal, feat, price, maxpricediff) - }) %>% - ungroup() %>% - mutate(week=yearweek(start_date + week*7)) %>% # do this separately because of tsibble/vctrs issues - as_tsibble(index=week, key=c(store, brand)) -``` - -Here are some glimpses of what the data looks like. The dependent variable is `logmove`, the logarithm of the total sales for a given brand and store, in a particular week. Note that we do _not_ fill in the missing values in the data, as (with the exception of `ETS`) the modelling functions in the fable package can handle this innately. - -```{r} -head(oj_data) -``` - -The time series plots for a small subset of brands and stores are shown below. It is clear that the statistical behaviour of the data varies by store and brand. 
- -```{r} -library(ggplot2) - -oj_data %>% - filter(store < 10, brand < 5) %>% - ggplot(aes(x=week, y=logmove)) + - geom_line() + - scale_x_date(labels=NULL) + - facet_grid(vars(store), vars(brand), labeller="label_both") -``` - -Finally, we split the dataset into separate samples for training and testing. The schema used is broadly time series cross-validation, whereby we train a model on data up to time $t$, test it on data for times $t+1$ to $t+k$, then train on data up to time $t+k$, test it on data for times $t+k+1$ to $t+2k$, and so on. - -In this specific case study we introduce a small extra piece of complexity. We train a model on data up to month $t$, then test it on months $t+2$ to $t+3$. Then we train on data up to month $t+2$, and test it on months $t+4$ to $t+5$, and so on. Thus there is always a gap of one month between the training and test samples, a complicating factor introduced after discussions with domain experts. - -```{r} -subset_oj_data <- function(start, end) -{ - start <- yearweek(start_date + start*7) - end <- yearweek(start_date + end*7) - filter(oj_data, week >= start, week <= end) -} - -oj_train <- lapply(train_periods, function(i) subset_oj_data(40, i)) -oj_test <- lapply(train_periods, function(i) subset_oj_data(i + 2, i + settings$STEP + 1)) - -save(oj_train, oj_test, file="oj_data.Rdata") - -head(oj_train[[1]]) - -head(oj_test[[1]]) -``` diff --git a/R/orange_juice/02_simplemodels.Rmd b/R/orange_juice/02_simplemodels.Rmd deleted file mode 100644 index 65ff50d2..00000000 --- a/R/orange_juice/02_simplemodels.Rmd +++ /dev/null @@ -1,74 +0,0 @@ ---- -title: Simple models -output: html_notebook -encoding: utf8 ---- - -```{r, echo=FALSE, results="hide", message=FALSE} -library(tidyr) -library(dplyr) -library(tsibble) -library(feasts) -library(fable) -``` - -We fit some simple models to the orange juice data. One model is fit for each combination of store and brand. - -- `mean`: This is just a simple mean. 
-- `naive`: A random walk model without any other components. This amounts to setting all forecast values to the last observed value. -- `drift`: This adjusts the `naive` model to incorporate a trend. -- `arima`: An ARIMA model with the parameter values estimated from the data. -- `ets`: An exponentially weighted model, again with parameter values estimated from the data. - -Note that the model training process is embarrassingly parallel on 3 levels: - -- We have multiple independent training datasets; -- For which we fit multiple independent models; -- Within which we have independent sub-models for each store and brand. - -This lets us speed up the training significantly. While the `fable::model` function can fit multiple models in parallel, we will run it sequentially here and instead parallelise by dataset. This avoids contention for cores, and also results in the simplest code. - -```{r, results="hide"} -load("oj_data.Rdata") - -ncores <- max(2, parallel::detectCores(logical=FALSE) - 2) -cl <- parallel::makeCluster(ncores) -parallel::clusterEvalQ(cl, -{ - library(tidyr) - library(feasts) - library(fable) - library(tsibble) -}) -``` - -First, we fit the models that can innately handle missing values. - -```{r} -oj_modelset <- parallel::parLapply(cl, oj_train, function(df) -{ - model(df, - mean=MEAN(logmove), - naive=NAIVE(logmove), - drift=RW(logmove ~ drift()), - arima=ARIMA(logmove ~ pdq() + PDQ(0, 0, 0)) - ) -}) -``` -Next, we fit models that require manual imputation (ETS). 
- -```{r} -oj_modelset_ets <- parallel::parLapply(cl, oj_train, function(df) -{ - df %>% - fill(everything()) %>% - model(ets=ETS(logmove ~ error("A") + trend("A") + season("N"))) -}) - -parallel::stopCluster(cl) -save(oj_modelset, oj_modelset_ets, file="oj_modelset.Rdata") - -head(oj_modelset[[1]]) -head(oj_modelset_ets[[1]]) -``` - diff --git a/R/orange_juice/02a_simplereg_models.Rmd b/R/orange_juice/02a_simplereg_models.Rmd deleted file mode 100644 index 038bed53..00000000 --- a/R/orange_juice/02a_simplereg_models.Rmd +++ /dev/null @@ -1,39 +0,0 @@ ---- -title: Regression models -output: html_notebook ---- - -```{r, echo=FALSE, results="hide", message=FALSE} -library(tidyr) -library(dplyr) -library(tsibble) -library(feasts) -library(fable) -``` - -This notebook builds on the output from "Simple models" by including regressor variables in the ARIMA model(s). - -```{r, results="hide"} -load("oj_data.Rdata") - -ncores <- max(2, parallel::detectCores(logical=FALSE) - 2) -cl <- parallel::makeCluster(ncores) -parallel::clusterEvalQ(cl, -{ - library(feasts) - library(fable) - library(tsibble) -}) - -oj_modelset_reg <- parallel::parLapply(cl, oj_train, function(df) -{ - model(df, - ar_reg=ARIMA(logmove ~ pdq() + PDQ(0, 0, 0) + deal + feat + price + maxpricediff), - ar_trend=ARIMA(logmove ~ pdq() + PDQ(0, 0, 0) + trend()), - ar_regtrend=ARIMA(logmove ~ pdq() + PDQ(0, 0, 0) + trend() + deal + feat + price + maxpricediff) - ) -}) - -parallel::stopCluster(cl) -save(oj_modelset_reg, file="oj_modelset_reg.Rdata") -``` diff --git a/R/orange_juice/03_model_eval.Rmd b/R/orange_juice/03_model_eval.Rmd deleted file mode 100644 index fdce453e..00000000 --- a/R/orange_juice/03_model_eval.Rmd +++ /dev/null @@ -1,58 +0,0 @@ ---- -title: Model evaluation -output: html_notebook -encoding: utf8 ---- - -```{r, echo=FALSE, results="hide", message=FALSE} -library(tidyr) -library(dplyr) -library(tsibble) -library(feasts) -library(fable) -``` - -Having fit the models, let's examine their 
rolling goodness of fit, using the MAPE (mean absolute percentage error) metric. - -First, we compute the forecasts for each dataset and model, again in parallel. - -```{r, results="hide"} -for(f in dir(pattern="Rdata$")) - load(f) - -ncores <- max(2, parallel::detectCores(logical=FALSE) - 2) -cl <- parallel::makeCluster(ncores) -parallel::clusterEvalQ(cl, -{ - library(feasts) - library(fable) - library(tsibble) -}) - -fcast_sets <- lapply(ls(pattern="^oj_modelset"), function(mod) - parallel::clusterMap(cl, function(mod, df) forecast(mod, df), get(mod), oj_test) -) - -parallel::stopCluster(cl) -``` - -Next, we compute the MAPE for each model. It is apparent that adding independent variables as regressors improves the quality of the fit substantially. Adding a simple trend does _not_ improve the fit, indicating that the level of sales does not appear to change over time (at least over the period included in the data). - -```{r} -orig <- do.call(rbind, oj_test) %>% - as_tibble() %>% - select(store, brand, week, logmove) %>% - mutate(move=exp(logmove)) - -gof <- function(fcast_data) -{ - fcast_data <- do.call(rbind, fcast_data) %>% - as_tibble() %>% - select(store, brand, week, .model, logmove) %>% - pivot_wider(id_cols=c(store, brand, week), names_from=.model, values_from=logmove) %>% - select(-store, -brand, -week) %>% - summarise_all(function(x) MAPE(exp(x) - orig$move, orig$move)) -} - -lapply(fcast_sets, gof) %>% bind_cols() -``` diff --git a/R/orange_juice/README.md b/R/orange_juice/README.md deleted file mode 100644 index dead2f56..00000000 --- a/R/orange_juice/README.md +++ /dev/null @@ -1,32 +0,0 @@ -## Orange juice dataset - -### Package installation - -You'll need the following packages to run the notebooks in this directory: - -- bayesm (the source of the data) -- ggplot2 -- dplyr -- tidyr -- jsonlite -- tsibble -- urca -- fable -- fabletools -- feasts - -The easiest way to install them is to run - -```r -install.packages("bayesm") 
-install.packages("tidyverse") # installs all tidyverse packages -install.packages(c("fable", "feasts", "urca")) -``` - -The Rmarkdown notebooks in this directory are as follows. You should run them in sequence, as each will create output objects (datasets/models) that are used in later notebooks. - -- [`01_dataprep.Rmd`](01_dataprep.Rmd) creates the training and test datasets -- [`02_simplemodels.Rmd`](02_simplemodels.Rmd) fits a range of simple time series models to the data, including ARIMA and ETS models. -- [`02a_simplereg_models.Rmd`](02a_simplereg_models.Rmd) adds independent variables as regressors to the ARIMA model. -- [`03_model_eval.Rmd`](03_model_eval.Rmd) evaluates the goodness of fit of the models on the test data. - diff --git a/R/orange_juice/ojdata_forecast_settings.json b/R/orange_juice/ojdata_forecast_settings.json deleted file mode 100644 index 5bc46a25..00000000 --- a/R/orange_juice/ojdata_forecast_settings.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "STEP": 2, - "TRAIN_WINDOW": 135, - "START_DATE": "1989-09-14" -} diff --git a/README.md b/README.md index e854f207..1c14a9cd 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,87 @@ # Forecasting Best Practices -This repository contains examples and best practices for building Forecasting solutions and systems, provided as [Jupyter notebooks](examples) and [a library of utility functions](fclib). The focus of the repository is on state-of-the-art methods and common scenarios that are popular among researchers and practitioners working on forecasting problems. +Time series forecasting is one of the most important topics in data science. Almost every business needs to predict the future in order to make better decisions and allocate resources more effectively. -## Getting Started +This repository provides examples and best practice guidelines for building forecasting solutions. 
The goal of this repository is to build a comprehensive set of tools and examples that leverage recent advances in forecasting algorithms to build solutions and operationalize them. Rather than creating implementations from scratch, we draw from existing state-of-the-art libraries and build additional utilities around processing and featurizing the data, optimizing and evaluating models, and scaling up to the cloud. -To get started, navigate to the [Setup Guide](./docs/SETUP.md), which lists instructions on how to set up your environment and dependencies, download the data and run examples provided in the repository. +The examples and best practices are provided as [Python Jupyter notebooks and R markdown files](examples) and [a library of utility functions](fclib). We hope that these examples and utilities can significantly reduce the “time to market” by simplifying the experience from defining the business problem to the development of solutions by orders of magnitude. In addition, the example notebooks would serve as guidelines and showcase best practices and usage of the tools in a wide variety of languages. + + +## Content + +The following is a summary of the examples related to the process of building forecasting solutions covered in this repository. The [examples](examples) are organized according to use cases. Currently, we focus on a retail sales forecasting use case as it is widely used in [assortment planning](https://repository.upenn.edu/cgi/viewcontent.cgi?article=1569&context=edissertations), [inventory optimization](https://en.wikipedia.org/wiki/Inventory_optimization), and [price optimization](https://en.wikipedia.org/wiki/Price_optimization). 
+ +| Example | Models/Methods | Description | Language | +|----------------------------------|-------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------|-----------| +| Quick Start | Auto ARIMA, Azure AutoML, Linear Regression, LightGBM | Quick start notebooks that demonstrate workflow of developing a forecast model using one-round training and testing data | Python | +| Data Exploration and Preparation | Statistical Analysis and Data Transformation | Data exploration and preparation examples | Python, R | +| Model Training and Evaluation | Auto ARIMA, LightGBM, Dilated CNN | Deep dive notebooks that perform multi-round training and testing of various classical and deep learning forecast algorithms | Python | +| Model Tuning and Deployment | HyperDrive, LightGBM | Example notebook for model tuning using Azure Machine Learning Service and deploying the best model on Azure | Python | +| R Models | Mean Forecast, ARIMA, ETS, Prophet | Popular statistical forecast models and Prophet model implemented in R | R | + + +## Getting Started in Python + +To quickly get started with the repository on your local machine, use the following commands. + +1. Install Anaconda with Python >= 3.6. [Miniconda](https://conda.io/miniconda.html) is a quick way to get started. + +2. Clone the repository + ``` + git clone https://github.com/microsoft/forecasting + cd forecasting/ + ``` + +3. Run setup scripts to create conda environment. Please execute one of the following commands from the root of Forecasting repo based on your operating system. + + - Linux + ``` + ./tools/environment_setup.sh + ``` + + - Windows + ``` + tools\environment_setup.bat + ``` + + Note that for Windows you need to run the batch script from Anaconda Prompt. The script creates a conda environment `forecasting_env` and installs the forecasting utility library `fclib`. + +4.
Start the Jupyter notebook server + ``` + jupyter notebook + ``` + +5. Run the [LightGBM single-round](examples/oj_retail/python/00_quick_start/lightgbm_single_round.ipynb) notebook under the `00_quick_start` folder. Make sure that the selected Jupyter kernel is `forecasting_env`. + +If you have any issues with the above setup, or want to find more detailed instructions on how to set up your environment and run examples provided in the repository, on local or a remote machine, please navigate to the [Setup Guide](./docs/SETUP.md). + +## Getting Started in R + +We assume you already have R installed on your machine. If not, simply follow the [instructions on CRAN](https://cloud.r-project.org/) to download and install R. + +The recommended editor is [RStudio](https://rstudio.com), which supports interactive editing and previewing of R notebooks. However, you can use any editor or IDE that supports RMarkdown. In particular, [Visual Studio Code](https://code.visualstudio.com) with the [R extension](https://marketplace.visualstudio.com/items?itemName=Ikuyadeu.r) can be used to edit and render the notebook files. The rendered `.nb.html` files can be viewed in any modern web browser. + +The examples use the [Tidyverts](https://tidyverts.org) family of packages, which is a modern framework for time series analysis that builds on the widely-used [Tidyverse](https://tidyverse.org) family. The Tidyverts framework is still under active development, so it's recommended that you update your packages regularly to get the latest bug fixes and features. + +## Target Audience +Our target audience for this repository includes data scientists and machine learning engineers with varying levels of knowledge in forecasting as our content is source-only and targets custom machine learning modelling. The utilities and examples provided are intended to be solution accelerators for real-world forecasting problems. 
## Contributing We hope that the open source community would contribute to the content and bring in the latest SOTA algorithm. This project welcomes contributions and suggestions. Before contributing, please see our [Contributing Guide](./docs/CONTRIBUTING.md). +## Reference + +The following is a list of related repositories that you may find helpful. + +| | | +|------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------| +| [Deep Learning for Time Series Forecasting](https://github.com/Azure/DeepLearningForTimeSeriesForecasting) | A collection of examples for using deep neural networks for time series forecasting with Keras. | +| [Demand Forecasting and Price Optimization Solution](https://github.com/Azure/cortana-intelligence-price-optimization) | A Cortana Intelligence solution how-to guide for demand forecasting and price optimization. 
| + + + ## Build Status -| Build | Branch | Status | -| --- | --- | --- | -| **Linux CPU** | master | [![Build Status](https://dev.azure.com/best-practices/forecasting/_apis/build/status/cpu_unit_tests_linux?branchName=master)](https://dev.azure.com/best-practices/forecasting/_build/latest?definitionId=128&branchName=master) | -| **Linux CPU** | staging | [![Build Status](https://dev.azure.com/best-practices/forecasting/_apis/build/status/cpu_unit_tests_linux?branchName=staging)](https://dev.azure.com/best-practices/forecasting/_build/latest?definitionId=128&branchName=staging) | \ No newline at end of file +| Build | Branch | Status | +|---------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| **Linux CPU** | master | [![Build Status](https://dev.azure.com/best-practices/forecasting/_apis/build/status/cpu_unit_tests_linux?branchName=master)](https://dev.azure.com/best-practices/forecasting/_build/latest?definitionId=128&branchName=master) | +| **Linux CPU** | staging | [![Build Status](https://dev.azure.com/best-practices/forecasting/_apis/build/status/cpu_unit_tests_linux?branchName=staging)](https://dev.azure.com/best-practices/forecasting/_build/latest?definitionId=128&branchName=staging) | diff --git a/R_utils/cluster.R b/R_utils/cluster.R new file mode 100644 index 00000000..949101d0 --- /dev/null +++ b/R_utils/cluster.R @@ -0,0 +1,36 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +#' Creates a local background cluster for parallel computations +#' +#' @param ncores The number of nodes (cores) for the cluster. The default is 2 less than the number of physical cores. +#' @param libs The packages to load on each node, as a character vector. +#' @param useXDR For most platforms, this can be left at its default `FALSE` value. 
+#' @return +#' A cluster object. If any package in `libs` fails to load on a node, the cluster is shut down and an error is raised. +make_cluster <- function(ncores=NULL, libs=character(0), useXDR=FALSE) +{ + if(is.null(ncores)) + ncores <- max(2, parallel::detectCores(logical=FALSE) - 2, na.rm=TRUE) + cl <- parallel::makeCluster(ncores, type="PSOCK", useXDR=useXDR) + res <- try(parallel::clusterCall( + cl, + function(libs) + { + for(lib in libs) library(lib, character.only=TRUE) + }, + libs + ), silent=TRUE) + if(!inherits(res, "try-error")) return(cl) + parallel::stopCluster(cl) + stop("Unable to load packages on cluster nodes: ", conditionMessage(attr(res, "condition")), call.=FALSE) +} + + +#' Deletes a local background cluster +#' +#' @param cl The cluster object, as returned from `make_cluster`. +destroy_cluster <- function(cl) +{ + try(parallel::stopCluster(cl), silent=TRUE) +} diff --git a/R_utils/model_eval.R b/R_utils/model_eval.R new file mode 100644 index 00000000..a321ad58 --- /dev/null +++ b/R_utils/model_eval.R @@ -0,0 +1,50 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +#' Computes forecast values on a dataset +#' +#' @param mable A mable (model table) as returned by `fabletools::model`. +#' @param newdata The dataset for which to compute forecasts. +#' @param ... Further arguments to `fabletools::forecast`. +#' @return +#' A tsibble, with one column per model type in `mable`, and one column named `.response` containing the response variable from `newdata`. +get_forecasts <- function(mable, newdata, ...) +{ + fcast <- forecast(mable, new_data=newdata, ...) + keyvars <- key_vars(fcast) + keyvars <- keyvars[-length(keyvars)] + indexvar <- index_var(fcast) + fcastvar <- as.character(attr(fcast, "response")[[1]]) + fcast <- fcast %>% + as_tibble() %>% + pivot_wider( + id_cols=all_of(c(keyvars, indexvar)), + names_from=.model, + values_from=all_of(fcastvar)) + select(newdata, !!keyvars, !!indexvar, !!fcastvar) %>% + rename(.response=!!fcastvar) %>% + inner_join(fcast) +} + + +#' Evaluate quality of forecasts given a criterion +#' +#' @param fcast_df A tsibble as returned from `get_forecasts`. +#' @param gof A goodness-of-fit function.
The default is to use `fabletools::MAPE`, which computes the mean absolute percentage error. +#' @return +#' A single-row data frame with the computed goodness-of-fit statistic for each model. +eval_forecasts <- function(fcast_df, gof=fabletools::MAPE) +{ + if(!is.function(gof)) + gof <- get(gof, mode="function") + resp <- fcast_df$.response + keyvars <- key_vars(fcast_df) + indexvar <- index_var(fcast_df) + fcast_df %>% + as_tibble() %>% + select(-all_of(c(keyvars, indexvar, ".response"))) %>% + summarise_all( + function(x, .actual) gof(x - .actual, .actual=.actual), + .actual=resp + ) +} diff --git a/R_utils/save_objects.R b/R_utils/save_objects.R new file mode 100644 index 00000000..b9492491 --- /dev/null +++ b/R_utils/save_objects.R @@ -0,0 +1,25 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +#' Loads serialised objects relating to a given forecasting example into the current workspace +#' +#' @param example The particular forecasting example. +#' @param file The name of the file (with extension). +#' @return +#' This function is run for its side effect, namely loading the given file into the global environment. +load_objects <- function(example, file) +{ + examp_dir <- here::here("examples", example, "R") + load(file.path(examp_dir, file), envir=globalenv()) +} + +#' Saves R objects for a forecasting example to a file +#' +#' @param ... Objects to save, as unquoted names. +#' @param example The particular forecasting example. +#' @param file The name of the file (with extension). 
+save_objects <- function(..., example, file) +{ + examp_dir <- here::here("examples", example, "R") + save(..., file=file.path(examp_dir, file)) +} diff --git a/fclib/fclib/tuning/back_test_utils.py b/contrib/tsperf/energy_utils/back_test_utils.py similarity index 100% rename from fclib/fclib/tuning/back_test_utils.py rename to contrib/tsperf/energy_utils/back_test_utils.py diff --git a/fclib/fclib/tuning/__init__.py b/contrib/tsperf/energy_utils/feature_engineering/__init__.py similarity index 100% rename from fclib/fclib/tuning/__init__.py rename to contrib/tsperf/energy_utils/feature_engineering/__init__.py diff --git a/fclib/fclib/feature_engineering/base_ts_estimators.py b/contrib/tsperf/energy_utils/feature_engineering/base_ts_estimators.py similarity index 100% rename from fclib/fclib/feature_engineering/base_ts_estimators.py rename to contrib/tsperf/energy_utils/feature_engineering/base_ts_estimators.py diff --git a/fclib/fclib/feature_engineering/feature_engineering.py b/contrib/tsperf/energy_utils/feature_engineering/feature_engineering.py similarity index 100% rename from fclib/fclib/feature_engineering/feature_engineering.py rename to contrib/tsperf/energy_utils/feature_engineering/feature_engineering.py diff --git a/contrib/tsperf/energy_utils/feature_engineering/feature_utils.py b/contrib/tsperf/energy_utils/feature_engineering/feature_utils.py new file mode 100644 index 00000000..55ffcc7e --- /dev/null +++ b/contrib/tsperf/energy_utils/feature_engineering/feature_utils.py @@ -0,0 +1,1004 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +""" +This file contains utility functions for creating features for time +series forecasting applications. All functions defined assume that +there is no missing data. 
+""" + +import calendar +import itertools +import pandas as pd +import numpy as np +from datetime import timedelta +from sklearn.preprocessing import MinMaxScaler + +from fclib.feature_engineering.utils import is_datetime_like + +# 0: Monday, 2: T/W/TR, 4: F, 5:SA, 6: S +WEEK_DAY_TYPE_MAP = {1: 2, 3: 2} # Map for converting Tuesday and +# Thursday to have the same code as Wednesday +HOLIDAY_CODE = 7 +SEMI_HOLIDAY_CODE = 8 # days before and after a holiday + +DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S" + + +def day_type(datetime_col, holiday_col=None, semi_holiday_offset=timedelta(days=1)): + """ + Convert datetime_col to 7 day types + 0: Monday + 2: Tuesday, Wednesday, and Thursday + 4: Friday + 5: Saturday + 6: Sunday + 7: Holiday + 8: Days before and after a holiday + + Args: + datetime_col: Datetime column. + holiday_col: Holiday code column. Default value None. + semi_holiday_offset: Time difference between the date before (or after) + the holiday and the holiday. Default value timedelta(days=1). + + Returns: + A numpy array containing converted datetime_col into day types.
+ """ + + datetype = pd.DataFrame({"DayType": datetime_col.dt.dayofweek}) + datetype.replace({"DayType": WEEK_DAY_TYPE_MAP}, inplace=True) + + if holiday_col is not None: + holiday_mask = holiday_col > 0 + datetype.loc[holiday_mask, "DayType"] = HOLIDAY_CODE + + # Create a temporary Date column to calculate dates near the holidays + datetype["Date"] = pd.to_datetime(datetime_col.dt.date, format=DATETIME_FORMAT) + holiday_dates = set(datetype.loc[holiday_mask, "Date"]) + + semi_holiday_dates = [ + pd.date_range(start=d - semi_holiday_offset, end=d + semi_holiday_offset, freq="D") for d in holiday_dates + ] + + # Flatten the list of lists + semi_holiday_dates = [d for dates in semi_holiday_dates for d in dates] + + semi_holiday_dates = set(semi_holiday_dates) + semi_holiday_dates = semi_holiday_dates.difference(holiday_dates) + + datetype.loc[datetype["Date"].isin(semi_holiday_dates), "DayType"] = SEMI_HOLIDAY_CODE + + return datetype["DayType"].values + + +def hour_of_day(datetime_col): + """Returns the hour from a datetime column.""" + return datetime_col.dt.hour + + +def time_of_year(datetime_col): + """ + Time of year is a cyclic variable that indicates the annual position and + repeats each year. Within each year it increases linearly over time, going + from 0 on January 1 at 00:00 to 1 on December 31st at 23:00. The values + are normalized to be between [0; 1]. + + Args: + datetime_col: Datetime column. + + Returns: + A numpy array containing converted datetime_col into time of year.
+ """ + + time_of_year = pd.DataFrame( + {"DayOfYear": datetime_col.dt.dayofyear, "HourOfDay": datetime_col.dt.hour, "Year": datetime_col.dt.year} + ) + time_of_year["TimeOfYear"] = (time_of_year["DayOfYear"] - 1) * 24 + time_of_year["HourOfDay"] + + time_of_year["YearLength"] = time_of_year["Year"].apply(lambda y: 366 if calendar.isleap(y) else 365) + + time_of_year["TimeOfYear"] = time_of_year["TimeOfYear"] / (time_of_year["YearLength"] * 24 - 1) + + return time_of_year["TimeOfYear"].values + + +def week_of_year(datetime_col): + """Returns the week from a datetime column.""" + return datetime_col.dt.week + + +def week_of_month(date_time): + """Returns the week of the month for a specified date. + + Args: + date_time (Datetime or pd.Series): Input date, or a Series of dates + + Returns: + wom (Integer): Week of the month of the input date + """ + + def _week_of_month(date_time): + from math import ceil + + first_day = date_time.replace(day=1) + dom = date_time.day + adjusted_dom = dom + first_day.weekday() + wom = int(ceil(adjusted_dom / 7.0)) + return wom + + if isinstance(date_time, pd.Series): + return date_time.apply(lambda x: _week_of_month(x)) + else: + return _week_of_month(date_time) + + +def month_of_year(date_time_col): + """Returns the month from a datetime column.""" + return date_time_col.dt.month + + +def day_of_week(date_time_col): + """Returns the day of week from a datetime column.""" + return date_time_col.dt.dayofweek + + +def day_of_month(date_time_col): + """Returns the day of month from a datetime column.""" + return date_time_col.dt.day + + +def day_of_year(date_time_col): + """Returns the day of year from a datetime column.""" + return date_time_col.dt.dayofyear + + +def encoded_month_of_year(month_of_year): + """ + Create one hot encoding of month of year. + """ + month_of_year = pd.get_dummies(month_of_year, prefix="MonthOfYear") + + return month_of_year + + +def encoded_day_of_week(day_of_week): + """ + Create one hot encoding of day_of_week.
+ """ + day_of_week = pd.get_dummies(day_of_week, prefix="DayOfWeek") + + return day_of_week + + +def encoded_day_of_month(day_of_month): + """ + Create one hot encoding of day_of_month. + """ + day_of_month = pd.get_dummies(day_of_month, prefix="DayOfMonth") + + return day_of_month + + +def encoded_day_of_year(day_of_year): + """ + Create one hot encoding of day_of_year. + """ + day_of_year = pd.get_dummies(day_of_year) + + return day_of_year + + +def encoded_hour_of_day(hour_of_day): + """ + Create one hot encoding of hour_of_day. + """ + hour_of_day = pd.get_dummies(hour_of_day, prefix="HourOfDay") + + return hour_of_day + + +def encoded_week_of_year(week_of_year): + """ + Create one hot encoding of week_of_year. + """ + week_of_year = pd.get_dummies(week_of_year, prefix="WeekOfYear") + + return week_of_year + + +def normalized_current_year(datetime_col, min_year, max_year): + """ + Temporal feature indicating the position of the year of a record in the + entire time period under consideration, normalized to be between 0 and 1. + + Args: + datetime_col: Datetime column. + min_year: minimum value of year. + max_year: maximum value of year. + + Returns: + float: the position of the current year in the min_year:max_year range + """ + year = datetime_col.dt.year + + if max_year != min_year: + current_year = (year - min_year) / (max_year - min_year) + elif max_year == min_year: + current_year = 0 + + return current_year + + +def normalized_current_date(datetime_col, min_date, max_date): + """ + Temporal feature indicating the position of the date of a record in the + entire time period under consideration, normalized to be between 0 and 1. + + Args: + datetime_col: Datetime column. + min_date: minimum value of date. + max_date: maximum value of date. 
+ + Returns: + float: the position of the current date in the min_date:max_date range + """ + date = datetime_col.dt.date + current_date = (date - min_date).apply(lambda x: x.days) + + if max_date != min_date: + current_date = current_date / (max_date - min_date).days + elif max_date == min_date: + current_date = 0 + + return current_date + + +def normalized_current_datehour(datetime_col, min_datehour, max_datehour): + """ + Temporal feature indicating the position of the hour of a record in the + entire time period under consideration, normalized to be between 0 and 1. + + Args: + datetime_col: Datetime column. + min_datehour: minimum value of datehour. + max_datehour: maximum value of datehour. + + Returns: + float: the position of the current datehour in the min_datehour:max_datehour range + """ + current_datehour = (datetime_col - min_datehour).apply(lambda x: x.days * 24 + x.seconds / 3600) + + max_min_diff = max_datehour - min_datehour + + if max_min_diff != 0: + current_datehour = current_datehour / (max_min_diff.days * 24 + max_min_diff.seconds / 3600) + elif max_min_diff == 0: + current_datehour = 0 + + return current_datehour + + +def normalized_columns(datetime_col, value_col, mode="log", output_colname="normalized_columns"): + """ + Creates columns normalized to be log of input columns devided by global average of each columns, + or normalized using maximum and minimum. + + Args: + datetime_col: Datetime column. + value_col: Value column to be normalized. + mode: Normalization mode, + accepted values are 'log' and 'minmax'. Default value 'log'. + + Returns: + Normalized value column. 
+ """ + + if not is_datetime_like(datetime_col): + datetime_col = pd.to_datetime(datetime_col, format=DATETIME_FORMAT) + + df = pd.DataFrame({"Datetime": datetime_col, "value": value_col}) + df.set_index("Datetime", inplace=True) + + if not df.index.is_monotonic: + df.sort_index(inplace=True) + + if mode == "log": + mean_value = df["value"].mean() + if mean_value != 0: + df[output_colname] = np.log(df["value"] / mean_value) + elif mean_value == 0: + df[output_colname] = 0 + elif mode == "minmax": + min_value = min(df["value"]) + max_value = max(df["value"]) + if min_value != max_value: + df[output_colname] = (df["value"] - min_value) / (max_value - min_value) + elif min_value == max_value: + df[output_colname] = 0 + else: + raise ValueError("Valid values for mode are 'log' and 'minmax'") + + return df[[output_colname]] + + +def fourier_approximation(t, n, period): + """ + Generic helper function to create Fourier Series at different harmonies (n) and periods. + + Args: + t: Datetime column. + n: Harmonies, n=0, 1, 2, 3,... + period: Period of the datetime variable t. + + Returns: + float: Sine component + float: Cosine component + """ + x = n * 2 * np.pi * t / period + x_sin = np.sin(x) + x_cos = np.cos(x) + + return x_sin, x_cos + + +def annual_fourier(datetime_col, n_harmonics): + """ + Creates Annual Fourier Series at different harmonies (n). + + Args: + datetime_col: Datetime column. + n_harmonics: Harmonies, n=0, 1, 2, 3,... + + Returns: + dict: Output dictionary containing sine and cosine components of + the Fourier series for all harmonies. + """ + day_of_year = datetime_col.dt.dayofyear + + output_dict = {} + for n in range(1, n_harmonics + 1): + sin, cos = fourier_approximation(day_of_year, n, 365.24) + + output_dict["annual_sin_" + str(n)] = sin + output_dict["annual_cos_" + str(n)] = cos + + return output_dict + + +def weekly_fourier(datetime_col, n_harmonics): + """ + Creates Weekly Fourier Series at different harmonies (n). 
+ + Args: + datetime_col: Datetime column. + n_harmonics: Harmonies, n=0, 1, 2, 3,... + + Returns: + dict: Output dictionary containing sine and cosine components of + the Fourier series for all harmonies. + """ + day_of_week = datetime_col.dt.dayofweek + 1 + + output_dict = {} + for n in range(1, n_harmonics + 1): + sin, cos = fourier_approximation(day_of_week, n, 7) + + output_dict["weekly_sin_" + str(n)] = sin + output_dict["weekly_cos_" + str(n)] = cos + + return output_dict + + +def daily_fourier(datetime_col, n_harmonics): + """ + Creates Daily Fourier Series at different harmonies (n). + + Args: + datetime_col: Datetime column. + n_harmonics: Harmonies, n=0, 1, 2, 3,... + + Returns: + dict: Output dictionary containing sine and cosine components of + the Fourier series for all harmonies. + """ + hour_of_day = datetime_col.dt.hour + 1 + + output_dict = {} + for n in range(1, n_harmonics + 1): + sin, cos = fourier_approximation(hour_of_day, n, 24) + + output_dict["daily_sin_" + str(n)] = sin + output_dict["daily_cos_" + str(n)] = cos + + return output_dict + + +def same_week_day_hour_lag( + datetime_col, value_col, n_years=3, week_window=1, agg_func="mean", q=None, output_colname="SameWeekHourLag" +): + """ + Creates a lag feature by calculating quantiles, mean and std of values of and + around the same week, same day of week, and same hour of day, of previous years. + + Args: + datetime_col: Datetime column. + value_col: Feature value column to create lag feature from. + n_years: Number of previous years data to use. Default value 3. + week_window: Number of weeks before and after the same week to use, + which should help reduce noise in the data. Default value 1. + agg_func: Aggregation function to apply on multiple previous values, + accepted values are 'mean', 'quantile', 'std'. Default value 'mean'. + q: If agg_func is 'quantile', taking value between 0 and 1. + output_colname: name of the output lag feature column. + Default value 'SameWeekHourLag'. 
+ + Returns: + pd.DataFrame: data frame containing the newly created lag + feature as a column. + """ + + if not is_datetime_like(datetime_col): + datetime_col = pd.to_datetime(datetime_col, format=DATETIME_FORMAT) + min_time_stamp = min(datetime_col) + max_time_stamp = max(datetime_col) + + df = pd.DataFrame({"Datetime": datetime_col, "value": value_col}) + df.set_index("Datetime", inplace=True) + + week_lag_base = 52 + week_lag_last_year = list(range(week_lag_base - week_window, week_lag_base + week_window + 1)) + week_lag_all = [] + for y in range(n_years): + week_lag_all += [x + y * 52 for x in week_lag_last_year] + + week_lag_cols = [] + for w in week_lag_all: + if (max_time_stamp - timedelta(weeks=w)) >= min_time_stamp: + col_name = "week_lag_" + str(w) + week_lag_cols.append(col_name) + + lag_datetime = df.index.get_level_values(0) - timedelta(weeks=w) + valid_lag_mask = lag_datetime >= min_time_stamp + + df[col_name] = np.nan + + df.loc[valid_lag_mask, col_name] = df.loc[lag_datetime[valid_lag_mask], "value"].values + + # Additional aggregation options will be added as needed + if agg_func == "mean" and q is None: + df[output_colname] = round(df[week_lag_cols].mean(axis=1)) + elif agg_func == "quantile" and q is not None: + df[output_colname] = round(df[week_lag_cols].quantile(q, axis=1)) + elif agg_func == "std" and q is None: + df[output_colname] = round(df[week_lag_cols].std(axis=1)) + + return df[[output_colname]] + + +def same_day_hour_lag( + datetime_col, value_col, n_years=3, day_window=1, agg_func="mean", q=None, output_colname="SameDayHourLag" +): + """ + Creates a lag feature by calculating quantiles, mean, and std of values of + and around the same day of year, and same hour of day, of previous years. + + Args: + datetime_col: Datetime column. + value_col: Feature value column to create lag feature from. + n_years: Number of previous years data to use. Default value 3. 
+ day_window: Number of days before and after the same day to use, + which should help reduce noise in the data. Default value 1. + agg_func: Aggregation function to apply on multiple previous values, + accepted values are 'mean', 'quantile', 'std'. Default value 'mean'. + q: If agg_func is 'quantile', taking value between 0 and 1. + output_colname: name of the output lag feature column. + Default value 'SameDayHourLag'. + + Returns: + pd.DataFrame: data frame containing the newly created lag + feature as a column. + """ + + if not is_datetime_like(datetime_col): + datetime_col = pd.to_datetime(datetime_col, format=DATETIME_FORMAT) + min_time_stamp = min(datetime_col) + max_time_stamp = max(datetime_col) + + df = pd.DataFrame({"Datetime": datetime_col, "value": value_col}) + df.set_index("Datetime", inplace=True) + + day_lag_base = 365 + day_lag_last_year = list(range(day_lag_base - day_window, day_lag_base + day_window + 1)) + day_lag_all = [] + for y in range(n_years): + day_lag_all += [x + y * 365 for x in day_lag_last_year] + + day_lag_cols = [] + for d in day_lag_all: + if (max_time_stamp - timedelta(days=d)) >= min_time_stamp: + col_name = "day_lag_" + str(d) + day_lag_cols.append(col_name) + + lag_datetime = df.index.get_level_values(0) - timedelta(days=d) + valid_lag_mask = lag_datetime >= min_time_stamp + + df[col_name] = np.nan + + df.loc[valid_lag_mask, col_name] = df.loc[lag_datetime[valid_lag_mask], "value"].values + + # Additional aggregation options will be added as needed + if agg_func == "mean" and q is None: + df[output_colname] = round(df[day_lag_cols].mean(axis=1)) + elif agg_func == "quantile" and q is not None: + df[output_colname] = round(df[day_lag_cols].quantile(q, axis=1)) + elif agg_func == "std" and q is None: + df[output_colname] = round(df[day_lag_cols].std(axis=1)) + + return df[[output_colname]] + + +def same_day_hour_moving_average( + datetime_col, + value_col, + window_size, + start_week, + average_count, + forecast_creation_time, 
+ output_col_prefix="moving_average_lag_", +): + """ + Creates moving average features by averaging values of the same day of + week and same hour of day of previous weeks. + + Args: + datetime_col: Datetime column + value_col: Feature value column to create moving average features from. + window_size: Number of weeks used to compute the average. + start_week: First week of the first moving average feature. + average_count: Number of moving average features to create. + forecast_creation_time: The time point when the feature is created. + This value is used to prevent using data that are not available + at forecast creation time to compute features. + output_col_prefix: Prefix of the output columns. The start week of each + moving average feature is added at the end. Default value 'moving_average_lag_'. + + Returns: + pd.DataFrame: data frame containing the newly created lag features as + columns. + + For example, start_week = 9, window_size=4, and average_count = 3 will + create three moving average features. + 1) moving_average_lag_9: average the same day and hour values of the 9th, + 10th, 11th, and 12th weeks before the current week. + 2) moving_average_lag_10: average the same day and hour values of the + 10th, 11th, 12th, and 13th weeks before the current week. + 3) moving_average_lag_11: average the same day and hour values of the + 11th, 12th, 13th, and 14th weeks before the current week. 
+ """ + + df = pd.DataFrame({"Datetime": datetime_col, "value": value_col}) + df.set_index("Datetime", inplace=True) + + df = df.asfreq("H") + + if not df.index.is_monotonic: + df.sort_index(inplace=True) + + df["fct_diff"] = df.index - forecast_creation_time + df["fct_diff"] = df["fct_diff"].apply(lambda x: x.days * 24 + x.seconds / 3600) + max_diff = max(df["fct_diff"]) + + for i in range(average_count): + output_col = output_col_prefix + str(start_week + i) + week_lag_start = start_week + i + hour_lags = [(week_lag_start + w) * 24 * 7 for w in range(window_size)] + hour_lags = [h for h in hour_lags if h > max_diff] + if len(hour_lags) > 0: + tmp_df = df[["value"]].copy() + tmp_col_all = [] + for h in hour_lags: + tmp_col = "tmp_lag_" + str(h) + tmp_col_all.append(tmp_col) + tmp_df[tmp_col] = tmp_df["value"].shift(h) + + df[output_col] = round(tmp_df[tmp_col_all].mean(axis=1)) + df.drop(["fct_diff", "value"], inplace=True, axis=1) + + return df + + +def same_day_hour_moving_quantile( + datetime_col, + value_col, + window_size, + start_week, + quantile_count, + q, + forecast_creation_time, + output_col_prefix="moving_quatile_lag_", +): + """ + Creates a series of quantiles features by calculating quantiles of values of + the same day of week and same hour of day of previous weeks. + + Args: + datetime_col: Datetime column + value_col: Feature value column to create quantile features from. + window_size: Number of weeks used to compute the quantile. + start_week: First week of the first moving quantile feature. + quantile_count: Number of quantile features to create. + q: quantile to compute from history values, should be between 0 and 1. + forecast_creation_time: The time point when the feature is created. + This value is used to prevent using data that are not available + at forecast creation time to compute features. + output_col_prefix: Prefix of the output columns. The start week of each + moving average feature is added at the end. 
Default value 'moving_quatile_lag_'. + + Returns: + pd.DataFrame: data frame containing the newly created lag features as + columns. + + For example, start_week = 9, window_size=4, and quantile_count = 3 will + create three quantiles features. + 1) moving_quantile_lag_9: calculate quantile of the same day and hour values of the 9th, + 10th, 11th, and 12th weeks before the current week. + 2) moving_quantile_lag_10: calculate quantile of average the same day and hour values of the + 10th, 11th, 12th, and 13th weeks before the current week. + 3) moving_quantile_lag_11: calculate quantile of average the same day and hour values of the + 11th, 12th, 13th, and 14th weeks before the current week. + """ + + df = pd.DataFrame({"Datetime": datetime_col, "value": value_col}) + df.set_index("Datetime", inplace=True) + + df = df.asfreq("H") + + if not df.index.is_monotonic: + df.sort_index(inplace=True) + + df["fct_diff"] = df.index - forecast_creation_time + df["fct_diff"] = df["fct_diff"].apply(lambda x: x.days * 24 + x.seconds / 3600) + max_diff = max(df["fct_diff"]) + + for i in range(quantile_count): + output_col = output_col_prefix + str(start_week + i) + week_lag_start = start_week + i + hour_lags = [(week_lag_start + w) * 24 * 7 for w in range(window_size)] + hour_lags = [h for h in hour_lags if h > max_diff] + if len(hour_lags) > 0: + tmp_df = df[["value"]].copy() + tmp_col_all = [] + for h in hour_lags: + tmp_col = "tmp_lag_" + str(h) + tmp_col_all.append(tmp_col) + tmp_df[tmp_col] = tmp_df["value"].shift(h) + + df[output_col] = round(tmp_df[tmp_col_all].quantile(q, axis=1)) + + df.drop(["fct_diff", "value"], inplace=True, axis=1) + + return df + + +def same_day_hour_moving_std( + datetime_col, + value_col, + window_size, + start_week, + std_count, + forecast_creation_time, + output_col_prefix="moving_std_lag_", +): + """ + Creates standard deviation features by calculating std of values of the + same day of week and same hour of day of previous weeks. 
+ + Args: + datetime_col: Datetime column + value_col: Feature value column to create moving std features from. + window_size: Number of weeks used to compute the std. + start_week: First week of the first moving std feature. + std_count: Number of moving std features to create. + forecast_creation_time: The time point when the feature is created. + This value is used to prevent using data that are not available at + forecast creation time to compute features. + output_col_prefix: Prefix of the output columns. The start week of each + moving average feature is added at the end. Default value 'moving_std_lag_'. + + Returns: + pd.DataFrame: data frame containing the newly created lag features as + columns. + + For example, start_week = 9, window_size=4, and std_count = 3 will + create three moving std features. + 1) moving_std_lag_9: calculate std of the same day and hour values of the 9th, + 10th, 11th, and 12th weeks before the current week. + 2) moving_std_lag_10: calculate std of the same day and hour values of the + 10th, 11th, 12th, and 13th weeks before the current week. + 3) moving_std_lag_11: calculate std of the same day and hour values of the + 11th, 12th, 13th, and 14th weeks before the current week. 
+ """ + + df = pd.DataFrame({"Datetime": datetime_col, "value": value_col}) + df.set_index("Datetime", inplace=True) + + df = df.asfreq("H") + + if not df.index.is_monotonic: + df.sort_index(inplace=True) + + df["fct_diff"] = df.index - forecast_creation_time + df["fct_diff"] = df["fct_diff"].apply(lambda x: x.days * 24 + x.seconds / 3600) + max_diff = max(df["fct_diff"]) + + for i in range(std_count): + output_col = output_col_prefix + str(start_week + i) + week_lag_start = start_week + i + hour_lags = [(week_lag_start + w) * 24 * 7 for w in range(window_size)] + hour_lags = [h for h in hour_lags if h > max_diff] + if len(hour_lags) > 0: + tmp_df = df[["value"]].copy() + tmp_col_all = [] + for h in hour_lags: + tmp_col = "tmp_lag_" + str(h) + tmp_col_all.append(tmp_col) + tmp_df[tmp_col] = tmp_df["value"].shift(h) + + df[output_col] = round(tmp_df[tmp_col_all].std(axis=1)) + + df.drop(["value", "fct_diff"], inplace=True, axis=1) + + return df + + +def same_day_hour_moving_agg( + datetime_col, + value_col, + window_size, + start_week, + count, + forecast_creation_time, + agg_func="mean", + q=None, + output_col_prefix="moving_agg_lag_", +): + """ + Creates a series of aggregation features by calculating mean, quantiles, + or std of values of the same day of week and same hour of day of previous weeks. + + Args: + datetime_col: Datetime column + value_col: Feature value column to create aggregation features from. + window_size: Number of weeks used to compute the aggregation. + start_week: First week of the first aggregation feature. + count: Number of aggregation features to create. + forecast_creation_time: The time point when the feature is created. + This value is used to prevent using data that are not available + at forecast creation time to compute features. + agg_func: Aggregation function to apply on multiple previous values, + accepted values are 'mean', 'quantile', 'std'. + q: If agg_func is 'quantile', taking value between 0 and 1. 
+ output_col_prefix: Prefix of the output columns. The start week of each + moving average feature is added at the end. Default value 'moving_agg_lag_'. + + Returns: + pd.DataFrame: data frame containing the newly created lag features as + columns. + + For example, start_week = 9, window_size=4, and count = 3 will + create three aggregation of features. + 1) moving_agg_lag_9: aggregate the same day and hour values of the 9th, + 10th, 11th, and 12th weeks before the current week. + 2) moving_agg_lag_10: aggregate the same day and hour values of the + 10th, 11th, 12th, and 13th weeks before the current week. + 3) moving_agg_lag_11: aggregate the same day and hour values of the + 11th, 12th, 13th, and 14th weeks before the current week. + """ + + df = pd.DataFrame({"Datetime": datetime_col, "value": value_col}) + df.set_index("Datetime", inplace=True) + + df = df.asfreq("H") + + if not df.index.is_monotonic: + df.sort_index(inplace=True) + + df["fct_diff"] = df.index - forecast_creation_time + df["fct_diff"] = df["fct_diff"].apply(lambda x: x.days * 24 + x.seconds / 3600) + max_diff = max(df["fct_diff"]) + + for i in range(count): + output_col = output_col_prefix + str(start_week + i) + week_lag_start = start_week + i + hour_lags = [(week_lag_start + w) * 24 * 7 for w in range(window_size)] + hour_lags = [h for h in hour_lags if h > max_diff] + if len(hour_lags) > 0: + tmp_df = df[["value"]].copy() + tmp_col_all = [] + for h in hour_lags: + tmp_col = "tmp_lag_" + str(h) + tmp_col_all.append(tmp_col) + tmp_df[tmp_col] = tmp_df["value"].shift(h) + + if agg_func == "mean" and q is None: + df[output_col] = round(tmp_df[tmp_col_all].mean(axis=1)) + elif agg_func == "quantile" and q is not None: + df[output_col] = round(tmp_df[tmp_col_all].quantile(q, axis=1)) + elif agg_func == "std" and q is None: + df[output_col] = round(tmp_df[tmp_col_all].std(axis=1)) + + df.drop(["fct_diff", "value"], inplace=True, axis=1) + + return df + + +def df_from_cartesian_product(dict_in): + 
"""Generate a Pandas dataframe from Cartesian product of lists. + + Args: + dict_in (Dictionary): Dictionary containing multiple lists, e.g. {"fea1": list1, "fea2": list2} + + Returns: + df (Dataframe): Dataframe corresponding to the Caresian product of the lists + """ + from itertools import product + + cart = list(product(*dict_in.values())) + df = pd.DataFrame(cart, columns=dict_in.keys()) + return df + + +def lagged_features(df, lags): + """Create lagged features based on time series data. + + Args: + df (Dataframe): Input time series data sorted by time + lags (List): Lag lengths + + Returns: + fea (Dataframe): Lagged features + """ + df_list = [] + for lag in lags: + df_shifted = df.shift(lag) + df_shifted.columns = [x + "_lag" + str(lag) for x in df_shifted.columns] + df_list.append(df_shifted) + fea = pd.concat(df_list, axis=1) + return fea + + +def moving_averages(df, start_step, window_size=None): + """Compute averages of every feature over moving time windows. + + Args: + df (Dataframe): Input features as a dataframe + start_step (Integer): Starting time step of rolling mean + window_size (Integer): Windows size of rolling mean + + Returns: + fea (Dataframe): Dataframe consisting of the moving averages + """ + if window_size is None: + # Use a large window to compute average over all historical data + window_size = df.shape[0] + fea = df.shift(start_step).rolling(min_periods=1, center=False, window=window_size).mean() + fea.columns = fea.columns + "_mean" + return fea + + +def combine_features(df, lag_fea, lags, window_size, used_columns): + """Combine lag features, moving average features, and orignal features in the data. 
+ + Args: + df (Dataframe): Time series data including the target series and external features + lag_fea (List): A list of column names for creating lagged features + lags (Numpy Array): Numpy array including all the lags + window_size (Integer): Window size of rolling mean + used_columns (List): A list containing the names of columns that are needed in the + input dataframe (including the target column) + + Returns: + fea_all (Dataframe): Dataframe including all the features + """ + lagged_fea = lagged_features(df[lag_fea], lags) + moving_avg = moving_averages(df[lag_fea], 2, window_size) + fea_all = pd.concat([df[used_columns], lagged_fea, moving_avg], axis=1) + return fea_all + + +def gen_sequence(df, seq_len, seq_cols, start_timestep=0, end_timestep=None): + """Reshape time series features into an array of dimension (# of time steps, # of + features). + + Args: + df (pd.Dataframe): Dataframe including time series data for a specific grain of a + multi-granular time series, e.g., data of a specific store-brand combination for + time series data involving multiple stores and brands + seq_len (int): Number of previous time series values to be used to form feature + sequences which can be used for model training + seq_cols (list[str]): A list of names of the feature columns + start_timestep (int): First time step you can use to create feature sequences + end_timestep (int): Last time step you can use to create feature sequences + + Returns: + object: A generator object for iterating all the feature sequences + """ + data_array = df[seq_cols].values + if end_timestep is None: + end_timestep = df.shape[0] + for start, stop in zip( + range(start_timestep, end_timestep - seq_len + 2), range(start_timestep + seq_len, end_timestep + 2) + ): + yield data_array[start:stop, :] + + +def gen_sequence_array(df_all, seq_len, seq_cols, grain1_name, grain2_name, start_timestep=0, end_timestep=None): + """Combine feature sequences for all the combinations of (grain1_name, 
grain2_name) into a + 3-dimensional array. + + Args: + df_all (pd.Dataframe): Time series data of all the grains for multi-granular data + seq_len (int): Number of previous time series values to be used to form sequences + seq_cols (list[str]): A list of names of the feature columns + grain1_name (str): Name of the 1st column indicating the time series graunularity + grain2_name (str): Name of the 2nd column indicating the time series graunularity + start_timestep (int): First time step you can use to create feature sequences + end_timestep (int): Last time step you can use to create feature sequences + + Returns: + seq_array (np.array): An array of feature sequences for all combinations of granularities + """ + seq_gen = ( + list( + gen_sequence( + df_all[(df_all[grain1_name] == grain1) & (df_all[grain2_name] == grain2)], + seq_len, + seq_cols, + start_timestep, + end_timestep, + ) + ) + for grain1, grain2 in itertools.product(df_all[grain1_name].unique(), df_all[grain2_name].unique()) + ) + seq_array = np.concatenate(list(seq_gen)).astype(np.float32) + return seq_array + + +def static_feature_array(df_all, total_timesteps, seq_cols, grain1_name, grain2_name): + """Generate an arary which encodes all the static features. + + Args: + df_all (pd.DataFrame): Time series data of all the grains for multi-granular data + total_timesteps (int): Total number of training samples for modeling + seq_cols (list[str]): A list of names of the static feature columns, e.g. store ID + grain1_name (str): Name of the 1st column indicating the time series graunularity + grain2_name (str): Name of the 2nd column indicating the time series graunularity + + Return: + fea_array (np.array): An array of static features of all the grains, e.g. 
all the + combinations of stores and brands in retail sale forecasting + """ + fea_df = ( + df_all.groupby([grain1_name, grain2_name]).apply(lambda x: x.iloc[:total_timesteps, :]).reset_index(drop=True) + ) + fea_array = fea_df[seq_cols].values + return fea_array + + +def normalize_columns(df, seq_cols, scaler=MinMaxScaler()): + """Normalize a subset of columns of a dataframe. + + Args: + df (pd.DataFrame): Input dataframe + seq_cols (list[str]): A list of names of columns to be normalized + scaler (object): A scikit learn scaler object + + Returns: + pd.DataFrame: Normalized dataframe + object: Scaler object + """ + cols_fixed = df.columns.difference(seq_cols) + df_scaled = pd.DataFrame(scaler.fit_transform(df[seq_cols]), columns=seq_cols, index=df.index) + df_scaled = pd.concat([df[cols_fixed], df_scaled], axis=1) + return df_scaled, scaler diff --git a/fclib/fclib/feature_engineering/lag.py b/contrib/tsperf/energy_utils/feature_engineering/lag.py similarity index 100% rename from fclib/fclib/feature_engineering/lag.py rename to contrib/tsperf/energy_utils/feature_engineering/lag.py diff --git a/fclib/fclib/feature_engineering/normalization.py b/contrib/tsperf/energy_utils/feature_engineering/normalization.py similarity index 100% rename from fclib/fclib/feature_engineering/normalization.py rename to contrib/tsperf/energy_utils/feature_engineering/normalization.py diff --git a/fclib/fclib/feature_engineering/rolling_window.py b/contrib/tsperf/energy_utils/feature_engineering/rolling_window.py similarity index 100% rename from fclib/fclib/feature_engineering/rolling_window.py rename to contrib/tsperf/energy_utils/feature_engineering/rolling_window.py diff --git a/fclib/fclib/feature_engineering/stats.py b/contrib/tsperf/energy_utils/feature_engineering/stats.py similarity index 100% rename from fclib/fclib/feature_engineering/stats.py rename to contrib/tsperf/energy_utils/feature_engineering/stats.py diff --git a/fclib/fclib/feature_engineering/temporal.py 
b/contrib/tsperf/energy_utils/feature_engineering/temporal.py similarity index 100% rename from fclib/fclib/feature_engineering/temporal.py rename to contrib/tsperf/energy_utils/feature_engineering/temporal.py diff --git a/fclib/fclib/feature_engineering/us_holidays.csv b/contrib/tsperf/energy_utils/feature_engineering/us_holidays.csv similarity index 100% rename from fclib/fclib/feature_engineering/us_holidays.csv rename to contrib/tsperf/energy_utils/feature_engineering/us_holidays.csv diff --git a/fclib/fclib/feature_engineering/utils.py b/contrib/tsperf/energy_utils/feature_engineering/utils.py similarity index 100% rename from fclib/fclib/feature_engineering/utils.py rename to contrib/tsperf/energy_utils/feature_engineering/utils.py diff --git a/fclib/fclib/evaluation/train_utils.py b/contrib/tsperf/energy_utils/train_utils.py similarity index 100% rename from fclib/fclib/evaluation/train_utils.py rename to contrib/tsperf/energy_utils/train_utils.py diff --git a/tools/readme_generator/Benchmarks.csv b/contrib/tsperf/readme_generator/Benchmarks.csv similarity index 100% rename from tools/readme_generator/Benchmarks.csv rename to contrib/tsperf/readme_generator/Benchmarks.csv diff --git a/tools/readme_generator/TSPerfBoard-Energy.csv b/contrib/tsperf/readme_generator/TSPerfBoard-Energy.csv similarity index 100% rename from tools/readme_generator/TSPerfBoard-Energy.csv rename to contrib/tsperf/readme_generator/TSPerfBoard-Energy.csv diff --git a/tools/readme_generator/TSPerfBoard-Retail.csv b/contrib/tsperf/readme_generator/TSPerfBoard-Retail.csv similarity index 100% rename from tools/readme_generator/TSPerfBoard-Retail.csv rename to contrib/tsperf/readme_generator/TSPerfBoard-Retail.csv diff --git a/contrib/tsperf/readme_generator/readme_generator.py b/contrib/tsperf/readme_generator/readme_generator.py new file mode 100644 index 00000000..6e5e057a --- /dev/null +++ b/contrib/tsperf/readme_generator/readme_generator.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python 
+# coding: utf-8 + +import csvtomd +import matplotlib.pyplot as plt +import pandas as pd + + +### Generating performance charts +################################################# + +# Function to plot a performance chart +def plot_perf(x, y, df): + + # extract submission name from submission URL + labels = df.apply(lambda x: x["Submission Name"][1:].split("]")[0], axis=1) + + fig = plt.scatter(x=df[x], y=df[y], label=labels, s=150, alpha=0.5, c=["b", "g", "r", "c", "m", "y", "k"]) + plt.xlabel(x) + plt.ylabel(y) + plt.title(y + " by " + x) + offset = (max(df[y]) - min(df[y])) / 50 + for i, name in enumerate(labels): + ax = df[x][i] + ay = df[y][i] + offset * (-2.5 + i % 5) + plt.text(ax, ay, name, fontsize=10) + + return fig + + +### Printing the Readme.md file +############################################ +readmefile = "../../Readme.md" +# Write header +# print(file=open(readmefile)) +print("# TSPerf\n", file=open(readmefile, "w")) + +print( + "TSPerf is a collection of implementations of time-series forecasting algorithms in Azure cloud and comparison of their performance over benchmark datasets. \ +Algorithm implementations are compared by model accuracy, training and scoring time and cost. Each implementation includes all the necessary \ +instructions and tools that ensure its reproducibility.", + file=open(readmefile, "a"), +) + +print("The following table summarizes benchmarks that are currently included in TSPerf.\n", file=open(readmefile, "a")) + +# Read the benchmark table the CSV file and converrt to a table in md format +with open("Benchmarks.csv", "r") as f: + table = csvtomd.csv_to_table(f, ",") +print(csvtomd.md_table(table), file=open(readmefile, "a")) +print("\n\n\n", file=open(readmefile, "a")) + +print( + "A complete documentation of TSPerf, along with the instructions for submitting and reviewing implementations, \ +can be found [here](./docs/tsperf_rules.md). The tables below show performance of implementations that are developed so far. 
Source code of \ +implementations and instructions for reproducing their performance can be found in submission folders, which are linked in the first column.\n", + file=open(readmefile, "a"), +) + +### Write the Energy section +# ============================ + +print("## Probabilistic energy forecasting performance board\n\n", file=open(readmefile, "a")) +print( + "The following table lists the current submision for the energy forecasting and their respective performances.\n\n", + file=open(readmefile, "a"), +) + +# Read the energy perfromane board from the CSV file and converrt to a table in md format +with open("TSPerfBoard-Energy.csv", "r") as f: + table = csvtomd.csv_to_table(f, ",") +print(csvtomd.md_table(table), file=open(readmefile, "a")) + +# Read Energy Performance Board CSV file +df = pd.read_csv("TSPerfBoard-Energy.csv", engine="python") +# df + +# Plot ,'Pinball Loss' by 'Training and Scoring Cost($)' chart +fig4 = plt.figure(figsize=(12, 8), dpi=80, facecolor="w", edgecolor="k") # this sets the plotting area size +fig4 = plot_perf("Training and Scoring Cost($)", "Pinball Loss", df) +plt.savefig("../../docs/images/Energy-Cost.png") + + +# insetting the performance charts +print( + "\n\nThe following chart compares the submissions performance on accuracy in Pinball Loss vs. 
Training and Scoring cost in $:\n\n ", + file=open(readmefile, "a"), +) +print("![EnergyPBLvsTime](./docs/images/Energy-Cost.png)", file=open(readmefile, "a")) +print("\n\n\n", file=open(readmefile, "a")) + + +# print the retail sales forcsating section +# ======================================== +print("## Retail sales forecasting performance board\n\n", file=open(readmefile, "a")) +print( + "The following table lists the current submision for the retail forecasting and their respective performances.\n\n", + file=open(readmefile, "a"), +) + +# Read the energy perfromane board from the CSV file and converrt to a table in md format +with open("TSPerfBoard-Retail.csv", "r") as f: + table = csvtomd.csv_to_table(f, ",") +print(csvtomd.md_table(table), file=open(readmefile, "a")) +print("\n\n\n", file=open(readmefile, "a")) + +# Read Retail Performane Board CSV file +df = pd.read_csv("TSPerfBoard-Retail.csv", engine="python") +# df + +# Plot MAPE (%) by Training and Scoring Cost ($) chart +fig2 = plt.figure(figsize=(12, 8), dpi=80, facecolor="w", edgecolor="k") # this sets the plotting area size +fig2 = plot_perf("Training and Scoring Cost ($)", "MAPE (%)", df) +plt.savefig("../../docs/images/Retail-Cost.png") + + +# insetting the performance charts +print( + "\n\nThe following chart compares the submissions performance on accuracy in %MAPE vs. 
Training and Scoring cost in $:\n\n ", + file=open(readmefile, "a"), +) +print("![EnergyPBLvsTime](./docs/images/Retail-Cost.png)", file=open(readmefile, "a")) +print("\n\n\n", file=open(readmefile, "a")) + +# insertting build status badge +print("## Build Status\n\n", file=open(readmefile, "a")) +print("| Build Type | Branch | Status | | Branch | Status |", file=open(readmefile, "a")) +print("| --- | --- | --- | --- | --- | --- |", file=open(readmefile, "a")) +print( + "| **Python Linux CPU** | master | [![Build Status](https://dev.azure.com/best-practices/forecasting/_apis/build/status/python_unit_tests_base?branchName=master)](https://dev.azure.com/best-practices/forecasting/_build/latest?definitionId=12&branchName=master) | | staging | [![Build Status](https://dev.azure.com/best-practices/forecasting/_apis/build/status/python_unit_tests_base?branchName=chenhui/python_test_pipeline)](https://dev.azure.com/best-practices/forecasting/_build/latest?definitionId=12&branchName=chenhui/python_test_pipeline) |", + file=open(readmefile, "a"), +) +print( + "| **R Linux CPU** | master | [![Build Status](https://dev.azure.com/best-practices/forecasting/_apis/build/status/Forecasting/r_unit_tests_prototype?branchName=master)](https://dev.azure.com/best-practices/forecasting/_build/latest?definitionId=9&branchName=master) | | staging | [![Build Status](https://dev.azure.com/best-practices/forecasting/_apis/build/status/Forecasting/r_unit_tests_prototype?branchName=zhouf/r_test_pipeline)](https://dev.azure.com/best-practices/forecasting/_build/latest?definitionId=9&branchName=zhouf/r_test_pipeline) |", + file=open(readmefile, "a"), +) +print("\n\n\n", file=open(readmefile, "a")) + + +print("A new Readme.md file has been generated successfully.") diff --git a/fclib/fclib/evaluation/evaluate.py b/contrib/tsperf/scripts/evaluate.py similarity index 100% rename from fclib/fclib/evaluation/evaluate.py rename to contrib/tsperf/scripts/evaluate.py diff --git a/docs/SETUP.md 
b/docs/SETUP.md index e60c4e64..a59aae0c 100644 --- a/docs/SETUP.md +++ b/docs/SETUP.md @@ -4,9 +4,12 @@ Please follow these instructions to read about the preferred compute environment ### Compute environment -The code in this repo has been developed and tested on an Azure Linux VM. Therefore, we recommend using an [Azure Data Science Virtual Machine (DSVM) for Linux (Ubuntu)](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/dsvm-ubuntu-intro) to run the example notebooks and scripts. This VM will come installed with all the system requirements that are needed to create the conda environment described below and then run the notebooks in this repository. +The code in this repo has been developed and tested on an Azure Linux VM. Therefore, we recommend using an [Azure Data Science Virtual Machine (DSVM) for Linux (Ubuntu)](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/dsvm-ubuntu-intro) to run the example notebooks and scripts. This VM will come installed with all the system requirements that are needed to create the conda environment described below and then run the notebooks in this repository. If you are using a Linux machine without conda installed, please install Miniconda by following the instructions in this [link](https://docs.conda.io/projects/conda/en/latest/user-guide/install/linux.html). + +You can also use a Windows machine to run the example notebooks and scripts. In this case, you may either work with a [Windows Server 2019 Data Science Virtual Machine on Azure](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/provision-vm) or a local Windows machine. The Azure Windows VM comes with conda pre-installed. If conda is not installed on your machine, please follow the instructions in this [link](https://docs.conda.io/projects/conda/en/latest/user-guide/install/windows.html) to install Miniconda.
### Clone the repository + To clone the Forecasting repository to your local machine, please run: ``` @@ -14,27 +17,51 @@ git clone https://github.com/microsoft/forecasting.git cd forecasting/ ``` -Next, follow the instruction below to install all dependencies required to run the examples provided in the repository. Follow [Automated environment setup](#automated-environment-setup) section to setup the environment automatically using a script. Alternatively, follow the [Manual environment setup](#manual-environment-setup) section for a step-by-step guide to setting up the environment. +Next, follow the instructions below to install all dependencies required to run the examples provided in the repository. Follow the [Automated environment setup](#automated-environment-setup) section to set up the environment automatically using a script. Alternatively, follow the [Manual environment setup](#manual-environment-setup) section for a step-by-step guide to setting up the environment. ### Automated environment setup -We provide a script to install all dependencies automatically on a Linux machine. To execute the script, please run: +We provide scripts to install all dependencies automatically on a Linux machine as well as on a Windows machine. +#### Linux + +If you are using a Linux machine, please run the following command to execute the shell script for Linux ``` ./tools/environment_setup.sh ``` -from the root of Forecasting repo. If you have issues with running the setup script, please follow the [Manual environment setup](#manual-environment-setup) instructions below. +from the root of the Forecasting repo. -Once you've executed the setup script, you can run example notebooks under [examples/](./examples) directory. +#### Windows + +Similarly, if you are using a Windows machine, please run the batch script for Windows via +``` +tools\environment_setup.bat +``` +from the root of the Forecasting repo.
Note that you need to run the above command from the Anaconda Prompt (a terminal with conda available), which can be started by opening the Windows Start menu and clicking `Anaconda Prompt (Miniconda3)` as follows: + +

+ +

+ +Once you've executed the setup script, please activate the newly created conda environment: + +``` +conda activate forecasting_env +``` + +> **NOTE**: If you have issues with running the setup script, please follow the [Manual environment setup](#manual-environment-setup) instructions below. + +Next, navigate to the [Starting the Jupyter Notebook Server](#starting-the-jupyter-notebook-server) section below to start the Jupyter server necessary for running the examples. ### Manual environment setup + #### Conda environment To install the package contained in this repository, navigate to the directory where you pulled the Forecasting repo to run: ```bash conda update conda -conda env create -f tools/environment.yaml +conda env create -f tools/environment.yml ``` This will create the appropriate conda environment to run experiments. Next activate the installed environment: ```bash @@ -63,6 +90,24 @@ In order to run the example notebooks, make sure to run the notebooks in the con python -m ipykernel install --user --name forecasting_env ``` -Once you've set up the environment, you can run example notebooks under [examples/](./examples) directory. +### Starting the Jupyter Notebook Server +In order to run the example notebooks provided in this repository, you will have to start a Jupyter notebook server. +For running examples on your **local machine**, please open your terminal application and run the following command: +``` +jupyter notebook +``` + +If you are working on a remote VM, you can start the notebook server with the following command: +``` +jupyter notebook --no-browser --port=8889 +``` +and forward the port where the notebooks are running (e.g., 8889) to the local machine by running the following command from the local machine: +``` +ssh -L localhost:8889:localhost:8889 YOUR_USERNAME@YOUR_VM_IP +``` + +To access the notebooks, type `localhost:8889/` in the browser on your local machine.
+ +Now you're ready to run the examples provided in the `examples/`, by simply opening and executing the notebooks in the Jupyter server. Please also navigate to the [examples README file](../examples/README.md) to read about the available notebooks. diff --git a/examples/00_quick_start/azure_automl_forecast.ipynb b/examples/00_quick_start/azure_automl_forecast.ipynb deleted file mode 100644 index cce1a92a..00000000 --- a/examples/00_quick_start/azure_automl_forecast.ipynb +++ /dev/null @@ -1,756 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copyright (c) Microsoft Corporation.\n", - "\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Automated Machine Learning (AutoML) on Azure for Retail Sales Forecasting\n", - "\n", - "This notebook demonstrates how to apply [AutoML in Azure Machine Learning services](https://docs.microsoft.com/en-us/azure/machine-learning/concept-automated-ml) to train and tune machine learning models for forecasting product sales in retail. We will use the Orange Juice dataset to illustrate the steps of utilizing AutoML as well as how to combine an AutoML model with a custom model for better performance.\n", - "\n", - "AutoML is a process of automating the tasks of machine learning model development. It helps data scientists and other practioners build machine learning models with high scalability and quality in less amount of time. AutoML in Azure Machine Learning allows you to train and tune a model using a target metric that you specify. This service iterates through machine learning algorithms and feature selection approaches, producing a score that measures the quality of each machine learning pipeline. The best model will then be selected based on the scores. 
For more technical details about Azure AutoML, please check [this paper](https://papers.nips.cc/paper/7595-probabilistic-matrix-factorization-for-automated-machine-learning.pdf).\n", - "\n", - "This notebook uses [Azure ML SDK](https://docs.microsoft.com/en-us/python/api/overview/azureml-sdk/?view=azure-ml-py) which is included in the `forecasting_env` conda environment. If you are running in Azure Notebooks or another Microsoft managed environment, the SDK is already installed. On the other hand, if you are running this notebook in your own environment, please follow [SDK installation instructions](https://docs.microsoft.com/azure/machine-learning/service/how-to-configure-environment) to install the SDK." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Global Settings and Imports" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "%matplotlib inline" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import sys\n", - "import math\n", - "import datetime\n", - "import logging\n", - "import azureml.core\n", - "import azureml.automl\n", - "import pandas as pd\n", - "\n", - "from matplotlib import pyplot as plt\n", - "from fclib.common.utils import git_repo_path\n", - "from fclib.evaluation.evaluation_utils import MAPE\n", - "from fclib.dataset.ojdata import download_ojdata, FIRST_WEEK_START\n", - "from fclib.common.utils import align_outputs\n", - "from fclib.models.multiple_linear_regression import fit, predict\n", - "\n", - "from azureml.core import Workspace\n", - "from azureml.core.dataset import Dataset\n", - "from azureml.core.experiment import Experiment\n", - "from automl.client.core.common import constants\n", - "from azureml.train.automl import AutoMLConfig\n", - "from azureml.core.compute import ComputeTarget, AmlCompute\n", - "from 
azureml.core.compute_target import ComputeTargetException\n", - "from azureml.automl.core._vendor.automl.client.core.common import metrics\n", - "\n", - "print(\"System version: {}\".format(sys.version))\n", - "print(\"This notebook was created using version 1.0.85 of the Azure ML SDK\")\n", - "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Use False if you've already downloaded and split the data\n", - "DOWNLOAD_SPLIT_DATA = True\n", - "\n", - "# Data directory\n", - "DATA_DIR = os.path.join(git_repo_path(), \"ojdata\")\n", - "\n", - "# Forecasting settings\n", - "GAP = 2\n", - "LAST_WEEK = 138\n", - "\n", - "# Number of test periods\n", - "NUM_TEST_PERIODS = 3\n", - "\n", - "# Column names\n", - "time_column_name = \"week_start\"\n", - "target_column_name = \"move\"\n", - "grain_column_names = [\"store\", \"brand\"]\n", - "index_column_names = [time_column_name] + grain_column_names\n", - "\n", - "# Subset of stores used in the notebook\n", - "USE_STORES = [2, 5, 8]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Set up Azure Machine Learning Workspace\n", - "\n", - "An Azure ML workspace is an Azure resource that organizes and coordinates the actions of many other Azure resources to assist in executing and sharing machine learning workflows. In particular, an Azure ML workspace coordinates storage, databases, and compute resources providing added functionality for machine learning experimentation, deployment, inference, and the monitoring of deployed models. To create an Azure ML workspace, first you need access to an Azure subscription. An Azure subscription allows you to manage storage, compute, and other assets in the Azure cloud. 
You can [create a new subscription](https://azure.microsoft.com/en-us/free/) or access existing subscription information from the [Azure portal](https://portal.azure.com/). Given that you have access to your Azure subscription, you can further create an Azure ML workspace by following the instructions [here](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-manage-workspace). You can also do so [using Azure CLI](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-manage-workspace-cli) or the `Workspace.create()` method in Azure SDK.\n", - "\n", - "In the following cell, please replace the value of each parameter with the value of the corresponding attribute of your workspace." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "subscription_id = \"\"\n", - "resource_group = \"\"\n", - "workspace_name = \"\"\n", - "workspace_region = \"eastus2\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Access Azure ML Workspace\n", - "\n", - "In what follows, we use Azure ML SDK to attempt to load the workspace specified by your parameters. The cell can fail if the specified workspace doesn't exist or you don't have permissions to access it. Hence, you may need to log into your Azure account and change the default subscription to the one which the workspace belongs to using Azure CLI `az account set --subscription `." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "try:\n", - " ws = Workspace.create(subscription_id=subscription_id, resource_group=resource_group, \n", - " name=workspace_name, create_resource_group=True, exist_ok=True, \n", - " location=workspace_region)\n", - " # write the details of the workspace to a configuration file to the notebook library\n", - " ws.write_config()\n", - " print(\"Workspace configuration succeeded. 
Skip the workspace creation steps below\")\n", - "except ValueError:\n", - " raise Exception(\"Workspace not accessible. Change your parameters or create a new workspace below\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create compute resources for your experiments\n", - "\n", - "We run AutoML on a dynamically scalable compute cluster. To create a compute cluster, you need to specify a compute configuration that specifies the type of machine to be used and the scalability behaviors. Then you choose a name for the cluster that is unique within the workspace that can be used to address the cluster later." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Choose a name for your CPU cluster\n", - "cpu_cluster_name = \"cpu-cluster\"\n", - "\n", - "# Verify that cluster does not exist already\n", - "workspace_compute = ws.compute_targets\n", - "if cpu_cluster_name in workspace_compute:\n", - " print(\"Found existing cpu-cluster\")\n", - " cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)\n", - "else: \n", - " print(\"Creating new cpu-cluster\")\n", - "\n", - " # Specify the configuration for the new cluster\n", - " compute_config = AmlCompute.provisioning_configuration(vm_size=\"STANDARD_D2_V2\", min_nodes=4, max_nodes=4)\n", - "\n", - " # Create the cluster with the specified name and configuration\n", - " cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)\n", - "\n", - " # Wait for the cluster to complete, show the output log\n", - " cpu_cluster.wait_for_completion(show_output=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Experiment\n", - "\n", - "To run AutoML, you need to create an Experiment. An Experiment corresponds to a prediction problem you are trying to solve, while a Run corresponds to a specific approach to the problem." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# choose a name for the run history container in the workspace\n", - "experiment_name = \"automl-ojforecasting\"\n", - "\n", - "experiment = Experiment(ws, experiment_name)\n", - "\n", - "output = {}\n", - "output[\"SDK version\"] = azureml.core.VERSION\n", - "output[\"Workspace\"] = ws.name\n", - "output[\"SKU\"] = ws.sku\n", - "output[\"Resource Group\"] = ws.resource_group\n", - "output[\"Location\"] = ws.location\n", - "output[\"Run History Name\"] = experiment_name\n", - "pd.set_option(\"display.max_colwidth\", -1)\n", - "outputDf = pd.DataFrame(data=output, index=[\"\"])\n", - "outputDf.T" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Preparation\n", - "\n", - "We need to download the Orange Juice data and split it into training and test sets. By default, the following cell will download and spit the data. If you've already done so, you may skip this part by switching `DOWNLOAD_SPLIT_DATA` to `False`.\n", - "\n", - "We store the training data and test data using dataframes. The training data includes `train_df` and `aux_df` with `train_df` containing the historical sales up to week 135 (the time we make forecasts) and `aux_df` containing price/promotion information up until week 138. We assume that future price and promotion information up to a certain number of weeks ahead is predetermined and known. The test data is stored in `test_df` which contains the sales of each product in week 137 and 138. Assuming the current week is week 135, our goal is to forecast the sales in week 137 and 138 using the training data. There is a one-week gap between the current week and the first target week of forecasting as we want to leave time for planning inventory in practice." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Data download and split" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if DOWNLOAD_SPLIT_DATA:\n", - " download_ojdata(DATA_DIR)\n", - " df = pd.read_csv(os.path.join(DATA_DIR, \"yx.csv\"))\n", - " df = df.loc[df.week <= LAST_WEEK]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Convert logarithm of the unit sales to unit sales\n", - "df[\"move\"] = df[\"logmove\"].apply(lambda x: round(math.exp(x)))\n", - "# Add timestamp column\n", - "df[\"week_start\"] = df[\"week\"].apply(lambda x: FIRST_WEEK_START + datetime.timedelta(days=(x - 1) * 7))\n", - "# Select a subset of stores for demo purpose\n", - "df_sub = df[df.store.isin(USE_STORES)]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Split data into training and test sets\n", - "def split_last_n_by_grain(df, n):\n", - " \"\"\"Group df by grain and split on last n rows for each group.\"\"\"\n", - " df_grouped = df.sort_values(time_column_name).groupby( # Sort by ascending time\n", - " grain_column_names, group_keys=False\n", - " )\n", - " df_head = df_grouped.apply(lambda dfg: dfg.iloc[:-n])\n", - " df_tail = df_grouped.apply(lambda dfg: dfg.iloc[-n:])\n", - " return df_head, df_tail\n", - "\n", - "\n", - "train_df, test_df = split_last_n_by_grain(df_sub, NUM_TEST_PERIODS)\n", - "train_df.reset_index(drop=True)\n", - "test_df.reset_index(drop=True)\n", - "\n", - "# Save data locally\n", - "local_data_pathes = [\n", - " os.path.join(DATA_DIR, \"train.csv\"),\n", - " os.path.join(DATA_DIR, \"test.csv\"),\n", - "]\n", - "\n", - "train_df.to_csv(local_data_pathes[0], index=None, header=True)\n", - "test_df.to_csv(local_data_pathes[1], index=None, header=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - 
"### Upload data to datastore\n", - "\n", - "The [Machine Learning service workspace](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-workspace), is paired with the storage account, which contains the default data store. We will use it to upload the train and test data and create [tabular datasets](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset?view=azure-ml-py) for training and testing. A tabular dataset defines a series of lazily-evaluated, immutable operations to load data from the data source into tabular representation.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "datastore = ws.get_default_datastore()\n", - "datastore.upload_files(files=local_data_pathes, target_path=\"dataset/\", overwrite=True, show_progress=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create dataset for training" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train_dataset = Dataset.Tabular.from_delimited_files(path=datastore.path(\"dataset/train.csv\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train_dataset.to_pandas_dataframe().tail()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Modeling\n", - "\n", - "For forecasting tasks, AutoML uses pre-processing and estimation steps that are specific to time-series. AutoML will undertake the following pre-processing steps:\n", - "* Detect time-series sample frequency (e.g. hourly, daily, weekly) and create new records for absent time points to make the series regular. 
A regular time series has a well-defined frequency and has a value at every sample point in a contiguous time span\n", - "* Impute missing values in the target (via forward-fill) and feature columns (using median column values)\n", - "* Create grain-based features to enable fixed effects across different series\n", - "* Create time-based features to assist in learning seasonal patterns\n", - "* Encode categorical variables to numeric quantities\n", - "\n", - "In this notebook, AutoML will train a single, regression-type model across all time-series in a given training set. This allows the model to generalize across related series. To create a training job, we use AutoML Config object to define the settings and data. Here is a summary of the meanings of the AutoMLConfig parameters:\n", - "\n", - "|Property|Description|\n", - "|-|-|\n", - "|**task**|forecasting|\n", - "|**primary_metric**|This is the metric that you want to optimize.
Forecasting supports the following primary metrics
spearman_correlation
normalized_root_mean_squared_error
r2_score
normalized_mean_absolute_error\n", - "|**experiment_timeout_hours**|Experimentation timeout in hours.|\n", - "|**enable_early_stopping**|If early stopping is on, training will stop when the primary metric is no longer improving.|\n", - "|**training_data**|Input dataset, containing both features and label column.|\n", - "|**label_column_name**|The name of the label column.|\n", - "|**compute_target**|The remote compute for training.|\n", - "|**n_cross_validations**|Number of cross-validation folds to use for model/pipeline selection|\n", - "|**enable_voting_ensemble**|Allow AutoML to create a Voting ensemble of the best performing models|\n", - "|**enable_stack_ensemble**|Allow AutoML to create a Stack ensemble of the best performing models|\n", - "|**debug_log**|Log file path for writing debugging information|\n", - "|**time_column_name**|Name of the datetime column in the input data|\n", - "|**grain_column_names**|Name(s) of the columns defining individual series in the input data|\n", - "|**drop_column_names**|Name(s) of columns to drop prior to modeling|\n", - "|**max_horizon**|Maximum desired forecast horizon in units of time-series frequency|" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model training" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "time_series_settings = {\n", - " \"time_column_name\": time_column_name,\n", - " \"grain_column_names\": grain_column_names,\n", - " \"drop_column_names\": [\"logmove\"], # 'logmove' is a leaky feature, so we remove it.\n", - " \"max_horizon\": NUM_TEST_PERIODS,\n", - "}\n", - "\n", - "automl_config = AutoMLConfig(\n", - " task=\"forecasting\",\n", - " debug_log=\"automl_oj_sales_errors.log\",\n", - " primary_metric=\"normalized_mean_absolute_error\",\n", - " experiment_timeout_hours=0.6, # You may increase this number to improve model accuracy\n", - " training_data=train_dataset,\n", - " 
label_column_name=target_column_name,\n", - " compute_target=cpu_cluster,\n", - " enable_early_stopping=True,\n", - " n_cross_validations=3,\n", - " verbosity=logging.INFO,\n", - " **time_series_settings\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "remote_run = experiment.submit(automl_config, show_output=False)\n", - "remote_run" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "remote_run.wait_for_completion()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Retrieve the best model\n", - "\n", - "Each run within an Experiment stores serialized (i.e. pickled) pipelines from the AutoML iterations. After the training job is done, we can retrieve the pipeline with the best performance on the validation dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "best_run, fitted_model = remote_run.get_output()\n", - "print(fitted_model.steps)\n", - "model_name = best_run.properties[\"model_name\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Forecasting\n", - "\n", - "Now that we have retrieved the best model pipeline, we can apply it to generate forecasts for the target weeks. 
To do this, we first remove the target values from the test set" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Generate forecasts" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "X_test = test_df\n", - "y_test = X_test.pop(target_column_name).values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "X_test.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# The featurized data, aligned to y, will also be returned. It contains the assumptions\n", - "# that were made in the forecast and helps align the forecast to the original data.\n", - "y_predictions, X_trans = fitted_model.forecast(X_test)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We need to align the output explicitly to the input, as the count and order of the rows may have changed during transformations that span multiple rows." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pred_automl = align_outputs(y_predictions, X_trans, X_test, y_test, target_column_name)\n", - "pred_automl.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Results evaluation & visualization" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Use automl metrics module\n", - "scores = metrics.compute_metrics_regression(\n", - " pred_automl[\"predicted\"],\n", - " pred_automl[target_column_name],\n", - " list(constants.Metric.SCALAR_REGRESSION_SET),\n", - " None,\n", - " None,\n", - " None,\n", - ")\n", - "\n", - "print(\"[Test data scores]\\n\")\n", - "for key, value in scores.items():\n", - " print(\"{}: {:.3f}\".format(key, value))\n", - "\n", - "# Plot outputs\n", - "%matplotlib inline\n", - "test_pred = plt.scatter(pred_automl[target_column_name], pred_automl[\"predicted\"], color=\"b\")\n", - "test_test = plt.scatter(pred_automl[target_column_name], pred_automl[target_column_name], color=\"g\")\n", - "plt.legend((test_pred, test_test), (\"prediction\", \"truth\"), loc=\"upper left\", fontsize=8)\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We also compute MAPE of the forecasts in the last two weeks of the forecast period in order to be consistent with the evaluation period that is used in other quick start examples." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pred_automl_sub = pred_automl.loc[pred_automl.week >= max(test_df.week) - NUM_TEST_PERIODS + GAP]\n", - "mape_automl_sub = MAPE(pred_automl_sub[\"predicted\"], pred_automl_sub[\"move\"]) * 100\n", - "print(\"MAPE of forecasts obtained by AutoML in the last two weeks: \" + str(mape_automl_sub))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Combine AutoML Model with a Custom Model\n", - "\n", - "So far we have demonstrated how we can quickly build a forecasting model with AutoML in Azure. Next, we further show a simple way to achieve more robust and accurate forecasts by combining the forecasts from AutoML and a custom model that the user may have. Here we assume that the user have also constructed a series of linear regression models with each model forecasts the sales of a specfic store-brand using `scikit-learn` package." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Multiple linear regression models" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create price features\n", - "df_sub[\"price\"] = df_sub.apply(lambda x: x.loc[\"price\" + str(int(x.loc[\"brand\"]))], axis=1)\n", - "price_cols = [\n", - " \"price1\",\n", - " \"price2\",\n", - " \"price3\",\n", - " \"price4\",\n", - " \"price5\",\n", - " \"price6\",\n", - " \"price7\",\n", - " \"price8\",\n", - " \"price9\",\n", - " \"price10\",\n", - " \"price11\",\n", - "]\n", - "df_sub[\"avg_price\"] = df_sub[price_cols].sum(axis=1).apply(lambda x: x / len(price_cols))\n", - "df_sub[\"price_ratio\"] = df_sub.apply(lambda x: x[\"price\"] / x[\"avg_price\"], axis=1)\n", - "\n", - "# Create lag features on unit sales\n", - "df_sub[\"move_lag1\"] = df_sub[\"move\"].shift(1)\n", - "df_sub[\"move_lag2\"] = df_sub[\"move\"].shift(2)\n", - "\n", - "# Drop rows with NaN 
values\n", - "df_sub.dropna(inplace=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After splitting the data, we use `fit()` and `predit()` functions from `fclib.models.multiple_linear_regression` to train separate linear regression model for each invididual time series and generate forecasts for the sales during the test period." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Split data into training and test sets\n", - "train_df, test_df = split_last_n_by_grain(df_sub, NUM_TEST_PERIODS)\n", - "train_df.reset_index(drop=True)\n", - "test_df.reset_index(drop=True)\n", - "\n", - "# Train multiple linear regression models\n", - "fea_column_names = [\"move_lag1\", \"move_lag2\", \"price\", \"price_ratio\"]\n", - "lr_models = fit(train_df, grain_column_names, fea_column_names, target_column_name)\n", - "\n", - "# Generate forecasts with the trained models\n", - "pred_all = predict(test_df, lr_models, time_column_name, grain_column_names, fea_column_names)\n", - "\n", - "pred_lr = pd.merge(pred_all, test_df, on=index_column_names)\n", - "pred_lr.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's check the accuracy of the predictions on the entire forecast period as well as in the last two weeks of the forecast period.\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mape_lr_entire = MAPE(pred_lr[\"prediction\"], pred_lr[\"move\"]) * 100\n", - "print(\"MAPE of forecasts obtained by multiple linear regression on entire test period: \" + str(mape_lr_entire))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pred_lr_sub = pred_lr.loc[pred_lr.week >= max(test_df.week) - NUM_TEST_PERIODS + GAP]\n", - "mape_lr_sub = MAPE(pred_lr_sub[\"prediction\"], pred_lr_sub[\"move\"]) * 100\n", 
- "print(\"MAPE of forecasts obtained by multiple linear regression in the last two weeks: \" + str(mape_lr_sub))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Combine forecasts from different methods\n", - "\n", - "We can combine the forecasts obtained by AutoML and multiple linear regression using weighted average and evaluate the final forecasts. Usually the combined forecasts will be more robust as a combination of two methods can reduce the chance of model overfitting. Here we use equal weights which can be further adjusted according to our confidence on each model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pred_final = pd.merge(\n", - " pred_automl[index_column_names + [\"predicted\", \"move\", \"week\"]],\n", - " pred_lr[index_column_names + [\"prediction\"]],\n", - " on=index_column_names,\n", - " how=\"left\",\n", - ")\n", - "pred_final[\"combined_prediction\"] = pred_final[\"predicted\"] * 0.5 + pred_final[\"prediction\"] * 0.5" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mape_entire = MAPE(pred_final[\"combined_prediction\"], pred_final[\"move\"]) * 100\n", - "print(\"MAPE of forecasts obtained by the combined model on entire test period: \" + str(mape_entire))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pred_final_sub = pred_final.loc[pred_final.week >= max(test_df.week) - NUM_TEST_PERIODS + GAP]\n", - "mape_final_sub = MAPE(pred_final_sub[\"combined_prediction\"], pred_final_sub[\"move\"]) * 100\n", - "print(\"MAPE of forecasts obtained by the combined model in the last two weeks: \" + str(mape_final_sub))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Additional Reading\n", - "\n", - "\\[1\\] Nicolo Fusi, Rishit Sheth, and Melih Elibol. 2018. 
Probabilistic Matrix Factorization for Automated Machine Learning. In Advances in Neural Information Processing Systems. 3348-3357.
\n", - "\\[2\\] Azure AutoML Package Docs: https://docs.microsoft.com/en-us/python/api/azureml-train-automl/azureml.train.automl?view=azure-ml-py
\n", - "\\[3\\] Azure Automated Machine Learning Examples: https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/automated-machine-learning
\n", - "\n", - "\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "forecasting_env", - "language": "python", - "name": "forecasting_env" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.10" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/examples/README.md b/examples/README.md index 69f5fa79..39b28c5b 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,16 +1,16 @@ # Forecasting examples -This folder contains Python examples for building forecasting solutions. To run the notebooks, please execute `jupyter notebook` and select the Jupyter kernel `forecasting_env` if you are using a local machine. Otherwise, if you use a remote VM, you can start the notebooks via `jupyter notebook --no-browser` and forward the port where the notebooks are running (e.g., 8888) to the local machine via `ssh @ -L 8888:localhost:8888`. +This folder contains Python and R examples for building forecasting solutions presented in Python Jupyter notebooks and R Markdown files, respectively. The examples are organized according to forecasting scenarios in different use cases with each subdirectory under `examples/` named after the specific use case. + +At the moment, the repository contains a single retail sales forecasting scenario utilizing [Dominick's OrangeJuice data set](https://www.chicagobooth.edu/research/kilts/datasets/dominicks). The name of the directory is `grocery_sales`. ## Summary -The following summarizes each directory of the best practice notebooks. +The following table summarizes each forecasting scenario contained in the repository, and links available content within that scenario. 
+ +| Directory | Content | Description | +|----------------------------------|----------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------| +| [grocery_sales](./grocery_sales) | [python/](./grocery_sales/python)
[R/](./grocery_sales/R) | Python and R examples for forecasting sales of orange juice in [Dominick's dataset](https://www.chicagobooth.edu/research/kilts/datasets/dominicks). | -| Directory | Content | Description | -| --- | --- | --- | -| [00_quick_start](./00_quick_start)| [auto_arima_forecasting.ipynb](./00_quick_start/auto_arima_forecasting.ipynb)
[azure_automl_forecast.ipynb](./00_quick_start/azure_automl_forecast.ipynb)
[lightgbm_point_forecast.ipynb](./00_quick_start/lightgbm_point_forecast.ipynb) | Quick start notebooks that demonstrate workflow of developing a forecasting model using one-round training and testing data| -| [01_prepare_data](./01_prepare_data) | [ojdata_exploration_retail.ipynb](./01_prepare_data/ojdata_exploration_retail.ipynb)
[ojdata_preparation_retail.ipynb](./01_prepare_data/ojdata_preparation_retail.ipynb) | Data exploration and preparation notebooks| -| [02_model](./02_model) | [dilatedcnn_point_forecast_multiround.ipynb](./02_model/dilatedcnn_point_forecast_multiround.ipynb)
[lightgbm_point_forecast_multiround.ipynb](./02_model/lightgbm_point_forecast_multiround.ipynb) | Deep dive notebooks that perform multi-round training and testing of various classical and deep learning forecast algorithms| -| [03_model_select_deploy](03_model_select_deploy) | Example notebook to be added soon | Best practice notebook for model selecting by using Azure Machine Learning Service and deploying the best model on Azure| diff --git a/examples/grocery_sales/R/01_dataprep.Rmd b/examples/grocery_sales/R/01_dataprep.Rmd new file mode 100644 index 00000000..05cd7d02 --- /dev/null +++ b/examples/grocery_sales/R/01_dataprep.Rmd @@ -0,0 +1,96 @@ +--- +title: Data preparation +output: html_notebook +--- + +_Copyright (c) Microsoft Corporation._
+_Licensed under the MIT License._ + +In this notebook, we generate the datasets that will be used for model training and validating. + +The orange juice dataset comes from the bayesm package, and gives pricing and sales figures over time for a variety of orange juice brands in several stores in Florida. Rather than installing the entire package (which is very complex), we download the dataset itself from the GitHub mirror of the CRAN repository. + +```{r, results="hide", message=FALSE} +# download the data from the GitHub mirror of the bayesm package source +ojfile <- tempfile(fileext=".rda") +download.file("https://github.com/cran/bayesm/raw/master/data/orangeJuice.rda", ojfile) +load(ojfile) +file.remove(ojfile) +``` + +The dataset generation parameters are obtained from the file `forecast_settings.yaml`; you can modify that file to vary the experimental setup. The settings are + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `N_SPLITS` | The number of splits to make. | 10 | +| `HORIZON` | The forecast horizon for the test dataset for each split. | 2 | +| `GAP` | The gap in weeks from the end of the training period to the start of the testing period; see below. | 2 | +| `FIRST_WEEK` | The first week of data to use. | 40 | +| `LAST_WEEK` | The last week of data to use. | 156 | +| `START_DATE` | The actual calendar date for the start of the first week in the data. | `1989-09-14` | + +A complicating factor is that the data does not include every possible combination of store, brand and date, so we have to pad out the missing rows with `complete`. In addition, one store/brand combination has no data beyond week 156; we therefore end the analysis at this week. We also do _not_ fill in the missing values in the data, as many of the modelling functions in the fable package can handle this innately. 
+ +```{r, results="hide", message=FALSE} +library(tidyr) +library(dplyr) +library(tsibble) +library(feasts) +library(fable) + +settings <- yaml::read_yaml(here::here("examples/grocery_sales/R/forecast_settings.yaml")) +start_date <- as.Date(settings$START_DATE) +train_periods <- seq(to=settings$LAST_WEEK - settings$HORIZON - settings$GAP + 1, + by=settings$HORIZON, + length.out=settings$N_SPLITS) + +oj_data <- orangeJuice$yx %>% + complete(store, brand, week) %>% + mutate(week=yearweek(start_date + week*7)) %>% + as_tsibble(index=week, key=c(store, brand)) +``` + +Here are some glimpses of what the data looks like. The dependent variable is `logmove`, the logarithm of the total sales for a given brand and store, in a particular week. + +```{r} +head(oj_data) +``` + +The time series plots for a small subset of brands and stores are shown below. We can make the following observations: + +- There appears to be little seasonal variation in sales (probably because Florida is a state without very different seasons). In any case, with less than 2 years of observations, the time series is not long enough for many model-fitting functions in the fable package to automatically estimate seasonal parameters. +- While some store/brand combinations show weak trends over time, this is far from universal. +- Different brands can exhibit very different behaviour, especially in terms of variation about the mean. +- Many of the time series have missing values, indicating that the dataset is incomplete. + + +```{r, fig.height=10} +library(ggplot2) + +oj_data %>% + filter(store < 25, brand < 5) %>% + ggplot(aes(x=week, y=logmove)) + + geom_line() + + scale_x_date(labels=NULL) + + facet_grid(vars(store), vars(brand), labeller="label_both") +``` + +Finally, we split the dataset into separate samples for training and testing. 
The schema used is broadly time series cross-validation, whereby we train a model on data up to time $t$, test it on data for times $t+1$ to $t+k$, then train on data up to time $t+k$, test it on data for times $t+k+1$ to $t+2k$, and so on. In this specific case study, however, we introduce a small extra piece of complexity based on discussions with domain experts. We train a model on data up to week $t$, then test it on weeks $t+2$ to $t+3$. Then we train on data up to week $t+2$, and test it on weeks $t+4$ to $t+5$, and so on. There is thus always a gap of one week between the training and test samples. The reason for this is that, in reality, inventory planning always takes some time; the gap allows store managers to prepare the stock based on the forecasted demand. + +```{r} +subset_oj_data <- function(start, end) +{ + start <- yearweek(start_date + start*7) + end <- yearweek(start_date + end*7) + filter(oj_data, week >= start, week <= end) +} + +oj_train <- lapply(train_periods, function(i) subset_oj_data(settings$FIRST_WEEK, i)) +oj_test <- lapply(train_periods, function(i) subset_oj_data(i + settings$GAP, i + settings$GAP + settings$HORIZON - 1)) + +save(oj_train, oj_test, file=here::here("examples/grocery_sales/R/data.Rdata")) + +head(oj_train[[1]]) + +head(oj_test[[1]]) +``` diff --git a/R/orange_juice/01_dataprep.nb.html b/examples/grocery_sales/R/01_dataprep.nb.html similarity index 69% rename from R/orange_juice/01_dataprep.nb.html rename to examples/grocery_sales/R/01_dataprep.nb.html index 8378a894..bf37aa54 100644 --- a/R/orange_juice/01_dataprep.nb.html +++ b/examples/grocery_sales/R/01_dataprep.nb.html @@ -226,41 +226,91 @@ summary { +

Copyright (c) Microsoft Corporation.
Licensed under the MIT License.

+

In this notebook, we generate the datasets that will be used for model training and validating.

+

The orange juice dataset comes from the bayesm package, and gives pricing and sales figures over time for a variety of orange juice brands in several stores in Florida. Rather than installing the entire package (which is very complex), we download the dataset itself from the GitHub mirror of the CRAN repository.

+ +
# download the data from the GitHub mirror of the bayesm package source
+ojfile <- tempfile(fileext=".rda")
+download.file("https://github.com/cran/bayesm/raw/master/data/orangeJuice.rda", ojfile)
+load(ojfile)
+file.remove(ojfile)
+ -

In this notebook, we generate the datasets that will be used for model training and validating. The experiment parameters are obtained from the file ojdata_forecast_settings.json; you can modify that file to vary the experimental setup, or just edit the values in this notebook.

-

The orange juice dataset comes from the bayesm package, and gives pricing and sales figures over time for a variety of orange juice brands in several stores in Florida.

-

A complicating factor is that the data is in a hybrid of long and wide format: while the sales figures are long (one column of sales data for every store and brand), the prices are wide (one price column for each brand). Therefore we need to reshape the data if we want to use prices for modelling. As part of this, we also compute a new column maxpricediff: this represents the log-ratio of the price of this brand compared to the best competing price. A positive maxpricediff means this brand is cheaper than all the other brands, and a negative maxpricediff means it is more expensive.

+

The dataset generation parameters are obtained from the file forecast_settings.yaml; you can modify that file to vary the experimental setup. The settings are

+ +++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ParameterDescriptionDefault
N_SPLITSThe number of splits to make.10
HORIZONThe forecast horizon for the test dataset for each split.2
GAPThe gap in weeks from the end of the training period to the start of the testing period; see below.2
FIRST_WEEKThe first week of data to use.40
LAST_WEEKThe last week of data to use.156
START_DATEThe actual calendar date for the start of the first week in the data.1989-09-14
+

A complicating factor is that the data does not include every possible combination of store, brand and date, so we have to pad out the missing rows with complete. In addition, one store/brand combination has no data beyond week 156; we therefore end the analysis at this week. We also do not fill in the missing values in the data, as many of the modelling functions in the fable package can handle this innately.

- -
settings <- jsonlite::fromJSON("ojdata_forecast_settings.json")
+
+
library(tidyr)
+library(dplyr)
+library(tsibble)
+library(feasts)
+library(fable)
 
-train_periods <- seq(settings$TRAIN_WINDOW, 160 - settings$STEP - 1, settings$STEP)
+settings <- yaml::read_yaml(here::here("examples/grocery_sales/R/forecast_settings.yaml"))
 start_date <- as.Date(settings$START_DATE)
-
-data(orangeJuice, package="bayesm")
+train_periods <- seq(to=settings$LAST_WEEK - settings$HORIZON - settings$GAP + 1,
+                     by=settings$HORIZON,
+                     length.out=settings$N_SPLITS)
 
 oj_data <- orangeJuice$yx %>%
     complete(store, brand, week) %>%
-    group_by(store, brand) %>%
-    group_modify(~ {
-        pricevars <- grep("price", names(.x), value=TRUE)
-        thispricevar <- paste0("price", .y$brand)
-        best_other_price <- do.call(pmin, .x[setdiff(pricevars, thispricevar)])
-        .x$price <- .x[[thispricevar]]
-        .x$maxpricediff <- log(best_other_price/.x$price)
-        select(.x, week, logmove, deal, feat, price, maxpricediff)
-    }) %>%
-    ungroup() %>%
-    mutate(week=yearweek(start_date + week*7)) %>%  # do this separately because of tsibble/vctrs issues
+    mutate(week=yearweek(start_date + week*7)) %>%
     as_tsibble(index=week, key=c(store, brand))
-

Here are some glimpses of what the data looks like. The dependent variable is logmove, the logarithm of the total sales for a given brand and store, in a particular week. Note that we do not fill in the missing values in the data, as (with the exception of ETS) the modelling functions in the fable package can handle this innately.

+

Here are some glimpses of what the data looks like. The dependent variable is logmove, the logarithm of the total sales for a given brand and store, in a particular week.

@@ -268,34 +318,39 @@ oj_data <- orangeJuice$yx %>%
-

The time series plots for a small subset of brands and stores are shown below. It is clear that the statistical behaviour of the data varies by store and brand.

+

The time series plots for a small subset of brands and stores are shown below. We can make the following observations:

+
    +
  • There appears to be little seasonal variation in sales (probably because Florida is a state without very different seasons). In any case, with less than 2 years of observations, the time series is not long enough for many model-fitting functions in the fable package to automatically estimate seasonal parameters.
  • +
  • While some store/brand combinations show weak trends over time, this is far from universal.
  • +
  • Different brands can exhibit very different behaviour, especially in terms of variation about the mean.
  • +
  • Many of the time series have missing values, indicating that the dataset is incomplete.
  • +
- +
library(ggplot2)
 
 oj_data %>%
-    filter(store < 10, brand < 5) %>%
+    filter(store < 25, brand < 5) %>%
     ggplot(aes(x=week, y=logmove)) +
         geom_line() +
         scale_x_date(labels=NULL) +
         facet_grid(vars(store), vars(brand), labeller="label_both")
-

+

-

Finally, we split the dataset into separate samples for training and testing. The schema used is broadly time series cross-validation, whereby we train a model on data up to time \(t\), test it on data for times \(t+1\) to \(t+k\), then train on data up to time \(t+k\), test it on data for times \(t+k+1\) to \(t+2k\), and so on.

-

In this specific case study we introduce a small extra piece of complexity. We train a model on data up to month \(t\), then test it on months \(t+2\) to \(t+3\). Then we train on data up to month \(t+2\), and test it on months \(t+4\) to \(t+5\), and so on. Thus there is always a gap of one month between the training and test samples, a complicating factor introduced after discussions with domain experts.

+

Finally, we split the dataset into separate samples for training and testing. The schema used is broadly time series cross-validation, whereby we train a model on data up to time \(t\), test it on data for times \(t+1\) to \(t+k\), then train on data up to time \(t+k\), test it on data for times \(t+k+1\) to \(t+2k\), and so on. In this specific case study, however, we introduce a small extra piece of complexity based on discussions with domain experts. We train a model on data up to week \(t\), then test it on weeks \(t+2\) to \(t+3\). Then we train on data up to week \(t+2\), and test it on weeks \(t+4\) to \(t+5\), and so on. There is thus always a gap of one week between the training and test samples. The reason for this is that, in reality, inventory planning always takes some time; the gap allows store managers to prepare the stock based on the forecasted demand.

- +
subset_oj_data <- function(start, end)
 {
     start <- yearweek(start_date + start*7)
@@ -303,16 +358,16 @@ oj_data %>%
     filter(oj_data, week >= start, week <= end)
 }
 
-oj_train <- lapply(train_periods, function(i) subset_oj_data(40, i))
-oj_test <- lapply(train_periods, function(i) subset_oj_data(i + 2, i + settings$STEP + 1))
+oj_train <- lapply(train_periods, function(i) subset_oj_data(settings$FIRST_WEEK, i))
+oj_test <- lapply(train_periods, function(i) subset_oj_data(i + settings$GAP, i + settings$GAP + settings$HORIZON - 1))
 
-save(oj_train, oj_test, file="oj_data.Rdata")
+save(oj_train, oj_test, file=here::here("examples/grocery_sales/R/data.Rdata"))
 
 head(oj_train[[1]])
@@ -320,12 +375,12 @@ head(oj_train[[1]])
-
LS0tCnRpdGxlOiBEYXRhIHByZXBhcmF0aW9uCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCmBgYHtyLCBlY2hvPUZBTFNFLCByZXN1bHRzPSJoaWRlIiwgbWVzc2FnZT1GQUxTRX0KbGlicmFyeSh0aWR5cikKbGlicmFyeShkcGx5cikKbGlicmFyeSh0c2liYmxlKQpsaWJyYXJ5KGZlYXN0cykKbGlicmFyeShmYWJsZSkKYGBgCgpJbiB0aGlzIG5vdGVib29rLCB3ZSBnZW5lcmF0ZSB0aGUgZGF0YXNldHMgdGhhdCB3aWxsIGJlIHVzZWQgZm9yIG1vZGVsIHRyYWluaW5nIGFuZCB2YWxpZGF0aW5nLiBUaGUgZXhwZXJpbWVudCBwYXJhbWV0ZXJzIGFyZSBvYnRhaW5lZCBmcm9tIHRoZSBmaWxlIGBvamRhdGFfZm9yZWNhc3Rfc2V0dGluZ3MuanNvbmA7IHlvdSBjYW4gbW9kaWZ5IHRoYXQgZmlsZSB0byB2YXJ5IHRoZSBleHBlcmltZW50YWwgc2V0dXAsIG9yIGp1c3QgZWRpdCB0aGUgdmFsdWVzIGluIHRoaXMgbm90ZWJvb2suCgpUaGUgb3JhbmdlIGp1aWNlIGRhdGFzZXQgY29tZXMgZnJvbSB0aGUgYmF5ZXNtIHBhY2thZ2UsIGFuZCBnaXZlcyBwcmljaW5nIGFuZCBzYWxlcyBmaWd1cmVzIG92ZXIgdGltZSBmb3IgYSB2YXJpZXR5IG9mIG9yYW5nZSBqdWljZSBicmFuZHMgaW4gc2V2ZXJhbCBzdG9yZXMgaW4gRmxvcmlkYS4KCkEgY29tcGxpY2F0aW5nIGZhY3RvciBpcyB0aGF0IHRoZSBkYXRhIGlzIGluIGEgaHlicmlkIG9mIGxvbmcgYW5kIHdpZGUgZm9ybWF0OiB3aGlsZSB0aGUgc2FsZXMgZmlndXJlcyBhcmUgbG9uZyAob25lIGNvbHVtbiBvZiBzYWxlcyBkYXRhIGZvciBldmVyeSBzdG9yZSBhbmQgYnJhbmQpLCB0aGUgcHJpY2VzIGFyZSB3aWRlIChvbmUgcHJpY2UgY29sdW1uIGZvciBlYWNoIGJyYW5kKS4gVGhlcmVmb3JlIHdlIG5lZWQgdG8gcmVzaGFwZSB0aGUgZGF0YSBpZiB3ZSB3YW50IHRvIHVzZSBwcmljZXMgZm9yIG1vZGVsbGluZy4gQXMgcGFydCBvZiB0aGlzLCB3ZSBhbHNvIGNvbXB1dGUgYSBuZXcgY29sdW1uIGBtYXhwcmljZWRpZmZgOiB0aGlzIHJlcHJlc2VudHMgdGhlIGxvZy1yYXRpbyBvZiB0aGUgcHJpY2Ugb2YgdGhpcyBicmFuZCBjb21wYXJlZCB0byB0aGUgYmVzdCBjb21wZXRpbmcgcHJpY2UuIEEgcG9zaXRpdmUgYG1heHByaWNlZGlmZmAgbWVhbnMgdGhpcyBicmFuZCBpcyBjaGVhcGVyIHRoYW4gYWxsIHRoZSBvdGhlciBicmFuZHMsIGFuZCBhIG5lZ2F0aXZlIGBtYXhwcmljZWRpZmZgIG1lYW5zIGl0IGlzIG1vcmUgZXhwZW5zaXZlLgoKYGBge3J9CnNldHRpbmdzIDwtIGpzb25saXRlOjpmcm9tSlNPTigib2pkYXRhX2ZvcmVjYXN0X3NldHRpbmdzLmpzb24iKQoKdHJhaW5fcGVyaW9kcyA8LSBzZXEoc2V0dGluZ3MkVFJBSU5fV0lORE9XLCAxNjAgLSBzZXR0aW5ncyRTVEVQIC0gMSwgc2V0dGluZ3MkU1RFUCkKc3RhcnRfZGF0ZSA8LSBhcy5EYXRlKHNldHRpbmdzJFNUQVJUX0RBVEUpCgpkYXRhKG9yYW5nZUp1aWNlLCBwYWNrYWdlPSJiYXllc20iKQoKb2pfZGF0YSA8LSBvcmFuZ2VKdWljZSR5eCAl
PiUKICAgIGNvbXBsZXRlKHN0b3JlLCBicmFuZCwgd2VlaykgJT4lCiAgICBncm91cF9ieShzdG9yZSwgYnJhbmQpICU+JQogICAgZ3JvdXBfbW9kaWZ5KH4gewogICAgICAgIHByaWNldmFycyA8LSBncmVwKCJwcmljZSIsIG5hbWVzKC54KSwgdmFsdWU9VFJVRSkKICAgICAgICB0aGlzcHJpY2V2YXIgPC0gcGFzdGUwKCJwcmljZSIsIC55JGJyYW5kKQogICAgICAgIGJlc3Rfb3RoZXJfcHJpY2UgPC0gZG8uY2FsbChwbWluLCAueFtzZXRkaWZmKHByaWNldmFycywgdGhpc3ByaWNldmFyKV0pCiAgICAgICAgLngkcHJpY2UgPC0gLnhbW3RoaXNwcmljZXZhcl1dCiAgICAgICAgLngkbWF4cHJpY2VkaWZmIDwtIGxvZyhiZXN0X290aGVyX3ByaWNlLy54JHByaWNlKQogICAgICAgIHNlbGVjdCgueCwgd2VlaywgbG9nbW92ZSwgZGVhbCwgZmVhdCwgcHJpY2UsIG1heHByaWNlZGlmZikKICAgIH0pICU+JQogICAgdW5ncm91cCgpICU+JQogICAgbXV0YXRlKHdlZWs9eWVhcndlZWsoc3RhcnRfZGF0ZSArIHdlZWsqNykpICU+JSAgIyBkbyB0aGlzIHNlcGFyYXRlbHkgYmVjYXVzZSBvZiB0c2liYmxlL3ZjdHJzIGlzc3VlcwogICAgYXNfdHNpYmJsZShpbmRleD13ZWVrLCBrZXk9YyhzdG9yZSwgYnJhbmQpKQpgYGAKCkhlcmUgYXJlIHNvbWUgZ2xpbXBzZXMgb2Ygd2hhdCB0aGUgZGF0YSBsb29rcyBsaWtlLiBUaGUgZGVwZW5kZW50IHZhcmlhYmxlIGlzIGBsb2dtb3ZlYCwgdGhlIGxvZ2FyaXRobSBvZiB0aGUgdG90YWwgc2FsZXMgZm9yIGEgZ2l2ZW4gYnJhbmQgYW5kIHN0b3JlLCBpbiBhIHBhcnRpY3VsYXIgd2Vlay4gTm90ZSB0aGF0IHdlIGRvIF9ub3RfIGZpbGwgaW4gdGhlIG1pc3NpbmcgdmFsdWVzIGluIHRoZSBkYXRhLCBhcyAod2l0aCB0aGUgZXhjZXB0aW9uIG9mIGBFVFNgKSB0aGUgbW9kZWxsaW5nIGZ1bmN0aW9ucyBpbiB0aGUgZmFibGUgcGFja2FnZSBjYW4gaGFuZGxlIHRoaXMgaW5uYXRlbHkuCgpgYGB7cn0KaGVhZChval9kYXRhKQpgYGAKClRoZSB0aW1lIHNlcmllcyBwbG90cyBmb3IgYSBzbWFsbCBzdWJzZXQgb2YgYnJhbmRzIGFuZCBzdG9yZXMgYXJlIHNob3duIGJlbG93LiBJdCBpcyBjbGVhciB0aGF0IHRoZSBzdGF0aXN0aWNhbCBiZWhhdmlvdXIgb2YgdGhlIGRhdGEgdmFyaWVzIGJ5IHN0b3JlIGFuZCBicmFuZC4KCmBgYHtyfQpsaWJyYXJ5KGdncGxvdDIpCgpval9kYXRhICU+JQogICAgZmlsdGVyKHN0b3JlIDwgMTAsIGJyYW5kIDwgNSkgJT4lCiAgICBnZ3Bsb3QoYWVzKHg9d2VlaywgeT1sb2dtb3ZlKSkgKwogICAgICAgIGdlb21fbGluZSgpICsKICAgICAgICBzY2FsZV94X2RhdGUobGFiZWxzPU5VTEwpICsKICAgICAgICBmYWNldF9ncmlkKHZhcnMoc3RvcmUpLCB2YXJzKGJyYW5kKSwgbGFiZWxsZXI9ImxhYmVsX2JvdGgiKQpgYGAKCkZpbmFsbHksIHdlIHNwbGl0IHRoZSBkYXRhc2V0IGludG8gc2VwYXJhdGUgc2FtcGxlcyBmb3IgdHJhaW5pbmcgYW5kIHRlc3RpbmcuIFRoZSBzY2hlbWEgdXNlZCBpcyBi
cm9hZGx5IHRpbWUgc2VyaWVzIGNyb3NzLXZhbGlkYXRpb24sIHdoZXJlYnkgd2UgdHJhaW4gYSBtb2RlbCBvbiBkYXRhIHVwIHRvIHRpbWUgJHQkLCB0ZXN0IGl0IG9uIGRhdGEgZm9yIHRpbWVzICR0KzEkIHRvICR0K2skLCB0aGVuIHRyYWluIG9uIGRhdGEgdXAgdG8gdGltZSAkdCtrJCwgdGVzdCBpdCBvbiBkYXRhIGZvciB0aW1lcyAkdCtrKzEkIHRvICR0KzJrJCwgYW5kIHNvIG9uLgoKSW4gdGhpcyBzcGVjaWZpYyBjYXNlIHN0dWR5IHdlIGludHJvZHVjZSBhIHNtYWxsIGV4dHJhIHBpZWNlIG9mIGNvbXBsZXhpdHkuIFdlIHRyYWluIGEgbW9kZWwgb24gZGF0YSB1cCB0byBtb250aCAkdCQsIHRoZW4gdGVzdCBpdCBvbiBtb250aHMgJHQrMiQgdG8gJHQrMyQuIFRoZW4gd2UgdHJhaW4gb24gZGF0YSB1cCB0byBtb250aCAkdCsyJCwgYW5kIHRlc3QgaXQgb24gbW9udGhzICR0KzQkIHRvICR0KzUkLCBhbmQgc28gb24uIFRodXMgdGhlcmUgaXMgYWx3YXlzIGEgZ2FwIG9mIG9uZSBtb250aCBiZXR3ZWVuIHRoZSB0cmFpbmluZyBhbmQgdGVzdCBzYW1wbGVzLCBhIGNvbXBsaWNhdGluZyBmYWN0b3IgaW50cm9kdWNlZCBhZnRlciBkaXNjdXNzaW9ucyB3aXRoIGRvbWFpbiBleHBlcnRzLgoKYGBge3J9CnN1YnNldF9val9kYXRhIDwtIGZ1bmN0aW9uKHN0YXJ0LCBlbmQpCnsKICAgIHN0YXJ0IDwtIHllYXJ3ZWVrKHN0YXJ0X2RhdGUgKyBzdGFydCo3KQogICAgZW5kIDwtIHllYXJ3ZWVrKHN0YXJ0X2RhdGUgKyBlbmQqNykKICAgIGZpbHRlcihval9kYXRhLCB3ZWVrID49IHN0YXJ0LCB3ZWVrIDw9IGVuZCkKfQoKb2pfdHJhaW4gPC0gbGFwcGx5KHRyYWluX3BlcmlvZHMsIGZ1bmN0aW9uKGkpIHN1YnNldF9val9kYXRhKDQwLCBpKSkKb2pfdGVzdCA8LSBsYXBwbHkodHJhaW5fcGVyaW9kcywgZnVuY3Rpb24oaSkgc3Vic2V0X29qX2RhdGEoaSArIDIsIGkgKyBzZXR0aW5ncyRTVEVQICsgMSkpCgpzYXZlKG9qX3RyYWluLCBval90ZXN0LCBmaWxlPSJval9kYXRhLlJkYXRhIikKCmhlYWQob2pfdHJhaW5bWzFdXSkKCmhlYWQob2pfdGVzdFtbMV1dKQpgYGAK
+
LS0tCnRpdGxlOiBEYXRhIHByZXBhcmF0aW9uCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCl9Db3B5cmlnaHQgKGMpIE1pY3Jvc29mdCBDb3Jwb3JhdGlvbi5fPGJyLz4KX0xpY2Vuc2VkIHVuZGVyIHRoZSBNSVQgTGljZW5zZS5fCgpJbiB0aGlzIG5vdGVib29rLCB3ZSBnZW5lcmF0ZSB0aGUgZGF0YXNldHMgdGhhdCB3aWxsIGJlIHVzZWQgZm9yIG1vZGVsIHRyYWluaW5nIGFuZCB2YWxpZGF0aW5nLiAKClRoZSBvcmFuZ2UganVpY2UgZGF0YXNldCBjb21lcyBmcm9tIHRoZSBiYXllc20gcGFja2FnZSwgYW5kIGdpdmVzIHByaWNpbmcgYW5kIHNhbGVzIGZpZ3VyZXMgb3ZlciB0aW1lIGZvciBhIHZhcmlldHkgb2Ygb3JhbmdlIGp1aWNlIGJyYW5kcyBpbiBzZXZlcmFsIHN0b3JlcyBpbiBGbG9yaWRhLiBSYXRoZXIgdGhhbiBpbnN0YWxsaW5nIHRoZSBlbnRpcmUgcGFja2FnZSAod2hpY2ggaXMgdmVyeSBjb21wbGV4KSwgd2UgZG93bmxvYWQgdGhlIGRhdGFzZXQgaXRzZWxmIGZyb20gdGhlIEdpdEh1YiBtaXJyb3Igb2YgdGhlIENSQU4gcmVwb3NpdG9yeS4KCmBgYHtyLCByZXN1bHRzPSJoaWRlIiwgbWVzc2FnZT1GQUxTRX0KIyBkb3dubG9hZCB0aGUgZGF0YSBmcm9tIHRoZSBHaXRIdWIgbWlycm9yIG9mIHRoZSBiYXllc20gcGFja2FnZSBzb3VyY2UKb2pmaWxlIDwtIHRlbXBmaWxlKGZpbGVleHQ9Ii5yZGEiKQpkb3dubG9hZC5maWxlKCJodHRwczovL2dpdGh1Yi5jb20vY3Jhbi9iYXllc20vcmF3L21hc3Rlci9kYXRhL29yYW5nZUp1aWNlLnJkYSIsIG9qZmlsZSkKbG9hZChvamZpbGUpCmZpbGUucmVtb3ZlKG9qZmlsZSkKYGBgCgpUaGUgZGF0YXNldCBnZW5lcmF0aW9uIHBhcmFtZXRlcnMgYXJlIG9idGFpbmVkIGZyb20gdGhlIGZpbGUgYG9qZGF0YV9mb3JlY2FzdF9zZXR0aW5ncy55YW1sYDsgeW91IGNhbiBtb2RpZnkgdGhhdCBmaWxlIHRvIHZhcnkgdGhlIGV4cGVyaW1lbnRhbCBzZXR1cC4gVGhlIHNldHRpbmdzIGFyZQoKfCBQYXJhbWV0ZXIgfCBEZXNjcmlwdGlvbiB8IERlZmF1bHQgfCAKfC0tLS0tLS0tLS0tfC0tLS0tLS0tLS0tLS18LS0tLS0tLS0tfAp8IGBOX1NQTElUU2AgfCBUaGUgbnVtYmVyIG9mIHNwbGl0cyB0byBtYWtlLiB8IDEwIHwKfCBgSE9SSVpPTmAgfCBUaGUgZm9yZWNhc3QgaG9yaXpvbiBmb3IgdGhlIHRlc3QgZGF0YXNldCBmb3IgZWFjaCBzcGxpdC4gfCAyIHwKfCBgR0FQYCB8IFRoZSBnYXAgaW4gd2Vla3MgZnJvbSB0aGUgZW5kIG9mIHRoZSB0cmFpbmluZyBwZXJpb2QgdG8gdGhlIHN0YXJ0IG9mIHRoZSB0ZXN0aW5nIHBlcmlvZDsgc2VlIGJlbG93LiB8IDIgfAp8IGBGSVJTVF9XRUVLYCB8IFRoZSBmaXJzdCB3ZWVrIG9mIGRhdGEgdG8gdXNlLiB8IDQwIHwKfCBgTEFTVF9XRUVLYCB8IFRoZSBsYXN0IHdlZWsgb2YgZGF0YSB0byB1c2UuIHwgMTU2IHwKfCBgU1RBUlRfREFURWAgfCBUaGUgYWN0dWFsIGNhbGVuZGFyIGRhdGUgZm9yIHRoZSBzdGFydCBvZiB0aGUgZmlyc3Qgd2VlayBpbiB0aGUg
ZGF0YS4gfCBgMTk4OS0wOS0xNGAgfAoKQSBjb21wbGljYXRpbmcgZmFjdG9yIGlzIHRoYXQgdGhlIGRhdGEgZG9lcyBub3QgaW5jbHVkZSBldmVyeSBwb3NzaWJsZSBjb21iaW5hdGlvbiBvZiBzdG9yZSwgYnJhbmQgYW5kIGRhdGUsIHNvIHdlIGhhdmUgdG8gcGFkIG91dCB0aGUgbWlzc2luZyByb3dzIHdpdGggYGNvbXBsZXRlYC4gSW4gYWRkaXRpb24sIG9uZSBzdG9yZS9icmFuZCBjb21iaW5hdGlvbiBoYXMgbm8gZGF0YSBiZXlvbmQgd2VlayAxNTY7IHdlIHRoZXJlZm9yZSBlbmQgdGhlIGFuYWx5c2lzIGF0IHRoaXMgd2Vlay4gV2UgYWxzbyBkbyBfbm90XyBmaWxsIGluIHRoZSBtaXNzaW5nIHZhbHVlcyBpbiB0aGUgZGF0YSwgYXMgbWFueSBvZiB0aGUgbW9kZWxsaW5nIGZ1bmN0aW9ucyBpbiB0aGUgZmFibGUgcGFja2FnZSBjYW4gaGFuZGxlIHRoaXMgaW5uYXRlbHkuCgpgYGB7ciwgcmVzdWx0cz0iaGlkZSIsIG1lc3NhZ2U9RkFMU0V9CmxpYnJhcnkodGlkeXIpCmxpYnJhcnkoZHBseXIpCmxpYnJhcnkodHNpYmJsZSkKbGlicmFyeShmZWFzdHMpCmxpYnJhcnkoZmFibGUpCgpzZXR0aW5ncyA8LSB5YW1sOjpyZWFkX3lhbWwoaGVyZTo6aGVyZSgiZXhhbXBsZXMvZ3JvY2VyeV9zYWxlcy9SL2ZvcmVjYXN0X3NldHRpbmdzLnlhbWwiKSkKc3RhcnRfZGF0ZSA8LSBhcy5EYXRlKHNldHRpbmdzJFNUQVJUX0RBVEUpCnRyYWluX3BlcmlvZHMgPC0gc2VxKHRvPXNldHRpbmdzJExBU1RfV0VFSyAtIHNldHRpbmdzJEhPUklaT04gLSBzZXR0aW5ncyRHQVAgKyAxLAogICAgICAgICAgICAgICAgICAgICBieT1zZXR0aW5ncyRIT1JJWk9OLAogICAgICAgICAgICAgICAgICAgICBsZW5ndGgub3V0PXNldHRpbmdzJE5fU1BMSVRTKQoKb2pfZGF0YSA8LSBvcmFuZ2VKdWljZSR5eCAlPiUKICAgIGNvbXBsZXRlKHN0b3JlLCBicmFuZCwgd2VlaykgJT4lCiAgICBtdXRhdGUod2Vlaz15ZWFyd2VlayhzdGFydF9kYXRlICsgd2Vlayo3KSkgJT4lCiAgICBhc190c2liYmxlKGluZGV4PXdlZWssIGtleT1jKHN0b3JlLCBicmFuZCkpCmBgYAoKSGVyZSBhcmUgc29tZSBnbGltcHNlcyBvZiB3aGF0IHRoZSBkYXRhIGxvb2tzIGxpa2UuIFRoZSBkZXBlbmRlbnQgdmFyaWFibGUgaXMgYGxvZ21vdmVgLCB0aGUgbG9nYXJpdGhtIG9mIHRoZSB0b3RhbCBzYWxlcyBmb3IgYSBnaXZlbiBicmFuZCBhbmQgc3RvcmUsIGluIGEgcGFydGljdWxhciB3ZWVrLgoKYGBge3J9CmhlYWQob2pfZGF0YSkKYGBgCgpUaGUgdGltZSBzZXJpZXMgcGxvdHMgZm9yIGEgc21hbGwgc3Vic2V0IG9mIGJyYW5kcyBhbmQgc3RvcmVzIGFyZSBzaG93biBiZWxvdy4gV2UgY2FuIG1ha2UgdGhlIGZvbGxvd2luZyBvYnNlcnZhdGlvbnM6CgotIFRoZXJlIGFwcGVhcnMgdG8gYmUgbGl0dGxlIHNlYXNvbmFsIHZhcmlhdGlvbiBpbiBzYWxlcyAocHJvYmFibHkgYmVjYXVzZSBGbG9yaWRhIGlzIGEgc3RhdGUgd2l0aG91dCB2ZXJ5IGRpZmZlcmVudCBzZWFzb25zKS4gSW4gYW55IGNhc2Us
IHdpdGggbGVzcyB0aGFuIDIgeWVhcnMgb2Ygb2JzZXJ2YXRpb25zLCB0aGUgdGltZSBzZXJpZXMgaXMgbm90IGxvbmcgZW5vdWdoIGZvciBtYW55IG1vZGVsLWZpdHRpbmcgZnVuY3Rpb25zIGluIHRoZSBmYWJsZSBwYWNrYWdlIHRvIGF1dG9tYXRpY2FsbHkgZXN0aW1hdGUgc2Vhc29uYWwgcGFyYW1ldGVycy4KLSBXaGlsZSBzb21lIHN0b3JlL2JyYW5kIGNvbWJpbmF0aW9ucyBzaG93IHdlYWsgdHJlbmRzIG92ZXIgdGltZSwgdGhpcyBpcyBmYXIgZnJvbSB1bml2ZXJzYWwuCi0gRGlmZmVyZW50IGJyYW5kcyBjYW4gZXhoaWJpdCB2ZXJ5IGRpZmZlcmVudCBiZWhhdmlvdXIsIGVzcGVjaWFsbHkgaW4gdGVybXMgb2YgdmFyaWF0aW9uIGFib3V0IHRoZSBtZWFuLgotIE1hbnkgb2YgdGhlIHRpbWUgc2VyaWVzIGhhdmUgbWlzc2luZyB2YWx1ZXMsIGluZGljYXRpbmcgdGhhdCB0aGUgZGF0YXNldCBpcyBpbmNvbXBsZXRlLgoKCmBgYHtyLCBmaWcuaGVpZ2h0PTEwfQpsaWJyYXJ5KGdncGxvdDIpCgpval9kYXRhICU+JQogICAgZmlsdGVyKHN0b3JlIDwgMjUsIGJyYW5kIDwgNSkgJT4lCiAgICBnZ3Bsb3QoYWVzKHg9d2VlaywgeT1sb2dtb3ZlKSkgKwogICAgICAgIGdlb21fbGluZSgpICsKICAgICAgICBzY2FsZV94X2RhdGUobGFiZWxzPU5VTEwpICsKICAgICAgICBmYWNldF9ncmlkKHZhcnMoc3RvcmUpLCB2YXJzKGJyYW5kKSwgbGFiZWxsZXI9ImxhYmVsX2JvdGgiKQpgYGAKCkZpbmFsbHksIHdlIHNwbGl0IHRoZSBkYXRhc2V0IGludG8gc2VwYXJhdGUgc2FtcGxlcyBmb3IgdHJhaW5pbmcgYW5kIHRlc3RpbmcuIFRoZSBzY2hlbWEgdXNlZCBpcyBicm9hZGx5IHRpbWUgc2VyaWVzIGNyb3NzLXZhbGlkYXRpb24sIHdoZXJlYnkgd2UgdHJhaW4gYSBtb2RlbCBvbiBkYXRhIHVwIHRvIHRpbWUgJHQkLCB0ZXN0IGl0IG9uIGRhdGEgZm9yIHRpbWVzICR0KzEkIHRvICR0K2skLCB0aGVuIHRyYWluIG9uIGRhdGEgdXAgdG8gdGltZSAkdCtrJCwgdGVzdCBpdCBvbiBkYXRhIGZvciB0aW1lcyAkdCtrKzEkIHRvICR0KzJrJCwgYW5kIHNvIG9uLiBJbiB0aGlzIHNwZWNpZmljIGNhc2Ugc3R1ZHksIGhvd2V2ZXIsIHdlIGludHJvZHVjZSBhIHNtYWxsIGV4dHJhIHBpZWNlIG9mIGNvbXBsZXhpdHkgYmFzZWQgb24gZGlzY3Vzc2lvbnMgd2l0aCBkb21haW4gZXhwZXJ0cy4gV2UgdHJhaW4gYSBtb2RlbCBvbiBkYXRhIHVwIHRvIHdlZWsgJHQkLCB0aGVuIHRlc3QgaXQgb24gd2VlayAkdCsyJCB0byAkdCszJC4gVGhlbiB3ZSB0cmFpbiBvbiBkYXRhIHVwIHRvIHdlZWsgJHQrMiQsIGFuZCB0ZXN0IGl0IG9uIHdlZWtzICR0KzQkIHRvICR0KzUkLCBhbmQgc28gb24uIFRoZXJlIGlzIHRodXMgYWx3YXlzIGEgZ2FwIG9mIG9uZSB3ZWVrIGJldHdlZW4gdGhlIHRyYWluaW5nIGFuZCB0ZXN0IHNhbXBsZXMuIFRoZSByZWFzb24gZm9yIHRoaXMgaXMgYmVjYXVzZSBpbiByZWFsaXR5LCBpbnZlbnRvcnkgcGxhbm5pbmcgYWx3YXlzIHRha2VzIHNvbWUgdGlt
ZTsgdGhlIGdhcCBhbGxvd3Mgc3RvcmUgbWFuYWdlcnMgdG8gcHJlcGFyZSB0aGUgc3RvY2sgYmFzZWQgb24gdGhlIGZvcmVjYXN0ZWQgZGVtYW5kLgoKYGBge3J9CnN1YnNldF9val9kYXRhIDwtIGZ1bmN0aW9uKHN0YXJ0LCBlbmQpCnsKICAgIHN0YXJ0IDwtIHllYXJ3ZWVrKHN0YXJ0X2RhdGUgKyBzdGFydCo3KQogICAgZW5kIDwtIHllYXJ3ZWVrKHN0YXJ0X2RhdGUgKyBlbmQqNykKICAgIGZpbHRlcihval9kYXRhLCB3ZWVrID49IHN0YXJ0LCB3ZWVrIDw9IGVuZCkKfQoKb2pfdHJhaW4gPC0gbGFwcGx5KHRyYWluX3BlcmlvZHMsIGZ1bmN0aW9uKGkpIHN1YnNldF9val9kYXRhKHNldHRpbmdzJEZJUlNUX1dFRUssIGkpKQpval90ZXN0IDwtIGxhcHBseSh0cmFpbl9wZXJpb2RzLCBmdW5jdGlvbihpKSBzdWJzZXRfb2pfZGF0YShpICsgc2V0dGluZ3MkR0FQLCBpICsgc2V0dGluZ3MkR0FQICsgc2V0dGluZ3MkSE9SSVpPTiAtIDEpKQoKc2F2ZShval90cmFpbiwgb2pfdGVzdCwgZmlsZT1oZXJlOjpoZXJlKCJleGFtcGxlcy9ncm9jZXJ5X3NhbGVzL1IvZGF0YS5SZGF0YSIpKQoKaGVhZChval90cmFpbltbMV1dKQoKaGVhZChval90ZXN0W1sxXV0pCmBgYAo=
diff --git a/examples/grocery_sales/R/02_basic_models.Rmd b/examples/grocery_sales/R/02_basic_models.Rmd new file mode 100644 index 00000000..6dda9451 --- /dev/null +++ b/examples/grocery_sales/R/02_basic_models.Rmd @@ -0,0 +1,87 @@ +--- +title: Basic models +output: html_notebook +--- + +_Copyright (c) Microsoft Corporation._
+_Licensed under the MIT License._ + +```{r, echo=FALSE, results="hide", message=FALSE} +library(tidyr) +library(dplyr) +library(tsibble) +library(feasts) +library(fable) +``` + +We fit some simple models to the orange juice data for illustrative purposes. Here, each model is actually a _group_ of models, one for each combination of store and brand. This is the standard approach taken in statistical forecasting, and is supported out-of-the-box by the tidyverts framework. + +- `mean`: This is just a simple mean. +- `naive`: A random walk model without any other components. This amounts to setting all forecast values to the last observed value. +- `drift`: This adjusts the `naive` model to incorporate a straight-line trend. +- `arima`: An ARIMA model with the parameter values estimated from the data. + +Note that the model training process is embarrassingly parallel on 3 levels: + +- We have multiple independent training datasets; +- For which we fit multiple independent models; +- Within which we have independent sub-models for each store and brand. + +This lets us speed up the training significantly. While the `fable::model` function can fit multiple models in parallel, we will run it sequentially here and instead parallelise by dataset. This avoids contention for cores, and also results in the simplest code. As a guard against returning invalid results, we also specify the argument `.safely=FALSE`; this forces `model` to throw an error if a model algorithm fails. 
+ +```{r} +srcdir <- here::here("R_utils") +for(src in dir(srcdir, full.names=TRUE)) source(src) + +load_objects("grocery_sales", "data.Rdata") + +cl <- make_cluster(libs=c("tidyr", "dplyr", "fable", "tsibble", "feasts")) + +oj_modelset_basic <- parallel::parLapply(cl, oj_train, function(df) +{ + model(df, + mean=MEAN(logmove), + naive=NAIVE(logmove), + drift=RW(logmove ~ drift()), + arima=ARIMA(logmove ~ pdq() + PDQ(0, 0, 0)), + .safely=FALSE + ) +}) +oj_fcast_basic <- parallel::clusterMap(cl, get_forecasts, oj_modelset_basic, oj_test) + +save_objects(oj_modelset_basic, oj_fcast_basic, + example="grocery_sales", file="model_basic.Rdata") + +do.call(rbind, oj_fcast_basic) %>% + mutate_at(-(1:3), exp) %>% + eval_forecasts() +``` + +The ARIMA model does the best of the simple models, but not any better than a simple mean. + +Having fit some basic models, we can also try an exponential smoothing model, fit using the `ETS` function. Unlike the others, `ETS` does not currently support time series with missing values; we therefore have to use one of the other models to impute missing values first via the `interpolate` function. + +```{r} +oj_modelset_ets <- parallel::clusterMap(cl, function(df, basicmod) +{ + df %>% + interpolate(object=select(basicmod, -c(mean, naive, drift))) %>% + model( + ets=ETS(logmove ~ error("A") + trend("A") + season("N")), + .safely=FALSE + ) +}, oj_train, oj_modelset_basic) + +oj_fcast_ets <- parallel::clusterMap(cl, get_forecasts, oj_modelset_ets, oj_test) + +destroy_cluster(cl) + +save_objects(oj_modelset_ets, oj_fcast_ets, + example="grocery_sales", file="model_ets.Rdata") + +do.call(rbind, oj_fcast_ets) %>% + mutate_at(-(1:3), exp) %>% + eval_forecasts() +``` + +The ETS model does _worse_ than the ARIMA model, something that should not be a surprise given the lack of strong seasonality and trend in this dataset. We conclude that any simple univariate approach is unlikely to do well. 
diff --git a/R/orange_juice/03_model_eval.nb.html b/examples/grocery_sales/R/02_basic_models.nb.html similarity index 98% rename from R/orange_juice/03_model_eval.nb.html rename to examples/grocery_sales/R/02_basic_models.nb.html index 3540448f..6f91ee65 100644 --- a/R/orange_juice/03_model_eval.nb.html +++ b/examples/grocery_sales/R/02_basic_models.nb.html @@ -11,7 +11,7 @@ -Model evaluation +Basic models @@ -220,71 +220,104 @@ summary { -

Model evaluation

+

Basic models

+

Copyright (c) Microsoft Corporation.
Licensed under the MIT License.

-

Having fit the models, let’s examine their rolling goodness of fit, using the MAPE (mean absolute percentage error) metric.

-

First, we compute the forecasts for each dataset and model, again in parallel.

+

We fit some simple models to the orange juice data for illustrative purposes. Here, each model is actually a group of models, one for each combination of store and brand. This is the standard approach taken in statistical forecasting, and is supported out-of-the-box by the tidyverts framework.

+
    +
  • mean: This is just a simple mean.
  • +
  • naive: A random walk model without any other components. This amounts to setting all forecast values to the last observed value.
  • +
  • drift: This adjusts the naive model to incorporate a straight-line trend.
  • +
  • arima: An ARIMA model with the parameter values estimated from the data.
  • +
+

Note that the model training process is embarrassingly parallel on 3 levels:

+
    +
  • We have multiple independent training datasets;
  • +
  • For which we fit multiple independent models;
  • +
  • Within which we have independent sub-models for each store and brand.
  • +
+

This lets us speed up the training significantly. While the fable::model function can fit multiple models in parallel, we will run it sequentially here and instead parallelise by dataset. This avoids contention for cores, and also results in the simplest code. As a guard against returning invalid results, we also specify the argument .safely=FALSE; this forces model to throw an error if a model algorithm fails.

- -
for(f in dir(pattern="Rdata$"))
-    load(f)
+
+
srcdir <- here::here("R_utils")  # directory of shared helper scripts
+for(src in dir(srcdir, full.names=TRUE)) source(src)  # source every file in R_utils; load_objects, make_cluster and get_forecasts are presumably defined there -- confirm
 
-ncores <- max(2, parallel::detectCores(logical=FALSE) - 2)
-cl <- parallel::makeCluster(ncores)
-parallel::clusterEvalQ(cl,
+load_objects("grocery_sales", "data.Rdata")  # presumably restores oj_train and oj_test written by the data preparation notebook -- confirm against load_objects
+
+cl <- make_cluster(libs=c("tidyr", "dplyr", "fable", "tsibble", "feasts"))  # project helper; libs presumably names the packages to attach on each worker -- confirm
+
+oj_modelset_basic <- parallel::parLapply(cl, oj_train, function(df)  # fit the basic models to each training split, one split per parallel task
 {
-    library(feasts)
-    library(fable)
-    library(tsibble)
-})
- - -
fcast_sets <- lapply(ls(pattern="^oj_modelset"), function(mod)
-    parallel::clusterMap(cl, function(mod, df) forecast(mod, df), get(mod), oj_test)
-)
+    model(df,
+        mean=MEAN(logmove),  # simple mean of logmove
+        naive=NAIVE(logmove),  # random walk: every forecast equals the last observed value
+        drift=RW(logmove ~ drift()),  # naive model plus a straight-line trend
+        arima=ARIMA(logmove ~ pdq() + PDQ(0, 0, 0)),  # pdq() is given no orders, so they are estimated from the data; PDQ(0, 0, 0) fixes the seasonal orders at zero
+        .safely=FALSE  # make model() throw an error if a model algorithm fails, instead of returning an invalid result
+    )
+})
+oj_fcast_basic <- parallel::clusterMap(cl, get_forecasts, oj_modelset_basic, oj_test)  # call get_forecasts on each fitted model set together with its matching test split
 
-parallel::stopCluster(cl)
- - - -

Next, we compute the MAPE for each model. It is apparent that adding independent variables as regressors improves the quality of the fit substantially. Adding a simple trend does not improve the fit, indicating that the level of sales does not appear to change over time (at least over the period included in the data).

- - - -
orig <- do.call(rbind, oj_test) %>%
-    as_tibble() %>%
-    select(store, brand, week, logmove) %>%
-    mutate(move=exp(logmove))
+save_objects(oj_modelset_basic, oj_fcast_basic,
+             example="grocery_sales", file="model_basic.Rdata")
 
-gof <- function(fcast_data)
-{
-    fcast_data <- do.call(rbind, fcast_data) %>%
-        as_tibble() %>%
-        select(store, brand, week, .model, logmove) %>%
-        pivot_wider(id_cols=c(store, brand, week), names_from=.model, values_from=logmove) %>%
-        select(-store, -brand, -week) %>%
-        summarise_all(function(x) MAPE(exp(x) - orig$move, orig$move))
-}
-
-lapply(fcast_sets, gof) %>% bind_cols()
+do.call(rbind, oj_fcast_basic) %>% + mutate_at(-(1:3), exp) %>% + eval_forecasts()
+ +

The ARIMA model does the best of the simple models, but not any better than a simple mean.

+

Having fit some basic models, we can also try an exponential smoothing model, fit using the ETS function. Unlike the others, ETS does not currently support time series with missing values; we therefore have to use one of the other models to impute missing values first via the interpolate function.

+ + + +
oj_modelset_ets <- parallel::clusterMap(cl, function(df, basicmod)  # fit an ETS model set for each training split, given that split's basic model set
+{
+    df %>%
+        interpolate(object=select(basicmod, -c(mean, naive, drift))) %>%  # ETS cannot handle missing values: drop the mean/naive/drift columns and impute the gaps with the remaining basic model
+        model(
+            ets=ETS(logmove ~ error("A") + trend("A") + season("N")),  # additive error, additive trend, no seasonal component
+            .safely=FALSE  # make model() throw an error if fitting fails, instead of returning an invalid result
+        )
+}, oj_train, oj_modelset_basic)  # clusterMap passes the i-th elements of oj_train and oj_modelset_basic to the function together
 
-
LS0tCnRpdGxlOiBNb2RlbCBldmFsdWF0aW9uCm91dHB1dDogaHRtbF9ub3RlYm9vawplbmNvZGluZzogdXRmOAotLS0KCmBgYHtyLCBlY2hvPUZBTFNFLCByZXN1bHRzPSJoaWRlIiwgbWVzc2FnZT1GQUxTRX0KbGlicmFyeSh0aWR5cikKbGlicmFyeShkcGx5cikKbGlicmFyeSh0c2liYmxlKQpsaWJyYXJ5KGZlYXN0cykKbGlicmFyeShmYWJsZSkKYGBgCgpIYXZpbmcgZml0IHRoZSBtb2RlbHMsIGxldCdzIGV4YW1pbmUgdGhlaXIgcm9sbGluZyBnb29kbmVzcyBvZiBmaXQsIHVzaW5nIHRoZSBNQVBFIChtZWFuIGFic29sdXRlIHBlcmNlbnRhZ2UgZXJyb3IpIG1ldHJpYy4KCkZpcnN0LCB3ZSBjb21wdXRlIHRoZSBmb3JlY2FzdHMgZm9yIGVhY2ggZGF0YXNldCBhbmQgbW9kZWwsIGFnYWluIGluIHBhcmFsbGVsLgoKYGBge3IsIHJlc3VsdHM9ImhpZGUifQpmb3IoZiBpbiBkaXIocGF0dGVybj0iUmRhdGEkIikpCiAgICBsb2FkKGYpCgpuY29yZXMgPC0gbWF4KDIsIHBhcmFsbGVsOjpkZXRlY3RDb3Jlcyhsb2dpY2FsPUZBTFNFKSAtIDIpCmNsIDwtIHBhcmFsbGVsOjptYWtlQ2x1c3RlcihuY29yZXMpCnBhcmFsbGVsOjpjbHVzdGVyRXZhbFEoY2wsCnsKICAgIGxpYnJhcnkoZmVhc3RzKQogICAgbGlicmFyeShmYWJsZSkKICAgIGxpYnJhcnkodHNpYmJsZSkKfSkKCmZjYXN0X3NldHMgPC0gbGFwcGx5KGxzKHBhdHRlcm49Il5val9tb2RlbHNldCIpLCBmdW5jdGlvbihtb2QpCiAgICBwYXJhbGxlbDo6Y2x1c3Rlck1hcChjbCwgZnVuY3Rpb24obW9kLCBkZikgZm9yZWNhc3QobW9kLCBkZiksIGdldChtb2QpLCBval90ZXN0KQopCgpwYXJhbGxlbDo6c3RvcENsdXN0ZXIoY2wpCmBgYAoKTmV4dCwgd2UgY29tcHV0ZSB0aGUgTUFQRSBmb3IgZWFjaCBtb2RlbC4gSXQgaXMgYXBwYXJlbnQgdGhhdCBhZGRpbmcgaW5kZXBlbmRlbnQgdmFyaWFibGVzIGFzIHJlZ3Jlc3NvcnMgaW1wcm92ZXMgdGhlIHF1YWxpdHkgb2YgdGhlIGZpdCBzdWJzdGFudGlhbGx5LiBBZGRpbmcgYSBzaW1wbGUgdHJlbmQgZG9lcyBfbm90XyBpbXByb3ZlIHRoZSBmaXQsIGluZGljYXRpbmcgdGhhdCB0aGUgbGV2ZWwgb2Ygc2FsZXMgZG9lcyBub3QgYXBwZWFyIHRvIGNoYW5nZSBvdmVyIHRpbWUgKGF0IGxlYXN0IG92ZXIgdGhlIHBlcmlvZCBpbmNsdWRlZCBpbiB0aGUgZGF0YSkuCgpgYGB7cn0Kb3JpZyA8LSBkby5jYWxsKHJiaW5kLCBval90ZXN0KSAlPiUKICAgIGFzX3RpYmJsZSgpICU+JQogICAgc2VsZWN0KHN0b3JlLCBicmFuZCwgd2VlaywgbG9nbW92ZSkgJT4lCiAgICBtdXRhdGUobW92ZT1leHAobG9nbW92ZSkpCgpnb2YgPC0gZnVuY3Rpb24oZmNhc3RfZGF0YSkKewogICAgZmNhc3RfZGF0YSA8LSBkby5jYWxsKHJiaW5kLCBmY2FzdF9kYXRhKSAlPiUKICAgICAgICBhc190aWJibGUoKSAlPiUKICAgICAgICBzZWxlY3Qoc3RvcmUsIGJyYW5kLCB3ZWVrLCAubW9kZWwsIGxvZ21vdmUpICU+JQogICAgICAgIHBpdm90X3dpZGVyKGlkX2NvbHM9YyhzdG9y
ZSwgYnJhbmQsIHdlZWspLCBuYW1lc19mcm9tPS5tb2RlbCwgdmFsdWVzX2Zyb209bG9nbW92ZSkgJT4lCiAgICAgICAgc2VsZWN0KC1zdG9yZSwgLWJyYW5kLCAtd2VlaykgJT4lCiAgICAgICAgc3VtbWFyaXNlX2FsbChmdW5jdGlvbih4KSBNQVBFKGV4cCh4KSAtIG9yaWckbW92ZSwgb3JpZyRtb3ZlKSkKfQoKbGFwcGx5KGZjYXN0X3NldHMsIGdvZikgJT4lIGJpbmRfY29scygpCmBgYAo=
+oj_fcast_ets <- parallel::clusterMap(cl, get_forecasts, oj_modelset_ets, oj_test) + +destroy_cluster(cl) + +save_objects(oj_modelset_ets, oj_fcast_ets, + example="grocery_sales", file="model_ets.Rdata") + +do.call(rbind, oj_fcast_ets) %>% + mutate_at(-(1:3), exp) %>% + eval_forecasts()
+ +
+ +
+ + +

The ETS model does worse than the ARIMA model, something that should not be a surprise given the lack of strong seasonality and trend in this dataset. We conclude that any simple univariate approach is unlikely to do well.

+ + +
LS0tCnRpdGxlOiBCYXNpYyBtb2RlbHMKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKX0NvcHlyaWdodCAoYykgTWljcm9zb2Z0IENvcnBvcmF0aW9uLl88YnIvPgpfTGljZW5zZWQgdW5kZXIgdGhlIE1JVCBMaWNlbnNlLl8KCmBgYHtyLCBlY2hvPUZBTFNFLCByZXN1bHRzPSJoaWRlIiwgbWVzc2FnZT1GQUxTRX0KbGlicmFyeSh0aWR5cikKbGlicmFyeShkcGx5cikKbGlicmFyeSh0c2liYmxlKQpsaWJyYXJ5KGZlYXN0cykKbGlicmFyeShmYWJsZSkKYGBgCgpXZSBmaXQgc29tZSBzaW1wbGUgbW9kZWxzIHRvIHRoZSBvcmFuZ2UganVpY2UgZGF0YSBmb3IgaWxsdXN0cmF0aXZlIHB1cnBvc2VzLiBIZXJlLCBlYWNoIG1vZGVsIGlzIGFjdHVhbGx5IGEgX2dyb3VwXyBvZiBtb2RlbHMsIG9uZSBmb3IgZWFjaCBjb21iaW5hdGlvbiBvZiBzdG9yZSBhbmQgYnJhbmQuIFRoaXMgaXMgdGhlIHN0YW5kYXJkIGFwcHJvYWNoIHRha2VuIGluIHN0YXRpc3RpY2FsIGZvcmVjYXN0aW5nLCBhbmQgaXMgc3VwcG9ydGVkIG91dC1vZi10aGUtYm94IGJ5IHRoZSB0aWR5dmVydHMgZnJhbWV3b3JrLgoKLSBgbWVhbmA6IFRoaXMgaXMganVzdCBhIHNpbXBsZSBtZWFuLgotIGBuYWl2ZWA6IEEgcmFuZG9tIHdhbGsgbW9kZWwgd2l0aG91dCBhbnkgb3RoZXIgY29tcG9uZW50cy4gVGhpcyBhbW91bnRzIHRvIHNldHRpbmcgYWxsIGZvcmVjYXN0IHZhbHVlcyB0byB0aGUgbGFzdCBvYnNlcnZlZCB2YWx1ZS4KLSBgZHJpZnRgOiBUaGlzIGFkanVzdHMgdGhlIGBuYWl2ZWAgbW9kZWwgdG8gaW5jb3Jwb3JhdGUgYSBzdHJhaWdodC1saW5lIHRyZW5kLgotIGBhcmltYWA6IEFuIEFSSU1BIG1vZGVsIHdpdGggdGhlIHBhcmFtZXRlciB2YWx1ZXMgZXN0aW1hdGVkIGZyb20gdGhlIGRhdGEuCgpOb3RlIHRoYXQgdGhlIG1vZGVsIHRyYWluaW5nIHByb2Nlc3MgaXMgZW1iYXJyYXNzaW5nbHkgcGFyYWxsZWwgb24gMyBsZXZlbHM6CgotIFdlIGhhdmUgbXVsdGlwbGUgaW5kZXBlbmRlbnQgdHJhaW5pbmcgZGF0YXNldHM7Ci0gRm9yIHdoaWNoIHdlIGZpdCBtdWx0aXBsZSBpbmRlcGVuZGVudCBtb2RlbHM7Ci0gV2l0aGluIHdoaWNoIHdlIGhhdmUgaW5kZXBlbmRlbnQgc3ViLW1vZGVscyBmb3IgZWFjaCBzdG9yZSBhbmQgYnJhbmQuCgpUaGlzIGxldHMgdXMgc3BlZWQgdXAgdGhlIHRyYWluaW5nIHNpZ25pZmljYW50bHkuIFdoaWxlIHRoZSBgZmFibGU6Om1vZGVsYCBmdW5jdGlvbiBjYW4gZml0IG11bHRpcGxlIG1vZGVscyBpbiBwYXJhbGxlbCwgd2Ugd2lsbCBydW4gaXQgc2VxdWVudGlhbGx5IGhlcmUgYW5kIGluc3RlYWQgcGFyYWxsZWxpc2UgYnkgZGF0YXNldC4gVGhpcyBhdm9pZHMgY29udGVudGlvbiBmb3IgY29yZXMsIGFuZCBhbHNvIHJlc3VsdHMgaW4gdGhlIHNpbXBsZXN0IGNvZGUuIEFzIGEgZ3VhcmQgYWdhaW5zdCByZXR1cm5pbmcgaW52YWxpZCByZXN1bHRzLCB3ZSBhbHNvIHNwZWNpZnkgdGhlIGFyZ3VtZW50IGAuc2FmZWx5PUZBTFNFYDsgdGhpcyBm
b3JjZXMgYG1vZGVsYCB0byB0aHJvdyBhbiBlcnJvciBpZiBhIG1vZGVsIGFsZ29yaXRobSBmYWlscy4KCmBgYHtyfQpzcmNkaXIgPC0gaGVyZTo6aGVyZSgiUl91dGlscyIpCmZvcihzcmMgaW4gZGlyKHNyY2RpciwgZnVsbC5uYW1lcz1UUlVFKSkgc291cmNlKHNyYykKCmxvYWRfb2JqZWN0cygiZ3JvY2VyeV9zYWxlcyIsICJkYXRhLlJkYXRhIikKCmNsIDwtIG1ha2VfY2x1c3RlcihsaWJzPWMoInRpZHlyIiwgImRwbHlyIiwgImZhYmxlIiwgInRzaWJibGUiLCAiZmVhc3RzIikpCgpval9tb2RlbHNldF9iYXNpYyA8LSBwYXJhbGxlbDo6cGFyTGFwcGx5KGNsLCBval90cmFpbiwgZnVuY3Rpb24oZGYpCnsKICAgIG1vZGVsKGRmLAogICAgICAgIG1lYW49TUVBTihsb2dtb3ZlKSwKICAgICAgICBuYWl2ZT1OQUlWRShsb2dtb3ZlKSwKICAgICAgICBkcmlmdD1SVyhsb2dtb3ZlIH4gZHJpZnQoKSksCiAgICAgICAgYXJpbWE9QVJJTUEobG9nbW92ZSB+IHBkcSgpICsgUERRKDAsIDAsIDApKSwKICAgICAgICAuc2FmZWx5PUZBTFNFCiAgICApCn0pCm9qX2ZjYXN0X2Jhc2ljIDwtIHBhcmFsbGVsOjpjbHVzdGVyTWFwKGNsLCBnZXRfZm9yZWNhc3RzLCBval9tb2RlbHNldF9iYXNpYywgb2pfdGVzdCkKCnNhdmVfb2JqZWN0cyhval9tb2RlbHNldF9iYXNpYywgb2pfZmNhc3RfYmFzaWMsCiAgICAgICAgICAgICBleGFtcGxlPSJncm9jZXJ5X3NhbGVzIiwgZmlsZT0ibW9kZWxfYmFzaWMuUmRhdGEiKQoKZG8uY2FsbChyYmluZCwgb2pfZmNhc3RfYmFzaWMpICU+JQogICAgbXV0YXRlX2F0KC0oMTozKSwgZXhwKSAlPiUKICAgIGV2YWxfZm9yZWNhc3RzKCkKYGBgCgpUaGUgQVJJTUEgbW9kZWwgZG9lcyB0aGUgYmVzdCBvZiB0aGUgc2ltcGxlIG1vZGVscywgYnV0IG5vdCBhbnkgYmV0dGVyIHRoYW4gYSBzaW1wbGUgbWVhbi4KCkhhdmluZyBmaXQgc29tZSBiYXNpYyBtb2RlbHMsIHdlIGNhbiBhbHNvIHRyeSBhbiBleHBvbmVudGlhbCBzbW9vdGhpbmcgbW9kZWwsIGZpdCB1c2luZyB0aGUgYEVUU2AgZnVuY3Rpb24uIFVubGlrZSB0aGUgb3RoZXJzLCBgRVRTYCBkb2VzIG5vdCBjdXJyZW50bHkgc3VwcG9ydCB0aW1lIHNlcmllcyB3aXRoIG1pc3NpbmcgdmFsdWVzOyB3ZSB0aGVyZWZvcmUgaGF2ZSB0byB1c2Ugb25lIG9mIHRoZSBvdGhlciBtb2RlbHMgdG8gaW1wdXRlIG1pc3NpbmcgdmFsdWVzIGZpcnN0IHZpYSB0aGUgYGludGVycG9sYXRlYCBmdW5jdGlvbi4KCmBgYHtyfQpval9tb2RlbHNldF9ldHMgPC0gcGFyYWxsZWw6OmNsdXN0ZXJNYXAoY2wsIGZ1bmN0aW9uKGRmLCBiYXNpY21vZCkKewogICAgZGYgJT4lCiAgICAgICAgaW50ZXJwb2xhdGUob2JqZWN0PXNlbGVjdChiYXNpY21vZCwgLWMobWVhbiwgbmFpdmUsIGRyaWZ0KSkpICU+JQogICAgICAgIG1vZGVsKAogICAgICAgICAgICBldHM9RVRTKGxvZ21vdmUgfiBlcnJvcigiQSIpICsgdHJlbmQoIkEiKSArIHNlYXNvbigiTiIpKSwKICAgICAgICAgICAgLnNhZmVseT1GQUxTRQog
ICAgICAgICkKfSwgb2pfdHJhaW4sIG9qX21vZGVsc2V0X2Jhc2ljKQoKb2pfZmNhc3RfZXRzIDwtIHBhcmFsbGVsOjpjbHVzdGVyTWFwKGNsLCBnZXRfZm9yZWNhc3RzLCBval9tb2RlbHNldF9ldHMsIG9qX3Rlc3QpCgpkZXN0cm95X2NsdXN0ZXIoY2wpCgpzYXZlX29iamVjdHMob2pfbW9kZWxzZXRfZXRzLCBval9mY2FzdF9ldHMsCiAgICAgICAgICAgICBleGFtcGxlPSJncm9jZXJ5X3NhbGVzIiwgZmlsZT0ibW9kZWxfZXRzLlJkYXRhIikKCmRvLmNhbGwocmJpbmQsIG9qX2ZjYXN0X2V0cykgJT4lCiAgICBtdXRhdGVfYXQoLSgxOjMpLCBleHApICU+JQogICAgZXZhbF9mb3JlY2FzdHMoKQpgYGAKClRoZSBFVFMgbW9kZWwgZG9lcyBfd29yc2VfIHRoYW4gdGhlIEFSSU1BIG1vZGVsLCBzb21ldGhpbmcgdGhhdCBzaG91bGQgbm90IGJlIGEgc3VycHJpc2UgZ2l2ZW4gdGhlIGxhY2sgb2Ygc3Ryb25nIHNlYXNvbmFsaXR5IGFuZCB0cmVuZCBpbiB0aGlzIGRhdGFzZXQuIFdlIGNvbmNsdWRlIHRoYXQgYW55IHNpbXBsZSB1bml2YXJpYXRlIGFwcHJvYWNoIGlzIHVubGlrZWx5IHRvIGRvIHdlbGwuCg==
@@ -331,7 +364,7 @@ $(document).ready(function () { diff --git a/examples/grocery_sales/R/02a_reg_models.Rmd b/examples/grocery_sales/R/02a_reg_models.Rmd new file mode 100644 index 00000000..581f4e35 --- /dev/null +++ b/examples/grocery_sales/R/02a_reg_models.Rmd @@ -0,0 +1,86 @@ +--- +title: ARIMA-Regression models +output: html_notebook +--- + +_Copyright (c) Microsoft Corporation._
+_Licensed under the MIT License._ + +```{r, echo=FALSE, results="hide", message=FALSE} +library(tidyr) +library(dplyr) +library(tsibble) +library(feasts) +library(fable) +``` + +This notebook builds on the output from "Basic models" by including regressor variables in the ARIMA model(s). We fit the following model types: + +- `ar_trend` includes only a linear trend over time. +- `ar_reg` allows stepwise selection of independent regressors. +- `ar_reg_price`: rather than allowing the algorithm to select from the 11 price variables, we use only the price relevant to each brand. This is to guard against possible overfitting, something that classical stepwise procedures are wont to do. +- `ar_reg_price_trend` is the same as `ar_reg_price`, but including a linear trend. + +As part of the modelling, we also compute a new independent variable `maxpricediff`, the log-ratio of the price of this brand compared to the best competing price. A positive `maxpricediff` means this brand is cheaper than all the other brands, and a negative `maxpricediff` means it is more expensive. 
+ +```{r} +srcdir <- here::here("R_utils") +for(src in dir(srcdir, full.names=TRUE)) source(src) + +load_objects("grocery_sales", "data.Rdata") + +cl <- make_cluster(libs=c("tidyr", "dplyr", "fable", "tsibble", "feasts")) + +# add extra regression variables to training and test datasets +add_regvars <- function(df) +{ + df %>% + group_by(store, brand) %>% + group_modify(~ { + pricevars <- grep("price", names(.x), value=TRUE) + thispricevar <- unique(paste0("price", .y$brand)) + best_other_price <- do.call(pmin, .x[setdiff(pricevars, thispricevar)]) + .x$price <- .x[[thispricevar]] + .x$maxpricediff <- log(best_other_price/.x$price) + .x + }) %>% + ungroup() %>% + mutate(week=yearweek(week)) %>% # need to recreate this variable because of tsibble/vctrs issues + as_tsibble(week, key=c(store, brand)) +} + +oj_trainreg <- parallel::parLapply(cl, oj_train, add_regvars) +oj_testreg <- parallel::parLapply(cl, oj_test, add_regvars) + +save_objects(oj_trainreg, oj_testreg, + example="grocery_sales", file="data_reg.Rdata") + +oj_modelset_reg <- parallel::parLapply(cl, oj_trainreg, function(df) +{ + model(df, + ar_trend=ARIMA(logmove ~ pdq() + PDQ(0, 0, 0) + trend()), + + ar_reg=ARIMA(logmove ~ pdq() + PDQ(0, 0, 0) + deal + feat + maxpricediff + + price1 + price2 + price3 + price4 + price5 + price6 + price7 + price8 + price9 + price10 + price11), + + ar_reg_price=ARIMA(logmove ~ pdq() + PDQ(0, 0, 0) + deal + feat + maxpricediff + price), + + ar_reg_price_trend=ARIMA(logmove ~ pdq() + PDQ(0, 0, 0) + trend() + deal + feat + maxpricediff + price), + + .safely=FALSE + ) +}) + +oj_fcast_reg <- parallel::clusterMap(cl, get_forecasts, oj_modelset_reg, oj_testreg) + +destroy_cluster(cl) + +save_objects(oj_modelset_reg, oj_fcast_reg, + example="grocery_sales", file="model_reg.Rdata") + +do.call(rbind, oj_fcast_reg) %>% + mutate_at(-(1:3), exp) %>% + eval_forecasts() +``` + +This shows that the models incorporating price are a significant improvement over the previous naive models. 
The model that uses stepwise selection to choose the best price variable does worse than the one where we choose the price beforehand, confirming the suspicion that stepwise leads to overfitting in this case. diff --git a/R/orange_juice/02a_simplereg_models.nb.html b/examples/grocery_sales/R/02a_reg_models.nb.html similarity index 98% rename from R/orange_juice/02a_simplereg_models.nb.html rename to examples/grocery_sales/R/02a_reg_models.nb.html index 4aece593..f5e186af 100644 --- a/R/orange_juice/02a_simplereg_models.nb.html +++ b/examples/grocery_sales/R/02a_reg_models.nb.html @@ -11,7 +11,7 @@ -Regression models +ARIMA-Regression models @@ -220,47 +220,97 @@ summary { -

Regression models

+

ARIMA-Regression models

+

Copyright (c) Microsoft Corporation.
Licensed under the MIT License.

-

This notebook builds on the output from “Simple models” by including regressor variables in the ARIMA model(s).

+

This notebook builds on the output from “Basic models” by including regressor variables in the ARIMA model(s). We fit the following model types:

+
    +
  • ar_trend includes only a linear trend over time.
  • +
  • ar_reg allows stepwise selection of independent regressors.
  • +
  • ar_reg_price: rather than allowing the algorithm to select from the 11 price variables, we use only the price relevant to each brand. This is to guard against possible overfitting, something that classical stepwise procedures are wont to do.
  • +
  • ar_reg_price_trend is the same as ar_reg_price, but including a linear trend.
  • +
+

As part of the modelling, we also compute a new independent variable maxpricediff, the log-ratio of the price of this brand compared to the best competing price. A positive maxpricediff means this brand is cheaper than all the other brands, and a negative maxpricediff means it is more expensive.

- -
load("oj_data.Rdata")
+
+
srcdir <- here::here("R_utils")
+for(src in dir(srcdir, full.names=TRUE)) source(src)
 
-ncores <- max(2, parallel::detectCores(logical=FALSE) - 2)
-cl <- parallel::makeCluster(ncores)
-parallel::clusterEvalQ(cl,
+load_objects("grocery_sales", "data.Rdata")
+
+cl <- make_cluster(libs=c("tidyr", "dplyr", "fable", "tsibble", "feasts"))
+
+# add extra regression variables to training and test datasets
+add_regvars <- function(df)
 {
-    library(feasts)
-    library(fable)
-    library(tsibble)
-})
- - -
oj_modelset_reg <- parallel::parLapply(cl, oj_train, function(df)
+    df %>%
+        group_by(store, brand) %>%
+        group_modify(~ {
+            pricevars <- grep("price", names(.x), value=TRUE)
+            thispricevar <- unique(paste0("price", .y$brand))
+            best_other_price <- do.call(pmin, .x[setdiff(pricevars, thispricevar)])
+            .x$price <- .x[[thispricevar]]
+            .x$maxpricediff <- log(best_other_price/.x$price)
+            .x
+        }) %>%
+        ungroup() %>%
+        mutate(week=yearweek(week)) %>%  # need to recreate this variable because of tsibble/vctrs issues
+        as_tsibble(week, key=c(store, brand))
+}
+
+oj_trainreg <- parallel::parLapply(cl, oj_train, add_regvars)
+oj_testreg <- parallel::parLapply(cl, oj_test, add_regvars)
+
+save_objects(oj_trainreg, oj_testreg,
+             example="grocery_sales", file="data_reg.Rdata")
+
+oj_modelset_reg <- parallel::parLapply(cl, oj_trainreg, function(df)
 {
     model(df,
-        ar_reg=ARIMA(logmove ~ pdq() + PDQ(0, 0, 0) + deal + feat + price + maxpricediff),
         ar_trend=ARIMA(logmove ~ pdq() + PDQ(0, 0, 0) + trend()),
-        ar_regtrend=ARIMA(logmove ~ pdq() + PDQ(0, 0, 0) + trend() + deal + feat + price + maxpricediff)
+
+        ar_reg=ARIMA(logmove ~ pdq() + PDQ(0, 0, 0) + deal + feat + maxpricediff +
+            price1 + price2 + price3 + price4 + price5 + price6 + price7 + price8 + price9 + price10 + price11),
+
+        ar_reg_price=ARIMA(logmove ~ pdq() + PDQ(0, 0, 0) + deal + feat + maxpricediff + price),
+
+        ar_reg_price_trend=ARIMA(logmove ~ pdq() + PDQ(0, 0, 0) + trend() + deal + feat + maxpricediff + price),
+
+        .safely=FALSE
     )
 })
 
-parallel::stopCluster(cl)
-save(oj_modelset_reg, file="oj_modelset_reg.Rdata")
- - +oj_fcast_reg <- parallel::clusterMap(cl, get_forecasts, oj_modelset_reg, oj_testreg) -
LS0tCnRpdGxlOiBSZWdyZXNzaW9uIG1vZGVscwpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgpgYGB7ciwgZWNobz1GQUxTRSwgcmVzdWx0cz0iaGlkZSIsIG1lc3NhZ2U9RkFMU0V9CmxpYnJhcnkodGlkeXIpCmxpYnJhcnkoZHBseXIpCmxpYnJhcnkodHNpYmJsZSkKbGlicmFyeShmZWFzdHMpCmxpYnJhcnkoZmFibGUpCmBgYAoKVGhpcyBub3RlYm9vayBidWlsZHMgb24gdGhlIG91dHB1dCBmcm9tICJTaW1wbGUgbW9kZWxzIiBieSBpbmNsdWRpbmcgcmVncmVzc29yIHZhcmlhYmxlcyBpbiB0aGUgQVJJTUEgbW9kZWwocykuCgpgYGB7ciwgcmVzdWx0cz0iaGlkZSJ9CmxvYWQoIm9qX2RhdGEuUmRhdGEiKQoKbmNvcmVzIDwtIG1heCgyLCBwYXJhbGxlbDo6ZGV0ZWN0Q29yZXMobG9naWNhbD1GQUxTRSkgLSAyKQpjbCA8LSBwYXJhbGxlbDo6bWFrZUNsdXN0ZXIobmNvcmVzKQpwYXJhbGxlbDo6Y2x1c3RlckV2YWxRKGNsLAp7CiAgICBsaWJyYXJ5KGZlYXN0cykKICAgIGxpYnJhcnkoZmFibGUpCiAgICBsaWJyYXJ5KHRzaWJibGUpCn0pCgpval9tb2RlbHNldF9yZWcgPC0gcGFyYWxsZWw6OnBhckxhcHBseShjbCwgb2pfdHJhaW4sIGZ1bmN0aW9uKGRmKQp7CiAgICBtb2RlbChkZiwKICAgICAgICBhcl9yZWc9QVJJTUEobG9nbW92ZSB+IHBkcSgpICsgUERRKDAsIDAsIDApICsgZGVhbCArIGZlYXQgKyBwcmljZSArIG1heHByaWNlZGlmZiksCiAgICAgICAgYXJfdHJlbmQ9QVJJTUEobG9nbW92ZSB+IHBkcSgpICsgUERRKDAsIDAsIDApICsgdHJlbmQoKSksCiAgICAgICAgYXJfcmVndHJlbmQ9QVJJTUEobG9nbW92ZSB+IHBkcSgpICsgUERRKDAsIDAsIDApICsgdHJlbmQoKSArIGRlYWwgKyBmZWF0ICsgcHJpY2UgKyBtYXhwcmljZWRpZmYpCiAgICApCn0pCgpwYXJhbGxlbDo6c3RvcENsdXN0ZXIoY2wpCnNhdmUob2pfbW9kZWxzZXRfcmVnLCBmaWxlPSJval9tb2RlbHNldF9yZWcuUmRhdGEiKQpgYGAK
+destroy_cluster(cl) + +save_objects(oj_modelset_reg, oj_fcast_reg, + example="grocery_sales", file="model_reg.Rdata") + +do.call(rbind, oj_fcast_reg) %>% + mutate_at(-(1:3), exp) %>% + eval_forecasts()
+ +
+ +
+ + +

This shows that the models incorporating price are a significant improvement over the previous naive models. The model that uses stepwise selection to choose the best price variable does worse than the one where we choose the price beforehand, confirming the suspicion that stepwise leads to overfitting in this case.

+ + +
LS0tCnRpdGxlOiBBUklNQS1SZWdyZXNzaW9uIG1vZGVscwpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgpfQ29weXJpZ2h0IChjKSBNaWNyb3NvZnQgQ29ycG9yYXRpb24uXzxici8+Cl9MaWNlbnNlZCB1bmRlciB0aGUgTUlUIExpY2Vuc2UuXwoKYGBge3IsIGVjaG89RkFMU0UsIHJlc3VsdHM9ImhpZGUiLCBtZXNzYWdlPUZBTFNFfQpsaWJyYXJ5KHRpZHlyKQpsaWJyYXJ5KGRwbHlyKQpsaWJyYXJ5KHRzaWJibGUpCmxpYnJhcnkoZmVhc3RzKQpsaWJyYXJ5KGZhYmxlKQpgYGAKClRoaXMgbm90ZWJvb2sgYnVpbGRzIG9uIHRoZSBvdXRwdXQgZnJvbSAiQmFzaWMgbW9kZWxzIiBieSBpbmNsdWRpbmcgcmVncmVzc29yIHZhcmlhYmxlcyBpbiB0aGUgQVJJTUEgbW9kZWwocykuIFdlIGZpdCB0aGUgZm9sbG93aW5nIG1vZGVsIHR5cGVzOgoKLSBgYXJfdHJlbmRgIGluY2x1ZGVzIG9ubHkgYSBsaW5lYXIgdHJlbmQgb3ZlciB0aW1lLgotIGBhcl9yZWdgIGFsbG93cyBzdGVwd2lzZSBzZWxlY3Rpb24gb2YgaW5kZXBlbmRlbnQgcmVncmVzc29ycy4KLSBgYXJfcmVnX3ByaWNlYDogcmF0aGVyIHRoYW4gYWxsb3dpbmcgdGhlIGFsZ29yaXRobSB0byBzZWxlY3QgZnJvbSB0aGUgMTEgcHJpY2UgdmFyaWFibGVzLCB3ZSB1c2Ugb25seSB0aGUgcHJpY2UgcmVsZXZhbnQgdG8gZWFjaCBicmFuZC4gVGhpcyBpcyB0byBndWFyZCBhZ2FpbnN0IHBvc3NpYmxlIG92ZXJmaXR0aW5nLCBzb21ldGhpbmcgdGhhdCBjbGFzc2ljYWwgc3RlcHdpc2UgcHJvY2VkdXJlcyBhcmUgd29udCB0byBkby4KLSBgYXJfcmVnX3ByaWNlX3RyZW5kYCBpcyB0aGUgc2FtZSBhcyBgYXJfcmVnX3ByaWNlYCwgYnV0IGluY2x1ZGluZyBhIGxpbmVhciB0cmVuZC4KCkFzIHBhcnQgb2YgdGhlIG1vZGVsbGluZywgd2UgYWxzbyBjb21wdXRlIGEgbmV3IGluZGVwZW5kZW50IHZhcmlhYmxlIGBtYXhwcmljZWRpZmZgLCB0aGUgbG9nLXJhdGlvIG9mIHRoZSBwcmljZSBvZiB0aGlzIGJyYW5kIGNvbXBhcmVkIHRvIHRoZSBiZXN0IGNvbXBldGluZyBwcmljZS4gQSBwb3NpdGl2ZSBgbWF4cHJpY2VkaWZmYCBtZWFucyB0aGlzIGJyYW5kIGlzIGNoZWFwZXIgdGhhbiBhbGwgdGhlIG90aGVyIGJyYW5kcywgYW5kIGEgbmVnYXRpdmUgYG1heHByaWNlZGlmZmAgbWVhbnMgaXQgaXMgbW9yZSBleHBlbnNpdmUuCgpgYGB7cn0Kc3JjZGlyIDwtIGhlcmU6OmhlcmUoIlJfdXRpbHMiKQpmb3Ioc3JjIGluIGRpcihzcmNkaXIsIGZ1bGwubmFtZXM9VFJVRSkpIHNvdXJjZShzcmMpCgpsb2FkX29iamVjdHMoImdyb2Nlcnlfc2FsZXMiLCAiZGF0YS5SZGF0YSIpCgpjbCA8LSBtYWtlX2NsdXN0ZXIobGlicz1jKCJ0aWR5ciIsICJkcGx5ciIsICJmYWJsZSIsICJ0c2liYmxlIiwgImZlYXN0cyIpKQoKIyBhZGQgZXh0cmEgcmVncmVzc2lvbiB2YXJpYWJsZXMgdG8gdHJhaW5pbmcgYW5kIHRlc3QgZGF0YXNldHMKYWRkX3JlZ3ZhcnMgPC0gZnVuY3Rpb24oZGYpCnsKICAgIGRmICU+JQogICAgICAgIGdy
b3VwX2J5KHN0b3JlLCBicmFuZCkgJT4lCiAgICAgICAgZ3JvdXBfbW9kaWZ5KH4gewogICAgICAgICAgICBwcmljZXZhcnMgPC0gZ3JlcCgicHJpY2UiLCBuYW1lcygueCksIHZhbHVlPVRSVUUpCiAgICAgICAgICAgIHRoaXNwcmljZXZhciA8LSB1bmlxdWUocGFzdGUwKCJwcmljZSIsIC55JGJyYW5kKSkKICAgICAgICAgICAgYmVzdF9vdGhlcl9wcmljZSA8LSBkby5jYWxsKHBtaW4sIC54W3NldGRpZmYocHJpY2V2YXJzLCB0aGlzcHJpY2V2YXIpXSkKICAgICAgICAgICAgLngkcHJpY2UgPC0gLnhbW3RoaXNwcmljZXZhcl1dCiAgICAgICAgICAgIC54JG1heHByaWNlZGlmZiA8LSBsb2coYmVzdF9vdGhlcl9wcmljZS8ueCRwcmljZSkKICAgICAgICAgICAgLngKICAgICAgICB9KSAlPiUKICAgICAgICB1bmdyb3VwKCkgJT4lCiAgICAgICAgbXV0YXRlKHdlZWs9eWVhcndlZWsod2VlaykpICU+JSAgIyBuZWVkIHRvIHJlY3JlYXRlIHRoaXMgdmFyaWFibGUgYmVjYXVzZSBvZiB0c2liYmxlL3ZjdHJzIGlzc3VlcwogICAgICAgIGFzX3RzaWJibGUod2Vlaywga2V5PWMoc3RvcmUsIGJyYW5kKSkKfQoKb2pfdHJhaW5yZWcgPC0gcGFyYWxsZWw6OnBhckxhcHBseShjbCwgb2pfdHJhaW4sIGFkZF9yZWd2YXJzKQpval90ZXN0cmVnIDwtIHBhcmFsbGVsOjpwYXJMYXBwbHkoY2wsIG9qX3Rlc3QsIGFkZF9yZWd2YXJzKQoKc2F2ZV9vYmplY3RzKG9qX3RyYWlucmVnLCBval90ZXN0cmVnLAogICAgICAgICAgICAgZXhhbXBsZT0iZ3JvY2VyeV9zYWxlcyIsIGZpbGU9ImRhdGFfcmVnLlJkYXRhIikKCm9qX21vZGVsc2V0X3JlZyA8LSBwYXJhbGxlbDo6cGFyTGFwcGx5KGNsLCBval90cmFpbnJlZywgZnVuY3Rpb24oZGYpCnsKICAgIG1vZGVsKGRmLAogICAgICAgIGFyX3RyZW5kPUFSSU1BKGxvZ21vdmUgfiBwZHEoKSArIFBEUSgwLCAwLCAwKSArIHRyZW5kKCkpLAoKICAgICAgICBhcl9yZWc9QVJJTUEobG9nbW92ZSB+IHBkcSgpICsgUERRKDAsIDAsIDApICsgZGVhbCArIGZlYXQgKyBtYXhwcmljZWRpZmYgKwogICAgICAgICAgICBwcmljZTEgKyBwcmljZTIgKyBwcmljZTMgKyBwcmljZTQgKyBwcmljZTUgKyBwcmljZTYgKyBwcmljZTcgKyBwcmljZTggKyBwcmljZTkgKyBwcmljZTEwICsgcHJpY2UxMSksCgogICAgICAgIGFyX3JlZ19wcmljZT1BUklNQShsb2dtb3ZlIH4gcGRxKCkgKyBQRFEoMCwgMCwgMCkgKyBkZWFsICsgZmVhdCArIG1heHByaWNlZGlmZiArIHByaWNlKSwKCiAgICAgICAgYXJfcmVnX3ByaWNlX3RyZW5kPUFSSU1BKGxvZ21vdmUgfiBwZHEoKSArIFBEUSgwLCAwLCAwKSArIHRyZW5kKCkgKyBkZWFsICsgZmVhdCArIG1heHByaWNlZGlmZiArIHByaWNlKSwKCiAgICAgICAgLnNhZmVseT1GQUxTRQogICAgKQp9KQoKb2pfZmNhc3RfcmVnIDwtIHBhcmFsbGVsOjpjbHVzdGVyTWFwKGNsLCBnZXRfZm9yZWNhc3RzLCBval9tb2RlbHNldF9yZWcsIG9qX3Rlc3RyZWcpCgpkZXN0cm95X2NsdXN0ZXIoY2wpCgpzYXZlX29i
amVjdHMob2pfbW9kZWxzZXRfcmVnLCBval9mY2FzdF9yZWcsCiAgICAgICAgICAgICBleGFtcGxlPSJncm9jZXJ5X3NhbGVzIiwgZmlsZT0ibW9kZWxfcmVnLlJkYXRhIikKCmRvLmNhbGwocmJpbmQsIG9qX2ZjYXN0X3JlZykgJT4lCiAgICBtdXRhdGVfYXQoLSgxOjMpLCBleHApICU+JQogICAgZXZhbF9mb3JlY2FzdHMoKQpgYGAKClRoaXMgc2hvd3MgdGhhdCB0aGUgbW9kZWxzIGluY29ycG9yYXRpbmcgcHJpY2UgYXJlIGEgc2lnbmlmaWNhbnQgaW1wcm92ZW1lbnQgb3ZlciB0aGUgcHJldmlvdXMgbmFpdmUgbW9kZWxzLiBUaGUgbW9kZWwgdGhhdCB1c2VzIHN0ZXB3aXNlIHNlbGVjdGlvbiB0byBjaG9vc2UgdGhlIGJlc3QgcHJpY2UgdmFyaWFibGUgZG9lcyB3b3JzZSB0aGFuIHRoZSBvbmUgd2hlcmUgd2UgY2hvb3NlIHRoZSBwcmljZSBiZWZvcmVoYW5kLCBjb25maXJtaW5nIHRoZSBzdXNwaWNpb24gdGhhdCBzdGVwd2lzZSBsZWFkcyB0byBvdmVyZml0dGluZyBpbiB0aGlzIGNhc2UuCg==
@@ -307,7 +357,7 @@ $(document).ready(function () { diff --git a/examples/grocery_sales/R/02b_prophet_models.Rmd b/examples/grocery_sales/R/02b_prophet_models.Rmd new file mode 100644 index 00000000..0d416e11 --- /dev/null +++ b/examples/grocery_sales/R/02b_prophet_models.Rmd @@ -0,0 +1,67 @@ +--- +title: Prophet models +output: html_notebook +--- + +_Copyright (c) Microsoft Corporation._
+_Licensed under the MIT License._ + +```{r, echo=FALSE, results="hide", message=FALSE} +library(tidyr) +library(dplyr) +library(tsibble) +library(feasts) +library(fable) +library(prophet) +library(fable.prophet) +``` + +This notebook builds a forecasting model using the [Prophet](https://facebook.github.io/prophet/) algorithm. Prophet is a time series model developed by Facebook that is designed to be simple for non-experts to use, yet flexible and powerful. + +> Prophet is a procedure for forecasting time series data based on an additive model where non-linear trends are fit with yearly, weekly, and daily seasonality, plus holiday effects. It works best with time series that have strong seasonal effects and several seasons of historical data. Prophet is robust to missing data and shifts in the trend, and typically handles outliers well. + +Here, we will use the fable.prophet package which provides a tidyverts frontend to the prophet package itself. As with ETS, prophet does not support time series with missing values, so we again impute them using the ARIMA model forecasts. 
+ +```{r} +srcdir <- here::here("R_utils") +for(src in dir(srcdir, full.names=TRUE)) source(src) + +load_objects("grocery_sales", "data_reg.Rdata") +load_objects("grocery_sales", "model_basic.Rdata") + +cl <- make_cluster(libs=c("tidyr", "dplyr", "fable", "tsibble", "feasts", "prophet", "fable.prophet")) + +oj_modelset_pr <- parallel::clusterMap(cl, function(df, basicmod) +{ + df$logmove <- interpolate(select(basicmod, -c(mean, naive, drift)), df)$logmove + df %>% + group_by(store, brand) %>% + fill(deal:maxpricediff, .direction="downup") %>% + model( + pr=prophet(logmove ~ deal + feat + price + maxpricediff), + + pr_tune=prophet(logmove ~ deal + feat + price + maxpricediff + + growth(n_changepoints=2) + season(period=52, order=5, prior_scale=2)), + + .safely=FALSE + ) +}, oj_trainreg, oj_modelset_basic) + +oj_fcast_pr <- parallel::clusterMap(cl, function(mable, newdata, fcast_func) +{ + newdata <- newdata %>% + fill(deal:maxpricediff, .direction="downup") + fcast_func(mable, newdata) +}, oj_modelset_pr, oj_testreg, MoreArgs=list(fcast_func=get_forecasts)) + +destroy_cluster(cl) + +save_objects(oj_modelset_pr, oj_fcast_pr, + example="grocery_sales", file="model_pr.Rdata") + +do.call(rbind, oj_fcast_pr) %>% + mutate_at(-(1:3), exp) %>% + eval_forecasts() +``` + +It appears that Prophet does _not_ do better than the simple ARIMA model with regression variables. This is possibly because the dataset does not have a strong time series nature: there is no seasonality, and only weak or nonexistent trends. These are features which the Prophet algorithm is designed to detect, and their absence means that there would be little advantage in using it. 
diff --git a/R/orange_juice/02_simplemodels.nb.html b/examples/grocery_sales/R/02b_prophet_models.nb.html similarity index 98% rename from R/orange_juice/02_simplemodels.nb.html rename to examples/grocery_sales/R/02b_prophet_models.nb.html index 735d3223..4bf9ae60 100644 --- a/R/orange_juice/02_simplemodels.nb.html +++ b/examples/grocery_sales/R/02b_prophet_models.nb.html @@ -11,7 +11,7 @@ -Simple models +Prophet models @@ -220,98 +220,76 @@ summary { -

Simple models

+

Prophet models

+

Copyright (c) Microsoft Corporation.
Licensed under the MIT License.

-

We fit some simple models to the orange juice data. One model is fit for each combination of store and brand.

-
    -
  • mean: This is just a simple mean.
  • -
  • naive: A random walk model without any other components. This amounts to setting all forecast values to the last observed value.
  • -
  • drift: This adjusts the naive model to incorporate a trend.
  • -
  • arima: An ARIMA model with the parameter values estimated from the data.
  • -
  • ets: An exponentially weighted model, again with parameter values estimated from the data.
  • -
-

Note that the model training process is embarrassingly parallel on 3 levels:

-
    -
  • We have multiple independent training datasets;
  • -
  • For which we fit multiple independent models;
  • -
  • Within which we have independent sub-models for each store and brand.
  • -
-

This lets us speed up the training significantly. While the fable::model function can fit multiple models in parallel, we will run it sequentially here and instead parallelise by dataset. This avoids contention for cores, and also results in the simplest code.

+

This notebook builds a forecasting model using the Prophet algorithm. Prophet is a time series model developed by Facebook that is designed to be simple for non-experts to use, yet flexible and powerful.

+
+

Prophet is a procedure for forecasting time series data based on an additive model where non-linear trends are fit with yearly, weekly, and daily seasonality, plus holiday effects. It works best with time series that have strong seasonal effects and several seasons of historical data. Prophet is robust to missing data and shifts in the trend, and typically handles outliers well.

+
+

Here, we will use the fable.prophet package which provides a tidyverts frontend to the prophet package itself. As with ETS, prophet does not support time series with missing values, so we again impute them using the ARIMA model forecasts.

- -
load("oj_data.Rdata")
+
+
srcdir <- here::here("R_utils")
+for(src in dir(srcdir, full.names=TRUE)) source(src)
 
-ncores <- max(2, parallel::detectCores(logical=FALSE) - 2)
-cl <- parallel::makeCluster(ncores)
-parallel::clusterEvalQ(cl,
-{
-    library(tidyr)
-    library(feasts)
-    library(fable)
-    library(tsibble)
-})
- - - -

First, we fit the models that can innately handle missing values.

- - - -
oj_modelset <- parallel::parLapply(cl, oj_train, function(df)
-{
-    model(df,
-        mean=MEAN(logmove),
-        naive=NAIVE(logmove),
-        drift=RW(logmove ~ drift()),
-        arima=ARIMA(logmove ~ pdq() + PDQ(0, 0, 0))
-    )
-})
- - - -

Next, we fit models that require manual imputation (ETS).

- - - -
oj_modelset_ets <- parallel::parLapply(cl, oj_train, function(df)
+load_objects("grocery_sales", "data_reg.Rdata")
+load_objects("grocery_sales", "model_basic.Rdata")
+
+cl <- make_cluster(libs=c("tidyr", "dplyr", "fable", "tsibble", "feasts", "prophet", "fable.prophet"))
+
+oj_modelset_pr <- parallel::clusterMap(cl, function(df, basicmod)
 {
+    df$logmove <- interpolate(select(basicmod, -c(mean, naive, drift)), df)$logmove
     df %>%
-        fill(everything()) %>%
-        model(ets=ETS(logmove ~ error("A") + trend("A") + season("N")))
-})
+        group_by(store, brand) %>%
+        fill(deal:maxpricediff, .direction="downup") %>%
+        model(
+            pr=prophet(logmove ~ deal + feat + price + maxpricediff),
 
-parallel::stopCluster(cl)
-save(oj_modelset, oj_modelset_ets, file="oj_modelset.Rdata")
+            pr_tune=prophet(logmove ~ deal + feat + price + maxpricediff +
+                growth(n_changepoints=2) + season(period=52, order=5, prior_scale=2)),
 
-head(oj_modelset[[1]])
+ .safely=FALSE + ) +}, oj_trainreg, oj_modelset_basic) + +oj_fcast_pr <- parallel::clusterMap(cl, function(mable, newdata, fcast_func) +{ + newdata <- newdata %>% + fill(deal:maxpricediff, .direction="downup") + fcast_func(mable, newdata) +}, oj_modelset_pr, oj_testreg, MoreArgs=list(fcast_func=get_forecasts)) + +destroy_cluster(cl) + +save_objects(oj_modelset_pr, oj_fcast_pr, + example="grocery_sales", file="model_pr.Rdata") + +do.call(rbind, oj_fcast_pr) %>% + mutate_at(-(1:3), exp) %>% + eval_forecasts()
-
- -
head(oj_modelset_ets[[1]])
- -
-
+

It appears that Prophet does not do better than the simple ARIMA model with regression variables. This is possibly because the dataset does not have a strong time series nature: there is no seasonality, and only weak or nonexistent trends. These are features which the Prophet algorithm is designed to detect, and their absence means that there would be little advantage in using it.

-
LS0tCnRpdGxlOiBTaW1wbGUgbW9kZWxzCm91dHB1dDogaHRtbF9ub3RlYm9vawplbmNvZGluZzogdXRmOAotLS0KCmBgYHtyLCBlY2hvPUZBTFNFLCByZXN1bHRzPSJoaWRlIiwgbWVzc2FnZT1GQUxTRX0KbGlicmFyeSh0aWR5cikKbGlicmFyeShkcGx5cikKbGlicmFyeSh0c2liYmxlKQpsaWJyYXJ5KGZlYXN0cykKbGlicmFyeShmYWJsZSkKYGBgCgpXZSBmaXQgc29tZSBzaW1wbGUgbW9kZWxzIHRvIHRoZSBvcmFuZ2UganVpY2UgZGF0YS4gT25lIG1vZGVsIGlzIGZpdCBmb3IgZWFjaCBjb21iaW5hdGlvbiBvZiBzdG9yZSBhbmQgYnJhbmQuCgotIGBtZWFuYDogVGhpcyBpcyBqdXN0IGEgc2ltcGxlIG1lYW4uCi0gYG5haXZlYDogQSByYW5kb20gd2FsayBtb2RlbCB3aXRob3V0IGFueSBvdGhlciBjb21wb25lbnRzLiBUaGlzIGFtb3VudHMgdG8gc2V0dGluZyBhbGwgZm9yZWNhc3QgdmFsdWVzIHRvIHRoZSBsYXN0IG9ic2VydmVkIHZhbHVlLgotIGBkcmlmdGA6IFRoaXMgYWRqdXN0cyB0aGUgYG5haXZlYCBtb2RlbCB0byBpbmNvcnBvcmF0ZSBhIHRyZW5kLgotIGBhcmltYWA6IEFuIEFSSU1BIG1vZGVsIHdpdGggdGhlIHBhcmFtZXRlciB2YWx1ZXMgZXN0aW1hdGVkIGZyb20gdGhlIGRhdGEuCi0gYGV0c2A6IEFuIGV4cG9uZW50aWFsbHkgd2VpZ2h0ZWQgbW9kZWwsIGFnYWluIHdpdGggcGFyYW1ldGVyIHZhbHVlcyBlc3RpbWF0ZWQgZnJvbSB0aGUgZGF0YS4KCk5vdGUgdGhhdCB0aGUgbW9kZWwgdHJhaW5pbmcgcHJvY2VzcyBpcyBlbWJhcnJhc3NpbmdseSBwYXJhbGxlbCBvbiAzIGxldmVsczoKCi0gV2UgaGF2ZSBtdWx0aXBsZSBpbmRlcGVuZGVudCB0cmFpbmluZyBkYXRhc2V0czsKLSBGb3Igd2hpY2ggd2UgZml0IG11bHRpcGxlIGluZGVwZW5kZW50IG1vZGVsczsKLSBXaXRoaW4gd2hpY2ggd2UgaGF2ZSBpbmRlcGVuZGVudCBzdWItbW9kZWxzIGZvciBlYWNoIHN0b3JlIGFuZCBicmFuZC4KClRoaXMgbGV0cyB1cyBzcGVlZCB1cCB0aGUgdHJhaW5pbmcgc2lnbmlmaWNhbnRseS4gV2hpbGUgdGhlIGBmYWJsZTo6bW9kZWxgIGZ1bmN0aW9uIGNhbiBmaXQgbXVsdGlwbGUgbW9kZWxzIGluIHBhcmFsbGVsLCB3ZSB3aWxsIHJ1biBpdCBzZXF1ZW50aWFsbHkgaGVyZSBhbmQgaW5zdGVhZCBwYXJhbGxlbGlzZSBieSBkYXRhc2V0LiBUaGlzIGF2b2lkcyBjb250ZW50aW9uIGZvciBjb3JlcywgYW5kIGFsc28gcmVzdWx0cyBpbiB0aGUgc2ltcGxlc3QgY29kZS4KCmBgYHtyLCByZXN1bHRzPSJoaWRlIn0KbG9hZCgib2pfZGF0YS5SZGF0YSIpCgpuY29yZXMgPC0gbWF4KDIsIHBhcmFsbGVsOjpkZXRlY3RDb3Jlcyhsb2dpY2FsPUZBTFNFKSAtIDIpCmNsIDwtIHBhcmFsbGVsOjptYWtlQ2x1c3RlcihuY29yZXMpCnBhcmFsbGVsOjpjbHVzdGVyRXZhbFEoY2wsCnsKICAgIGxpYnJhcnkodGlkeXIpCiAgICBsaWJyYXJ5KGZlYXN0cykKICAgIGxpYnJhcnkoZmFibGUpCiAgICBsaWJyYXJ5KHRzaWJibGUpCn0pCmBgYAoKRmlyc3QsIHdl
IGZpdCB0aGUgbW9kZWxzIHRoYXQgY2FuIGlubmF0ZWx5IGhhbmRsZSBtaXNzaW5nIHZhbHVlcy4KCmBgYHtyfQpval9tb2RlbHNldCA8LSBwYXJhbGxlbDo6cGFyTGFwcGx5KGNsLCBval90cmFpbiwgZnVuY3Rpb24oZGYpCnsKICAgIG1vZGVsKGRmLAogICAgICAgIG1lYW49TUVBTihsb2dtb3ZlKSwKICAgICAgICBuYWl2ZT1OQUlWRShsb2dtb3ZlKSwKICAgICAgICBkcmlmdD1SVyhsb2dtb3ZlIH4gZHJpZnQoKSksCiAgICAgICAgYXJpbWE9QVJJTUEobG9nbW92ZSB+IHBkcSgpICsgUERRKDAsIDAsIDApKQogICAgKQp9KQpgYGAKTmV4dCwgd2UgZml0IG1vZGVscyB0aGF0IHJlcXVpcmUgbWFudWFsIGltcHV0YXRpb24gKEVUUykuCgpgYGB7cn0Kb2pfbW9kZWxzZXRfZXRzIDwtIHBhcmFsbGVsOjpwYXJMYXBwbHkoY2wsIG9qX3RyYWluLCBmdW5jdGlvbihkZikKewogICAgZGYgJT4lCiAgICAgICAgZmlsbChldmVyeXRoaW5nKCkpICU+JQogICAgICAgIG1vZGVsKGV0cz1FVFMobG9nbW92ZSB+IGVycm9yKCJBIikgKyB0cmVuZCgiQSIpICsgc2Vhc29uKCJOIikpKQp9KQoKcGFyYWxsZWw6OnN0b3BDbHVzdGVyKGNsKQpzYXZlKG9qX21vZGVsc2V0LCBval9tb2RlbHNldF9ldHMsIGZpbGU9Im9qX21vZGVsc2V0LlJkYXRhIikKCmhlYWQob2pfbW9kZWxzZXRbWzFdXSkKaGVhZChval9tb2RlbHNldF9ldHNbWzFdXSkKYGBgCgo=
+
LS0tCnRpdGxlOiBQcm9waGV0IG1vZGVscwpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgpfQ29weXJpZ2h0IChjKSBNaWNyb3NvZnQgQ29ycG9yYXRpb24uXzxici8+Cl9MaWNlbnNlZCB1bmRlciB0aGUgTUlUIExpY2Vuc2UuXwoKYGBge3IsIGVjaG89RkFMU0UsIHJlc3VsdHM9ImhpZGUiLCBtZXNzYWdlPUZBTFNFfQpsaWJyYXJ5KHRpZHlyKQpsaWJyYXJ5KGRwbHlyKQpsaWJyYXJ5KHRzaWJibGUpCmxpYnJhcnkoZmVhc3RzKQpsaWJyYXJ5KGZhYmxlKQpsaWJyYXJ5KHByb3BoZXQpCmxpYnJhcnkoZmFibGUucHJvcGhldCkKYGBgCgpUaGlzIG5vdGVib29rIGJ1aWxkcyBhIGZvcmVjYXN0aW5nIG1vZGVsIHVzaW5nIHRoZSBbUHJvcGhldF0oaHR0cHM6Ly9mYWNlYm9vay5naXRodWIuaW8vcHJvcGhldC8pIGFsZ29yaXRobS4gUHJvcGhldCBpcyBhIHRpbWUgc2VyaWVzIG1vZGVsIGRldmVsb3BlZCBieSBGYWNlYm9vayB0aGF0IGlzIGRlc2lnbmVkIHRvIGJlIHNpbXBsZSBmb3Igbm9uLWV4cGVydHMgdG8gdXNlLCB5ZXQgZmxleGlibGUgYW5kIHBvd2VyZnVsLgoKPiBQcm9waGV0IGlzIGEgcHJvY2VkdXJlIGZvciBmb3JlY2FzdGluZyB0aW1lIHNlcmllcyBkYXRhIGJhc2VkIG9uIGFuIGFkZGl0aXZlIG1vZGVsIHdoZXJlIG5vbi1saW5lYXIgdHJlbmRzIGFyZSBmaXQgd2l0aCB5ZWFybHksIHdlZWtseSwgYW5kIGRhaWx5IHNlYXNvbmFsaXR5LCBwbHVzIGhvbGlkYXkgZWZmZWN0cy4gSXQgd29ya3MgYmVzdCB3aXRoIHRpbWUgc2VyaWVzIHRoYXQgaGF2ZSBzdHJvbmcgc2Vhc29uYWwgZWZmZWN0cyBhbmQgc2V2ZXJhbCBzZWFzb25zIG9mIGhpc3RvcmljYWwgZGF0YS4gUHJvcGhldCBpcyByb2J1c3QgdG8gbWlzc2luZyBkYXRhIGFuZCBzaGlmdHMgaW4gdGhlIHRyZW5kLCBhbmQgdHlwaWNhbGx5IGhhbmRsZXMgb3V0bGllcnMgd2VsbC4KCkhlcmUsIHdlIHdpbGwgdXNlIHRoZSBmYWJsZS5wcm9waGV0IHBhY2thZ2Ugd2hpY2ggcHJvdmlkZXMgYSB0aWR5dmVydHMgZnJvbnRlbmQgdG8gdGhlIHByb3BoZXQgcGFja2FnZSBpdHNlbGYuIEFzIHdpdGggRVRTLCBwcm9waGV0IGRvZXMgbm90IHN1cHBvcnQgdGltZSBzZXJpZXMgd2l0aCBtaXNzaW5nIHZhbHVlcywgc28gd2UgYWdhaW4gaW1wdXRlIHRoZW0gdXNpbmcgdGhlIEFSSU1BIG1vZGVsIGZvcmVjYXN0cy4KCmBgYHtyfQpzcmNkaXIgPC0gaGVyZTo6aGVyZSgiUl91dGlscyIpCmZvcihzcmMgaW4gZGlyKHNyY2RpciwgZnVsbC5uYW1lcz1UUlVFKSkgc291cmNlKHNyYykKCmxvYWRfb2JqZWN0cygiZ3JvY2VyeV9zYWxlcyIsICJkYXRhX3JlZy5SZGF0YSIpCmxvYWRfb2JqZWN0cygiZ3JvY2VyeV9zYWxlcyIsICJtb2RlbF9iYXNpYy5SZGF0YSIpCgpjbCA8LSBtYWtlX2NsdXN0ZXIobGlicz1jKCJ0aWR5ciIsICJkcGx5ciIsICJmYWJsZSIsICJ0c2liYmxlIiwgImZlYXN0cyIsICJwcm9waGV0IiwgImZhYmxlLnByb3BoZXQiKSkKCm9qX21vZGVsc2V0X3ByIDwtIHBhcmFsbGVsOjpj
bHVzdGVyTWFwKGNsLCBmdW5jdGlvbihkZiwgYmFzaWNtb2QpCnsKICAgIGRmJGxvZ21vdmUgPC0gaW50ZXJwb2xhdGUoc2VsZWN0KGJhc2ljbW9kLCAtYyhtZWFuLCBuYWl2ZSwgZHJpZnQpKSwgZGYpJGxvZ21vdmUKICAgIGRmICU+JQogICAgICAgIGdyb3VwX2J5KHN0b3JlLCBicmFuZCkgJT4lCiAgICAgICAgZmlsbChkZWFsOm1heHByaWNlZGlmZiwgLmRpcmVjdGlvbj0iZG93bnVwIikgJT4lCiAgICAgICAgbW9kZWwoCiAgICAgICAgICAgIHByPXByb3BoZXQobG9nbW92ZSB+IGRlYWwgKyBmZWF0ICsgcHJpY2UgKyBtYXhwcmljZWRpZmYpLAoKICAgICAgICAgICAgcHJfdHVuZT1wcm9waGV0KGxvZ21vdmUgfiBkZWFsICsgZmVhdCArIHByaWNlICsgbWF4cHJpY2VkaWZmICsKICAgICAgICAgICAgICAgIGdyb3d0aChuX2NoYW5nZXBvaW50cz0yKSArIHNlYXNvbihwZXJpb2Q9NTIsIG9yZGVyPTUsIHByaW9yX3NjYWxlPTIpKSwKCiAgICAgICAgICAgIC5zYWZlbHk9RkFMU0UKICAgICAgICApCn0sIG9qX3RyYWlucmVnLCBval9tb2RlbHNldF9iYXNpYykKCm9qX2ZjYXN0X3ByIDwtIHBhcmFsbGVsOjpjbHVzdGVyTWFwKGNsLCBmdW5jdGlvbihtYWJsZSwgbmV3ZGF0YSwgZmNhc3RfZnVuYykKewogICAgbmV3ZGF0YSA8LSBuZXdkYXRhICU+JQogICAgICAgIGZpbGwoZGVhbDptYXhwcmljZWRpZmYsIC5kaXJlY3Rpb249ImRvd251cCIpCiAgICBmY2FzdF9mdW5jKG1hYmxlLCBuZXdkYXRhKQp9LCBval9tb2RlbHNldF9wciwgb2pfdGVzdHJlZywgTW9yZUFyZ3M9bGlzdChmY2FzdF9mdW5jPWdldF9mb3JlY2FzdHMpKQoKZGVzdHJveV9jbHVzdGVyKGNsKQoKc2F2ZV9vYmplY3RzKG9qX21vZGVsc2V0X3ByLCBval9mY2FzdF9wciwKICAgICAgICAgICAgIGV4YW1wbGU9Imdyb2Nlcnlfc2FsZXMiLCBmaWxlPSJtb2RlbF9wci5SZGF0YSIpCgpkby5jYWxsKHJiaW5kLCBval9mY2FzdF9wcikgJT4lCiAgICBtdXRhdGVfYXQoLSgxOjMpLCBleHApICU+JQogICAgZXZhbF9mb3JlY2FzdHMoKQpgYGAKCkl0IGFwcGVhcnMgdGhhdCBQcm9waGV0IGRvZXMgX25vdF8gZG8gYmV0dGVyIHRoYW4gdGhlIHNpbXBsZSBBUklNQSBtb2RlbCB3aXRoIHJlZ3Jlc3Npb24gdmFyaWFibGVzLiBUaGlzIGlzIHBvc3NpYmx5IGJlY2F1c2UgdGhlIGRhdGFzZXQgZG9lcyBub3QgaGF2ZSBhIHN0cm9uZyB0aW1lIHNlcmllcyBuYXR1cmU6IHRoZXJlIGlzIG5vIHNlYXNvbmFsaXR5LCBhbmQgb25seSB3ZWFrIG9yIG5vbmV4aXN0ZW50IHRyZW5kcy4gVGhlc2UgYXJlIGZlYXR1cmVzIHdoaWNoIHRoZSBQcm9waGV0IGFsZ29yaXRobSBpcyBkZXNpZ25lZCB0byBkZXRlY3QsIGFuZCB0aGVpciBhYnNlbmNlIG1lYW5zIHRoYXQgdGhlcmUgd291bGQgYmUgbGl0dGxlIGFkdmFudGFnZSBpbiB1c2luZyBpdC4K
@@ -358,7 +336,7 @@ $(document).ready(function () { diff --git a/examples/grocery_sales/R/README.md b/examples/grocery_sales/R/README.md new file mode 100644 index 00000000..6e3ca408 --- /dev/null +++ b/examples/grocery_sales/R/README.md @@ -0,0 +1,45 @@ +# Forecasting examples in R: orange juice retail sales + +The Rmarkdown notebooks in this directory are as follows. Each notebook also has a corresponding HTML file, which is the rendered output from running the code. + +- [`01_dataprep.Rmd`](01_dataprep.Rmd) creates the training and test datasets +- [`02_basic_models.Rmd`](02_basic_models.Rmd) fits a range of simple time series models to the data, including ARIMA and ETS. +- [`02a_reg_models.Rmd`](02a_reg_models.Rmd) adds independent variables as regressors to the ARIMA model. +- [`02b_prophet_models.Rmd`](02b_prophet_models.Rmd) fits some simple models using the Prophet algorithm. + +If you want to run the code in the notebooks interactively, you must start from `01_dataprep.Rmd` and proceed in sequence, as the earlier notebooks will generate artifacts (datasets/model objects) that are used by later ones. + +## Package installation + +The following packages are needed to run the basic analysis notebooks in this directory: + +- rmarkdown +- dplyr +- tidyr +- ggplot2 +- tsibble +- fable +- feasts +- yaml +- here + +It's likely that you will already have many of these (particularly the [Tidyverse](https://tidyverse.org) packages) installed, if you use R for data science tasks. The main exceptions are the packages in the [Tidyverts](https://tidyverts.org) family, which is a modern framework for time series analysis building on the Tidyverse. 
+ +```r +install.packages("tidyverse") # installs all tidyverse packages +install.packages("rmarkdown") +install.packages("here") +install.packages(c("tsibble", "fable", "feasts")) +``` + +The following packages are needed to run the Prophet analysis notebook: + +- prophet +- fable.prophet + +While prophet is available from CRAN, its frontend for the tidyverts framework, fable.prophet, is currently on GitHub only. You can install these packages with + +```r +install.packages("prophet") +install.packages("https://github.com/mitchelloharawild/fable.prophet/archive/master.tar.gz", repos=NULL) +``` diff --git a/examples/grocery_sales/R/forecast_settings.yaml b/examples/grocery_sales/R/forecast_settings.yaml new file mode 100644 index 00000000..638662ce --- /dev/null +++ b/examples/grocery_sales/R/forecast_settings.yaml @@ -0,0 +1,6 @@ +N_SPLITS: 10 +HORIZON: 2 +GAP: 2 +FIRST_WEEK: 40 +LAST_WEEK: 156 +START_DATE: "1989-09-14" diff --git a/examples/grocery_sales/README.md b/examples/grocery_sales/README.md new file mode 100644 index 00000000..8bcfb30c --- /dev/null +++ b/examples/grocery_sales/README.md @@ -0,0 +1,26 @@ +# Forecasting examples + +This folder contains Python and R examples for building forecasting solutions on the Orange Juice dataset which is part of the [Dominick's dataset](https://www.chicagobooth.edu/research/kilts/datasets/dominicks). The examples are presented in Python Jupyter notebooks and R Markdown files, respectively. + + +## Orange Juice Dataset + +In this scenario, we will use the Orange Juice (OJ) dataset to forecast its sales. The OJ dataset is from R package [bayesm](https://cran.r-project.org/web/packages/bayesm/index.html) and is part of the [Dominick's dataset](https://www.chicagobooth.edu/research/kilts/datasets/dominicks). + +This dataset contains the following two tables: +- **yx.csv** - Weekly sales of refrigerated orange juice at 83 stores. This table has 106139 rows and 19 columns. 
It includes weekly sales and prices of 11 orange juice brands as well as information about profit, deal, and advertisement for each brand. Note that the weekly sales is captured by a column named `logmove` which corresponds to the natural logarithm of the number of units sold. To get the number of units sold, you need to apply an exponential transform to this column. +- **storedemo.csv** - Demographic information on those stores. This table has 83 rows and 13 columns. For every store, the table describes demographic information of its consumers, distance to the nearest warehouse store, average distance to the nearest 5 supermarkets, ratio of its sales to the nearest warehouse store, and ratio of its sales to the average of the nearest 5 stores. + +Note that the week number starts from 40 in this dataset, while the full Dominick's dataset has data starting from week 1 to week 400. According to [Dominick's Data Manual](https://www.chicagobooth.edu/-/media/enterprise/centers/kilts/datasets/dominicks-dataset/dominicks-manual-and-codebook_kiltscenter.aspx), week 1 starts on 09/14/1989. Please see pages 40 and 41 of the [bayesm reference manual](https://cran.r-project.org/web/packages/bayesm/bayesm.pdf) and the [Dominick's Data Manual](https://www.chicagobooth.edu/-/media/enterprise/centers/kilts/datasets/dominicks-dataset/dominicks-manual-and-codebook_kiltscenter.aspx) for more details about the data. + + + +## Summary + +The following summarizes each directory of the forecasting examples. + +| Directory | Content | Description | +| --- | --- | --- | +| [python](./python)| [00_quick_start/](./python/00_quick_start)
[01_prepare_data/](./python/01_prepare_data)
[02_model/](./python/02_model)
[03_model_tune_deploy/](./python/03_model_tune_deploy/) |
  • Quick start examples for single-round training
  • Data exploration and preparation notebooks
  • Multi-round training examples
  • Model tuning and deployment example
| +| [R](./R) | [01_dataprep.Rmd](R/01_dataprep.Rmd)
[02_basic_models.Rmd](R/02_basic_models.Rmd)
[02a_reg_models.Rmd](R/02a_reg_models.Rmd)
[02b_prophet_models.Rmd](R/02b_prophet_models.Rmd) |
  • Data preparation
  • Basic time series models
  • ARIMA-regression models
  • Prophet models
| + diff --git a/examples/00_quick_start/auto_arima_forecasting.ipynb b/examples/grocery_sales/python/00_quick_start/autoarima_single_round.ipynb similarity index 99% rename from examples/00_quick_start/auto_arima_forecasting.ipynb rename to examples/grocery_sales/python/00_quick_start/autoarima_single_round.ipynb index 06106a7e..c403d8a7 100644 --- a/examples/00_quick_start/auto_arima_forecasting.ipynb +++ b/examples/grocery_sales/python/00_quick_start/autoarima_single_round.ipynb @@ -164,19 +164,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Our data preparation for the training and test set include the following steps:\n", - "\n", - "- The unit sales of orange juice are give in logarithmic scale. We will transfrom them back into the unit scale by applying `math.exp()`\n", - "- Our time series data is not complete, since we have missing sales for some stores/products and weeks. We will fill in those missing values by propagating the last valid observation forward to next available value.\n", - "\n", - "Note that our time series are grouped by `store` and `brand`, while `week` represents a time step, and `move` represents the value to predict." + "### Process training data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Process training data" + "Our time series data is not complete, since we have missing sales for some stores/products and weeks. We will fill in those missing values by propagating the last valid observation forward to next available value. We will define functions for data frame processing, then use these functions within a loop that loops over each forecasting rounds.\n", + "\n", + "Note that our time series are grouped by `store` and `brand`, while `week` represents a time step, and `logmove` represents the value to predict." ] }, { @@ -477,7 +474,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's now process the test data. 
Note that the test data runs from `LAST_WEEK - HORIZON + 1` to `LAST_WEEK`. Note that we are converting unit sales below from logarithmic scale to the counts, as we will be using counts to calculate the evaluation metrics." + "Let's now process the test data. Note that the test data runs from `LAST_WEEK - HORIZON + 1` to `LAST_WEEK`. Note that, in addition to filling out missing values, we also convert unit sales from logarithmic scale to the counts. We will do model training on the log scale, due to improved performance; however, we will transform the test data back into the unit scale (counts) by applying `math.exp()`, so that we can evaluate the performance on the unit scale." ] }, { diff --git a/examples/grocery_sales/python/00_quick_start/azure_automl_single_round.ipynb b/examples/grocery_sales/python/00_quick_start/azure_automl_single_round.ipynb new file mode 100644 index 00000000..027a3814 --- /dev/null +++ b/examples/grocery_sales/python/00_quick_start/azure_automl_single_round.ipynb @@ -0,0 +1,1790 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation.\n", + "\n", + "Licensed under the MIT License. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Automated Machine Learning (AutoML) on Azure for Retail Sales Forecasting\n", + "\n", + "This notebook demonstrates how to apply [AutoML in Azure Machine Learning services](https://docs.microsoft.com/en-us/azure/machine-learning/concept-automated-ml) to train and tune machine learning models for forecasting product sales in retail. We will use the Orange Juice dataset to illustrate the steps of utilizing AutoML as well as how to combine an AutoML model with a custom model for better performance.\n", + "\n", + "AutoML is a process of automating the tasks of machine learning model development. 
It helps data scientists and other practitioners build machine learning models with high scalability and quality in less time. AutoML in Azure Machine Learning allows you to train and tune a model using a target metric that you specify. This service iterates through machine learning algorithms and feature selection approaches, producing a score that measures the quality of each machine learning pipeline. The best model will then be selected based on the scores. For more technical details about Azure AutoML, please check [this paper](https://papers.nips.cc/paper/7595-probabilistic-matrix-factorization-for-automated-machine-learning.pdf).\n", + "\n", + "This notebook uses [Azure ML SDK](https://docs.microsoft.com/en-us/python/api/overview/azureml-sdk/?view=azure-ml-py) which is included in the `forecasting_env` conda environment. If you are running in Azure Notebooks or another Microsoft managed environment, the SDK is already installed. On the other hand, if you are running this notebook in your own environment, please follow [SDK installation instructions](https://docs.microsoft.com/azure/machine-learning/service/how-to-configure-environment) to install the SDK." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Global Settings and Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "System version: 3.6.10 |Anaconda, Inc.| (default, Jan 7 2020, 21:14:29) \n", + "[GCC 7.3.0]\n", + "This notebook was created using version 1.0.85 of the Azure ML SDK\n", + "You are currently using version 1.0.85 of the Azure ML SDK\n" + ] + } + ], + "source": [ + "import os\n", + "import sys\n", + "import math\n", + "import warnings\n", + "import datetime\n", + "import logging\n", + "import azureml.core\n", + "import azureml.automl\n", + "import pandas as pd\n", + "\n", + "from matplotlib import pyplot as plt\n", + "from fclib.common.utils import git_repo_path\n", + "from fclib.azureml.azureml_utils import (\n", + " get_or_create_workspace,\n", + " get_or_create_amlcompute,\n", + ")\n", + "from fclib.dataset.ojdata import download_ojdata, FIRST_WEEK_START\n", + "from fclib.common.utils import align_outputs\n", + "from fclib.evaluation.evaluation_utils import MAPE\n", + "from fclib.models.multiple_linear_regression import fit, predict\n", + "\n", + "from azureml.core import Workspace\n", + "from azureml.core.dataset import Dataset\n", + "from azureml.core.experiment import Experiment\n", + "from automl.client.core.common import constants\n", + "from azureml.train.automl import AutoMLConfig\n", + "from azureml.core.compute import ComputeTarget, AmlCompute\n", + "from azureml.automl.core._vendor.automl.client.core.common import metrics\n", + "\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "print(\"System version: {}\".format(sys.version))\n", + "print(\"This notebook was created using version 1.0.85 of the Azure ML 
SDK\")\n", + "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Use False if you've already downloaded and split the data\n", + "DOWNLOAD_SPLIT_DATA = True\n", + "\n", + "# Data directory\n", + "DATA_DIR = os.path.join(git_repo_path(), \"ojdata\")\n", + "\n", + "# Forecasting settings\n", + "GAP = 2\n", + "LAST_WEEK = 138\n", + "\n", + "# Number of test periods\n", + "NUM_TEST_PERIODS = 3\n", + "\n", + "# Column names\n", + "time_column_name = \"week_start\"\n", + "target_column_name = \"move\"\n", + "grain_column_names = [\"store\", \"brand\"]\n", + "index_column_names = [time_column_name] + grain_column_names\n", + "\n", + "# Subset of stores used in the notebook\n", + "USE_STORES = [2, 5, 8]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up Azure Machine Learning Workspace\n", + "\n", + "An Azure ML workspace is an Azure resource that organizes and coordinates the actions of many other Azure resources to assist in executing and sharing machine learning workflows. In particular, an Azure ML workspace coordinates storage, databases, and compute resources providing added functionality for machine learning experimentation, deployment, inference, and the monitoring of deployed models. To create an Azure ML workspace, first you need access to an Azure subscription. An Azure subscription allows you to manage storage, compute, and other assets in the Azure cloud. You can [create a new subscription](https://azure.microsoft.com/en-us/free/) or access existing subscription information from the [Azure portal](https://portal.azure.com/). Given that you have access to your Azure subscription, you can further create an Azure ML workspace by following the instructions [here](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-manage-workspace). 
You can also do so [using Azure CLI](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-manage-workspace-cli) or the `Workspace.create()` method in Azure SDK.\n", + "\n", + "Once you have created an Azure ML workspace, you can download its configuration file (`config.json`) from Azure Portal as follows\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare Azure ML Workspace\n", + "\n", + "In the following cell, `get_or_create_workspace()` creates a workspace object from the details stored in `config.json` that you have downloaded. We assume that you store this config file to a directory `./.azureml`. In case the existing workspace cannot be loaded, the following cell will try to create a new workspace with the subscription ID, resource group, and workspace name as specified in the beginning of the cell.\n", + "\n", + "The cell can fail if you don't have permission to access the workspace. You may need to log into your Azure account and change the default subscription to the one which the workspace belongs to using Azure CLI `az account set --subscription `." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Workspace preparation succeeded.\n" + ] + } + ], + "source": [ + "# Please specify the AzureML workspace attributes below if you want to create a new one.\n", + "subscription_id = \"\"\n", + "resource_group = \"\"\n", + "workspace_name = \"\"\n", + "workspace_region = \"\"\n", + "\n", + "# Connect to a workspace\n", + "ws = get_or_create_workspace(\n", + " config_path=\"./.azureml\",\n", + " subscription_id=subscription_id,\n", + " resource_group=resource_group,\n", + " workspace_name=workspace_name,\n", + " workspace_region=workspace_region,\n", + ")\n", + "print(\n", + " \"Workspace name: \" + ws.name,\n", + " \"Azure region: \" + ws.location,\n", + " \"Resource group: \" + ws.resource_group,\n", + " sep=\"\\n\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create compute resources for your experiments\n", + "\n", + "We run AutoML on a dynamically scalable compute cluster. In the next cell, we create an AmlCompute target with a specific cluster name, VM size, and maximum number of nodes if the cluster does not exist. Otherwise, we will reuse an existing one. For more options of VM sizes, please check the information in this [link](https://docs.microsoft.com/en-us/azure/virtual-machines/sizes-general)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found existing cpu-cluster\n" + ] + } + ], + "source": [ + "# Choose a name for your cluster\n", + "cluster_name = \"cpu-cluster\"\n", + "# VM Size\n", + "vm_size = \"STANDARD_D2_V2\"\n", + "# Maximum number of nodes of the cluster\n", + "max_nodes = 4\n", + "\n", + "# Create a new AmlCompute if it does not exist or reuse an existing one\n", + "cpu_cluster = get_or_create_amlcompute(\n", + " workspace=ws,\n", + " compute_name=cluster_name,\n", + " vm_size=vm_size,\n", + " min_nodes=0,\n", + " max_nodes=max_nodes,\n", + " verbose=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Experiment\n", + "\n", + "To run AutoML, you need to create an Experiment. An Experiment corresponds to a prediction problem you are trying to solve, while a Run corresponds to a specific approach to the problem." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SDK version1.0.85
Workspacechhamlws
SKUBasic
Resource Groupchhamlwsrg
Locationwestcentralus
Run History Nameautoml-ojforecasting
\n", + "
" + ], + "text/plain": [ + " \n", + "SDK version 1.0.85 \n", + "Workspace chhamlws \n", + "SKU Basic \n", + "Resource Group chhamlwsrg \n", + "Location westcentralus \n", + "Run History Name automl-ojforecasting" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# choose a name for the run history container in the workspace\n", + "experiment_name = \"automl-ojforecasting\"\n", + "\n", + "experiment = Experiment(ws, experiment_name)\n", + "\n", + "output = {}\n", + "output[\"SDK version\"] = azureml.core.VERSION\n", + "output[\"Workspace\"] = ws.name\n", + "output[\"SKU\"] = ws.sku\n", + "output[\"Resource Group\"] = ws.resource_group\n", + "output[\"Location\"] = ws.location\n", + "output[\"Run History Name\"] = experiment_name\n", + "pd.set_option(\"display.max_colwidth\", -1)\n", + "outputDf = pd.DataFrame(data=output, index=[\"\"])\n", + "outputDf.T" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preparation\n", + "\n", + "We need to download the Orange Juice data and split it into training and test sets. By default, the following cell will download and spit the data. If you've already done so, you may skip this part by switching `DOWNLOAD_SPLIT_DATA` to `False`.\n", + "\n", + "We store the training data and test data using dataframes. The training data includes `train_df` and `aux_df` with `train_df` containing the historical sales up to week 135 (the time we make forecasts) and `aux_df` containing price/promotion information up until week 138. We assume that future price and promotion information up to a certain number of weeks ahead is predetermined and known. The test data is stored in `test_df` which contains the sales of each product in week 137 and 138. Assuming the current week is week 135, our goal is to forecast the sales in week 137 and 138 using the training data. 
There is a one-week gap between the current week and the first target week of forecasting as we want to leave time for planning inventory in practice." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data download and split" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data already exists at the specified location.\n" + ] + } + ], + "source": [ + "if DOWNLOAD_SPLIT_DATA:\n", + " download_ojdata(DATA_DIR)\n", + " df = pd.read_csv(os.path.join(DATA_DIR, \"yx.csv\"))\n", + " df = df.loc[df.week <= LAST_WEEK]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Convert logarithm of the unit sales to unit sales\n", + "df[\"move\"] = df[\"logmove\"].apply(lambda x: round(math.exp(x)))\n", + "# Add timestamp column\n", + "df[\"week_start\"] = df[\"week\"].apply(lambda x: FIRST_WEEK_START + datetime.timedelta(days=(x - 1) * 7))\n", + "# Select a subset of stores for demo purpose\n", + "df_sub = df[df.store.isin(USE_STORES)]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Split data into training and test sets\n", + "def split_last_n_by_grain(df, n):\n", + " \"\"\"Group df by grain and split on last n rows for each group.\"\"\"\n", + " df_grouped = df.sort_values(time_column_name).groupby( # Sort by ascending time\n", + " grain_column_names, group_keys=False\n", + " )\n", + " df_head = df_grouped.apply(lambda dfg: dfg.iloc[:-n])\n", + " df_tail = df_grouped.apply(lambda dfg: dfg.iloc[-n:])\n", + " return df_head, df_tail\n", + "\n", + "\n", + "train_df, test_df = split_last_n_by_grain(df_sub, NUM_TEST_PERIODS)\n", + "train_df.reset_index(drop=True)\n", + "test_df.reset_index(drop=True)\n", + "\n", + "# Save data locally\n", + "local_data_pathes = [\n", + " os.path.join(DATA_DIR, \"train.csv\"),\n", + " 
os.path.join(DATA_DIR, \"test.csv\"),\n", + "]\n", + "\n", + "train_df.to_csv(local_data_pathes[0], index=None, header=True)\n", + "test_df.to_csv(local_data_pathes[1], index=None, header=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Upload data to datastore\n", + "\n", + "The [Machine Learning service workspace](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-workspace), is paired with the storage account, which contains the default data store. We will use it to upload the train and test data and create [tabular datasets](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset?view=azure-ml-py) for training and testing. A tabular dataset defines a series of lazily-evaluated, immutable operations to load data from the data source into tabular representation.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Uploading an estimated of 2 files\n", + "Uploading /data/home/chenhui/work/forecasting/ojdata/test.csv\n", + "Uploading /data/home/chenhui/work/forecasting/ojdata/train.csv\n", + "Uploaded /data/home/chenhui/work/forecasting/ojdata/test.csv, 1 files out of an estimated total of 2\n", + "Uploaded /data/home/chenhui/work/forecasting/ojdata/train.csv, 2 files out of an estimated total of 2\n", + "Uploaded 2 files\n" + ] + }, + { + "data": { + "text/plain": [ + "$AZUREML_DATAREFERENCE_1f003008a69b4030b4c6165a27ca7f24" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "datastore = ws.get_default_datastore()\n", + "datastore.upload_files(files=local_data_pathes, target_path=\"dataset/\", overwrite=True, show_progress=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create dataset for training" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": 
{}, + "outputs": [], + "source": [ + "train_dataset = Dataset.Tabular.from_delimited_files(path=datastore.path(\"dataset/train.csv\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
storebrandweeklogmoveconstantprice1price2price3price4price5...price7price8price9price10price11dealfeatprofitmoveweek_start
297681113110.4010.030.040.040.030.03...0.040.030.020.020.0200.005.52330241992-03-12
297781113210.3910.030.040.040.040.03...0.030.030.020.020.0211.005.48323841992-03-19
29788111339.3710.050.040.040.030.04...0.030.030.020.020.0200.005.38117761992-03-26
29798111349.3410.040.040.040.030.03...0.040.030.020.020.0200.007.16113921992-04-02
298081113510.5110.040.040.040.040.03...0.040.030.030.020.0211.008.29368641992-04-09
\n", + "

5 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " store brand week logmove constant price1 price2 price3 price4 \\\n", + "2976 8 11 131 10.40 1 0.03 0.04 0.04 0.03 \n", + "2977 8 11 132 10.39 1 0.03 0.04 0.04 0.04 \n", + "2978 8 11 133 9.37 1 0.05 0.04 0.04 0.03 \n", + "2979 8 11 134 9.34 1 0.04 0.04 0.04 0.03 \n", + "2980 8 11 135 10.51 1 0.04 0.04 0.04 0.04 \n", + "\n", + " price5 ... price7 price8 price9 price10 price11 deal \\\n", + "2976 0.03 ... 0.04 0.03 0.02 0.02 0.02 0 \n", + "2977 0.03 ... 0.03 0.03 0.02 0.02 0.02 1 \n", + "2978 0.04 ... 0.03 0.03 0.02 0.02 0.02 0 \n", + "2979 0.03 ... 0.04 0.03 0.02 0.02 0.02 0 \n", + "2980 0.03 ... 0.04 0.03 0.03 0.02 0.02 1 \n", + "\n", + " feat profit move week_start \n", + "2976 0.00 5.52 33024 1992-03-12 \n", + "2977 1.00 5.48 32384 1992-03-19 \n", + "2978 0.00 5.38 11776 1992-03-26 \n", + "2979 0.00 7.16 11392 1992-04-02 \n", + "2980 1.00 8.29 36864 1992-04-09 \n", + "\n", + "[5 rows x 21 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_dataset.to_pandas_dataframe().tail()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Modeling\n", + "\n", + "For forecasting tasks, AutoML uses pre-processing and estimation steps that are specific to time-series. AutoML will undertake the following pre-processing steps:\n", + "* Detect time-series sample frequency (e.g. hourly, daily, weekly) and create new records for absent time points to make the series regular. 
A regular time series has a well-defined frequency and has a value at every sample point in a contiguous time span\n", + "* Impute missing values in the target (via forward-fill) and feature columns (using median column values)\n", + "* Create grain-based features to enable fixed effects across different series\n", + "* Create time-based features to assist in learning seasonal patterns\n", + "* Encode categorical variables to numeric quantities\n", + "\n", + "In this notebook, AutoML will train a single, regression-type model across all time-series in a given training set. This allows the model to generalize across related series. To create a training job, we use AutoML Config object to define the settings and data. Here is a summary of the meanings of the AutoMLConfig parameters:\n", + "\n", + "|Property|Description|\n", + "|-|-|\n", + "|**task**|forecasting|\n", + "|**primary_metric**|This is the metric that you want to optimize.
Forecasting supports the following primary metrics
spearman_correlation
normalized_root_mean_squared_error
r2_score
normalized_mean_absolute_error\n", + "|**experiment_timeout_hours**|Experimentation timeout in hours.|\n", + "|**enable_early_stopping**|If early stopping is on, training will stop when the primary metric is no longer improving.|\n", + "|**training_data**|Input dataset, containing both features and label column.|\n", + "|**label_column_name**|The name of the label column.|\n", + "|**compute_target**|The remote compute for training.|\n", + "|**n_cross_validations**|Number of cross-validation folds to use for model/pipeline selection|\n", + "|**enable_voting_ensemble**|Allow AutoML to create a Voting ensemble of the best performing models|\n", + "|**enable_stack_ensemble**|Allow AutoML to create a Stack ensemble of the best performing models|\n", + "|**debug_log**|Log file path for writing debugging information|\n", + "|**time_column_name**|Name of the datetime column in the input data|\n", + "|**grain_column_names**|Name(s) of the columns defining individual series in the input data|\n", + "|**drop_column_names**|Name(s) of columns to drop prior to modeling|\n", + "|**max_horizon**|Maximum desired forecast horizon in units of time-series frequency|" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model training" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "time_series_settings = {\n", + " \"time_column_name\": time_column_name,\n", + " \"grain_column_names\": grain_column_names,\n", + " \"drop_column_names\": [\"logmove\"], # 'logmove' is a leaky feature, so we remove it.\n", + " \"max_horizon\": NUM_TEST_PERIODS,\n", + "}\n", + "\n", + "automl_config = AutoMLConfig(\n", + " task=\"forecasting\",\n", + " debug_log=\"automl_oj_sales_errors.log\",\n", + " primary_metric=\"normalized_mean_absolute_error\",\n", + " experiment_timeout_hours=1.0, # You may increase this number to improve model accuracy\n", + " training_data=train_dataset,\n", + " 
label_column_name=target_column_name,\n", + " compute_target=cpu_cluster,\n", + " enable_early_stopping=True,\n", + " n_cross_validations=3,\n", + " verbosity=logging.INFO,\n", + " **time_series_settings\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
ExperimentIdTypeStatusDetails PageDocs Page
automl-ojforecastingAutoML_45710381-d3fc-47c9-816d-9874c41b5355automlStartingLink to Azure Machine Learning studioLink to Documentation
" + ], + "text/plain": [ + "Run(Experiment: automl-ojforecasting,\n", + "Id: AutoML_45710381-d3fc-47c9-816d-9874c41b5355,\n", + "Type: automl,\n", + "Status: Starting)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "remote_run = experiment.submit(automl_config, show_output=False)\n", + "remote_run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "remote_run.wait_for_completion()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Retrieve the best model\n", + "\n", + "Each run within an Experiment stores serialized (i.e. pickled) pipelines from the AutoML iterations. After the training job is done, we can retrieve the pipeline with the best performance on the validation dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('timeseriestransformer', TimeSeriesTransformer(logger=None,\n", + " pipeline_type=)), ('MinMaxScaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('GradientBoostingRegressor', GradientBoostingRegressor(alpha=0.9, criterion='mse', init=None,\n", + " learning_rate=0.1, loss='huber', max_depth=10,\n", + " max_features='sqrt', max_leaf_nodes=None,\n", + " min_impurity_decrease=0.0, min_impurity_split=None,\n", + " min_samples_leaf=0.15874989977926784,\n", + " min_samples_split=0.10734188827013527,\n", + " min_weight_fraction_leaf=0.0, n_estimators=50,\n", + " n_iter_no_change=None, presort='auto', random_state=None,\n", + " subsample=0.95, tol=0.0001, validation_fraction=0.1,\n", + " verbose=0, warm_start=False))]\n" + ] + } + ], + "source": [ + "best_run, fitted_model = remote_run.get_output()\n", + "print(fitted_model.steps)\n", + "model_name = best_run.properties[\"model_name\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 
Forecasting\n", + "\n", + "Now that we have retrieved the best model pipeline, we can apply it to generate forecasts for the target weeks. To do this, we first remove the target values from the test set" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Generate forecasts" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "X_test = test_df\n", + "y_test = X_test.pop(target_column_name).values" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
storebrandweeklogmoveconstantprice1price2price3price4price5price6price7price8price9price10price11dealfeatprofitweek_start
85211368.5910.050.050.050.050.040.050.030.040.030.020.0300.0033.541992-04-16
86211379.1910.040.050.050.040.030.040.030.040.040.020.0300.0020.431992-04-23
87211389.7410.040.040.050.040.040.050.040.040.040.030.0311.0011.291992-04-30
195221369.1410.050.050.050.050.040.050.030.040.030.020.0310.0027.131992-04-16
196221378.7410.040.050.050.040.030.040.030.040.040.020.0300.0033.301992-04-23
\n", + "
" + ], + "text/plain": [ + " store brand week logmove constant price1 price2 price3 price4 \\\n", + "85 2 1 136 8.59 1 0.05 0.05 0.05 0.05 \n", + "86 2 1 137 9.19 1 0.04 0.05 0.05 0.04 \n", + "87 2 1 138 9.74 1 0.04 0.04 0.05 0.04 \n", + "195 2 2 136 9.14 1 0.05 0.05 0.05 0.05 \n", + "196 2 2 137 8.74 1 0.04 0.05 0.05 0.04 \n", + "\n", + " price5 price6 price7 price8 price9 price10 price11 deal feat \\\n", + "85 0.04 0.05 0.03 0.04 0.03 0.02 0.03 0 0.00 \n", + "86 0.03 0.04 0.03 0.04 0.04 0.02 0.03 0 0.00 \n", + "87 0.04 0.05 0.04 0.04 0.04 0.03 0.03 1 1.00 \n", + "195 0.04 0.05 0.03 0.04 0.03 0.02 0.03 1 0.00 \n", + "196 0.03 0.04 0.03 0.04 0.04 0.02 0.03 0 0.00 \n", + "\n", + " profit week_start \n", + "85 33.54 1992-04-16 \n", + "86 20.43 1992-04-23 \n", + "87 11.29 1992-04-30 \n", + "195 27.13 1992-04-16 \n", + "196 33.30 1992-04-23 " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_test.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "# The featurized data, aligned to y, will also be returned. It contains the assumptions\n", + "# that were made in the forecast and helps align the forecast to the original data.\n", + "y_predictions, X_trans = fitted_model.forecast(X_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We need to align the output explicitly to the input, as the count and order of the rows may have changed during transformations that span multiple rows." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
week_startstorebrandpredictedweeklogmoveconstantprice1price2price3...price6price7price8price9price10price11dealfeatprofitmove
01992-04-16214075.571368.5910.050.050.05...0.050.030.040.030.020.0300.0033.545376
11992-04-16227212.491369.1410.050.050.05...0.050.030.040.030.020.0310.0027.139312
21992-04-16234075.571367.8510.050.050.05...0.050.030.040.030.020.0300.0032.552560
31992-04-16244011.431367.4210.050.050.05...0.050.030.040.030.020.0300.0034.981664
41992-04-16254336.831368.5910.050.050.05...0.050.030.040.030.020.0300.0028.805376
\n", + "

5 rows × 22 columns

\n", + "
" + ], + "text/plain": [ + " week_start store brand predicted week logmove constant price1 \\\n", + "0 1992-04-16 2 1 4075.57 136 8.59 1 0.05 \n", + "1 1992-04-16 2 2 7212.49 136 9.14 1 0.05 \n", + "2 1992-04-16 2 3 4075.57 136 7.85 1 0.05 \n", + "3 1992-04-16 2 4 4011.43 136 7.42 1 0.05 \n", + "4 1992-04-16 2 5 4336.83 136 8.59 1 0.05 \n", + "\n", + " price2 price3 ... price6 price7 price8 price9 price10 price11 \\\n", + "0 0.05 0.05 ... 0.05 0.03 0.04 0.03 0.02 0.03 \n", + "1 0.05 0.05 ... 0.05 0.03 0.04 0.03 0.02 0.03 \n", + "2 0.05 0.05 ... 0.05 0.03 0.04 0.03 0.02 0.03 \n", + "3 0.05 0.05 ... 0.05 0.03 0.04 0.03 0.02 0.03 \n", + "4 0.05 0.05 ... 0.05 0.03 0.04 0.03 0.02 0.03 \n", + "\n", + " deal feat profit move \n", + "0 0 0.00 33.54 5376 \n", + "1 1 0.00 27.13 9312 \n", + "2 0 0.00 32.55 2560 \n", + "3 0 0.00 34.98 1664 \n", + "4 0 0.00 28.80 5376 \n", + "\n", + "[5 rows x 22 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pred_automl = align_outputs(y_predictions, X_trans, X_test, y_test, target_column_name)\n", + "pred_automl.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Results evaluation & visualization" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Test data scores]\n", + "\n", + "explained_variance: 0.187\n", + "r2_score: 0.172\n", + "spearman_correlation: 0.703\n", + "mean_absolute_percentage_error: 117.345\n", + "mean_absolute_error: 6624.722\n", + "normalized_mean_absolute_error: 0.046\n", + "median_absolute_error: 3048.760\n", + "normalized_median_absolute_error: 0.021\n", + "root_mean_squared_error: 16663.119\n", + "normalized_root_mean_squared_error: 0.115\n", + "root_mean_squared_log_error: 0.890\n", + "normalized_root_mean_squared_log_error: 0.140\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Use automl metrics module\n", + "scores = metrics.compute_metrics_regression(\n", + " pred_automl[\"predicted\"],\n", + " pred_automl[target_column_name],\n", + " list(constants.Metric.SCALAR_REGRESSION_SET),\n", + " None,\n", + " None,\n", + " None,\n", + ")\n", + "\n", + "print(\"[Test data scores]\\n\")\n", + "for key, value in scores.items():\n", + " print(\"{}: {:.3f}\".format(key, value))\n", + "\n", + "# Plot outputs\n", + "test_pred = plt.scatter(pred_automl[target_column_name], pred_automl[\"predicted\"], color=\"b\")\n", + "test_test = plt.scatter(pred_automl[target_column_name], pred_automl[target_column_name], color=\"g\")\n", + "plt.legend((test_pred, test_test), (\"prediction\", \"truth\"), loc=\"upper left\", fontsize=8)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We also compute MAPE of the forecasts in the last two weeks of the forecast period in order to be consistent with the evaluation period that is used in other quick start examples." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MAPE of forecasts obtained by AutoML in the last two weeks: 124.10334037331717\n" + ] + } + ], + "source": [ + "pred_automl_sub = pred_automl.loc[pred_automl.week >= max(test_df.week) - NUM_TEST_PERIODS + GAP]\n", + "mape_automl_sub = MAPE(pred_automl_sub[\"predicted\"], pred_automl_sub[\"move\"]) * 100\n", + "print(\"MAPE of forecasts obtained by AutoML in the last two weeks: \" + str(mape_automl_sub))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Combine AutoML Model with a Custom Model\n", + "\n", + "So far we have demonstrated how we can quickly build a forecasting model with AutoML in Azure. 
Next, we further show a simple way to achieve more robust and accurate forecasts by combining the forecasts from AutoML and a custom model that the user may have. Here we assume that the user has also constructed a series of linear regression models, with each model forecasting the sales of a specific store-brand using the `scikit-learn` package." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Multiple linear regression models" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "# Create price features\n", + "df_sub[\"price\"] = df_sub.apply(lambda x: x.loc[\"price\" + str(int(x.loc[\"brand\"]))], axis=1)\n", + "price_cols = [\n", + " \"price1\",\n", + " \"price2\",\n", + " \"price3\",\n", + " \"price4\",\n", + " \"price5\",\n", + " \"price6\",\n", + " \"price7\",\n", + " \"price8\",\n", + " \"price9\",\n", + " \"price10\",\n", + " \"price11\",\n", + "]\n", + "df_sub[\"avg_price\"] = df_sub[price_cols].sum(axis=1).apply(lambda x: x / len(price_cols))\n", + "df_sub[\"price_ratio\"] = df_sub.apply(lambda x: x[\"price\"] / x[\"avg_price\"], axis=1)\n", + "\n", + "# Create lag features on unit sales\n", + "df_sub[\"move_lag1\"] = df_sub[\"move\"].shift(1)\n", + "df_sub[\"move_lag2\"] = df_sub[\"move\"].shift(2)\n", + "\n", + "# Drop rows with NaN values\n", + "df_sub.dropna(inplace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After splitting the data, we use the `fit()` and `predict()` functions from `fclib.models.multiple_linear_regression` to train a separate linear regression model for each individual time series and generate forecasts for the sales during the test period." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
week_startpredictionstorebrandweeklogmoveconstantprice1price2price3...price11dealfeatprofitmovepriceavg_priceprice_ratiomove_lag1move_lag2
01992-04-1612507211368.5910.050.050.05...0.0300.0033.5453760.050.041.2712416.0028096.00
11992-04-2317664211379.1910.040.050.05...0.0300.0020.4397920.040.041.115376.0012416.00
21992-04-3021670211389.7410.040.040.05...0.0311.0011.29169600.040.040.949792.005376.00
31992-04-169551221369.1410.050.050.05...0.0310.0027.1393120.050.041.2111424.004992.00
41992-04-237452221378.7410.040.050.05...0.0300.0033.3062400.050.041.399312.0011424.00
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " week_start prediction store brand week logmove constant price1 \\\n", + "0 1992-04-16 12507 2 1 136 8.59 1 0.05 \n", + "1 1992-04-23 17664 2 1 137 9.19 1 0.04 \n", + "2 1992-04-30 21670 2 1 138 9.74 1 0.04 \n", + "3 1992-04-16 9551 2 2 136 9.14 1 0.05 \n", + "4 1992-04-23 7452 2 2 137 8.74 1 0.04 \n", + "\n", + " price2 price3 ... price11 deal feat profit move price \\\n", + "0 0.05 0.05 ... 0.03 0 0.00 33.54 5376 0.05 \n", + "1 0.05 0.05 ... 0.03 0 0.00 20.43 9792 0.04 \n", + "2 0.04 0.05 ... 0.03 1 1.00 11.29 16960 0.04 \n", + "3 0.05 0.05 ... 0.03 1 0.00 27.13 9312 0.05 \n", + "4 0.05 0.05 ... 0.03 0 0.00 33.30 6240 0.05 \n", + "\n", + " avg_price price_ratio move_lag1 move_lag2 \n", + "0 0.04 1.27 12416.00 28096.00 \n", + "1 0.04 1.11 5376.00 12416.00 \n", + "2 0.04 0.94 9792.00 5376.00 \n", + "3 0.04 1.21 11424.00 4992.00 \n", + "4 0.04 1.39 9312.00 11424.00 \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Split data into training and test sets\n", + "train_df, test_df = split_last_n_by_grain(df_sub, NUM_TEST_PERIODS)\n", + "train_df.reset_index(drop=True)\n", + "test_df.reset_index(drop=True)\n", + "\n", + "# Train multiple linear regression models\n", + "fea_column_names = [\"move_lag1\", \"move_lag2\", \"price\", \"price_ratio\"]\n", + "lr_models = fit(train_df, grain_column_names, fea_column_names, target_column_name)\n", + "\n", + "# Generate forecasts with the trained models\n", + "pred_all = predict(test_df, lr_models, time_column_name, grain_column_names, fea_column_names)\n", + "\n", + "pred_lr = pd.merge(pred_all, test_df, on=index_column_names)\n", + "pred_lr.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's check the accuracy of the predictions on the entire forecast period as well as in the last two weeks of the forecast period.\n", + "\n", + "\n", + "\n" + ] + }, + { + 
"cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MAPE of forecasts obtained by multiple linear regression on entire test period: 83.90865445283927\n" + ] + } + ], + "source": [ + "mape_lr_entire = MAPE(pred_lr[\"prediction\"], pred_lr[\"move\"]) * 100\n", + "print(\"MAPE of forecasts obtained by multiple linear regression on entire test period: \" + str(mape_lr_entire))" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MAPE of forecasts obtained by multiple linear regression in the last two weeks: 72.11741385279376\n" + ] + } + ], + "source": [ + "pred_lr_sub = pred_lr.loc[pred_lr.week >= max(test_df.week) - NUM_TEST_PERIODS + GAP]\n", + "mape_lr_sub = MAPE(pred_lr_sub[\"prediction\"], pred_lr_sub[\"move\"]) * 100\n", + "print(\"MAPE of forecasts obtained by multiple linear regression in the last two weeks: \" + str(mape_lr_sub))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Combine forecasts from different methods\n", + "\n", + "We can combine the forecasts obtained by AutoML and multiple linear regression using weighted average and evaluate the final forecasts. Usually the combined forecasts will be more robust as a combination of two methods can reduce the chance of model overfitting. Here we use equal weights which can be further adjusted according to our confidence on each model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "pred_final = pd.merge(\n", + " pred_automl[index_column_names + [\"predicted\", \"move\", \"week\"]],\n", + " pred_lr[index_column_names + [\"prediction\"]],\n", + " on=index_column_names,\n", + " how=\"left\",\n", + ")\n", + "pred_final[\"combined_prediction\"] = pred_final[\"predicted\"] * 0.5 + pred_final[\"prediction\"] * 0.5" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MAPE of forecasts obtained by the combined model on entire test period: 87.2964359857758\n" + ] + } + ], + "source": [ + "mape_entire = MAPE(pred_final[\"combined_prediction\"], pred_final[\"move\"]) * 100\n", + "print(\"MAPE of forecasts obtained by the combined model on entire test period: \" + str(mape_entire))" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MAPE of forecasts obtained by the combined model in the last two weeks: 84.39534839261313\n" + ] + } + ], + "source": [ + "pred_final_sub = pred_final.loc[pred_final.week >= max(test_df.week) - NUM_TEST_PERIODS + GAP]\n", + "mape_final_sub = MAPE(pred_final_sub[\"combined_prediction\"], pred_final_sub[\"move\"]) * 100\n", + "print(\"MAPE of forecasts obtained by the combined model in the last two weeks: \" + str(mape_final_sub))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Additional Reading\n", + "\n", + "\\[1\\] Nicolo Fusi, Rishit Sheth, and Melih Elibol. 2018. Probabilistic Matrix Factorization for Automated Machine Learning. In Advances in Neural Information Processing Systems. 3348-3357.
\n", + "\\[2\\] Azure AutoML Package Docs: https://docs.microsoft.com/en-us/python/api/azureml-train-automl/azureml.train.automl?view=azure-ml-py
\n", + "\\[3\\] Azure Automated Machine Learning Examples: https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/automated-machine-learning
\n", + "\n", + "\n" + ] + } + ], + "metadata": { + "author_info": { + "affiliation": "Microsoft", + "created_by": "Chenhui Hu" + }, + "kernelspec": { + "display_name": "forecasting_env", + "language": "python", + "name": "forecasting_env" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/00_quick_start/lightgbm_point_forecast.ipynb b/examples/grocery_sales/python/00_quick_start/lightgbm_single_round.ipynb similarity index 99% rename from examples/00_quick_start/lightgbm_point_forecast.ipynb rename to examples/grocery_sales/python/00_quick_start/lightgbm_single_round.ipynb index 7a49baa7..72de6354 100644 --- a/examples/00_quick_start/lightgbm_point_forecast.ipynb +++ b/examples/grocery_sales/python/00_quick_start/lightgbm_single_round.ipynb @@ -6,7 +6,7 @@ "source": [ "Copyright (c) Microsoft Corporation.\n", "\n", - "Licensed under the MIT License." + "Licensed under the MIT License. 
" ] }, { @@ -104,7 +104,11 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "tags": [ + "parameters" + ] + }, "outputs": [], "source": [ "# Use False if you've already downloaded and split the data\n", @@ -587,6 +591,10 @@ } ], "metadata": { + "author_info": { + "affiliation": "Microsoft", + "created_by": "Chenhui Hu" + }, "kernelspec": { "display_name": "forecasting_env", "language": "python", diff --git a/examples/01_prepare_data/ojdata_exploration_retail.ipynb b/examples/grocery_sales/python/01_prepare_data/ojdata_exploration.ipynb similarity index 100% rename from examples/01_prepare_data/ojdata_exploration_retail.ipynb rename to examples/grocery_sales/python/01_prepare_data/ojdata_exploration.ipynb diff --git a/examples/01_prepare_data/ojdata_preparation_retail.ipynb b/examples/grocery_sales/python/01_prepare_data/ojdata_preparation.ipynb similarity index 99% rename from examples/01_prepare_data/ojdata_preparation_retail.ipynb rename to examples/grocery_sales/python/01_prepare_data/ojdata_preparation.ipynb index 86a9afb1..38508ec6 100644 --- a/examples/01_prepare_data/ojdata_preparation_retail.ipynb +++ b/examples/grocery_sales/python/01_prepare_data/ojdata_preparation.ipynb @@ -160,7 +160,7 @@ "For demonstration, this is what the time series split on the Orange Juice dataset looks like, for the parameters listed above.\n", "For `HORIZON = 2` and `GAP = 2`, assuming the current week is week `153`, our goal is to forecast the sales in week `155` and `156` using the training data. 
As you can see, the first forecasting week is `two` weeks away from the current week, as we want to leave time for planning inventory in practice.\n", "\n", - "![Single split](../../assets/time_series_split_singleround.jpg)\n", + "![Single split](../../../../assets/time_series_split_singleround.jpg)\n", "\n", "We also refer to splits as rounds, so for `N_SPLITS = 1`, we have single-round forecasting, and for `N_SPLITS > 1`, we have multi-round forecasting." ] @@ -1120,7 +1120,7 @@ "\n", "For demonstration, this is what the time series splits would look like for `N_SPLITS = 5`, and using other settings as above:\n", "\n", - "![Multi split](../../assets/time_series_split_multiround.jpg)\n" + "![Multi split](../../../../assets/time_series_split_multiround.jpg)\n" ] }, { diff --git a/examples/grocery_sales/python/02_model/autoarima_multi_round.ipynb b/examples/grocery_sales/python/02_model/autoarima_multi_round.ipynb new file mode 100644 index 00000000..d57f850b --- /dev/null +++ b/examples/grocery_sales/python/02_model/autoarima_multi_round.ipynb @@ -0,0 +1,588 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ARIMA: Autoregressive Integrated Moving Average\n", + "\n", + "This notebook provides an example of how to train an ARIMA model to generate point forecasts of product sales in retail. We will train an ARIMA based model on the Orange Juice dataset.\n", + "\n", + "An ARIMA, which stands for AutoRegressive Integrated Moving Average, model can be created using an `ARIMA(p,d,q)` model within `statsmodels` library. In this notebook, we will be using an alternative library `pmdarima`, which allows us to automatically search for optimal ARIMA parameters, within a specified range. 
More specifically, we will be using the `auto_arima` function within `pmdarima` to automatically discover the optimal parameters for an ARIMA model. This function wraps the `ARIMA` and `SARIMAX` models of the `statsmodels` library, which correspond to the non-seasonal and seasonal model space, respectively.\n", + "\n", + "In an ARIMA model there are 3 parameters that are used to help model the major aspects of a time series: seasonality, trend, and noise. These parameters are:\n", + "- **p** is the parameter associated with the auto-regressive aspect of the model, which incorporates past values.\n", + "- **d** is the parameter associated with the integrated part of the model, which affects the amount of differencing to apply to a time series.\n", + "- **q** is the parameter associated with the moving average part of the model.\n", + "\n", + "If our data has a seasonal component, we use a seasonal ARIMA model or `ARIMA(p,d,q)(P,D,Q)m`. In that case, we have an additional set of parameters: `P`, `D`, and `Q` which describe the autoregressive, differencing, and moving average terms for the seasonal part of the ARIMA model, and `m` refers to the number of periods in each season.\n", + "\n", + "We provide a [quick-start ARIMA example](../00_quick_start/auto_arima_forecasting.ipynb), in which we explain the process of using an ARIMA model to forecast a single time series, and analyze the model performance. Please take a look at this notebook for more information.\n", + "\n", + "In this notebook, we will train an ARIMA model on multiple splits (rounds) of the train/test data." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Global Settings and Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "System version: 3.6.10 |Anaconda, Inc.| (default, Jan 7 2020, 21:14:29) \n", + "[GCC 7.3.0]\n" + ] + } + ], + "source": [ + "import os\n", + "import sys\n", + "import math\n", + "import warnings\n", + "import itertools\n", + "import numpy as np\n", + "import pandas as pd\n", + "import scrapbook as sb\n", + "\n", + "from tqdm import tqdm\n", + "from pmdarima.arima import auto_arima\n", + "\n", + "from fclib.common.utils import git_repo_path\n", + "from fclib.common.plot import plot_predictions_with_history\n", + "from fclib.evaluation.evaluation_utils import MAPE\n", + "from fclib.dataset.ojdata import download_ojdata, split_train_test, complete_and_fill_df\n", + "\n", + "pd.options.display.float_format = \"{:,.2f}\".format\n", + "np.set_printoptions(precision=2)\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "print(\"System version: {}\".format(sys.version))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Parameters\n", + "\n", + "Next, we define global settings related to the model. We will use historical weekly sales data only, without any covariate features to train the ARIMA model. The model parameter ranges are provided in params. These are later used by the `auto_arima()` function to search the space for the optimal set of parameters. To increase the space of models to search over, increase the `max_p` and `max_q` parameters.\n", + "\n", + "> NOTE: Our data does not show a strong seasonal component (as demonstrated in data exploration example notebook), so we will not be searching over the seasonal ARIMA models. To learn more about the seasonal ARIMA models, please take a look at the quick start ARIMA notebook, referenced above in the introduction." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Use False if you've already downloaded and split the data\n", + "DOWNLOAD_SPLIT_DATA = True\n", + "\n", + "# Data directory\n", + "DATA_DIR = os.path.join(git_repo_path(), \"ojdata\")\n", + "\n", + "# Forecasting settings\n", + "N_SPLITS = 5\n", + "HORIZON = 2\n", + "GAP = 2\n", + "FIRST_WEEK = 40\n", + "LAST_WEEK = 156\n", + "\n", + "# Parameters of ARIMA model\n", + "params = {\n", + " \"seasonal\": False,\n", + " \"start_p\": 0,\n", + " \"start_q\": 0,\n", + " \"max_p\": 5,\n", + " \"max_q\": 5,\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preparation\n", + "\n", + "We need to download the Orange Juice data and split it into training and test sets. By default, the following cell will download and split the data. If you've already done so, you may skip this part by switching `DOWNLOAD_SPLIT_DATA` to `False`.\n", + "\n", + "We store the training data and test data using dataframes. The training data includes `train_df` and `aux_df` with `train_df` containing the historical sales up to week 135 (the time we make forecasts) and `aux_df` containing price/promotion information up until week 138. Here we assume that future price and promotion information up to a certain number of weeks ahead is predetermined and known. In our example, we will be using historical sales only, and will not be using the `aux_df` data. The test data is stored in `test_df` which contains the sales of each product in week 137 and 138. Assuming the current week is week 135, our goal is to forecast the sales in week 137 and 138 using the training data. There is a one-week gap between the current week and the first target week of forecasting as we want to leave time for planning inventory in practice.\n", + "\n", + "The setting of the forecast problem is defined in the `fclib.dataset.ojdata.split_train_test` function. 
We can change this setting (e.g., modify the horizon of the forecast or the range of the historical data) by passing different parameters to this function. Below, we split the data into `n_splits=N_SPLITS` splits, using the forecasting settings listed above in the **Parameters** section." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data already exists at the specified location.\n", + "Finished data downloading and splitting.\n" + ] + } + ], + "source": [ + "if DOWNLOAD_SPLIT_DATA:\n", + " download_ojdata(DATA_DIR)\n", + " train_df_list, test_df_list, _ = split_train_test(\n", + " DATA_DIR,\n", + " n_splits=N_SPLITS,\n", + " horizon=HORIZON,\n", + " gap=GAP,\n", + " first_week=FIRST_WEEK,\n", + " last_week=LAST_WEEK,\n", + " write_csv=True,\n", + " )\n", + "\n", + " print(\"Finished data downloading and splitting.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To create training data and test data for multi-round forecasting, we pass a number greater than `1` to the `n_splits` parameter in the `split_train_test()` function. Note that the forecasting periods we generate in each test round are **non-overlapping**. This allows us to evaluate the forecasting model on multiple rounds of data, and get a more robust estimate of our model's performance.\n", + "\n", + "For visual demonstration, this is what the time series splits would look like for `N_SPLITS = 5`, and using other settings as above:\n", + "\n", + "![Multi split](../../../../assets/time_series_split_multiround.jpg)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Process training data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Our time series data is not complete, since we have missing sales for some stores/products and weeks. 
def process_training_df(train_df):
    """Complete and fill the training series of one forecasting round.

    Keeps only the ``store``, ``brand``, ``week`` and ``logmove`` columns and
    passes them to ``complete_and_fill_df`` together with every store and brand
    present in the frame and the full span of weeks from ``FIRST_WEEK`` through
    the last training week (inclusive).

    Args:
        train_df (pd.DataFrame): Training data with at least the columns
            ``store``, ``brand``, ``week`` and ``logmove``.

    Returns:
        Whatever ``complete_and_fill_df`` returns for the selected columns.
    """
    train_df = train_df[["store", "brand", "week", "logmove"]]
    store_list = train_df["store"].unique()
    brand_list = train_df["brand"].unique()
    # range() excludes its stop value, so add 1 to keep the last training week.
    # Without it the series would end one week early and every forecast made
    # from it would be shifted by one week (process_test_df uses the same
    # inclusive bound).
    train_week_list = range(FIRST_WEEK, max(train_df.week) + 1)

    train_filled = complete_and_fill_df(train_df, stores=store_list, brands=brand_list, weeks=train_week_list)

    return train_filled
def process_test_df(test_df):
    """Convert the test data to unit sales and complete/fill the series.

    ``logmove`` is mapped to unit sales with ``round(math.exp(x))`` and stored
    in an ``actuals`` column. The result is passed to ``complete_and_fill_df``
    with every store and brand present in the frame and all weeks from the
    first to the last test week (inclusive).

    Args:
        test_df (pd.DataFrame): Test data with at least the columns ``store``,
            ``brand``, ``week`` and ``logmove``. The frame is not modified.

    Returns:
        Whatever ``complete_and_fill_df`` returns for the ``store``, ``brand``,
        ``week`` and ``actuals`` columns.
    """
    # Build the converted column on a new frame instead of writing into the
    # caller's DataFrame (avoids a hidden side effect and SettingWithCopy
    # warnings when the input is a slice).
    actuals = test_df["logmove"].apply(lambda x: round(math.exp(x)))
    test_df = test_df[["store", "brand", "week"]].assign(actuals=actuals)
    store_list = test_df["store"].unique()
    brand_list = test_df["brand"].unique()

    # + 1 because range() excludes its stop value
    test_week_list = range(min(test_df.week), max(test_df.week) + 1)
    test_filled = complete_and_fill_df(test_df, stores=store_list, brands=brand_list, weeks=test_week_list)

    return test_filled
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " 0%| | 0/5 [00:00 Note: Since `auto_arima` model makes consecutive forecasts from the last time point, we want to forecast the next `n_periods = GAP + HORIZON - 1` points, so that we can account for the GAP, as described in the data setup." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model evaluation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To evaluate the model, we will use *mean absolute percentage error* or [MAPE](https://en.wikipedia.org/wiki/Mean_absolute_percentage_error)." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MAPE values for each forecasting round:\n", + "round\n", + "1 57.72\n", + "2 77.08\n", + "3 63.12\n", + "4 74.93\n", + "5 73.70\n", + "dtype: float64\n" + ] + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 69.22142436904007, + "encoder": "json", + "name": "MAPE", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "MAPE" + } + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overall MAPE is 69.22 %\n" + ] + } + ], + "source": [ + "mape_r = result_df.groupby(\"round\").apply(lambda x: MAPE(x.predictions, x.actuals) * 100)\n", + "\n", + "print(\"MAPE values for each forecasting round:\")\n", + "print(mape_r)\n", + "\n", + "metric_value = MAPE(result_df.predictions, result_df.actuals) * 100\n", + "sb.glue(\"MAPE\", metric_value)\n", + "\n", + "print(f\"Overall MAPE is {metric_value:.2f} %\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The resulting MAPE value is relatively high. 
As `auto_arima` searches a restricted space of the models, defined by the range of `p` and `q` parameters, we often might not find an optimal model for each time series. In addition, when building a model for a large number of time series, it is often difficult to examine each model individually, which would usually help us improve an ARIMA model. Please refer to the [quick start ARIMA notebook](../00_quick_start/auto_arima_forecasting.ipynb) for a more comprehensive evaluation of a single ARIMA model.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's plot a few examples of forecasted results." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "num_samples = 6\n", + "min_week = 140\n", + "sales = pd.read_csv(os.path.join(DATA_DIR, \"yx.csv\"))\n", + "sales[\"move\"] = sales.logmove.apply(lambda x: round(math.exp(x)) if x > 0 else 0)\n", + "\n", + "result_df[\"move\"] = result_df.predictions\n", + "plot_predictions_with_history(\n", + " result_df,\n", + " sales,\n", + " grain1_unique_vals=store_list,\n", + " grain2_unique_vals=brand_list,\n", + " time_col_name=\"week\",\n", + " target_col_name=\"move\",\n", + " grain1_name=\"store\",\n", + " grain2_name=\"brand\",\n", + " min_timestep=min_week,\n", + " num_samples=num_samples,\n", + " predict_at_timestep=145,\n", + " line_at_predict_time=False,\n", + " title=\"Prediction results for a few sample time series\",\n", + " x_label=\"week\",\n", + " y_label=\"unit sales\",\n", + " random_seed=2,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "forecasting_env", + "language": "python", + "name": "forecasting_env" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/02_model/dilatedcnn_point_forecast_multiround.ipynb b/examples/grocery_sales/python/02_model/dilatedcnn_multi_round.ipynb similarity index 99% rename from examples/02_model/dilatedcnn_point_forecast_multiround.ipynb rename to examples/grocery_sales/python/02_model/dilatedcnn_multi_round.ipynb index 2ca1bf60..48a41f47 100644 --- a/examples/02_model/dilatedcnn_point_forecast_multiround.ipynb +++ b/examples/grocery_sales/python/02_model/dilatedcnn_multi_round.ipynb @@ -38,8 +38,7 @@ 
"metadata": {}, "outputs": [], "source": [ - "%load_ext tensorboard\n", - "%load_ext blackcellmagic" + "%load_ext tensorboard" ] }, { @@ -105,11 +104,15 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "tags": [ + "parameters" + ] + }, "outputs": [], "source": [ "# Use False if you've already downloaded and split the data\n", - "DOWNLOAD_SPLIT_DATA = False # True\n", + "DOWNLOAD_SPLIT_DATA = True\n", "\n", "# Data directories\n", "DATA_DIR = os.path.join(git_repo_path(), \"ojdata\")\n", @@ -241,7 +244,7 @@ " data_filled = pd.merge(data_grid, train_df, how=\"left\", on=[\"store\", \"brand\", \"week\"])\n", "\n", " # Get future price, deal, and advertisement info\n", - " aux_df = pd.read_csv(os.path.join(TRAIN_DIR, \"aux_\" + str(pred_round) + \".csv\"))\n", + " aux_df = pd.read_csv(os.path.join(TRAIN_DIR, \"auxi_\" + str(pred_round) + \".csv\"))\n", " data_filled = pd.merge(data_filled, aux_df, how=\"left\", on=[\"store\", \"brand\", \"week\"])\n", "\n", " # Create relative price feature\n", @@ -938,6 +941,10 @@ } ], "metadata": { + "author_info": { + "affiliation": "Microsoft", + "created_by": "Chenhui Hu" + }, "kernelspec": { "display_name": "forecasting_env", "language": "python", diff --git a/examples/02_model/lightgbm_point_forecast_multiround.ipynb b/examples/grocery_sales/python/02_model/lightgbm_multi_round.ipynb similarity index 99% rename from examples/02_model/lightgbm_point_forecast_multiround.ipynb rename to examples/grocery_sales/python/02_model/lightgbm_multi_round.ipynb index bcb2e620..33cb1567 100644 --- a/examples/02_model/lightgbm_point_forecast_multiround.ipynb +++ b/examples/grocery_sales/python/02_model/lightgbm_multi_round.ipynb @@ -103,7 +103,11 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "tags": [ + "parameters" + ] + }, "outputs": [], "source": [ "# Use False if you've already downloaded and split the data\n", @@ -244,7 +248,7 @@ " data_filled = pd.merge(data_grid, 
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""
Perform cross validation of a LightGBM forecasting model on the training data of the 1st forecast round.
"""

import os
import math
import argparse
import datetime
import numpy as np
import pandas as pd
import lightgbm as lgb
from azureml.core import Run
from sklearn.model_selection import train_test_split
from fclib.feature_engineering.feature_utils import week_of_month, df_from_cartesian_product, combine_features


FIRST_WEEK = 40
GAP = 2
HORIZON = 2
# The start datetime of the first week in the dataset
FIRST_WEEK_START = pd.to_datetime("1989-09-14 00:00:00")


def _read_round_csv(data_dir, stem, pred_round):
    """Read ``<stem>.csv`` from ``data_dir``, falling back to ``<stem>_<pred_round>.csv``."""
    default_file = os.path.join(data_dir, stem + ".csv")
    if os.path.isfile(default_file):
        return pd.read_csv(default_file)
    return pd.read_csv(os.path.join(data_dir, stem + "_" + str(pred_round) + ".csv"))


def create_features(pred_round, train_dir, lags, window_size, used_columns):
    """Create input features for model training and testing.

    Args:
        pred_round (int): Prediction round (1, 2, ...)
        train_dir (str): Path of the training data directory
        lags (np.array): Numpy array including all the lags
        window_size (int): Maximum step for computing the moving average
        used_columns (list[str]): A list of names of columns used in model training (including target variable)

    Returns:
        pd.DataFrame: Dataframe including all the input features and target variable
        int: Last week of the training data
    """

    # Load training data and convert log sales back to unit sales
    train_df = _read_round_csv(train_dir, "train", pred_round)
    train_df["move"] = train_df["logmove"].apply(lambda x: round(math.exp(x)))
    train_df = train_df[["store", "brand", "week", "move"]]

    # Create a dataframe to hold all necessary data: one row per
    # store/brand/week from FIRST_WEEK through the last forecast week
    store_list = train_df["store"].unique()
    brand_list = train_df["brand"].unique()
    train_end_week = train_df["week"].max()
    week_list = range(FIRST_WEEK, train_end_week + GAP + HORIZON)
    d = {"store": store_list, "brand": brand_list, "week": week_list}
    data_grid = df_from_cartesian_product(d)
    data_filled = pd.merge(data_grid, train_df, how="left", on=["store", "brand", "week"])

    # Get future price, deal, and advertisement info
    aux_df = _read_round_csv(train_dir, "auxi", pred_round)
    data_filled = pd.merge(data_filled, aux_df, how="left", on=["store", "brand", "week"])

    # Create relative price feature
    price_cols = ["price" + str(i) for i in range(1, 12)]
    data_filled["price"] = data_filled.apply(lambda x: x.loc["price" + str(int(x.loc["brand"]))], axis=1)
    # sum() skips missing prices while the divisor stays fixed at the number of brands
    data_filled["avg_price"] = data_filled[price_cols].sum(axis=1) / len(price_cols)
    data_filled["price_ratio"] = data_filled["price"] / data_filled["avg_price"]
    data_filled.drop(price_cols, axis=1, inplace=True)

    # Fill missing values within each store/brand series: forward fill first,
    # then backward fill whatever is still missing at the start of a series.
    # GroupBy.ffill()/bfill() replace fillna(method=...), which newer pandas
    # deprecates, and unlike groupby(...).apply(...) they keep the original
    # row index and the key columns on every pandas version.
    key_cols = ["store", "brand"]
    fill_cols = [c for c in data_filled.columns if c not in key_cols]
    data_filled[fill_cols] = data_filled.groupby(key_cols)[fill_cols].ffill()[fill_cols]
    data_filled[fill_cols] = data_filled.groupby(key_cols)[fill_cols].bfill()[fill_cols]

    # Create datetime features
    data_filled["week_start"] = data_filled["week"].apply(
        lambda x: FIRST_WEEK_START + datetime.timedelta(days=(x - 1) * 7)
    )
    data_filled["year"] = data_filled["week_start"].apply(lambda x: x.year)
    data_filled["month"] = data_filled["week_start"].apply(lambda x: x.month)
    data_filled["week_of_month"] = data_filled["week_start"].apply(lambda x: week_of_month(x))
    data_filled["day"] = data_filled["week_start"].apply(lambda x: x.day)
    data_filled.drop("week_start", axis=1, inplace=True)

    # Create other features (lagged features, moving averages, etc.)
    features = data_filled.groupby(["store", "brand"]).apply(
        lambda x: combine_features(x, ["move"], lags, window_size, used_columns)
    )

    # Drop rows with NaN values
    features.dropna(inplace=True)

    return features, train_end_week


if __name__ == "__main__":
    # Parse input arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-folder", type=str, dest="data_folder", default=".", help="data folder mounting point")
    parser.add_argument("--num-leaves", type=int, dest="num_leaves", default=64, help="# of leaves of the tree")
    parser.add_argument(
        "--min-data-in-leaf", type=int, dest="min_data_in_leaf", default=50, help="minimum # of samples in each leaf"
    )
    parser.add_argument("--learning-rate", type=float, dest="learning_rate", default=0.001, help="learning rate")
    parser.add_argument(
        "--feature-fraction",
        type=float,
        dest="feature_fraction",
        default=1.0,
        help="ratio of features used in each iteration",
    )
    parser.add_argument(
        "--bagging-fraction",
        type=float,
        dest="bagging_fraction",
        default=1.0,
        help="ratio of samples used in each iteration",
    )
    parser.add_argument("--bagging-freq", type=int, dest="bagging_freq", default=1, help="bagging frequency")
    parser.add_argument("--max-rounds", type=int, dest="max_rounds", default=400, help="# of boosting iterations")
    parser.add_argument("--max-lag", type=int, dest="max_lag", default=10, help="max lag of unit sales")
    parser.add_argument(
        "--window-size", type=int, dest="window_size", default=10, help="window size of moving average of unit sales"
    )
    args = parser.parse_args()
    args.feature_fraction = round(args.feature_fraction, 2)
    args.bagging_fraction = round(args.bagging_fraction, 2)
    print(args)

    # Start an Azure ML run
    run = Run.get_context()

    # Data paths
    DATA_DIR = args.data_folder
    TRAIN_DIR = os.path.join(DATA_DIR, "train")

    # Parameters of GBM model
    params = {
        "objective": "mape",
        "num_leaves": args.num_leaves,
        "min_data_in_leaf": args.min_data_in_leaf,
        "learning_rate": args.learning_rate,
        "feature_fraction": args.feature_fraction,
        "bagging_fraction": args.bagging_fraction,
        "bagging_freq": args.bagging_freq,
        "num_rounds": args.max_rounds,
        "early_stopping_rounds": 125,
        "num_threads": 16,
    }

    # Lags and used column names
    lags = np.arange(2, args.max_lag + 1)
    used_columns = ["store", "brand", "week", "week_of_month", "month", "deal", "feat", "move", "price", "price_ratio"]
    categ_fea = ["store", "brand", "deal"]

    # Train and validate the model using only the first round data
    r = 0
    print("---- Round " + str(r + 1) + " ----")

    # Build the features with the shared helper instead of repeating the whole
    # pipeline here. The last training week now comes from the data itself;
    # for the first-round split (training data through week 135) this selects
    # the same rows as the previously hard-coded week bounds.
    features, train_end_week = create_features(r + 1, TRAIN_DIR, lags, args.window_size, used_columns)
    train_fea = features[features.week <= train_end_week].reset_index(drop=True)

    # Model training and validation
    # Create a training/validation split
    train_fea, valid_fea, train_label, valid_label = train_test_split(
        train_fea.drop("move", axis=1, inplace=False), train_fea["move"], test_size=0.05, random_state=1
    )
    dtrain = lgb.Dataset(train_fea, train_label)
    dvalid = lgb.Dataset(valid_fea, valid_label)
    # A dictionary to record training results
    evals_result = {}
    # Train LightGBM model
    bst = lgb.train(
        params, dtrain, valid_sets=[dtrain, dvalid], categorical_feature=categ_fea, evals_result=evals_result
    )
    # Get final training loss & validation loss
    train_loss = evals_result["training"]["mape"][-1]
    valid_loss = evals_result["valid_1"]["mape"][-1]
    print("Final training loss is {}".format(train_loss))
    print("Final validation loss is {}".format(valid_loss))

    # Log the validation loss (MAPE); the builtin float replaces the np.float
    # alias, which newer NumPy releases no longer provide
    run.log("MAPE", float(valid_loss) * 100)

    # Files saved in the "./outputs" folder are automatically uploaded into run history
    os.makedirs("./outputs/model", exist_ok=True)
    bst.save_model("./outputs/model/bst-model.txt")
100644 index 00000000..cb591146 --- /dev/null +++ b/examples/grocery_sales/python/03_model_tune_deploy/azure_hyperdrive_lightgbm.ipynb @@ -0,0 +1,1185 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation.\n", + "\n", + "Licensed under the MIT License. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Model Tuning and Deployment using Azure Machine Learning Service\n", + "\n", + "In this notebook, we perform hyperparameter tuning of a LightGBM retail sales forecast model using HyperDrive in Azure Machine Learning (AzureML). After the optimal hyperparameters are found, we further deploy the best model as a web service on Azure.\n", + "\n", + "To tune the hyperparameters, we carry out cross-validation with the Orange Juice data from week 40 to week 135. Specifically, we split the data into a training set and a validation set. Then, we train LightGBM models with different sets of hyperparameters on the training set and evaluate the accuracy of each model on the validation set. The set of hyperparameters which yield the best validation accuracy will be used to train forecast models when the data beyond week 135 is available, e.g., in the multi-round training examples provided in [examples/02_model](../02_model).\n", + "\n", + "## Prerequisites\n", + "\n", + "To run this notebook, you need to start from a conda environment where AzureML SDK is installed. In our case, we can first activate `forecasting_env` environment by\n", + "```\n", + "conda activate forecasting_env\n", + "```\n", + "as we have installed AzureML SDK in this environment. 
Then, we can start the notebook via\n", + "```\n", + "jupyter notebook --no-browser\n", + "```\n", + "In addition, you need to install and enable AzureML widget extension in your environment by running the following commands.\n", + "```\n", + "jupyter nbextension install --py --user azureml.widgets\n", + "jupyter nbextension enable --py --user azureml.widgets\n", + "```\n", + "\n", + "Besides, you need to create an AzureML workspace and download its configuration file (`config.json`) by following the instructions in [configuration.ipynb](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) notebook." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Global Settings" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Installing /data/anaconda/envs/forecasting_env/lib/python3.6/site-packages/azureml/widgets/static -> azureml_widgets\n", + "Up to date: /data/home/chenhui/.local/share/jupyter/nbextensions/azureml_widgets/index.js\n", + "Up to date: /data/home/chenhui/.local/share/jupyter/nbextensions/azureml_widgets/extension.js\n", + "Up to date: /data/home/chenhui/.local/share/jupyter/nbextensions/azureml_widgets/packages/labextension/azureml_widgets-1.1.0.tgz\n", + "- Validating: \u001b[32mOK\u001b[0m\n", + "\n", + " To initialize this nbextension in the browser every time the notebook (or other app) loads:\n", + " \n", + " jupyter nbextension enable azureml.widgets --user --py\n", + " \n", + "Enabling notebook extension azureml_widgets/extension...\n", + " - Validating: \u001b[32mOK\u001b[0m\n" + ] + } + ], + "source": [ + "# Install and enable AzureML widgets\n", + "!jupyter nbextension install --py --user azureml.widgets\n", + "!jupyter 
nbextension enable --py --user azureml.widgets" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Turning diagnostics collection on. \n", + "Azure ML SDK Version: 1.0.85\n" + ] + } + ], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "import shutil\n", + "import azureml\n", + "import requests\n", + "import subprocess\n", + "import numpy as np\n", + "from azureml.core import (\n", + " Experiment,\n", + " ScriptRunConfig,\n", + ")\n", + "from azureml.telemetry import set_diagnostics_collection\n", + "from azureml.core.runconfig import (\n", + " RunConfiguration,\n", + " EnvironmentDefinition,\n", + " CondaDependencies,\n", + ")\n", + "from azureml.train.estimator import Estimator\n", + "from azureml.widgets import RunDetails\n", + "from azureml.train.hyperdrive import (\n", + " BayesianParameterSampling,\n", + " HyperDriveConfig,\n", + " quniform,\n", + " uniform,\n", + " choice,\n", + " PrimaryMetricGoal,\n", + ")\n", + "from azureml.core.webservice import AciWebservice\n", + "from azureml.core.model import Model, InferenceConfig\n", + "from fclib.common.utils import git_repo_path\n", + "from fclib.azureml.azureml_utils import (\n", + " get_or_create_workspace,\n", + " get_or_create_amlcompute,\n", + ")\n", + "from fclib.dataset.ojdata import download_ojdata, split_train_test\n", + "\n", + "cur_dir = os.getcwd()\n", + "if cur_dir not in sys.path:\n", + " sys.path.append(cur_dir)\n", + "from aml_scripts.train_validate import create_features\n", + "\n", + "# Opt-in diagnostics for better experience of future releases\n", + "set_diagnostics_collection(send_diagnostics=True)\n", + "\n", + "# Check core SDK version number\n", + "print(\"Azure ML SDK Version: \", azureml.core.VERSION)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Use False if you've already downloaded and 
split the data\n", + "DOWNLOAD_SPLIT_DATA = True\n", + "\n", + "# Get data directory\n", + "DATA_DIR = os.path.join(git_repo_path(), \"ojdata\")\n", + "\n", + "# Forecasting settings\n", + "N_SPLITS = 1\n", + "HORIZON = 2\n", + "GAP = 2\n", + "FIRST_WEEK = 40\n", + "LAST_WEEK = 138" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize Workspace & Create an AzureML Experiment\n", + "\n", + "Initialize a [Machine Learning Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the workspace you created in the Prerequisites step. `get_or_create_workspace()` below creates a workspace object from the details stored in `config.json` that you have downloaded. We assume that you store this config file to a directory `./.azureml`. In case the existing workspace cannot be loaded, the following cell will try to create a new workspace with the subscription ID, resource group, and workspace name as specified in the beginning of the cell." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Workspace name: chhamlws\n", + "Azure region: westcentralus\n", + "Resource group: chhamlwsrg\n" + ] + } + ], + "source": [ + "# Please specify the AzureML workspace attributes below if you want to create a new one.\n", + "subscription_id = \"\"\n", + "resource_group = \"\"\n", + "workspace_name = \"\"\n", + "workspace_region = \"\"\n", + "\n", + "# Connect to a workspace\n", + "ws = get_or_create_workspace(\n", + " config_path=\"./.azureml\",\n", + " subscription_id=subscription_id,\n", + " resource_group=resource_group,\n", + " workspace_name=workspace_name,\n", + " workspace_region=workspace_region,\n", + ")\n", + "print(\n", + " \"Workspace name: \" + ws.name,\n", + " \"Azure region: \" + ws.location,\n", + " \"Resource group: \" + ws.resource_group,\n", + " sep=\"\\n\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Create an experiment\n", + "exp = Experiment(workspace=ws, name=\"tune-lgbm-forecast\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preparation\n", + "\n", + "We need to download the Orange Juice data and split it into training and test sets. By default, the following cell will download and split the data. If you've already done so, you may skip this part by switching `DOWNLOAD_SPLIT_DATA` to False. \n", + "\n", + "By passing `write_csv=True` to `split_train_test()` below, this function will write the training data and test data to three csv files: `train.csv`, `auxi.csv` and `test.csv`. The first two csv files contain the historical sales up to week 135 as well as auxiliary information such as future price and promotion. Here we assume that future price and promotion information up to a certain number of weeks ahead is predetermined and known. 
We will use these two files to implement cross-validation and search for the best model with HyperDrive." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data already exists at the specified location.\n" + ] + } + ], + "source": [ + "if DOWNLOAD_SPLIT_DATA:\n", + " download_ojdata(DATA_DIR)\n", + " split_train_test(\n", + " DATA_DIR,\n", + " n_splits=N_SPLITS,\n", + " horizon=HORIZON,\n", + " gap=GAP,\n", + " first_week=FIRST_WEEK,\n", + " last_week=LAST_WEEK,\n", + " write_csv=True,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Validate Script Locally\n", + "\n", + "A good practice is to test the model training and validation script on your local machine before you run the hyperparameter tuning job on a remote compute. To run the script locally, we need to correctly specify the path of the Python interpreter that has been installed in `forecasting_env` conda environment. In what follows, the script `train_validate.py` trains a model on the training set with the input arguments as specified in `ScriptRunConfig()` and computes the accuracy of the model on the validation set. Here we evaluate the model accuracy using mean-absolute-percentage-error (MAPE)." 
# Get Python interpreter path
# sys.executable is the interpreter running this notebook kernel. It avoids
# shelling out to `which` (not available on every platform) and stripping the
# trailing newline from its output by hand.
python_path = sys.executable

# Configure local, user managed environment
run_config_user_managed = RunConfiguration()
run_config_user_managed.environment.python.user_managed_dependencies = True
run_config_user_managed.environment.python.interpreter_path = python_path

# Directory of the local scripts
script_folder = "./aml_scripts"

# Copy feature engineering utils
src_dir = os.path.join(git_repo_path(), "fclib", "fclib", "feature_engineering")
des_dir = os.path.join(script_folder, "fclib", "feature_engineering")
# copytree() raises if the destination already exists, so remove any copy left
# by a previous execution; this keeps the cell re-runnable and the copy fresh.
shutil.rmtree(des_dir, ignore_errors=True)
shutil.copytree(src_dir, des_dir)

# Training script name and path
train_script_name = "train_validate.py"
train_script_path = os.path.join(script_folder, train_script_name)

# Specify script run config
src = ScriptRunConfig(
    source_directory="./",
    script=train_script_path,
    arguments=["--data-folder", DATA_DIR, "--bagging-fraction", "0.8"],
    run_config=run_config_user_managed,
)
run_local = exp.submit(src)

# Check job status
run_local.get_status()
Moreover, you can also use `run_local.get_details()` to get detailed information about this run." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'MAPE': 66.59144474679267}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Check results\n", + "while run_local.get_status() != \"Completed\":\n", + " {}\n", + "run_local.get_metrics()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run Script on Remote Compute\n", + "\n", + "After validating the model training script locally, we can create a remote compute and further test the script on the remote compute." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a CPU cluster as compute target\n", + "\n", + "In the next cell, we create an AmlCompute target with a specific cluster name, VM size, and maximum number of nodes if the cluster does not exist. Otherwise, we will reuse an existing one. For more options of VM sizes, you can check information in this [link](https://docs.microsoft.com/en-us/azure/virtual-machines/sizes-general)."
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found compute target: cpu-cluster\n" + ] + } + ], + "source": [ + "# Choose a name for your cluster\n", + "cluster_name = \"cpu-cluster\"\n", + "# VM Size\n", + "vm_size = \"STANDARD_D2_V2\"\n", + "# Maximum number of nodes of the cluster\n", + "max_nodes = 4\n", + "\n", + "# Create a new AmlCompute if it does not exist or reuse an existing one\n", + "compute_target = get_or_create_amlcompute(\n", + " workspace=ws,\n", + " compute_name=cluster_name,\n", + " vm_size=vm_size,\n", + " min_nodes=0,\n", + " max_nodes=max_nodes,\n", + " verbose=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Configure Docker environment\n", + "\n", + "The remote compute will need to create a [Docker image](https://docs.docker.com/get-started/) for running the script. The Docker image is an encapsulated environment with necessary dependencies installed. In the following cell, we specify the conda packages and Python version that are needed for running the script." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "env = EnvironmentDefinition()\n", + "env.python.user_managed_dependencies = False\n", + "env.python.conda_dependencies = CondaDependencies.create(\n", + " conda_packages=[\"pandas\", \"numpy\", \"scipy\", \"scikit-learn\", \"lightgbm\", \"joblib\"],\n", + " python_version=\"3.6.2\",\n", + ")\n", + "env.python.conda_dependencies.add_channel(\"conda-forge\")\n", + "env.docker.enabled = True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Upload data to default datastore\n", + "\n", + "Each workspace comes with a default datastore. 
In the following, we upload the Orange Juice dataset to the workspace's default datastore, which will later be mounted on the cluster for model training and validation." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Datastore type: AzureBlob\n", + "Account name: chhamlws4931040064\n", + "Container name: azureml-blobstore-f799a640-1ca3-4877-ad24-08eef7bd307e\n" + ] + } + ], + "source": [ + "ds = ws.get_default_datastore()\n", + "print(\n", + " \"Datastore type: \" + ds.datastore_type,\n", + " \"Account name: \" + ds.account_name,\n", + " \"Container name: \" + ds.container_name,\n", + " sep=\"\\n\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "$AZUREML_DATAREFERENCE_dd1f71e8652d4d32b738e2c5aa03afdf" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Remote data path\n", + "path_on_datastore = \"data\"\n", + "ds.upload(\n", + " src_dir=DATA_DIR,\n", + " target_path=path_on_datastore,\n", + " overwrite=True,\n", + " show_progress=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "$AZUREML_DATAREFERENCE_5442e91c25f449ff9a9780f48c6d7792\n" + ] + } + ], + "source": [ + "# Get data reference object for the data path\n", + "ds_data = ds.path(path_on_datastore)\n", + "print(ds_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create estimator\n", + "\n", + "Next, we will check if the remote compute target is successfully created by submitting a job to the target. This compute target will be used by HyperDrive for hyperparameter tuning later. 
Note that you may skip this part and directly go to [Tune Hyperparameters using HyperDrive](#tune-hyperparameters-using-hyperdrive) if you want.\n", + "\n", + "In the following cells, we first create an estimator to specify details of the job. Then we submit the job to the remote compute and check the status of the job." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "script_params = {\"--data-folder\": ds_data.as_mount(), \"--bagging-fraction\": 0.8}\n", + "est = Estimator(\n", + " source_directory=script_folder,\n", + " script_params=script_params,\n", + " compute_target=compute_target,\n", + " use_docker=True,\n", + " entry_script=train_script_name,\n", + " environment_definition=env,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Submit job" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# Submit job to remote compute\n", + "run_remote = exp.submit(config=est)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Check job status\n", + "\n", + "You can monitor the status of the remote run using the AzureML widgets.
After the job is done, the following cell will display a dashboard similar as\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e15887d5c8bb484d87506b7a5e3df4b5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', 's…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/aml.mini.widget.v1": "{\"status\": \"Completed\", \"workbench_run_details_uri\": \"https://ml.azure.com/experiments/tune-lgbm-forecast/runs/tune-lgbm-forecast_1584652203_688e0e17?wsid=/subscriptions/9086b59a-02d7-4687-b3fd-e39fa5e0fd9b/resourcegroups/chhamlwsrg/workspaces/chhamlws\", \"run_id\": \"tune-lgbm-forecast_1584652203_688e0e17\", \"run_properties\": {\"run_id\": \"tune-lgbm-forecast_1584652203_688e0e17\", \"created_utc\": \"2020-03-19T21:10:05.552973Z\", \"properties\": {\"_azureml.ComputeTargetType\": \"amlcompute\", \"ContentSnapshotId\": \"c3e4a829-831a-45f4-973d-e5093d4639c7\", \"azureml.git.repository_uri\": \"git@github.com:microsoft/forecasting.git\", \"mlflow.source.git.repoURL\": \"git@github.com:microsoft/forecasting.git\", \"azureml.git.branch\": \"chenhui/hyperdrive_example_update\", \"mlflow.source.git.branch\": \"chenhui/hyperdrive_example_update\", \"azureml.git.commit\": \"a36fa88dade03b7811ab97e2f8d5120c643a2073\", \"mlflow.source.git.commit\": \"a36fa88dade03b7811ab97e2f8d5120c643a2073\", \"azureml.git.dirty\": \"True\", \"AzureML.DerivedImageName\": \"azureml/azureml_7842fd2c5e99a43f1cca1341b66a0ecb\", \"ProcessInfoFile\": \"azureml-logs/process_info.json\", \"ProcessStatusFile\": \"azureml-logs/process_status.json\"}, \"tags\": {\"_aml_system_ComputeTargetStatus\": 
\"{\\\"AllocationState\\\":\\\"steady\\\",\\\"PreparingNodeCount\\\":0,\\\"RunningNodeCount\\\":0,\\\"CurrentNodeCount\\\":4}\"}, \"script_name\": null, \"arguments\": null, \"end_time_utc\": \"2020-03-19T21:12:14.300468Z\", \"status\": \"Completed\", \"log_files\": {\"azureml-logs/55_azureml-execution-tvmps_64205f30874f615b57c2717938d4039e14020b51244803148cd50a8d15cabd4c_d.txt\": \"https://chhamlws4931040064.blob.core.windows.net/azureml/ExperimentRun/dcid.tune-lgbm-forecast_1584652203_688e0e17/azureml-logs/55_azureml-execution-tvmps_64205f30874f615b57c2717938d4039e14020b51244803148cd50a8d15cabd4c_d.txt?sv=2019-02-02&sr=b&sig=7%2FV%2F7Uf5suKYNSmAT2tod9rrpEQG02YGjGLWufjtR9M%3D&st=2020-03-19T21%3A02%3A23Z&se=2020-03-20T05%3A12%3A23Z&sp=r\", \"azureml-logs/65_job_prep-tvmps_64205f30874f615b57c2717938d4039e14020b51244803148cd50a8d15cabd4c_d.txt\": \"https://chhamlws4931040064.blob.core.windows.net/azureml/ExperimentRun/dcid.tune-lgbm-forecast_1584652203_688e0e17/azureml-logs/65_job_prep-tvmps_64205f30874f615b57c2717938d4039e14020b51244803148cd50a8d15cabd4c_d.txt?sv=2019-02-02&sr=b&sig=hAfvRB9EVc6%2BI%2FI1i91V3gw%2FPTQCjxEWFxztnvVK1Zk%3D&st=2020-03-19T21%3A02%3A23Z&se=2020-03-20T05%3A12%3A23Z&sp=r\", \"azureml-logs/70_driver_log.txt\": \"https://chhamlws4931040064.blob.core.windows.net/azureml/ExperimentRun/dcid.tune-lgbm-forecast_1584652203_688e0e17/azureml-logs/70_driver_log.txt?sv=2019-02-02&sr=b&sig=6gSOaifysap9NdTBY2fBOPPxns8WmwmTkSxK75giv0I%3D&st=2020-03-19T21%3A02%3A23Z&se=2020-03-20T05%3A12%3A23Z&sp=r\", \"azureml-logs/75_job_post-tvmps_64205f30874f615b57c2717938d4039e14020b51244803148cd50a8d15cabd4c_d.txt\": \"https://chhamlws4931040064.blob.core.windows.net/azureml/ExperimentRun/dcid.tune-lgbm-forecast_1584652203_688e0e17/azureml-logs/75_job_post-tvmps_64205f30874f615b57c2717938d4039e14020b51244803148cd50a8d15cabd4c_d.txt?sv=2019-02-02&sr=b&sig=Poow17KG%2BO1JDruHX2wImansTcQPtRtkC0VfCSLn1CI%3D&st=2020-03-19T21%3A02%3A23Z&se=2020-03-20T05%3A12%3A23Z&sp=r\", 
\"azureml-logs/process_info.json\": \"https://chhamlws4931040064.blob.core.windows.net/azureml/ExperimentRun/dcid.tune-lgbm-forecast_1584652203_688e0e17/azureml-logs/process_info.json?sv=2019-02-02&sr=b&sig=5SjpOATjDqF6j92anqPQEgkTM171%2FBRx8FTehTycICk%3D&st=2020-03-19T21%3A02%3A23Z&se=2020-03-20T05%3A12%3A23Z&sp=r\", \"azureml-logs/process_status.json\": \"https://chhamlws4931040064.blob.core.windows.net/azureml/ExperimentRun/dcid.tune-lgbm-forecast_1584652203_688e0e17/azureml-logs/process_status.json?sv=2019-02-02&sr=b&sig=KDk1VVeNgTRx37%2BBacOb%2FWVecpyoc%2FTJCbVLRmCOWCI%3D&st=2020-03-19T21%3A02%3A23Z&se=2020-03-20T05%3A12%3A23Z&sp=r\", \"logs/azureml/140_azureml.log\": \"https://chhamlws4931040064.blob.core.windows.net/azureml/ExperimentRun/dcid.tune-lgbm-forecast_1584652203_688e0e17/logs/azureml/140_azureml.log?sv=2019-02-02&sr=b&sig=cTNXDVJQhpbd5Y5%2BKlOOlYe7F4QZIHfTtriNZ1RZEbE%3D&st=2020-03-19T21%3A02%3A23Z&se=2020-03-20T05%3A12%3A23Z&sp=r\", \"logs/azureml/job_prep_azureml.log\": \"https://chhamlws4931040064.blob.core.windows.net/azureml/ExperimentRun/dcid.tune-lgbm-forecast_1584652203_688e0e17/logs/azureml/job_prep_azureml.log?sv=2019-02-02&sr=b&sig=TIVY4fks6XsO%2BQ50LbfoXWfWcD4JK0uekoE9%2FvQtA9I%3D&st=2020-03-19T21%3A02%3A23Z&se=2020-03-20T05%3A12%3A23Z&sp=r\", \"logs/azureml/job_release_azureml.log\": \"https://chhamlws4931040064.blob.core.windows.net/azureml/ExperimentRun/dcid.tune-lgbm-forecast_1584652203_688e0e17/logs/azureml/job_release_azureml.log?sv=2019-02-02&sr=b&sig=XvAIWqtaowr3HrYaVdz4HaQ6zARgk8mtZW8VP91Wp%2Fc%3D&st=2020-03-19T21%3A02%3A23Z&se=2020-03-20T05%3A12%3A23Z&sp=r\"}, \"log_groups\": [[\"azureml-logs/process_info.json\", \"azureml-logs/process_status.json\", \"logs/azureml/job_prep_azureml.log\", \"logs/azureml/job_release_azureml.log\"], [\"azureml-logs/55_azureml-execution-tvmps_64205f30874f615b57c2717938d4039e14020b51244803148cd50a8d15cabd4c_d.txt\"], 
[\"azureml-logs/65_job_prep-tvmps_64205f30874f615b57c2717938d4039e14020b51244803148cd50a8d15cabd4c_d.txt\"], [\"azureml-logs/70_driver_log.txt\"], [\"azureml-logs/75_job_post-tvmps_64205f30874f615b57c2717938d4039e14020b51244803148cd50a8d15cabd4c_d.txt\"], [\"logs/azureml/140_azureml.log\"]], \"run_duration\": \"0:02:08\"}, \"child_runs\": [], \"children_metrics\": {}, \"run_metrics\": [{\"name\": \"MAPE\", \"run_id\": \"tune-lgbm-forecast_1584652203_688e0e17\", \"categories\": [0], \"series\": [{\"data\": [66.59144474679267]}]}], \"run_logs\": \"2020-03-19 21:11:02,046|azureml|DEBUG|Inputs:: kwargs: {'OutputCollection': True, 'snapshotProject': True, 'only_in_process_features': True, 'skip_track_logs_dir': True}, track_folders: None, deny_list: None, directories_to_watch: []\\n2020-03-19 21:11:02,046|azureml.history._tracking.PythonWorkingDirectory|DEBUG|Execution target type: batchai\\n2020-03-19 21:11:02,053|azureml.history._tracking.PythonWorkingDirectory|DEBUG|Failed to import pyspark with error: No module named 'pyspark'\\n2020-03-19 21:11:02,054|azureml.history._tracking.PythonWorkingDirectory.workingdir|DEBUG|Pinning working directory for filesystems: ['pyfs']\\n2020-03-19 21:11:02,256|azureml._base_sdk_common.user_agent|DEBUG|Fetching client info from /root/.azureml/clientinfo.json\\n2020-03-19 21:11:02,257|azureml._base_sdk_common.user_agent|DEBUG|Error loading client info: [Errno 2] No such file or directory: '/root/.azureml/clientinfo.json'\\n2020-03-19 21:11:02,559|azureml.core.run|DEBUG|Adding new factory for run source azureml.scriptrun\\n2020-03-19 21:11:02,560|azureml.core.authentication.TokenRefresherDaemon|DEBUG|Starting daemon and triggering first instance\\n2020-03-19 21:11:02,567|msrest.universal_http.requests|DEBUG|Configuring retry: max_retries=3, backoff_factor=0.8, max_backoff=90\\n2020-03-19 21:11:02,567|azureml._restclient.clientbase|INFO|Created a worker pool for first use\\n2020-03-19 21:11:02,567|azureml.core.authentication|DEBUG|Time 
to expire 1814342.432059 seconds\\n2020-03-19 21:11:02,568|azureml._base_sdk_common.service_discovery|DEBUG|Found history service url in environment variable AZUREML_SERVICE_ENDPOINT, history service url: https://westcentralus.experiments.azureml.net.\\n2020-03-19 21:11:02,568|azureml._base_sdk_common.service_discovery|DEBUG|Found history service url in environment variable AZUREML_SERVICE_ENDPOINT, history service url: https://westcentralus.experiments.azureml.net.\\n2020-03-19 21:11:02,568|azureml._base_sdk_common.service_discovery|DEBUG|Found history service url in environment variable AZUREML_SERVICE_ENDPOINT, history service url: https://westcentralus.experiments.azureml.net.\\n2020-03-19 21:11:02,568|azureml._base_sdk_common.service_discovery|DEBUG|Found history service url in environment variable AZUREML_SERVICE_ENDPOINT, history service url: https://westcentralus.experiments.azureml.net.\\n2020-03-19 21:11:02,569|azureml._base_sdk_common.service_discovery|DEBUG|Found history service url in environment variable AZUREML_SERVICE_ENDPOINT, history service url: https://westcentralus.experiments.azureml.net.\\n2020-03-19 21:11:02,600|azureml._base_sdk_common.service_discovery|DEBUG|Found history service url in environment variable AZUREML_SERVICE_ENDPOINT, history service url: https://westcentralus.experiments.azureml.net.\\n2020-03-19 21:11:02,600|azureml._base_sdk_common.service_discovery|DEBUG|Found history service url in environment variable AZUREML_SERVICE_ENDPOINT, history service url: https://westcentralus.experiments.azureml.net.\\n2020-03-19 21:11:02,601|azureml._base_sdk_common.service_discovery|DEBUG|Found history service url in environment variable AZUREML_SERVICE_ENDPOINT, history service url: https://westcentralus.experiments.azureml.net.\\n2020-03-19 21:11:02,605|msrest.universal_http.requests|DEBUG|Configuring retry: max_retries=3, backoff_factor=0.8, max_backoff=90\\n2020-03-19 21:11:02,613|msrest.universal_http.requests|DEBUG|Configuring retry: 
max_retries=3, backoff_factor=0.8, max_backoff=90\\n2020-03-19 21:11:02,618|msrest.universal_http.requests|DEBUG|Configuring retry: max_retries=3, backoff_factor=0.8, max_backoff=90\\n2020-03-19 21:11:02,623|msrest.universal_http.requests|DEBUG|Configuring retry: max_retries=3, backoff_factor=0.8, max_backoff=90\\n2020-03-19 21:11:02,628|msrest.universal_http.requests|DEBUG|Configuring retry: max_retries=3, backoff_factor=0.8, max_backoff=90\\n2020-03-19 21:11:02,628|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.RunClient.get-async:False|DEBUG|[START]\\n2020-03-19 21:11:02,628|msrest.service_client|DEBUG|Accept header absent and forced to application/json\\n2020-03-19 21:11:02,628|msrest.http_logger|DEBUG|Request URL: 'https://westcentralus.experiments.azureml.net/history/v1.0/subscriptions/9086b59a-02d7-4687-b3fd-e39fa5e0fd9b/resourceGroups/chhamlwsrg/providers/Microsoft.MachineLearningServices/workspaces/chhamlws/experiments/tune-lgbm-forecast/runs/tune-lgbm-forecast_1584652203_688e0e17'\\n2020-03-19 21:11:02,629|msrest.http_logger|DEBUG|Request method: 'GET'\\n2020-03-19 21:11:02,629|msrest.http_logger|DEBUG|Request headers:\\n2020-03-19 21:11:02,629|msrest.http_logger|DEBUG| 'Accept': 'application/json'\\n2020-03-19 21:11:02,629|msrest.http_logger|DEBUG| 'Content-Type': 'application/json; charset=utf-8'\\n2020-03-19 21:11:02,629|msrest.http_logger|DEBUG| 'x-ms-client-request-id': '3c9a8626-ae8e-4e19-a9ed-2bb98b7ad7f6'\\n2020-03-19 21:11:02,629|msrest.http_logger|DEBUG| 'request-id': '3c9a8626-ae8e-4e19-a9ed-2bb98b7ad7f6'\\n2020-03-19 21:11:02,629|msrest.http_logger|DEBUG| 'User-Agent': 'python/3.6.2 (Linux-4.15.0-1067-azure-x86_64-with-debian-stretch-sid) msrest/0.6.11 azureml._restclient/core.1.0.85'\\n2020-03-19 21:11:02,629|msrest.http_logger|DEBUG|Request body:\\n2020-03-19 21:11:02,629|msrest.http_logger|DEBUG|None\\n2020-03-19 21:11:02,629|msrest.universal_http|DEBUG|Configuring redirects: allow=True, max=30\\n2020-03-19 
21:11:02,629|msrest.universal_http|DEBUG|Configuring request: timeout=100, verify=True, cert=None\\n2020-03-19 21:11:02,629|msrest.universal_http|DEBUG|Configuring proxies: ''\\n2020-03-19 21:11:02,629|msrest.universal_http|DEBUG|Evaluate proxies against ENV settings: True\\n2020-03-19 21:11:02,683|msrest.http_logger|DEBUG|Response status: 200\\n2020-03-19 21:11:02,683|msrest.http_logger|DEBUG|Response headers:\\n2020-03-19 21:11:02,684|msrest.http_logger|DEBUG| 'Date': 'Thu, 19 Mar 2020 21:11:02 GMT'\\n2020-03-19 21:11:02,684|msrest.http_logger|DEBUG| 'Content-Type': 'application/json; charset=utf-8'\\n2020-03-19 21:11:02,684|msrest.http_logger|DEBUG| 'Transfer-Encoding': 'chunked'\\n2020-03-19 21:11:02,684|msrest.http_logger|DEBUG| 'Connection': 'keep-alive'\\n2020-03-19 21:11:02,684|msrest.http_logger|DEBUG| 'Vary': 'Accept-Encoding'\\n2020-03-19 21:11:02,684|msrest.http_logger|DEBUG| 'Request-Context': 'appId=cid-v1:2d2e8e63-272e-4b3c-8598-4ee570a0e70d'\\n2020-03-19 21:11:02,684|msrest.http_logger|DEBUG| 'x-ms-client-request-id': '3c9a8626-ae8e-4e19-a9ed-2bb98b7ad7f6'\\n2020-03-19 21:11:02,684|msrest.http_logger|DEBUG| 'x-ms-client-session-id': ''\\n2020-03-19 21:11:02,685|msrest.http_logger|DEBUG| 'Strict-Transport-Security': 'max-age=15724800; includeSubDomains; preload'\\n2020-03-19 21:11:02,685|msrest.http_logger|DEBUG| 'x-request-time': '0.034'\\n2020-03-19 21:11:02,685|msrest.http_logger|DEBUG| 'X-Content-Type-Options': 'nosniff'\\n2020-03-19 21:11:02,685|msrest.http_logger|DEBUG| 'Content-Encoding': 'gzip'\\n2020-03-19 21:11:02,685|msrest.http_logger|DEBUG|Response content:\\n2020-03-19 21:11:02,685|msrest.http_logger|DEBUG|{\\n \\\"runNumber\\\": 123,\\n \\\"rootRunId\\\": \\\"tune-lgbm-forecast_1584652203_688e0e17\\\",\\n \\\"experimentId\\\": \\\"3199ea18-1505-42f9-9092-777f27df73e5\\\",\\n \\\"createdUtc\\\": \\\"2020-03-19T21:10:05.5529735+00:00\\\",\\n \\\"createdBy\\\": {\\n \\\"userObjectId\\\": \\\"8157bc92-2d12-4bc9-9270-bab51c673493\\\",\\n 
\\\"userPuId\\\": \\\"10033FFF97A21586\\\",\\n \\\"userIdp\\\": null,\\n \\\"userAltSecId\\\": null,\\n \\\"userIss\\\": \\\"https://sts.windows.net/72f988bf-86f1-41af-91ab-2d7cd011db47/\\\",\\n \\\"userTenantId\\\": \\\"72f988bf-86f1-41af-91ab-2d7cd011db47\\\",\\n \\\"userName\\\": \\\"Chenhui Hu\\\"\\n },\\n \\\"userId\\\": \\\"8157bc92-2d12-4bc9-9270-bab51c673493\\\",\\n \\\"token\\\": null,\\n \\\"tokenExpiryTimeUtc\\\": null,\\n \\\"error\\\": null,\\n \\\"warnings\\\": null,\\n \\\"revision\\\": 9,\\n \\\"runUuid\\\": \\\"88b2e204-56cc-477e-bff4-cbb0c94c18d6\\\",\\n \\\"parentRunUuid\\\": null,\\n \\\"rootRunUuid\\\": \\\"88b2e204-56cc-477e-bff4-cbb0c94c18d6\\\",\\n \\\"runId\\\": \\\"tune-lgbm-forecast_1584652203_688e0e17\\\",\\n \\\"parentRunId\\\": null,\\n \\\"status\\\": \\\"Running\\\",\\n \\\"startTimeUtc\\\": \\\"2020-03-19T21:10:42.8166176+00:00\\\",\\n \\\"endTimeUtc\\\": null,\\n \\\"heartbeatEnabled\\\": false,\\n \\\"options\\\": {\\n \\\"generateDataContainerIdIfNotSpecified\\\": true\\n },\\n \\\"name\\\": null,\\n \\\"dataContainerId\\\": \\\"dcid.tune-lgbm-forecast_1584652203_688e0e17\\\",\\n \\\"description\\\": null,\\n \\\"hidden\\\": false,\\n \\\"runType\\\": \\\"azureml.scriptrun\\\",\\n \\\"properties\\\": {\\n \\\"_azureml.ComputeTargetType\\\": \\\"amlcompute\\\",\\n \\\"ContentSnapshotId\\\": \\\"c3e4a829-831a-45f4-973d-e5093d4639c7\\\",\\n \\\"azureml.git.repository_uri\\\": \\\"git@github.com:microsoft/forecasting.git\\\",\\n \\\"mlflow.source.git.repoURL\\\": \\\"git@github.com:microsoft/forecasting.git\\\",\\n \\\"azureml.git.branch\\\": \\\"chenhui/hyperdrive_example_update\\\",\\n \\\"mlflow.source.git.branch\\\": \\\"chenhui/hyperdrive_example_update\\\",\\n \\\"azureml.git.commit\\\": \\\"a36fa88dade03b7811ab97e2f8d5120c643a2073\\\",\\n \\\"mlflow.source.git.commit\\\": \\\"a36fa88dade03b7811ab97e2f8d5120c643a2073\\\",\\n \\\"azureml.git.dirty\\\": \\\"True\\\",\\n \\\"AzureML.DerivedImageName\\\": 
\\\"azureml/azureml_7842fd2c5e99a43f1cca1341b66a0ecb\\\",\\n \\\"ProcessInfoFile\\\": \\\"azureml-logs/process_info.json\\\",\\n \\\"ProcessStatusFile\\\": \\\"azureml-logs/process_status.json\\\"\\n },\\n \\\"scriptName\\\": \\\"train_validate.py\\\",\\n \\\"target\\\": \\\"cpu-cluster\\\",\\n \\\"uniqueChildRunComputeTargets\\\": [],\\n \\\"tags\\\": {\\n \\\"_aml_system_ComputeTargetStatus\\\": \\\"{\\\\\\\"AllocationState\\\\\\\":\\\\\\\"steady\\\\\\\",\\\\\\\"PreparingNodeCount\\\\\\\":0,\\\\\\\"RunningNodeCount\\\\\\\":0,\\\\\\\"CurrentNodeCount\\\\\\\":4}\\\"\\n },\\n \\\"inputDatasets\\\": [],\\n \\\"runDefinition\\\": null,\\n \\\"createdFrom\\\": null,\\n \\\"cancelUri\\\": \\\"https://westcentralus.experiments.azureml.net/execution/v1.0/subscriptions/9086b59a-02d7-4687-b3fd-e39fa5e0fd9b/resourceGroups/chhamlwsrg/providers/Microsoft.MachineLearningServices/workspaces/chhamlws/experiments/tune-lgbm-forecast/runId/tune-lgbm-forecast_1584652203_688e0e17/cancel\\\",\\n \\\"completeUri\\\": null,\\n \\\"diagnosticsUri\\\": \\\"https://westcentralus.experiments.azureml.net/execution/v1.0/subscriptions/9086b59a-02d7-4687-b3fd-e39fa5e0fd9b/resourceGroups/chhamlwsrg/providers/Microsoft.MachineLearningServices/workspaces/chhamlws/experiments/tune-lgbm-forecast/runId/tune-lgbm-forecast_1584652203_688e0e17/diagnostics\\\",\\n \\\"computeRequest\\\": {\\n \\\"nodeCount\\\": 1\\n },\\n \\\"retainForLifetimeOfWorkspace\\\": false,\\n \\\"queueingInfo\\\": null\\n}\\n2020-03-19 21:11:02,691|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.RunClient.get-async:False|DEBUG|[STOP]\\n2020-03-19 21:11:02,691|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17|DEBUG|Constructing run from dto. 
type: azureml.scriptrun, source: None, props: {'_azureml.ComputeTargetType': 'amlcompute', 'ContentSnapshotId': 'c3e4a829-831a-45f4-973d-e5093d4639c7', 'azureml.git.repository_uri': 'git@github.com:microsoft/forecasting.git', 'mlflow.source.git.repoURL': 'git@github.com:microsoft/forecasting.git', 'azureml.git.branch': 'chenhui/hyperdrive_example_update', 'mlflow.source.git.branch': 'chenhui/hyperdrive_example_update', 'azureml.git.commit': 'a36fa88dade03b7811ab97e2f8d5120c643a2073', 'mlflow.source.git.commit': 'a36fa88dade03b7811ab97e2f8d5120c643a2073', 'azureml.git.dirty': 'True', 'AzureML.DerivedImageName': 'azureml/azureml_7842fd2c5e99a43f1cca1341b66a0ecb', 'ProcessInfoFile': 'azureml-logs/process_info.json', 'ProcessStatusFile': 'azureml-logs/process_status.json'}\\n2020-03-19 21:11:02,691|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunContextManager|DEBUG|Valid logs dir, setting up content loader\\n2020-03-19 21:11:02,692|azureml|WARNING|Could not import azureml.mlflow or azureml.contrib.mlflow mlflow APIs will not run against AzureML services. 
Add azureml-mlflow as a conda dependency for the run if this behavior is desired\\n2020-03-19 21:11:02,692|azureml.WorkerPool|DEBUG|[START]\\n2020-03-19 21:11:02,692|azureml.SendRunKillSignal|DEBUG|[START]\\n2020-03-19 21:11:02,692|azureml.RunStatusContext|DEBUG|[START]\\n2020-03-19 21:11:02,692|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunContextManager.RunStatusContext|DEBUG|[START]\\n2020-03-19 21:11:02,692|azureml.WorkingDirectoryCM|DEBUG|[START]\\n2020-03-19 21:11:02,692|azureml.history._tracking.PythonWorkingDirectory.workingdir|DEBUG|[START]\\n2020-03-19 21:11:02,692|azureml.history._tracking.PythonWorkingDirectory|INFO|Current working dir: /mnt/batch/tasks/shared/LS_root/jobs/chhamlws/2c8fb242ef2843d9bed884402f5e3132/tune-lgbm-forecast_1584652203_688e0e17/mounts/workspaceblobstore/azureml/tune-lgbm-forecast_1584652203_688e0e17\\n2020-03-19 21:11:02,692|azureml.history._tracking.PythonWorkingDirectory.workingdir|DEBUG|Calling pyfs\\n2020-03-19 21:11:02,692|azureml.history._tracking.PythonWorkingDirectory.workingdir|DEBUG|Storing working dir for pyfs as /mnt/batch/tasks/shared/LS_root/jobs/chhamlws/2c8fb242ef2843d9bed884402f5e3132/tune-lgbm-forecast_1584652203_688e0e17/mounts/workspaceblobstore/azureml/tune-lgbm-forecast_1584652203_688e0e17\\n2020-03-19 21:11:03,544|azureml._base_sdk_common.service_discovery|DEBUG|Found history service url in environment variable AZUREML_SERVICE_ENDPOINT, history service url: https://westcentralus.experiments.azureml.net.\\n2020-03-19 21:11:03,544|azureml._base_sdk_common.service_discovery|DEBUG|Found history service url in environment variable AZUREML_SERVICE_ENDPOINT, history service url: https://westcentralus.experiments.azureml.net.\\n2020-03-19 21:11:03,544|azureml._base_sdk_common.service_discovery|DEBUG|Found history service url in environment variable AZUREML_SERVICE_ENDPOINT, history service url: https://westcentralus.experiments.azureml.net.\\n2020-03-19 
21:11:03,544|azureml._base_sdk_common.service_discovery|DEBUG|Found history service url in environment variable AZUREML_SERVICE_ENDPOINT, history service url: https://westcentralus.experiments.azureml.net.\\n2020-03-19 21:11:03,544|azureml._base_sdk_common.service_discovery|DEBUG|Found history service url in environment variable AZUREML_SERVICE_ENDPOINT, history service url: https://westcentralus.experiments.azureml.net.\\n2020-03-19 21:11:03,544|azureml._base_sdk_common.service_discovery|DEBUG|Found history service url in environment variable AZUREML_SERVICE_ENDPOINT, history service url: https://westcentralus.experiments.azureml.net.\\n2020-03-19 21:11:03,545|azureml._base_sdk_common.service_discovery|DEBUG|Found history service url in environment variable AZUREML_SERVICE_ENDPOINT, history service url: https://westcentralus.experiments.azureml.net.\\n2020-03-19 21:11:03,550|msrest.universal_http.requests|DEBUG|Configuring retry: max_retries=3, backoff_factor=0.8, max_backoff=90\\n2020-03-19 21:11:03,551|azureml._run_impl.run_history_facade|DEBUG|Created a static thread pool for RunHistoryFacade class\\n2020-03-19 21:11:03,555|msrest.universal_http.requests|DEBUG|Configuring retry: max_retries=3, backoff_factor=0.8, max_backoff=90\\n2020-03-19 21:11:03,560|msrest.universal_http.requests|DEBUG|Configuring retry: max_retries=3, backoff_factor=0.8, max_backoff=90\\n2020-03-19 21:11:03,566|msrest.universal_http.requests|DEBUG|Configuring retry: max_retries=3, backoff_factor=0.8, max_backoff=90\\n2020-03-19 21:11:03,571|msrest.universal_http.requests|DEBUG|Configuring retry: max_retries=3, backoff_factor=0.8, max_backoff=90\\n2020-03-19 21:11:03,572|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.RunClient.get-async:False|DEBUG|[START]\\n2020-03-19 21:11:03,572|msrest.service_client|DEBUG|Accept header absent and forced to application/json\\n2020-03-19 21:11:03,572|msrest.http_logger|DEBUG|Request URL: 
'https://westcentralus.experiments.azureml.net/history/v1.0/subscriptions/9086b59a-02d7-4687-b3fd-e39fa5e0fd9b/resourceGroups/chhamlwsrg/providers/Microsoft.MachineLearningServices/workspaces/chhamlws/experiments/tune-lgbm-forecast/runs/tune-lgbm-forecast_1584652203_688e0e17'\\n2020-03-19 21:11:03,572|msrest.http_logger|DEBUG|Request method: 'GET'\\n2020-03-19 21:11:03,572|msrest.http_logger|DEBUG|Request headers:\\n2020-03-19 21:11:03,572|msrest.http_logger|DEBUG| 'Accept': 'application/json'\\n2020-03-19 21:11:03,572|msrest.http_logger|DEBUG| 'Content-Type': 'application/json; charset=utf-8'\\n2020-03-19 21:11:03,572|msrest.http_logger|DEBUG| 'x-ms-client-request-id': 'a7d0b96f-400a-4641-b6ef-91c00e44e5d3'\\n2020-03-19 21:11:03,572|msrest.http_logger|DEBUG| 'request-id': 'a7d0b96f-400a-4641-b6ef-91c00e44e5d3'\\n2020-03-19 21:11:03,572|msrest.http_logger|DEBUG| 'User-Agent': 'python/3.6.2 (Linux-4.15.0-1067-azure-x86_64-with-debian-stretch-sid) msrest/0.6.11 azureml._restclient/core.1.0.85'\\n2020-03-19 21:11:03,573|msrest.http_logger|DEBUG|Request body:\\n2020-03-19 21:11:03,573|msrest.http_logger|DEBUG|None\\n2020-03-19 21:11:03,573|msrest.universal_http|DEBUG|Configuring redirects: allow=True, max=30\\n2020-03-19 21:11:03,573|msrest.universal_http|DEBUG|Configuring request: timeout=100, verify=True, cert=None\\n2020-03-19 21:11:03,573|msrest.universal_http|DEBUG|Configuring proxies: ''\\n2020-03-19 21:11:03,573|msrest.universal_http|DEBUG|Evaluate proxies against ENV settings: True\\n2020-03-19 21:11:03,625|msrest.http_logger|DEBUG|Response status: 200\\n2020-03-19 21:11:03,625|msrest.http_logger|DEBUG|Response headers:\\n2020-03-19 21:11:03,625|msrest.http_logger|DEBUG| 'Date': 'Thu, 19 Mar 2020 21:11:03 GMT'\\n2020-03-19 21:11:03,625|msrest.http_logger|DEBUG| 'Content-Type': 'application/json; charset=utf-8'\\n2020-03-19 21:11:03,626|msrest.http_logger|DEBUG| 'Transfer-Encoding': 'chunked'\\n2020-03-19 21:11:03,626|msrest.http_logger|DEBUG| 'Connection': 
'keep-alive'\\n2020-03-19 21:11:03,626|msrest.http_logger|DEBUG| 'Vary': 'Accept-Encoding'\\n2020-03-19 21:11:03,626|msrest.http_logger|DEBUG| 'Request-Context': 'appId=cid-v1:2d2e8e63-272e-4b3c-8598-4ee570a0e70d'\\n2020-03-19 21:11:03,626|msrest.http_logger|DEBUG| 'x-ms-client-request-id': 'a7d0b96f-400a-4641-b6ef-91c00e44e5d3'\\n2020-03-19 21:11:03,626|msrest.http_logger|DEBUG| 'x-ms-client-session-id': ''\\n2020-03-19 21:11:03,626|msrest.http_logger|DEBUG| 'Strict-Transport-Security': 'max-age=15724800; includeSubDomains; preload'\\n2020-03-19 21:11:03,626|msrest.http_logger|DEBUG| 'x-request-time': '0.031'\\n2020-03-19 21:11:03,627|msrest.http_logger|DEBUG| 'X-Content-Type-Options': 'nosniff'\\n2020-03-19 21:11:03,627|msrest.http_logger|DEBUG| 'Content-Encoding': 'gzip'\\n2020-03-19 21:11:03,627|msrest.http_logger|DEBUG|Response content:\\n2020-03-19 21:11:03,627|msrest.http_logger|DEBUG|{\\n \\\"runNumber\\\": 123,\\n \\\"rootRunId\\\": \\\"tune-lgbm-forecast_1584652203_688e0e17\\\",\\n \\\"experimentId\\\": \\\"3199ea18-1505-42f9-9092-777f27df73e5\\\",\\n \\\"createdUtc\\\": \\\"2020-03-19T21:10:05.5529735+00:00\\\",\\n \\\"createdBy\\\": {\\n \\\"userObjectId\\\": \\\"8157bc92-2d12-4bc9-9270-bab51c673493\\\",\\n \\\"userPuId\\\": \\\"10033FFF97A21586\\\",\\n \\\"userIdp\\\": null,\\n \\\"userAltSecId\\\": null,\\n \\\"userIss\\\": \\\"https://sts.windows.net/72f988bf-86f1-41af-91ab-2d7cd011db47/\\\",\\n \\\"userTenantId\\\": \\\"72f988bf-86f1-41af-91ab-2d7cd011db47\\\",\\n \\\"userName\\\": \\\"Chenhui Hu\\\"\\n },\\n \\\"userId\\\": \\\"8157bc92-2d12-4bc9-9270-bab51c673493\\\",\\n \\\"token\\\": null,\\n \\\"tokenExpiryTimeUtc\\\": null,\\n \\\"error\\\": null,\\n \\\"warnings\\\": null,\\n \\\"revision\\\": 9,\\n \\\"runUuid\\\": \\\"88b2e204-56cc-477e-bff4-cbb0c94c18d6\\\",\\n \\\"parentRunUuid\\\": null,\\n \\\"rootRunUuid\\\": \\\"88b2e204-56cc-477e-bff4-cbb0c94c18d6\\\",\\n \\\"runId\\\": \\\"tune-lgbm-forecast_1584652203_688e0e17\\\",\\n 
\\\"parentRunId\\\": null,\\n \\\"status\\\": \\\"Running\\\",\\n \\\"startTimeUtc\\\": \\\"2020-03-19T21:10:42.8166176+00:00\\\",\\n \\\"endTimeUtc\\\": null,\\n \\\"heartbeatEnabled\\\": false,\\n \\\"options\\\": {\\n \\\"generateDataContainerIdIfNotSpecified\\\": true\\n },\\n \\\"name\\\": null,\\n \\\"dataContainerId\\\": \\\"dcid.tune-lgbm-forecast_1584652203_688e0e17\\\",\\n \\\"description\\\": null,\\n \\\"hidden\\\": false,\\n \\\"runType\\\": \\\"azureml.scriptrun\\\",\\n \\\"properties\\\": {\\n \\\"_azureml.ComputeTargetType\\\": \\\"amlcompute\\\",\\n \\\"ContentSnapshotId\\\": \\\"c3e4a829-831a-45f4-973d-e5093d4639c7\\\",\\n \\\"azureml.git.repository_uri\\\": \\\"git@github.com:microsoft/forecasting.git\\\",\\n \\\"mlflow.source.git.repoURL\\\": \\\"git@github.com:microsoft/forecasting.git\\\",\\n \\\"azureml.git.branch\\\": \\\"chenhui/hyperdrive_example_update\\\",\\n \\\"mlflow.source.git.branch\\\": \\\"chenhui/hyperdrive_example_update\\\",\\n \\\"azureml.git.commit\\\": \\\"a36fa88dade03b7811ab97e2f8d5120c643a2073\\\",\\n \\\"mlflow.source.git.commit\\\": \\\"a36fa88dade03b7811ab97e2f8d5120c643a2073\\\",\\n \\\"azureml.git.dirty\\\": \\\"True\\\",\\n \\\"AzureML.DerivedImageName\\\": \\\"azureml/azureml_7842fd2c5e99a43f1cca1341b66a0ecb\\\",\\n \\\"ProcessInfoFile\\\": \\\"azureml-logs/process_info.json\\\",\\n \\\"ProcessStatusFile\\\": \\\"azureml-logs/process_status.json\\\"\\n },\\n \\\"scriptName\\\": \\\"train_validate.py\\\",\\n \\\"target\\\": \\\"cpu-cluster\\\",\\n \\\"uniqueChildRunComputeTargets\\\": [],\\n \\\"tags\\\": {\\n \\\"_aml_system_ComputeTargetStatus\\\": \\\"{\\\\\\\"AllocationState\\\\\\\":\\\\\\\"steady\\\\\\\",\\\\\\\"PreparingNodeCount\\\\\\\":0,\\\\\\\"RunningNodeCount\\\\\\\":0,\\\\\\\"CurrentNodeCount\\\\\\\":4}\\\"\\n },\\n \\\"inputDatasets\\\": [],\\n \\\"runDefinition\\\": null,\\n \\\"createdFrom\\\": null,\\n \\\"cancelUri\\\": 
\\\"https://westcentralus.experiments.azureml.net/execution/v1.0/subscriptions/9086b59a-02d7-4687-b3fd-e39fa5e0fd9b/resourceGroups/chhamlwsrg/providers/Microsoft.MachineLearningServices/workspaces/chhamlws/experiments/tune-lgbm-forecast/runId/tune-lgbm-forecast_1584652203_688e0e17/cancel\\\",\\n \\\"completeUri\\\": null,\\n \\\"diagnosticsUri\\\": \\\"https://westcentralus.experiments.azureml.net/execution/v1.0/subscriptions/9086b59a-02d7-4687-b3fd-e39fa5e0fd9b/resourceGroups/chhamlwsrg/providers/Microsoft.MachineLearningServices/workspaces/chhamlws/experiments/tune-lgbm-forecast/runId/tune-lgbm-forecast_1584652203_688e0e17/diagnostics\\\",\\n \\\"computeRequest\\\": {\\n \\\"nodeCount\\\": 1\\n },\\n \\\"retainForLifetimeOfWorkspace\\\": false,\\n \\\"queueingInfo\\\": null\\n}\\n2020-03-19 21:11:03,630|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.RunClient.get-async:False|DEBUG|[STOP]\\n2020-03-19 21:11:03,631|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17|DEBUG|Constructing run from dto. 
type: azureml.scriptrun, source: None, props: {'_azureml.ComputeTargetType': 'amlcompute', 'ContentSnapshotId': 'c3e4a829-831a-45f4-973d-e5093d4639c7', 'azureml.git.repository_uri': 'git@github.com:microsoft/forecasting.git', 'mlflow.source.git.repoURL': 'git@github.com:microsoft/forecasting.git', 'azureml.git.branch': 'chenhui/hyperdrive_example_update', 'mlflow.source.git.branch': 'chenhui/hyperdrive_example_update', 'azureml.git.commit': 'a36fa88dade03b7811ab97e2f8d5120c643a2073', 'mlflow.source.git.commit': 'a36fa88dade03b7811ab97e2f8d5120c643a2073', 'azureml.git.dirty': 'True', 'AzureML.DerivedImageName': 'azureml/azureml_7842fd2c5e99a43f1cca1341b66a0ecb', 'ProcessInfoFile': 'azureml-logs/process_info.json', 'ProcessStatusFile': 'azureml-logs/process_status.json'}\\n2020-03-19 21:11:03,631|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunContextManager|DEBUG|Valid logs dir, setting up content loader\\n2020-03-19 21:11:32,573|azureml.core.authentication|DEBUG|Time to expire 1814312.426703 seconds\\n2020-03-19 21:11:49,120|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient|DEBUG|Overrides: Max batch size: 50, batch cushion: 5, Interval: 1.\\n2020-03-19 21:11:49,121|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.PostMetricsBatch.PostMetricsBatchDaemon|DEBUG|Starting daemon and triggering first instance\\n2020-03-19 21:11:49,133|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient|DEBUG|Used for use_batch=True.\\n2020-03-19 21:11:49,751|azureml.history._tracking.PythonWorkingDirectory.workingdir|DEBUG|Calling pyfs\\n2020-03-19 21:11:49,752|azureml.history._tracking.PythonWorkingDirectory|INFO|Current working dir: /mnt/batch/tasks/shared/LS_root/jobs/chhamlws/2c8fb242ef2843d9bed884402f5e3132/tune-lgbm-forecast_1584652203_688e0e17/mounts/workspaceblobstore/azureml/tune-lgbm-forecast_1584652203_688e0e17\\n2020-03-19 
21:11:49,752|azureml.history._tracking.PythonWorkingDirectory.workingdir|DEBUG|Reverting working dir from /mnt/batch/tasks/shared/LS_root/jobs/chhamlws/2c8fb242ef2843d9bed884402f5e3132/tune-lgbm-forecast_1584652203_688e0e17/mounts/workspaceblobstore/azureml/tune-lgbm-forecast_1584652203_688e0e17 to /mnt/batch/tasks/shared/LS_root/jobs/chhamlws/2c8fb242ef2843d9bed884402f5e3132/tune-lgbm-forecast_1584652203_688e0e17/mounts/workspaceblobstore/azureml/tune-lgbm-forecast_1584652203_688e0e17\\n2020-03-19 21:11:49,752|azureml.history._tracking.PythonWorkingDirectory|INFO|Working dir is already updated /mnt/batch/tasks/shared/LS_root/jobs/chhamlws/2c8fb242ef2843d9bed884402f5e3132/tune-lgbm-forecast_1584652203_688e0e17/mounts/workspaceblobstore/azureml/tune-lgbm-forecast_1584652203_688e0e17\\n2020-03-19 21:11:49,752|azureml.history._tracking.PythonWorkingDirectory.workingdir|DEBUG|[STOP]\\n2020-03-19 21:11:49,752|azureml.WorkingDirectoryCM|DEBUG|[STOP]\\n2020-03-19 21:11:49,752|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17|INFO|complete is not setting status for submitted runs.\\n2020-03-19 21:11:49,752|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.FlushingMetricsClient|DEBUG|[START]\\n2020-03-19 21:11:49,752|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient|DEBUG|Overrides: Max batch size: 50, batch cushion: 5, Interval: 1.\\n2020-03-19 21:11:49,752|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.PostMetricsBatch.PostMetricsBatchDaemon|DEBUG|Starting daemon and triggering first instance\\n2020-03-19 21:11:49,752|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient|DEBUG|Used for use_batch=True.\\n2020-03-19 21:11:49,753|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.PostMetricsBatch.WaitFlushSource:MetricsClient|DEBUG|[START]\\n2020-03-19 
21:11:49,753|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.PostMetricsBatch.WaitFlushSource:MetricsClient|DEBUG|flush timeout 300 is different from task queue timeout 120, using flush timeout\\n2020-03-19 21:11:49,753|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.PostMetricsBatch.WaitFlushSource:MetricsClient|DEBUG|Waiting 300 seconds on tasks: [].\\n2020-03-19 21:11:49,756|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.PostMetricsBatch|DEBUG|\\n2020-03-19 21:11:49,756|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.PostMetricsBatch.WaitFlushSource:MetricsClient|DEBUG|[STOP]\\n2020-03-19 21:11:49,756|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.FlushingMetricsClient|DEBUG|[STOP]\\n2020-03-19 21:11:49,756|azureml.RunStatusContext|DEBUG|[STOP]\\n2020-03-19 21:11:49,756|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.FlushingMetricsClient|DEBUG|[START]\\n2020-03-19 21:11:49,756|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.PostMetricsBatch.WaitFlushSource:MetricsClient|DEBUG|[START]\\n2020-03-19 21:11:49,756|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.PostMetricsBatch.WaitFlushSource:MetricsClient|DEBUG|flush timeout 300.0 is different from task queue timeout 120, using flush timeout\\n2020-03-19 21:11:49,757|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.PostMetricsBatch.WaitFlushSource:MetricsClient|DEBUG|Waiting 300.0 seconds on tasks: [].\\n2020-03-19 21:11:49,757|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.PostMetricsBatch|DEBUG|\\n2020-03-19 
21:11:49,757|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.PostMetricsBatch.WaitFlushSource:MetricsClient|DEBUG|[STOP]\\n2020-03-19 21:11:49,757|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.FlushingMetricsClient|DEBUG|[STOP]\\n2020-03-19 21:11:49,757|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.FlushingMetricsClient|DEBUG|[START]\\n2020-03-19 21:11:49,757|azureml.BatchTaskQueueAdd_1_Batches|DEBUG|[Start]\\n2020-03-19 21:11:49,757|azureml.BatchTaskQueueAdd_1_Batches.WorkerPool|DEBUG|submitting future: _handle_batch\\n2020-03-19 21:11:49,757|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.PostMetricsBatch|DEBUG|Batch size 1.\\n2020-03-19 21:11:49,758|azureml.BatchTaskQueueAdd_1_Batches.0__handle_batch|DEBUG|Using basic handler - no exception handling\\n2020-03-19 21:11:49,758|azureml._restclient.clientbase.WorkerPool|DEBUG|submitting future: _log_batch\\n2020-03-19 21:11:49,758|azureml.BatchTaskQueueAdd_1_Batches|DEBUG|Adding task 0__handle_batch to queue of approximate size: 0\\n2020-03-19 21:11:49,758|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.post_batch-async:False|DEBUG|[START]\\n2020-03-19 21:11:49,758|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.PostMetricsBatch.0__log_batch|DEBUG|Using basic handler - no exception handling\\n2020-03-19 21:11:49,758|azureml.BatchTaskQueueAdd_1_Batches|DEBUG|[Stop] - waiting default timeout\\n2020-03-19 21:11:49,759|msrest.service_client|DEBUG|Accept header absent and forced to application/json\\n2020-03-19 21:11:49,759|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.PostMetricsBatch|DEBUG|Adding task 0__log_batch to queue of approximate size: 0\\n2020-03-19 
21:11:49,760|azureml.BatchTaskQueueAdd_1_Batches.WaitFlushSource:BatchTaskQueueAdd_1_Batches|DEBUG|[START]\\n2020-03-19 21:11:49,760|msrest.universal_http.requests|DEBUG|Configuring retry: max_retries=3, backoff_factor=0.8, max_backoff=90\\n2020-03-19 21:11:49,760|azureml.BatchTaskQueueAdd_1_Batches.WaitFlushSource:BatchTaskQueueAdd_1_Batches|DEBUG|Overriding default flush timeout from None to 120\\n2020-03-19 21:11:49,760|msrest.http_logger|DEBUG|Request URL: 'https://westcentralus.experiments.azureml.net/history/v1.0/subscriptions/9086b59a-02d7-4687-b3fd-e39fa5e0fd9b/resourceGroups/chhamlwsrg/providers/Microsoft.MachineLearningServices/workspaces/chhamlws/experiments/tune-lgbm-forecast/runs/tune-lgbm-forecast_1584652203_688e0e17/batch/metrics'\\n2020-03-19 21:11:49,761|azureml.BatchTaskQueueAdd_1_Batches.WaitFlushSource:BatchTaskQueueAdd_1_Batches|DEBUG|Waiting 120 seconds on tasks: [AsyncTask(0__handle_batch)].\\n2020-03-19 21:11:49,761|msrest.http_logger|DEBUG|Request method: 'POST'\\n2020-03-19 21:11:49,761|azureml.BatchTaskQueueAdd_1_Batches.0__handle_batch.WaitingTask|DEBUG|[START]\\n2020-03-19 21:11:49,761|msrest.http_logger|DEBUG|Request headers:\\n2020-03-19 21:11:49,761|azureml.BatchTaskQueueAdd_1_Batches.0__handle_batch.WaitingTask|DEBUG|Awaiter is BatchTaskQueueAdd_1_Batches\\n2020-03-19 21:11:49,761|msrest.http_logger|DEBUG| 'Accept': 'application/json'\\n2020-03-19 21:11:49,761|azureml.BatchTaskQueueAdd_1_Batches.0__handle_batch.WaitingTask|DEBUG|[STOP]\\n2020-03-19 21:11:49,761|msrest.http_logger|DEBUG| 'Content-Type': 'application/json-patch+json; charset=utf-8'\\n2020-03-19 21:11:49,762|azureml.BatchTaskQueueAdd_1_Batches|DEBUG|\\n2020-03-19 21:11:49,762|msrest.http_logger|DEBUG| 'x-ms-client-request-id': '2b7451d4-0c71-4763-8101-a0e3fa53f3ac'\\n2020-03-19 21:11:49,762|azureml.BatchTaskQueueAdd_1_Batches.WaitFlushSource:BatchTaskQueueAdd_1_Batches|DEBUG|[STOP]\\n2020-03-19 21:11:49,762|msrest.http_logger|DEBUG| 'request-id': 
'2b7451d4-0c71-4763-8101-a0e3fa53f3ac'\\n2020-03-19 21:11:49,762|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.PostMetricsBatch.WaitFlushSource:MetricsClient|DEBUG|[START]\\n2020-03-19 21:11:49,762|msrest.http_logger|DEBUG| 'Content-Length': '341'\\n2020-03-19 21:11:49,762|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.PostMetricsBatch.WaitFlushSource:MetricsClient|DEBUG|flush timeout 300.0 is different from task queue timeout 120, using flush timeout\\n2020-03-19 21:11:49,762|msrest.http_logger|DEBUG| 'User-Agent': 'python/3.6.2 (Linux-4.15.0-1067-azure-x86_64-with-debian-stretch-sid) msrest/0.6.11 azureml._restclient/core.1.0.85 sdk_run'\\n2020-03-19 21:11:49,763|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.PostMetricsBatch.WaitFlushSource:MetricsClient|DEBUG|Waiting 300.0 seconds on tasks: [AsyncTask(0__log_batch)].\\n2020-03-19 21:11:49,763|msrest.http_logger|DEBUG|Request body:\\n2020-03-19 21:11:49,763|msrest.http_logger|DEBUG|{\\\"values\\\": [{\\\"metricId\\\": \\\"408d82dc-c710-4472-8aa0-38b854a71a92\\\", \\\"metricType\\\": \\\"azureml.v1.scalar\\\", \\\"createdUtc\\\": \\\"2020-03-19T21:11:49.120039Z\\\", \\\"name\\\": \\\"MAPE\\\", \\\"description\\\": \\\"\\\", \\\"numCells\\\": 1, \\\"cells\\\": [{\\\"MAPE\\\": 66.59144474679267}], \\\"schema\\\": {\\\"numProperties\\\": 1, \\\"properties\\\": [{\\\"propertyId\\\": \\\"MAPE\\\", \\\"name\\\": \\\"MAPE\\\", \\\"type\\\": \\\"float\\\"}]}}]}\\n2020-03-19 21:11:49,763|msrest.universal_http|DEBUG|Configuring redirects: allow=True, max=30\\n2020-03-19 21:11:49,763|msrest.universal_http|DEBUG|Configuring request: timeout=100, verify=True, cert=None\\n2020-03-19 21:11:49,763|msrest.universal_http|DEBUG|Configuring proxies: ''\\n2020-03-19 21:11:49,763|msrest.universal_http|DEBUG|Evaluate proxies against ENV settings: True\\n2020-03-19 
21:11:50,292|msrest.http_logger|DEBUG|Response status: 200\\n2020-03-19 21:11:50,293|msrest.http_logger|DEBUG|Response headers:\\n2020-03-19 21:11:50,293|msrest.http_logger|DEBUG| 'Date': 'Thu, 19 Mar 2020 21:11:50 GMT'\\n2020-03-19 21:11:50,293|msrest.http_logger|DEBUG| 'Content-Length': '0'\\n2020-03-19 21:11:50,293|msrest.http_logger|DEBUG| 'Connection': 'keep-alive'\\n2020-03-19 21:11:50,293|msrest.http_logger|DEBUG| 'Request-Context': 'appId=cid-v1:2d2e8e63-272e-4b3c-8598-4ee570a0e70d'\\n2020-03-19 21:11:50,293|msrest.http_logger|DEBUG| 'x-ms-client-request-id': '2b7451d4-0c71-4763-8101-a0e3fa53f3ac'\\n2020-03-19 21:11:50,293|msrest.http_logger|DEBUG| 'x-ms-client-session-id': ''\\n2020-03-19 21:11:50,294|msrest.http_logger|DEBUG| 'Strict-Transport-Security': 'max-age=15724800; includeSubDomains; preload'\\n2020-03-19 21:11:50,294|msrest.http_logger|DEBUG| 'x-request-time': '0.419'\\n2020-03-19 21:11:50,294|msrest.http_logger|DEBUG| 'X-Content-Type-Options': 'nosniff'\\n2020-03-19 21:11:50,294|msrest.http_logger|DEBUG|Response content:\\n2020-03-19 21:11:50,294|msrest.http_logger|DEBUG|\\n2020-03-19 21:11:50,296|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.post_batch-async:False|DEBUG|[STOP]\\n2020-03-19 21:11:50,514|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.PostMetricsBatch.0__log_batch.WaitingTask|DEBUG|[START]\\n2020-03-19 21:11:50,515|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.PostMetricsBatch.0__log_batch.WaitingTask|DEBUG|Awaiter is PostMetricsBatch\\n2020-03-19 21:11:50,515|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.PostMetricsBatch.0__log_batch.WaitingTask|DEBUG|[STOP]\\n2020-03-19 21:11:50,515|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.PostMetricsBatch|DEBUG|Waiting on task: 0__log_batch.\\n1 tasks left. 
Current duration of flush 0.00025343894958496094 seconds.\\nWaiting on task: 0__log_batch.\\n1 tasks left. Current duration of flush 0.2507054805755615 seconds.\\nWaiting on task: 0__log_batch.\\n1 tasks left. Current duration of flush 0.5011677742004395 seconds.\\n\\n2020-03-19 21:11:50,515|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.PostMetricsBatch.WaitFlushSource:MetricsClient|DEBUG|[STOP]\\n2020-03-19 21:11:50,515|azureml._SubmittedRun#tune-lgbm-forecast_1584652203_688e0e17.RunHistoryFacade.MetricsClient.FlushingMetricsClient|DEBUG|[STOP]\\n2020-03-19 21:11:50,516|azureml.SendRunKillSignal|DEBUG|[STOP]\\n2020-03-19 21:11:50,516|azureml.HistoryTrackingWorkerPool.WorkerPoolShutdown|DEBUG|[START]\\n2020-03-19 21:11:50,516|azureml.HistoryTrackingWorkerPool.WorkerPoolShutdown|DEBUG|[STOP]\\n2020-03-19 21:11:50,516|azureml.WorkerPool|DEBUG|[STOP]\\n\\nRun is completed.\", \"graph\": {}, \"widget_settings\": {\"childWidgetDisplay\": \"popup\", \"send_telemetry\": true, \"log_level\": \"INFO\", \"sdk_version\": \"1.0.85\"}, \"loading\": false}" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "RunDetails(run_remote).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can check the validation metric after the job finishes. The validation metric should be the same as the one we obtained when the script was run locally. For more details of the job, you can execute `run_remote.get_details()`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'MAPE': 66.59144474679267}" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get metric value after the job finishes\n", + "while run_remote.get_status() != \"Completed\":\n", + " {}\n", + "run_remote.get_metrics()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Tune Hyperparameters using HyperDrive\n", + "\n", + "Now we are ready to tune the hyperparameters of the LightGBM forecast model by launching multiple runs on the cluster. In the following cell, we define the configurations of a HyperDrive job that performs a parallel search of the hyperparameter space using a Bayesian sampling method. HyperDrive also supports random sampling of the parameter space.\n", + "\n", + "It is recommended that the maximum number of runs should be greater than or equal to 20 times the number of hyperparameters being tuned, for best results with Bayesian sampling. Specifically, it should be no less than 180 in the following case. Nevertheless, we find that even with a very small number of runs, Bayesian search can achieve decent performance. Thus, the maximum number of child runs of HyperDrive `max_total_runs` is set to `20` to reduce the running time." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "For best results with Bayesian Sampling we recommend using a maximum number of runs greater than or equal to 20 times the number of hyperparameters being tuned. Current value for max_total_runs:20. 
Recommendend value:180.\n" + ] + } + ], + "source": [ + "# Increase this value if you want to achieve better performance\n", + "max_total_runs = 20\n", + "script_params = {\"--data-folder\": ds_data.as_mount()}\n", + "est = Estimator(\n", + " source_directory=script_folder,\n", + " script_params=script_params,\n", + " compute_target=compute_target,\n", + " use_docker=True,\n", + " entry_script=train_script_name,\n", + " environment_definition=env,\n", + ")\n", + "\n", + "# Specify hyperparameter space\n", + "ps = BayesianParameterSampling(\n", + " {\n", + " \"--num-leaves\": quniform(8, 128, 1),\n", + " \"--min-data-in-leaf\": quniform(20, 500, 10),\n", + " \"--learning-rate\": choice(\n", + " 1e-4, 1e-3, 5e-3, 1e-2, 1.5e-2, 2e-2, 3e-2, 5e-2, 1e-1\n", + " ),\n", + " \"--feature-fraction\": uniform(0.2, 1),\n", + " \"--bagging-fraction\": uniform(0.1, 1),\n", + " \"--bagging-freq\": quniform(1, 20, 1),\n", + " \"--max-rounds\": quniform(50, 2000, 10),\n", + " \"--max-lag\": quniform(3, 40, 1),\n", + " \"--window-size\": quniform(3, 40, 1),\n", + " }\n", + ")\n", + "\n", + "# HyperDrive job configuration\n", + "htc = HyperDriveConfig(\n", + " estimator=est,\n", + " hyperparameter_sampling=ps,\n", + " primary_metric_name=\"MAPE\",\n", + " primary_metric_goal=PrimaryMetricGoal.MINIMIZE,\n", + " max_total_runs=max_total_runs,\n", + " max_concurrent_runs=4,\n", + ")\n", + "\n", + "htr = exp.submit(config=htc)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the job finishes, you should see outputs from the AzureML widgets similar to the following. 
Note that you can rerun `RunDetails(htr).show()` after the job finishes to get the updated results on the dashboard in case it is not automatically refreshed.\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5cc42add84c1482eb12f0659c0f0eb9d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO',…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/aml.mini.widget.v1": "{\"status\": \"Running\", \"workbench_run_details_uri\": \"https://ml.azure.com/experiments/tune-lgbm-forecast/runs/HD_2d2796ac-1717-4dce-896d-905dcbce2c1a?wsid=/subscriptions/9086b59a-02d7-4687-b3fd-e39fa5e0fd9b/resourcegroups/chhamlwsrg/workspaces/chhamlws\", \"run_id\": \"HD_2d2796ac-1717-4dce-896d-905dcbce2c1a\", \"run_properties\": {\"run_id\": \"HD_2d2796ac-1717-4dce-896d-905dcbce2c1a\", \"created_utc\": \"2020-03-19T21:12:15.556256Z\", \"properties\": {\"primary_metric_config\": \"{\\\"name\\\": \\\"MAPE\\\", \\\"goal\\\": \\\"minimize\\\"}\", \"resume_from\": \"null\", \"runTemplate\": \"HyperDrive\", \"azureml.runsource\": \"hyperdrive\", \"platform\": \"AML\", \"ContentSnapshotId\": \"c3e4a829-831a-45f4-973d-e5093d4639c7\"}, \"tags\": {\"max_concurrent_jobs\": \"4\", \"max_total_jobs\": \"20\", \"max_duration_minutes\": \"10080\", \"policy_config\": \"{\\\"name\\\": \\\"DEFAULT\\\"}\", \"generator_config\": \"{\\\"name\\\": \\\"BAYESIANOPTIMIZATION\\\", \\\"parameter_space\\\": {\\\"--num-leaves\\\": [\\\"quniform\\\", [8, 128, 1]], \\\"--min-data-in-leaf\\\": [\\\"quniform\\\", [20, 500, 10]], \\\"--learning-rate\\\": [\\\"choice\\\", [[0.0001, 0.001, 0.005, 0.01, 0.015, 0.02, 0.03, 0.05, 0.1]]], \\\"--feature-fraction\\\": [\\\"uniform\\\", [0.2, 1]], 
\\\"--bagging-fraction\\\": [\\\"uniform\\\", [0.1, 1]], \\\"--bagging-freq\\\": [\\\"quniform\\\", [1, 20, 1]], \\\"--max-rounds\\\": [\\\"quniform\\\", [50, 2000, 10]], \\\"--max-lag\\\": [\\\"quniform\\\", [3, 40, 1]], \\\"--window-size\\\": [\\\"quniform\\\", [3, 40, 1]]}}\", \"primary_metric_config\": \"{\\\"name\\\": \\\"MAPE\\\", \\\"goal\\\": \\\"minimize\\\"}\", \"platform_config\": \"{\\\"ServiceAddress\\\": \\\"https://westcentralus.experiments.azureml.net\\\", \\\"ServiceArmScope\\\": \\\"subscriptions/9086b59a-02d7-4687-b3fd-e39fa5e0fd9b/resourceGroups/chhamlwsrg/providers/Microsoft.MachineLearningServices/workspaces/chhamlws/experiments/tune-lgbm-forecast\\\", \\\"SubscriptionId\\\": \\\"9086b59a-02d7-4687-b3fd-e39fa5e0fd9b\\\", \\\"ResourceGroupName\\\": \\\"chhamlwsrg\\\", \\\"WorkspaceName\\\": \\\"chhamlws\\\", \\\"ExperimentName\\\": \\\"tune-lgbm-forecast\\\", \\\"Definition\\\": {\\\"Overrides\\\": {\\\"script\\\": \\\"train_validate.py\\\", \\\"arguments\\\": [\\\"--data-folder\\\", \\\"$AZUREML_DATAREFERENCE_5442e91c25f449ff9a9780f48c6d7792\\\"], \\\"target\\\": \\\"cpu-cluster\\\", \\\"framework\\\": \\\"Python\\\", \\\"communicator\\\": \\\"None\\\", \\\"maxRunDurationSeconds\\\": null, \\\"nodeCount\\\": 1, \\\"environment\\\": {\\\"name\\\": null, \\\"version\\\": null, \\\"environmentVariables\\\": {\\\"EXAMPLE_ENV_VAR\\\": \\\"EXAMPLE_VALUE\\\"}, \\\"python\\\": {\\\"userManagedDependencies\\\": false, \\\"interpreterPath\\\": \\\"python\\\", \\\"condaDependenciesFile\\\": null, \\\"baseCondaEnvironment\\\": null, \\\"condaDependencies\\\": {\\\"name\\\": \\\"project_environment\\\", \\\"dependencies\\\": [\\\"python=3.6.2\\\", {\\\"pip\\\": [\\\"azureml-defaults\\\"]}, \\\"pandas\\\", \\\"numpy\\\", \\\"scipy\\\", \\\"scikit-learn\\\", \\\"lightgbm\\\", \\\"joblib\\\"], \\\"channels\\\": [\\\"conda-forge\\\"]}}, \\\"docker\\\": {\\\"enabled\\\": true, \\\"baseImage\\\": \\\"mcr.microsoft.com/azureml/base:intelmpi2018.3-ubuntu16.04\\\", 
\\\"baseDockerfile\\\": null, \\\"sharedVolumes\\\": true, \\\"shmSize\\\": \\\"2g\\\", \\\"arguments\\\": [], \\\"baseImageRegistry\\\": {\\\"address\\\": null, \\\"username\\\": null, \\\"password\\\": null}}, \\\"spark\\\": {\\\"repositories\\\": [], \\\"packages\\\": [], \\\"precachePackages\\\": true}, \\\"databricks\\\": {\\\"mavenLibraries\\\": [], \\\"pypiLibraries\\\": [], \\\"rcranLibraries\\\": [], \\\"jarLibraries\\\": [], \\\"eggLibraries\\\": []}, \\\"inferencingStackVersion\\\": null}, \\\"history\\\": {\\\"outputCollection\\\": true, \\\"snapshotProject\\\": true, \\\"directoriesToWatch\\\": [\\\"logs\\\"]}, \\\"spark\\\": {\\\"configuration\\\": {\\\"spark.app.name\\\": \\\"Azure ML Experiment\\\", \\\"spark.yarn.maxAppAttempts\\\": 1}}, \\\"hdi\\\": {\\\"yarnDeployMode\\\": \\\"cluster\\\"}, \\\"tensorflow\\\": {\\\"workerCount\\\": 1, \\\"parameterServerCount\\\": 1}, \\\"mpi\\\": {\\\"processCountPerNode\\\": 1}, \\\"dataReferences\\\": {\\\"5442e91c25f449ff9a9780f48c6d7792\\\": {\\\"dataStoreName\\\": \\\"workspaceblobstore\\\", \\\"pathOnDataStore\\\": \\\"data\\\", \\\"mode\\\": \\\"mount\\\", \\\"overwrite\\\": false, \\\"pathOnCompute\\\": null}}, \\\"data\\\": {}, \\\"sourceDirectoryDataStore\\\": null, \\\"amlcompute\\\": {\\\"vmSize\\\": null, \\\"vmPriority\\\": null, \\\"retainCluster\\\": false, \\\"name\\\": null, \\\"clusterMaxNodeCount\\\": 1}}, \\\"TargetDetails\\\": null, \\\"SnapshotId\\\": \\\"c3e4a829-831a-45f4-973d-e5093d4639c7\\\", \\\"TelemetryValues\\\": {\\\"amlClientType\\\": \\\"azureml-sdk-train\\\", \\\"amlClientModule\\\": \\\"azureml.train.hyperdrive._search\\\", \\\"amlClientFunction\\\": \\\"search\\\", \\\"tenantId\\\": \\\"72f988bf-86f1-41af-91ab-2d7cd011db47\\\", \\\"amlClientRequestId\\\": \\\"460b5db2-3fac-4065-bb34-fa43a1fbcb53\\\", \\\"amlClientSessionId\\\": \\\"729a95f4-a59a-471a-94a6-cb8bffc6ea3a\\\", \\\"subscriptionId\\\": \\\"9086b59a-02d7-4687-b3fd-e39fa5e0fd9b\\\", \\\"estimator\\\": 
\\\"Estimator\\\", \\\"samplingMethod\\\": \\\"BayesianOptimization\\\", \\\"terminationPolicy\\\": \\\"Default\\\", \\\"primaryMetricGoal\\\": \\\"minimize\\\", \\\"maxTotalRuns\\\": 20, \\\"maxConcurrentRuns\\\": 4, \\\"maxDurationMinutes\\\": 10080, \\\"computeTarget\\\": \\\"AmlCompute\\\", \\\"vmSize\\\": null}}}\", \"resume_child_runs\": \"null\", \"all_jobs_generated\": \"false\", \"cancellation_requested\": \"false\", \"progress_metadata_evaluation_timestamp\": \"\\\"2020-03-19T21:12:16.513664\\\"\", \"progress_metadata_digest\": \"\\\"5fea6da115a28e7e578bf15f72768a97c8336919197281c1b5a301933aad5e22\\\"\", \"progress_metadata_active_timestamp\": \"\\\"2020-03-19T21:12:16.513664\\\"\", \"HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_0\": \"{\\\"--num-leaves\\\": 99, \\\"--min-data-in-leaf\\\": 50, \\\"--learning-rate\\\": 0.1, \\\"--feature-fraction\\\": 0.8132296030587374, \\\"--bagging-fraction\\\": 0.7418407116160189, \\\"--bagging-freq\\\": 16, \\\"--max-rounds\\\": 1930, \\\"--max-lag\\\": 17, \\\"--window-size\\\": 21}\", \"HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_1\": \"{\\\"--num-leaves\\\": 27, \\\"--min-data-in-leaf\\\": 120, \\\"--learning-rate\\\": 0.005, \\\"--feature-fraction\\\": 0.3706438325846761, \\\"--bagging-fraction\\\": 0.5542795601096442, \\\"--bagging-freq\\\": 18, \\\"--max-rounds\\\": 1760, \\\"--max-lag\\\": 33, \\\"--window-size\\\": 15}\", \"HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_2\": \"{\\\"--num-leaves\\\": 28, \\\"--min-data-in-leaf\\\": 310, \\\"--learning-rate\\\": 0.0001, \\\"--feature-fraction\\\": 0.4168752967156847, \\\"--bagging-fraction\\\": 0.6030886362720486, \\\"--bagging-freq\\\": 3, \\\"--max-rounds\\\": 720, \\\"--max-lag\\\": 21, \\\"--window-size\\\": 35}\", \"HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_3\": \"{\\\"--num-leaves\\\": 15, \\\"--min-data-in-leaf\\\": 390, \\\"--learning-rate\\\": 0.03, \\\"--feature-fraction\\\": 0.3668251847896762, \\\"--bagging-fraction\\\": 0.3815013306553845, \\\"--bagging-freq\\\": 12, 
\\\"--max-rounds\\\": 770, \\\"--max-lag\\\": 38, \\\"--window-size\\\": 28}\", \"environment_preparation_status\": \"PREPARED\", \"prepare_run_id\": \"HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_preparation\", \"HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_4\": \"{\\\"--num-leaves\\\": 13, \\\"--min-data-in-leaf\\\": 450, \\\"--learning-rate\\\": 0.02, \\\"--feature-fraction\\\": 0.2901471521383765, \\\"--bagging-fraction\\\": 0.3065771878629929, \\\"--bagging-freq\\\": 17, \\\"--max-rounds\\\": 1800, \\\"--max-lag\\\": 4, \\\"--window-size\\\": 37}\", \"HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_5\": \"{\\\"--num-leaves\\\": 80, \\\"--min-data-in-leaf\\\": 30, \\\"--learning-rate\\\": 0.03, \\\"--feature-fraction\\\": 0.9078890478449191, \\\"--bagging-fraction\\\": 0.45835888307459804, \\\"--bagging-freq\\\": 10, \\\"--max-rounds\\\": 650, \\\"--max-lag\\\": 27, \\\"--window-size\\\": 25}\", \"HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_6\": \"{\\\"--num-leaves\\\": 53, \\\"--min-data-in-leaf\\\": 250, \\\"--learning-rate\\\": 0.001, \\\"--feature-fraction\\\": 0.9904628066051346, \\\"--bagging-fraction\\\": 0.9205596553917755, \\\"--bagging-freq\\\": 15, \\\"--max-rounds\\\": 1340, \\\"--max-lag\\\": 22, \\\"--window-size\\\": 34}\"}, \"end_time_utc\": null, \"status\": \"Running\", \"log_files\": {\"azureml-logs/hyperdrive.txt\": \"https://chhamlws4931040064.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_2d2796ac-1717-4dce-896d-905dcbce2c1a/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=VLRUwMTzJCdY%2FZ4j9Zw%2Bh7AEBdIiFhexEtfRTmbjhzQ%3D&st=2020-03-19T21%3A07%3A21Z&se=2020-03-20T05%3A17%3A21Z&sp=r\"}, \"log_groups\": [[\"azureml-logs/hyperdrive.txt\"]], \"run_duration\": \"0:05:06\", \"hyper_parameters\": {\"--num-leaves\": [\"quniform\", [8, 128, 1]], \"--min-data-in-leaf\": [\"quniform\", [20, 500, 10]], \"--learning-rate\": [\"choice\", [[0.0001, 0.001, 0.005, 0.01, 0.015, 0.02, 0.03, 0.05, 0.1]]], \"--feature-fraction\": [\"uniform\", [0.2, 1]], 
\"--bagging-fraction\": [\"uniform\", [0.1, 1]], \"--bagging-freq\": [\"quniform\", [1, 20, 1]], \"--max-rounds\": [\"quniform\", [50, 2000, 10]], \"--max-lag\": [\"quniform\", [3, 40, 1]], \"--window-size\": [\"quniform\", [3, 40, 1]]}}, \"child_runs\": [{\"run_id\": \"HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_2\", \"run_number\": 126, \"metric\": 75.24892867, \"status\": \"Completed\", \"run_type\": \"azureml.scriptrun\", \"training_percent\": null, \"start_time\": \"2020-03-19T21:13:22.910343Z\", \"end_time\": \"2020-03-19T21:14:57.403012Z\", \"created_time\": \"2020-03-19T21:12:49.005902Z\", \"created_time_dt\": \"2020-03-19T21:12:49.005902Z\", \"duration\": \"0:02:08\", \"hyperdrive_id\": \"2d2796ac-1717-4dce-896d-905dcbce2c1a\", \"arguments\": null, \"param_--num-leaves\": 28, \"param_--min-data-in-leaf\": 310, \"param_--learning-rate\": 0.0001, \"param_--feature-fraction\": 0.4168752967156847, \"param_--bagging-fraction\": 0.6030886362720486, \"param_--bagging-freq\": 3, \"param_--max-rounds\": 720, \"param_--max-lag\": 21, \"param_--window-size\": 35, \"best_metric\": 75.24892867}, {\"run_id\": \"HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_1\", \"run_number\": 127, \"metric\": 39.54132385, \"status\": \"Completed\", \"run_type\": \"azureml.scriptrun\", \"training_percent\": null, \"start_time\": \"2020-03-19T21:13:22.100062Z\", \"end_time\": \"2020-03-19T21:15:46.098653Z\", \"created_time\": \"2020-03-19T21:12:49.08304Z\", \"created_time_dt\": \"2020-03-19T21:12:49.08304Z\", \"duration\": \"0:02:57\", \"hyperdrive_id\": \"2d2796ac-1717-4dce-896d-905dcbce2c1a\", \"arguments\": null, \"param_--num-leaves\": 27, \"param_--min-data-in-leaf\": 120, \"param_--learning-rate\": 0.005, \"param_--feature-fraction\": 0.3706438325846761, \"param_--bagging-fraction\": 0.5542795601096442, \"param_--bagging-freq\": 18, \"param_--max-rounds\": 1760, \"param_--max-lag\": 33, \"param_--window-size\": 15, \"best_metric\": 39.54132385}, {\"run_id\": 
\"HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_0\", \"run_number\": 128, \"metric\": 36.68529907, \"status\": \"Completed\", \"run_type\": \"azureml.scriptrun\", \"training_percent\": null, \"start_time\": \"2020-03-19T21:13:23.334757Z\", \"end_time\": \"2020-03-19T21:17:20.148798Z\", \"created_time\": \"2020-03-19T21:12:49.2375Z\", \"created_time_dt\": \"2020-03-19T21:12:49.2375Z\", \"duration\": \"0:04:30\", \"hyperdrive_id\": \"2d2796ac-1717-4dce-896d-905dcbce2c1a\", \"arguments\": null, \"param_--num-leaves\": 99, \"param_--min-data-in-leaf\": 50, \"param_--learning-rate\": 0.1, \"param_--feature-fraction\": 0.8132296030587374, \"param_--bagging-fraction\": 0.7418407116160189, \"param_--bagging-freq\": 16, \"param_--max-rounds\": 1930, \"param_--max-lag\": 17, \"param_--window-size\": 21, \"best_metric\": 36.68529907}, {\"run_id\": \"HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_3\", \"run_number\": 129, \"metric\": 37.11300476, \"status\": \"Completed\", \"run_type\": \"azureml.scriptrun\", \"training_percent\": null, \"start_time\": \"2020-03-19T21:13:21.844298Z\", \"end_time\": \"2020-03-19T21:15:02.636388Z\", \"created_time\": \"2020-03-19T21:12:49.556578Z\", \"created_time_dt\": \"2020-03-19T21:12:49.556578Z\", \"duration\": \"0:02:13\", \"hyperdrive_id\": \"2d2796ac-1717-4dce-896d-905dcbce2c1a\", \"arguments\": null, \"param_--num-leaves\": 15, \"param_--min-data-in-leaf\": 390, \"param_--learning-rate\": 0.03, \"param_--feature-fraction\": 0.3668251847896762, \"param_--bagging-fraction\": 0.3815013306553845, \"param_--bagging-freq\": 12, \"param_--max-rounds\": 770, \"param_--max-lag\": 38, \"param_--window-size\": 28, \"best_metric\": 36.68529907}, {\"run_id\": \"HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_5\", \"run_number\": 130, \"metric\": 41.06715386, \"status\": \"Running\", \"run_type\": \"azureml.scriptrun\", \"training_percent\": null, \"start_time\": \"2020-03-19T21:15:55.338891Z\", \"end_time\": \"\", \"created_time\": \"2020-03-19T21:15:22.195216Z\", 
\"created_time_dt\": \"2020-03-19T21:15:22.195216Z\", \"duration\": \"0:02:00\", \"hyperdrive_id\": \"2d2796ac-1717-4dce-896d-905dcbce2c1a\", \"arguments\": null, \"param_--num-leaves\": 80, \"param_--min-data-in-leaf\": 30, \"param_--learning-rate\": 0.03, \"param_--feature-fraction\": 0.9078890478449191, \"param_--bagging-fraction\": 0.45835888307459804, \"param_--bagging-freq\": 10, \"param_--max-rounds\": 650, \"param_--max-lag\": 27, \"param_--window-size\": 25, \"best_metric\": 36.68529907}, {\"run_id\": \"HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_4\", \"run_number\": 131, \"metric\": null, \"status\": \"Running\", \"run_type\": \"azureml.scriptrun\", \"training_percent\": null, \"start_time\": \"2020-03-19T21:15:56.362681Z\", \"end_time\": \"\", \"created_time\": \"2020-03-19T21:15:22.801193Z\", \"created_time_dt\": \"2020-03-19T21:15:22.801193Z\", \"duration\": \"0:01:59\", \"hyperdrive_id\": \"2d2796ac-1717-4dce-896d-905dcbce2c1a\", \"arguments\": null, \"param_--num-leaves\": 13, \"param_--min-data-in-leaf\": 450, \"param_--learning-rate\": 0.02, \"param_--feature-fraction\": 0.2901471521383765, \"param_--bagging-fraction\": 0.3065771878629929, \"param_--bagging-freq\": 17, \"param_--max-rounds\": 1800, \"param_--max-lag\": 4, \"param_--window-size\": 37, \"best_metric\": null}, {\"run_id\": \"HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_6\", \"run_number\": 132, \"metric\": null, \"status\": \"Running\", \"run_type\": \"azureml.scriptrun\", \"training_percent\": null, \"start_time\": \"2020-03-19T21:16:25.813014Z\", \"end_time\": \"\", \"created_time\": \"2020-03-19T21:15:53.807197Z\", \"created_time_dt\": \"2020-03-19T21:15:53.807197Z\", \"duration\": \"0:01:28\", \"hyperdrive_id\": \"2d2796ac-1717-4dce-896d-905dcbce2c1a\", \"arguments\": null, \"param_--num-leaves\": 53, \"param_--min-data-in-leaf\": 250, \"param_--learning-rate\": 0.001, \"param_--feature-fraction\": 0.9904628066051346, \"param_--bagging-fraction\": 0.9205596553917755, 
\"param_--bagging-freq\": 15, \"param_--max-rounds\": 1340, \"param_--max-lag\": 22, \"param_--window-size\": 34, \"best_metric\": null}], \"children_metrics\": {\"categories\": [0], \"series\": {\"MAPE\": [{\"categories\": [126, 127, 128, 129, 130], \"mode\": \"markers\", \"name\": \"MAPE\", \"stepped\": false, \"type\": \"scatter\", \"data\": [75.2489286722675, 39.5413238482365, 36.68529906716439, 37.1130047640178, 41.06715386014593]}, {\"categories\": [126, 127, 128, 129, 130], \"mode\": \"lines\", \"name\": \"MAPE_min\", \"stepped\": true, \"type\": \"scatter\", \"data\": [75.2489286722675, 39.5413238482365, 36.68529906716439, 36.68529906716439, 36.68529906716439]}]}, \"metricName\": null, \"primaryMetricName\": \"MAPE\", \"showLegend\": false}, \"run_metrics\": [{\"name\": \"best_child_by_primary_metric\", \"run_id\": \"HD_2d2796ac-1717-4dce-896d-905dcbce2c1a\", \"categories\": [0], \"series\": [{\"data\": [{\"metric_name\": \"MAPE\", \"timestamp\": \"2020-03-19 21:15:15.945032+00:00\", \"run_id\": \"HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_3\", \"metric_value\": 37.1130047640178, \"final\": false}]}]}], \"run_logs\": \"[2020-03-19T21:12:15.797368][API][INFO]Experiment created\\r\\n[2020-03-19T21:12:16.647816][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space\\r\\n[2020-03-19T21:12:16.8347749Z][SCHEDULER][INFO]The execution environment is being prepared. 
Please be patient as it can take a few minutes.\\r\\n[2020-03-19T21:12:16.763629][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.\\r\\n[2020-03-19T21:12:48.2820889Z][SCHEDULER][INFO]Scheduling job, id='HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_3'\\r\\n[2020-03-19T21:12:48.2795514Z][SCHEDULER][INFO]Scheduling job, id='HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_1'\\r\\n[2020-03-19T21:12:48.2806839Z][SCHEDULER][INFO]Scheduling job, id='HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_2'\\r\\n[2020-03-19T21:12:48.2772616Z][SCHEDULER][INFO]The execution environment was successfully prepared.\\r\\n[2020-03-19T21:12:48.2782467Z][SCHEDULER][INFO]Scheduling job, id='HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_0'\\r\\n[2020-03-19T21:12:49.1346437Z][SCHEDULER][INFO]Successfully scheduled a job. Id='HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_2'\\r\\n[2020-03-19T21:12:49.2097647Z][SCHEDULER][INFO]Successfully scheduled a job. Id='HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_1'\\r\\n[2020-03-19T21:12:49.5936141Z][SCHEDULER][INFO]Successfully scheduled a job. Id='HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_0'\\r\\n[2020-03-19T21:12:49.6453613Z][SCHEDULER][INFO]Successfully scheduled a job. Id='HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_3'\\r\\n[2020-03-19T21:15:19.237941][GENERATOR][INFO]Trying to sample '2' jobs from the hyperparameter space\\r\\n[2020-03-19T21:15:19.548213][GENERATOR][INFO]Successfully sampled '2' jobs, they will soon be submitted to the execution target.\\r\\n[2020-03-19T21:15:21.2292149Z][SCHEDULER][INFO]Scheduling job, id='HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_4'\\r\\n[2020-03-19T21:15:21.2308195Z][SCHEDULER][INFO]Scheduling job, id='HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_5'\\r\\n[2020-03-19T21:15:22.3288867Z][SCHEDULER][INFO]Successfully scheduled a job. Id='HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_5'\\r\\n[2020-03-19T21:15:23.1078686Z][SCHEDULER][INFO]Successfully scheduled a job. 
Id='HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_4'\\r\\n[2020-03-19T21:15:49.563444][GENERATOR][INFO]Trying to sample '1' jobs from the hyperparameter space\\r\\n[2020-03-19T21:15:49.950635][GENERATOR][INFO]Successfully sampled '1' jobs, they will soon be submitted to the execution target.\\r\\n[2020-03-19T21:15:53.3050388Z][SCHEDULER][INFO]Scheduling job, id='HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_6'\\r\\n[2020-03-19T21:15:53.8900932Z][SCHEDULER][INFO]Successfully scheduled a job. Id='HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_6'\\n\", \"graph\": {}, \"widget_settings\": {\"childWidgetDisplay\": \"popup\", \"send_telemetry\": true, \"log_level\": \"INFO\", \"sdk_version\": \"1.0.85\"}, \"loading\": false}" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "RunDetails(htr).show()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_0': {'MAPE': 36.68529906716439},\n", + " 'HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_1': {'MAPE': 39.5413238482365},\n", + " 'HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_10': {'MAPE': 39.48909471171353},\n", + " 'HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_11': {'MAPE': 32.65702658023425},\n", + " 'HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_12': {'MAPE': 32.93810233523822},\n", + " 'HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_13': {'MAPE': 37.62926074943101},\n", + " 'HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_14': {'MAPE': 42.136255340447754},\n", + " 'HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_15': {'MAPE': 72.76283423841294},\n", + " 'HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_16': {'MAPE': 38.453668227285334},\n", + " 'HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_17': {'MAPE': 41.00447880109714},\n", + " 'HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_18': {'MAPE': 34.26881841497758},\n", + " 'HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_19': {'MAPE': 50.750403770935534},\n", + " 'HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_2': {'MAPE': 
75.2489286722675},\n", + " 'HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_3': {'MAPE': 37.1130047640178},\n", + " 'HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_4': {'MAPE': 36.51222909108317},\n", + " 'HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_5': {'MAPE': 41.06715386014593},\n", + " 'HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_6': {'MAPE': 46.580738749741506},\n", + " 'HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_7': {'MAPE': 29.75961762449498},\n", + " 'HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_8': {'MAPE': 51.76561288108014},\n", + " 'HD_2d2796ac-1717-4dce-896d-905dcbce2c1a_9': {'MAPE': 36.66639457885292}}" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "while htr.get_status() != \"Completed\":\n", + " {}\n", + "htr.get_metrics()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The best model and its hyperparameter values can be retrieved as follows" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['--data-folder', '$AZUREML_DATAREFERENCE_5442e91c25f449ff9a9780f48c6d7792', '--num-leaves', '108', '--min-data-in-leaf', '410', '--learning-rate', '0.05', '--feature-fraction', '0.418894687852234', '--bagging-fraction', '0.54813576760277', '--bagging-freq', '7', '--max-rounds', '1710', '--max-lag', '8', '--window-size', '25']\n" + ] + } + ], + "source": [ + "best_run = htr.get_best_run_by_primary_metric()\n", + "parameter_values = best_run.get_details()[\"runDefinition\"][\"arguments\"]\n", + "print(parameter_values)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can then register the folder (and all files in it) as a model named `lgbm-oj-forecast` under the workspace for deployment." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "model = best_run.register_model(\n", + " model_name=\"lgbm-oj-forecast\", model_path=\"outputs/model\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Deploy the Model in ACI\n", + "\n", + "Now we are ready to deploy the model as a web service running in Azure Container Instance [ACI](https://azure.microsoft.com/en-us/services/container-instances/). Azure Machine Learning accomplishes this by constructing a Docker image with the scoring logic and model baked in.\n", + "\n", + "### Create score.py\n", + "\n", + "First, we will create a scoring script that will be invoked by the web service call.\n", + "\n", + "* Note that the scoring script must have two required functions, `init()` and `run(input_data)`.\n", + " - In `init()` function, you typically load the model into a global object. This function is executed only once when the Docker container is started.\n", + " - In `run(input_data)` function, the model is used to predict a value based on the input data. The input and output to run typically use JSON as serialization and de-serialization format but you are not limited to that." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting score.py\n" + ] + } + ], + "source": [ + "%%writefile score.py\n", + "import os\n", + "import json\n", + "import numpy as np\n", + "import pandas as pd\n", + "import lightgbm as lgb\n", + "\n", + "\n", + "def init():\n", + " global bst\n", + " model_root = os.getenv(\"AZUREML_MODEL_DIR\")\n", + " # The name of the folder in which to look for LightGBM model files\n", + " lgbm_model_folder = \"model\"\n", + " bst = lgb.Booster(\n", + " model_file=os.path.join(model_root, lgbm_model_folder, \"bst-model.txt\")\n", + " )\n", + "\n", + "\n", + "def run(raw_data):\n", + " columns = bst.feature_name()\n", + " data = np.array(json.loads(raw_data)[\"data\"])\n", + " test_df = pd.DataFrame(data=data, columns=columns)\n", + " # Make prediction\n", + " out = bst.predict(test_df)\n", + " return out.tolist()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create myenv.yml\n", + "\n", + "We also need to create an environment file so that Azure Machine Learning can install the necessary packages in the Docker image which are required by your scoring script. In this case, we need to specify packages `numpy`, `pandas`, and `lightgbm`." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# Conda environment specification. 
The dependencies defined in this file will\r\n", + "# be automatically provisioned for runs with userManagedDependencies=False.\r\n", + "\n", + "# Details about the Conda environment file format:\r\n", + "# https://conda.io/docs/user-guide/tasks/manage-environments.html#create-env-file-manually\r\n", + "\n", + "name: project_environment\n", + "dependencies:\n", + " # The python interpreter version.\r\n", + " # Currently Azure ML only supports 3.5.2 and later.\r\n", + "- python=3.6.2\n", + "\n", + "- pip:\n", + " - azureml-defaults\n", + "- numpy=1.16.2\n", + "- pandas=0.23.4\n", + "- lightgbm=2.3.0\n", + "channels:\n", + "- conda-forge\n", + "\n" + ] + } + ], + "source": [ + "cd = CondaDependencies.create()\n", + "cd.add_conda_package(\"numpy=1.16.2\")\n", + "cd.add_conda_package(\"pandas=0.23.4\")\n", + "cd.add_conda_package(\"lightgbm=2.3.0\")\n", + "cd.save_to_file(base_directory=\"./\", conda_file_path=\"myenv.yml\")\n", + "\n", + "print(cd.serialize_to_string())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Deploy to ACI\n", + "\n", + "We are almost ready to deploy. In the next cell, we first create the inference configuration and deployment configuration. Then, we deploy the model to ACI. This cell will run for several minutes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running............................\n", + "Succeeded\n", + "ACI service creation operation finished, operation \"Succeeded\"\n", + "Healthy\n", + "CPU times: user 385 ms, sys: 77.5 ms, total: 463 ms\n", + "Wall time: 2min 46s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "inference_config = InferenceConfig(runtime=\"python\", entry_script=\"score.py\", conda_file=\"myenv.yml\")\n", + "\n", + "aciconfig = AciWebservice.deploy_configuration(\n", + " cpu_cores=1,\n", + " memory_gb=1,\n", + " tags={\"name\": \"ojdata\", \"framework\": \"LightGBM\"},\n", + " description=\"LightGBM model on Orange Juice data\",\n", + ")\n", + "\n", + "service = Model.deploy(\n", + " workspace=ws, name=\"lgbm-oj-svc\", models=[model], inference_config=inference_config, deployment_config=aciconfig\n", + ")\n", + "\n", + "service.wait_for_deployment(True)\n", + "print(service.state)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> Tip: If something goes wrong with the deployment, you could look at the logs from the service by running this command `print(service.get_logs())`." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is the scoring web service endpoint:" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "http://61ad47b3-7ad6-4093-8535-a5324b2238c7.westus.azurecontainer.io/score\n" + ] + } + ], + "source": [ + "print(service.scoring_uri)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the web service is successfully deployed, you will see a deployment in the Azure Machine Learning workspace on Azure portal\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test the deployed model\n", + "\n", + "Let's test the deployed model. We create a few test data points and send them to the web service hosted in ACI. Note here we are using the run API in the SDK to invoke the service. You can also make raw HTTP calls using any HTTP tool such as curl.\n", + "\n", + "After the invocation, we print the returned predictions each of which represents the forecasted sales of a target store, brand in a given week as specified by `store, brand, week` in `used_columns`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "prediction: [12360.719047885588, 18866.86295711333, 5805.556346800228]\n" + ] + } + ], + "source": [ + "# Prepare features according to the input schema of the best model\n", + "train_dir = os.path.join(DATA_DIR, \"train\")\n", + "max_lag = int(parameter_values[parameter_values.index(\"--max-lag\") + 1])\n", + "lags = np.arange(2, max_lag + 1)\n", + "window_size = int(parameter_values[parameter_values.index(\"--window-size\") + 1])\n", + "used_columns = [\n", + " \"store\",\n", + " \"brand\",\n", + " \"week\",\n", + " \"week_of_month\",\n", + " \"month\",\n", + " \"deal\",\n", + " \"feat\",\n", + " \"move\",\n", + " \"price\",\n", + " \"price_ratio\",\n", + "]\n", + "GAP = 2\n", + "features, train_end_week = create_features(\n", + " 1, train_dir, lags, window_size, used_columns\n", + ")\n", + "test_fea = features[features.week >= train_end_week + GAP].reset_index(drop=True)\n", + "test_fea.drop(\"move\", axis=1, inplace=True)\n", + "\n", + "# Pick a few test data points\n", + "test_samples = json.dumps({\"data\": np.array(test_fea.iloc[:3]).tolist()})\n", + "test_samples = bytes(test_samples, encoding=\"utf8\")\n", + "\n", + "# Predict using the deployed model\n", + "result = service.run(input_data=test_samples)\n", + "print(\"prediction:\", result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also send raw HTTP request to the service." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "POST to url http://61ad47b3-7ad6-4093-8535-a5324b2238c7.westus.azurecontainer.io/score\n", + "\n", + "input data: b'{\"data\": [[2.0, 1.0, 137.0, 4.0, 4.0, 0.0, 0.0, 0.0416446872, 1.1124927835293534, 12416.0, 28096.0, 15168.0, 20736.0, 31808.0, 25728.0, 43584.0, 14453.76], [2.0, 1.0, 138.0, 5.0, 4.0, 1.0, 1.0, 0.03734375, 0.9420125411290402, 12416.0, 12416.0, 28096.0, 15168.0, 20736.0, 31808.0, 25728.0, 14699.52], [2.0, 2.0, 137.0, 4.0, 4.0, 0.0, 0.0, 0.0519791667, 1.388567227553081, 11424.0, 4992.0, 7008.0, 6816.0, 5280.0, 7296.0, 5664.0, 9219.84]]}'\n", + "\n", + "prediction: [12360.719047885588, 18866.86295711333, 5805.556346800228]\n" + ] + } + ], + "source": [ + "headers = {\"Content-Type\": \"application/json\"}\n", + "\n", + "resp = requests.post(service.scoring_uri, test_samples, headers=headers)\n", + "\n", + "print(\"POST to url\", service.scoring_uri)\n", + "print(\"\")\n", + "print(\"input data:\", test_samples)\n", + "print(\"\")\n", + "print(\"prediction:\", resp.text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Clean up\n", + "\n", + "After finishing the tests, you can delete the ACI deployment with a simple delete API call as follows." + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "service.delete()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Additional Reading:\n", + "\n", + "\\[1\\] Training, hyperparameter tune, and deploy with TensorFlow: https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.ipynb
\n", + "\n", + "\\[2\\] AzureML HyperDrive package: https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.hyperdrive?view=azure-ml-py" + ] + } + ], + "metadata": { + "author_info": { + "affiliation": "Microsoft", + "created_by": "Chenhui Hu" + }, + "kernelspec": { + "display_name": "forecasting_env", + "language": "python", + "name": "forecasting_env" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/grocery_sales/python/README.md b/examples/grocery_sales/python/README.md new file mode 100644 index 00000000..8e87201e --- /dev/null +++ b/examples/grocery_sales/python/README.md @@ -0,0 +1,16 @@ +# Forecasting examples in Python + +This folder contains Jupyter notebooks with Python examples for building forecasting solutions. To run the notebooks, please ensure your environment is set up with required dependencies by following instructions in the [Setup guide](../../../docs/SETUP.md). + + +## Summary + +The following summarizes each directory of the Python best practice notebooks. + +| Directory | Content | Description | +| --- | --- | --- | +| [00_quick_start](./00_quick_start)| [autoarima_single_round.ipynb](./00_quick_start/autoarima_single_round.ipynb)
[azure_automl_single_round.ipynb](./00_quick_start/azure_automl_single_round.ipynb)
[lightgbm_single_round.ipynb](./00_quick_start/lightgbm_single_round.ipynb) | Quick start notebooks that demonstrate the workflow of developing a forecasting model using one-round training and testing data| +| [01_prepare_data](./01_prepare_data) | [ojdata_exploration.ipynb](./01_prepare_data/ojdata_exploration.ipynb)
[ojdata_preparation.ipynb](./01_prepare_data/ojdata_preparation.ipynb) | Data exploration and preparation notebooks| +| [02_model](./02_model) | [dilatedcnn_multi_round.ipynb](./02_model/dilatedcnn_multi_round.ipynb)
[lightgbm_multi_round.ipynb](./02_model/lightgbm_multi_round.ipynb)
[autoarima_multi_round.ipynb](./02_model/autoarima_multi_round.ipynb) | Deep dive notebooks that perform multi-round training and testing of various classical and deep learning forecasting algorithms| +| [03_model_tune_deploy](./03_model_tune_deploy/) | [azure_hyperdrive_lightgbm.ipynb](./03_model_tune_deploy/azure_hyperdrive_lightgbm.ipynb)
[aml_scripts/](./03_model_tune_deploy/aml_scripts) |
  • Example notebook for model tuning using Azure Machine Learning Service and deploying the best model on Azure
  • Scripts for model training and validation
| + diff --git a/fclib/README.md b/fclib/README.md index 60ab6424..75a72611 100644 --- a/fclib/README.md +++ b/fclib/README.md @@ -1,11 +1,40 @@ # Forecasting library -A set of utility functions for forecasting. +Building forecasting models can involve tedious tasks ranging from data loading, dataset understanding, model development, model evaluation to deployment of trained models. To assist with these tasks, we developed a forecasting library - **fclib**. You'll see this library used widely in sample notebooks in [examples](../examples). The following provides a short description of the sub-modules. For more details about what functions/classes/utilities are available and how to use them, please review the doc-strings provided with the code and see the sample notebooks in the [examples](../examples) directory. -## Install +## Submodules -```bash -pip install -e . +### [AzureML](fclib/azureml) + +The AzureML submodule contains utilities to connect to an Azure Machine Learning workspace, train, tune and operationalize forecasting models at scale using AzureML. + + +### [Common](fclib/common) + +This submodule contains high-level utilities that are commonly used in multiple algorithms as well as helper functions for visualizing forecasting predictions. + +### [Dataset](fclib/dataset) +This submodule includes helper functions for interacting with datasets used in the example notebooks, utility functions to process datasets for different modeling tasks, as well as utilities for splitting data for training/testing. For example, the [ojdata](fclib/dataset/ojdata.py) submodule will allow you to download and process the Orange Juice dataset, as well as split it into training and testing rounds. 
+ +```python +from fclib.dataset.ojdata import download_ojdata, split_train_test + +download_ojdata(DATA_DIR) +train_df_list, test_df_list, _ = split_train_test( + DATA_DIR, + n_splits=N_SPLITS, + horizon=HORIZON, + gap=GAP, + first_week=FIRST_WEEK, + last_week=LAST_WEEK +) ``` -This will install the package fclib. \ No newline at end of file +### [Evaluation](fclib/evaluation) +Evaluation module includes functionalities for computing common forecasting evaluation metrics, more specifically `MAPE`, `sMAPE`, and `pinball loss`. + +### [Feature Engineering](fclib/feature_engineering) +Feature engineering module contains utilities to create various time series features, for example, week or day of month, lagged features, and moving average features. This module is used widely in machine-learning based approaches to forecasting, in which time series data is transformed into a tabular featurized dataset, that becomes input to a machine learning method. + +### [Models](fclib/models) +The models module contains implementations of various algorithms that can be used in addition to external packages to evaluate and develop new forecasting solutions. Some submodules found here are: `lightgbm`, `dilated cnn`, etc. A more detailed description of which algorithms are used in our examples can be found in [this README](../examples/oj_retail/python/README.md). \ No newline at end of file diff --git a/fclib/fclib/azureml/azureml_utils.py b/fclib/fclib/azureml/azureml_utils.py index c0f9707e..e0c4aef8 100644 --- a/fclib/fclib/azureml/azureml_utils.py +++ b/fclib/fclib/azureml/azureml_utils.py @@ -2,6 +2,149 @@ # Licensed under the MIT License. """ -This file contains utility functions for using AzureML SDK in the -development of forecasting solutions. +This file contains utility functions for interacting with Azure ML Resources. 
+Reused code from +https://github.com/microsoft/nlp-recipes/blob/master/utils_nlp/azureml/azureml_utils.py """ + +import os +from azureml.core.authentication import AzureCliAuthentication +from azureml.core.authentication import InteractiveLoginAuthentication +from azureml.core.authentication import AuthenticationException +from azureml.core import Workspace +from azureml.exceptions import ProjectSystemException +from azureml.core.compute import ComputeTarget, AmlCompute +from azureml.core.compute_target import ComputeTargetException + + +def get_auth(): + """ + Method to get the correct Azure ML Authentication type + + Always start with CLI Authentication and if it fails, fall back + to interactive login + """ + try: + auth_type = AzureCliAuthentication() + auth_type.get_authentication_header() + except AuthenticationException: + auth_type = InteractiveLoginAuthentication() + return auth_type + + +def get_or_create_workspace( + config_path="./.azureml", subscription_id=None, resource_group=None, workspace_name=None, workspace_region=None, +): + """ + Method to get or create workspace. + + Args: + config_path: optional directory to look for / store config.json file (defaults to current + directory) + subscription_id: Azure subscription id + resource_group: Azure resource group to create workspace and related resources + workspace_name: name of azure ml workspace + workspace_region: region for workspace + + Returns: + obj: AzureML workspace if one exists already with the name otherwise creates a new one. + """ + config_file_path = "." 
+ + if config_path is not None: + config_dir, config_file_name = os.path.split(config_path) + if config_file_name != "config.json": + config_file_path = os.path.join(config_path, "config.json") + + try: + # Get existing azure ml workspace + if os.path.isfile(config_file_path): + ws = Workspace.from_config(config_file_path, auth=get_auth()) + else: + ws = Workspace.get( + name=workspace_name, subscription_id=subscription_id, resource_group=resource_group, auth=get_auth(), + ) + + except ProjectSystemException: + # This call might take a minute or two. + print("Creating new workspace") + ws = Workspace.create( + name=workspace_name, + subscription_id=subscription_id, + resource_group=resource_group, + create_resource_group=True, + location=workspace_region, + auth=get_auth(), + ) + + ws.write_config(path=config_path) + return ws + + +def get_or_create_amlcompute( + workspace, compute_name, vm_size="", min_nodes=0, max_nodes=None, idle_seconds_before_scaledown=None, verbose=False, +): + """ + Get or create AmlCompute as the compute target. If a cluster of the same name is found, + attach it and rescale accordingly. Otherwise, create a new cluster. 
+ + Args: + workspace (Workspace): workspace + compute_name (str): name + vm_size (str, optional): vm size + min_nodes (int, optional): minimum number of nodes in cluster + max_nodes (None, optional): maximum number of nodes in cluster + idle_seconds_before_scaledown (None, optional): how long to wait before the cluster + autoscales down + verbose (bool, optional): if true, print logs + Returns: + Compute target + """ + try: + if verbose: + print("Found compute target: {}".format(compute_name)) + + compute_target = ComputeTarget(workspace=workspace, name=compute_name) + if len(compute_target.list_nodes()) < max_nodes: + if verbose: + print("Rescaling to {} nodes".format(max_nodes)) + compute_target.update(max_nodes=max_nodes) + compute_target.wait_for_completion(show_output=verbose) + + except ComputeTargetException: + if verbose: + print("Creating new compute target: {}".format(compute_name)) + + compute_config = AmlCompute.provisioning_configuration( + vm_size=vm_size, + min_nodes=min_nodes, + max_nodes=max_nodes, + idle_seconds_before_scaledown=idle_seconds_before_scaledown, + ) + compute_target = ComputeTarget.create(workspace, compute_name, compute_config) + compute_target.wait_for_completion(show_output=verbose) + + return compute_target + + +def get_output_files(run, output_path, file_names=None): + """ + Method to get the output files from an AzureML output directory. + + Args: + file_names(list): Names of the files to download. + run(azureml.core.run.Run): Run object of the run. + output_path(str): Path to download the output files. 
+ + Returns: None + + """ + os.makedirs(output_path, exist_ok=True) + + if file_names is None: + file_names = run.get_file_names() + + for f in file_names: + dest = os.path.join(output_path, f.split("/")[-1]) + print("Downloading file {} to {}...".format(f, dest)) + run.download_file(f, dest) diff --git a/fclib/fclib/dataset/download_oj_data.R b/fclib/fclib/dataset/load_oj_data.R similarity index 57% rename from fclib/fclib/dataset/download_oj_data.R rename to fclib/fclib/dataset/load_oj_data.R index dcbb5604..9d780d97 100755 --- a/fclib/fclib/dataset/download_oj_data.R +++ b/fclib/fclib/dataset/load_oj_data.R @@ -1,20 +1,25 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -# This script retrieves the orangeJuice dataset from the bayesm R package and saves the data as csv +# This script retrieves the orangeJuice dataset from the bayesm R package and saves the data as csv. +# +# Two arguments must be supplied to this script: +# +# RDA_PATH - path to the local .rda file containing the data +# DATA_DIR - destination directory for saving processed .csv files args = commandArgs(trailingOnly=TRUE) -# test if there is at least one argument: if not, return an error -if (length(args)==0) { - stop("At least one argument must be supplied (data directory).", call.=FALSE) -} else if (length(args)==1) { - DATA_DIR <- args[1] -} +# Test if there are at least two arguments: if not, return an error +if (length(args)==2) { + RDA_PATH <- args[1] + DATA_DIR <- args[2] +} else { + stop("Two arguments must be supplied - path to .rda file and destination data directory).", call.=FALSE) +} # Load the data from bayesm library -library(bayesm) -data("orangeJuice") +load(RDA_PATH) yx <- orangeJuice[[1]] storedemo <- orangeJuice[[2]] diff --git a/fclib/fclib/dataset/ojdata.py b/fclib/fclib/dataset/ojdata.py index 83aaf39a..71342ae2 100644 --- a/fclib/fclib/dataset/ojdata.py +++ b/fclib/fclib/dataset/ojdata.py @@ -8,11 +8,16 @@ import pandas as pd import math import 
datetime import itertools +import argparse +import logging +import requests +from tqdm import tqdm +from fclib.common.utils import git_repo_path from fclib.feature_engineering.feature_utils import df_from_cartesian_product DATA_FILE_LIST = ["yx.csv", "storedemo.csv"] -SCRIPT_NAME = "download_oj_data.R" +SCRIPT_NAME = "load_oj_data.R" DEFAULT_TARGET_COL = "move" DEFAULT_STATIC_FEA = None @@ -21,25 +26,54 @@ DEFAULT_DYNAMIC_FEA = ["deal", "feat"] # The start datetime of the first week in the record FIRST_WEEK_START = pd.to_datetime("1989-09-14 00:00:00") - -def download_ojdata(dest_dir): - """Downloads Orange Juice dataset. - - Args: - dest_dir (str): Directory path for the downloaded file - """ - maybe_download(dest_dir=dest_dir) +# Original data source +OJ_URL = "https://github.com/cran/bayesm/raw/master/data/orangeJuice.rda" -def maybe_download(dest_dir): +log = logging.getLogger(__name__) + + +def maybe_download(url, dest_directory, filename=None): """Download a file if it is not already downloaded. - Args: - dest_dir (str): Destination directory + dest_directory (str): Destination directory. + url (str): URL of the file to download. + filename (str): File name. Returns: str: File path of the file downloaded. """ + if filename is None: + filename = url.split("/")[-1] + os.makedirs(dest_directory, exist_ok=True) + filepath = os.path.join(dest_directory, filename) + if not os.path.exists(filepath): + r = requests.get(url, stream=True) + total_size = int(r.headers.get("content-length", 0)) + block_size = 1024 + num_iterables = math.ceil(total_size / block_size) + + with open(filepath, "wb") as file: + for data in tqdm(r.iter_content(block_size), total=num_iterables, unit="KB", unit_scale=True,): + file.write(data) + else: + log.debug("File {} already downloaded".format(filepath)) + + return filepath + + +def download_ojdata(dest_dir="."): + """Download orange juice dataset from the original source. 
+ + Args: + dest_dir (str): Directory path for the downloaded file + + Returns: + str: Path of the downloaded file. + """ + url = OJ_URL + rda_path = maybe_download(url, dest_directory=dest_dir) + # Check if data files exist data_exists = True for f in DATA_FILE_LIST: @@ -47,13 +81,21 @@ def maybe_download(dest_dir): data_exists = data_exists and os.path.exists(file_path) if not data_exists: - # Call data download script - print("Starting data download ...") - script_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), SCRIPT_NAME) + # Call data loading script + repo_path = git_repo_path() + script_path = os.path.join(repo_path, "fclib", "fclib", "dataset", SCRIPT_NAME) + try: - subprocess.call(["Rscript", script_path, dest_dir]) + print(f"Destination directory: {dest_dir}") + output = subprocess.run( + ["Rscript", script_path, rda_path, dest_dir], stderr=subprocess.PIPE, stdout=subprocess.PIPE + ) + print(output.stdout) + if output.returncode != 0: + raise Exception(f"Subprocess failed - {output.stderr}") + except subprocess.CalledProcessError as e: - print(e.output) + raise e else: print("Data already exists at the specified location.") @@ -113,12 +155,12 @@ def split_train_test(data_dir, n_splits=1, horizon=2, gap=2, first_week=40, last Note that train_*.csv files in /train folder contain all the features in the training period and aux_*.csv files in /train folder contain all the features except 'logmove', 'constant', - 'profit' up until the forecast period end week. Both train_*.csv and aux_*csv can be used for + 'profit' up until the forecast period end week. Both train_*.csv and auxi_*csv can be used for generating forecasts in each split. However, test_*.csv files in /test folder can only be used for model performance evaluation. 
Example: - data_dir = "/home/vapaunic/forecasting/ojdata" + data_dir = "/home/ojdata" train, test, aux = split_train_test(data_dir=data_dir, n_splits=5, horizon=3, write_csv=True) @@ -174,7 +216,7 @@ def split_train_test(data_dir, n_splits=1, horizon=2, gap=2, first_week=40, last roundstr = "_" + str(i + 1) if n_splits > 1 else "" train_df.to_csv(os.path.join(TRAIN_DATA_DIR, "train" + roundstr + ".csv")) test_df.to_csv(os.path.join(TEST_DATA_DIR, "test" + roundstr + ".csv")) - aux_df.to_csv(os.path.join(TRAIN_DATA_DIR, "aux" + roundstr + ".csv")) + aux_df.to_csv(os.path.join(TRAIN_DATA_DIR, "auxi" + roundstr + ".csv")) train_df_list.append(train_df) test_df_list.append(test_df) @@ -436,9 +478,12 @@ def specify_retail_data_schema( if __name__ == "__main__": - data_dir = "/home/vapaunic/forecasting/ojdata" - download_ojdata(data_dir) + parser = argparse.ArgumentParser() + parser.add_argument("--data-dir", help="Data download directory") + args = parser.parse_args() + + download_ojdata(args.data_dir) # train, test, aux = split_train_test(data_dir=data_dir, n_splits=1, horizon=2, write_csv=True) # print((test[0].week)) diff --git a/fclib/fclib/feature_engineering/feature_utils.py b/fclib/fclib/feature_engineering/feature_utils.py index 55ffcc7e..006fb62f 100644 --- a/fclib/fclib/feature_engineering/feature_utils.py +++ b/fclib/fclib/feature_engineering/feature_utils.py @@ -11,10 +11,17 @@ import calendar import itertools import pandas as pd import numpy as np +import datetime from datetime import timedelta from sklearn.preprocessing import MinMaxScaler +from dateutil.relativedelta import relativedelta -from fclib.feature_engineering.utils import is_datetime_like +ALLOWED_TIME_COLUMN_TYPES = [ + pd.Timestamp, + pd.DatetimeIndex, + datetime.datetime, + datetime.date, +] # 0: Monday, 2: T/W/TR, 4: F, 5:SA, 6: S WEEK_DAY_TYPE_MAP = {1: 2, 3: 2} # Map for converting Wednesday and @@ -25,6 +32,11 @@ SEMI_HOLIDAY_CODE = 8 # days before and after a holiday DATETIME_FORMAT = 
"%Y-%m-%d %H:%M:%S" +def is_datetime_like(x): + """Function that checks if a data frame column x is of a datetime type.""" + return any(isinstance(x, col_type) for col_type in ALLOWED_TIME_COLUMN_TYPES) + + def day_type(datetime_col, holiday_col=None, semi_holiday_offset=timedelta(days=1)): """ Convert datetime_col to 7 day types @@ -1002,3 +1014,81 @@ def normalize_columns(df, seq_cols, scaler=MinMaxScaler()): df_scaled = pd.DataFrame(scaler.fit_transform(df[seq_cols]), columns=seq_cols, index=df.index) df_scaled = pd.concat([df[cols_fixed], df_scaled], axis=1) return df_scaled, scaler + + +def get_datetime_col(df, datetime_colname): + """ + Helper function for extracting the datetime column as datetime type from + a data frame. + + Args: + df: pandas DataFrame containing the column to convert + datetime_colname: name of the column to be converted + + Returns: + pandas.Series: converted column + + Raises: + Exception: if datetime_colname does not exist in the dateframe df. + Exception: if datetime_colname cannot be converted to datetime type. + """ + if datetime_colname in df.index.names: + datetime_col = df.index.get_level_values(datetime_colname) + elif datetime_colname in df.columns: + datetime_col = df[datetime_colname] + else: + raise Exception("Column or index {0} does not exist in the data " "frame".format(datetime_colname)) + + if not is_datetime_like(datetime_col): + datetime_col = pd.to_datetime(df[datetime_colname]) + return datetime_col + + +def get_month_day_range(date): + """ + Returns the first date and last date of the month of the given date. 
+ """ + # Replace the date in the original timestamp with day 1 + first_day = date + relativedelta(day=1) + # Replace the date in the original timestamp with day 1 + # Add a month to get to the first day of the next month + # Subtract one day to get the last day of the current month + last_day = date + relativedelta(day=1, months=1, days=-1, hours=23) + return first_day, last_day + + +def add_datetime(input_datetime, unit, add_count): + """ + Function to add a specified units of time (years, months, weeks, days, + hours, or minutes) to the input datetime. + + Args: + input_datetime: datatime to be added to + unit: unit of time, valid values: 'year', 'month', 'week', + 'day', 'hour', 'minute'. + add_count: number of units to add + + Returns: + New datetime after adding the time difference to input datetime. + + Raises: + Exception: if invalid unit is provided. Valid units are: + 'year', 'month', 'week', 'day', 'hour', 'minute'. + """ + if unit == "Y": + new_datetime = input_datetime + relativedelta(years=add_count) + elif unit == "M": + new_datetime = input_datetime + relativedelta(months=add_count) + elif unit == "W": + new_datetime = input_datetime + relativedelta(weeks=add_count) + elif unit == "D": + new_datetime = input_datetime + relativedelta(days=add_count) + elif unit == "h": + new_datetime = input_datetime + relativedelta(hours=add_count) + elif unit == "m": + new_datetime = input_datetime + relativedelta(minutes=add_count) + else: + raise Exception( + "Invalid backtest step unit, {}, provided. 
Valid " "step units are Y, M, W, D, h, " "and m".format(unit) + ) + return new_datetime diff --git a/fclib/requirements.txt b/fclib/requirements.txt index 2ee724c7..88eb009e 100644 --- a/fclib/requirements.txt +++ b/fclib/requirements.txt @@ -1,4 +1,5 @@ pandas datetime scikit_learn -numpy \ No newline at end of file +numpy +requests \ No newline at end of file diff --git a/R/forecasting.Rproj b/forecasting.Rproj similarity index 99% rename from R/forecasting.Rproj rename to forecasting.Rproj index 5dfd7033..f06cf89a 100644 --- a/R/forecasting.Rproj +++ b/forecasting.Rproj @@ -10,4 +10,3 @@ NumSpacesForTab: 4 Encoding: UTF-8 RnwWeave: knitr - diff --git a/tests/ci/component_governance.yml b/tests/ci/component_governance.yml new file mode 100644 index 00000000..2a809587 --- /dev/null +++ b/tests/ci/component_governance.yml @@ -0,0 +1,49 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +# Pull request against these branches will trigger this build +pr: + - master + - staging + +# no CI trigger +trigger: none + +jobs: +- job: Component_governance + timeoutInMinutes: 20 # how long to run the job before automatically cancelling + pool: + vmImage: 'ubuntu-16.04' + + steps: + - bash: | + python tools/generate_requirements_txt.py + displayName: 'Generate requirements.txt file from generate_conda_file.py' + + - task: ComponentGovernanceComponentDetection@0 + inputs: + scanType: 'Register' + verbosity: 'Verbose' + alertWarningLevel: 'High' + + - task: notice@0 + inputs: + outputformat: 'text' + + - bash: | + ls -la + cat NOTICE.txt + git status + result=$(git status | grep NOTICE.txt) + if [[ $result ]]; then + echo "Notice file modified: $result" + echo `git diff NOTICE.txt` + BRANCH=NOTICE/`date +%s` + git checkout -b $BRANCH + git add NOTICE.txt + git commit -m "Notice file modified." + git push origin $BRANCH + else + echo "Notice file not modified." + fi + displayName: 'Check in notice file if modified.' 
\ No newline at end of file diff --git a/tests/ci/cpu_integration_tests_linux.yml b/tests/ci/cpu_integration_tests_linux.yml index 2a720ccf..8fc5bb1f 100644 --- a/tests/ci/cpu_integration_tests_linux.yml +++ b/tests/ci/cpu_integration_tests_linux.yml @@ -14,10 +14,10 @@ trigger: jobs: - job: cpu_integration_tests_linux - timeoutInMinutes: 10 # how long to run the job before automatically cancelling + timeoutInMinutes: 60 # how long to run the job before automatically cancelling pool: # vmImage: 'ubuntu-16.04' # hosted machine - name: ForecastingAgents + name: $(Agent_Name) steps: - bash: | diff --git a/tests/ci/cpu_unit_tests_linux.yml b/tests/ci/cpu_unit_tests_linux.yml index 86206e28..3f66cba3 100644 --- a/tests/ci/cpu_unit_tests_linux.yml +++ b/tests/ci/cpu_unit_tests_linux.yml @@ -17,7 +17,7 @@ jobs: timeoutInMinutes: 10 # how long to run the job before automatically cancelling pool: # vmImage: 'ubuntu-16.04' # hosted machine - name: ForecastingAgents + name: $(Agent_Name) steps: - bash: | diff --git a/tests/conftest.py b/tests/conftest.py index d2ad6136..494a9eb4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,10 +5,21 @@ from fclib.common.utils import git_repo_path @pytest.fixture(scope="module") def notebooks(): + """Get paths of example notebooks. + + Returns: + dict: Dictionary including paths of the example notebooks. 
+ """ repo_path = git_repo_path() examples_path = os.path.join(repo_path, "examples") - quick_start_path = os.path.join(examples_path, "00_quick_start") + usecase_path = os.path.join(examples_path, "grocery_sales", "python") + quick_start_path = os.path.join(usecase_path, "00_quick_start") + model_path = os.path.join(usecase_path, "02_model") # Path for the notebooks - paths = {"lightgbm_quick_start": os.path.join(quick_start_path, "lightgbm_point_forecast.ipynb")} + paths = { + "lightgbm_quick_start": os.path.join(quick_start_path, "lightgbm_single_round.ipynb"), + "lightgbm_multi_round": os.path.join(model_path, "lightgbm_multi_round.ipynb"), + "dilatedcnn_multi_round": os.path.join(model_path, "dilatedcnn_multi_round.ipynb"), + } return paths diff --git a/tests/integration/test_notebooks_python.py b/tests/integration/test_notebooks_python.py index db6afc43..98441373 100644 --- a/tests/integration/test_notebooks_python.py +++ b/tests/integration/test_notebooks_python.py @@ -2,10 +2,6 @@ # Licensed under the MIT License. import os - -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
- import pytest import papermill as pm import scrapbook as sb @@ -23,3 +19,31 @@ def test_lightgbm_quick_start(notebooks): assert df.shape[0] == 1 mape = df.loc[df.name == "MAPE"]["data"][0] assert mape == pytest.approx(35.60, abs=ABS_TOL) + + +@pytest.mark.integration +def test_lightgbm_multi_round(notebooks): + notebook_path = notebooks["lightgbm_multi_round"] + output_notebook_path = os.path.join(os.path.dirname(notebook_path), "output.ipynb") + pm.execute_notebook( + notebook_path, output_notebook_path, kernel_name="forecast_cpu", parameters=dict(N_SPLITS=1), + ) + nb = sb.read_notebook(output_notebook_path) + df = nb.scraps.dataframe + assert df.shape[0] == 1 + mape = df.loc[df.name == "MAPE"]["data"][0] + assert mape == pytest.approx(36.0, abs=ABS_TOL) + + +@pytest.mark.integration +def test_dilatedcnn_multi_round(notebooks): + notebook_path = notebooks["dilatedcnn_multi_round"] + output_notebook_path = os.path.join(os.path.dirname(notebook_path), "output.ipynb") + pm.execute_notebook( + notebook_path, output_notebook_path, kernel_name="forecast_cpu", parameters=dict(N_SPLITS=2), + ) + nb = sb.read_notebook(output_notebook_path) + df = nb.scraps.dataframe + assert df.shape[0] == 1 + mape = df.loc[df.name == "MAPE"]["data"][0] + assert mape == pytest.approx(37.7, abs=ABS_TOL) diff --git a/tools/environment.yml b/tools/environment.yml index 27cd6222..efa92ec4 100644 --- a/tools/environment.yml +++ b/tools/environment.yml @@ -3,7 +3,7 @@ # To create the conda environment: # $ conda env create -f environment.yaml -# +# # To update the conda environment: # $ conda env update -f environment.yaml # @@ -16,30 +16,32 @@ channels: - defaults - conda-forge dependencies: - - python=3.6 - - pip - - jupyter - - ipykernel - - scipy==1.1.0 - - numpy==1.16.2 + - python=3.6.10 + - pip>=19.0.3 + - jupyter>=1.0.0 + - ipykernel>=4.6.1 + - jupyter_nbextensions_configurator=0.4.1 + - scipy=1.1.0 + - numpy=1.16.2 - pandas=0.23.4 - xlrd=1.1.0 - urllib3=1.21.1 - scikit-learn=0.20.3 - 
- pytest + - pytest>=3.6.4 + - tqdm>=4.43.0 + - pylint - papermill>=1.0.1 - matplotlib=3.1.2 - - r-base - - r-bayesm + - r-base>=3.3.0 - pip: - - black - - flake8 - - jupytext==1.3.0 + - black>=18.6b4 + - flake8>=3.3.0 + - jupytext>=1.3.0 - lightgbm==2.3.0 - tensorflow==2.0 - tensorboard==2.1.0 - nteract-scrapbook==0.3.1 - - gitpython==3.0.8 - azureml-sdk[explain,automl]==1.0.85 - statsmodels==0.11.1 - pmdarima==1.1.1 + - gitpython==3.0.8 diff --git a/tools/environment_setup.bat b/tools/environment_setup.bat new file mode 100644 index 00000000..3eb7a824 --- /dev/null +++ b/tools/environment_setup.bat @@ -0,0 +1,24 @@ +REM Copyright (c) Microsoft Corporation. +REM Licensed under the MIT License. + +REM Please follow instructions in this link +REM https://docs.conda.io/projects/conda/en/latest/user-guide/install/windows.html +REM to install Miniconda before running this script. + + +echo Update conda +call conda update conda --yes + +echo Create conda environment +call conda env create -f tools/environment.yml + +echo Activate conda environment +call conda activate forecasting_env + +echo Install forecasting utility library +call pip install -e fclib + +echo Register conda environment in Jupyter +call python -m ipykernel install --user --name forecasting_env + +echo Environment setup is done! \ No newline at end of file diff --git a/tools/generate_conda_file.py b/tools/generate_conda_file.py new file mode 100644 index 00000000..fdd7349f --- /dev/null +++ b/tools/generate_conda_file.py @@ -0,0 +1,165 @@ +#!/usr/bin/python + +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +# This script creates yaml files to build conda environments +# For generating a conda file for running only python code: +# $ python generate_conda_file.py +# +# For generating a conda file for running python gpu: +# $ python generate_conda_file.py --gpu + + +import argparse +import textwrap +from sys import platform + + +HELP_MSG = """ +To create the conda environment: +$ conda env create -f {conda_env}.yaml + +To update the conda environment: +$ conda env update -f {conda_env}.yaml + +To register the conda environment in Jupyter: +$ conda activate {conda_env} +$ python -m ipykernel install --user --name {conda_env} \ +--display-name "Python ({conda_env})" +""" + + +CHANNELS = ["defaults", "conda-forge"] + +CONDA_BASE = { + "python": "python==3.6.10", + "pip": "pip>=19.1.1", + "ipykernel": "ipykernel>=4.6.1", + "jupyter": "jupyter>=1.0.0", + "jupyter_nbextensions_configurator": "jupyter_nbextensions_configurator>=0.4.1", + "numpy": "numpy>=1.16.2", + "pandas": "pandas>=0.23.4", + "pytest": "pytest>=3.6.4", + "scipy": "scipy>=1.1.0", + "xlrd": "xlrd>=1.1.0", + "urllib3": "urllib3>=1.21.1", + "scikit-learn": "scikit-learn>=0.20.3", + "tqdm": "tqdm>=4.43.0", + "pylint": "pylint>=2.4.4", + "matplotlib": "matplotlib>=3.1.2", + "r-base": "r-base>=3.3.0", + "papermill": "papermill>=1.0.1", +} + + +CONDA_GPU = {} + +PIP_BASE = { + "azureml-sdk": "azureml-sdk[explain,automl]==1.0.85", + "black": "black>=18.6b4", + "nteract-scrapbook": "nteract-scrapbook>=0.3.1", + "pre-commit": "pre-commit>=1.14.4", + "tensorboard": "tensorboard==2.1.0", + "tensorflow": "tensorflow==2.0", + "flake8": "flake8>=3.3.0", + "jupytext": "jupytext>=1.3.0", + "lightgbm": "lightgbm==2.3.0", + "statsmodels": "statsmodels==0.11.1", + "pmdarima": "pmdarima==1.1.1", + "gitpython": "gitpython==3.0.8", +} + +PIP_GPU = {} + +PIP_DARWIN = {} +PIP_DARWIN_GPU = {} + +PIP_LINUX = {} +PIP_LINUX_GPU = {} + +PIP_WIN32 = {} +PIP_WIN32_GPU = {} + +CONDA_DARWIN = {} +CONDA_DARWIN_GPU = {} + +CONDA_LINUX = {} 
+CONDA_LINUX_GPU = {} + +CONDA_WIN32 = {} +CONDA_WIN32_GPU = {} + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=textwrap.dedent( + """ + This script generates a conda file for different environments. + Plain python is the default, + but flags can be used to support GPU functionality.""" + ), + epilog=HELP_MSG, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument("--name", help="specify name of conda environment") + parser.add_argument("--gpu", action="store_true", help="include packages for GPU support") + args = parser.parse_args() + + # set name for environment and output yaml file + conda_env = "forecasting_cpu" + if args.gpu: + conda_env = "forecasting_gpu" + + # overwrite environment name with user input + if args.name is not None: + conda_env = args.name + + # add conda and pip base packages + conda_packages = CONDA_BASE + pip_packages = PIP_BASE + + # update conda and pip packages based on flags provided + if args.gpu: + conda_packages.update(CONDA_GPU) + pip_packages.update(PIP_GPU) + + # update conda and pip packages based on os platform support + if platform == "darwin": + conda_packages.update(CONDA_DARWIN) + pip_packages.update(PIP_DARWIN) + if args.gpu: + conda_packages.update(CONDA_DARWIN_GPU) + pip_packages.update(PIP_DARWIN_GPU) + elif platform.startswith("linux"): + conda_packages.update(CONDA_LINUX) + pip_packages.update(PIP_LINUX) + if args.gpu: + conda_packages.update(CONDA_LINUX_GPU) + pip_packages.update(PIP_LINUX_GPU) + elif platform == "win32": + conda_packages.update(CONDA_WIN32) + pip_packages.update(PIP_WIN32) + if args.gpu: + conda_packages.update(CONDA_WIN32_GPU) + pip_packages.update(PIP_WIN32_GPU) + else: + raise Exception("Unsupported platform. 
Must be Windows, Linux, or macOS") + + # write out yaml file + conda_file = "{}.yaml".format(conda_env) + with open(conda_file, "w") as f: + for line in HELP_MSG.format(conda_env=conda_env).split("\n"): + f.write("# {}\n".format(line)) + f.write("name: {}\n".format(conda_env)) + f.write("channels:\n") + for channel in CHANNELS: + f.write("- {}\n".format(channel)) + f.write("dependencies:\n") + for conda_package in conda_packages.values(): + f.write("- {}\n".format(conda_package)) + f.write("- pip:\n") + for pip_package in pip_packages.values(): + f.write(" - {}\n".format(pip_package)) + + print("Generated conda file: {}".format(conda_file)) + print(HELP_MSG.format(conda_env=conda_env)) diff --git a/tools/generate_requirements_txt.py b/tools/generate_requirements_txt.py new file mode 100644 index 00000000..2c6a6e51 --- /dev/null +++ b/tools/generate_requirements_txt.py @@ -0,0 +1,43 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# This file outputs a requirements.txt based on the libraries defined in generate_conda_file.py +from generate_conda_file import ( + CONDA_BASE, + CONDA_GPU, + PIP_BASE, + PIP_GPU, + PIP_DARWIN, + PIP_LINUX, + PIP_WIN32, + CONDA_DARWIN, + CONDA_LINUX, + CONDA_WIN32, + PIP_DARWIN_GPU, + PIP_LINUX_GPU, + PIP_WIN32_GPU, + CONDA_DARWIN_GPU, + CONDA_LINUX_GPU, + CONDA_WIN32_GPU, +) + + +if __name__ == "__main__": + deps = list(CONDA_BASE.values()) + deps += list(CONDA_GPU.values()) + deps += list(PIP_BASE.values()) + deps += list(PIP_GPU.values()) + deps += list(PIP_DARWIN.values()) + deps += list(PIP_LINUX.values()) + deps += list(PIP_WIN32.values()) + deps += list(CONDA_DARWIN.values()) + deps += list(CONDA_LINUX.values()) + deps += list(CONDA_WIN32.values()) + deps += list(PIP_DARWIN_GPU.values()) + deps += list(PIP_LINUX_GPU.values()) + deps += list(PIP_WIN32_GPU.values()) + deps += list(CONDA_DARWIN_GPU.values()) + deps += list(CONDA_LINUX_GPU.values()) + deps += list(CONDA_WIN32_GPU.values()) + with 
open("requirements.txt", "w") as f: + f.write("\n".join(set(deps))) diff --git a/tools/readme_generator/readme_generator.py b/tools/readme_generator/readme_generator.py deleted file mode 100644 index 136583d3..00000000 --- a/tools/readme_generator/readme_generator.py +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -import csvtomd -import matplotlib.pyplot as plt -import pandas as pd -import numpy as np - - -### Generating performance charts -################################################# - -#Function to plot a performance chart -def plot_perf(x,y,df): - - # extract submission name from submission URL - labels = df.apply(lambda x: x['Submission Name'][1:].split(']')[0], axis=1) - - fig = plt.scatter(x=df[x],y=df[y], label=labels, s=150, alpha = 0.5, - c= ['b', 'g', 'r', 'c', 'm', 'y', 'k']) - plt.xlabel(x) - plt.ylabel(y) - plt.title(y + ' by ' + x) - offset = (max(df[y]) - min(df[y]))/50 - for i,name in enumerate(labels): - ax = df[x][i] - ay = df[y][i] + offset * (-2.5 + i % 5) - plt.text(ax, ay, name, fontsize=10) - - return(fig) - -### Printing the Readme.md file -############################################ -readmefile = '../../Readme.md' -#Write header -#print(file=open(readmefile)) -print('# TSPerf\n', file=open(readmefile, "w")) - -print('TSPerf is a collection of implementations of time-series forecasting algorithms in Azure cloud and comparison of their performance over benchmark datasets. \ -Algorithm implementations are compared by model accuracy, training and scoring time and cost. 
Each implementation includes all the necessary \ -instructions and tools that ensure its reproducibility.', file=open(readmefile, "a")) - -print('The following table summarizes benchmarks that are currently included in TSPerf.\n', file=open(readmefile, "a")) - -#Read the benchmark table the CSV file and converrt to a table in md format -with open('Benchmarks.csv', 'r') as f: - table = csvtomd.csv_to_table(f, ',') -print(csvtomd.md_table(table), file=open(readmefile, "a")) -print('\n\n\n',file=open(readmefile, "a")) - -print('A complete documentation of TSPerf, along with the instructions for submitting and reviewing implementations, \ -can be found [here](./docs/tsperf_rules.md). The tables below show performance of implementations that are developed so far. Source code of \ -implementations and instructions for reproducing their performance can be found in submission folders, which are linked in the first column.\n', file=open(readmefile, "a")) - -### Write the Energy section -#============================ - -print('## Probabilistic energy forecasting performance board\n\n', file=open(readmefile, "a")) -print('The following table lists the current submision for the energy forecasting and their respective performances.\n\n', file=open(readmefile, "a")) - -#Read the energy perfromane board from the CSV file and converrt to a table in md format -with open('TSPerfBoard-Energy.csv', 'r') as f: - table = csvtomd.csv_to_table(f, ',') -print(csvtomd.md_table(table), file=open(readmefile, "a")) - -#Read Energy Performance Board CSV file -df = pd.read_csv('TSPerfBoard-Energy.csv', engine='python') -#df - -#Plot ,'Pinball Loss' by 'Training and Scoring Cost($)' chart -fig4 = plt.figure(figsize=(12, 8), dpi= 80, facecolor='w', edgecolor='k') #this sets the plotting area size -fig4 = plot_perf('Training and Scoring Cost($)','Pinball Loss',df) -plt.savefig('../../docs/images/Energy-Cost.png') - - -#insetting the performance charts -print('\n\nThe following chart compares the 
submissions performance on accuracy in Pinball Loss vs. Training and Scoring cost in $:\n\n ', file=open(readmefile, "a")) -print('![EnergyPBLvsTime](./docs/images/Energy-Cost.png)' ,file=open(readmefile, "a")) -print('\n\n\n',file=open(readmefile, "a")) - - -#print the retail sales forcsating section -#======================================== -print('## Retail sales forecasting performance board\n\n', file=open(readmefile, "a")) -print('The following table lists the current submision for the retail forecasting and their respective performances.\n\n', file=open(readmefile, "a")) - -#Read the energy perfromane board from the CSV file and converrt to a table in md format -with open('TSPerfBoard-Retail.csv', 'r') as f: - table = csvtomd.csv_to_table(f, ',') -print(csvtomd.md_table(table), file=open(readmefile, "a")) -print('\n\n\n',file=open(readmefile, "a")) - -#Read Retail Performane Board CSV file -df = pd.read_csv('TSPerfBoard-Retail.csv', engine='python') -#df - -#Plot MAPE (%) by Training and Scoring Cost ($) chart -fig2 = plt.figure(figsize=(12, 8), dpi= 80, facecolor='w', edgecolor='k') #this sets the plotting area size -fig2 = plot_perf('Training and Scoring Cost ($)','MAPE (%)',df) -plt.savefig('../../docs/images/Retail-Cost.png') - - -#insetting the performance charts -print('\n\nThe following chart compares the submissions performance on accuracy in %MAPE vs. 
Training and Scoring cost in $:\n\n ', file=open(readmefile, "a")) -print('![EnergyPBLvsTime](./docs/images/Retail-Cost.png)' ,file=open(readmefile, "a")) -print('\n\n\n',file=open(readmefile, "a")) - -#insertting build status badge -print('## Build Status\n\n', file=open(readmefile, "a")) -print('| Build Type | Branch | Status | | Branch | Status |' ,file=open(readmefile, "a")) -print('| --- | --- | --- | --- | --- | --- |' ,file=open(readmefile, "a")) -print('| **Python Linux CPU** | master | [![Build Status](https://dev.azure.com/best-practices/forecasting/_apis/build/status/python_unit_tests_base?branchName=master)](https://dev.azure.com/best-practices/forecasting/_build/latest?definitionId=12&branchName=master) | | staging | [![Build Status](https://dev.azure.com/best-practices/forecasting/_apis/build/status/python_unit_tests_base?branchName=chenhui/python_test_pipeline)](https://dev.azure.com/best-practices/forecasting/_build/latest?definitionId=12&branchName=chenhui/python_test_pipeline) |' ,file=open(readmefile, "a")) -print('| **R Linux CPU** | master | [![Build Status](https://dev.azure.com/best-practices/forecasting/_apis/build/status/Forecasting/r_unit_tests_prototype?branchName=master)](https://dev.azure.com/best-practices/forecasting/_build/latest?definitionId=9&branchName=master) | | staging | [![Build Status](https://dev.azure.com/best-practices/forecasting/_apis/build/status/Forecasting/r_unit_tests_prototype?branchName=zhouf/r_test_pipeline)](https://dev.azure.com/best-practices/forecasting/_build/latest?definitionId=9&branchName=zhouf/r_test_pipeline) |' ,file=open(readmefile, "a")) -print('\n\n\n',file=open(readmefile, "a")) - - -print('A new Readme.md file has been generated successfully.') - - diff --git a/tools/repo_metrics/placeholder.txt b/tools/repo_metrics/placeholder.txt deleted file mode 100644 index b3a42524..00000000 --- a/tools/repo_metrics/placeholder.txt +++ /dev/null @@ -1 +0,0 @@ -placeholder \ No newline at end of file